@img/sharp-libvips-dev 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
|
@@ -43,12 +43,23 @@
|
|
|
43
43
|
|
|
44
44
|
// HWY_SVE_HAVE_BF16_VEC is defined to 1 if the SVE svbfloat16_t vector type
|
|
45
45
|
// is supported, even if HWY_SVE_HAVE_BF16_FEATURE (= intrinsics) is 0.
|
|
46
|
-
#if HWY_SVE_HAVE_BF16_FEATURE ||
|
|
46
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || \
|
|
47
|
+
(HWY_COMPILER_CLANG >= 1200 && defined(__ARM_FEATURE_SVE_BF16)) || \
|
|
48
|
+
HWY_COMPILER_GCC_ACTUAL >= 1000
|
|
47
49
|
#define HWY_SVE_HAVE_BF16_VEC 1
|
|
48
50
|
#else
|
|
49
51
|
#define HWY_SVE_HAVE_BF16_VEC 0
|
|
50
52
|
#endif
|
|
51
53
|
|
|
54
|
+
// HWY_SVE_HAVE_F32_TO_BF16C is defined to 1 if the SVE svcvt_bf16_f32_x
|
|
55
|
+
// and svcvtnt_bf16_f32_x intrinsics are available, even if the __bf16 type
|
|
56
|
+
// is disabled
|
|
57
|
+
#if HWY_SVE_HAVE_BF16_VEC && defined(__ARM_FEATURE_SVE_BF16)
|
|
58
|
+
#define HWY_SVE_HAVE_F32_TO_BF16C 1
|
|
59
|
+
#else
|
|
60
|
+
#define HWY_SVE_HAVE_F32_TO_BF16C 0
|
|
61
|
+
#endif
|
|
62
|
+
|
|
52
63
|
HWY_BEFORE_NAMESPACE();
|
|
53
64
|
namespace hwy {
|
|
54
65
|
namespace HWY_NAMESPACE {
|
|
@@ -99,7 +110,13 @@ namespace detail { // for code folding
|
|
|
99
110
|
#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
|
|
100
111
|
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
|
|
101
112
|
// We have both f16 and bf16, so nothing is emulated.
|
|
102
|
-
|
|
113
|
+
|
|
114
|
+
// NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
|
|
115
|
+
// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
|
|
116
|
+
// !hwy::IsSame<D, D>() is always false and as !hwy::IsSame<D, D>() will cause
|
|
117
|
+
// SFINAE to occur instead of a hard error due to a dependency on the D template
|
|
118
|
+
// argument
|
|
119
|
+
#define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
|
|
103
120
|
#define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
|
|
104
121
|
#else
|
|
105
122
|
#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
|
|
@@ -302,7 +319,9 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
|
|
|
302
319
|
return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit)); \
|
|
303
320
|
}
|
|
304
321
|
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
|
|
305
|
-
|
|
322
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
323
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_FIRSTN, FirstN, whilelt)
|
|
324
|
+
#endif
|
|
306
325
|
|
|
307
326
|
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
308
327
|
svbool_t FirstN(D /* tag */, size_t count) {
|
|
@@ -327,7 +346,7 @@ namespace detail {
|
|
|
327
346
|
}
|
|
328
347
|
|
|
329
348
|
HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) // return all-true
|
|
330
|
-
|
|
349
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)
|
|
331
350
|
#undef HWY_SVE_WRAP_PTRUE
|
|
332
351
|
|
|
333
352
|
HWY_API svbool_t PFalse() { return svpfalse_b(); }
|
|
@@ -433,29 +452,24 @@ HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
|
|
|
433
452
|
HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
|
|
434
453
|
HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)
|
|
435
454
|
|
|
436
|
-
#
|
|
437
|
-
|
|
438
|
-
|
|
455
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
456
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CAST, _, reinterpret)
|
|
457
|
+
#else // !(HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC)
|
|
439
458
|
template <class V, HWY_SVE_IF_EMULATED_D(DFromV<V>)>
|
|
440
459
|
HWY_INLINE svuint8_t BitCastToByte(V v) {
|
|
441
|
-
#if HWY_SVE_HAVE_BF16_VEC
|
|
442
|
-
return svreinterpret_u8_bf16(v);
|
|
443
|
-
#else
|
|
444
460
|
const RebindToUnsigned<DFromV<V>> du;
|
|
445
461
|
return BitCastToByte(BitCast(du, v));
|
|
446
|
-
#endif
|
|
447
462
|
}
|
|
448
463
|
|
|
449
464
|
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
450
465
|
HWY_INLINE VFromD<D> BitCastFromByte(D d, svuint8_t v) {
|
|
451
|
-
#if HWY_SVE_HAVE_BF16_VEC
|
|
452
|
-
(void)d;
|
|
453
|
-
return svreinterpret_bf16_u8(v);
|
|
454
|
-
#else
|
|
455
466
|
const RebindToUnsigned<decltype(d)> du;
|
|
456
467
|
return BitCastFromByte(du, v);
|
|
457
|
-
#endif
|
|
458
468
|
}
|
|
469
|
+
#endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
470
|
+
|
|
471
|
+
#undef HWY_SVE_CAST_NOP
|
|
472
|
+
#undef HWY_SVE_CAST
|
|
459
473
|
|
|
460
474
|
} // namespace detail
|
|
461
475
|
|
|
@@ -474,6 +488,9 @@ HWY_API VFromD<D> BitCast(D d, FromV v) {
|
|
|
474
488
|
}
|
|
475
489
|
|
|
476
490
|
HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
|
|
491
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
492
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_UNDEFINED, Undefined, undef)
|
|
493
|
+
#endif
|
|
477
494
|
|
|
478
495
|
template <class D, HWY_SVE_IF_EMULATED_D(D)>
|
|
479
496
|
VFromD<D> Undefined(D d) {
|
|
@@ -506,7 +523,9 @@ VFromD<D> Undefined(D d) {
|
|
|
506
523
|
}
|
|
507
524
|
|
|
508
525
|
HWY_SVE_FOREACH(HWY_SVE_CREATE, Create, create)
|
|
509
|
-
|
|
526
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
527
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CREATE, Create, create)
|
|
528
|
+
#endif
|
|
510
529
|
#undef HWY_SVE_CREATE
|
|
511
530
|
|
|
512
531
|
template <class D>
|
|
@@ -531,7 +550,9 @@ using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D())));
|
|
|
531
550
|
}
|
|
532
551
|
|
|
533
552
|
HWY_SVE_FOREACH(HWY_SVE_GET, Get, get)
|
|
534
|
-
|
|
553
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
554
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_GET, Get, get)
|
|
555
|
+
#endif
|
|
535
556
|
#undef HWY_SVE_GET
|
|
536
557
|
|
|
537
558
|
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
@@ -552,7 +573,9 @@ HWY_SVE_FOREACH_BF16(HWY_SVE_GET, Get, get)
|
|
|
552
573
|
}
|
|
553
574
|
|
|
554
575
|
HWY_SVE_FOREACH(HWY_SVE_SET, Set, set)
|
|
555
|
-
|
|
576
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
577
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_SET, Set, set)
|
|
578
|
+
#endif
|
|
556
579
|
#undef HWY_SVE_SET
|
|
557
580
|
|
|
558
581
|
// ------------------------------ ResizeBitCast
|
|
@@ -613,10 +636,14 @@ HWY_API svfloat16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
|
613
636
|
return svdupq_n_f16(t0, t1, t2, t3, t4, t5, t6, t7);
|
|
614
637
|
}
|
|
615
638
|
|
|
616
|
-
template <class D,
|
|
639
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
617
640
|
HWY_API VBF16 Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, TFromD<D> t2,
|
|
618
641
|
TFromD<D> t3, TFromD<D> t4, TFromD<D> t5,
|
|
619
642
|
TFromD<D> t6, TFromD<D> t7) {
|
|
643
|
+
#if HWY_SVE_HAVE_BF16_FEATURE
|
|
644
|
+
(void)d;
|
|
645
|
+
return svdupq_n_bf16(t0, t1, t2, t3, t4, t5, t6, t7);
|
|
646
|
+
#else
|
|
620
647
|
const RebindToUnsigned<decltype(d)> du;
|
|
621
648
|
return BitCast(
|
|
622
649
|
d, Dup128VecFromValues(
|
|
@@ -624,6 +651,7 @@ HWY_API VBF16 Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, TFromD<D> t2,
|
|
|
624
651
|
BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
|
|
625
652
|
BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
|
|
626
653
|
BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
|
|
654
|
+
#endif
|
|
627
655
|
}
|
|
628
656
|
|
|
629
657
|
template <class D, HWY_IF_I32_D(D)>
|
|
@@ -683,6 +711,10 @@ HWY_API V And(const V a, const V b) {
|
|
|
683
711
|
|
|
684
712
|
// ------------------------------ Or
|
|
685
713
|
|
|
714
|
+
namespace detail {
|
|
715
|
+
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, OrN, orr_n)
|
|
716
|
+
} // namespace detail
|
|
717
|
+
|
|
686
718
|
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)
|
|
687
719
|
|
|
688
720
|
template <class V, HWY_IF_FLOAT_V(V)>
|
|
@@ -1012,14 +1044,15 @@ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)
|
|
|
1012
1044
|
|
|
1013
1045
|
// ------------------------------ ShiftLeft[Same]
|
|
1014
1046
|
|
|
1015
|
-
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)
|
|
1016
|
-
template <int kBits>
|
|
1017
|
-
HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {
|
|
1018
|
-
return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits);
|
|
1019
|
-
}
|
|
1020
|
-
HWY_API HWY_SVE_V(BASE, BITS)
|
|
1021
|
-
NAME##Same(HWY_SVE_V(BASE, BITS) v,
|
|
1022
|
-
return sv##OP##_##CHAR##BITS##_x(
|
|
1047
|
+
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
1048
|
+
template <int kBits> \
|
|
1049
|
+
HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
|
|
1050
|
+
return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
|
|
1051
|
+
} \
|
|
1052
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
1053
|
+
NAME##Same(HWY_SVE_V(BASE, BITS) v, int bits) { \
|
|
1054
|
+
return sv##OP##_##CHAR##BITS##_x( \
|
|
1055
|
+
HWY_SVE_PTRUE(BITS), v, static_cast<HWY_SVE_T(uint, BITS)>(bits)); \
|
|
1023
1056
|
}
|
|
1024
1057
|
|
|
1025
1058
|
HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)
|
|
@@ -1033,15 +1066,35 @@ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)
|
|
|
1033
1066
|
|
|
1034
1067
|
// ------------------------------ RotateRight
|
|
1035
1068
|
|
|
1036
|
-
|
|
1037
|
-
|
|
1069
|
+
#if HWY_SVE_HAVE_2
|
|
1070
|
+
|
|
1071
|
+
#define HWY_SVE_ROTATE_RIGHT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
1072
|
+
template <int kBits> \
|
|
1073
|
+
HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
|
|
1074
|
+
if (kBits == 0) return v; \
|
|
1075
|
+
return sv##OP##_##CHAR##BITS(v, Zero(DFromV<decltype(v)>()), \
|
|
1076
|
+
HWY_MAX(kBits, 1)); \
|
|
1077
|
+
}
|
|
1078
|
+
|
|
1079
|
+
HWY_SVE_FOREACH_U(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
|
|
1080
|
+
HWY_SVE_FOREACH_I(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
|
|
1081
|
+
|
|
1082
|
+
#undef HWY_SVE_ROTATE_RIGHT_N
|
|
1083
|
+
|
|
1084
|
+
#else // !HWY_SVE_HAVE_2
|
|
1085
|
+
template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
1038
1086
|
HWY_API V RotateRight(const V v) {
|
|
1087
|
+
const DFromV<decltype(v)> d;
|
|
1088
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1089
|
+
|
|
1039
1090
|
constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
|
|
1040
1091
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
1041
1092
|
if (kBits == 0) return v;
|
|
1042
|
-
|
|
1093
|
+
|
|
1094
|
+
return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
|
|
1043
1095
|
ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
|
|
1044
1096
|
}
|
|
1097
|
+
#endif
|
|
1045
1098
|
|
|
1046
1099
|
// ------------------------------ Shl/r
|
|
1047
1100
|
|
|
@@ -1089,11 +1142,7 @@ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
|
|
|
1089
1142
|
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Mul, mul)
|
|
1090
1143
|
|
|
1091
1144
|
// ------------------------------ MulHigh
|
|
1092
|
-
|
|
1093
|
-
// Not part of API, used internally:
|
|
1094
|
-
HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
|
|
1095
|
-
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
|
|
1096
|
-
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
|
|
1145
|
+
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
|
|
1097
1146
|
|
|
1098
1147
|
// ------------------------------ MulFixedPoint15
|
|
1099
1148
|
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
|
|
@@ -1275,6 +1324,7 @@ HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
|
|
|
1275
1324
|
}
|
|
1276
1325
|
|
|
1277
1326
|
HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
|
|
1327
|
+
HWY_SVE_FOREACH_BF16(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
|
|
1278
1328
|
#undef HWY_SVE_IF_THEN_ELSE
|
|
1279
1329
|
|
|
1280
1330
|
template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
|
|
@@ -1562,6 +1612,22 @@ HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
|
|
|
1562
1612
|
return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
|
|
1563
1613
|
}
|
|
1564
1614
|
|
|
1615
|
+
// ------------------------------ IsNegative (Lt)
|
|
1616
|
+
#ifdef HWY_NATIVE_IS_NEGATIVE
|
|
1617
|
+
#undef HWY_NATIVE_IS_NEGATIVE
|
|
1618
|
+
#else
|
|
1619
|
+
#define HWY_NATIVE_IS_NEGATIVE
|
|
1620
|
+
#endif
|
|
1621
|
+
|
|
1622
|
+
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
|
|
1623
|
+
HWY_API svbool_t IsNegative(V v) {
|
|
1624
|
+
const DFromV<decltype(v)> d;
|
|
1625
|
+
const RebindToSigned<decltype(d)> di;
|
|
1626
|
+
using TI = TFromD<decltype(di)>;
|
|
1627
|
+
|
|
1628
|
+
return detail::LtN(BitCast(di, v), static_cast<TI>(0));
|
|
1629
|
+
}
|
|
1630
|
+
|
|
1565
1631
|
// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)
|
|
1566
1632
|
|
|
1567
1633
|
#if HWY_SVE_HAVE_2
|
|
@@ -2486,6 +2552,29 @@ HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint64_t v) {
|
|
|
2486
2552
|
return TruncateTo(dn, vn);
|
|
2487
2553
|
}
|
|
2488
2554
|
|
|
2555
|
+
// ------------------------------ Unsigned to signed demotions
|
|
2556
|
+
|
|
2557
|
+
// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
|
|
2558
|
+
// implementations in generic_ops-inl.h on SVE/SVE2 as the SVE/SVE2 targets have
|
|
2559
|
+
// target-specific implementations of the unsigned to signed DemoteTo and
|
|
2560
|
+
// ReorderDemote2To ops
|
|
2561
|
+
|
|
2562
|
+
// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
|
|
2563
|
+
// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
|
|
2564
|
+
// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
|
|
2565
|
+
// SFINAE to occur instead of a hard error due to a dependency on the V template
|
|
2566
|
+
// argument
|
|
2567
|
+
#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
|
|
2568
|
+
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
|
|
2569
|
+
hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
|
|
2570
|
+
|
|
2571
|
+
template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
|
|
2572
|
+
HWY_IF_T_SIZE_LE_D(D, sizeof(TFromV<V>) - 1)>
|
|
2573
|
+
HWY_API VFromD<D> DemoteTo(D dn, V v) {
|
|
2574
|
+
const RebindToUnsigned<D> dn_u;
|
|
2575
|
+
return BitCast(dn, TruncateTo(dn_u, detail::SaturateU<TFromD<D>>(v)));
|
|
2576
|
+
}
|
|
2577
|
+
|
|
2489
2578
|
// ------------------------------ ConcatEven/ConcatOdd
|
|
2490
2579
|
|
|
2491
2580
|
// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
|
|
@@ -2499,14 +2588,22 @@ namespace detail {
|
|
|
2499
2588
|
}
|
|
2500
2589
|
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
|
|
2501
2590
|
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
|
|
2502
|
-
|
|
2503
|
-
|
|
2591
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
2592
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull,
|
|
2593
|
+
uzp1)
|
|
2594
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull,
|
|
2595
|
+
uzp2)
|
|
2596
|
+
#endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
2504
2597
|
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
|
|
2505
2598
|
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
|
|
2506
2599
|
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
|
|
2507
|
-
|
|
2508
|
-
|
|
2509
|
-
|
|
2600
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
2601
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
|
|
2602
|
+
ConcatEvenBlocks, uzp1q)
|
|
2603
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks,
|
|
2604
|
+
uzp2q)
|
|
2605
|
+
#endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
2606
|
+
#endif // defined(__ARM_FEATURE_SVE_MATMUL_FP64)
|
|
2510
2607
|
#undef HWY_SVE_CONCAT_EVERY_SECOND
|
|
2511
2608
|
|
|
2512
2609
|
// Used to slide up / shift whole register left; mask indicates which range
|
|
@@ -2551,6 +2648,18 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
|
2551
2648
|
return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
|
|
2552
2649
|
}
|
|
2553
2650
|
|
|
2651
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
2652
|
+
|
|
2653
|
+
// Signed to signed PromoteEvenTo: 1 instruction instead of 2 in generic-inl.h.
|
|
2654
|
+
// Might as well also enable unsigned to unsigned, though it is just an And.
|
|
2655
|
+
namespace detail {
|
|
2656
|
+
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, extb)
|
|
2657
|
+
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, exth)
|
|
2658
|
+
HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, extw)
|
|
2659
|
+
} // namespace detail
|
|
2660
|
+
|
|
2661
|
+
#include "hwy/ops/inside-inl.h"
|
|
2662
|
+
|
|
2554
2663
|
// ------------------------------ DemoteTo F
|
|
2555
2664
|
|
|
2556
2665
|
// We already toggled HWY_NATIVE_F16C above.
|
|
@@ -2576,10 +2685,46 @@ HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat64_t v) {
|
|
|
2576
2685
|
in_even); // lower half
|
|
2577
2686
|
}
|
|
2578
2687
|
|
|
2688
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2689
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2690
|
+
#else
|
|
2691
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2692
|
+
#endif
|
|
2693
|
+
|
|
2694
|
+
#if !HWY_SVE_HAVE_F32_TO_BF16C
|
|
2695
|
+
namespace detail {
|
|
2696
|
+
|
|
2697
|
+
// Round a F32 value to the nearest BF16 value, with the result returned as the
|
|
2698
|
+
// rounded F32 value bitcasted to an U32
|
|
2699
|
+
|
|
2700
|
+
// RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
|
|
2701
|
+
// NaN F32 values from being converted to an infinity
|
|
2702
|
+
HWY_INLINE svuint32_t RoundF32ForDemoteToBF16(svfloat32_t v) {
|
|
2703
|
+
const DFromV<decltype(v)> df32;
|
|
2704
|
+
const RebindToUnsigned<decltype(df32)> du32;
|
|
2705
|
+
|
|
2706
|
+
const auto is_non_nan = Eq(v, v);
|
|
2707
|
+
const auto bits32 = BitCast(du32, v);
|
|
2708
|
+
|
|
2709
|
+
const auto round_incr =
|
|
2710
|
+
detail::AddN(detail::AndN(ShiftRight<16>(bits32), 1u), 0x7FFFu);
|
|
2711
|
+
return MaskedAddOr(detail::OrN(bits32, 0x00400000u), is_non_nan, bits32,
|
|
2712
|
+
round_incr);
|
|
2713
|
+
}
|
|
2714
|
+
|
|
2715
|
+
} // namespace detail
|
|
2716
|
+
#endif // !HWY_SVE_HAVE_F32_TO_BF16C
|
|
2717
|
+
|
|
2579
2718
|
template <size_t N, int kPow2>
|
|
2580
2719
|
HWY_API VBF16 DemoteTo(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t v) {
|
|
2581
|
-
|
|
2582
|
-
|
|
2720
|
+
#if HWY_SVE_HAVE_F32_TO_BF16C
|
|
2721
|
+
const VBF16 in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), v);
|
|
2722
|
+
return detail::ConcatEvenFull(in_even, in_even);
|
|
2723
|
+
#else
|
|
2724
|
+
const svuint16_t in_odd =
|
|
2725
|
+
BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(v));
|
|
2726
|
+
return BitCast(dbf16, detail::ConcatOddFull(in_odd, in_odd)); // lower half
|
|
2727
|
+
#endif
|
|
2583
2728
|
}
|
|
2584
2729
|
|
|
2585
2730
|
template <size_t N, int kPow2>
|
|
@@ -2620,32 +2765,31 @@ HWY_API svfloat32_t DemoteTo(Simd<float, N, kPow2> d, const svuint64_t v) {
|
|
|
2620
2765
|
// ------------------------------ ConvertTo F
|
|
2621
2766
|
|
|
2622
2767
|
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
2623
|
-
/* signed
|
|
2768
|
+
/* Float from signed */ \
|
|
2624
2769
|
template <size_t N, int kPow2> \
|
|
2625
2770
|
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
2626
2771
|
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
|
|
2627
2772
|
return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
|
|
2628
2773
|
} \
|
|
2629
|
-
/* unsigned
|
|
2774
|
+
/* Float from unsigned */ \
|
|
2630
2775
|
template <size_t N, int kPow2> \
|
|
2631
2776
|
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
2632
2777
|
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
|
|
2633
2778
|
return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
|
|
2634
2779
|
} \
|
|
2635
|
-
/*
|
|
2780
|
+
/* Signed from float, rounding toward zero */ \
|
|
2636
2781
|
template <size_t N, int kPow2> \
|
|
2637
2782
|
HWY_API HWY_SVE_V(int, BITS) \
|
|
2638
2783
|
NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
|
|
2639
2784
|
return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
|
|
2640
2785
|
} \
|
|
2641
|
-
/*
|
|
2786
|
+
/* Unsigned from float, rounding toward zero */ \
|
|
2642
2787
|
template <size_t N, int kPow2> \
|
|
2643
2788
|
HWY_API HWY_SVE_V(uint, BITS) \
|
|
2644
2789
|
NAME(HWY_SVE_D(uint, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
|
|
2645
2790
|
return sv##OP##_u##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
|
|
2646
2791
|
}
|
|
2647
2792
|
|
|
2648
|
-
// API only requires f32 but we provide f64 for use by Iota.
|
|
2649
2793
|
HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
|
|
2650
2794
|
#undef HWY_SVE_CONVERT
|
|
2651
2795
|
|
|
@@ -3124,13 +3268,15 @@ HWY_API TFromV<V> ExtractLane(V v, size_t i) {
|
|
|
3124
3268
|
}
|
|
3125
3269
|
|
|
3126
3270
|
// ------------------------------ InsertLane (IfThenElse)
|
|
3127
|
-
template <class V>
|
|
3128
|
-
HWY_API V InsertLane(const V v, size_t i,
|
|
3271
|
+
template <class V, typename T>
|
|
3272
|
+
HWY_API V InsertLane(const V v, size_t i, T t) {
|
|
3273
|
+
static_assert(sizeof(TFromV<V>) == sizeof(T), "Lane size mismatch");
|
|
3129
3274
|
const DFromV<V> d;
|
|
3130
3275
|
const RebindToSigned<decltype(d)> di;
|
|
3131
3276
|
using TI = TFromD<decltype(di)>;
|
|
3132
3277
|
const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
|
|
3133
|
-
return IfThenElse(RebindMask(d, is_i),
|
|
3278
|
+
return IfThenElse(RebindMask(d, is_i),
|
|
3279
|
+
Set(d, hwy::ConvertScalarTo<TFromV<V>>(t)), v);
|
|
3134
3280
|
}
|
|
3135
3281
|
|
|
3136
3282
|
// ------------------------------ DupEven
|
|
@@ -3185,6 +3331,18 @@ HWY_API V OddEven(const V odd, const V even) {
|
|
|
3185
3331
|
|
|
3186
3332
|
#endif // HWY_TARGET
|
|
3187
3333
|
|
|
3334
|
+
// ------------------------------ InterleaveEven
|
|
3335
|
+
template <class D>
|
|
3336
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3337
|
+
return detail::InterleaveEven(a, b);
|
|
3338
|
+
}
|
|
3339
|
+
|
|
3340
|
+
// ------------------------------ InterleaveOdd
|
|
3341
|
+
template <class D>
|
|
3342
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3343
|
+
return detail::InterleaveOdd(a, b);
|
|
3344
|
+
}
|
|
3345
|
+
|
|
3188
3346
|
// ------------------------------ OddEvenBlocks
|
|
3189
3347
|
template <class V>
|
|
3190
3348
|
HWY_API V OddEvenBlocks(const V odd, const V even) {
|
|
@@ -3239,7 +3397,9 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
|
|
|
3239
3397
|
}
|
|
3240
3398
|
|
|
3241
3399
|
HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
|
|
3242
|
-
|
|
3400
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
3401
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE, TableLookupLanes, tbl)
|
|
3402
|
+
#endif
|
|
3243
3403
|
#undef HWY_SVE_TABLE
|
|
3244
3404
|
|
|
3245
3405
|
#if HWY_SVE_HAVE_2
|
|
@@ -3251,7 +3411,10 @@ namespace detail {
|
|
|
3251
3411
|
}
|
|
3252
3412
|
|
|
3253
3413
|
HWY_SVE_FOREACH(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
|
|
3254
|
-
|
|
3414
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
3415
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE2, NativeTwoTableLookupLanes,
|
|
3416
|
+
tbl2)
|
|
3417
|
+
#endif
|
|
3255
3418
|
#undef HWY_SVE_TABLE
|
|
3256
3419
|
} // namespace detail
|
|
3257
3420
|
#endif // HWY_SVE_HAVE_2
|
|
@@ -3323,7 +3486,9 @@ namespace detail {
|
|
|
3323
3486
|
}
|
|
3324
3487
|
|
|
3325
3488
|
HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
|
|
3326
|
-
|
|
3489
|
+
#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
|
|
3490
|
+
HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_REVERSE, ReverseFull, rev)
|
|
3491
|
+
#endif
|
|
3327
3492
|
#undef HWY_SVE_REVERSE
|
|
3328
3493
|
|
|
3329
3494
|
} // namespace detail
|
|
@@ -4077,6 +4242,95 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
|
|
|
4077
4242
|
|
|
4078
4243
|
// ================================================== Ops with dependencies
|
|
4079
4244
|
|
|
4245
|
+
// ------------------------------ AddSub (Reverse2)
|
|
4246
|
+
|
|
4247
|
+
// NOTE: svcadd_f*_x(HWY_SVE_PTRUE(BITS), a, b, 90) computes a[i] - b[i + 1] in
|
|
4248
|
+
// the even lanes and a[i] + b[i - 1] in the odd lanes.
|
|
4249
|
+
|
|
4250
|
+
#define HWY_SVE_ADDSUB_F(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
4251
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
4252
|
+
NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
|
|
4253
|
+
const DFromV<decltype(b)> d; \
|
|
4254
|
+
return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, Reverse2(d, b), \
|
|
4255
|
+
90); \
|
|
4256
|
+
}
|
|
4257
|
+
|
|
4258
|
+
HWY_SVE_FOREACH_F(HWY_SVE_ADDSUB_F, AddSub, cadd)
|
|
4259
|
+
|
|
4260
|
+
#undef HWY_SVE_ADDSUB_F
|
|
4261
|
+
|
|
4262
|
+
// NOTE: svcadd_s*(a, b, 90) and svcadd_u*(a, b, 90) compute a[i] - b[i + 1] in
|
|
4263
|
+
// the even lanes and a[i] + b[i - 1] in the odd lanes.
|
|
4264
|
+
|
|
4265
|
+
#if HWY_SVE_HAVE_2
|
|
4266
|
+
#define HWY_SVE_ADDSUB_UI(BASE, CHAR, BITS, HALF, NAME, OP) \
|
|
4267
|
+
HWY_API HWY_SVE_V(BASE, BITS) \
|
|
4268
|
+
NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
|
|
4269
|
+
const DFromV<decltype(b)> d; \
|
|
4270
|
+
return sv##OP##_##CHAR##BITS(a, Reverse2(d, b), 90); \
|
|
4271
|
+
}
|
|
4272
|
+
|
|
4273
|
+
HWY_SVE_FOREACH_UI(HWY_SVE_ADDSUB_UI, AddSub, cadd)
|
|
4274
|
+
|
|
4275
|
+
#undef HWY_SVE_ADDSUB_UI
|
|
4276
|
+
|
|
4277
|
+
// Disable the default implementation of AddSub in generic_ops-inl.h on SVE2
|
|
4278
|
+
#undef HWY_IF_ADDSUB_V
|
|
4279
|
+
#define HWY_IF_ADDSUB_V(V) \
|
|
4280
|
+
HWY_IF_LANES_GT_D(DFromV<V>, 1), \
|
|
4281
|
+
hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
|
|
4282
|
+
|
|
4283
|
+
#else // !HWY_SVE_HAVE_2
|
|
4284
|
+
|
|
4285
|
+
// Disable the default implementation of AddSub in generic_ops-inl.h for
|
|
4286
|
+
// floating-point vectors on SVE, but enable the default implementation of
|
|
4287
|
+
// AddSub in generic_ops-inl.h for integer vectors on SVE targets that lack
|
|
4288
|
+
// SVE2
|
|
4289
|
+
#undef HWY_IF_ADDSUB_V
|
|
4290
|
+
#define HWY_IF_ADDSUB_V(V) \
|
|
4291
|
+
HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
|
|
4292
|
+
|
|
4293
|
+
#endif // HWY_SVE_HAVE_2
|
|
4294
|
+
|
|
4295
|
+
// ------------------------------ MulAddSub (AddSub)
|
|
4296
|
+
|
|
4297
|
+
template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_FLOAT_V(V)>
|
|
4298
|
+
HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
|
|
4299
|
+
using T = TFromV<V>;
|
|
4300
|
+
|
|
4301
|
+
const DFromV<V> d;
|
|
4302
|
+
const T neg_zero = ConvertScalarTo<T>(-0.0f);
|
|
4303
|
+
|
|
4304
|
+
return MulAdd(mul, x, AddSub(Set(d, neg_zero), sub_or_add));
|
|
4305
|
+
}
|
|
4306
|
+
|
|
4307
|
+
#if HWY_SVE_HAVE_2
|
|
4308
|
+
|
|
4309
|
+
// Disable the default implementation of MulAddSub in generic_ops-inl.h on SVE2
|
|
4310
|
+
#undef HWY_IF_MULADDSUB_V
|
|
4311
|
+
#define HWY_IF_MULADDSUB_V(V) \
|
|
4312
|
+
HWY_IF_LANES_GT_D(DFromV<V>, 1), \
|
|
4313
|
+
hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
|
|
4314
|
+
|
|
4315
|
+
template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
|
|
4316
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
4317
|
+
HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
|
|
4318
|
+
const DFromV<V> d;
|
|
4319
|
+
return MulAdd(mul, x, AddSub(Zero(d), sub_or_add));
|
|
4320
|
+
}
|
|
4321
|
+
|
|
4322
|
+
#else // !HWY_SVE_HAVE_2
|
|
4323
|
+
|
|
4324
|
+
// Disable the default implementation of MulAddSub in generic_ops-inl.h for
|
|
4325
|
+
// floating-point vectors on SVE, but enable the default implementation of
|
|
4326
|
+
// MulAddSub in generic_ops-inl.h for integer vectors on SVE targets that do not
|
|
4327
|
+
// support SVE2
|
|
4328
|
+
#undef HWY_IF_MULADDSUB_V
|
|
4329
|
+
#define HWY_IF_MULADDSUB_V(V) \
|
|
4330
|
+
HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
|
|
4331
|
+
|
|
4332
|
+
#endif // HWY_SVE_HAVE_2
|
|
4333
|
+
|
|
4080
4334
|
// ------------------------------ PromoteTo bfloat16 (ZipLower)
|
|
4081
4335
|
template <size_t N, int kPow2>
|
|
4082
4336
|
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32, VBF16 v) {
|
|
@@ -4209,10 +4463,17 @@ HWY_INLINE VFromD<D> PromoteOddTo(ToTypeTag to_type_tag,
|
|
|
4209
4463
|
template <size_t N, int kPow2>
|
|
4210
4464
|
HWY_API VBF16 ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
|
|
4211
4465
|
svfloat32_t b) {
|
|
4212
|
-
|
|
4213
|
-
const
|
|
4214
|
-
|
|
4215
|
-
|
|
4466
|
+
#if HWY_SVE_HAVE_F32_TO_BF16C
|
|
4467
|
+
const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
|
|
4468
|
+
return svcvtnt_bf16_f32_x(b_in_even, detail::PTrue(dbf16), a);
|
|
4469
|
+
#else
|
|
4470
|
+
(void)dbf16;
|
|
4471
|
+
const auto a_in_odd =
|
|
4472
|
+
BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(a));
|
|
4473
|
+
const auto b_in_odd =
|
|
4474
|
+
BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(b));
|
|
4475
|
+
return BitCast(dbf16, detail::InterleaveOdd(b_in_odd, a_in_odd));
|
|
4476
|
+
#endif
|
|
4216
4477
|
}
|
|
4217
4478
|
|
|
4218
4479
|
template <size_t N, int kPow2>
|
|
@@ -4350,6 +4611,14 @@ HWY_API svuint32_t ReorderDemote2To(Simd<uint32_t, N, kPow2> d32, svuint64_t a,
|
|
|
4350
4611
|
#endif
|
|
4351
4612
|
}
|
|
4352
4613
|
|
|
4614
|
+
template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
|
|
4615
|
+
HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>) / 2)>
|
|
4616
|
+
HWY_API VFromD<D> ReorderDemote2To(D dn, V a, V b) {
|
|
4617
|
+
const auto clamped_a = BitCast(dn, detail::SaturateU<TFromD<D>>(a));
|
|
4618
|
+
const auto clamped_b = BitCast(dn, detail::SaturateU<TFromD<D>>(b));
|
|
4619
|
+
return detail::InterleaveEven(clamped_a, clamped_b);
|
|
4620
|
+
}
|
|
4621
|
+
|
|
4353
4622
|
template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
|
|
4354
4623
|
HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
|
|
4355
4624
|
HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2)>
|
|
@@ -4360,10 +4629,20 @@ HWY_API VFromD<D> OrderedDemote2To(D dn, V a, V b) {
|
|
|
4360
4629
|
return Combine(dn, demoted_b, demoted_a);
|
|
4361
4630
|
}
|
|
4362
4631
|
|
|
4363
|
-
template <
|
|
4364
|
-
HWY_API
|
|
4365
|
-
|
|
4366
|
-
|
|
4632
|
+
template <size_t N, int kPow2>
|
|
4633
|
+
HWY_API VBF16 OrderedDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
|
|
4634
|
+
svfloat32_t b) {
|
|
4635
|
+
#if HWY_SVE_HAVE_F32_TO_BF16C
|
|
4636
|
+
(void)dbf16;
|
|
4637
|
+
const VBF16 a_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), a);
|
|
4638
|
+
const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
|
|
4639
|
+
return ConcatEven(dbf16, b_in_even, a_in_even);
|
|
4640
|
+
#else
|
|
4641
|
+
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4642
|
+
const svuint16_t a_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
|
|
4643
|
+
const svuint16_t b_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
|
|
4644
|
+
return BitCast(dbf16, ConcatOdd(du16, b_in_odd, a_in_odd)); // lower half
|
|
4645
|
+
#endif
|
|
4367
4646
|
}
|
|
4368
4647
|
|
|
4369
4648
|
// ------------------------------ I8/U8/I16/U16 Div
|
|
@@ -4401,12 +4680,6 @@ HWY_API V MaskedModOr(V no, M m, V a, V b) {
|
|
|
4401
4680
|
return IfThenElse(m, Mod(a, b), no);
|
|
4402
4681
|
}
|
|
4403
4682
|
|
|
4404
|
-
// ------------------------------ ZeroIfNegative (Lt, IfThenElse)
|
|
4405
|
-
template <class V>
|
|
4406
|
-
HWY_API V ZeroIfNegative(const V v) {
|
|
4407
|
-
return IfThenZeroElse(detail::LtN(v, 0), v);
|
|
4408
|
-
}
|
|
4409
|
-
|
|
4410
4683
|
// ------------------------------ BroadcastSignBit (ShiftRight)
|
|
4411
4684
|
template <class V>
|
|
4412
4685
|
HWY_API V BroadcastSignBit(const V v) {
|
|
@@ -4417,11 +4690,7 @@ HWY_API V BroadcastSignBit(const V v) {
|
|
|
4417
4690
|
template <class V>
|
|
4418
4691
|
HWY_API V IfNegativeThenElse(V v, V yes, V no) {
|
|
4419
4692
|
static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
|
|
4420
|
-
|
|
4421
|
-
const RebindToSigned<decltype(d)> di;
|
|
4422
|
-
|
|
4423
|
-
const svbool_t m = detail::LtN(BitCast(di, v), 0);
|
|
4424
|
-
return IfThenElse(m, yes, no);
|
|
4693
|
+
return IfThenElse(IsNegative(v), yes, no);
|
|
4425
4694
|
}
|
|
4426
4695
|
|
|
4427
4696
|
// ------------------------------ AverageRound (ShiftRight)
|
|
@@ -5445,12 +5714,24 @@ HWY_API VFromD<DW> MulOdd(const V a, const V b) {
|
|
|
5445
5714
|
#endif
|
|
5446
5715
|
}
|
|
5447
5716
|
|
|
5717
|
+
HWY_API svint64_t MulEven(const svint64_t a, const svint64_t b) {
|
|
5718
|
+
const auto lo = Mul(a, b);
|
|
5719
|
+
const auto hi = MulHigh(a, b);
|
|
5720
|
+
return detail::InterleaveEven(lo, hi);
|
|
5721
|
+
}
|
|
5722
|
+
|
|
5448
5723
|
HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
|
|
5449
5724
|
const auto lo = Mul(a, b);
|
|
5450
5725
|
const auto hi = MulHigh(a, b);
|
|
5451
5726
|
return detail::InterleaveEven(lo, hi);
|
|
5452
5727
|
}
|
|
5453
5728
|
|
|
5729
|
+
HWY_API svint64_t MulOdd(const svint64_t a, const svint64_t b) {
|
|
5730
|
+
const auto lo = Mul(a, b);
|
|
5731
|
+
const auto hi = MulHigh(a, b);
|
|
5732
|
+
return detail::InterleaveOdd(lo, hi);
|
|
5733
|
+
}
|
|
5734
|
+
|
|
5454
5735
|
HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
|
|
5455
5736
|
const auto lo = Mul(a, b);
|
|
5456
5737
|
const auto hi = MulHigh(a, b);
|
|
@@ -5460,23 +5741,14 @@ HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
|
|
|
5460
5741
|
// ------------------------------ WidenMulPairwiseAdd
|
|
5461
5742
|
|
|
5462
5743
|
template <size_t N, int kPow2>
|
|
5463
|
-
HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2>
|
|
5744
|
+
HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df, VBF16 a,
|
|
5464
5745
|
VBF16 b) {
|
|
5465
|
-
#if
|
|
5466
|
-
const svfloat32_t even = svbfmlalb_f32(Zero(
|
|
5746
|
+
#if HWY_SVE_HAVE_F32_TO_BF16C
|
|
5747
|
+
const svfloat32_t even = svbfmlalb_f32(Zero(df), a, b);
|
|
5467
5748
|
return svbfmlalt_f32(even, a, b);
|
|
5468
5749
|
#else
|
|
5469
|
-
|
|
5470
|
-
|
|
5471
|
-
// RearrangeToOddPlusEven prefers.
|
|
5472
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
5473
|
-
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
5474
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
5475
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
5476
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
5477
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
5478
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be),
|
|
5479
|
-
Mul(BitCast(df32, ao), BitCast(df32, bo)));
|
|
5750
|
+
return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
|
|
5751
|
+
Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
|
|
5480
5752
|
#endif // HWY_SVE_HAVE_F32_TO_BF16C
|
|
5481
5753
|
}
|
|
5482
5754
|
|
|
@@ -5487,14 +5759,8 @@ HWY_API svint32_t WidenMulPairwiseAdd(Simd<int32_t, N, kPow2> d32, svint16_t a,
|
|
|
5487
5759
|
(void)d32;
|
|
5488
5760
|
return svmlalt_s32(svmullb_s32(a, b), a, b);
|
|
5489
5761
|
#else
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
// Fortunately SVE has sign-extension for the even lanes.
|
|
5493
|
-
const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a));
|
|
5494
|
-
const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
|
|
5495
|
-
const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
|
|
5496
|
-
const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
|
|
5497
|
-
return svmla_s32_x(pg, svmul_s32_x(pg, ao, bo), ae, be);
|
|
5762
|
+
return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
|
|
5763
|
+
Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
|
|
5498
5764
|
#endif
|
|
5499
5765
|
}
|
|
5500
5766
|
|
|
@@ -5505,43 +5771,59 @@ HWY_API svuint32_t WidenMulPairwiseAdd(Simd<uint32_t, N, kPow2> d32,
|
|
|
5505
5771
|
(void)d32;
|
|
5506
5772
|
return svmlalt_u32(svmullb_u32(a, b), a, b);
|
|
5507
5773
|
#else
|
|
5508
|
-
|
|
5509
|
-
|
|
5510
|
-
// Fortunately SVE has sign-extension for the even lanes.
|
|
5511
|
-
const svuint32_t ae = svexth_u32_x(pg, BitCast(d32, a));
|
|
5512
|
-
const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b));
|
|
5513
|
-
const svuint32_t ao = ShiftRight<16>(BitCast(d32, a));
|
|
5514
|
-
const svuint32_t bo = ShiftRight<16>(BitCast(d32, b));
|
|
5515
|
-
return svmla_u32_x(pg, svmul_u32_x(pg, ao, bo), ae, be);
|
|
5774
|
+
return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
|
|
5775
|
+
Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
|
|
5516
5776
|
#endif
|
|
5517
5777
|
}
|
|
5518
5778
|
|
|
5779
|
+
// ------------------------------ SatWidenMulAccumFixedPoint
|
|
5780
|
+
|
|
5781
|
+
#if HWY_SVE_HAVE_2
|
|
5782
|
+
|
|
5783
|
+
#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
|
|
5784
|
+
#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
|
|
5785
|
+
#else
|
|
5786
|
+
#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
|
|
5787
|
+
#endif
|
|
5788
|
+
|
|
5789
|
+
template <class DI32, HWY_IF_I32_D(DI32)>
|
|
5790
|
+
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
|
|
5791
|
+
VFromD<Rebind<int16_t, DI32>> a,
|
|
5792
|
+
VFromD<Rebind<int16_t, DI32>> b,
|
|
5793
|
+
VFromD<DI32> sum) {
|
|
5794
|
+
return svqdmlalb_s32(sum, detail::ZipLowerSame(a, a),
|
|
5795
|
+
detail::ZipLowerSame(b, b));
|
|
5796
|
+
}
|
|
5797
|
+
|
|
5798
|
+
#endif // HWY_SVE_HAVE_2
|
|
5799
|
+
|
|
5519
5800
|
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
|
5520
5801
|
|
|
5521
|
-
template <size_t N, int kPow2>
|
|
5522
|
-
HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
|
|
5523
|
-
VBF16 a, VBF16 b,
|
|
5524
|
-
const svfloat32_t sum0,
|
|
5525
|
-
svfloat32_t& sum1) {
|
|
5526
5802
|
#if HWY_SVE_HAVE_BF16_FEATURE
|
|
5527
|
-
|
|
5528
|
-
|
|
5529
|
-
|
|
5803
|
+
|
|
5804
|
+
// NOTE: we currently do not use SVE BFDOT for bf16 ReorderWidenMulAccumulate
|
|
5805
|
+
// because, apparently unlike NEON, it uses round to odd unless the additional
|
|
5806
|
+
// FEAT_EBF16 feature is available and enabled.
|
|
5807
|
+
#ifdef HWY_NATIVE_MUL_EVEN_BF16
|
|
5808
|
+
#undef HWY_NATIVE_MUL_EVEN_BF16
|
|
5530
5809
|
#else
|
|
5531
|
-
|
|
5532
|
-
|
|
5533
|
-
|
|
5534
|
-
|
|
5535
|
-
|
|
5536
|
-
|
|
5537
|
-
|
|
5538
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
5539
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
5540
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
5541
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
5542
|
-
#endif // HWY_SVE_HAVE_BF16_FEATURE
|
|
5810
|
+
#define HWY_NATIVE_MUL_EVEN_BF16
|
|
5811
|
+
#endif
|
|
5812
|
+
|
|
5813
|
+
template <size_t N, int kPow2>
|
|
5814
|
+
HWY_API svfloat32_t MulEvenAdd(Simd<float, N, kPow2> /* d */, VBF16 a, VBF16 b,
|
|
5815
|
+
const svfloat32_t c) {
|
|
5816
|
+
return svbfmlalb_f32(c, a, b);
|
|
5543
5817
|
}
|
|
5544
5818
|
|
|
5819
|
+
template <size_t N, int kPow2>
|
|
5820
|
+
HWY_API svfloat32_t MulOddAdd(Simd<float, N, kPow2> /* d */, VBF16 a, VBF16 b,
|
|
5821
|
+
const svfloat32_t c) {
|
|
5822
|
+
return svbfmlalt_f32(c, a, b);
|
|
5823
|
+
}
|
|
5824
|
+
|
|
5825
|
+
#endif // HWY_SVE_HAVE_BF16_FEATURE
|
|
5826
|
+
|
|
5545
5827
|
template <size_t N, int kPow2>
|
|
5546
5828
|
HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
|
|
5547
5829
|
svint16_t a, svint16_t b,
|
|
@@ -5552,15 +5834,10 @@ HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
|
|
|
5552
5834
|
sum1 = svmlalt_s32(sum1, a, b);
|
|
5553
5835
|
return svmlalb_s32(sum0, a, b);
|
|
5554
5836
|
#else
|
|
5555
|
-
|
|
5556
|
-
//
|
|
5557
|
-
|
|
5558
|
-
|
|
5559
|
-
const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
|
|
5560
|
-
const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
|
|
5561
|
-
const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
|
|
5562
|
-
sum1 = svmla_s32_x(pg, sum1, ao, bo);
|
|
5563
|
-
return svmla_s32_x(pg, sum0, ae, be);
|
|
5837
|
+
// Lane order within sum0/1 is undefined, hence we can avoid the
|
|
5838
|
+
// longer-latency lane-crossing PromoteTo by using PromoteEvenTo/PromoteOddTo.
|
|
5839
|
+
sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
|
|
5840
|
+
return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
|
|
5564
5841
|
#endif
|
|
5565
5842
|
}
|
|
5566
5843
|
|
|
@@ -5574,15 +5851,10 @@ HWY_API svuint32_t ReorderWidenMulAccumulate(Simd<uint32_t, N, kPow2> d32,
|
|
|
5574
5851
|
sum1 = svmlalt_u32(sum1, a, b);
|
|
5575
5852
|
return svmlalb_u32(sum0, a, b);
|
|
5576
5853
|
#else
|
|
5577
|
-
|
|
5578
|
-
//
|
|
5579
|
-
|
|
5580
|
-
|
|
5581
|
-
const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b));
|
|
5582
|
-
const svuint32_t ao = ShiftRight<16>(BitCast(d32, a));
|
|
5583
|
-
const svuint32_t bo = ShiftRight<16>(BitCast(d32, b));
|
|
5584
|
-
sum1 = svmla_u32_x(pg, sum1, ao, bo);
|
|
5585
|
-
return svmla_u32_x(pg, sum0, ae, be);
|
|
5854
|
+
// Lane order within sum0/1 is undefined, hence we can avoid the
|
|
5855
|
+
// longer-latency lane-crossing PromoteTo by using PromoteEvenTo/PromoteOddTo.
|
|
5856
|
+
sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
|
|
5857
|
+
return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
|
|
5586
5858
|
#endif
|
|
5587
5859
|
}
|
|
5588
5860
|
|