@img/sharp-libvips-dev 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/parser.h +16 -7
- package/include/libxml2/libxml/xmlIO.h +0 -1
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/package.json +1 -1
- package/versions.json +11 -11
package/include/hwy/ops/rvv-inl.h +1043 -311

@@ -339,8 +339,11 @@ namespace detail { // for code folding
 // Full support for f16 in all ops
 #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
 HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
+// Only BF16 is emulated.
+#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
 #else
 #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
+#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
 #endif
 #define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
 HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
@@ -389,15 +392,11 @@ namespace detail { // for code folding
 // For all combinations of SEW:
 #define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
 HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
-
-HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
-HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
+HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
 
 #define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
 HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
-
-HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
-HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
+HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
 
 #define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
 HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
@@ -409,8 +408,7 @@ namespace detail { // for code folding
 HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
 
 #define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
-
-HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
+HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
 HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
 
 // Assemble types for use in x-macros
@@ -480,18 +478,12 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
 
 HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
 HWY_RVV_FOREACH(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
-// If not already defined via HWY_RVV_FOREACH, define the overloads because
-// they do not require any new instruction.
-#if !HWY_HAVE_FLOAT16
-HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
-HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
-#endif
 #undef HWY_RVV_LANES
 #undef HWY_RVV_LANES_VIRT
 
-template <
-HWY_API size_t Lanes(
-return Lanes(
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API size_t Lanes(D /* tag*/) {
+return Lanes(RebindToUnsigned<D>());
 }
 
 // ------------------------------ Common x-macros
@@ -525,10 +517,20 @@ HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag*/) {
 HWY_RVV_AVL(SEW, SHIFT)); \
 }
 
+// vector = f(vector, mask, vector, vector), e.g. MaskedAddOr
+#define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+SHIFT, MLEN, NAME, OP) \
+HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
+HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
+return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(m, no, a, b, \
+HWY_RVV_AVL(SEW, SHIFT)); \
+}
+
 // mask = f(mask)
-#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP)
-HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) {
-return __riscv_vm##OP##_m_b##MLEN(m,
+#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
+HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
+return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT)); \
 }
 
 // ================================================== INIT
@@ -550,20 +552,18 @@ HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
 // Treat bfloat16_t as int16_t (using the previously defined Set overloads);
 // required for Zero and VFromD.
 template <size_t N, int kPow2>
-decltype(Set(Simd<int16_t, N, kPow2>(), 0)) Set(
-
-return Set(RebindToSigned<decltype(d)>(), arg
+decltype(Set(Simd<int16_t, N, kPow2>(), 0)) Set(
+Simd<hwy::bfloat16_t, N, kPow2> d, hwy::bfloat16_t arg) {
+return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
 }
 #if !HWY_HAVE_FLOAT16 // Otherwise already defined above.
 // WARNING: returns a different type than emulated bfloat16_t so that we can
 // implement PromoteTo overloads for both bfloat16_t and float16_t, and also
-// provide a Neg(float16_t) overload that coexists with Neg(int16_t).
+// provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
 template <size_t N, int kPow2>
-decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(
-
-uint16_t
-CopySameSize(&arg, &bits);
-return Set(RebindToUnsigned<decltype(d)>(), bits);
+decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(
+Simd<hwy::float16_t, N, kPow2> d, hwy::float16_t arg) {
+return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
 }
 #endif
 
@@ -642,16 +642,7 @@ HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
 HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
 #undef HWY_RVV_EXT_VIRT
 
-
-template <class D, HWY_IF_F16_D(D)>
-VFromD<D> Ext(D d, VFromD<Half<D>> v) {
-const RebindToUnsigned<decltype(d)> du;
-const Half<decltype(du)> duh;
-return BitCast(d, Ext(du, BitCast(duh, v)));
-}
-#endif
-
-template <class D, HWY_IF_BF16_D(D)>
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
 VFromD<D> Ext(D d, VFromD<Half<D>> v) {
 const RebindToUnsigned<decltype(d)> du;
 const Half<decltype(du)> duh;
@@ -769,7 +760,7 @@ HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
 #else
 template <size_t N, int kPow2>
 HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
-Simd<float16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
+Simd<hwy::float16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
 return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
 }
 #endif
@@ -783,7 +774,8 @@ HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
 
 template <size_t N, int kPow2>
 HWY_INLINE VFromD<Simd<int16_t, N, kPow2>> BitCastFromByte(
-Simd<bfloat16_t, N, kPow2> /* d */,
+Simd<hwy::bfloat16_t, N, kPow2> /* d */,
+VFromD<Simd<uint8_t, N, kPow2>> v) {
 return BitCastFromByte(Simd<int16_t, N, kPow2>(), v);
 }
 
@@ -1048,7 +1040,7 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)
 #undef HWY_RVV_SHIFT
 
 // ------------------------------ SumsOf8 (ShiftRight, Add)
-template <class VU8>
+template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
 HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
 const DFromV<VU8> du8;
 const RepartitionToWide<decltype(du8)> du16;
@@ -1071,6 +1063,31 @@ HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
 return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
 }
 
+template <class VI8, HWY_IF_I8_D(DFromV<VI8>)>
+HWY_API VFromD<Repartition<int64_t, DFromV<VI8>>> SumsOf8(const VI8 v) {
+const DFromV<VI8> di8;
+const RepartitionToWide<decltype(di8)> di16;
+const RepartitionToWide<decltype(di16)> di32;
+const RepartitionToWide<decltype(di32)> di64;
+const RebindToUnsigned<decltype(di32)> du32;
+const RebindToUnsigned<decltype(di64)> du64;
+using VI16 = VFromD<decltype(di16)>;
+
+const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
+const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
+const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
+
+const VI16 sDC_zz_98_zz_54_zz_10_zz =
+BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
+const VI16 sFC_xx_B8_xx_74_xx_30_xx =
+Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
+const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
+BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
+const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
+Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
+return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
+}
+
 // ------------------------------ RotateRight
 template <int kBits, class V>
 HWY_API V RotateRight(const V v) {
@@ -1184,8 +1201,57 @@ HWY_RVV_FOREACH_I16(HWY_RVV_MUL15, MulFixedPoint15, smul, _ALL)
 #undef HWY_RVV_MUL15
 
 // ------------------------------ Div
+#ifdef HWY_NATIVE_INT_DIV
+#undef HWY_NATIVE_INT_DIV
+#else
+#define HWY_NATIVE_INT_DIV
+#endif
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Div, divu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Div, div, _ALL)
 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)
 
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Mod, remu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Mod, rem, _ALL)
+
+// ------------------------------ MaskedAddOr etc.
+
+#ifdef HWY_NATIVE_MASKED_ARITH
+#undef HWY_NATIVE_MASKED_ARITH
+#else
+#define HWY_NATIVE_MASKED_ARITH
+#endif
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMinOr, minu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMinOr, min, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMinOr, fmin, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, maxu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, max, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, fmax, _ALL)
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedAddOr, add, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedAddOr, fadd, _ALL)
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedSubOr, sub, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedSubOr, fsub, _ALL)
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedMulOr, mul, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMulOr, fmul, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedDivOr, divu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedDivOr, div, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedDivOr, fdiv, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedModOr, remu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedModOr, rem, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, saddu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, sadd, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssubu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssub, _ALL)
+
 // ------------------------------ ApproximateReciprocal
 #ifdef HWY_NATIVE_F64_APPROX_RECIP
 #undef HWY_NATIVE_F64_APPROX_RECIP
@@ -1247,26 +1313,6 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
 // vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
 // of all bits; SEW=8 / LMUL=4 = half of all bits.
 
-// SFINAE for mapping Simd<> to MLEN (up to 64).
-#define HWY_RVV_IF_MLEN_D(D, MLEN) \
-hwy::EnableIf<MLenFromD(D()) == MLEN>* = nullptr
-
-// Specialized for RVV instead of the generic test_util-inl.h implementation
-// because more efficient, and helps implement MFromD.
-
-#define HWY_RVV_MASK_FALSE(SEW, SHIFT, MLEN, NAME, OP) \
-template <class D, HWY_RVV_IF_MLEN_D(D, MLEN)> \
-HWY_API HWY_RVV_M(MLEN) NAME(D d) { \
-return __riscv_vm##OP##_m_b##MLEN(Lanes(d)); \
-}
-
-HWY_RVV_FOREACH_B(HWY_RVV_MASK_FALSE, MaskFalse, clr)
-#undef HWY_RVV_MASK_FALSE
-#undef HWY_RVV_IF_MLEN_D
-
-template <class D>
-using MFromD = decltype(MaskFalse(D()));
-
 // mask = f(vector, vector)
 #define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
 SHIFT, MLEN, NAME, OP) \
@@ -1405,11 +1451,32 @@ HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
 #undef HWY_RVV_IF_THEN_ZERO_ELSE
 
 // ------------------------------ MaskFromVec
+
+template <class D>
+using MFromD = decltype(Eq(Zero(D()), Zero(D())));
+
 template <class V>
 HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
 return detail::NeS(v, 0);
 }
 
+// ------------------------------ MaskFalse
+
+// For mask ops including vmclr, elements past VL are tail-agnostic and cannot
+// be relied upon, so define a variant of the generic_ops-inl implementation of
+// MaskFalse that ensures all bits are zero as required by mask_test.
+#ifdef HWY_NATIVE_MASK_FALSE
+#undef HWY_NATIVE_MASK_FALSE
+#else
+#define HWY_NATIVE_MASK_FALSE
+#endif
+
+template <class D>
+HWY_API MFromD<D> MaskFalse(D d) {
+const DFromV<VFromD<decltype(d)>> d_full;
+return MaskFromVec(Zero(d_full));
+}
+
 // ------------------------------ RebindMask
 template <class D, typename MFrom>
 HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
@@ -1427,10 +1494,12 @@ HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
 template <size_t N> \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) { \
-
+/* MaskFalse requires we set all lanes for capped d and virtual LMUL. */ \
+const DFromV<VFromD<decltype(d)>> d_full; \
+const RebindToSigned<decltype(d_full)> di; \
 using TI = TFromD<decltype(di)>; \
-return BitCast(
-
+return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, \
+Lanes(d_full))); \
 }
 
 HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, VecFromMask, merge_vxm, _ALL_VIRT)
@@ -1518,6 +1587,38 @@ HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
 HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
 #undef HWY_RVV_COUNT_TRUE
 
+// ------------------------------ PromoteMaskTo
+
+#ifdef HWY_NATIVE_PROMOTE_MASK_TO
+#undef HWY_NATIVE_PROMOTE_MASK_TO
+#else
+#define HWY_NATIVE_PROMOTE_MASK_TO
+#endif
+
+template <class DTo, class DFrom,
+HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
+hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
+HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+MFromD<DFrom> m) {
+return m;
+}
+
+// ------------------------------ DemoteMaskTo
+
+#ifdef HWY_NATIVE_DEMOTE_MASK_TO
+#undef HWY_NATIVE_DEMOTE_MASK_TO
+#else
+#define HWY_NATIVE_DEMOTE_MASK_TO
+#endif
+
+template <class DTo, class DFrom,
+HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
+hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
+HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+MFromD<DFrom> m) {
+return m;
+}
+
 // ================================================== MEMORY
 
 // ------------------------------ Load
@@ -1528,47 +1629,18 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
-
+detail::NativeLanePointer(p), Lanes(d)); \
 }
 HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
 #undef HWY_RVV_LOAD
 
-
-
-
-
-return Load(RebindToSigned<decltype(d)>(),
-reinterpret_cast<const int16_t * HWY_RESTRICT>(p));
-}
-
-template <size_t N, int kPow2>
-HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,
-Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
-Store(v, RebindToSigned<decltype(d)>(),
-reinterpret_cast<int16_t * HWY_RESTRICT>(p));
-}
-
-#if !HWY_HAVE_FLOAT16 // Otherwise already defined above.
-
-// NOTE: different type for float16_t than bfloat16_t, see Set().
-template <size_t N, int kPow2>
-HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(Simd<float16_t, N, kPow2> d,
-const float16_t* HWY_RESTRICT p) {
-return Load(RebindToUnsigned<decltype(d)>(),
-reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
-}
-
-template <size_t N, int kPow2>
-HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
-Simd<float16_t, N, kPow2> d, float16_t* HWY_RESTRICT p) {
-Store(v, RebindToUnsigned<decltype(d)>(),
-reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
+const RebindToUnsigned<decltype(d)> du;
+return BitCast(d, Load(du, detail::U16LanePointer(p)));
 }
 
-#endif // !HWY_HAVE_FLOAT16
-
 // ------------------------------ LoadU
 template <class D>
 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
@@ -1584,23 +1656,37 @@ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
 NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
-m, Zero(d),
+m, Zero(d), detail::NativeLanePointer(p), Lanes(d)); \
 } \
 template <size_t N> \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
 NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
-m, v,
+m, v, detail::NativeLanePointer(p), Lanes(d)); \
 }
 
 HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
 #undef HWY_RVV_MASKED_LOAD
 
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
+const TFromD<D>* HWY_RESTRICT p) {
+const RebindToUnsigned<decltype(d)> du;
+return BitCast(d,
+MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
+}
+
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> MaskedLoadOr(VFromD<D> no, MFromD<D> m, D d,
+const TFromD<D>* HWY_RESTRICT p) {
+const RebindToUnsigned<decltype(d)> du;
+return BitCast(d, MaskedLoadOr(BitCast(du, no), RebindMask(du, m), du,
+detail::U16LanePointer(p)));
+}
+
 // ------------------------------ LoadN
 
 // Native with avl is faster than the generic_ops using FirstN.
@@ -1616,29 +1702,41 @@ HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
-using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
 /* Use a tail-undisturbed load in LoadN as the tail-undisturbed load */ \
 /* operation below will leave any lanes past the first */ \
 /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes unchanged */ \
 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
-Zero(d),
+Zero(d), detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
 } \
 template <size_t N> \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME##Or( \
 HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
-using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
 /* Use a tail-undisturbed load in LoadNOr as the tail-undisturbed load */ \
 /* operation below will set any lanes past the first */ \
 /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes to the */ \
 /* corresponding lanes in no */ \
 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
-no,
+no, detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
 }
 
 HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
 #undef HWY_RVV_LOADN
 
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
+size_t num_lanes) {
+const RebindToUnsigned<D> du;
+return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
+}
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> LoadNOr(VFromD<D> v, D d, const TFromD<D>* HWY_RESTRICT p,
+size_t num_lanes) {
+const RebindToUnsigned<D> du;
+return BitCast(
+d, LoadNOr(BitCast(du, v), du, detail::U16LanePointer(p), num_lanes));
+}
+
 // ------------------------------ Store
 
 #define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
@@ -1647,13 +1745,18 @@ HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-
-
-v, Lanes(d)); \
+return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
+detail::NativeLanePointer(p), v, Lanes(d)); \
 }
 HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
 #undef HWY_RVV_STORE
 
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
+const RebindToUnsigned<decltype(d)> du;
+Store(BitCast(du, v), du, detail::U16LanePointer(p));
+}
+
 // ------------------------------ BlendedStore
 
 #define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
@@ -1662,13 +1765,20 @@ HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m( \
-m,
+m, detail::NativeLanePointer(p), v, Lanes(d)); \
 }
 HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
 #undef HWY_RVV_BLENDED_STORE
 
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
+TFromD<D>* HWY_RESTRICT p) {
+const RebindToUnsigned<decltype(d)> du;
+BlendedStore(BitCast(du, v), RebindMask(du, m), du,
+detail::U16LanePointer(p));
+}
+
 // ------------------------------ StoreN
 
 namespace detail {
@@ -1679,13 +1789,18 @@ namespace detail {
 HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
 HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-
-
-v, count); \
+return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
+detail::NativeLanePointer(p), v, count); \
 }
 HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
 #undef HWY_RVV_STOREN
 
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API void StoreN(size_t count, VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
+const RebindToUnsigned<decltype(d)> du;
+StoreN(count, BitCast(du, v), du, detail::U16LanePointer(p));
+}
+
 } // namespace detail
 
 #ifdef HWY_NATIVE_STORE_N
@@ -1694,9 +1809,8 @@ HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
 #define HWY_NATIVE_STORE_N
 #endif
 
-template <class D
-
-HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
+template <class D>
+HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
 size_t max_lanes_to_store) {
 // NOTE: Need to call Lanes(d) and clamp max_lanes_to_store to Lanes(d), even
 // if MaxLanes(d) >= MaxLanes(DFromV<VFromD<D>>()) is true, as it is possible
@@ -1713,19 +1827,6 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
 detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d, p);
 }
 
-// StoreN for BF16/F16 vectors
-template <class D, typename T = TFromD<D>,
-hwy::EnableIf<!hwy::IsSame<T, TFromV<VFromD<D>>>()>* = nullptr,
-HWY_IF_SPECIAL_FLOAT(T)>
-HWY_API void StoreN(VFromD<D> v, D /*d*/, T* HWY_RESTRICT p,
-size_t max_lanes_to_store) {
-using TStore = TFromV<VFromD<D>>;
-const Rebind<TStore, D> d_store;
-const size_t N = Lanes(d_store);
-detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d_store,
-reinterpret_cast<TStore * HWY_RESTRICT>(p));
-}
-
 // ------------------------------ StoreU
 template <class V, class D>
 HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
@@ -1747,17 +1848,16 @@ HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
 #define HWY_NATIVE_SCATTER
 #endif
 
-#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,
-SHIFT, MLEN, NAME, OP)
-template <size_t N>
-HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,
-HWY_RVV_D(BASE, SEW, N, SHIFT) d,
-HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,
-HWY_RVV_V(int, SEW, LMUL) offset) {
-const RebindToUnsigned<decltype(d)> du;
-
-
-reinterpret_cast<T*>(base), BitCast(du, offset), v, Lanes(d)); \
+#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+SHIFT, MLEN, NAME, OP) \
+template <size_t N> \
+HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
+HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
+HWY_RVV_V(int, SEW, LMUL) offset) { \
+const RebindToUnsigned<decltype(d)> du; \
+return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
+detail::NativeLanePointer(base), BitCast(du, offset), v, Lanes(d)); \
 }
 HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
 #undef HWY_RVV_SCATTER
@@ -1772,19 +1872,18 @@ HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
 
 // ------------------------------ MaskedScatterIndex
 
-#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD,
-LMULH, SHIFT, MLEN, NAME, OP)
-template <size_t N>
-HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m,
-HWY_RVV_D(BASE, SEW, N, SHIFT) d,
-HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,
-HWY_RVV_V(int, SEW, LMUL) indices) {
-const RebindToUnsigned<decltype(d)> du;
-
-
-
-
-v, Lanes(d)); \
+#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+LMULH, SHIFT, MLEN, NAME, OP) \
+template <size_t N> \
+HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
+HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
+HWY_RVV_V(int, SEW, LMUL) indices) { \
+const RebindToUnsigned<decltype(d)> du; \
+constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>)); \
+return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \
+m, detail::NativeLanePointer(base), \
+ShiftLeft<kBits>(BitCast(du, indices)), v, Lanes(d)); \
 }
 HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
 #undef HWY_RVV_MASKED_SCATTER
@@ -1805,9 +1904,8 @@ HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
 HWY_RVV_V(int, SEW, LMUL) offset) { \
 const RebindToUnsigned<decltype(d)> du; \
-using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
-
+detail::NativeLanePointer(base), BitCast(du, offset), Lanes(d)); \
 }
 HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
 #undef HWY_RVV_GATHER
@@ -1821,25 +1919,34 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
 return GatherOffset(d, base, ShiftLeft<kBits>(index));
 }
 
-// ------------------------------
+// ------------------------------ MaskedGatherIndexOr
 
 #define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
 SHIFT, MLEN, NAME, OP) \
 template <size_t N> \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
-NAME(
+NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
+HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
 HWY_RVV_V(int, SEW, LMUL) indices) { \
 const RebindToUnsigned<decltype(d)> du; \
-
+const RebindToSigned<decltype(d)> di; \
+(void)di; /* for HWY_DASSERT */ \
 constexpr size_t kBits = CeilLog2(SEW / 8); \
+HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_mu( \
-m,
+m, no, detail::NativeLanePointer(base), \
 ShiftLeft<kBits>(BitCast(du, indices)), Lanes(d)); \
 }
-HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER,
+HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER, MaskedGatherIndexOr, lux, _ALL_VIRT)
 #undef HWY_RVV_MASKED_GATHER
 
+template <class D>
+HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d, const TFromD<D>* base,
+VFromD<RebindToSigned<D>> indices) {
+return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
+}
+
 // ================================================== CONVERT
 
 // ------------------------------ PromoteTo
@@ -1996,7 +2103,7 @@ HWY_API auto PromoteTo(Simd<int64_t, N, kPow2> d,
|
|
|
1996
2103
|
|
|
1997
2104
|
template <size_t N, int kPow2>
|
|
1998
2105
|
HWY_API auto PromoteTo(Simd<float32_t, N, kPow2> d,
|
|
1999
|
-
VFromD<Rebind<bfloat16_t, decltype(d)>> v)
|
|
2106
|
+
VFromD<Rebind<hwy::bfloat16_t, decltype(d)>> v)
|
|
2000
2107
|
-> VFromD<decltype(d)> {
|
|
2001
2108
|
const RebindToSigned<decltype(d)> di32;
|
|
2002
2109
|
const Rebind<uint16_t, decltype(d)> du16;
|
|
@@ -2633,8 +2740,8 @@ HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_TO_SHR_16, DemoteToShr16, nclipu_wx_,
|
|
|
2633
2740
|
#undef HWY_RVV_DEMOTE_TO_SHR_16
|
|
2634
2741
|
|
|
2635
2742
|
template <size_t N, int kPow2>
|
|
2636
|
-
HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> DemoteTo(
|
|
2637
|
-
Simd<bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
|
|
2743
|
+
HWY_API VFromD<Simd<hwy::bfloat16_t, N, kPow2>> DemoteTo(
|
|
2744
|
+
Simd<hwy::bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
|
|
2638
2745
|
const RebindToUnsigned<decltype(d)> du16;
|
|
2639
2746
|
const Rebind<uint32_t, decltype(d)> du32;
|
|
2640
2747
|
return BitCast(d, detail::DemoteToShr16(du16, BitCast(du32, v)));
|
|
@@ -2918,9 +3025,10 @@ HWY_RVV_FOREACH_B(HWY_RVV_SET_AT_OR_AFTER_FIRST, _, _)
|
|
|
2918
3025
|
|
|
2919
3026
|
// ------------------------------ InsertLane
|
|
2920
3027
|
|
|
2921
|
-
template
|
|
2922
|
-
|
|
2923
|
-
|
|
3028
|
+
// T template arg because TFromV<V> might not match the hwy::float16_t argument.
|
|
3029
|
+
template <class V, typename T, HWY_IF_NOT_T_SIZE_V(V, 1)>
|
|
3030
|
+
HWY_API V InsertLane(const V v, size_t i, T t) {
|
|
3031
|
+
const Rebind<T, DFromV<V>> d;
|
|
2924
3032
|
const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
|
|
2925
3033
|
using TU = TFromD<decltype(du)>;
|
|
2926
3034
|
const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
|
|
@@ -2928,9 +3036,9 @@ HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
|
|
|
2928
3036
|
}
|
|
2929
3037
|
|
|
2930
3038
|
// For 8-bit lanes, Iota0 might overflow.
|
|
2931
|
-
template <class V, HWY_IF_T_SIZE_V(V, 1)>
|
|
2932
|
-
HWY_API V InsertLane(const V v, size_t i,
|
|
2933
|
-
const DFromV<V
|
|
3039
|
+
template <class V, typename T, HWY_IF_T_SIZE_V(V, 1)>
|
|
3040
|
+
HWY_API V InsertLane(const V v, size_t i, T t) {
|
|
3041
|
+
const Rebind<T, DFromV<V>> d;
|
|
2934
3042
|
const auto zero = Zero(d);
|
|
2935
3043
|
const auto one = Set(d, 1);
|
|
2936
3044
|
const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
|
|
@@ -3034,9 +3142,6 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
|
|
|
3034
3142
|
return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
|
|
3035
3143
|
}
|
|
3036
3144
|
|
|
3037
|
-
// TODO(janwas): avoid using this for 8-bit; wrap in detail namespace.
|
|
3038
|
-
// For large 8-bit vectors, index overflow will lead to incorrect results.
|
|
3039
|
-
// Reverse already uses TableLookupLanes16 to prevent this.
|
|
3040
3145
|
#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
|
|
3041
3146
|
MLEN, NAME, OP) \
|
|
3042
3147
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
@@ -3045,12 +3150,14 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
|
|
|
3045
3150
|
HWY_RVV_AVL(SEW, SHIFT)); \
|
|
3046
3151
|
}
|
|
3047
3152
|
|
|
3153
|
+
// TableLookupLanes is supported for all types, but beware that indices are
|
|
3154
|
+
// likely to wrap around for 8-bit lanes. When using TableLookupLanes inside
|
|
3155
|
+
// this file, ensure that it is safe or use TableLookupLanes16 instead.
|
|
3048
3156
|
HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
|
|
3049
3157
|
#undef HWY_RVV_TABLE
|
|
3050
3158
|
|
|
3051
3159
|
namespace detail {
|
|
3052
3160
|
|
|
3053
|
-
// Used by I8/U8 Reverse
|
|
3054
3161
|
#define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
|
|
3055
3162
|
SHIFT, MLEN, NAME, OP) \
|
|
3056
3163
|
HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
|
|
@@ -3122,6 +3229,67 @@ HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
|
|
|
3122
3229
|
return TableLookupLanes(v, idx);
|
|
3123
3230
|
}
|
|
3124
3231
|
|
|
3232
|
+
// ------------------------------ ResizeBitCast
|
|
3233
|
+
|
|
3234
|
+
// Extends or truncates a vector to match the given d.
|
|
3235
|
+
namespace detail {
|
|
3236
|
+
|
|
3237
|
+
template <class D>
|
|
3238
|
+
HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
|
|
3239
|
+
return v;
|
|
3240
|
+
}
|
|
3241
|
+
|
|
3242
|
+
// Sanity check: when calling ChangeLMUL, the caller (ResizeBitCast) already
|
|
3243
|
+
// BitCast to the same lane type. Note that V may use the native lane type for
|
|
3244
|
+
// f16, so convert D to that before checking.
|
|
3245
|
+
#define HWY_RVV_IF_SAME_T_DV(D, V) \
|
|
3246
|
+
hwy::EnableIf<IsSame<NativeLaneType<TFromD<D>>, TFromV<V>>()>* = nullptr
|
|
3247
|
+
|
|
3248
|
+
// LMUL of VFromD<D> < LMUL of V: need to truncate v
|
|
3249
|
+
template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
|
|
3250
|
+
HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
|
|
3251
|
+
HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
|
|
3252
|
+
const DFromV<V> d_from;
|
|
3253
|
+
const Half<decltype(d_from)> dh_from;
|
|
3254
|
+
static_assert(
|
|
3255
|
+
DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
|
|
3256
|
+
"The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
|
|
3257
|
+
static_assert(
|
|
3258
|
+
DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
|
|
3259
|
+
"The LMUL of VFromD<D> must be less than or equal to the LMUL of "
|
|
3260
|
+
"VFromD<decltype(dh_from)>");
|
|
3261
|
+
return ChangeLMUL(d, Trunc(v));
|
|
3262
|
+
}
|
|
3263
|
+
|
|
3264
|
+
// LMUL of VFromD<D> > LMUL of V: need to extend v
|
|
3265
|
+
template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
|
|
3266
|
+
HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
|
|
3267
|
+
HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
|
|
3268
|
+
const DFromV<V> d_from;
|
|
3269
|
+
const Twice<decltype(d_from)> dt_from;
|
|
3270
|
+
static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
|
|
3271
|
+
"The LMUL of VFromD<decltype(dt_from)> must be greater than "
|
|
3272
|
+
"the LMUL of V");
|
|
3273
|
+
static_assert(
|
|
3274
|
+
DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
|
|
3275
|
+
"The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
|
|
3276
|
+
"VFromD<decltype(dt_from)>");
|
|
3277
|
+
return ChangeLMUL(d, Ext(dt_from, v));
|
|
3278
|
+
}
|
|
3279
|
+
|
|
3280
|
+
#undef HWY_RVV_IF_SAME_T_DV
|
|
3281
|
+
|
|
3282
|
+
} // namespace detail
|
|
3283
|
+
|
|
3284
|
+
template <class DTo, class VFrom>
|
|
3285
|
+
HWY_API VFromD<DTo> ResizeBitCast(DTo /*dto*/, VFrom v) {
|
|
3286
|
+
const DFromV<decltype(v)> d_from;
|
|
3287
|
+
const Repartition<uint8_t, decltype(d_from)> du8_from;
|
|
3288
|
+
const DFromV<VFromD<DTo>> d_to;
|
|
3289
|
+
const Repartition<uint8_t, decltype(d_to)> du8_to;
|
|
3290
|
+
return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
|
|
3291
|
+
}
|
|
3292
|
+
|
|
3125
3293
|
// ------------------------------ Reverse2 (RotateRight, OddEven)
|
|
3126
3294
|
|
|
3127
3295
|
// Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
|
|
@@ -3307,7 +3475,7 @@ template <class V, class M, class D>
|
|
|
3307
3475
|
HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
|
|
3308
3476
|
TFromD<D>* HWY_RESTRICT unaligned) {
|
|
3309
3477
|
const size_t count = CountTrue(d, mask);
|
|
3310
|
-
|
|
3478
|
+
StoreN(Compress(v, mask), d, unaligned, count);
|
|
3311
3479
|
return count;
|
|
3312
3480
|
}
|
|
3313
3481
|
|
|
@@ -3483,50 +3651,6 @@ HWY_API V Shuffle0123(const V v) {
|
|
|
3483
3651
|
|
|
3484
3652
|
// ------------------------------ TableLookupBytes
|
|
3485
3653
|
|
|
3486
|
-
// Extends or truncates a vector to match the given d.
|
|
3487
|
-
namespace detail {
|
|
3488
|
-
|
|
3489
|
-
template <class D>
|
|
3490
|
-
HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
|
|
3491
|
-
return v;
|
|
3492
|
-
}
|
|
3493
|
-
|
|
3494
|
-
// LMUL of VFromD<D> < LMUL of V: need to truncate v
|
|
3495
|
-
template <class D, class V,
|
|
3496
|
-
hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
|
|
3497
|
-
HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
|
|
3498
|
-
HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
|
|
3499
|
-
const DFromV<decltype(v)> d_from;
|
|
3500
|
-
const Half<decltype(d_from)> dh_from;
|
|
3501
|
-
static_assert(
|
|
3502
|
-
DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
|
|
3503
|
-
"The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
|
|
3504
|
-
static_assert(
|
|
3505
|
-
DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
|
|
3506
|
-
"The LMUL of VFromD<D> must be less than or equal to the LMUL of "
|
|
3507
|
-
"VFromD<decltype(dh_from)>");
|
|
3508
|
-
return ChangeLMUL(d, Trunc(v));
|
|
3509
|
-
}
|
|
3510
|
-
|
|
3511
|
-
// LMUL of VFromD<D> > LMUL of V: need to extend v
|
|
3512
|
-
template <class D, class V,
|
|
3513
|
-
hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
|
|
3514
|
-
HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
|
|
3515
|
-
HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
|
|
3516
|
-
const DFromV<decltype(v)> d_from;
|
|
3517
|
-
const Twice<decltype(d_from)> dt_from;
|
|
3518
|
-
static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
|
|
3519
|
-
"The LMUL of VFromD<decltype(dt_from)> must be greater than "
|
|
3520
|
-
"the LMUL of V");
|
|
3521
|
-
static_assert(
|
|
3522
|
-
DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
|
|
3523
|
-
"The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
|
|
3524
|
-
"VFromD<decltype(dt_from)>");
|
|
3525
|
-
return ChangeLMUL(d, Ext(dt_from, v));
|
|
3526
|
-
}
|
|
3527
|
-
|
|
3528
|
-
} // namespace detail
|
|
3529
|
-
|
|
3530
3654
|
template <class VT, class VI>
|
|
3531
3655
|
HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
|
|
3532
3656
|
const DFromV<VT> dt; // T=table, I=index.
|
|
@@ -3563,7 +3687,8 @@ HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {
|
|
|
3563
3687
|
|
|
3564
3688
|
// ------------------------------ TwoTablesLookupLanes
|
|
3565
3689
|
|
|
3566
|
-
//
|
|
3690
|
+
// WARNING: 8-bit lanes may lead to unexpected results because idx is the same
|
|
3691
|
+
// size and may overflow.
|
|
3567
3692
|
template <class D, HWY_IF_POW2_LE_D(D, 2)>
|
|
3568
3693
|
HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
|
|
3569
3694
|
VFromD<RebindToUnsigned<D>> idx) {
|
|
@@ -3597,11 +3722,47 @@ HWY_API V TwoTablesLookupLanes(V a, V b,
|
|
|
3597
3722
|
}
|
|
3598
3723
|
|
|
3599
3724
|
// ------------------------------ Broadcast
|
|
3600
|
-
|
|
3725
|
+
|
|
3726
|
+
// 8-bit requires 16-bit tables.
|
|
3727
|
+
template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
|
|
3728
|
+
HWY_IF_POW2_LE_D(D, 2)>
|
|
3729
|
+
HWY_API V Broadcast(const V v) {
|
|
3730
|
+
HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
|
|
3731
|
+
const D d;
|
|
3732
|
+
const Rebind<uint16_t, decltype(d)> du16;
|
|
3733
|
+
VFromD<decltype(du16)> idx =
|
|
3734
|
+
detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
|
|
3735
|
+
if (kLane != 0) {
|
|
3736
|
+
idx = detail::AddS(idx, kLane);
|
|
3737
|
+
}
|
|
3738
|
+
return detail::TableLookupLanes16(v, idx);
|
|
3739
|
+
}
|
|
3740
|
+
|
|
3741
|
+
// 8-bit and max LMUL: split into halves.
|
|
3742
|
+
template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
|
|
3743
|
+
HWY_IF_POW2_GT_D(D, 2)>
|
|
3601
3744
|
HWY_API V Broadcast(const V v) {
|
|
3602
|
-
const DFromV<V> d;
|
|
3603
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
3604
3745
|
HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
|
|
3746
|
+
const D d;
|
|
3747
|
+
const Half<decltype(d)> dh;
|
|
3748
|
+
using VH = VFromD<decltype(dh)>;
|
|
3749
|
+
const Rebind<uint16_t, decltype(dh)> du16;
|
|
3750
|
+
VFromD<decltype(du16)> idx =
|
|
3751
|
+
detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
|
|
3752
|
+
if (kLane != 0) {
|
|
3753
|
+
idx = detail::AddS(idx, kLane);
|
|
3754
|
+
}
|
|
3755
|
+
const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
|
|
3756
|
+
const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
|
|
3757
|
+
return Combine(d, lo, hi);
|
|
3758
|
+
}
|
|
3759
|
+
|
|
3760
|
+
template <int kLane, class V, class D = DFromV<V>,
|
|
3761
|
+
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
|
|
3762
|
+
HWY_API V Broadcast(const V v) {
|
|
3763
|
+
HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
|
|
3764
|
+
const D d;
|
|
3765
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
3605
3766
|
auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du));
|
|
3606
3767
|
if (kLane != 0) {
|
|
3607
3768
|
idx = detail::AddS(idx, kLane);
|
|
@@ -3778,20 +3939,194 @@ HWY_API V ShiftRightBytes(const D d, const V v) {
   return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
 }
 
-// ------------------------------
+// ------------------------------ InterleaveWholeLower
+#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
+#undef HWY_NATIVE_INTERLEAVE_WHOLE
+#else
+#define HWY_NATIVE_INTERLEAVE_WHOLE
+#endif
 
-
+namespace detail {
+// Returns double-length vector with interleaved lanes.
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
+          HWY_IF_POW2_GT_D(D, -3)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  using TW = MakeWide<TFromD<decltype(du)>>;
+  const Rebind<TW, Half<decltype(du)>> dw;
+  const Half<decltype(du)> duh;  // cast inputs to unsigned so we zero-extend
+
+  const VFromD<decltype(dw)> aw = PromoteTo(dw, BitCast(duh, a));
+  const VFromD<decltype(dw)> bw = PromoteTo(dw, BitCast(duh, b));
+  return BitCast(d, Or(aw, BitCast(dw, detail::Slide1Up(BitCast(du, bw)))));
+}
+// 64-bit: cannot PromoteTo, but can Ext.
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  const auto idx = ShiftRight<1>(detail::Iota0(du));
+  return OddEven(TableLookupLanes(detail::Ext(d, b), idx),
+                 TableLookupLanes(detail::Ext(d, a), idx));
+}
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_GT_D(D, 2)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+  const Half<D> dh;
+  const Half<decltype(dh)> dq;
+  const VFromD<decltype(dh)> i0 =
+      InterleaveWhole(dh, LowerHalf(dq, a), LowerHalf(dq, b));
+  const VFromD<decltype(dh)> i1 =
+      InterleaveWhole(dh, UpperHalf(dq, a), UpperHalf(dq, b));
+  return Combine(d, i1, i0);
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  const detail::AdjustSimdTagToMinVecPow2<RepartitionToWide<decltype(du)>> dw;
+  const RepartitionToNarrow<decltype(dw)> du_src;
+
+  const VFromD<D> aw =
+      ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, a)));
+  const VFromD<D> bw =
+      ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, b)));
+  return Or(aw, detail::Slide1Up(bw));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  const auto idx = ShiftRight<1>(detail::Iota0(du));
+  return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
+}
+
+// ------------------------------ InterleaveWholeUpper
+
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
+  // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
+  // true and as the results of InterleaveWholeUpper are
+  // implementation-defined if Lanes(d) is less than 2.
+  const size_t half_N = Lanes(d) / 2;
+  return InterleaveWholeLower(d, detail::SlideDown(a, half_N),
+                              detail::SlideDown(b, half_N));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
+  // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
+  // true and as the results of InterleaveWholeUpper are implementation-defined
+  // if Lanes(d) is less than 2.
+  const size_t half_N = Lanes(d) / 2;
+  const RebindToUnsigned<decltype(d)> du;
+  const auto idx = detail::AddS(ShiftRight<1>(detail::Iota0(du)),
+                                static_cast<uint64_t>(half_N));
+  return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
+}
+
+// ------------------------------ InterleaveLower (InterleaveWholeLower)
+
+namespace detail {
+
+// Definitely at least 128 bit: match x86 semantics (independent blocks). Using
+// InterleaveWhole and 64-bit Compress avoids 8-bit overflow.
+template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
+HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
+  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+  const Twice<D> dt;
+  const RebindToUnsigned<decltype(dt)> dt_u;
+  const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
+  // Keep only even 128-bit blocks. This is faster than u64 ConcatEven
+  // because we only have a single vector.
+  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
+  const VFromD<decltype(dt_u)> idx_block =
+      ShiftRight<kShift>(detail::Iota0(dt_u));
+  const MFromD<decltype(dt_u)> is_even =
+      detail::EqS(detail::AndS(idx_block, 1), 0);
+  return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_even)));
+}
+template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
+HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
+  const Half<D> dh;
+  const VFromD<decltype(dh)> i0 =
+      InterleaveLowerBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
+  const VFromD<decltype(dh)> i1 =
+      InterleaveLowerBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
+  return Combine(d, i1, i0);
+}
+
+// As above, for the upper half of blocks.
+template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
+HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
+  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+  const Twice<D> dt;
+  const RebindToUnsigned<decltype(dt)> dt_u;
+  const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
+  // Keep only odd 128-bit blocks. This is faster than u64 ConcatEven
+  // because we only have a single vector.
+  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
+  const VFromD<decltype(dt_u)> idx_block =
+      ShiftRight<kShift>(detail::Iota0(dt_u));
+  const MFromD<decltype(dt_u)> is_odd =
+      detail::EqS(detail::AndS(idx_block, 1), 1);
+  return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_odd)));
+}
+template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
+HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
+  const Half<D> dh;
+  const VFromD<decltype(dh)> i0 =
+      InterleaveUpperBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
+  const VFromD<decltype(dh)> i1 =
+      InterleaveUpperBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
+  return Combine(d, i1, i0);
+}
+
+// RVV vectors are at least 128 bit when there is no fractional LMUL nor cap.
+// Used by functions with per-block behavior such as InterleaveLower.
+template <typename T, size_t N, int kPow2>
+constexpr bool IsGE128(Simd<T, N, kPow2> /* d */) {
+  return N * sizeof(T) >= 16 && kPow2 >= 0;
+}
+
+// Definitely less than 128-bit only if there is a small cap; fractional LMUL
+// might not be enough if vectors are large.
+template <typename T, size_t N, int kPow2>
+constexpr bool IsLT128(Simd<T, N, kPow2> /* d */) {
+  return N * sizeof(T) < 16;
+}
+
+}  // namespace detail
+
+#define HWY_RVV_IF_GE128_D(D) hwy::EnableIf<detail::IsGE128(D())>* = nullptr
+#define HWY_RVV_IF_LT128_D(D) hwy::EnableIf<detail::IsLT128(D())>* = nullptr
+#define HWY_RVV_IF_CAN128_D(D) \
+  hwy::EnableIf<!detail::IsLT128(D()) && !detail::IsGE128(D())>* = nullptr
+
+template <class D, class V, HWY_RVV_IF_GE128_D(D)>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+  return detail::InterleaveLowerBlocks(d, a, b);
+}
+
+// Single block: interleave without extra Compress.
+template <class D, class V, HWY_RVV_IF_LT128_D(D)>
 HWY_API V InterleaveLower(D d, const V a, const V b) {
   static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-
-
-
-
-
-
-
-
-
+  return InterleaveWholeLower(d, a, b);
+}
+
+// Could be either; branch at runtime.
+template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+  if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
+    return InterleaveWholeLower(d, a, b);
+  }
+  // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
+  const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
+  return ResizeBitCast(d, detail::InterleaveLowerBlocks(
+                              d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
 }
 
 template <class V>
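For reference, a minimal usage sketch of the whole-vector interleave ops added in this hunk. It is not part of the diff; it assumes the public Highway API (hwy/highway.h, static dispatch) and omits the foreach_target boilerplate:

    #include <cstdint>
    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    // With a = [a0 a1 a2 ...] and b = [b0 b1 b2 ...], InterleaveWholeLower
    // interleaves the lower halves of a and b across the entire vector
    // ([a0 b0 a1 b1 ...]), ignoring 128-bit block boundaries;
    // InterleaveWholeUpper does the same for the upper halves.
    void InterleaveU32(const uint32_t* HWY_RESTRICT a,
                       const uint32_t* HWY_RESTRICT b,
                       uint32_t* HWY_RESTRICT out) {
      const hn::ScalableTag<uint32_t> d;
      const auto va = hn::LoadU(d, a);
      const auto vb = hn::LoadU(d, b);
      hn::StoreU(hn::InterleaveWholeLower(d, va, vb), d, out);
      hn::StoreU(hn::InterleaveWholeUpper(d, va, vb), d, out + hn::Lanes(d));
    }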
@@ -3799,21 +4134,30 @@ HWY_API V InterleaveLower(const V a, const V b) {
   return InterleaveLower(DFromV<V>(), a, b);
 }
 
-// ------------------------------ InterleaveUpper
+// ------------------------------ InterleaveUpper (Compress)
 
-template <class D, class V>
-HWY_API V InterleaveUpper(
+template <class D, class V, HWY_RVV_IF_GE128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+  return detail::InterleaveUpperBlocks(d, a, b);
+}
+
+// Single block: interleave without extra Compress.
+template <class D, class V, HWY_RVV_IF_LT128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
   static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-
-
-
-
-
-
-
-
-
-
+  return InterleaveWholeUpper(d, a, b);
+}
+
+// Could be either; branch at runtime.
+template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+  if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
+    return InterleaveWholeUpper(d, a, b);
+  }
+  // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
+  const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
+  return ResizeBitCast(d, detail::InterleaveUpperBlocks(
+                              d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
 }
 
 // ------------------------------ ZipLower
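To illustrate the per-block semantics these new InterleaveLower/InterleaveUpper overloads implement (again not part of the diff; a sketch against the public Highway API):

    #include <cstdint>
    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    // Within each 128-bit block of four u32 lanes, with
    //   a = [a0 a1 a2 a3] and b = [b0 b1 b2 b3]:
    //   InterleaveLower -> [a0 b0 a1 b1], InterleaveUpper -> [a2 b2 a3 b3],
    // matching the x86 behavior regardless of the runtime vector length.
    void InterleaveBlocks(const uint32_t* HWY_RESTRICT a,
                          const uint32_t* HWY_RESTRICT b,
                          uint32_t* HWY_RESTRICT lo, uint32_t* HWY_RESTRICT hi) {
      const hn::ScalableTag<uint32_t> d;
      const auto va = hn::LoadU(d, a);
      const auto vb = hn::LoadU(d, b);
      hn::StoreU(hn::InterleaveLower(d, va, vb), d, lo);
      hn::StoreU(hn::InterleaveUpper(d, va, vb), d, hi);
    }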
@@ -3840,67 +4184,98 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
 
 // ================================================== REDUCE
 
-//
+// We have ReduceSum, generic_ops-inl.h defines SumOfLanes via Set.
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+
+// scalar = f(vector, zero_m1)
 #define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                        MLEN, NAME, OP)                                         \
-  template <
-  HWY_API
-  NAME(
-
-
-
+  template <size_t N>                                                          \
+  HWY_API HWY_RVV_T(BASE, SEW)                                                 \
+      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v,     \
+           HWY_RVV_V(BASE, SEW, m1) v0) {                                      \
+    return GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(     \
+        v, v0, Lanes(d)));                                                     \
   }
 
-//
+// detail::RedSum, detail::RedMin, and detail::RedMax are more efficient
+// for N=4 I8/U8 reductions on RVV than the default implementations of the
+// N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in generic_ops-inl.h
+#undef HWY_IF_REDUCE_D
+#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
+
+#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+#undef HWY_NATIVE_REDUCE_SUM_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_SUM_4_UI8
+#endif
+
+#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#endif
+
+// ------------------------------ ReduceSum
 
 namespace detail {
-HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum,
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum,
+HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL_VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL_VIRT)
 }  // namespace detail
 
-template <class D>
-HWY_API
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
   const auto v0 = Zero(ScalableTag<TFromD<D>>());  // always m1
   return detail::RedSum(d, v, v0);
 }
 
-
-HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
-  return GetLane(SumOfLanes(d, v));
-}
-
-// ------------------------------ MinOfLanes
+// ------------------------------ ReduceMin
 namespace detail {
-HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu,
-HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin,
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin,
+HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL_VIRT)
+HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL_VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL_VIRT)
 }  // namespace detail
 
-template <class D>
-HWY_API
-  using T = TFromD<D>;
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMin(D d, const VFromD<D> v) {
   const ScalableTag<T> d1;  // always m1
-
-  return detail::RedMin(d, v, neutral);
+  return detail::RedMin(d, v, Set(d1, HighestValue<T>()));
 }
 
-// ------------------------------
+// ------------------------------ ReduceMax
 namespace detail {
-HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu,
-HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax,
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax,
+HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL_VIRT)
+HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL_VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL_VIRT)
 }  // namespace detail
 
-template <class D>
-HWY_API
-  using T = TFromD<D>;
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMax(D d, const VFromD<D> v) {
   const ScalableTag<T> d1;  // always m1
-
-  return detail::RedMax(d, v, neutral);
+  return detail::RedMax(d, v, Set(d1, LowestValue<T>()));
 }
 
 #undef HWY_RVV_REDUCE
 
+// ------------------------------ SumOfLanes
+
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceSum(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMin(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMax(d, v));
+}
+
 // ================================================== Ops with dependencies
 
 // ------------------------------ LoadInterleaved2
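As a usage reference for the scalar reductions this hunk makes native on RVV (illustrative sketch only, public Highway API, static dispatch; assumes count is a multiple of the vector length to stay short):

    #include <cstddef>
    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    // Accumulates lane-wise in a vector, then collapses to a scalar once.
    // ReduceSum returns a scalar; SumOfLanes would broadcast it to all lanes.
    float SumArray(const float* HWY_RESTRICT p, size_t count) {
      const hn::ScalableTag<float> d;
      auto acc = hn::Zero(d);
      for (size_t i = 0; i < count; i += hn::Lanes(d)) {
        acc = hn::Add(acc, hn::LoadU(d, p + i));
      }
      return hn::ReduceSum(d, acc);
    }

ReduceMin and ReduceMax follow the same pattern, seeded with the highest/lowest representable value rather than zero.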
@@ -4229,15 +4604,87 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
 
 #endif  // HWY_HAVE_TUPLE
 
-// ------------------------------ ResizeBitCast
+// ------------------------------ Dup128VecFromValues (ResizeBitCast)
 
-template <class D,
-HWY_API VFromD<D>
-
-
-
-
-
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
+  return Set(d, t0);
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
+  const auto even_lanes = Set(d, t0);
+#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
+  if (__builtin_constant_p(BitCastScalar<uint64_t>(t0) ==
+                           BitCastScalar<uint64_t>(t1)) &&
+      (BitCastScalar<uint64_t>(t0) == BitCastScalar<uint64_t>(t1))) {
+    return even_lanes;
+  }
+#endif
+
+  const auto odd_lanes = Set(d, t1);
+  return OddEven(odd_lanes, even_lanes);
+}
+
+namespace detail {
+
+#pragma pack(push, 1)
+
+template <class T>
+struct alignas(8) Vec64ValsWrapper {
+  static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
+  static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
+  T vals[8 / sizeof(T)];
+};
+
+#pragma pack(pop)
+
+}  // namespace detail
+
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
+  return ResizeBitCast(
+      d, Dup128VecFromValues(
+             du64,
+             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
+                 {t0, t1, t2, t3, t4, t5, t6, t7}}),
+             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
+                 {t8, t9, t10, t11, t12, t13, t14, t15}})));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
+  return ResizeBitCast(
+      d, Dup128VecFromValues(
+             du64,
+             BitCastScalar<uint64_t>(
+                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}}),
+             BitCastScalar<uint64_t>(
+                 detail::Vec64ValsWrapper<TFromD<D>>{{t4, t5, t6, t7}})));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
+  return ResizeBitCast(
+      d,
+      Dup128VecFromValues(du64,
+                          BitCastScalar<uint64_t>(
+                              detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}}),
+                          BitCastScalar<uint64_t>(
+                              detail::Vec64ValsWrapper<TFromD<D>>{{t2, t3}})));
 }
 
 // ------------------------------ PopulationCount (ShiftRight)
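For reference, how Dup128VecFromValues is typically called from user code (not part of the diff; a sketch against the public Highway API):

    #include <cstdint>
    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    // Returns a vector whose every 128-bit block holds the same four u32
    // constants, i.e. [7, 11, 13, 17, 7, 11, 13, 17, ...] regardless of the
    // runtime vector length.
    hn::Vec<hn::ScalableTag<uint32_t>> RepeatedConstants() {
      const hn::ScalableTag<uint32_t> d;
      return hn::Dup128VecFromValues(d, 7u, 11u, 13u, 17u);
    }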
@@ -4366,6 +4813,278 @@ HWY_API MFromD<D> FirstN(const D d, const size_t n) {
   return Eq(detail::SlideUp(one, zero, n), one);
 }
 
+// ------------------------------ LowerHalfOfMask/UpperHalfOfMask
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+// Target-specific implementations of LowerHalfOfMask, UpperHalfOfMask,
+// CombineMasks, OrderedDemote2MasksTo, and Dup128MaskFromMaskBits are possible
+// on RVV if the __riscv_vreinterpret_v_b*_u8m1 and
+// __riscv_vreinterpret_v_u8m1_b* intrinsics are available.
+
+// The __riscv_vreinterpret_v_b*_u8m1 and __riscv_vreinterpret_v_u8m1_b*
+// intrinsics are available with Clang 17 and later and GCC 14 and later.
+
+namespace detail {
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool1_t m) {
+  return __riscv_vreinterpret_v_b1_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool2_t m) {
+  return __riscv_vreinterpret_v_b2_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool4_t m) {
+  return __riscv_vreinterpret_v_b4_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool8_t m) {
+  return __riscv_vreinterpret_v_b8_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool16_t m) {
+  return __riscv_vreinterpret_v_b16_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool32_t m) {
+  return __riscv_vreinterpret_v_b32_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool64_t m) {
+  return __riscv_vreinterpret_v_b64_u8m1(m);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool1_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b1(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool2_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b2(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool4_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b4(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool8_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b8(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool16_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b16(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool32_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b32(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool64_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b64(v);
+}
+
+}  // namespace detail
+
+#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
+#undef HWY_NATIVE_LOWER_HALF_OF_MASK
+#else
+#define HWY_NATIVE_LOWER_HALF_OF_MASK
+#endif
+
+template <class D>
+HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
+  return detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(m));
+}
+
+#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
+#undef HWY_NATIVE_UPPER_HALF_OF_MASK
+#else
+#define HWY_NATIVE_UPPER_HALF_OF_MASK
+#endif
+
+template <class D>
+HWY_API MFromD<D> UpperHalfOfMask(D d, MFromD<Twice<D>> m) {
+  const size_t N = Lanes(d);
+
+  vuint8m1_t mask_bits = detail::MaskToU8MaskBitsVec(m);
+  mask_bits = ShiftRightSame(mask_bits, static_cast<int>(N & 7));
+  if (HWY_MAX_LANES_D(D) >= 8) {
+    mask_bits = SlideDownLanes(ScalableTag<uint8_t>(), mask_bits, N / 8);
+  }
+
+  return detail::U8MaskBitsVecToMask(d, mask_bits);
+}
+
+// ------------------------------ CombineMasks
+
+#ifdef HWY_NATIVE_COMBINE_MASKS
+#undef HWY_NATIVE_COMBINE_MASKS
+#else
+#define HWY_NATIVE_COMBINE_MASKS
+#endif
+
+template <class D>
+HWY_API MFromD<D> CombineMasks(D d, MFromD<Half<D>> hi, MFromD<Half<D>> lo) {
+  const Half<decltype(d)> dh;
+  const size_t half_N = Lanes(dh);
+
+  const auto ext_lo_mask =
+      And(detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(lo)),
+          FirstN(d, half_N));
+  vuint8m1_t hi_mask_bits = detail::MaskToU8MaskBitsVec(hi);
+  hi_mask_bits = ShiftLeftSame(hi_mask_bits, static_cast<int>(half_N & 7));
+  if (HWY_MAX_LANES_D(D) >= 8) {
+    hi_mask_bits =
+        SlideUpLanes(ScalableTag<uint8_t>(), hi_mask_bits, half_N / 8);
+  }
+
+  return Or(ext_lo_mask, detail::U8MaskBitsVecToMask(d, hi_mask_bits));
+}
+
+// ------------------------------ OrderedDemote2MasksTo
+
+#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#else
+#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#endif
+
+template <class DTo, class DFrom,
+          HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
+          class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
+          hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
+HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
+                                          MFromD<DFrom> a, MFromD<DFrom> b) {
+  return CombineMasks(d_to, b, a);
+}
+
+#endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+// ------------------------------ Dup128MaskFromMaskBits
+
+namespace detail {
+// Even though this is only used after checking if (kN < X), this helper
+// function prevents "shift count exceeded" errors.
+template <size_t kN, HWY_IF_LANES_LE(kN, 31)>
+constexpr unsigned MaxMaskBits() {
+  return (1u << kN) - 1;
+}
+template <size_t kN, HWY_IF_LANES_GT(kN, 31)>
+constexpr unsigned MaxMaskBits() {
+  return ~0u;
+}
+}  // namespace detail
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  return detail::U8MaskBitsVecToMask(
+      d, Set(ScalableTag<uint8_t>(), static_cast<uint8_t>(mask_bits)));
+#else
+  const RebindToUnsigned<decltype(d)> du8;
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
+      du64;
+
+  const auto bytes = ResizeBitCast(
+      du8, detail::AndS(
+               ResizeBitCast(du64, Set(du8, static_cast<uint8_t>(mask_bits))),
+               uint64_t{0x8040201008040201u}));
+  return detail::NeS(bytes, uint8_t{0});
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  const ScalableTag<uint16_t> du16;
+  // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes.
+  return detail::U8MaskBitsVecToMask(
+      d, BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<decltype(d)> du8;
+  const Repartition<uint16_t, decltype(du8)> du16;
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
+      du64;
+
+  // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
+  // and then bitcast the replicated mask_bits to a u8 vector
+  const auto bytes = BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
+  // Replicate bytes 8x such that each byte contains the bit that governs it.
+  const auto rep8 = TableLookupLanes(bytes, ShiftRight<3>(detail::Iota0(du8)));
+
+  const auto masked_out_rep8 = ResizeBitCast(
+      du8,
+      detail::AndS(ResizeBitCast(du64, rep8), uint64_t{0x8040201008040201u}));
+  return detail::NeS(masked_out_rep8, uint8_t{0});
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes.
+  return detail::U8MaskBitsVecToMask(d,
+                                     Set(du8, static_cast<uint8_t>(mask_bits)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits =
+      Shl(Set(du, uint16_t{1}), Iota(du, uint16_t{0}));
+  return TestBit(Set(du, static_cast<uint16_t>(mask_bits)), bits);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  return detail::U8MaskBitsVecToMask(
+      d, Set(du8, static_cast<uint8_t>(mask_bits * 0x11)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits =
+      Shl(Set(du, uint32_t{1}), Iota(du, uint32_t{0}));
+  return TestBit(Set(du, static_cast<uint32_t>(mask_bits)), bits);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  return detail::U8MaskBitsVecToMask(
+      d, Set(du8, static_cast<uint8_t>(mask_bits * 0x55)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 0, 1);
+  return TestBit(Set(du, static_cast<uint64_t>(mask_bits)), bits);
+#endif
+}
+
 // ------------------------------ Neg (Sub)
 
 template <class V, HWY_IF_SIGNED_V(V)>
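A short usage sketch of the new mask ops, in particular Dup128MaskFromMaskBits (not part of the diff; public Highway API, illustrative only):

    #include <cstdint>
    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    // Bit i of the low 4 bits selects lane i within each 128-bit block of u32
    // lanes, so 0b0101 keeps lanes 0 and 2 of every block and zeroes the rest.
    hn::Vec<hn::ScalableTag<uint32_t>> KeepEvenLanesOfBlocks(
        hn::Vec<hn::ScalableTag<uint32_t>> v) {
      const hn::ScalableTag<uint32_t> d;
      const auto m = hn::Dup128MaskFromMaskBits(d, 0b0101u);
      return hn::IfThenElseZero(m, v);
    }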
@@ -4385,7 +5104,7 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
 
 #if !HWY_HAVE_FLOAT16
 
-template <class V, HWY_IF_U16_D(DFromV<V>)>  // float16_t
+template <class V, HWY_IF_U16_D(DFromV<V>)>  // hwy::float16_t
 HWY_API V Neg(V v) {
   const DFromV<decltype(v)> d;
   const RebindToUnsigned<decltype(d)> du;
@@ -4479,6 +5198,14 @@ HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
   return Ne(v, v);
 }
 
+// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
+// We use a fused Set/comparison for IsFinite.
+#ifdef HWY_NATIVE_ISINF
+#undef HWY_NATIVE_ISINF
+#else
+#define HWY_NATIVE_ISINF
+#endif
+
 template <class V, class D = DFromV<V>>
 HWY_API MFromD<D> IsInf(const V v) {
   const D d;
@@ -4507,22 +5234,24 @@ HWY_API MFromD<D> IsFinite(const V v) {
 
 // ------------------------------ Iota (ConvertTo)
 
-template <class D, HWY_IF_UNSIGNED_D(D)>
-HWY_API VFromD<D> Iota(const D d,
-  return detail::AddS(detail::Iota0(d), first);
+template <class D, typename T2, HWY_IF_UNSIGNED_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
+  return detail::AddS(detail::Iota0(d), static_cast<TFromD<D>>(first));
 }
 
-template <class D, HWY_IF_SIGNED_D(D)>
-HWY_API VFromD<D> Iota(const D d,
+template <class D, typename T2, HWY_IF_SIGNED_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
   const RebindToUnsigned<D> du;
-  return detail::AddS(BitCast(d, detail::Iota0(du)),
+  return detail::AddS(BitCast(d, detail::Iota0(du)),
+                      static_cast<TFromD<D>>(first));
 }
 
-template <class D, HWY_IF_FLOAT_D(D)>
-HWY_API VFromD<D> Iota(const D d,
+template <class D, typename T2, HWY_IF_FLOAT_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
   const RebindToUnsigned<D> du;
   const RebindToSigned<D> di;
-  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))),
+  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))),
+                      ConvertScalarTo<TFromD<D>>(first));
 }
 
 // ------------------------------ MulEven/Odd (Mul, OddEven)
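The Iota signatures above now take a separate T2 for the starting value and convert it to the lane type. A minimal sketch of what this enables in user code (illustrative only, public Highway API):

    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    // An int literal is accepted for a float tag and converted per lane:
    // yields [2.0f, 3.0f, 4.0f, ...] up to the runtime vector length.
    hn::Vec<hn::ScalableTag<float>> StartingAtTwo() {
      const hn::ScalableTag<float> d;
      return hn::Iota(d, 2);
    }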
@@ -4561,8 +5290,8 @@ HWY_INLINE V MulOdd(const V a, const V b) {
 // ------------------------------ ReorderDemote2To (OddEven, Combine)
 
 template <size_t N, int kPow2>
-HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> ReorderDemote2To(
-    Simd<bfloat16_t, N, kPow2> dbf16,
+HWY_API VFromD<Simd<hwy::bfloat16_t, N, kPow2>> ReorderDemote2To(
+    Simd<hwy::bfloat16_t, N, kPow2> dbf16,
     VFromD<RepartitionToWide<decltype(dbf16)>> a,
     VFromD<RepartitionToWide<decltype(dbf16)>> b) {
   const RebindToUnsigned<decltype(dbf16)> du16;
@@ -4618,8 +5347,8 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
 }
 
 // If LMUL is not the max, Combine first to avoid another DemoteTo.
-template <class DN,
-          HWY_IF_F32_D(DFromV<V>),
+template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_LE_D(DN, 2),
+          class V, HWY_IF_F32_D(DFromV<V>),
           class V2 = VFromD<Repartition<TFromV<V>, DN>>,
           hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
@@ -4629,8 +5358,8 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
 }
 
 // Max LMUL: must DemoteTo first, then Combine.
-template <class DN,
-          HWY_IF_F32_D(DFromV<V>),
+template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_GT_D(DN, 2),
+          class V, HWY_IF_F32_D(DFromV<V>),
           class V2 = VFromD<Repartition<TFromV<V>, DN>>,
           hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
@@ -4654,7 +5383,7 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
 // ------------------------------ WidenMulPairwiseAdd
 
 template <class D32, HWY_IF_F32_D(D32),
-          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
+          class V16 = VFromD<Repartition<hwy::bfloat16_t, D32>>>
 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
   const RebindToUnsigned<decltype(df32)> du32;
   using VU32 = VFromD<decltype(du32)>;
@@ -4698,7 +5427,7 @@ namespace detail {
 // Non-overloaded wrapper function so we can define DF32 in template args.
 template <size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
           class VF32 = VFromD<DF32>,
-          class DBF16 = Repartition<bfloat16_t, Simd<float, N, kPow2>>>
+          class DBF16 = Repartition<hwy::bfloat16_t, Simd<float, N, kPow2>>>
 HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
                                            VFromD<DBF16> a, VFromD<DBF16> b,
                                            const VF32 sum0, VF32& sum1) {
@@ -4994,7 +5723,6 @@ HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
 }
 
 // ================================================== END MACROS
-namespace detail {  // for code folding
 #undef HWY_RVV_AVL
 #undef HWY_RVV_D
 #undef HWY_RVV_FOREACH
@@ -5055,15 +5783,19 @@ namespace detail {  // for code folding
 #undef HWY_RVV_FOREACH_UI32
 #undef HWY_RVV_FOREACH_UI3264
 #undef HWY_RVV_FOREACH_UI64
+#undef HWY_RVV_IF_EMULATED_D
+#undef HWY_RVV_IF_CAN128_D
+#undef HWY_RVV_IF_GE128_D
+#undef HWY_RVV_IF_LT128_D
 #undef HWY_RVV_INSERT_VXRM
 #undef HWY_RVV_M
 #undef HWY_RVV_RETM_ARGM
+#undef HWY_RVV_RETV_ARGMVV
 #undef HWY_RVV_RETV_ARGV
 #undef HWY_RVV_RETV_ARGVS
 #undef HWY_RVV_RETV_ARGVV
 #undef HWY_RVV_T
 #undef HWY_RVV_V
-}  // namespace detail
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy