@img/sharp-libvips-dev 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
|
@@ -436,22 +436,134 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
|
|
|
436
436
|
// ------------------------------ Lanes
|
|
437
437
|
|
|
438
438
|
// WARNING: we want to query VLMAX/sizeof(T), but this may actually change VL!
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
/* If
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
}
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
439
|
+
|
|
440
|
+
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
|
|
441
|
+
// HWY_RVV_CAPPED_LANES_SPECIAL_CASES provides some additional optimizations
|
|
442
|
+
// to CappedLanes in non-debug builds
|
|
443
|
+
#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
|
|
444
|
+
if (__builtin_constant_p(cap >= kMaxLanes) && (cap >= kMaxLanes)) { \
|
|
445
|
+
/* If cap is known to be greater than or equal to MaxLanes(d), */ \
|
|
446
|
+
/* HWY_MIN(cap, Lanes(d)) will be equal to Lanes(d) */ \
|
|
447
|
+
return Lanes(d); \
|
|
448
|
+
} \
|
|
449
|
+
\
|
|
450
|
+
if ((__builtin_constant_p((cap & (cap - 1)) == 0) && \
|
|
451
|
+
((cap & (cap - 1)) == 0)) || \
|
|
452
|
+
(__builtin_constant_p(cap <= HWY_MAX(kMinLanesPerFullVec, 4)) && \
|
|
453
|
+
(cap <= HWY_MAX(kMinLanesPerFullVec, 4)))) { \
|
|
454
|
+
/* If cap is known to be a power of 2, then */ \
|
|
455
|
+
/* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
|
|
456
|
+
/* result as HWY_MIN(cap, Lanes(d)) as kMaxLanes is a power of 2 and */ \
|
|
457
|
+
/* as (cap > VLMAX && cap < 2 * VLMAX) can only be true if cap is not a */ \
|
|
458
|
+
/* power of 2 since VLMAX is always a power of 2 */ \
|
|
459
|
+
\
|
|
460
|
+
/* If cap is known to be less than or equal to 4, then */ \
|
|
461
|
+
/* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
|
|
462
|
+
/* result as HWY_MIN(cap, Lanes(d)) as HWY_MIN(cap, kMaxLanes) <= 4 is */ \
|
|
463
|
+
/* true if cap <= 4 and as vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
|
|
464
|
+
/* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) */ \
|
|
465
|
+
/* if HWY_MIN(cap, kMaxLanes) <= 4 is true */ \
|
|
466
|
+
\
|
|
467
|
+
/* If cap is known to be less than or equal to kMinLanesPerFullVec, */ \
|
|
468
|
+
/* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
|
|
469
|
+
/* same result as HWY_MIN(cap, Lanes(d)) as */ \
|
|
470
|
+
/* HWY_MIN(cap, kMaxLanes) <= kMinLanesPerFullVec is true if */ \
|
|
471
|
+
/* cap <= kMinLanesPerFullVec is true */ \
|
|
472
|
+
\
|
|
473
|
+
/* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then either */ \
|
|
474
|
+
/* cap <= 4 or cap <= kMinLanesPerFullVec must be true */ \
|
|
475
|
+
\
|
|
476
|
+
/* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is known to be true, */ \
|
|
477
|
+
/* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
|
|
478
|
+
/* same result as HWY_MIN(cap, Lanes(d)) */ \
|
|
479
|
+
\
|
|
480
|
+
/* If no cap, avoid the HWY_MIN. */ \
|
|
481
|
+
return detail::IsFull(d) \
|
|
482
|
+
? __riscv_vsetvl_e##SEW##LMUL(cap) \
|
|
483
|
+
: __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
|
|
484
|
+
}
|
|
485
|
+
#else
|
|
486
|
+
#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL)
|
|
487
|
+
#endif
|
|
488
|
+
|
|
489
|
+
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
|
|
490
|
+
MLEN, NAME, OP) \
|
|
491
|
+
template <size_t N> \
|
|
492
|
+
HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
|
|
493
|
+
constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW)); \
|
|
494
|
+
constexpr size_t kCap = MaxLanes(d); \
|
|
495
|
+
/* If no cap, avoid generating a constant by using VLMAX. */ \
|
|
496
|
+
return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL() \
|
|
497
|
+
: __riscv_vsetvl_e##SEW##LMUL(kCap); \
|
|
498
|
+
} \
|
|
499
|
+
template <size_t N> \
|
|
500
|
+
HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
|
|
501
|
+
/* NOTE: Section 6.3 of the RVV specification, which can be found at */ \
|
|
502
|
+
/* https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc, */ \
|
|
503
|
+
/* allows vsetvl to return a result less than Lanes(d) but greater than */ \
|
|
504
|
+
/* or equal to ((cap + 1) / 2) if */ \
|
|
505
|
+
/* (Lanes(d) > 2 && cap > HWY_MAX(Lanes(d), 4) && cap < (2 * Lanes(d))) */ \
|
|
506
|
+
/* is true */ \
|
|
507
|
+
\
|
|
508
|
+
/* VLMAX is the number of lanes in a vector of type */ \
|
|
509
|
+
/* VFromD<decltype(d)>, which is returned by */ \
|
|
510
|
+
/* Lanes(DFromV<VFromD<decltype(d)>>()) */ \
|
|
511
|
+
\
|
|
512
|
+
/* VLMAX is guaranteed to be a power of 2 under Section 2 of the RVV */ \
|
|
513
|
+
/* specification */ \
|
|
514
|
+
\
|
|
515
|
+
/* The VLMAX of a vector of type VFromD<decltype(d)> is at least 2 as */ \
|
|
516
|
+
/* the HWY_RVV target requires support for the RVV Zvl128b extension, */ \
|
|
517
|
+
/* which guarantees that vectors with LMUL=1 are at least 16 bytes */ \
|
|
518
|
+
\
|
|
519
|
+
/* If VLMAX == 2 is true, then vsetvl(cap) is equal to HWY_MIN(cap, 2) */ \
|
|
520
|
+
/* as cap == 3 is the only value such that */ \
|
|
521
|
+
/* (cap > VLMAX && cap < 2 * VLMAX) if VLMAX == 2 and as */ \
|
|
522
|
+
/* ((3 + 1) / 2) is equal to 2 */ \
|
|
523
|
+
\
|
|
524
|
+
/* If cap <= 4 is true, then vsetvl(cap) must be equal to */ \
|
|
525
|
+
/* HWY_MIN(cap, VLMAX) as cap <= VLMAX is true if VLMAX >= 4 is true */ \
|
|
526
|
+
/* and as vsetvl(cap) is guaranteed to be equal to HWY_MIN(cap, VLMAX) */ \
|
|
527
|
+
/* if VLMAX == 2 */ \
|
|
528
|
+
\
|
|
529
|
+
/* We want CappedLanes(d, cap) to return Lanes(d) if cap > Lanes(d) as */ \
|
|
530
|
+
/* LoadN(d, p, cap) expects to load exactly HWY_MIN(cap, Lanes(d)) */ \
|
|
531
|
+
/* lanes and StoreN(v, d, p, cap) expects to store exactly */ \
|
|
532
|
+
/* HWY_MIN(cap, Lanes(d)) lanes, even in the case where vsetvl returns */ \
|
|
533
|
+
/* a result that is less than HWY_MIN(cap, Lanes(d)) */ \
|
|
534
|
+
\
|
|
535
|
+
/* kMinLanesPerFullVec is the minimum value of VLMAX for a vector of */ \
|
|
536
|
+
/* type VFromD<decltype(d)> */ \
|
|
537
|
+
constexpr size_t kMinLanesPerFullVec = \
|
|
538
|
+
detail::ScaleByPower(16 / (SEW / 8), SHIFT); \
|
|
539
|
+
/* kMaxLanes is the maximum number of lanes returned by Lanes(d) */ \
|
|
540
|
+
constexpr size_t kMaxLanes = MaxLanes(d); \
|
|
541
|
+
\
|
|
542
|
+
HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
|
|
543
|
+
\
|
|
544
|
+
if (kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4)) { \
|
|
545
|
+
/* If kMaxLanes <= kMinLanesPerFullVec is true, then */ \
|
|
546
|
+
/* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return */ \
|
|
547
|
+
/* HWY_MIN(cap, Lanes(d)) as */ \
|
|
548
|
+
/* HWY_MIN(cap, kMaxLanes) <= kMaxLanes <= VLMAX is true if */ \
|
|
549
|
+
/* kMaxLanes <= kMinLanesPerFullVec is true */ \
|
|
550
|
+
\
|
|
551
|
+
/* If kMaxLanes <= 4 is true, then vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
|
|
552
|
+
/* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) as */ \
|
|
553
|
+
/* HWY_MIN(cap, kMaxLanes) <= 4 is true if kMaxLanes <= 4 is true */ \
|
|
554
|
+
\
|
|
555
|
+
/* If kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then */ \
|
|
556
|
+
/* either kMaxLanes <= 4 or kMaxLanes <= kMinLanesPerFullVec must be */ \
|
|
557
|
+
/* true */ \
|
|
558
|
+
\
|
|
559
|
+
return __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
|
|
560
|
+
} else { \
|
|
561
|
+
/* If kMaxLanes > HWY_MAX(kMinLanesPerFullVec, 4) is true, need to */ \
|
|
562
|
+
/* obtain the actual number of lanes using Lanes(d) and clamp cap to */ \
|
|
563
|
+
/* the result of Lanes(d) */ \
|
|
564
|
+
const size_t actual = Lanes(d); \
|
|
565
|
+
return HWY_MIN(actual, cap); \
|
|
566
|
+
} \
|
|
455
567
|
}
|
|
456
568
|
|
|
457
569
|
#define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
|
|
@@ -480,12 +592,18 @@ HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
|
|
|
480
592
|
HWY_RVV_FOREACH(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
|
|
481
593
|
#undef HWY_RVV_LANES
|
|
482
594
|
#undef HWY_RVV_LANES_VIRT
|
|
595
|
+
#undef HWY_RVV_CAPPED_LANES_SPECIAL_CASES
|
|
483
596
|
|
|
484
597
|
template <class D, HWY_RVV_IF_EMULATED_D(D)>
|
|
485
598
|
HWY_API size_t Lanes(D /* tag*/) {
|
|
486
599
|
return Lanes(RebindToUnsigned<D>());
|
|
487
600
|
}
|
|
488
601
|
|
|
602
|
+
template <class D, HWY_RVV_IF_EMULATED_D(D)>
|
|
603
|
+
HWY_API size_t CappedLanes(D /* tag*/, size_t cap) {
|
|
604
|
+
return CappedLanes(RebindToUnsigned<D>(), cap);
|
|
605
|
+
}
|
|
606
|
+
|
|
489
607
|
// ------------------------------ Common x-macros
|
|
490
608
|
|
|
491
609
|
// Last argument to most intrinsics. Use when the op has no d arg of its own,
|
|
@@ -551,18 +669,16 @@ HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
|
|
|
551
669
|
|
|
552
670
|
// Treat bfloat16_t as int16_t (using the previously defined Set overloads);
|
|
553
671
|
// required for Zero and VFromD.
|
|
554
|
-
template <
|
|
555
|
-
decltype(Set(
|
|
556
|
-
Simd<hwy::bfloat16_t, N, kPow2> d, hwy::bfloat16_t arg) {
|
|
672
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
673
|
+
decltype(Set(RebindToSigned<D>(), 0)) Set(D d, hwy::bfloat16_t arg) {
|
|
557
674
|
return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
|
|
558
675
|
}
|
|
559
676
|
#if !HWY_HAVE_FLOAT16 // Otherwise already defined above.
|
|
560
677
|
// WARNING: returns a different type than emulated bfloat16_t so that we can
|
|
561
678
|
// implement PromoteTo overloads for both bfloat16_t and float16_t, and also
|
|
562
679
|
// provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
|
|
563
|
-
template <
|
|
564
|
-
decltype(Set(
|
|
565
|
-
Simd<hwy::float16_t, N, kPow2> d, hwy::float16_t arg) {
|
|
680
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
681
|
+
decltype(Set(RebindToUnsigned<D>(), 0)) Set(D d, hwy::float16_t arg) {
|
|
566
682
|
return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
|
|
567
683
|
}
|
|
568
684
|
#endif
|
|
@@ -758,10 +874,10 @@ HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
|
|
|
758
874
|
HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
|
|
759
875
|
HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
|
|
760
876
|
#else
|
|
761
|
-
template <
|
|
762
|
-
HWY_INLINE VFromD<
|
|
763
|
-
|
|
764
|
-
return BitCastFromByte(
|
|
877
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
878
|
+
HWY_INLINE VFromD<RebindToUnsigned<D>> BitCastFromByte(
|
|
879
|
+
D /* d */, VFromD<Repartition<uint8_t, D>> v) {
|
|
880
|
+
return BitCastFromByte(RebindToUnsigned<D>(), v);
|
|
765
881
|
}
|
|
766
882
|
#endif
|
|
767
883
|
|
|
@@ -772,11 +888,10 @@ HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
|
|
|
772
888
|
#undef HWY_RVV_CAST_VIRT_U
|
|
773
889
|
#undef HWY_RVV_CAST_VIRT_IF
|
|
774
890
|
|
|
775
|
-
template <
|
|
776
|
-
HWY_INLINE VFromD<
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
return BitCastFromByte(Simd<int16_t, N, kPow2>(), v);
|
|
891
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
892
|
+
HWY_INLINE VFromD<RebindToSigned<D>> BitCastFromByte(
|
|
893
|
+
D d, VFromD<Repartition<uint8_t, D>> v) {
|
|
894
|
+
return BitCastFromByte(RebindToSigned<decltype(d)>(), v);
|
|
780
895
|
}
|
|
781
896
|
|
|
782
897
|
} // namespace detail
|
|
@@ -934,6 +1049,35 @@ HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, SubS, sub_vx, _ALL)
|
|
|
934
1049
|
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
|
|
935
1050
|
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)
|
|
936
1051
|
|
|
1052
|
+
// ------------------------------ Neg (ReverseSubS, Xor)
|
|
1053
|
+
|
|
1054
|
+
template <class V, HWY_IF_SIGNED_V(V)>
|
|
1055
|
+
HWY_API V Neg(const V v) {
|
|
1056
|
+
return detail::ReverseSubS(v, 0);
|
|
1057
|
+
}
|
|
1058
|
+
|
|
1059
|
+
// vector = f(vector), but argument is repeated
|
|
1060
|
+
#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
|
|
1061
|
+
SHIFT, MLEN, NAME, OP) \
|
|
1062
|
+
HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
1063
|
+
return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \
|
|
1064
|
+
HWY_RVV_AVL(SEW, SHIFT)); \
|
|
1065
|
+
}
|
|
1066
|
+
|
|
1067
|
+
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
|
|
1068
|
+
|
|
1069
|
+
#if !HWY_HAVE_FLOAT16
|
|
1070
|
+
|
|
1071
|
+
template <class V, HWY_IF_U16_D(DFromV<V>)> // hwy::float16_t
|
|
1072
|
+
HWY_API V Neg(V v) {
|
|
1073
|
+
const DFromV<decltype(v)> d;
|
|
1074
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1075
|
+
using TU = TFromD<decltype(du)>;
|
|
1076
|
+
return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
|
|
1077
|
+
}
|
|
1078
|
+
|
|
1079
|
+
#endif // !HWY_HAVE_FLOAT16
|
|
1080
|
+
|
|
937
1081
|
// ------------------------------ SaturatedAdd
|
|
938
1082
|
|
|
939
1083
|
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
|
|
@@ -1089,12 +1233,16 @@ HWY_API VFromD<Repartition<int64_t, DFromV<VI8>>> SumsOf8(const VI8 v) {
|
|
|
1089
1233
|
}
|
|
1090
1234
|
|
|
1091
1235
|
// ------------------------------ RotateRight
|
|
1092
|
-
template <int kBits, class V>
|
|
1236
|
+
template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
1093
1237
|
HWY_API V RotateRight(const V v) {
|
|
1238
|
+
const DFromV<decltype(v)> d;
|
|
1239
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1240
|
+
|
|
1094
1241
|
constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
|
|
1095
1242
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
1096
1243
|
if (kBits == 0) return v;
|
|
1097
|
-
|
|
1244
|
+
|
|
1245
|
+
return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
|
|
1098
1246
|
ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
|
|
1099
1247
|
}
|
|
1100
1248
|
|
|
@@ -1175,15 +1323,8 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
|
|
|
1175
1323
|
|
|
1176
1324
|
// ------------------------------ MulHigh
|
|
1177
1325
|
|
|
1178
|
-
// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
|
|
1179
|
-
// Used by MulEven; vwmul does not work for m8.
|
|
1180
|
-
namespace detail {
|
|
1181
1326
|
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
|
|
1182
1327
|
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
|
|
1183
|
-
} // namespace detail
|
|
1184
|
-
|
|
1185
|
-
HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
|
|
1186
|
-
HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
|
|
1187
1328
|
|
|
1188
1329
|
// ------------------------------ MulFixedPoint15
|
|
1189
1330
|
|
|
@@ -1460,6 +1601,23 @@ HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
|
|
|
1460
1601
|
return detail::NeS(v, 0);
|
|
1461
1602
|
}
|
|
1462
1603
|
|
|
1604
|
+
// ------------------------------ IsNegative (MFromD)
|
|
1605
|
+
#ifdef HWY_NATIVE_IS_NEGATIVE
|
|
1606
|
+
#undef HWY_NATIVE_IS_NEGATIVE
|
|
1607
|
+
#else
|
|
1608
|
+
#define HWY_NATIVE_IS_NEGATIVE
|
|
1609
|
+
#endif
|
|
1610
|
+
|
|
1611
|
+
// Generic for all vector lengths
|
|
1612
|
+
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
|
|
1613
|
+
HWY_API MFromD<DFromV<V>> IsNegative(V v) {
|
|
1614
|
+
const DFromV<decltype(v)> d;
|
|
1615
|
+
const RebindToSigned<decltype(d)> di;
|
|
1616
|
+
using TI = TFromD<decltype(di)>;
|
|
1617
|
+
|
|
1618
|
+
return detail::LtS(BitCast(di, v), static_cast<TI>(0));
|
|
1619
|
+
}
|
|
1620
|
+
|
|
1463
1621
|
// ------------------------------ MaskFalse
|
|
1464
1622
|
|
|
1465
1623
|
// For mask ops including vmclr, elements past VL are tail-agnostic and cannot
|
|
@@ -1517,14 +1675,8 @@ HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
|
|
|
1517
1675
|
return IfThenElse(MaskFromVec(mask), yes, no);
|
|
1518
1676
|
}
|
|
1519
1677
|
|
|
1520
|
-
// ------------------------------ ZeroIfNegative
|
|
1521
|
-
template <class V>
|
|
1522
|
-
HWY_API V ZeroIfNegative(const V v) {
|
|
1523
|
-
return IfThenZeroElse(detail::LtS(v, 0), v);
|
|
1524
|
-
}
|
|
1525
|
-
|
|
1526
1678
|
// ------------------------------ BroadcastSignBit
|
|
1527
|
-
template <class V>
|
|
1679
|
+
template <class V, HWY_IF_SIGNED_V(V)>
|
|
1528
1680
|
HWY_API V BroadcastSignBit(const V v) {
|
|
1529
1681
|
return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
|
|
1530
1682
|
}
|
|
@@ -1533,11 +1685,7 @@ HWY_API V BroadcastSignBit(const V v) {
|
|
|
1533
1685
|
template <class V>
|
|
1534
1686
|
HWY_API V IfNegativeThenElse(V v, V yes, V no) {
|
|
1535
1687
|
static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
|
|
1536
|
-
|
|
1537
|
-
const RebindToSigned<decltype(d)> di;
|
|
1538
|
-
|
|
1539
|
-
MFromD<decltype(d)> m = detail::LtS(BitCast(di, v), 0);
|
|
1540
|
-
return IfThenElse(m, yes, no);
|
|
1688
|
+
return IfThenElse(IsNegative(v), yes, no);
|
|
1541
1689
|
}
|
|
1542
1690
|
|
|
1543
1691
|
// ------------------------------ FindFirstTrue
|
|
@@ -1812,9 +1960,9 @@ HWY_API void StoreN(size_t count, VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
|
1812
1960
|
template <class D>
|
|
1813
1961
|
HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
|
|
1814
1962
|
size_t max_lanes_to_store) {
|
|
1815
|
-
// NOTE: Need to
|
|
1816
|
-
//
|
|
1817
|
-
//
|
|
1963
|
+
// NOTE: Need to clamp max_lanes_to_store to Lanes(d), even if
|
|
1964
|
+
// MaxLanes(d) >= MaxLanes(DFromV<VFromD<D>>()) is true, as it is possible for
|
|
1965
|
+
// detail::StoreN(max_lanes_to_store, v, d, p) to store fewer than
|
|
1818
1966
|
// Lanes(DFromV<VFromD<D>>()) lanes to p if
|
|
1819
1967
|
// max_lanes_to_store > Lanes(DFromV<VFromD<D>>()) and
|
|
1820
1968
|
// max_lanes_to_store < 2 * Lanes(DFromV<VFromD<D>>()) are both true.
|
|
@@ -1823,8 +1971,7 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
|
|
|
1823
1971
|
// if Lanes(d) < Lanes(DFromV<VFromD<D>>()) is true, which is possible if
|
|
1824
1972
|
// MaxLanes(d) < MaxLanes(DFromV<VFromD<D>>()) or
|
|
1825
1973
|
// d.Pow2() < DFromV<VFromD<D>>().Pow2() is true.
|
|
1826
|
-
|
|
1827
|
-
detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d, p);
|
|
1974
|
+
detail::StoreN(CappedLanes(d, max_lanes_to_store), v, d, p);
|
|
1828
1975
|
}
|
|
1829
1976
|
|
|
1830
1977
|
// ------------------------------ StoreU
|
|
@@ -2059,52 +2206,38 @@ HWY_API auto PromoteTo(Simd<uint64_t, N, -1> d,
|
|
|
2059
2206
|
}
|
|
2060
2207
|
|
|
2061
2208
|
// Unsigned to signed: cast for unsigned promote.
|
|
2062
|
-
template <
|
|
2063
|
-
HWY_API
|
|
2064
|
-
VFromD<Rebind<uint8_t, decltype(d)>> v)
|
|
2065
|
-
-> VFromD<decltype(d)> {
|
|
2209
|
+
template <class D, HWY_IF_I16_D(D)>
|
|
2210
|
+
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
|
|
2066
2211
|
return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
|
|
2067
2212
|
}
|
|
2068
2213
|
|
|
2069
|
-
template <
|
|
2070
|
-
HWY_API
|
|
2071
|
-
VFromD<Rebind<uint8_t, decltype(d)>> v)
|
|
2072
|
-
-> VFromD<decltype(d)> {
|
|
2214
|
+
template <class D, HWY_IF_I32_D(D)>
|
|
2215
|
+
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
|
|
2073
2216
|
return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
|
|
2074
2217
|
}
|
|
2075
2218
|
|
|
2076
|
-
template <
|
|
2077
|
-
HWY_API
|
|
2078
|
-
VFromD<Rebind<uint16_t, decltype(d)>> v)
|
|
2079
|
-
-> VFromD<decltype(d)> {
|
|
2219
|
+
template <class D, HWY_IF_I32_D(D)>
|
|
2220
|
+
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
|
|
2080
2221
|
return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
|
|
2081
2222
|
}
|
|
2082
2223
|
|
|
2083
|
-
template <
|
|
2084
|
-
HWY_API
|
|
2085
|
-
VFromD<Rebind<uint32_t, decltype(d)>> v)
|
|
2086
|
-
-> VFromD<decltype(d)> {
|
|
2224
|
+
template <class D, HWY_IF_I64_D(D)>
|
|
2225
|
+
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) {
|
|
2087
2226
|
return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
|
|
2088
2227
|
}
|
|
2089
2228
|
|
|
2090
|
-
template <
|
|
2091
|
-
HWY_API
|
|
2092
|
-
VFromD<Rebind<uint16_t, decltype(d)>> v)
|
|
2093
|
-
-> VFromD<decltype(d)> {
|
|
2229
|
+
template <class D, HWY_IF_I64_D(D)>
|
|
2230
|
+
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
|
|
2094
2231
|
return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
|
|
2095
2232
|
}
|
|
2096
2233
|
|
|
2097
|
-
template <
|
|
2098
|
-
HWY_API
|
|
2099
|
-
VFromD<Rebind<uint8_t, decltype(d)>> v)
|
|
2100
|
-
-> VFromD<decltype(d)> {
|
|
2234
|
+
template <class D, HWY_IF_I64_D(D)>
|
|
2235
|
+
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
|
|
2101
2236
|
return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
|
|
2102
2237
|
}
|
|
2103
2238
|
|
|
2104
|
-
template <
|
|
2105
|
-
HWY_API
|
|
2106
|
-
VFromD<Rebind<hwy::bfloat16_t, decltype(d)>> v)
|
|
2107
|
-
-> VFromD<decltype(d)> {
|
|
2239
|
+
template <class D, HWY_IF_F32_D(D)>
|
|
2240
|
+
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<hwy::bfloat16_t, D>> v) {
|
|
2108
2241
|
const RebindToSigned<decltype(d)> di32;
|
|
2109
2242
|
const Rebind<uint16_t, decltype(d)> du16;
|
|
2110
2243
|
return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
|
|
@@ -2204,28 +2337,24 @@ HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vuint32m8_t v) {
|
|
|
2204
2337
|
HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
|
|
2205
2338
|
}
|
|
2206
2339
|
|
|
2207
|
-
template <
|
|
2208
|
-
HWY_API VFromD<
|
|
2209
|
-
|
|
2210
|
-
return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 2>(), v));
|
|
2340
|
+
template <class D, HWY_IF_U8_D(D)>
|
|
2341
|
+
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
|
|
2342
|
+
return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
|
|
2211
2343
|
}
|
|
2212
2344
|
|
|
2213
|
-
template <
|
|
2214
|
-
HWY_API VFromD<
|
|
2215
|
-
|
|
2216
|
-
return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 2>(), v));
|
|
2345
|
+
template <class D, HWY_IF_U8_D(D)>
|
|
2346
|
+
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
|
|
2347
|
+
return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
|
|
2217
2348
|
}
|
|
2218
2349
|
|
|
2219
|
-
template <
|
|
2220
|
-
HWY_API VFromD<
|
|
2221
|
-
|
|
2222
|
-
return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 1>(), v));
|
|
2350
|
+
template <class D, HWY_IF_U16_D(D)>
|
|
2351
|
+
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
|
|
2352
|
+
return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
|
|
2223
2353
|
}
|
|
2224
2354
|
|
|
2225
|
-
template <
|
|
2226
|
-
HWY_API VFromD<
|
|
2227
|
-
|
|
2228
|
-
return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 1>(), v));
|
|
2355
|
+
template <class D, HWY_IF_U16_D(D)>
|
|
2356
|
+
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
|
|
2357
|
+
return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
|
|
2229
2358
|
}
|
|
2230
2359
|
|
|
2231
2360
|
HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
|
|
@@ -2608,16 +2737,14 @@ HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
|
|
|
2608
2737
|
return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
|
|
2609
2738
|
}
|
|
2610
2739
|
|
|
2611
|
-
template <
|
|
2612
|
-
HWY_API VFromD<
|
|
2613
|
-
|
|
2614
|
-
return DemoteTo(d, DemoteTo(Simd<int32_t, N, kPow2 + 2>(), v));
|
|
2740
|
+
template <class D, HWY_IF_I8_D(D)>
|
|
2741
|
+
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
|
|
2742
|
+
return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v));
|
|
2615
2743
|
}
|
|
2616
2744
|
|
|
2617
|
-
template <
|
|
2618
|
-
HWY_API VFromD<
|
|
2619
|
-
|
|
2620
|
-
return DemoteTo(d, DemoteTo(Simd<int32_t, N, kPow2 + 1>(), v));
|
|
2745
|
+
template <class D, HWY_IF_I16_D(D)>
|
|
2746
|
+
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
|
|
2747
|
+
return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v));
|
|
2621
2748
|
}
|
|
2622
2749
|
|
|
2623
2750
|
#undef HWY_RVV_DEMOTE
|
|
@@ -2634,9 +2761,15 @@ HWY_API VFromD<Simd<int16_t, N, kPow2>> DemoteTo(
|
|
|
2634
2761
|
}
|
|
2635
2762
|
|
|
2636
2763
|
#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C
|
|
2637
|
-
HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo,
|
|
2764
|
+
HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT)
|
|
2638
2765
|
#endif
|
|
2639
|
-
HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo,
|
|
2766
|
+
HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT)
|
|
2767
|
+
|
|
2768
|
+
namespace detail {
|
|
2769
|
+
HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteToF32WithRoundToOdd,
|
|
2770
|
+
fncvt_rod_f_f_w_f, _DEMOTE_VIRT)
|
|
2771
|
+
} // namespace detail
|
|
2772
|
+
|
|
2640
2773
|
#undef HWY_RVV_DEMOTE_F
|
|
2641
2774
|
|
|
2642
2775
|
// TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
|
|
@@ -2724,27 +2857,72 @@ HWY_API vfloat32m4_t DemoteTo(Simd<float, N, 2> d, const vuint64m8_t v) {
|
|
|
2724
2857
|
return __riscv_vfncvt_f_xu_w_f32m4(v, Lanes(d));
|
|
2725
2858
|
}
|
|
2726
2859
|
|
|
2860
|
+
// Narrows f32 bits to bf16 using round to even.
|
|
2727
2861
|
// SEW is for the source so we can use _DEMOTE_VIRT.
|
|
2728
|
-
#
|
|
2729
|
-
|
|
2862
|
+
#ifdef HWY_RVV_AVOID_VXRM
|
|
2863
|
+
#define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, \
|
|
2864
|
+
LMULD, LMULH, SHIFT, MLEN, NAME, OP) \
|
|
2865
|
+
template <size_t N> \
|
|
2866
|
+
HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
|
|
2867
|
+
HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
2868
|
+
const auto round = \
|
|
2869
|
+
detail::AddS(detail::AndS(ShiftRight<16>(v), 1u), 0x7FFFu); \
|
|
2870
|
+
v = Add(v, round); \
|
|
2871
|
+
/* The default rounding mode appears to be RNU=0, which adds the LSB. */ \
|
|
2872
|
+
/* Prevent further rounding by clearing the bits we want to truncate. */ \
|
|
2873
|
+
v = detail::AndS(v, 0xFFFF0000u); \
|
|
2874
|
+
return __riscv_v##OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \
|
|
2875
|
+
}
|
|
2876
|
+
|
|
2877
|
+
#else
|
|
2878
|
+
#define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, \
|
|
2879
|
+
LMULD, LMULH, SHIFT, MLEN, NAME, OP) \
|
|
2730
2880
|
template <size_t N> \
|
|
2731
2881
|
HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
|
|
2732
2882
|
HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
2733
2883
|
return __riscv_v##OP##CHAR##SEWH##LMULH( \
|
|
2734
|
-
v, 16, HWY_RVV_INSERT_VXRM(
|
|
2884
|
+
v, 16, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNE, Lanes(d))); \
|
|
2735
2885
|
}
|
|
2886
|
+
#endif // HWY_RVV_AVOID_VXRM
|
|
2736
2887
|
namespace detail {
|
|
2737
|
-
HWY_RVV_FOREACH_U32(
|
|
2738
|
-
_DEMOTE_VIRT)
|
|
2888
|
+
HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_16_NEAREST_EVEN, DemoteTo16NearestEven,
|
|
2889
|
+
nclipu_wx_, _DEMOTE_VIRT)
|
|
2739
2890
|
}
|
|
2740
|
-
#undef
|
|
2891
|
+
#undef HWY_RVV_DEMOTE_16_NEAREST_EVEN
|
|
2741
2892
|
|
|
2742
|
-
|
|
2743
|
-
|
|
2744
|
-
|
|
2893
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2894
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2895
|
+
#else
|
|
2896
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2897
|
+
#endif
|
|
2898
|
+
|
|
2899
|
+
template <class DBF16, HWY_IF_BF16_D(DBF16)>
|
|
2900
|
+
HWY_API VFromD<DBF16> DemoteTo(DBF16 d, VFromD<Rebind<float, DBF16>> v) {
|
|
2901
|
+
const DFromV<decltype(v)> df;
|
|
2902
|
+
const RebindToUnsigned<decltype(df)> du32;
|
|
2745
2903
|
const RebindToUnsigned<decltype(d)> du16;
|
|
2746
|
-
|
|
2747
|
-
|
|
2904
|
+
// Consider an f32 mantissa with the upper 7 bits set, followed by a 1-bit
|
|
2905
|
+
// and at least one other bit set. This will round to 0 and increment the
|
|
2906
|
+
// exponent. If the exponent was already 0xFF (NaN), then the result is -inf;
|
|
2907
|
+
// there is no wraparound because nclipu saturates. Note that in this case, the
|
|
2908
|
+
// input cannot have been inf because its mantissa bits are zero. To avoid
|
|
2909
|
+
// converting NaN to inf, we canonicalize the NaN to prevent the rounding.
|
|
2910
|
+
const decltype(v) canonicalized =
|
|
2911
|
+
IfThenElse(Eq(v, v), v, BitCast(df, Set(du32, 0x7F800000)));
|
|
2912
|
+
return BitCast(
|
|
2913
|
+
d, detail::DemoteTo16NearestEven(du16, BitCast(du32, canonicalized)));
|
|
2914
|
+
}
|
|
2915
|
+
|
|
2916
|
+
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
2917
|
+
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
2918
|
+
#else
|
|
2919
|
+
#define HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
2920
|
+
#endif
|
|
2921
|
+
|
|
2922
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
2923
|
+
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
|
|
2924
|
+
const Rebind<float, decltype(df16)> df32;
|
|
2925
|
+
return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
|
|
2748
2926
|
}
|
|
2749
2927
|
|
|
2750
2928
|
// ------------------------------ ConvertTo F
|
|
@@ -2771,8 +2949,8 @@ HWY_API VFromD<Simd<hwy::bfloat16_t, N, kPow2>> DemoteTo(
|
|
|
2771
2949
|
HWY_API HWY_RVV_V(uint, SEW, LMUL) ConvertTo( \
|
|
2772
2950
|
HWY_RVV_D(uint, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
2773
2951
|
return __riscv_vfcvt_rtz_xu_f_v_u##SEW##LMUL(v, Lanes(d)); \
|
|
2774
|
-
}
|
|
2775
|
-
|
|
2952
|
+
}
|
|
2953
|
+
|
|
2776
2954
|
HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
|
|
2777
2955
|
#undef HWY_RVV_CONVERT
|
|
2778
2956
|
|
|
@@ -2811,7 +2989,7 @@ HWY_INLINE size_t LanesPerBlock(Simd<T, N, kPow2> d) {
|
|
|
2811
2989
|
|
|
2812
2990
|
template <class D, class V>
|
|
2813
2991
|
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
|
|
2814
|
-
using T = MakeUnsigned<
|
|
2992
|
+
using T = MakeUnsigned<TFromV<V>>;
|
|
2815
2993
|
return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
|
|
2816
2994
|
}
|
|
2817
2995
|
|
|
@@ -3099,6 +3277,18 @@ HWY_API V DupOdd(const V v) {
|
|
|
3099
3277
|
return OddEven(v, down);
|
|
3100
3278
|
}
|
|
3101
3279
|
|
|
3280
|
+
// ------------------------------ InterleaveEven (OddEven)
|
|
3281
|
+
template <class D>
|
|
3282
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3283
|
+
return OddEven(detail::Slide1Up(b), a);
|
|
3284
|
+
}
|
|
3285
|
+
|
|
3286
|
+
// ------------------------------ InterleaveOdd (OddEven)
|
|
3287
|
+
template <class D>
|
|
3288
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3289
|
+
return OddEven(b, detail::Slide1Down(a));
|
|
3290
|
+
}
|
|
3291
|
+
|
|
3102
3292
|
// ------------------------------ OddEvenBlocks
|
|
3103
3293
|
template <class V>
|
|
3104
3294
|
HWY_API V OddEvenBlocks(const V a, const V b) {
|
|
@@ -3577,6 +3767,9 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
|
3577
3767
|
return Combine(d, LowerHalf(dh, hi_even), LowerHalf(dh, lo_even));
|
|
3578
3768
|
}
|
|
3579
3769
|
|
|
3770
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
3771
|
+
#include "hwy/ops/inside-inl.h"
|
|
3772
|
+
|
|
3580
3773
|
// ================================================== BLOCKWISE
|
|
3581
3774
|
|
|
3582
3775
|
// ------------------------------ CombineShiftRightBytes
|
|
@@ -3727,8 +3920,9 @@ HWY_API V TwoTablesLookupLanes(V a, V b,
|
|
|
3727
3920
|
template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
|
|
3728
3921
|
HWY_IF_POW2_LE_D(D, 2)>
|
|
3729
3922
|
HWY_API V Broadcast(const V v) {
|
|
3730
|
-
HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
|
|
3731
3923
|
const D d;
|
|
3924
|
+
HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
|
|
3925
|
+
|
|
3732
3926
|
const Rebind<uint16_t, decltype(d)> du16;
|
|
3733
3927
|
VFromD<decltype(du16)> idx =
|
|
3734
3928
|
detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
|
|
@@ -3742,8 +3936,9 @@ HWY_API V Broadcast(const V v) {
|
|
|
3742
3936
|
template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
|
|
3743
3937
|
HWY_IF_POW2_GT_D(D, 2)>
|
|
3744
3938
|
HWY_API V Broadcast(const V v) {
|
|
3745
|
-
HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
|
|
3746
3939
|
const D d;
|
|
3940
|
+
HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
|
|
3941
|
+
|
|
3747
3942
|
const Half<decltype(d)> dh;
|
|
3748
3943
|
using VH = VFromD<decltype(dh)>;
|
|
3749
3944
|
const Rebind<uint16_t, decltype(dh)> du16;
|
|
@@ -3754,14 +3949,15 @@ HWY_API V Broadcast(const V v) {
|
|
|
3754
3949
|
}
|
|
3755
3950
|
const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
|
|
3756
3951
|
const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
|
|
3757
|
-
return Combine(d,
|
|
3952
|
+
return Combine(d, hi, lo);
|
|
3758
3953
|
}
|
|
3759
3954
|
|
|
3760
3955
|
template <int kLane, class V, class D = DFromV<V>,
|
|
3761
3956
|
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
|
|
3762
3957
|
HWY_API V Broadcast(const V v) {
|
|
3763
|
-
HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
|
|
3764
3958
|
const D d;
|
|
3959
|
+
HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
|
|
3960
|
+
|
|
3765
3961
|
const RebindToUnsigned<decltype(d)> du;
|
|
3766
3962
|
auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du));
|
|
3767
3963
|
if (kLane != 0) {
|
|
@@ -5045,7 +5241,7 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
|
5045
5241
|
// Slow fallback for completeness; the above bits to mask cast is preferred.
|
|
5046
5242
|
const RebindToUnsigned<D> du;
|
|
5047
5243
|
const VFromD<decltype(du)> bits =
|
|
5048
|
-
Shl(Set(du, uint16_t{1}),
|
|
5244
|
+
Shl(Set(du, uint16_t{1}), detail::AndS(detail::Iota0(du), 7));
|
|
5049
5245
|
return TestBit(Set(du, static_cast<uint16_t>(mask_bits)), bits);
|
|
5050
5246
|
#endif
|
|
5051
5247
|
}
|
|
@@ -5062,8 +5258,7 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
|
5062
5258
|
#else
|
|
5063
5259
|
// Slow fallback for completeness; the above bits to mask cast is preferred.
|
|
5064
5260
|
const RebindToUnsigned<D> du;
|
|
5065
|
-
const VFromD<decltype(du)> bits =
|
|
5066
|
-
Shl(Set(du, uint32_t{1}), Iota(du, uint32_t{0}));
|
|
5261
|
+
const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2, 4, 8);
|
|
5067
5262
|
return TestBit(Set(du, static_cast<uint32_t>(mask_bits)), bits);
|
|
5068
5263
|
#endif
|
|
5069
5264
|
}
|
|
@@ -5080,40 +5275,11 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
|
5080
5275
|
#else
|
|
5081
5276
|
// Slow fallback for completeness; the above bits to mask cast is preferred.
|
|
5082
5277
|
const RebindToUnsigned<D> du;
|
|
5083
|
-
const VFromD<decltype(du)> bits = Dup128VecFromValues(du,
|
|
5278
|
+
const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2);
|
|
5084
5279
|
return TestBit(Set(du, static_cast<uint64_t>(mask_bits)), bits);
|
|
5085
5280
|
#endif
|
|
5086
5281
|
}
|
|
5087
5282
|
|
|
5088
|
-
// ------------------------------ Neg (Sub)
|
|
5089
|
-
|
|
5090
|
-
template <class V, HWY_IF_SIGNED_V(V)>
|
|
5091
|
-
HWY_API V Neg(const V v) {
|
|
5092
|
-
return detail::ReverseSubS(v, 0);
|
|
5093
|
-
}
|
|
5094
|
-
|
|
5095
|
-
// vector = f(vector), but argument is repeated
|
|
5096
|
-
#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
|
|
5097
|
-
SHIFT, MLEN, NAME, OP) \
|
|
5098
|
-
HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
5099
|
-
return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \
|
|
5100
|
-
HWY_RVV_AVL(SEW, SHIFT)); \
|
|
5101
|
-
}
|
|
5102
|
-
|
|
5103
|
-
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
|
|
5104
|
-
|
|
5105
|
-
#if !HWY_HAVE_FLOAT16
|
|
5106
|
-
|
|
5107
|
-
template <class V, HWY_IF_U16_D(DFromV<V>)> // hwy::float16_t
|
|
5108
|
-
HWY_API V Neg(V v) {
|
|
5109
|
-
const DFromV<decltype(v)> d;
|
|
5110
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
5111
|
-
using TU = TFromD<decltype(du)>;
|
|
5112
|
-
return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
|
|
5113
|
-
}
|
|
5114
|
-
|
|
5115
|
-
#endif // !HWY_HAVE_FLOAT16
|
|
5116
|
-
|
|
5117
5283
|
// ------------------------------ Abs (Max, Neg)
|
|
5118
5284
|
|
|
5119
5285
|
template <class V, HWY_IF_SIGNED_V(V)>
|
|
@@ -5171,23 +5337,99 @@ HWY_API V Trunc(const V v) {
|
|
|
5171
5337
|
}
|
|
5172
5338
|
|
|
5173
5339
|
// ------------------------------ Ceil
|
|
5340
|
+
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) || \
|
|
5341
|
+
(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
|
|
5342
|
+
namespace detail {
|
|
5343
|
+
#define HWY_RVV_CEIL_INT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
|
|
5344
|
+
SHIFT, MLEN, NAME, OP) \
|
|
5345
|
+
HWY_API HWY_RVV_V(int, SEW, LMUL) CeilInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
5346
|
+
return __riscv_vfcvt_x_f_v_i##SEW##LMUL##_rm(v, __RISCV_FRM_RUP, \
|
|
5347
|
+
HWY_RVV_AVL(SEW, SHIFT)); \
|
|
5348
|
+
}
|
|
5349
|
+
HWY_RVV_FOREACH_F(HWY_RVV_CEIL_INT, _, _, _ALL)
|
|
5350
|
+
#undef HWY_RVV_CEIL_INT
|
|
5351
|
+
|
|
5352
|
+
} // namespace detail
|
|
5353
|
+
|
|
5174
5354
|
template <class V>
|
|
5175
5355
|
HWY_API V Ceil(const V v) {
|
|
5176
|
-
|
|
5177
|
-
|
|
5178
|
-
|
|
5179
|
-
|
|
5356
|
+
const DFromV<V> df;
|
|
5357
|
+
|
|
5358
|
+
const auto integer = detail::CeilInt(v);
|
|
5359
|
+
const auto int_f = ConvertTo(df, integer);
|
|
5360
|
+
|
|
5361
|
+
return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
|
|
5180
5362
|
}
|
|
5181
5363
|
|
|
5364
|
+
#else // GCC 13 or earlier or Clang 16 or earlier
|
|
5365
|
+
|
|
5366
|
+
template <class V>
|
|
5367
|
+
HWY_API V Ceil(const V v) {
|
|
5368
|
+
const DFromV<decltype(v)> df;
|
|
5369
|
+
const RebindToSigned<decltype(df)> di;
|
|
5370
|
+
|
|
5371
|
+
using T = TFromD<decltype(df)>;
|
|
5372
|
+
|
|
5373
|
+
const auto integer = ConvertTo(di, v); // round toward 0
|
|
5374
|
+
const auto int_f = ConvertTo(df, integer);
|
|
5375
|
+
|
|
5376
|
+
// Truncating a positive non-integer ends up smaller; if so, add 1.
|
|
5377
|
+
const auto pos1 =
|
|
5378
|
+
IfThenElseZero(Lt(int_f, v), Set(df, ConvertScalarTo<T>(1.0)));
|
|
5379
|
+
|
|
5380
|
+
return IfThenElse(detail::UseInt(v), Add(int_f, pos1), v);
|
|
5381
|
+
}
|
|
5382
|
+
|
|
5383
|
+
#endif // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
|
|
5384
|
+
// (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
|
|
5385
|
+
|
|
5182
5386
|
// ------------------------------ Floor
|
|
5387
|
+
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) || \
|
|
5388
|
+
(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
|
|
5389
|
+
namespace detail {
|
|
5390
|
+
#define HWY_RVV_FLOOR_INT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
|
|
5391
|
+
SHIFT, MLEN, NAME, OP) \
|
|
5392
|
+
HWY_API HWY_RVV_V(int, SEW, LMUL) FloorInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
|
|
5393
|
+
return __riscv_vfcvt_x_f_v_i##SEW##LMUL##_rm(v, __RISCV_FRM_RDN, \
|
|
5394
|
+
HWY_RVV_AVL(SEW, SHIFT)); \
|
|
5395
|
+
}
|
|
5396
|
+
HWY_RVV_FOREACH_F(HWY_RVV_FLOOR_INT, _, _, _ALL)
|
|
5397
|
+
#undef HWY_RVV_FLOOR_INT
|
|
5398
|
+
|
|
5399
|
+
} // namespace detail
|
|
5400
|
+
|
|
5183
5401
|
template <class V>
|
|
5184
5402
|
HWY_API V Floor(const V v) {
|
|
5185
|
-
|
|
5186
|
-
|
|
5187
|
-
|
|
5188
|
-
|
|
5403
|
+
const DFromV<V> df;
|
|
5404
|
+
|
|
5405
|
+
const auto integer = detail::FloorInt(v);
|
|
5406
|
+
const auto int_f = ConvertTo(df, integer);
|
|
5407
|
+
|
|
5408
|
+
return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
|
|
5189
5409
|
}
|
|
5190
5410
|
|
|
5411
|
+
#else // GCC 13 or earlier or Clang 16 or earlier
|
|
5412
|
+
|
|
5413
|
+
template <class V>
|
|
5414
|
+
HWY_API V Floor(const V v) {
|
|
5415
|
+
const DFromV<decltype(v)> df;
|
|
5416
|
+
const RebindToSigned<decltype(df)> di;
|
|
5417
|
+
|
|
5418
|
+
using T = TFromD<decltype(df)>;
|
|
5419
|
+
|
|
5420
|
+
const auto integer = ConvertTo(di, v); // round toward 0
|
|
5421
|
+
const auto int_f = ConvertTo(df, integer);
|
|
5422
|
+
|
|
5423
|
+
// Truncating a negative non-integer ends up larger; if so, subtract 1.
|
|
5424
|
+
const auto neg1 =
|
|
5425
|
+
IfThenElseZero(Gt(int_f, v), Set(df, ConvertScalarTo<T>(-1.0)));
|
|
5426
|
+
|
|
5427
|
+
return IfThenElse(detail::UseInt(v), Add(int_f, neg1), v);
|
|
5428
|
+
}
|
|
5429
|
+
|
|
5430
|
+
#endif // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
|
|
5431
|
+
// (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
|
|
5432
|
+
|
|
5191
5433
|
// ------------------------------ Floating-point classification (Ne)
|
|
5192
5434
|
|
|
5193
5435
|
// vfclass does not help because it would require 3 instructions (to AND and
|
|
@@ -5254,13 +5496,65 @@ HWY_API VFromD<D> Iota(const D d, T2 first) {
|
|
|
5254
5496
|
ConvertScalarTo<TFromD<D>>(first));
|
|
5255
5497
|
}
|
|
5256
5498
|
|
|
5499
|
+
// ------------------------------ BitShuffle (PromoteTo, Rol, SumsOf8)
|
|
5500
|
+
|
|
5501
|
+
// Native implementation required to avoid 8-bit wraparound on long vectors.
|
|
5502
|
+
#ifdef HWY_NATIVE_BITSHUFFLE
|
|
5503
|
+
#undef HWY_NATIVE_BITSHUFFLE
|
|
5504
|
+
#else
|
|
5505
|
+
#define HWY_NATIVE_BITSHUFFLE
|
|
5506
|
+
#endif
|
|
5507
|
+
|
|
5508
|
+
// Cannot handle LMUL=8 because we promote indices.
|
|
5509
|
+
template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
|
|
5510
|
+
HWY_IF_UI64_D(D64), HWY_IF_POW2_LE_D(D64, 2)>
|
|
5511
|
+
HWY_API V64 BitShuffle(V64 values, VI idx) {
|
|
5512
|
+
const RebindToUnsigned<D64> du64;
|
|
5513
|
+
const Repartition<uint8_t, D64> du8;
|
|
5514
|
+
const Rebind<uint16_t, decltype(du8)> du16;
|
|
5515
|
+
using VU8 = VFromD<decltype(du8)>;
|
|
5516
|
+
using VU16 = VFromD<decltype(du16)>;
|
|
5517
|
+
// For each 16-bit (to avoid wraparound for long vectors) index of an output
|
|
5518
|
+
// byte: offset of the u64 lane to which it belongs.
|
|
5519
|
+
const VU16 byte_offsets =
|
|
5520
|
+
detail::AndS(detail::Iota0(du16), static_cast<uint16_t>(~7u));
|
|
5521
|
+
// idx is for a bit; shifting makes that bytes. Promote so we can add
|
|
5522
|
+
// byte_offsets, then we have the u8 lane index within the whole vector.
|
|
5523
|
+
const VU16 idx16 =
|
|
5524
|
+
Add(byte_offsets, PromoteTo(du16, ShiftRight<3>(BitCast(du8, idx))));
|
|
5525
|
+
const VU8 bytes = detail::TableLookupLanes16(BitCast(du8, values), idx16);
|
|
5526
|
+
|
|
5527
|
+
// We want to shift right by idx & 7 to extract the desired bit in `bytes`,
|
|
5528
|
+
// and left by iota & 7 to put it in the correct output bit. To correctly
|
|
5529
|
+
// handle shift counts from -7 to 7, we rotate (unfortunately not natively
|
|
5530
|
+
// supported on RVV).
|
|
5531
|
+
const VU8 rotate_left_bits = Sub(detail::Iota0(du8), BitCast(du8, idx));
|
|
5532
|
+
const VU8 extracted_bits_mask =
|
|
5533
|
+
BitCast(du8, Set(du64, static_cast<uint64_t>(0x8040201008040201u)));
|
|
5534
|
+
const VU8 extracted_bits =
|
|
5535
|
+
And(Rol(bytes, rotate_left_bits), extracted_bits_mask);
|
|
5536
|
+
// Combine bit-sliced (one bit per byte) into one 64-bit sum.
|
|
5537
|
+
return BitCast(D64(), SumsOf8(extracted_bits));
|
|
5538
|
+
}
|
|
5539
|
+
|
|
5540
|
+
template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
|
|
5541
|
+
HWY_IF_UI64_D(D64), HWY_IF_POW2_GT_D(D64, 2)>
|
|
5542
|
+
HWY_API V64 BitShuffle(V64 values, VI idx) {
|
|
5543
|
+
const Half<D64> dh;
|
|
5544
|
+
const Half<DFromV<VI>> dih;
|
|
5545
|
+
using V64H = VFromD<decltype(dh)>;
|
|
5546
|
+
const V64H r0 = BitShuffle(LowerHalf(dh, values), LowerHalf(dih, idx));
|
|
5547
|
+
const V64H r1 = BitShuffle(UpperHalf(dh, values), UpperHalf(dih, idx));
|
|
5548
|
+
return Combine(D64(), r1, r0);
|
|
5549
|
+
}
|
|
5550
|
+
|
|
5257
5551
|
// ------------------------------ MulEven/Odd (Mul, OddEven)
|
|
5258
5552
|
|
|
5259
5553
|
template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
|
|
5260
5554
|
class D = DFromV<V>, class DW = RepartitionToWide<D>>
|
|
5261
5555
|
HWY_API VFromD<DW> MulEven(const V a, const V b) {
|
|
5262
5556
|
const auto lo = Mul(a, b);
|
|
5263
|
-
const auto hi =
|
|
5557
|
+
const auto hi = MulHigh(a, b);
|
|
5264
5558
|
return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
|
|
5265
5559
|
}
|
|
5266
5560
|
|
|
@@ -5268,7 +5562,7 @@ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
|
|
|
5268
5562
|
class D = DFromV<V>, class DW = RepartitionToWide<D>>
|
|
5269
5563
|
HWY_API VFromD<DW> MulOdd(const V a, const V b) {
|
|
5270
5564
|
const auto lo = Mul(a, b);
|
|
5271
|
-
const auto hi =
|
|
5565
|
+
const auto hi = MulHigh(a, b);
|
|
5272
5566
|
return BitCast(DW(), OddEven(hi, detail::Slide1Down(lo)));
|
|
5273
5567
|
}
|
|
5274
5568
|
|
|
@@ -5276,28 +5570,34 @@ HWY_API VFromD<DW> MulOdd(const V a, const V b) {
|
|
|
5276
5570
|
template <class V, HWY_IF_T_SIZE_V(V, 8)>
|
|
5277
5571
|
HWY_INLINE V MulEven(const V a, const V b) {
|
|
5278
5572
|
const auto lo = Mul(a, b);
|
|
5279
|
-
const auto hi =
|
|
5573
|
+
const auto hi = MulHigh(a, b);
|
|
5280
5574
|
return OddEven(detail::Slide1Up(hi), lo);
|
|
5281
5575
|
}
|
|
5282
5576
|
|
|
5283
5577
|
template <class V, HWY_IF_T_SIZE_V(V, 8)>
|
|
5284
5578
|
HWY_INLINE V MulOdd(const V a, const V b) {
|
|
5285
5579
|
const auto lo = Mul(a, b);
|
|
5286
|
-
const auto hi =
|
|
5580
|
+
const auto hi = MulHigh(a, b);
|
|
5287
5581
|
return OddEven(hi, detail::Slide1Down(lo));
|
|
5288
5582
|
}
|
|
5289
5583
|
|
|
5290
5584
|
// ------------------------------ ReorderDemote2To (OddEven, Combine)
|
|
5291
5585
|
|
|
5292
|
-
template <
|
|
5293
|
-
HWY_API VFromD<
|
|
5294
|
-
|
|
5295
|
-
VFromD<RepartitionToWide<decltype(dbf16)>> a,
|
|
5296
|
-
VFromD<RepartitionToWide<decltype(dbf16)>> b) {
|
|
5586
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
5587
|
+
HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<RepartitionToWide<D>> a,
|
|
5588
|
+
VFromD<RepartitionToWide<D>> b) {
|
|
5297
5589
|
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
5590
|
+
const Half<decltype(du16)> du16_half;
|
|
5298
5591
|
const RebindToUnsigned<DFromV<decltype(a)>> du32;
|
|
5299
|
-
const VFromD<decltype(du32)>
|
|
5300
|
-
|
|
5592
|
+
const VFromD<decltype(du32)> a_in_even = PromoteTo(
|
|
5593
|
+
du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, a)));
|
|
5594
|
+
const VFromD<decltype(du32)> b_in_even = PromoteTo(
|
|
5595
|
+
du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, b)));
|
|
5596
|
+
// Equivalent to InterleaveEven, but because the upper 16 bits are zero, we
|
|
5597
|
+
// can OR instead of OddEven.
|
|
5598
|
+
const VFromD<decltype(du16)> a_in_odd =
|
|
5599
|
+
detail::Slide1Up(BitCast(du16, a_in_even));
|
|
5600
|
+
return BitCast(dbf16, Or(a_in_odd, BitCast(du16, b_in_even)));
|
|
5301
5601
|
}
|
|
5302
5602
|
|
|
5303
5603
|
// If LMUL is not the max, Combine first to avoid another DemoteTo.
|
|
@@ -5382,68 +5682,26 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
|
|
|
5382
5682
|
|
|
5383
5683
|
// ------------------------------ WidenMulPairwiseAdd
|
|
5384
5684
|
|
|
5385
|
-
template <class
|
|
5386
|
-
class
|
|
5387
|
-
HWY_API VFromD<
|
|
5388
|
-
const
|
|
5389
|
-
|
|
5390
|
-
const
|
|
5391
|
-
|
|
5392
|
-
|
|
5393
|
-
|
|
5394
|
-
|
|
5395
|
-
|
|
5396
|
-
|
|
5397
|
-
return MulAdd(
|
|
5398
|
-
Mul(
|
|
5399
|
-
}
|
|
5400
|
-
|
|
5401
|
-
template <class D, HWY_IF_I32_D(D), class VI16>
|
|
5402
|
-
HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
|
|
5403
|
-
using VI32 = VFromD<decltype(d32)>;
|
|
5404
|
-
// Manual sign extension requires two shifts for even lanes.
|
|
5405
|
-
const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
|
|
5406
|
-
const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
|
|
5407
|
-
const VI32 ao = ShiftRight<16>(BitCast(d32, a));
|
|
5408
|
-
const VI32 bo = ShiftRight<16>(BitCast(d32, b));
|
|
5409
|
-
return Add(Mul(ae, be), Mul(ao, bo));
|
|
5410
|
-
}
|
|
5411
|
-
|
|
5412
|
-
template <class D, HWY_IF_U32_D(D), class VI16>
|
|
5413
|
-
HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VI16 a, VI16 b) {
|
|
5414
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
5415
|
-
// Manual sign extension requires two shifts for even lanes.
|
|
5416
|
-
const VU32 ae = detail::AndS(BitCast(du32, a), uint32_t{0x0000FFFFu});
|
|
5417
|
-
const VU32 be = detail::AndS(BitCast(du32, b), uint32_t{0x0000FFFFu});
|
|
5418
|
-
const VU32 ao = ShiftRight<16>(BitCast(du32, a));
|
|
5419
|
-
const VU32 bo = ShiftRight<16>(BitCast(du32, b));
|
|
5420
|
-
return Add(Mul(ae, be), Mul(ao, bo));
|
|
5685
|
+
template <class DF, HWY_IF_F32_D(DF),
|
|
5686
|
+
class VBF = VFromD<Repartition<hwy::bfloat16_t, DF>>>
|
|
5687
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
|
|
5688
|
+
const VFromD<DF> ae = PromoteEvenTo(df, a);
|
|
5689
|
+
const VFromD<DF> be = PromoteEvenTo(df, b);
|
|
5690
|
+
const VFromD<DF> ao = PromoteOddTo(df, a);
|
|
5691
|
+
const VFromD<DF> bo = PromoteOddTo(df, b);
|
|
5692
|
+
return MulAdd(ae, be, Mul(ao, bo));
|
|
5693
|
+
}
|
|
5694
|
+
|
|
5695
|
+
template <class D, HWY_IF_UI32_D(D), class V16 = VFromD<RepartitionToNarrow<D>>>
|
|
5696
|
+
HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
|
|
5697
|
+
return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
|
|
5698
|
+
Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
|
|
5421
5699
|
}
|
|
5422
5700
|
|
|
5423
5701
|
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
|
5424
5702
|
|
|
5425
5703
|
namespace detail {
|
|
5426
5704
|
|
|
5427
|
-
// Non-overloaded wrapper function so we can define DF32 in template args.
|
|
5428
|
-
template <size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
|
|
5429
|
-
class VF32 = VFromD<DF32>,
|
|
5430
|
-
class DBF16 = Repartition<hwy::bfloat16_t, Simd<float, N, kPow2>>>
|
|
5431
|
-
HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
|
|
5432
|
-
VFromD<DBF16> a, VFromD<DBF16> b,
|
|
5433
|
-
const VF32 sum0, VF32& sum1) {
|
|
5434
|
-
const RebindToUnsigned<DF32> du32;
|
|
5435
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
5436
|
-
const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
|
|
5437
|
-
// Using shift/and instead of Zip leads to the odd/even order that
|
|
5438
|
-
// RearrangeToOddPlusEven prefers.
|
|
5439
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
5440
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
5441
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
5442
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
5443
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
5444
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
5445
|
-
}
|
|
5446
|
-
|
|
5447
5705
|
#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
|
|
5448
5706
|
SHIFT, MLEN, NAME, OP) \
|
|
5449
5707
|
template <size_t N> \
|
|
@@ -5519,21 +5777,15 @@ HWY_API VFromD<D32> ReorderWidenMulAccumulateU16(D32 d32, VFromD<D16> a,
|
|
|
5519
5777
|
|
|
5520
5778
|
} // namespace detail
|
|
5521
5779
|
|
|
5522
|
-
template <
|
|
5523
|
-
HWY_API VW ReorderWidenMulAccumulate(
|
|
5524
|
-
|
|
5525
|
-
return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
|
|
5526
|
-
}
|
|
5527
|
-
|
|
5528
|
-
template <size_t N, int kPow2, class VN, class VW>
|
|
5529
|
-
HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
|
|
5530
|
-
const VW sum0, VW& sum1) {
|
|
5780
|
+
template <class D, HWY_IF_I32_D(D), class VN, class VW>
|
|
5781
|
+
HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
|
|
5782
|
+
VW& sum1) {
|
|
5531
5783
|
return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
|
|
5532
5784
|
}
|
|
5533
5785
|
|
|
5534
|
-
template <
|
|
5535
|
-
HWY_API VW ReorderWidenMulAccumulate(
|
|
5536
|
-
|
|
5786
|
+
template <class D, HWY_IF_U32_D(D), class VN, class VW>
|
|
5787
|
+
HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
|
|
5788
|
+
VW& sum1) {
|
|
5537
5789
|
return detail::ReorderWidenMulAccumulateU16(d32, a, b, sum0, sum1);
|
|
5538
5790
|
}
|
|
5539
5791
|
|
|
@@ -5601,6 +5853,40 @@ HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
|
|
|
5601
5853
|
}
|
|
5602
5854
|
|
|
5603
5855
|
// ------------------------------ Lt128
|
|
5856
|
+
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
|
|
5857
|
+
|
|
5858
|
+
template <class D>
|
|
5859
|
+
HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
|
|
5860
|
+
static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
|
|
5861
|
+
// The subsequent computations are performed using e8mf8 (8-bit elements with
|
|
5862
|
+
// a fractional LMUL of 1/8) for the following reasons:
|
|
5863
|
+
// 1. It is correct for the possible input vector types e64m<1,2,4,8>. This is
|
|
5864
|
+
// because the resulting mask can occupy at most 1/8 of a full vector when
|
|
5865
|
+
// using e64m8.
|
|
5866
|
+
// 2. It can be more efficient than using a full vector or a vector group.
|
|
5867
|
+
//
|
|
5868
|
+
// The algorithm computes the result as follows:
|
|
5869
|
+
// 1. Compute cH | (=H & cL) in the high bits, where cH and cL represent the
|
|
5870
|
+
// comparison results for the high and low 64-bit elements, respectively.
|
|
5871
|
+
// 2. Shift the result right by 1 to duplicate the comparison results for the
|
|
5872
|
+
// low bits.
|
|
5873
|
+
// 3. Obtain the final result by performing a bitwise OR on the high and low
|
|
5874
|
+
// bits.
|
|
5875
|
+
auto du8mf8 = ScalableTag<uint8_t, -3>{};
|
|
5876
|
+
const vuint8mf8_t ltHL0 =
|
|
5877
|
+
detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Lt(a, b)));
|
|
5878
|
+
const vuint8mf8_t eqHL0 =
|
|
5879
|
+
detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
|
|
5880
|
+
const vuint8mf8_t ltLx0 = Add(ltHL0, ltHL0);
|
|
5881
|
+
const vuint8mf8_t resultHx = detail::AndS(OrAnd(ltHL0, ltLx0, eqHL0), 0xaa);
|
|
5882
|
+
const vuint8mf8_t resultxL = ShiftRight<1>(resultHx);
|
|
5883
|
+
const vuint8mf8_t result = Or(resultHx, resultxL);
|
|
5884
|
+
auto du8m1 = ScalableTag<uint8_t>{};
|
|
5885
|
+
return detail::U8MaskBitsVecToMask(d, detail::ChangeLMUL(du8m1, result));
|
|
5886
|
+
}
|
|
5887
|
+
|
|
5888
|
+
#else
|
|
5889
|
+
|
|
5604
5890
|
template <class D>
|
|
5605
5891
|
HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
|
|
5606
5892
|
static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
|
|
@@ -5626,6 +5912,8 @@ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
|
|
|
5626
5912
|
return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
|
|
5627
5913
|
}
|
|
5628
5914
|
|
|
5915
|
+
#endif // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
|
|
5916
|
+
|
|
5629
5917
|
// ------------------------------ Lt128Upper
|
|
5630
5918
|
template <class D>
|
|
5631
5919
|
HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
|