@img/sharp-libvips-dev 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +2 -2
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -30
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +218 -6
- package/include/hwy/base.h +1935 -512
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +105 -10
- package/include/hwy/detect_targets.h +146 -37
- package/include/hwy/foreach_target.h +36 -1
- package/include/hwy/highway.h +222 -50
- package/include/hwy/ops/arm_neon-inl.h +2055 -894
- package/include/hwy/ops/arm_sve-inl.h +1476 -348
- package/include/hwy/ops/emu128-inl.h +711 -623
- package/include/hwy/ops/generic_ops-inl.h +4431 -2157
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
- package/include/hwy/ops/rvv-inl.h +1556 -536
- package/include/hwy/ops/scalar-inl.h +353 -233
- package/include/hwy/ops/set_macros-inl.h +171 -23
- package/include/hwy/ops/shared-inl.h +198 -56
- package/include/hwy/ops/wasm_128-inl.h +283 -244
- package/include/hwy/ops/x86_128-inl.h +3673 -1357
- package/include/hwy/ops/x86_256-inl.h +1737 -663
- package/include/hwy/ops/x86_512-inl.h +1697 -500
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +40 -32
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +16 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +8 -3
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +73 -58
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -5
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +2 -2
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +8 -3
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +1 -11
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +18 -19
The hunks below are from package/include/hwy/ops/rvv-inl.h.

```diff
@@ -339,8 +339,11 @@ namespace detail {  // for code folding
 // Full support for f16 in all ops
 #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
   HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
+// Only BF16 is emulated.
+#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
 #else
 #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
+#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
 #endif
 #define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
   HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
@@ -389,15 +392,11 @@ namespace detail {  // for code folding
 // For all combinations of SEW:
 #define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
   HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
+  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
 
 #define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
   HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
+  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
 
 #define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
   HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
@@ -409,8 +408,7 @@ namespace detail {  // for code folding
   HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
 
 #define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
+  HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
   HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
 
 // Assemble types for use in x-macros
```
```diff
@@ -438,22 +436,134 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
 // ------------------------------ Lanes
 
 // WARNING: we want to query VLMAX/sizeof(T), but this may actually change VL!
-
-
-
-
-
-
-  /* If
-
-
-  }
-
-
-
-
-
-
+
+#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
+// HWY_RVV_CAPPED_LANES_SPECIAL_CASES provides some additional optimizations
+// to CappedLanes in non-debug builds
+#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
+  if (__builtin_constant_p(cap >= kMaxLanes) && (cap >= kMaxLanes)) { \
+    /* If cap is known to be greater than or equal to MaxLanes(d), */ \
+    /* HWY_MIN(cap, Lanes(d)) will be equal to Lanes(d) */ \
+    return Lanes(d); \
+  } \
+  \
+  if ((__builtin_constant_p((cap & (cap - 1)) == 0) && \
+       ((cap & (cap - 1)) == 0)) || \
+      (__builtin_constant_p(cap <= HWY_MAX(kMinLanesPerFullVec, 4)) && \
+       (cap <= HWY_MAX(kMinLanesPerFullVec, 4)))) { \
+    /* If cap is known to be a power of 2, then */ \
+    /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
+    /* result as HWY_MIN(cap, Lanes(d)) as kMaxLanes is a power of 2 and */ \
+    /* as (cap > VLMAX && cap < 2 * VLMAX) can only be true if cap is not a */ \
+    /* power of 2 since VLMAX is always a power of 2 */ \
+    \
+    /* If cap is known to be less than or equal to 4, then */ \
+    /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
+    /* result as HWY_MIN(cap, Lanes(d)) as HWY_MIN(cap, kMaxLanes) <= 4 is */ \
+    /* true if cap <= 4 and as vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
+    /* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) */ \
+    /* if HWY_MIN(cap, kMaxLanes) <= 4 is true */ \
+    \
+    /* If cap is known to be less than or equal to kMinLanesPerFullVec, */ \
+    /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
+    /* same result as HWY_MIN(cap, Lanes(d)) as */ \
+    /* HWY_MIN(cap, kMaxLanes) <= kMinLanesPerFullVec is true if */ \
+    /* cap <= kMinLanesPerFullVec is true */ \
+    \
+    /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then either */ \
+    /* cap <= 4 or cap <= kMinLanesPerFullVec must be true */ \
+    \
+    /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is known to be true, */ \
+    /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
+    /* same result as HWY_MIN(cap, Lanes(d)) */ \
+    \
+    /* If no cap, avoid the HWY_MIN. */ \
+    return detail::IsFull(d) \
+               ? __riscv_vsetvl_e##SEW##LMUL(cap) \
+               : __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
+  }
+#else
+#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL)
+#endif
+
+#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                      MLEN, NAME, OP) \
+  template <size_t N> \
+  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
+    constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW)); \
+    constexpr size_t kCap = MaxLanes(d); \
+    /* If no cap, avoid generating a constant by using VLMAX. */ \
+    return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL() \
+                      : __riscv_vsetvl_e##SEW##LMUL(kCap); \
+  } \
+  template <size_t N> \
+  HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
+    /* NOTE: Section 6.3 of the RVV specification, which can be found at */ \
+    /* https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc, */ \
+    /* allows vsetvl to return a result less than Lanes(d) but greater than */ \
+    /* or equal to ((cap + 1) / 2) if */ \
+    /* (Lanes(d) > 2 && cap > HWY_MAX(Lanes(d), 4) && cap < (2 * Lanes(d))) */ \
+    /* is true */ \
+    \
+    /* VLMAX is the number of lanes in a vector of type */ \
+    /* VFromD<decltype(d)>, which is returned by */ \
+    /* Lanes(DFromV<VFromD<decltype(d)>>()) */ \
+    \
+    /* VLMAX is guaranteed to be a power of 2 under Section 2 of the RVV */ \
+    /* specification */ \
+    \
+    /* The VLMAX of a vector of type VFromD<decltype(d)> is at least 2 as */ \
+    /* the HWY_RVV target requires support for the RVV Zvl128b extension, */ \
+    /* which guarantees that vectors with LMUL=1 are at least 16 bytes */ \
+    \
+    /* If VLMAX == 2 is true, then vsetvl(cap) is equal to HWY_MIN(cap, 2) */ \
+    /* as cap == 3 is the only value such that */ \
+    /* (cap > VLMAX && cap < 2 * VLMAX) if VLMAX == 2 and as */ \
+    /* ((3 + 1) / 2) is equal to 2 */ \
+    \
+    /* If cap <= 4 is true, then vsetvl(cap) must be equal to */ \
+    /* HWY_MIN(cap, VLMAX) as cap <= VLMAX is true if VLMAX >= 4 is true */ \
+    /* and as vsetvl(cap) is guaranteed to be equal to HWY_MIN(cap, VLMAX) */ \
+    /* if VLMAX == 2 */ \
+    \
+    /* We want CappedLanes(d, cap) to return Lanes(d) if cap > Lanes(d) as */ \
+    /* LoadN(d, p, cap) expects to load exactly HWY_MIN(cap, Lanes(d)) */ \
+    /* lanes and StoreN(v, d, p, cap) expects to store exactly */ \
+    /* HWY_MIN(cap, Lanes(d)) lanes, even in the case where vsetvl returns */ \
+    /* a result that is less than HWY_MIN(cap, Lanes(d)) */ \
+    \
+    /* kMinLanesPerFullVec is the minimum value of VLMAX for a vector of */ \
+    /* type VFromD<decltype(d)> */ \
+    constexpr size_t kMinLanesPerFullVec = \
+        detail::ScaleByPower(16 / (SEW / 8), SHIFT); \
+    /* kMaxLanes is the maximum number of lanes returned by Lanes(d) */ \
+    constexpr size_t kMaxLanes = MaxLanes(d); \
+    \
+    HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
+    \
+    if (kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4)) { \
+      /* If kMaxLanes <= kMinLanesPerFullVec is true, then */ \
+      /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return */ \
+      /* HWY_MIN(cap, Lanes(d)) as */ \
+      /* HWY_MIN(cap, kMaxLanes) <= kMaxLanes <= VLMAX is true if */ \
+      /* kMaxLanes <= kMinLanesPerFullVec is true */ \
+      \
+      /* If kMaxLanes <= 4 is true, then vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
+      /* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) as */ \
+      /* HWY_MIN(cap, kMaxLanes) <= 4 is true if kMaxLanes <= 4 is true */ \
+      \
+      /* If kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then */ \
+      /* either kMaxLanes <= 4 or kMaxLanes <= kMinLanesPerFullVec must be */ \
+      /* true */ \
+      \
+      return __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
+    } else { \
+      /* If kMaxLanes > HWY_MAX(kMinLanesPerFullVec, 4) is true, need to */ \
+      /* obtain the actual number of lanes using Lanes(d) and clamp cap to */ \
+      /* the result of Lanes(d) */ \
+      const size_t actual = Lanes(d); \
+      return HWY_MIN(actual, cap); \
+    } \
   }
 
 #define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
```
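The new `Lanes`/`CappedLanes` pair above is what `LoadN`/`StoreN` use to clamp a caller-supplied lane count to `HWY_MIN(cap, Lanes(d))`. A minimal caller-side sketch of that contract, assuming Highway's documented static-dispatch API; the `DoubleArray` helper and its buffers are illustrative, not part of this package:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Doubles `count` floats; the tail iteration passes a remainder that may be
// smaller (or larger) than a full vector, and LoadN/StoreN transfer exactly
// HWY_MIN(remainder, Lanes(d)) lanes thanks to the CappedLanes clamping.
void DoubleArray(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
                 size_t count) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  const auto two = hn::Set(d, 2.0f);
  for (size_t i = 0; i < count; i += N) {
    const size_t remaining = count - i;  // may exceed N; clamped internally
    const auto v = hn::LoadN(d, in + i, remaining);
    hn::StoreN(hn::Mul(v, two), d, out + i, remaining);
  }
}
```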
```diff
@@ -480,18 +590,18 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
 
 HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
 HWY_RVV_FOREACH(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
-// If not already defined via HWY_RVV_FOREACH, define the overloads because
-// they do not require any new instruction.
-#if !HWY_HAVE_FLOAT16
-HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
-HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
-#endif
 #undef HWY_RVV_LANES
 #undef HWY_RVV_LANES_VIRT
+#undef HWY_RVV_CAPPED_LANES_SPECIAL_CASES
+
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API size_t Lanes(D /* tag*/) {
+  return Lanes(RebindToUnsigned<D>());
+}
 
-template <
-HWY_API size_t
-  return
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API size_t CappedLanes(D /* tag*/, size_t cap) {
+  return CappedLanes(RebindToUnsigned<D>(), cap);
 }
 
 // ------------------------------ Common x-macros
@@ -525,10 +635,20 @@ HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag*/) {
                    HWY_RVV_AVL(SEW, SHIFT)); \
   }
 
+// vector = f(vector, mask, vector, vector), e.g. MaskedAddOr
+#define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                            SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+      NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
+           HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(m, no, a, b, \
+                                                     HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
 // mask = f(mask)
-#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP)
-  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) {
-    return __riscv_vm##OP##_m_b##MLEN(m,
+#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
+    return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT)); \
   }
 
 // ================================================== INIT
@@ -549,21 +669,17 @@ HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
 
 // Treat bfloat16_t as int16_t (using the previously defined Set overloads);
 // required for Zero and VFromD.
-template <
-decltype(Set(
-
-  return Set(RebindToSigned<decltype(d)>(), arg.bits);
+template <class D, HWY_IF_BF16_D(D)>
+decltype(Set(RebindToSigned<D>(), 0)) Set(D d, hwy::bfloat16_t arg) {
+  return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
 }
 #if !HWY_HAVE_FLOAT16  // Otherwise already defined above.
 // WARNING: returns a different type than emulated bfloat16_t so that we can
 // implement PromoteTo overloads for both bfloat16_t and float16_t, and also
-// provide a Neg(float16_t) overload that coexists with Neg(int16_t).
-template <
-decltype(Set(
-
-  uint16_t bits;
-  CopySameSize(&arg, &bits);
-  return Set(RebindToUnsigned<decltype(d)>(), bits);
+// provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
+template <class D, HWY_IF_F16_D(D)>
+decltype(Set(RebindToUnsigned<D>(), 0)) Set(D d, hwy::float16_t arg) {
+  return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
 }
 #endif
 
@@ -642,16 +758,7 @@ HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
 HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
 #undef HWY_RVV_EXT_VIRT
 
-
-template <class D, HWY_IF_F16_D(D)>
-VFromD<D> Ext(D d, VFromD<Half<D>> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  const Half<decltype(du)> duh;
-  return BitCast(d, Ext(du, BitCast(duh, v)));
-}
-#endif
-
-template <class D, HWY_IF_BF16_D(D)>
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
 VFromD<D> Ext(D d, VFromD<Half<D>> v) {
   const RebindToUnsigned<decltype(d)> du;
   const Half<decltype(du)> duh;
@@ -767,10 +874,10 @@ HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
 HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
 HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
 #else
-template <
-HWY_INLINE VFromD<
-
-  return BitCastFromByte(
+template <class D, HWY_IF_F16_D(D)>
+HWY_INLINE VFromD<RebindToUnsigned<D>> BitCastFromByte(
+    D /* d */, VFromD<Repartition<uint8_t, D>> v) {
+  return BitCastFromByte(RebindToUnsigned<D>(), v);
 }
 #endif
 
@@ -781,10 +888,10 @@ HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
 #undef HWY_RVV_CAST_VIRT_U
 #undef HWY_RVV_CAST_VIRT_IF
 
-template <
-HWY_INLINE VFromD<
-
-  return BitCastFromByte(
+template <class D, HWY_IF_BF16_D(D)>
+HWY_INLINE VFromD<RebindToSigned<D>> BitCastFromByte(
+    D d, VFromD<Repartition<uint8_t, D>> v) {
+  return BitCastFromByte(RebindToSigned<decltype(d)>(), v);
 }
 
 }  // namespace detail
@@ -942,6 +1049,35 @@ HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, SubS, sub_vx, _ALL)
 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)
 
+// ------------------------------ Neg (ReverseSubS, Xor)
+
+template <class V, HWY_IF_SIGNED_V(V)>
+HWY_API V Neg(const V v) {
+  return detail::ReverseSubS(v, 0);
+}
+
+// vector = f(vector), but argument is repeated
+#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                           SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \
+                                                HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
+
+#if !HWY_HAVE_FLOAT16
+
+template <class V, HWY_IF_U16_D(DFromV<V>)>  // hwy::float16_t
+HWY_API V Neg(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using TU = TFromD<decltype(du)>;
+  return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
+}
+
+#endif  // !HWY_HAVE_FLOAT16
+
 // ------------------------------ SaturatedAdd
 
 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
@@ -1048,7 +1184,7 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)
 #undef HWY_RVV_SHIFT
 
 // ------------------------------ SumsOf8 (ShiftRight, Add)
-template <class VU8>
+template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
 HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
   const DFromV<VU8> du8;
   const RepartitionToWide<decltype(du8)> du16;
@@ -1071,13 +1207,42 @@ HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
   return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
 }
 
+template <class VI8, HWY_IF_I8_D(DFromV<VI8>)>
+HWY_API VFromD<Repartition<int64_t, DFromV<VI8>>> SumsOf8(const VI8 v) {
+  const DFromV<VI8> di8;
+  const RepartitionToWide<decltype(di8)> di16;
+  const RepartitionToWide<decltype(di16)> di32;
+  const RepartitionToWide<decltype(di32)> di64;
+  const RebindToUnsigned<decltype(di32)> du32;
+  const RebindToUnsigned<decltype(di64)> du64;
+  using VI16 = VFromD<decltype(di16)>;
+
+  const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
+  const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
+  const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
+
+  const VI16 sDC_zz_98_zz_54_zz_10_zz =
+      BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
+  const VI16 sFC_xx_B8_xx_74_xx_30_xx =
+      Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
+  const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
+      BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
+  const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
+      Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
+  return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
+}
+
 // ------------------------------ RotateRight
-template <int kBits, class V>
+template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
 HWY_API V RotateRight(const V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-
+
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
 
```
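The added overload extends `SumsOf8` to signed inputs: each 64-bit output lane is the sum of the corresponding group of eight consecutive 8-bit lanes. A small caller-side sketch of what that means, assuming Highway's public API; `SumBytes` is an illustrative helper, not part of this package:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Sums eight consecutive int8_t values via the new signed SumsOf8 overload.
int64_t SumBytes(const int8_t* HWY_RESTRICT p) {
  const hn::FixedTag<int8_t, 8> d8;  // exactly one group of 8 lanes
  const auto v = hn::LoadU(d8, p);
  // One int64_t lane per group of 8 int8_t lanes: p[0] + ... + p[7].
  return hn::GetLane(hn::SumsOf8(v));
}
```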
```diff
@@ -1158,15 +1323,8 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
 
 // ------------------------------ MulHigh
 
-// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
-// Used by MulEven; vwmul does not work for m8.
-namespace detail {
 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
-}  // namespace detail
-
-HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
-HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
 
 // ------------------------------ MulFixedPoint15
 
@@ -1184,8 +1342,57 @@ HWY_RVV_FOREACH_I16(HWY_RVV_MUL15, MulFixedPoint15, smul, _ALL)
 #undef HWY_RVV_MUL15
 
 // ------------------------------ Div
+#ifdef HWY_NATIVE_INT_DIV
+#undef HWY_NATIVE_INT_DIV
+#else
+#define HWY_NATIVE_INT_DIV
+#endif
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Div, divu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Div, div, _ALL)
 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)
 
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Mod, remu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Mod, rem, _ALL)
+
+// ------------------------------ MaskedAddOr etc.
+
+#ifdef HWY_NATIVE_MASKED_ARITH
+#undef HWY_NATIVE_MASKED_ARITH
+#else
+#define HWY_NATIVE_MASKED_ARITH
+#endif
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMinOr, minu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMinOr, min, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMinOr, fmin, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, maxu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, max, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, fmax, _ALL)
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedAddOr, add, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedAddOr, fadd, _ALL)
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedSubOr, sub, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedSubOr, fsub, _ALL)
+
+HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedMulOr, mul, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMulOr, fmul, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedDivOr, divu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedDivOr, div, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedDivOr, fdiv, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedModOr, remu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedModOr, rem, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, saddu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, sadd, _ALL)
+
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssubu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssub, _ALL)
+
 // ------------------------------ ApproximateReciprocal
 #ifdef HWY_NATIVE_F64_APPROX_RECIP
 #undef HWY_NATIVE_F64_APPROX_RECIP
```
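These lines give the RVV target native integer `Div`/`Mod` and the `Masked*Or` family, where `MaskedOpOr(no, m, a, b)` computes `op(a, b)` in lanes where `m` is set and returns `no` elsewhere. A hedged usage sketch, assuming Highway's public API; the `SafeDiv` helper is illustrative, not part of this package:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Integer division that yields 0 instead of dividing where the divisor is 0.
template <class D>
hn::VFromD<D> SafeDiv(D d, hn::VFromD<D> num, hn::VFromD<D> den) {
  const auto nonzero = hn::Ne(den, hn::Zero(d));
  // Lanes with den == 0 are masked off and receive the fallback Zero(d).
  return hn::MaskedDivOr(hn::Zero(d), nonzero, num, den);
}
```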
```diff
@@ -1247,26 +1454,6 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
 // vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
 // of all bits; SEW=8 / LMUL=4 = half of all bits.
 
-// SFINAE for mapping Simd<> to MLEN (up to 64).
-#define HWY_RVV_IF_MLEN_D(D, MLEN) \
-  hwy::EnableIf<MLenFromD(D()) == MLEN>* = nullptr
-
-// Specialized for RVV instead of the generic test_util-inl.h implementation
-// because more efficient, and helps implement MFromD.
-
-#define HWY_RVV_MASK_FALSE(SEW, SHIFT, MLEN, NAME, OP) \
-  template <class D, HWY_RVV_IF_MLEN_D(D, MLEN)> \
-  HWY_API HWY_RVV_M(MLEN) NAME(D d) { \
-    return __riscv_vm##OP##_m_b##MLEN(Lanes(d)); \
-  }
-
-HWY_RVV_FOREACH_B(HWY_RVV_MASK_FALSE, MaskFalse, clr)
-#undef HWY_RVV_MASK_FALSE
-#undef HWY_RVV_IF_MLEN_D
-
-template <class D>
-using MFromD = decltype(MaskFalse(D()));
-
 // mask = f(vector, vector)
 #define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
@@ -1405,11 +1592,49 @@ HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
 #undef HWY_RVV_IF_THEN_ZERO_ELSE
 
 // ------------------------------ MaskFromVec
+
+template <class D>
+using MFromD = decltype(Eq(Zero(D()), Zero(D())));
+
 template <class V>
 HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
   return detail::NeS(v, 0);
 }
 
+// ------------------------------ IsNegative (MFromD)
+#ifdef HWY_NATIVE_IS_NEGATIVE
+#undef HWY_NATIVE_IS_NEGATIVE
+#else
+#define HWY_NATIVE_IS_NEGATIVE
+#endif
+
+// Generic for all vector lengths
+template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+HWY_API MFromD<DFromV<V>> IsNegative(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  using TI = TFromD<decltype(di)>;
+
+  return detail::LtS(BitCast(di, v), static_cast<TI>(0));
+}
+
+// ------------------------------ MaskFalse
+
+// For mask ops including vmclr, elements past VL are tail-agnostic and cannot
+// be relied upon, so define a variant of the generic_ops-inl implementation of
+// MaskFalse that ensures all bits are zero as required by mask_test.
+#ifdef HWY_NATIVE_MASK_FALSE
+#undef HWY_NATIVE_MASK_FALSE
+#else
+#define HWY_NATIVE_MASK_FALSE
+#endif
+
+template <class D>
+HWY_API MFromD<D> MaskFalse(D d) {
+  const DFromV<VFromD<decltype(d)>> d_full;
+  return MaskFromVec(Zero(d_full));
+}
+
 // ------------------------------ RebindMask
 template <class D, typename MFrom>
 HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
```
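`IsNegative` returns a mask of lanes whose sign bit is set and now backs `IfNegativeThenElse`; the target-specific `ZeroIfNegative` definition is removed below in favor of the generic one. A brief caller-side sketch of the equivalent pattern, assuming Highway's public API; `ClampNegativeToZero` is illustrative:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Replaces negative lanes with zero, leaving other lanes unchanged.
template <class V>
V ClampNegativeToZero(V v) {
  const hn::DFromV<V> d;
  // yes = Zero(d) where v is negative, otherwise no = v.
  return hn::IfNegativeThenElse(v, hn::Zero(d), v);
}
```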
```diff
@@ -1427,10 +1652,12 @@ HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
   template <size_t N> \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
       NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) { \
-
+    /* MaskFalse requires we set all lanes for capped d and virtual LMUL. */ \
+    const DFromV<VFromD<decltype(d)>> d_full; \
+    const RebindToSigned<decltype(d_full)> di; \
     using TI = TFromD<decltype(di)>; \
-    return BitCast(
-
+    return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, \
+                                                        Lanes(d_full))); \
   }
 
 HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, VecFromMask, merge_vxm, _ALL_VIRT)
@@ -1448,14 +1675,8 @@ HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
   return IfThenElse(MaskFromVec(mask), yes, no);
 }
 
-// ------------------------------ ZeroIfNegative
-template <class V>
-HWY_API V ZeroIfNegative(const V v) {
-  return IfThenZeroElse(detail::LtS(v, 0), v);
-}
-
 // ------------------------------ BroadcastSignBit
-template <class V>
+template <class V, HWY_IF_SIGNED_V(V)>
 HWY_API V BroadcastSignBit(const V v) {
   return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
 }
@@ -1464,11 +1685,7 @@ HWY_API V BroadcastSignBit(const V v) {
 template <class V>
 HWY_API V IfNegativeThenElse(V v, V yes, V no) {
   static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
-
-  const RebindToSigned<decltype(d)> di;
-
-  MFromD<decltype(d)> m = detail::LtS(BitCast(di, v), 0);
-  return IfThenElse(m, yes, no);
+  return IfThenElse(IsNegative(v), yes, no);
 }
 
 // ------------------------------ FindFirstTrue
@@ -1518,6 +1735,38 @@ HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
 HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
 #undef HWY_RVV_COUNT_TRUE
 
+// ------------------------------ PromoteMaskTo
+
+#ifdef HWY_NATIVE_PROMOTE_MASK_TO
+#undef HWY_NATIVE_PROMOTE_MASK_TO
+#else
+#define HWY_NATIVE_PROMOTE_MASK_TO
+#endif
+
+template <class DTo, class DFrom,
+          HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
+          hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
+HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+                                  MFromD<DFrom> m) {
+  return m;
+}
+
+// ------------------------------ DemoteMaskTo
+
+#ifdef HWY_NATIVE_DEMOTE_MASK_TO
+#undef HWY_NATIVE_DEMOTE_MASK_TO
+#else
+#define HWY_NATIVE_DEMOTE_MASK_TO
+#endif
+
+template <class DTo, class DFrom,
+          HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
+          hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
+HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+                                 MFromD<DFrom> m) {
+  return m;
+}
+
 // ================================================== MEMORY
 
 // ------------------------------ Load
```
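Because RVV masks are keyed only by MLEN, promoting or demoting a mask between descriptors with the same lane count is a pass-through, which is what the two overloads above express. A sketch of where that is useful, assuming Highway's public API; the function and buffer names are illustrative, not part of this package:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Doubles y[i] wherever the corresponding float x[i] is positive.
void DoubleWherePositive(const float* HWY_RESTRICT x, double* HWY_RESTRICT y,
                         size_t count) {
  const hn::ScalableTag<double> d64;
  const hn::Rebind<float, decltype(d64)> d32;  // same lane count as d64
  const size_t N = hn::Lanes(d64);
  for (size_t i = 0; i + N <= count; i += N) {
    const auto m32 = hn::Gt(hn::LoadU(d32, x + i), hn::Zero(d32));
    const auto m64 = hn::PromoteMaskTo(d64, d32, m32);  // pass-through on RVV
    const auto v = hn::LoadU(d64, y + i);
    hn::StoreU(hn::MaskedMulOr(v, m64, v, hn::Set(d64, 2.0)), d64, y + i);
  }
}
```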
```diff
@@ -1528,47 +1777,18 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
       NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
            const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-    using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
     return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
-
+        detail::NativeLanePointer(p), Lanes(d)); \
   }
 HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
 #undef HWY_RVV_LOAD
 
-
-
-
-
-  return Load(RebindToSigned<decltype(d)>(),
-              reinterpret_cast<const int16_t * HWY_RESTRICT>(p));
-}
-
-template <size_t N, int kPow2>
-HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,
-                   Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
-  Store(v, RebindToSigned<decltype(d)>(),
-        reinterpret_cast<int16_t * HWY_RESTRICT>(p));
-}
-
-#if !HWY_HAVE_FLOAT16  // Otherwise already defined above.
-
-// NOTE: different type for float16_t than bfloat16_t, see Set().
-template <size_t N, int kPow2>
-HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(Simd<float16_t, N, kPow2> d,
-                                              const float16_t* HWY_RESTRICT p) {
-  return Load(RebindToUnsigned<decltype(d)>(),
-              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
-}
-
-template <size_t N, int kPow2>
-HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
-                   Simd<float16_t, N, kPow2> d, float16_t* HWY_RESTRICT p) {
-  Store(v, RebindToUnsigned<decltype(d)>(),
-        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Load(du, detail::U16LanePointer(p)));
 }
 
-#endif  // !HWY_HAVE_FLOAT16
-
 // ------------------------------ LoadU
 template <class D>
 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
@@ -1584,23 +1804,37 @@ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
       NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
            const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-    using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
     return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
-        m, Zero(d),
+        m, Zero(d), detail::NativeLanePointer(p), Lanes(d)); \
   } \
   template <size_t N> \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
       NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
                HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-    using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
     return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
-        m, v,
+        m, v, detail::NativeLanePointer(p), Lanes(d)); \
   }
 
 HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
 #undef HWY_RVV_MASKED_LOAD
 
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
+                             const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d,
+                 MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
+}
+
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> MaskedLoadOr(VFromD<D> no, MFromD<D> m, D d,
+                               const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, MaskedLoadOr(BitCast(du, no), RebindMask(du, m), du,
+                                 detail::U16LanePointer(p)));
+}
+
 // ------------------------------ LoadN
 
 // Native with avl is faster than the generic_ops using FirstN.
@@ -1616,29 +1850,41 @@ HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
       NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
            const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
-    using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
     /* Use a tail-undisturbed load in LoadN as the tail-undisturbed load */ \
     /* operation below will leave any lanes past the first */ \
     /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes unchanged */ \
     return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
-        Zero(d),
+        Zero(d), detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
   } \
   template <size_t N> \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME##Or( \
       HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
       const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
-    using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
     /* Use a tail-undisturbed load in LoadNOr as the tail-undisturbed load */ \
     /* operation below will set any lanes past the first */ \
     /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes to the */ \
     /* corresponding lanes in no */ \
     return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
-        no,
+        no, detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
   }
 
 HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
 #undef HWY_RVV_LOADN
 
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
+                        size_t num_lanes) {
+  const RebindToUnsigned<D> du;
+  return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
+}
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API VFromD<D> LoadNOr(VFromD<D> v, D d, const TFromD<D>* HWY_RESTRICT p,
+                          size_t num_lanes) {
+  const RebindToUnsigned<D> du;
+  return BitCast(
+      d, LoadNOr(BitCast(du, v), du, detail::U16LanePointer(p), num_lanes));
+}
+
 // ------------------------------ Store
 
 #define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
@@ -1647,13 +1893,18 @@ HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
   HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
                     HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                     HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-
-
-        v, Lanes(d)); \
+    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
+        detail::NativeLanePointer(p), v, Lanes(d)); \
   }
 HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
 #undef HWY_RVV_STORE
 
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  Store(BitCast(du, v), du, detail::U16LanePointer(p));
+}
+
 // ------------------------------ BlendedStore
 
 #define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
@@ -1662,13 +1913,20 @@ HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
   HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
                     HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                     HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
-    using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
     return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m( \
-        m,
+        m, detail::NativeLanePointer(p), v, Lanes(d)); \
   }
 HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
 #undef HWY_RVV_BLENDED_STORE
 
+template <class D, HWY_RVV_IF_EMULATED_D(D)>
+HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
+                          TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  BlendedStore(BitCast(du, v), RebindMask(du, m), du,
+               detail::U16LanePointer(p));
+}
+
 // ------------------------------ StoreN
 
 namespace detail {
```
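Together with the earlier `HWY_RVV_IF_EMULATED_D` overloads, these definitions route loads and stores of `hwy::bfloat16_t` (and `float16_t` where not native) through `uint16_t` vectors, so ordinary calls keep working. A caller-side sketch, assuming Highway's public API; `CopyBf16` is an illustrative helper, not part of this package:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Copies bf16 data; full vectors use LoadU/StoreU, the tail uses LoadN/StoreN.
void CopyBf16(const hwy::bfloat16_t* HWY_RESTRICT in,
              hwy::bfloat16_t* HWY_RESTRICT out, size_t count) {
  const hn::ScalableTag<hwy::bfloat16_t> d;
  const size_t N = hn::Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    hn::StoreU(hn::LoadU(d, in + i), d, out + i);
  }
  if (i < count) {
    hn::StoreN(hn::LoadN(d, in + i, count - i), d, out + i, count - i);
  }
}
```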
@@ -1679,13 +1937,18 @@ namespace detail {
|
|
|
1679
1937
|
HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
|
|
1680
1938
|
HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
|
|
1681
1939
|
HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
v, count); \
|
|
1940
|
+
return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
|
|
1941
|
+
detail::NativeLanePointer(p), v, count); \
|
|
1685
1942
|
}
|
|
1686
1943
|
HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
|
|
1687
1944
|
#undef HWY_RVV_STOREN
|
|
1688
1945
|
|
|
1946
|
+
template <class D, HWY_RVV_IF_EMULATED_D(D)>
|
|
1947
|
+
HWY_API void StoreN(size_t count, VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
1948
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1949
|
+
StoreN(count, BitCast(du, v), du, detail::U16LanePointer(p));
|
|
1950
|
+
}
|
|
1951
|
+
|
|
1689
1952
|
} // namespace detail
|
|
1690
1953
|
|
|
1691
1954
|
#ifdef HWY_NATIVE_STORE_N
|
|
@@ -1694,13 +1957,12 @@ HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
|
|
|
1694
1957
|
#define HWY_NATIVE_STORE_N
|
|
1695
1958
|
#endif
|
|
1696
1959
|
|
|
1697
|
-
template <class D
|
|
1698
|
-
|
|
1699
|
-
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
|
|
1960
|
+
template <class D>
|
|
1961
|
+
HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
|
|
1700
1962
|
size_t max_lanes_to_store) {
|
|
1701
|
-
// NOTE: Need to
|
|
1702
|
-
//
|
|
1703
|
-
//
|
|
1963
|
+
// NOTE: Need to clamp max_lanes_to_store to Lanes(d), even if
|
|
1964
|
+
// MaxLanes(d) >= MaxLanes(DFromV<VFromD<D>>()) is true, as it is possible for
|
|
1965
|
+
// detail::StoreN(max_lanes_to_store, v, d, p) to store fewer than
|
|
1704
1966
|
// Lanes(DFromV<VFromD<D>>()) lanes to p if
|
|
1705
1967
|
// max_lanes_to_store > Lanes(DFromV<VFromD<D>>()) and
|
|
1706
1968
|
// max_lanes_to_store < 2 * Lanes(DFromV<VFromD<D>>()) are both true.
|
|
@@ -1709,21 +1971,7 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
|
|
|
1709
1971
|
// if Lanes(d) < Lanes(DFromV<VFromD<D>>()) is true, which is possible if
|
|
1710
1972
|
// MaxLanes(d) < MaxLanes(DFromV<VFromD<D>>()) or
|
|
1711
1973
|
// d.Pow2() < DFromV<VFromD<D>>().Pow2() is true.
|
|
1712
|
-
|
|
1713
|
-
detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d, p);
|
|
1714
|
-
}
|
|
1715
|
-
|
|
1716
|
-
// StoreN for BF16/F16 vectors
|
|
1717
|
-
template <class D, typename T = TFromD<D>,
|
|
1718
|
-
hwy::EnableIf<!hwy::IsSame<T, TFromV<VFromD<D>>>()>* = nullptr,
|
|
1719
|
-
HWY_IF_SPECIAL_FLOAT(T)>
|
|
1720
|
-
HWY_API void StoreN(VFromD<D> v, D /*d*/, T* HWY_RESTRICT p,
|
|
1721
|
-
size_t max_lanes_to_store) {
|
|
1722
|
-
using TStore = TFromV<VFromD<D>>;
|
|
1723
|
-
const Rebind<TStore, D> d_store;
|
|
1724
|
-
const size_t N = Lanes(d_store);
|
|
1725
|
-
detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d_store,
|
|
1726
|
-
reinterpret_cast<TStore * HWY_RESTRICT>(p));
|
|
1974
|
+
detail::StoreN(CappedLanes(d, max_lanes_to_store), v, d, p);
|
|
1727
1975
|
}
|
|
1728
1976
|
|
|
1729
1977
|
// ------------------------------ StoreU
|
|
@@ -1747,17 +1995,16 @@ HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
|
|
|
1747
1995
|
#define HWY_NATIVE_SCATTER
|
|
1748
1996
|
#endif
|
|
1749
1997
|
|
|
1750
|
-
#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,
|
|
1751
|
-
SHIFT, MLEN, NAME, OP)
|
|
1752
|
-
template <size_t N>
|
|
1753
|
-
HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,
|
|
1754
|
-
HWY_RVV_D(BASE, SEW, N, SHIFT) d,
|
|
1755
|
-
HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,
|
|
1756
|
-
HWY_RVV_V(int, SEW, LMUL) offset) {
|
|
1757
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
reinterpret_cast<T*>(base), BitCast(du, offset), v, Lanes(d)); \
|
|
1998
|
+
#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
|
|
1999
|
+
SHIFT, MLEN, NAME, OP) \
|
|
2000
|
+
template <size_t N> \
|
|
2001
|
+
HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
|
|
2002
|
+
HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
|
|
2003
|
+
HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
|
|
2004
|
+
HWY_RVV_V(int, SEW, LMUL) offset) { \
|
|
2005
|
+
const RebindToUnsigned<decltype(d)> du; \
|
|
2006
|
+
return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
|
|
2007
|
+
detail::NativeLanePointer(base), BitCast(du, offset), v, Lanes(d)); \
|
|
1761
2008
|
}
|
|
1762
2009
|
HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
|
|
1763
2010
|
#undef HWY_RVV_SCATTER
|
|
@@ -1772,19 +2019,18 @@ HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
|
|
|
1772
2019
|
|
|
1773
2020
|
// ------------------------------ MaskedScatterIndex
|
|
1774
2021
|
|
|
1775
|
-
#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD,
|
|
1776
|
-
LMULH, SHIFT, MLEN, NAME, OP)
|
|
1777
|
-
template <size_t N>
|
|
1778
|
-
HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m,
|
|
1779
|
-
HWY_RVV_D(BASE, SEW, N, SHIFT) d,
|
|
1780
|
-
HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,
|
|
1781
|
-
HWY_RVV_V(int, SEW, LMUL) indices) {
|
|
1782
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1786
|
-
|
|
1787
|
-
v, Lanes(d)); \
|
|
2022
|
+
#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
|
|
2023
|
+
LMULH, SHIFT, MLEN, NAME, OP) \
|
|
2024
|
+
template <size_t N> \
|
|
2025
|
+
HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
|
|
2026
|
+
HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
|
|
2027
|
+
HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
|
|
2028
|
+
HWY_RVV_V(int, SEW, LMUL) indices) { \
|
|
2029
|
+
const RebindToUnsigned<decltype(d)> du; \
|
|
2030
|
+
constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>)); \
|
|
2031
|
+
return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \
|
|
2032
|
+
m, detail::NativeLanePointer(base), \
|
|
2033
|
+
ShiftLeft<kBits>(BitCast(du, indices)), v, Lanes(d)); \
|
|
1788
2034
|
}
|
|
1789
2035
|
HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
|
|
1790
2036
|
#undef HWY_RVV_MASKED_SCATTER
|
|
@@ -1805,9 +2051,8 @@ HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
|
|
|
1805
2051
|
const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
|
|
1806
2052
|
HWY_RVV_V(int, SEW, LMUL) offset) { \
|
|
1807
2053
|
const RebindToUnsigned<decltype(d)> du; \
|
|
1808
|
-
using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
|
|
1809
2054
|
return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
|
|
1810
|
-
|
|
2055
|
+
detail::NativeLanePointer(base), BitCast(du, offset), Lanes(d)); \
|
|
1811
2056
|
}
|
|
1812
2057
|
HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
|
|
1813
2058
|
#undef HWY_RVV_GATHER
|
|
@@ -1821,25 +2066,34 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
 return GatherOffset(d, base, ShiftLeft<kBits>(index));
 }

-// ------------------------------
+// ------------------------------ MaskedGatherIndexOr

 #define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
 SHIFT, MLEN, NAME, OP) \
 template <size_t N> \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
-NAME(
+NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
+HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
 HWY_RVV_V(int, SEW, LMUL) indices) { \
 const RebindToUnsigned<decltype(d)> du; \
-
+const RebindToSigned<decltype(d)> di; \
+(void)di; /* for HWY_DASSERT */ \
 constexpr size_t kBits = CeilLog2(SEW / 8); \
+HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_mu( \
-m,
+m, no, detail::NativeLanePointer(base), \
 ShiftLeft<kBits>(BitCast(du, indices)), Lanes(d)); \
 }
-HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER,
+HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER, MaskedGatherIndexOr, lux, _ALL_VIRT)
 #undef HWY_RVV_MASKED_GATHER

+template <class D>
+HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d, const TFromD<D>* base,
+VFromD<RebindToSigned<D>> indices) {
+return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
+}
+
 // ================================================== CONVERT

 // ------------------------------ PromoteTo
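The new MaskedGatherIndexOr takes an explicit fallback vector for masked-off lanes, and MaskedGatherIndex above forwards to it with Zero(d). A minimal, hypothetical usage sketch (not part of the diff; assumes idx holds Lanes(d) non-negative, in-range indices):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Gather table[idx[i]] for the first `num` lanes; remaining lanes are zero.
void GatherPrefix(const float* HWY_RESTRICT table,
                  const int32_t* HWY_RESTRICT idx, size_t num,
                  float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const hn::RebindToSigned<decltype(d)> di;
  const auto indices = hn::LoadU(di, idx);
  const auto m = hn::FirstN(d, num);
  hn::StoreU(hn::MaskedGatherIndex(m, d, table, indices), d, out);
}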
@@ -1952,52 +2206,38 @@ HWY_API auto PromoteTo(Simd<uint64_t, N, -1> d,
 }

 // Unsigned to signed: cast for unsigned promote.
-template <
-HWY_API
-VFromD<Rebind<uint8_t, decltype(d)>> v)
--> VFromD<decltype(d)> {
+template <class D, HWY_IF_I16_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
 }

-template <
-HWY_API
-VFromD<Rebind<uint8_t, decltype(d)>> v)
--> VFromD<decltype(d)> {
+template <class D, HWY_IF_I32_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
 }

-template <
-HWY_API
-VFromD<Rebind<uint16_t, decltype(d)>> v)
--> VFromD<decltype(d)> {
+template <class D, HWY_IF_I32_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
 }

-template <
-HWY_API
-VFromD<Rebind<uint32_t, decltype(d)>> v)
--> VFromD<decltype(d)> {
+template <class D, HWY_IF_I64_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
 }

-template <
-HWY_API
-VFromD<Rebind<uint16_t, decltype(d)>> v)
--> VFromD<decltype(d)> {
+template <class D, HWY_IF_I64_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
 }

-template <
-HWY_API
-VFromD<Rebind<uint8_t, decltype(d)>> v)
--> VFromD<decltype(d)> {
+template <class D, HWY_IF_I64_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
 }

-template <
-HWY_API
-VFromD<Rebind<bfloat16_t, decltype(d)>> v)
--> VFromD<decltype(d)> {
+template <class D, HWY_IF_F32_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<hwy::bfloat16_t, D>> v) {
 const RebindToSigned<decltype(d)> di32;
 const Rebind<uint16_t, decltype(d)> du16;
 return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
@@ -2097,28 +2337,24 @@ HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vuint32m8_t v) {
 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
 }

-template <
-HWY_API VFromD<
-
-return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 2>(), v));
+template <class D, HWY_IF_U8_D(D)>
+HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
+return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
 }

-template <
-HWY_API VFromD<
-
-return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 2>(), v));
+template <class D, HWY_IF_U8_D(D)>
+HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
+return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
 }

-template <
-HWY_API VFromD<
-
-return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 1>(), v));
+template <class D, HWY_IF_U16_D(D)>
+HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
+return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
 }

-template <
-HWY_API VFromD<
-
-return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 1>(), v));
+template <class D, HWY_IF_U16_D(D)>
+HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
+return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
 }

 HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
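These hunks add direct DemoteTo overloads from 64-bit lanes to 8- and 16-bit lanes, implemented as two-step demotions through 32-bit. A hedged usage sketch (names are placeholders, not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Saturating narrow of int64 lanes to uint8 lanes (same lane count).
void NarrowToBytes(const int64_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int64_t> d64;
  const hn::Rebind<uint8_t, decltype(d64)> d8;
  hn::StoreU(hn::DemoteTo(d8, hn::LoadU(d64, in)), d8, out);
}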
@@ -2501,16 +2737,14 @@ HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
 }

-template <
-HWY_API VFromD<
-
-return DemoteTo(d, DemoteTo(Simd<int32_t, N, kPow2 + 2>(), v));
+template <class D, HWY_IF_I8_D(D)>
+HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
+return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v));
 }

-template <
-HWY_API VFromD<
-
-return DemoteTo(d, DemoteTo(Simd<int32_t, N, kPow2 + 1>(), v));
+template <class D, HWY_IF_I16_D(D)>
+HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
+return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v));
 }

 #undef HWY_RVV_DEMOTE
@@ -2527,9 +2761,15 @@ HWY_API VFromD<Simd<int16_t, N, kPow2>> DemoteTo(
 }

 #if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C
-HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo,
+HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT)
 #endif
-HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo,
+HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT)
+
+namespace detail {
+HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteToF32WithRoundToOdd,
+fncvt_rod_f_f_w_f, _DEMOTE_VIRT)
+} // namespace detail
+
 #undef HWY_RVV_DEMOTE_F

 // TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
@@ -2617,27 +2857,72 @@ HWY_API vfloat32m4_t DemoteTo(Simd<float, N, 2> d, const vuint64m8_t v) {
 return __riscv_vfncvt_f_xu_w_f32m4(v, Lanes(d));
 }

+// Narrows f32 bits to bf16 using round to even.
 // SEW is for the source so we can use _DEMOTE_VIRT.
-#
-
+#ifdef HWY_RVV_AVOID_VXRM
+#define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, \
+LMULD, LMULH, SHIFT, MLEN, NAME, OP) \
+template <size_t N> \
+HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
+HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
+const auto round = \
+detail::AddS(detail::AndS(ShiftRight<16>(v), 1u), 0x7FFFu); \
+v = Add(v, round); \
+/* The default rounding mode appears to be RNU=0, which adds the LSB. */ \
+/* Prevent further rounding by clearing the bits we want to truncate. */ \
+v = detail::AndS(v, 0xFFFF0000u); \
+return __riscv_v##OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \
+}
+
+#else
+#define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, \
+LMULD, LMULH, SHIFT, MLEN, NAME, OP) \
 template <size_t N> \
 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
 return __riscv_v##OP##CHAR##SEWH##LMULH( \
-v, 16, HWY_RVV_INSERT_VXRM(
+v, 16, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNE, Lanes(d))); \
 }
+#endif // HWY_RVV_AVOID_VXRM
 namespace detail {
-HWY_RVV_FOREACH_U32(
-_DEMOTE_VIRT)
+HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_16_NEAREST_EVEN, DemoteTo16NearestEven,
+nclipu_wx_, _DEMOTE_VIRT)
 }
-#undef
+#undef HWY_RVV_DEMOTE_16_NEAREST_EVEN
+
+#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#else
+#define HWY_NATIVE_DEMOTE_F32_TO_BF16
+#endif

-template <
-HWY_API VFromD<
-
+template <class DBF16, HWY_IF_BF16_D(DBF16)>
+HWY_API VFromD<DBF16> DemoteTo(DBF16 d, VFromD<Rebind<float, DBF16>> v) {
+const DFromV<decltype(v)> df;
+const RebindToUnsigned<decltype(df)> du32;
 const RebindToUnsigned<decltype(d)> du16;
-
-
+// Consider an f32 mantissa with the upper 7 bits set, followed by a 1-bit
+// and at least one other bit set. This will round to 0 and increment the
+// exponent. If the exponent was already 0xFF (NaN), then the result is -inf;
+// there no wraparound because nclipu saturates. Note that in this case, the
+// input cannot have been inf because its mantissa bits are zero. To avoid
+// converting NaN to inf, we canonicalize the NaN to prevent the rounding.
+const decltype(v) canonicalized =
+IfThenElse(Eq(v, v), v, BitCast(df, Set(du32, 0x7F800000)));
+return BitCast(
+d, detail::DemoteTo16NearestEven(du16, BitCast(du32, canonicalized)));
+}
+
+#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+#undef HWY_NATIVE_DEMOTE_F64_TO_F16
+#else
+#define HWY_NATIVE_DEMOTE_F64_TO_F16
+#endif
+
+template <class D, HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+const Rebind<float, decltype(df16)> df32;
+return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
 }

 // ------------------------------ ConvertTo F
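The hunk above adds a native f32-to-bf16 demotion (round to nearest even, with NaN canonicalization) and an f64-to-f16 demotion via round-to-odd. A minimal sketch of how the bf16 path might be called (illustrative only, not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void FloatToBF16(const float* HWY_RESTRICT in,
                 hwy::bfloat16_t* HWY_RESTRICT out) {
  const hn::ScalableTag<float> df;
  const hn::Rebind<hwy::bfloat16_t, decltype(df)> dbf;
  hn::StoreU(hn::DemoteTo(dbf, hn::LoadU(df, in)), dbf, out);
}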
@@ -2664,8 +2949,8 @@ HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> DemoteTo(
 HWY_API HWY_RVV_V(uint, SEW, LMUL) ConvertTo( \
 HWY_RVV_D(uint, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
 return __riscv_vfcvt_rtz_xu_f_v_u##SEW##LMUL(v, Lanes(d)); \
-}
-
+}
+
 HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
 #undef HWY_RVV_CONVERT

@@ -2704,7 +2989,7 @@ HWY_INLINE size_t LanesPerBlock(Simd<T, N, kPow2> d) {

 template <class D, class V>
 HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
-using T = MakeUnsigned<
+using T = MakeUnsigned<TFromV<V>>;
 return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
 }

@@ -2918,9 +3203,10 @@ HWY_RVV_FOREACH_B(HWY_RVV_SET_AT_OR_AFTER_FIRST, _, _)

 // ------------------------------ InsertLane

-template
-
-
+// T template arg because TFromV<V> might not match the hwy::float16_t argument.
+template <class V, typename T, HWY_IF_NOT_T_SIZE_V(V, 1)>
+HWY_API V InsertLane(const V v, size_t i, T t) {
+const Rebind<T, DFromV<V>> d;
 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
 using TU = TFromD<decltype(du)>;
 const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
@@ -2928,9 +3214,9 @@ HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
 }

 // For 8-bit lanes, Iota0 might overflow.
-template <class V, HWY_IF_T_SIZE_V(V, 1)>
-HWY_API V InsertLane(const V v, size_t i,
-const DFromV<V
+template <class V, typename T, HWY_IF_T_SIZE_V(V, 1)>
+HWY_API V InsertLane(const V v, size_t i, T t) {
+const Rebind<T, DFromV<V>> d;
 const auto zero = Zero(d);
 const auto one = Set(d, 1);
 const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
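The two hunks above give InsertLane an explicit value-type template parameter T. An illustrative sketch (assumes the vector has more than three lanes; not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void ReplaceLane3(float* HWY_RESTRICT inout) {
  const hn::ScalableTag<float> d;
  auto v = hn::LoadU(d, inout);
  v = hn::InsertLane(v, 3, 1.5f);  // lane index must be < Lanes(d)
  hn::StoreU(v, d, inout);
}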
@@ -2991,6 +3277,18 @@ HWY_API V DupOdd(const V v) {
 return OddEven(v, down);
 }

+// ------------------------------ InterleaveEven (OddEven)
+template <class D>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+return OddEven(detail::Slide1Up(b), a);
+}
+
+// ------------------------------ InterleaveOdd (OddEven)
+template <class D>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+return OddEven(b, detail::Slide1Down(a));
+}
+
 // ------------------------------ OddEvenBlocks
 template <class V>
 HWY_API V OddEvenBlocks(const V a, const V b) {
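InterleaveEven and InterleaveOdd, added above, combine the even-indexed (respectively odd-indexed) lanes of two vectors. A hypothetical usage sketch (not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void InterleaveEvenOddLanes(const uint32_t* HWY_RESTRICT a,
                            const uint32_t* HWY_RESTRICT b,
                            uint32_t* HWY_RESTRICT even_out,
                            uint32_t* HWY_RESTRICT odd_out) {
  const hn::ScalableTag<uint32_t> d;
  const auto va = hn::LoadU(d, a);
  const auto vb = hn::LoadU(d, b);
  // even_out: lane 2i = a[2i], lane 2i+1 = b[2i]; odd_out uses the odd lanes.
  hn::StoreU(hn::InterleaveEven(d, va, vb), d, even_out);
  hn::StoreU(hn::InterleaveOdd(d, va, vb), d, odd_out);
}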
@@ -3034,9 +3332,6 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
 }

-// TODO(janwas): avoid using this for 8-bit; wrap in detail namespace.
-// For large 8-bit vectors, index overflow will lead to incorrect results.
-// Reverse already uses TableLookupLanes16 to prevent this.
 #define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
 MLEN, NAME, OP) \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
@@ -3045,12 +3340,14 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
 HWY_RVV_AVL(SEW, SHIFT)); \
 }

+// TableLookupLanes is supported for all types, but beware that indices are
+// likely to wrap around for 8-bit lanes. When using TableLookupLanes inside
+// this file, ensure that it is safe or use TableLookupLanes16 instead.
 HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
 #undef HWY_RVV_TABLE

 namespace detail {

-// Used by I8/U8 Reverse
 #define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
 SHIFT, MLEN, NAME, OP) \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
@@ -3122,17 +3419,78 @@ HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
 return TableLookupLanes(v, idx);
 }

-// ------------------------------
+// ------------------------------ ResizeBitCast

-//
-
-#undef HWY_NATIVE_REVERSE2_8
-#else
-#define HWY_NATIVE_REVERSE2_8
-#endif
+// Extends or truncates a vector to match the given d.
+namespace detail {

-
-
+template <class D>
+HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
+return v;
+}
+
+// Sanity check: when calling ChangeLMUL, the caller (ResizeBitCast) already
+// BitCast to the same lane type. Note that V may use the native lane type for
+// f16, so convert D to that before checking.
+#define HWY_RVV_IF_SAME_T_DV(D, V) \
+hwy::EnableIf<IsSame<NativeLaneType<TFromD<D>>, TFromV<V>>()>* = nullptr
+
+// LMUL of VFromD<D> < LMUL of V: need to truncate v
+template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
+HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
+HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
+const DFromV<V> d_from;
+const Half<decltype(d_from)> dh_from;
+static_assert(
+DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
+"The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
+static_assert(
+DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
+"The LMUL of VFromD<D> must be less than or equal to the LMUL of "
+"VFromD<decltype(dh_from)>");
+return ChangeLMUL(d, Trunc(v));
+}
+
+// LMUL of VFromD<D> > LMUL of V: need to extend v
+template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
+HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
+HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
+const DFromV<V> d_from;
+const Twice<decltype(d_from)> dt_from;
+static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
+"The LMUL of VFromD<decltype(dt_from)> must be greater than "
+"the LMUL of V");
+static_assert(
+DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
+"The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
+"VFromD<decltype(dt_from)>");
+return ChangeLMUL(d, Ext(dt_from, v));
+}
+
+#undef HWY_RVV_IF_SAME_T_DV
+
+} // namespace detail
+
+template <class DTo, class VFrom>
+HWY_API VFromD<DTo> ResizeBitCast(DTo /*dto*/, VFrom v) {
+const DFromV<decltype(v)> d_from;
+const Repartition<uint8_t, decltype(d_from)> du8_from;
+const DFromV<VFromD<DTo>> d_to;
+const Repartition<uint8_t, decltype(d_to)> du8_to;
+return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
+}
+
+// ------------------------------ Reverse2 (RotateRight, OddEven)
+
+// Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
+#ifdef HWY_NATIVE_REVERSE2_8
+#undef HWY_NATIVE_REVERSE2_8
+#else
+#define HWY_NATIVE_REVERSE2_8
+#endif
+
+// Shifting and adding requires fewer instructions than blending, but casting to
+// u32 only works for LMUL in [1/2, 8].

 template <class D, HWY_IF_T_SIZE_D(D, 1)>
 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
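ResizeBitCast, defined above via detail::ChangeLMUL, reinterprets a vector under a different descriptor, truncating or extending it (extra lanes unspecified) when the LMUL differs. A minimal sketch, assuming equal register sizes so no truncation occurs (not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void ViewBytesAsU64(const uint8_t* HWY_RESTRICT in, uint64_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint8_t> d8;
  const hn::ScalableTag<uint64_t> d64;
  const auto bytes = hn::LoadU(d8, in);
  hn::StoreU(hn::ResizeBitCast(d64, bytes), d64, out);  // same bits, u64 lanes
}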
@@ -3307,7 +3665,7 @@ template <class V, class M, class D>
 HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
 TFromD<D>* HWY_RESTRICT unaligned) {
 const size_t count = CountTrue(d, mask);
-
+StoreN(Compress(v, mask), d, unaligned, count);
 return count;
 }

@@ -3409,6 +3767,9 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
 return Combine(d, LowerHalf(dh, hi_even), LowerHalf(dh, lo_even));
 }

+// ------------------------------ PromoteEvenTo/PromoteOddTo
+#include "hwy/ops/inside-inl.h"
+
 // ================================================== BLOCKWISE

 // ------------------------------ CombineShiftRightBytes
@@ -3483,50 +3844,6 @@ HWY_API V Shuffle0123(const V v) {

 // ------------------------------ TableLookupBytes

-// Extends or truncates a vector to match the given d.
-namespace detail {
-
-template <class D>
-HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
-return v;
-}
-
-// LMUL of VFromD<D> < LMUL of V: need to truncate v
-template <class D, class V,
-hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
-HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
-HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
-const DFromV<decltype(v)> d_from;
-const Half<decltype(d_from)> dh_from;
-static_assert(
-DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
-"The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
-static_assert(
-DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
-"The LMUL of VFromD<D> must be less than or equal to the LMUL of "
-"VFromD<decltype(dh_from)>");
-return ChangeLMUL(d, Trunc(v));
-}
-
-// LMUL of VFromD<D> > LMUL of V: need to extend v
-template <class D, class V,
-hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
-HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
-HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
-const DFromV<decltype(v)> d_from;
-const Twice<decltype(d_from)> dt_from;
-static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
-"The LMUL of VFromD<decltype(dt_from)> must be greater than "
-"the LMUL of V");
-static_assert(
-DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
-"The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
-"VFromD<decltype(dt_from)>");
-return ChangeLMUL(d, Ext(dt_from, v));
-}
-
-} // namespace detail
-
 template <class VT, class VI>
 HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
 const DFromV<VT> dt; // T=table, I=index.
@@ -3563,7 +3880,8 @@ HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {

 // ------------------------------ TwoTablesLookupLanes

-//
+// WARNING: 8-bit lanes may lead to unexpected results because idx is the same
+// size and may overflow.
 template <class D, HWY_IF_POW2_LE_D(D, 2)>
 HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
 VFromD<RebindToUnsigned<D>> idx) {
@@ -3597,11 +3915,50 @@ HWY_API V TwoTablesLookupLanes(V a, V b,
 }

 // ------------------------------ Broadcast
-
+
+// 8-bit requires 16-bit tables.
+template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
+HWY_IF_POW2_LE_D(D, 2)>
 HWY_API V Broadcast(const V v) {
-const
-const RebindToUnsigned<decltype(d)> du;
+const D d;
 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
+
+const Rebind<uint16_t, decltype(d)> du16;
+VFromD<decltype(du16)> idx =
+detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
+if (kLane != 0) {
+idx = detail::AddS(idx, kLane);
+}
+return detail::TableLookupLanes16(v, idx);
+}
+
+// 8-bit and max LMUL: split into halves.
+template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
+HWY_IF_POW2_GT_D(D, 2)>
+HWY_API V Broadcast(const V v) {
+const D d;
+HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
+
+const Half<decltype(d)> dh;
+using VH = VFromD<decltype(dh)>;
+const Rebind<uint16_t, decltype(dh)> du16;
+VFromD<decltype(du16)> idx =
+detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
+if (kLane != 0) {
+idx = detail::AddS(idx, kLane);
+}
+const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
+const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
+return Combine(d, hi, lo);
+}
+
+template <int kLane, class V, class D = DFromV<V>,
+HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
+HWY_API V Broadcast(const V v) {
+const D d;
+HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
+
+const RebindToUnsigned<decltype(d)> du;
 auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du));
 if (kLane != 0) {
 idx = detail::AddS(idx, kLane);
@@ -3778,20 +4135,194 @@ HWY_API V ShiftRightBytes(const D d, const V v) {
 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
 }

-// ------------------------------
+// ------------------------------ InterleaveWholeLower
+#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
+#undef HWY_NATIVE_INTERLEAVE_WHOLE
+#else
+#define HWY_NATIVE_INTERLEAVE_WHOLE
+#endif
+
+namespace detail {
+// Returns double-length vector with interleaved lanes.
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
+HWY_IF_POW2_GT_D(D, -3)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+const RebindToUnsigned<decltype(d)> du;
+using TW = MakeWide<TFromD<decltype(du)>>;
+const Rebind<TW, Half<decltype(du)>> dw;
+const Half<decltype(du)> duh; // cast inputs to unsigned so we zero-extend
+
+const VFromD<decltype(dw)> aw = PromoteTo(dw, BitCast(duh, a));
+const VFromD<decltype(dw)> bw = PromoteTo(dw, BitCast(duh, b));
+return BitCast(d, Or(aw, BitCast(dw, detail::Slide1Up(BitCast(du, bw)))));
+}
+// 64-bit: cannot PromoteTo, but can Ext.
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+const RebindToUnsigned<decltype(d)> du;
+const auto idx = ShiftRight<1>(detail::Iota0(du));
+return OddEven(TableLookupLanes(detail::Ext(d, b), idx),
+TableLookupLanes(detail::Ext(d, a), idx));
+}
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_GT_D(D, 2)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+const Half<D> dh;
+const Half<decltype(dh)> dq;
+const VFromD<decltype(dh)> i0 =
+InterleaveWhole(dh, LowerHalf(dq, a), LowerHalf(dq, b));
+const VFromD<decltype(dh)> i1 =
+InterleaveWhole(dh, UpperHalf(dq, a), UpperHalf(dq, b));
+return Combine(d, i1, i0);
+}
+
+} // namespace detail

-template <class D,
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+const RebindToUnsigned<decltype(d)> du;
+const detail::AdjustSimdTagToMinVecPow2<RepartitionToWide<decltype(du)>> dw;
+const RepartitionToNarrow<decltype(dw)> du_src;
+
+const VFromD<D> aw =
+ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, a)));
+const VFromD<D> bw =
+ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, b)));
+return Or(aw, detail::Slide1Up(bw));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+const RebindToUnsigned<decltype(d)> du;
+const auto idx = ShiftRight<1>(detail::Iota0(du));
+return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
+}
+
+// ------------------------------ InterleaveWholeUpper
+
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+// Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
+// be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
+// true and and as the results of InterleaveWholeUpper are
+// implementation-defined if Lanes(d) is less than 2.
+const size_t half_N = Lanes(d) / 2;
+return InterleaveWholeLower(d, detail::SlideDown(a, half_N),
+detail::SlideDown(b, half_N));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+// Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
+// be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
+// true and as the results of InterleaveWholeUpper are implementation-defined
+// if Lanes(d) is less than 2.
+const size_t half_N = Lanes(d) / 2;
+const RebindToUnsigned<decltype(d)> du;
+const auto idx = detail::AddS(ShiftRight<1>(detail::Iota0(du)),
+static_cast<uint64_t>(half_N));
+return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
+}
+
+// ------------------------------ InterleaveLower (InterleaveWholeLower)
+
+namespace detail {
+
+// Definitely at least 128 bit: match x86 semantics (independent blocks). Using
+// InterleaveWhole and 64-bit Compress avoids 8-bit overflow.
+template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
+HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
+static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+const Twice<D> dt;
+const RebindToUnsigned<decltype(dt)> dt_u;
+const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
+// Keep only even 128-bit blocks. This is faster than u64 ConcatEven
+// because we only have a single vector.
+constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
+const VFromD<decltype(dt_u)> idx_block =
+ShiftRight<kShift>(detail::Iota0(dt_u));
+const MFromD<decltype(dt_u)> is_even =
+detail::EqS(detail::AndS(idx_block, 1), 0);
+return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_even)));
+}
+template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
+HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
+const Half<D> dh;
+const VFromD<decltype(dh)> i0 =
+InterleaveLowerBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
+const VFromD<decltype(dh)> i1 =
+InterleaveLowerBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
+return Combine(d, i1, i0);
+}
+
+// As above, for the upper half of blocks.
+template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
+HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
+static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+const Twice<D> dt;
+const RebindToUnsigned<decltype(dt)> dt_u;
+const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
+// Keep only odd 128-bit blocks. This is faster than u64 ConcatEven
+// because we only have a single vector.
+constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
+const VFromD<decltype(dt_u)> idx_block =
+ShiftRight<kShift>(detail::Iota0(dt_u));
+const MFromD<decltype(dt_u)> is_odd =
+detail::EqS(detail::AndS(idx_block, 1), 1);
+return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_odd)));
+}
+template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
+HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
+const Half<D> dh;
+const VFromD<decltype(dh)> i0 =
+InterleaveUpperBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
+const VFromD<decltype(dh)> i1 =
+InterleaveUpperBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
+return Combine(d, i1, i0);
+}
+
+// RVV vectors are at least 128 bit when there is no fractional LMUL nor cap.
+// Used by functions with per-block behavior such as InterleaveLower.
+template <typename T, size_t N, int kPow2>
+constexpr bool IsGE128(Simd<T, N, kPow2> /* d */) {
+return N * sizeof(T) >= 16 && kPow2 >= 0;
+}
+
+// Definitely less than 128-bit only if there is a small cap; fractional LMUL
+// might not be enough if vectors are large.
+template <typename T, size_t N, int kPow2>
+constexpr bool IsLT128(Simd<T, N, kPow2> /* d */) {
+return N * sizeof(T) < 16;
+}
+
+} // namespace detail
+
+#define HWY_RVV_IF_GE128_D(D) hwy::EnableIf<detail::IsGE128(D())>* = nullptr
+#define HWY_RVV_IF_LT128_D(D) hwy::EnableIf<detail::IsLT128(D())>* = nullptr
+#define HWY_RVV_IF_CAN128_D(D) \
+hwy::EnableIf<!detail::IsLT128(D()) && !detail::IsGE128(D())>* = nullptr
+
+template <class D, class V, HWY_RVV_IF_GE128_D(D)>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+return detail::InterleaveLowerBlocks(d, a, b);
+}
+
+// Single block: interleave without extra Compress.
+template <class D, class V, HWY_RVV_IF_LT128_D(D)>
 HWY_API V InterleaveLower(D d, const V a, const V b) {
 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-
-
-
-
-
-
-
-
-
+return InterleaveWholeLower(d, a, b);
+}
+
+// Could be either; branch at runtime.
+template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
+return InterleaveWholeLower(d, a, b);
+}
+// Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
+const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
+return ResizeBitCast(d, detail::InterleaveLowerBlocks(
+d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
 }

 template <class V>
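InterleaveWholeLower/Upper, added above, interleave across the whole vector rather than within 128-bit blocks; InterleaveLower then dispatches between whole-vector and per-block behavior depending on the vector size. An illustrative sketch (not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void ZipLowerHalves(const uint8_t* HWY_RESTRICT a, const uint8_t* HWY_RESTRICT b,
                    uint8_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint8_t> d;
  const auto va = hn::LoadU(d, a);
  const auto vb = hn::LoadU(d, b);
  // out = a[0], b[0], a[1], b[1], ... using the lower halves of a and b.
  hn::StoreU(hn::InterleaveWholeLower(d, va, vb), d, out);
}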
@@ -3799,21 +4330,30 @@ HWY_API V InterleaveLower(const V a, const V b) {
 return InterleaveLower(DFromV<V>(), a, b);
 }

-// ------------------------------ InterleaveUpper
+// ------------------------------ InterleaveUpper (Compress)

-template <class D, class V>
-HWY_API V InterleaveUpper(
+template <class D, class V, HWY_RVV_IF_GE128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+return detail::InterleaveUpperBlocks(d, a, b);
+}
+
+// Single block: interleave without extra Compress.
+template <class D, class V, HWY_RVV_IF_LT128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-
-
-
-
-
-
-
-
-
-
+return InterleaveWholeUpper(d, a, b);
+}
+
+// Could be either; branch at runtime.
+template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
+return InterleaveWholeUpper(d, a, b);
+}
+// Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
+const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
+return ResizeBitCast(d, detail::InterleaveUpperBlocks(
+d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
 }

 // ------------------------------ ZipLower
@@ -3840,67 +4380,98 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {

 // ================================================== REDUCE

-//
+// We have ReduceSum, generic_ops-inl.h defines SumOfLanes via Set.
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+
+// scalar = f(vector, zero_m1)
 #define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
 MLEN, NAME, OP) \
-template <
-HWY_API
-NAME(
-
-
-
+template <size_t N> \
+HWY_API HWY_RVV_T(BASE, SEW) \
+NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v, \
+HWY_RVV_V(BASE, SEW, m1) v0) { \
+return GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
+v, v0, Lanes(d))); \
 }

-//
+// detail::RedSum, detail::RedMin, and detail::RedMax is more efficient
+// for N=4 I8/U8 reductions on RVV than the default implementations of the
+// the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in generic_ops-inl.h
+#undef HWY_IF_REDUCE_D
+#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
+
+#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+#undef HWY_NATIVE_REDUCE_SUM_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_SUM_4_UI8
+#endif
+
+#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#endif
+
+// ------------------------------ ReduceSum

 namespace detail {
-HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum,
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum,
+HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL_VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL_VIRT)
 } // namespace detail

-template <class D>
-HWY_API
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
 const auto v0 = Zero(ScalableTag<TFromD<D>>()); // always m1
 return detail::RedSum(d, v, v0);
 }

-
-HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
-return GetLane(SumOfLanes(d, v));
-}
-
-// ------------------------------ MinOfLanes
+// ------------------------------ ReduceMin
 namespace detail {
-HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu,
-HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin,
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin,
+HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL_VIRT)
+HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL_VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL_VIRT)
 } // namespace detail

-template <class D>
-HWY_API
-using T = TFromD<D>;
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMin(D d, const VFromD<D> v) {
 const ScalableTag<T> d1; // always m1
-
-return detail::RedMin(d, v, neutral);
+return detail::RedMin(d, v, Set(d1, HighestValue<T>()));
 }

-// ------------------------------
+// ------------------------------ ReduceMax
 namespace detail {
-HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu,
-HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax,
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax,
+HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL_VIRT)
+HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL_VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL_VIRT)
 } // namespace detail

-template <class D>
-HWY_API
-using T = TFromD<D>;
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMax(D d, const VFromD<D> v) {
 const ScalableTag<T> d1; // always m1
-
-return detail::RedMax(d, v, neutral);
+return detail::RedMax(d, v, Set(d1, LowestValue<T>()));
 }

 #undef HWY_RVV_REDUCE

+// ------------------------------ SumOfLanes
+
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+return Set(d, ReduceSum(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+return Set(d, ReduceMin(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+return Set(d, ReduceMax(d, v));
+}
+
 // ================================================== Ops with dependencies

 // ------------------------------ LoadInterleaved2
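With the hunk above, ReduceSum/ReduceMin/ReduceMax return scalars directly and SumOfLanes/MinOfLanes/MaxOfLanes are defined in terms of them. A short, hedged sketch of a horizontal sum using ReduceSum (illustrative only, names are placeholders):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

float SumArray(const float* HWY_RESTRICT p, size_t n) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  auto acc = hn::Zero(d);
  size_t i = 0;
  for (; i + N <= n; i += N) acc = hn::Add(acc, hn::LoadU(d, p + i));
  float sum = hn::ReduceSum(d, acc);
  for (; i < n; ++i) sum += p[i];  // scalar tail
  return sum;
}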
@@ -4229,15 +4800,87 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,

 #endif // HWY_HAVE_TUPLE

-// ------------------------------ ResizeBitCast
+// ------------------------------ Dup128VecFromValues (ResizeBitCast)

-template <class D,
-HWY_API VFromD<D>
-
-
-
-
-
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
+return Set(d, t0);
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
+const auto even_lanes = Set(d, t0);
+#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
+if (__builtin_constant_p(BitCastScalar<uint64_t>(t0) ==
+BitCastScalar<uint64_t>(t1)) &&
+(BitCastScalar<uint64_t>(t0) == BitCastScalar<uint64_t>(t1))) {
+return even_lanes;
+}
+#endif
+
+const auto odd_lanes = Set(d, t1);
+return OddEven(odd_lanes, even_lanes);
+}
+
+namespace detail {
+
+#pragma pack(push, 1)
+
+template <class T>
+struct alignas(8) Vec64ValsWrapper {
+static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
+static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
+T vals[8 / sizeof(T)];
+};
+
+#pragma pack(pop)
+
+} // namespace detail
+
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+TFromD<D> t11, TFromD<D> t12,
+TFromD<D> t13, TFromD<D> t14,
+TFromD<D> t15) {
+const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
+return ResizeBitCast(
+d, Dup128VecFromValues(
+du64,
+BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
+{t0, t1, t2, t3, t4, t5, t6, t7}}),
+BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
+{t8, t9, t10, t11, t12, t13, t14, t15}})));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+TFromD<D> t5, TFromD<D> t6,
+TFromD<D> t7) {
+const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
+return ResizeBitCast(
+d, Dup128VecFromValues(
+du64,
+BitCastScalar<uint64_t>(
+detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}}),
+BitCastScalar<uint64_t>(
+detail::Vec64ValsWrapper<TFromD<D>>{{t4, t5, t6, t7}})));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+TFromD<D> t2, TFromD<D> t3) {
+const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
+return ResizeBitCast(
+d,
+Dup128VecFromValues(du64,
+BitCastScalar<uint64_t>(
+detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}}),
+BitCastScalar<uint64_t>(
+detail::Vec64ValsWrapper<TFromD<D>>{{t2, t3}})));
 }

 // ------------------------------ PopulationCount (ShiftRight)
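Dup128VecFromValues, added above, broadcasts one explicit 128-bit pattern of lane values across the entire vector. An illustrative sketch with placeholder values (not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void StoreRepeatedPattern(uint16_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint16_t> d;
  // Every 128-bit block of the vector holds the eight values 1..8.
  const auto v = hn::Dup128VecFromValues(d, 1, 2, 3, 4, 5, 6, 7, 8);
  hn::StoreU(v, d, out);
}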
@@ -4366,34 +5009,276 @@ HWY_API MFromD<D> FirstN(const D d, const size_t n) {
|
|
|
4366
5009
|
return Eq(detail::SlideUp(one, zero, n), one);
|
|
4367
5010
|
}
|
|
4368
5011
|
|
|
4369
|
-
// ------------------------------
|
|
5012
|
+
// ------------------------------ LowerHalfOfMask/UpperHalfOfMask
|
|
4370
5013
|
|
|
4371
|
-
|
|
4372
|
-
|
|
4373
|
-
|
|
5014
|
+
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
|
|
5015
|
+
|
|
5016
|
+
// Target-specific implementations of LowerHalfOfMask, UpperHalfOfMask,
|
|
5017
|
+
// CombineMasks, OrderedDemote2MasksTo, and Dup128MaskFromMaskBits are possible
|
|
5018
|
+
// on RVV if the __riscv_vreinterpret_v_b*_u8m1 and
|
|
5019
|
+
// __riscv_vreinterpret_v_u8m1_b* intrinsics are available.
|
|
5020
|
+
|
|
5021
|
+
// The __riscv_vreinterpret_v_b*_u8m1 and __riscv_vreinterpret_v_u8m1_b*
|
|
5022
|
+
// intrinsics available with Clang 17 and later and GCC 14 and later.
|
|
5023
|
+
|
|
5024
|
+
namespace detail {
|
|
5025
|
+
|
|
5026
|
+
HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool1_t m) {
|
|
5027
|
+
return __riscv_vreinterpret_v_b1_u8m1(m);
|
|
4374
5028
|
}
|
|
4375
5029
|
|
|
4376
|
-
|
|
4377
|
-
|
|
4378
|
-
|
|
4379
|
-
|
|
4380
|
-
|
|
4381
|
-
|
|
5030
|
+
HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool2_t m) {
|
|
5031
|
+
return __riscv_vreinterpret_v_b2_u8m1(m);
|
|
5032
|
+
}
|
|
5033
|
+
|
|
5034
|
+
HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool4_t m) {
|
|
5035
|
+
return __riscv_vreinterpret_v_b4_u8m1(m);
|
|
5036
|
+
}
|
|
5037
|
+
|
|
5038
|
+
HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool8_t m) {
|
|
5039
|
+
return __riscv_vreinterpret_v_b8_u8m1(m);
|
|
5040
|
+
}
|
|
5041
|
+
|
|
5042
|
+
HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool16_t m) {
|
|
5043
|
+
return __riscv_vreinterpret_v_b16_u8m1(m);
|
|
5044
|
+
}
|
|
5045
|
+
|
|
5046
|
+
HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool32_t m) {
|
|
5047
|
+
return __riscv_vreinterpret_v_b32_u8m1(m);
|
|
5048
|
+
}
|
|
5049
|
+
|
|
5050
|
+
HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool64_t m) {
|
|
5051
|
+
return __riscv_vreinterpret_v_b64_u8m1(m);
|
|
5052
|
+
}
|
|
5053
|
+
|
|
5054
|
+
template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool1_t>()>* = nullptr>
|
|
5055
|
+
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
|
|
5056
|
+
return __riscv_vreinterpret_v_u8m1_b1(v);
|
|
5057
|
+
}
|
|
5058
|
+
|
|
5059
|
+
template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool2_t>()>* = nullptr>
|
|
5060
|
+
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
|
|
5061
|
+
return __riscv_vreinterpret_v_u8m1_b2(v);
|
|
5062
|
+
}
|
|
5063
|
+
|
|
5064
|
+
template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool4_t>()>* = nullptr>
|
|
5065
|
+
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
|
|
5066
|
+
return __riscv_vreinterpret_v_u8m1_b4(v);
|
|
5067
|
+
}
|
|
5068
|
+
|
|
5069
|
+
template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool8_t>()>* = nullptr>
|
|
5070
|
+
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
|
|
5071
|
+
return __riscv_vreinterpret_v_u8m1_b8(v);
|
|
5072
|
+
}
|
|
5073
|
+
|
|
5074
|
+
template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool16_t>()>* = nullptr>
|
|
5075
|
+
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
|
|
5076
|
+
return __riscv_vreinterpret_v_u8m1_b16(v);
|
|
5077
|
+
}
|
|
5078
|
+
|
|
5079
|
+
template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool32_t>()>* = nullptr>
|
|
5080
|
+
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
|
|
5081
|
+
return __riscv_vreinterpret_v_u8m1_b32(v);
|
|
5082
|
+
}
|
|
5083
|
+
|
|
5084
|
+
template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool64_t>()>* = nullptr>
|
|
5085
|
+
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
|
|
5086
|
+
return __riscv_vreinterpret_v_u8m1_b64(v);
|
|
5087
|
+
}
|
|
5088
|
+
|
|
5089
|
+
} // namespace detail
|
|
5090
|
+
|
|
5091
|
+
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
|
|
5092
|
+
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
|
|
5093
|
+
#else
|
|
5094
|
+
#define HWY_NATIVE_LOWER_HALF_OF_MASK
|
|
5095
|
+
#endif
|
|
5096
|
+
|
|
5097
|
+
template <class D>
|
|
5098
|
+
HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
|
|
5099
|
+
return detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(m));
|
|
5100
|
+
}
|
|
5101
|
+
|
|
5102
|
+
#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
|
|
5103
|
+
#undef HWY_NATIVE_UPPER_HALF_OF_MASK
|
|
5104
|
+
#else
|
|
5105
|
+
#define HWY_NATIVE_UPPER_HALF_OF_MASK
|
|
5106
|
+
#endif
|
|
5107
|
+
|
|
5108
|
+
template <class D>
|
|
5109
|
+
HWY_API MFromD<D> UpperHalfOfMask(D d, MFromD<Twice<D>> m) {
|
|
5110
|
+
const size_t N = Lanes(d);
|
|
5111
|
+
|
|
5112
|
+
vuint8m1_t mask_bits = detail::MaskToU8MaskBitsVec(m);
|
|
5113
|
+
mask_bits = ShiftRightSame(mask_bits, static_cast<int>(N & 7));
|
|
5114
|
+
if (HWY_MAX_LANES_D(D) >= 8) {
|
|
5115
|
+
mask_bits = SlideDownLanes(ScalableTag<uint8_t>(), mask_bits, N / 8);
|
|
4382
5116
|
}
|
|
4383
5117
|
|
|
4384
|
-
|
|
5118
|
+
return detail::U8MaskBitsVecToMask(d, mask_bits);
|
|
5119
|
+
}
|
|
4385
5120
|
|
|
4386
|
-
|
|
5121
|
+
// ------------------------------ CombineMasks
|
|
4387
5122
|
|
|
4388
|
-
|
|
4389
|
-
|
|
4390
|
-
|
|
4391
|
-
|
|
4392
|
-
|
|
4393
|
-
|
|
5123
|
+
#ifdef HWY_NATIVE_COMBINE_MASKS
|
|
5124
|
+
#undef HWY_NATIVE_COMBINE_MASKS
|
|
5125
|
+
#else
|
|
5126
|
+
#define HWY_NATIVE_COMBINE_MASKS
|
|
5127
|
+
#endif
|
|
5128
|
+
|
|
5129
|
+
template <class D>
|
|
5130
|
+
HWY_API MFromD<D> CombineMasks(D d, MFromD<Half<D>> hi, MFromD<Half<D>> lo) {
|
|
5131
|
+
const Half<decltype(d)> dh;
|
|
5132
|
+
const size_t half_N = Lanes(dh);
|
|
5133
|
+
|
|
5134
|
+
const auto ext_lo_mask =
|
|
5135
|
+
And(detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(lo)),
|
|
5136
|
+
FirstN(d, half_N));
|
|
5137
|
+
vuint8m1_t hi_mask_bits = detail::MaskToU8MaskBitsVec(hi);
|
|
5138
|
+
hi_mask_bits = ShiftLeftSame(hi_mask_bits, static_cast<int>(half_N & 7));
|
|
5139
|
+
if (HWY_MAX_LANES_D(D) >= 8) {
|
|
5140
|
+
hi_mask_bits =
|
|
5141
|
+
SlideUpLanes(ScalableTag<uint8_t>(), hi_mask_bits, half_N / 8);
|
|
5142
|
+
}
|
|
5143
|
+
|
|
5144
|
+
return Or(ext_lo_mask, detail::U8MaskBitsVecToMask(d, hi_mask_bits));
|
|
4394
5145
|
}
|
|
4395
5146
|
|
|
4396
|
-
|
|
5147
|
+
// ------------------------------ OrderedDemote2MasksTo
|
|
5148
|
+
|
|
5149
|
+
#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
|
|
5150
|
+
#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
|
|
5151
|
+
#else
|
|
5152
|
+
#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
|
|
5153
|
+
#endif
|
|
5154
|
+
|
|
5155
|
+
template <class DTo, class DFrom,
|
|
5156
|
+
HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
|
|
5157
|
+
class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
|
|
5158
|
+
hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
|
|
5159
|
+
HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
|
|
5160
|
+
MFromD<DFrom> a, MFromD<DFrom> b) {
|
|
5161
|
+
return CombineMasks(d_to, b, a);
|
|
5162
|
+
}
|
|
5163
|
+
|
|
5164
|
+
#endif // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
|
|
5165
|
+
+// ------------------------------ Dup128MaskFromMaskBits
+
+namespace detail {
+// Even though this is only used after checking if (kN < X), this helper
+// function prevents "shift count exceeded" errors.
+template <size_t kN, HWY_IF_LANES_LE(kN, 31)>
+constexpr unsigned MaxMaskBits() {
+  return (1u << kN) - 1;
+}
+template <size_t kN, HWY_IF_LANES_GT(kN, 31)>
+constexpr unsigned MaxMaskBits() {
+  return ~0u;
+}
+}  // namespace detail
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  return detail::U8MaskBitsVecToMask(
+      d, Set(ScalableTag<uint8_t>(), static_cast<uint8_t>(mask_bits)));
+#else
+  const RebindToUnsigned<decltype(d)> du8;
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
+      du64;
+
+  const auto bytes = ResizeBitCast(
+      du8, detail::AndS(
+               ResizeBitCast(du64, Set(du8, static_cast<uint8_t>(mask_bits))),
+               uint64_t{0x8040201008040201u}));
+  return detail::NeS(bytes, uint8_t{0});
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  const ScalableTag<uint16_t> du16;
+  // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes.
+  return detail::U8MaskBitsVecToMask(
+      d, BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<decltype(d)> du8;
+  const Repartition<uint16_t, decltype(du8)> du16;
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
+      du64;
+
+  // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
+  // and then bitcast the replicated mask_bits to a u8 vector
+  const auto bytes = BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
+  // Replicate bytes 8x such that each byte contains the bit that governs it.
+  const auto rep8 = TableLookupLanes(bytes, ShiftRight<3>(detail::Iota0(du8)));
+
+  const auto masked_out_rep8 = ResizeBitCast(
+      du8,
+      detail::AndS(ResizeBitCast(du64, rep8), uint64_t{0x8040201008040201u}));
+  return detail::NeS(masked_out_rep8, uint8_t{0});
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes.
+  return detail::U8MaskBitsVecToMask(d,
+                                     Set(du8, static_cast<uint8_t>(mask_bits)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits =
+      Shl(Set(du, uint16_t{1}), detail::AndS(detail::Iota0(du), 7));
+  return TestBit(Set(du, static_cast<uint16_t>(mask_bits)), bits);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  return detail::U8MaskBitsVecToMask(
+      d, Set(du8, static_cast<uint8_t>(mask_bits * 0x11)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2, 4, 8);
+  return TestBit(Set(du, static_cast<uint32_t>(mask_bits)), bits);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  return detail::U8MaskBitsVecToMask(
+      d, Set(du8, static_cast<uint8_t>(mask_bits * 0x55)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2);
+  return TestBit(Set(du, static_cast<uint64_t>(mask_bits)), bits);
+#endif
+}
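The 0x11 and 0x55 multipliers in the 32-bit and 64-bit Dup128MaskFromMaskBits overloads above replicate the per-128-bit-block mask bits (4 and 2 bits, respectively) across every position of the mask byte. A small scalar check of that arithmetic (illustrative only):

    #include <cassert>

    int main() {
      // 32-bit lanes: 4 mask bits per 128-bit block; *0x11 repeats the nibble.
      const unsigned bits4 = 0b0101u;
      assert(((bits4 * 0x11u) & 0xFFu) == 0b01010101u);
      // 64-bit lanes: 2 mask bits per block; *0x55 repeats the pair four times.
      const unsigned bits2 = 0b10u;
      assert(((bits2 * 0x55u) & 0xFFu) == 0b10101010u);
      return 0;
    }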

 // ------------------------------ Abs (Max, Neg)

@@ -4452,23 +5337,99 @@ HWY_API V Trunc(const V v) {
 }

 // ------------------------------ Ceil
+#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) || \
+    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
+namespace detail {
+#define HWY_RVV_CEIL_INT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
+                         SHIFT, MLEN, NAME, OP)                             \
+  HWY_API HWY_RVV_V(int, SEW, LMUL) CeilInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_vfcvt_x_f_v_i##SEW##LMUL##_rm(v, __RISCV_FRM_RUP,        \
+                                                 HWY_RVV_AVL(SEW, SHIFT));  \
+  }
+HWY_RVV_FOREACH_F(HWY_RVV_CEIL_INT, _, _, _ALL)
+#undef HWY_RVV_CEIL_INT
+
+}  // namespace detail
+
 template <class V>
 HWY_API V Ceil(const V v) {
-
-
-
-
+  const DFromV<V> df;
+
+  const auto integer = detail::CeilInt(v);
+  const auto int_f = ConvertTo(df, integer);
+
+  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
 }

+#else  // GCC 13 or earlier or Clang 16 or earlier
+
+template <class V>
+HWY_API V Ceil(const V v) {
+  const DFromV<decltype(v)> df;
+  const RebindToSigned<decltype(df)> di;
+
+  using T = TFromD<decltype(df)>;
+
+  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto int_f = ConvertTo(df, integer);
+
+  // Truncating a positive non-integer ends up smaller; if so, add 1.
+  const auto pos1 =
+      IfThenElseZero(Lt(int_f, v), Set(df, ConvertScalarTo<T>(1.0)));
+
+  return IfThenElse(detail::UseInt(v), Add(int_f, pos1), v);
+}
+
+#endif  // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
+        // (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
+
 // ------------------------------ Floor
+#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) || \
+    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
+namespace detail {
+#define HWY_RVV_FLOOR_INT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,    \
+                          SHIFT, MLEN, NAME, OP)                              \
+  HWY_API HWY_RVV_V(int, SEW, LMUL) FloorInt(HWY_RVV_V(BASE, SEW, LMUL) v) {  \
+    return __riscv_vfcvt_x_f_v_i##SEW##LMUL##_rm(v, __RISCV_FRM_RDN,          \
+                                                 HWY_RVV_AVL(SEW, SHIFT));    \
+  }
+HWY_RVV_FOREACH_F(HWY_RVV_FLOOR_INT, _, _, _ALL)
+#undef HWY_RVV_FLOOR_INT
+
+}  // namespace detail
+
 template <class V>
 HWY_API V Floor(const V v) {
-
-
-
-
+  const DFromV<V> df;
+
+  const auto integer = detail::FloorInt(v);
+  const auto int_f = ConvertTo(df, integer);
+
+  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
 }

+#else  // GCC 13 or earlier or Clang 16 or earlier
+
+template <class V>
+HWY_API V Floor(const V v) {
+  const DFromV<decltype(v)> df;
+  const RebindToSigned<decltype(df)> di;
+
+  using T = TFromD<decltype(df)>;
+
+  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto int_f = ConvertTo(df, integer);
+
+  // Truncating a negative non-integer ends up larger; if so, subtract 1.
+  const auto neg1 =
+      IfThenElseZero(Gt(int_f, v), Set(df, ConvertScalarTo<T>(-1.0)));
+
+  return IfThenElse(detail::UseInt(v), Add(int_f, neg1), v);
+}
+
+#endif  // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
+        // (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
+
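The pre-GCC-14/pre-Clang-17 fallbacks above emulate Ceil and Floor by converting toward zero and then correcting by one when truncation moved the wrong way (the vector code additionally leaves non-representable and NaN lanes untouched via detail::UseInt). A scalar sketch of just that correction, assuming a finite input that fits in int64_t:

    #include <cassert>
    #include <cstdint>

    double CeilViaTrunc(double v) {
      const double int_f = static_cast<double>(static_cast<int64_t>(v));  // toward 0
      return (int_f < v) ? int_f + 1.0 : int_f;  // truncation went down -> add 1
    }

    double FloorViaTrunc(double v) {
      const double int_f = static_cast<double>(static_cast<int64_t>(v));  // toward 0
      return (int_f > v) ? int_f - 1.0 : int_f;  // truncation went up -> subtract 1
    }

    int main() {
      assert(CeilViaTrunc(2.25) == 3.0 && CeilViaTrunc(-2.25) == -2.0);
      assert(FloorViaTrunc(2.25) == 2.0 && FloorViaTrunc(-2.25) == -3.0);
      return 0;
    }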
 // ------------------------------ Floating-point classification (Ne)

 // vfclass does not help because it would require 3 instructions (to AND and
@@ -4479,6 +5440,14 @@ HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
   return Ne(v, v);
 }

+// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
+// We use a fused Set/comparison for IsFinite.
+#ifdef HWY_NATIVE_ISINF
+#undef HWY_NATIVE_ISINF
+#else
+#define HWY_NATIVE_ISINF
+#endif
+
 template <class V, class D = DFromV<V>>
 HWY_API MFromD<D> IsInf(const V v) {
   const D d;
@@ -4507,22 +5476,76 @@ HWY_API MFromD<D> IsFinite(const V v) {

 // ------------------------------ Iota (ConvertTo)

-template <class D, HWY_IF_UNSIGNED_D(D)>
-HWY_API VFromD<D> Iota(const D d,
-  return detail::AddS(detail::Iota0(d), first);
+template <class D, typename T2, HWY_IF_UNSIGNED_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
+  return detail::AddS(detail::Iota0(d), static_cast<TFromD<D>>(first));
 }

-template <class D, HWY_IF_SIGNED_D(D)>
-HWY_API VFromD<D> Iota(const D d,
+template <class D, typename T2, HWY_IF_SIGNED_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
   const RebindToUnsigned<D> du;
-  return detail::AddS(BitCast(d, detail::Iota0(du)),
+  return detail::AddS(BitCast(d, detail::Iota0(du)),
+                      static_cast<TFromD<D>>(first));
 }

-template <class D, HWY_IF_FLOAT_D(D)>
-HWY_API VFromD<D> Iota(const D d,
+template <class D, typename T2, HWY_IF_FLOAT_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
   const RebindToUnsigned<D> du;
   const RebindToSigned<D> di;
-  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))),
+  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))),
+                      ConvertScalarTo<TFromD<D>>(first));
+}
+
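The reworked Iota overloads above now accept the start value as an arbitrary arithmetic type T2 and convert it to the lane type, so lane i holds the converted start plus i. A scalar model of that behavior (illustrative only):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Lane i = static_cast<T>(first) + i, matching the widened Iota signature.
    template <typename T, typename T2>
    std::vector<T> IotaModel(size_t lanes, T2 first) {
      std::vector<T> v(lanes);
      for (size_t i = 0; i < lanes; ++i) {
        v[i] = static_cast<T>(static_cast<T>(first) + static_cast<T>(i));
      }
      return v;
    }

    int main() {
      const std::vector<uint16_t> v = IotaModel<uint16_t>(4, 3.0);  // double start
      assert(v[0] == 3 && v[3] == 6);
      return 0;
    }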
+// ------------------------------ BitShuffle (PromoteTo, Rol, SumsOf8)
+
+// Native implementation required to avoid 8-bit wraparound on long vectors.
+#ifdef HWY_NATIVE_BITSHUFFLE
+#undef HWY_NATIVE_BITSHUFFLE
+#else
+#define HWY_NATIVE_BITSHUFFLE
+#endif
+
+// Cannot handle LMUL=8 because we promote indices.
+template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
+          HWY_IF_UI64_D(D64), HWY_IF_POW2_LE_D(D64, 2)>
+HWY_API V64 BitShuffle(V64 values, VI idx) {
+  const RebindToUnsigned<D64> du64;
+  const Repartition<uint8_t, D64> du8;
+  const Rebind<uint16_t, decltype(du8)> du16;
+  using VU8 = VFromD<decltype(du8)>;
+  using VU16 = VFromD<decltype(du16)>;
+  // For each 16-bit (to avoid wraparound for long vectors) index of an output
+  // byte: offset of the u64 lane to which it belongs.
+  const VU16 byte_offsets =
+      detail::AndS(detail::Iota0(du16), static_cast<uint16_t>(~7u));
+  // idx is for a bit; shifting makes that bytes. Promote so we can add
+  // byte_offsets, then we have the u8 lane index within the whole vector.
+  const VU16 idx16 =
+      Add(byte_offsets, PromoteTo(du16, ShiftRight<3>(BitCast(du8, idx))));
+  const VU8 bytes = detail::TableLookupLanes16(BitCast(du8, values), idx16);
+
+  // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
+  // and left by iota & 7 to put it in the correct output bit. To correctly
+  // handle shift counts from -7 to 7, we rotate (unfortunately not natively
+  // supported on RVV).
+  const VU8 rotate_left_bits = Sub(detail::Iota0(du8), BitCast(du8, idx));
+  const VU8 extracted_bits_mask =
+      BitCast(du8, Set(du64, static_cast<uint64_t>(0x8040201008040201u)));
+  const VU8 extracted_bits =
+      And(Rol(bytes, rotate_left_bits), extracted_bits_mask);
+  // Combine bit-sliced (one bit per byte) into one 64-bit sum.
+  return BitCast(D64(), SumsOf8(extracted_bits));
+}
+
+template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
+          HWY_IF_UI64_D(D64), HWY_IF_POW2_GT_D(D64, 2)>
+HWY_API V64 BitShuffle(V64 values, VI idx) {
+  const Half<D64> dh;
+  const Half<DFromV<VI>> dih;
+  using V64H = VFromD<decltype(dh)>;
+  const V64H r0 = BitShuffle(LowerHalf(dh, values), LowerHalf(dih, idx));
+  const V64H r1 = BitShuffle(UpperHalf(dh, values), UpperHalf(dih, idx));
+  return Combine(D64(), r1, r0);
 }
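Per the comments above, BitShuffle gathers, within each 64-bit lane, the eight bits selected by the per-byte indices and packs them into the low byte of the result (one bit per index, via the bit-slice mask and SumsOf8). A scalar model of that per-lane behavior, which is an interpretation of the code rather than quoted Highway documentation:

    #include <cassert>
    #include <cstdint>

    // Bit i of the result is bit idx[i] (0..63) of the 64-bit input lane.
    uint64_t BitShuffleModel(uint64_t lane, const uint8_t idx[8]) {
      uint64_t r = 0;
      for (int i = 0; i < 8; ++i) r |= ((lane >> idx[i]) & 1u) << i;
      return r;
    }

    int main() {
      const uint8_t idx[8] = {0, 8, 16, 24, 32, 40, 48, 56};  // each byte's LSB
      assert(BitShuffleModel(0x0101010100000000u, idx) == 0xF0u);
      return 0;
    }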

 // ------------------------------ MulEven/Odd (Mul, OddEven)

@@ -4531,7 +5554,7 @@ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
           class D = DFromV<V>, class DW = RepartitionToWide<D>>
 HWY_API VFromD<DW> MulEven(const V a, const V b) {
   const auto lo = Mul(a, b);
-  const auto hi =
+  const auto hi = MulHigh(a, b);
   return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
 }

@@ -4539,7 +5562,7 @@ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
           class D = DFromV<V>, class DW = RepartitionToWide<D>>
 HWY_API VFromD<DW> MulOdd(const V a, const V b) {
   const auto lo = Mul(a, b);
-  const auto hi =
+  const auto hi = MulHigh(a, b);
   return BitCast(DW(), OddEven(hi, detail::Slide1Down(lo)));
 }

@@ -4547,28 +5570,34 @@ HWY_API VFromD<DW> MulOdd(const V a, const V b) {
 template <class V, HWY_IF_T_SIZE_V(V, 8)>
 HWY_INLINE V MulEven(const V a, const V b) {
   const auto lo = Mul(a, b);
-  const auto hi =
+  const auto hi = MulHigh(a, b);
   return OddEven(detail::Slide1Up(hi), lo);
 }

 template <class V, HWY_IF_T_SIZE_V(V, 8)>
 HWY_INLINE V MulOdd(const V a, const V b) {
   const auto lo = Mul(a, b);
-  const auto hi =
+  const auto hi = MulHigh(a, b);
   return OddEven(hi, detail::Slide1Down(lo));
 }
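With the previously elided `hi` expression now written as MulHigh, MulEven/MulOdd assemble the full double-width product of each selected lane pair from its low and high halves. A scalar check of that composition for one unsigned 32-bit pair (illustrative only):

    #include <cassert>
    #include <cstdint>

    // Double-width product from the low half (Mul) and high half (MulHigh).
    uint64_t FullProduct32(uint32_t a, uint32_t b) {
      const uint32_t lo = a * b;  // wraps, like Mul
      const uint32_t hi = static_cast<uint32_t>((uint64_t{a} * b) >> 32);  // MulHigh
      return (uint64_t{hi} << 32) | lo;
    }

    int main() {
      assert(FullProduct32(0xFFFFFFFFu, 2u) == 0x1FFFFFFFEull);
      return 0;
    }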

 // ------------------------------ ReorderDemote2To (OddEven, Combine)

-template <
-HWY_API VFromD<
-
-    VFromD<RepartitionToWide<decltype(dbf16)>> a,
-    VFromD<RepartitionToWide<decltype(dbf16)>> b) {
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<RepartitionToWide<D>> a,
+                                   VFromD<RepartitionToWide<D>> b) {
   const RebindToUnsigned<decltype(dbf16)> du16;
+  const Half<decltype(du16)> du16_half;
   const RebindToUnsigned<DFromV<decltype(a)>> du32;
-  const VFromD<decltype(du32)>
-
+  const VFromD<decltype(du32)> a_in_even = PromoteTo(
+      du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, a)));
+  const VFromD<decltype(du32)> b_in_even = PromoteTo(
+      du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, b)));
+  // Equivalent to InterleaveEven, but because the upper 16 bits are zero, we
+  // can OR instead of OddEven.
+  const VFromD<decltype(du16)> a_in_odd =
+      detail::Slide1Up(BitCast(du16, a_in_even));
+  return BitCast(dbf16, Or(a_in_odd, BitCast(du16, b_in_even)));
 }

 // If LMUL is not the max, Combine first to avoid another DemoteTo.
@@ -4618,8 +5647,8 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
 }

 // If LMUL is not the max, Combine first to avoid another DemoteTo.
-template <class DN,
-          HWY_IF_F32_D(DFromV<V>),
+template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_LE_D(DN, 2),
+          class V, HWY_IF_F32_D(DFromV<V>),
           class V2 = VFromD<Repartition<TFromV<V>, DN>>,
           hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
@@ -4629,8 +5658,8 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
 }

 // Max LMUL: must DemoteTo first, then Combine.
-template <class DN,
-          HWY_IF_F32_D(DFromV<V>),
+template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_GT_D(DN, 2),
+          class V, HWY_IF_F32_D(DFromV<V>),
           class V2 = VFromD<Repartition<TFromV<V>, DN>>,
           hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
@@ -4653,68 +5682,26 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {

 // ------------------------------ WidenMulPairwiseAdd

-template <class
-          class
-HWY_API VFromD<
-  const
-
-  const
-
-
-
-
-
-
-  return MulAdd(
-      Mul(
-}
-
-template <class D, HWY_IF_I32_D(D), class VI16>
-HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
-  using VI32 = VFromD<decltype(d32)>;
-  // Manual sign extension requires two shifts for even lanes.
-  const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
-  const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
-  const VI32 ao = ShiftRight<16>(BitCast(d32, a));
-  const VI32 bo = ShiftRight<16>(BitCast(d32, b));
-  return Add(Mul(ae, be), Mul(ao, bo));
-}
-
-template <class D, HWY_IF_U32_D(D), class VI16>
-HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VI16 a, VI16 b) {
-  using VU32 = VFromD<decltype(du32)>;
-  // Manual sign extension requires two shifts for even lanes.
-  const VU32 ae = detail::AndS(BitCast(du32, a), uint32_t{0x0000FFFFu});
-  const VU32 be = detail::AndS(BitCast(du32, b), uint32_t{0x0000FFFFu});
-  const VU32 ao = ShiftRight<16>(BitCast(du32, a));
-  const VU32 bo = ShiftRight<16>(BitCast(du32, b));
-  return Add(Mul(ae, be), Mul(ao, bo));
+template <class DF, HWY_IF_F32_D(DF),
+          class VBF = VFromD<Repartition<hwy::bfloat16_t, DF>>>
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+  const VFromD<DF> ae = PromoteEvenTo(df, a);
+  const VFromD<DF> be = PromoteEvenTo(df, b);
+  const VFromD<DF> ao = PromoteOddTo(df, a);
+  const VFromD<DF> bo = PromoteOddTo(df, b);
+  return MulAdd(ae, be, Mul(ao, bo));
+}
+
+template <class D, HWY_IF_UI32_D(D), class V16 = VFromD<RepartitionToNarrow<D>>>
+HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
+  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
+                Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
 }
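The rewritten WidenMulPairwiseAdd overloads replace the manual shift/mask sign extension with PromoteEvenTo/PromoteOddTo, but the per-lane arithmetic is unchanged: each widened output lane is the sum of the products of the corresponding even and odd narrow lanes. A scalar model for one i16 pair (illustrative only):

    #include <cassert>
    #include <cstdint>

    // Per output lane: widen the even/odd input pair and sum their products.
    int32_t WidenMulPairwiseAddModel(int16_t a0, int16_t b0, int16_t a1, int16_t b1) {
      return static_cast<int32_t>(a0) * b0 + static_cast<int32_t>(a1) * b1;
    }

    int main() {
      // (-3)*4 + 1000*1000 = 999988
      assert(WidenMulPairwiseAddModel(-3, 4, 1000, 1000) == 999988);
      return 0;
    }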

 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

 namespace detail {

-// Non-overloaded wrapper function so we can define DF32 in template args.
-template <size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
-          class VF32 = VFromD<DF32>,
-          class DBF16 = Repartition<bfloat16_t, Simd<float, N, kPow2>>>
-HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
-                                            VFromD<DBF16> a, VFromD<DBF16> b,
-                                            const VF32 sum0, VF32& sum1) {
-  const RebindToUnsigned<DF32> du32;
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 odd = Set(du32, 0xFFFF0000u);  // bfloat16 is the upper half of f32
-  // Using shift/and instead of Zip leads to the odd/even order that
-  // RearrangeToOddPlusEven prefers.
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
-  const VU32 ao = And(BitCast(du32, a), odd);
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
-  const VU32 bo = And(BitCast(du32, b), odd);
-  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
-  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
-}
-
 #define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP)                           \
   template <size_t N>                                                       \
@@ -4790,21 +5777,15 @@ HWY_API VFromD<D32> ReorderWidenMulAccumulateU16(D32 d32, VFromD<D16> a,

 }  // namespace detail

-template <
-HWY_API VW ReorderWidenMulAccumulate(
-
-  return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
-}
-
-template <size_t N, int kPow2, class VN, class VW>
-HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
-                                     const VW sum0, VW& sum1) {
+template <class D, HWY_IF_I32_D(D), class VN, class VW>
+HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
+                                     VW& sum1) {
   return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
 }

-template <
-HWY_API VW ReorderWidenMulAccumulate(
-
+template <class D, HWY_IF_U32_D(D), class VN, class VW>
+HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
+                                     VW& sum1) {
   return detail::ReorderWidenMulAccumulateU16(d32, a, b, sum0, sum1);
 }

@@ -4872,6 +5853,40 @@ HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
 }

 // ------------------------------ Lt128
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+template <class D>
+HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
+  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+  // The subsequent computations are performed using e8mf8 (8-bit elements with
+  // a fractional LMUL of 1/8) for the following reasons:
+  // 1. It is correct for the possible input vector types e64m<1,2,4,8>. This is
+  //    because the resulting mask can occupy at most 1/8 of a full vector when
+  //    using e64m8.
+  // 2. It can be more efficient than using a full vector or a vector group.
+  //
+  // The algorithm computes the result as follows:
+  // 1. Compute cH | (=H & cL) in the high bits, where cH and cL represent the
+  //    comparison results for the high and low 64-bit elements, respectively.
+  // 2. Shift the result right by 1 to duplicate the comparison results for the
+  //    low bits.
+  // 3. Obtain the final result by performing a bitwise OR on the high and low
+  //    bits.
+  auto du8mf8 = ScalableTag<uint8_t, -3>{};
+  const vuint8mf8_t ltHL0 =
+      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Lt(a, b)));
+  const vuint8mf8_t eqHL0 =
+      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
+  const vuint8mf8_t ltLx0 = Add(ltHL0, ltHL0);
+  const vuint8mf8_t resultHx = detail::AndS(OrAnd(ltHL0, ltLx0, eqHL0), 0xaa);
+  const vuint8mf8_t resultxL = ShiftRight<1>(resultHx);
+  const vuint8mf8_t result = Or(resultHx, resultxL);
+  auto du8m1 = ScalableTag<uint8_t>{};
+  return detail::U8MaskBitsVecToMask(d, detail::ChangeLMUL(du8m1, result));
+}
+
+#else
+
 template <class D>
 HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
   static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -4897,6 +5912,8 @@ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
   return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
 }

+#endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
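Both Lt128 paths above evaluate, for each pair of u64 lanes forming one 128-bit value, the predicate cH | (=H & cL) and broadcast it to both lanes of the pair. A scalar model of the predicate itself (illustrative only):

    #include <cassert>
    #include <cstdint>

    // Lt128 model: both result lanes of a (lo, hi) pair are all-true iff
    // a < b as unsigned 128-bit integers.
    bool Lt128Model(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi, uint64_t b_lo) {
      return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);  // cH | (=H & cL)
    }

    int main() {
      assert(Lt128Model(/*a_hi=*/1, /*a_lo=*/0, /*b_hi=*/1, /*b_lo=*/5));
      assert(!Lt128Model(/*a_hi=*/2, /*a_lo=*/0, /*b_hi=*/1, /*b_lo=*/~0ull));
      return 0;
    }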
 // ------------------------------ Lt128Upper
 template <class D>
 HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
@@ -4994,7 +6011,6 @@ HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
 }

 // ================================================== END MACROS
-namespace detail { // for code folding
 #undef HWY_RVV_AVL
 #undef HWY_RVV_D
 #undef HWY_RVV_FOREACH
@@ -5055,15 +6071,19 @@ namespace detail { // for code folding
 #undef HWY_RVV_FOREACH_UI32
 #undef HWY_RVV_FOREACH_UI3264
 #undef HWY_RVV_FOREACH_UI64
+#undef HWY_RVV_IF_EMULATED_D
+#undef HWY_RVV_IF_CAN128_D
+#undef HWY_RVV_IF_GE128_D
+#undef HWY_RVV_IF_LT128_D
 #undef HWY_RVV_INSERT_VXRM
 #undef HWY_RVV_M
 #undef HWY_RVV_RETM_ARGM
+#undef HWY_RVV_RETV_ARGMVV
 #undef HWY_RVV_RETV_ARGV
 #undef HWY_RVV_RETV_ARGVS
 #undef HWY_RVV_RETV_ARGVV
 #undef HWY_RVV_T
 #undef HWY_RVV_V
-}  // namespace detail
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 } // namespace HWY_NAMESPACE
 } // namespace hwy