@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
@@ -339,8 +339,11 @@ namespace detail { // for code folding
339
339
  // Full support for f16 in all ops
340
340
  #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
341
341
  HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
342
+ // Only BF16 is emulated.
343
+ #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
342
344
  #else
343
345
  #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
346
+ #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
344
347
  #endif
345
348
  #define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
346
349
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
@@ -389,15 +392,11 @@ namespace detail { // for code folding
389
392
  // For all combinations of SEW:
390
393
  #define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
391
394
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
392
- HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
393
- HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
394
- HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
395
+ HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
395
396
 
396
397
  #define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
397
398
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
398
- HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
399
- HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
400
- HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
399
+ HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
401
400
 
402
401
  #define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
403
402
  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
@@ -409,8 +408,7 @@ namespace detail { // for code folding
409
408
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
410
409
 
411
410
  #define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
412
- HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
413
- HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
411
+ HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
414
412
  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
415
413
 
416
414
  // Assemble types for use in x-macros
@@ -438,22 +436,134 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
438
436
  // ------------------------------ Lanes
439
437
 
440
438
  // WARNING: we want to query VLMAX/sizeof(T), but this may actually change VL!
441
- #define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
442
- MLEN, NAME, OP) \
443
- template <size_t N> \
444
- HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
445
- constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW)); \
446
- constexpr size_t kCap = MaxLanes(d); \
447
- /* If no cap, avoid generating a constant by using VLMAX. */ \
448
- return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL() \
449
- : __riscv_vsetvl_e##SEW##LMUL(kCap); \
450
- } \
451
- template <size_t N> \
452
- HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
453
- /* If no cap, avoid the HWY_MIN. */ \
454
- return detail::IsFull(d) \
455
- ? __riscv_vsetvl_e##SEW##LMUL(cap) \
456
- : __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, MaxLanes(d))); \
439
+
440
+ #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
441
+ // HWY_RVV_CAPPED_LANES_SPECIAL_CASES provides some additional optimizations
442
+ // to CappedLanes in non-debug builds
443
+ #define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
444
+ if (__builtin_constant_p(cap >= kMaxLanes) && (cap >= kMaxLanes)) { \
445
+ /* If cap is known to be greater than or equal to MaxLanes(d), */ \
446
+ /* HWY_MIN(cap, Lanes(d)) will be equal to Lanes(d) */ \
447
+ return Lanes(d); \
448
+ } \
449
+ \
450
+ if ((__builtin_constant_p((cap & (cap - 1)) == 0) && \
451
+ ((cap & (cap - 1)) == 0)) || \
452
+ (__builtin_constant_p(cap <= HWY_MAX(kMinLanesPerFullVec, 4)) && \
453
+ (cap <= HWY_MAX(kMinLanesPerFullVec, 4)))) { \
454
+ /* If cap is known to be a power of 2, then */ \
455
+ /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
456
+ /* result as HWY_MIN(cap, Lanes(d)) as kMaxLanes is a power of 2 and */ \
457
+ /* as (cap > VLMAX && cap < 2 * VLMAX) can only be true if cap is not a */ \
458
+ /* power of 2 since VLMAX is always a power of 2 */ \
459
+ \
460
+ /* If cap is known to be less than or equal to 4, then */ \
461
+ /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
462
+ /* result as HWY_MIN(cap, Lanes(d)) as HWY_MIN(cap, kMaxLanes) <= 4 is */ \
463
+ /* true if cap <= 4 and as vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
464
+ /* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) */ \
465
+ /* if HWY_MIN(cap, kMaxLanes) <= 4 is true */ \
466
+ \
467
+ /* If cap is known to be less than or equal to kMinLanesPerFullVec, */ \
468
+ /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
469
+ /* same result as HWY_MIN(cap, Lanes(d)) as */ \
470
+ /* HWY_MIN(cap, kMaxLanes) <= kMinLanesPerFullVec is true if */ \
471
+ /* cap <= kMinLanesPerFullVec is true */ \
472
+ \
473
+ /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then either */ \
474
+ /* cap <= 4 or cap <= kMinLanesPerFullVec must be true */ \
475
+ \
476
+ /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is known to be true, */ \
477
+ /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
478
+ /* same result as HWY_MIN(cap, Lanes(d)) */ \
479
+ \
480
+ /* If no cap, avoid the HWY_MIN. */ \
481
+ return detail::IsFull(d) \
482
+ ? __riscv_vsetvl_e##SEW##LMUL(cap) \
483
+ : __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
484
+ }
485
+ #else
486
+ #define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL)
487
+ #endif
488
+
489
+ #define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
490
+ MLEN, NAME, OP) \
491
+ template <size_t N> \
492
+ HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
493
+ constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW)); \
494
+ constexpr size_t kCap = MaxLanes(d); \
495
+ /* If no cap, avoid generating a constant by using VLMAX. */ \
496
+ return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL() \
497
+ : __riscv_vsetvl_e##SEW##LMUL(kCap); \
498
+ } \
499
+ template <size_t N> \
500
+ HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
501
+ /* NOTE: Section 6.3 of the RVV specification, which can be found at */ \
502
+ /* https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc, */ \
503
+ /* allows vsetvl to return a result less than Lanes(d) but greater than */ \
504
+ /* or equal to ((cap + 1) / 2) if */ \
505
+ /* (Lanes(d) > 2 && cap > HWY_MAX(Lanes(d), 4) && cap < (2 * Lanes(d))) */ \
506
+ /* is true */ \
507
+ \
508
+ /* VLMAX is the number of lanes in a vector of type */ \
509
+ /* VFromD<decltype(d)>, which is returned by */ \
510
+ /* Lanes(DFromV<VFromD<decltype(d)>>()) */ \
511
+ \
512
+ /* VLMAX is guaranteed to be a power of 2 under Section 2 of the RVV */ \
513
+ /* specification */ \
514
+ \
515
+ /* The VLMAX of a vector of type VFromD<decltype(d)> is at least 2 as */ \
516
+ /* the HWY_RVV target requires support for the RVV Zvl128b extension, */ \
517
+ /* which guarantees that vectors with LMUL=1 are at least 16 bytes */ \
518
+ \
519
+ /* If VLMAX == 2 is true, then vsetvl(cap) is equal to HWY_MIN(cap, 2) */ \
520
+ /* as cap == 3 is the only value such that */ \
521
+ /* (cap > VLMAX && cap < 2 * VLMAX) if VLMAX == 2 and as */ \
522
+ /* ((3 + 1) / 2) is equal to 2 */ \
523
+ \
524
+ /* If cap <= 4 is true, then vsetvl(cap) must be equal to */ \
525
+ /* HWY_MIN(cap, VLMAX) as cap <= VLMAX is true if VLMAX >= 4 is true */ \
526
+ /* and as vsetvl(cap) is guaranteed to be equal to HWY_MIN(cap, VLMAX) */ \
527
+ /* if VLMAX == 2 */ \
528
+ \
529
+ /* We want CappedLanes(d, cap) to return Lanes(d) if cap > Lanes(d) as */ \
530
+ /* LoadN(d, p, cap) expects to load exactly HWY_MIN(cap, Lanes(d)) */ \
531
+ /* lanes and StoreN(v, d, p, cap) expects to store exactly */ \
532
+ /* HWY_MIN(cap, Lanes(d)) lanes, even in the case where vsetvl returns */ \
533
+ /* a result that is less than HWY_MIN(cap, Lanes(d)) */ \
534
+ \
535
+ /* kMinLanesPerFullVec is the minimum value of VLMAX for a vector of */ \
536
+ /* type VFromD<decltype(d)> */ \
537
+ constexpr size_t kMinLanesPerFullVec = \
538
+ detail::ScaleByPower(16 / (SEW / 8), SHIFT); \
539
+ /* kMaxLanes is the maximum number of lanes returned by Lanes(d) */ \
540
+ constexpr size_t kMaxLanes = MaxLanes(d); \
541
+ \
542
+ HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
543
+ \
544
+ if (kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4)) { \
545
+ /* If kMaxLanes <= kMinLanesPerFullVec is true, then */ \
546
+ /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return */ \
547
+ /* HWY_MIN(cap, Lanes(d)) as */ \
548
+ /* HWY_MIN(cap, kMaxLanes) <= kMaxLanes <= VLMAX is true if */ \
549
+ /* kMaxLanes <= kMinLanesPerFullVec is true */ \
550
+ \
551
+ /* If kMaxLanes <= 4 is true, then vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
552
+ /* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) as */ \
553
+ /* HWY_MIN(cap, kMaxLanes) <= 4 is true if kMaxLanes <= 4 is true */ \
554
+ \
555
+ /* If kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then */ \
556
+ /* either kMaxLanes <= 4 or kMaxLanes <= kMinLanesPerFullVec must be */ \
557
+ /* true */ \
558
+ \
559
+ return __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
560
+ } else { \
561
+ /* If kMaxLanes > HWY_MAX(kMinLanesPerFullVec, 4) is true, need to */ \
562
+ /* obtain the actual number of lanes using Lanes(d) and clamp cap to */ \
563
+ /* the result of Lanes(d) */ \
564
+ const size_t actual = Lanes(d); \
565
+ return HWY_MIN(actual, cap); \
566
+ } \
457
567
  }
458
568
 
459
569
  #define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
@@ -480,18 +590,18 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
480
590
 
481
591
  HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
482
592
  HWY_RVV_FOREACH(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
483
- // If not already defined via HWY_RVV_FOREACH, define the overloads because
484
- // they do not require any new instruction.
485
- #if !HWY_HAVE_FLOAT16
486
- HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
487
- HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
488
- #endif
489
593
  #undef HWY_RVV_LANES
490
594
  #undef HWY_RVV_LANES_VIRT
595
+ #undef HWY_RVV_CAPPED_LANES_SPECIAL_CASES
596
+
597
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
598
+ HWY_API size_t Lanes(D /* tag*/) {
599
+ return Lanes(RebindToUnsigned<D>());
600
+ }
491
601
 
492
- template <size_t N, int kPow2>
493
- HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag*/) {
494
- return Lanes(Simd<int16_t, N, kPow2>());
602
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
603
+ HWY_API size_t CappedLanes(D /* tag*/, size_t cap) {
604
+ return CappedLanes(RebindToUnsigned<D>(), cap);
495
605
  }
496
606
 
497
607
  // ------------------------------ Common x-macros
@@ -525,10 +635,20 @@ HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag*/) {
525
635
  HWY_RVV_AVL(SEW, SHIFT)); \
526
636
  }
527
637
 
638
+ // vector = f(vector, mask, vector, vector), e.g. MaskedAddOr
639
+ #define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
640
+ SHIFT, MLEN, NAME, OP) \
641
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
642
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
643
+ HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
644
+ return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(m, no, a, b, \
645
+ HWY_RVV_AVL(SEW, SHIFT)); \
646
+ }
647
+
528
648
  // mask = f(mask)
529
- #define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
530
- HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
531
- return __riscv_vm##OP##_m_b##MLEN(m, ~0ull); \
649
+ #define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
650
+ HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
651
+ return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT)); \
532
652
  }
533
653
 
534
654
  // ================================================== INIT
@@ -549,21 +669,17 @@ HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
549
669
 
550
670
  // Treat bfloat16_t as int16_t (using the previously defined Set overloads);
551
671
  // required for Zero and VFromD.
552
- template <size_t N, int kPow2>
553
- decltype(Set(Simd<int16_t, N, kPow2>(), 0)) Set(Simd<bfloat16_t, N, kPow2> d,
554
- bfloat16_t arg) {
555
- return Set(RebindToSigned<decltype(d)>(), arg.bits);
672
+ template <class D, HWY_IF_BF16_D(D)>
673
+ decltype(Set(RebindToSigned<D>(), 0)) Set(D d, hwy::bfloat16_t arg) {
674
+ return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
556
675
  }
557
676
  #if !HWY_HAVE_FLOAT16 // Otherwise already defined above.
558
677
  // WARNING: returns a different type than emulated bfloat16_t so that we can
559
678
  // implement PromoteTo overloads for both bfloat16_t and float16_t, and also
560
- // provide a Neg(float16_t) overload that coexists with Neg(int16_t).
561
- template <size_t N, int kPow2>
562
- decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(Simd<float16_t, N, kPow2> d,
563
- float16_t arg) {
564
- uint16_t bits;
565
- CopySameSize(&arg, &bits);
566
- return Set(RebindToUnsigned<decltype(d)>(), bits);
679
+ // provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
680
+ template <class D, HWY_IF_F16_D(D)>
681
+ decltype(Set(RebindToUnsigned<D>(), 0)) Set(D d, hwy::float16_t arg) {
682
+ return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
567
683
  }
568
684
  #endif
569
685
 
@@ -642,16 +758,7 @@ HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
642
758
  HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
643
759
  #undef HWY_RVV_EXT_VIRT
644
760
 
645
- #if !HWY_HAVE_FLOAT16
646
- template <class D, HWY_IF_F16_D(D)>
647
- VFromD<D> Ext(D d, VFromD<Half<D>> v) {
648
- const RebindToUnsigned<decltype(d)> du;
649
- const Half<decltype(du)> duh;
650
- return BitCast(d, Ext(du, BitCast(duh, v)));
651
- }
652
- #endif
653
-
654
- template <class D, HWY_IF_BF16_D(D)>
761
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
655
762
  VFromD<D> Ext(D d, VFromD<Half<D>> v) {
656
763
  const RebindToUnsigned<decltype(d)> du;
657
764
  const Half<decltype(du)> duh;
@@ -767,10 +874,10 @@ HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
767
874
  HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
768
875
  HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
769
876
  #else
770
- template <size_t N, int kPow2>
771
- HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
772
- Simd<float16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
773
- return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
877
+ template <class D, HWY_IF_F16_D(D)>
878
+ HWY_INLINE VFromD<RebindToUnsigned<D>> BitCastFromByte(
879
+ D /* d */, VFromD<Repartition<uint8_t, D>> v) {
880
+ return BitCastFromByte(RebindToUnsigned<D>(), v);
774
881
  }
775
882
  #endif
776
883
 
@@ -781,10 +888,10 @@ HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
781
888
  #undef HWY_RVV_CAST_VIRT_U
782
889
  #undef HWY_RVV_CAST_VIRT_IF
783
890
 
784
- template <size_t N, int kPow2>
785
- HWY_INLINE VFromD<Simd<int16_t, N, kPow2>> BitCastFromByte(
786
- Simd<bfloat16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
787
- return BitCastFromByte(Simd<int16_t, N, kPow2>(), v);
891
+ template <class D, HWY_IF_BF16_D(D)>
892
+ HWY_INLINE VFromD<RebindToSigned<D>> BitCastFromByte(
893
+ D d, VFromD<Repartition<uint8_t, D>> v) {
894
+ return BitCastFromByte(RebindToSigned<decltype(d)>(), v);
788
895
  }
789
896
 
790
897
  } // namespace detail
@@ -942,6 +1049,35 @@ HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, SubS, sub_vx, _ALL)
942
1049
  HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
943
1050
  HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)
944
1051
 
1052
+ // ------------------------------ Neg (ReverseSubS, Xor)
1053
+
1054
+ template <class V, HWY_IF_SIGNED_V(V)>
1055
+ HWY_API V Neg(const V v) {
1056
+ return detail::ReverseSubS(v, 0);
1057
+ }
1058
+
1059
+ // vector = f(vector), but argument is repeated
1060
+ #define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1061
+ SHIFT, MLEN, NAME, OP) \
1062
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1063
+ return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \
1064
+ HWY_RVV_AVL(SEW, SHIFT)); \
1065
+ }
1066
+
1067
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
1068
+
1069
+ #if !HWY_HAVE_FLOAT16
1070
+
1071
+ template <class V, HWY_IF_U16_D(DFromV<V>)> // hwy::float16_t
1072
+ HWY_API V Neg(V v) {
1073
+ const DFromV<decltype(v)> d;
1074
+ const RebindToUnsigned<decltype(d)> du;
1075
+ using TU = TFromD<decltype(du)>;
1076
+ return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
1077
+ }
1078
+
1079
+ #endif // !HWY_HAVE_FLOAT16
1080
+
945
1081
  // ------------------------------ SaturatedAdd
946
1082
 
947
1083
  #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
@@ -1048,7 +1184,7 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)
1048
1184
  #undef HWY_RVV_SHIFT
1049
1185
 
1050
1186
  // ------------------------------ SumsOf8 (ShiftRight, Add)
1051
- template <class VU8>
1187
+ template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
1052
1188
  HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
1053
1189
  const DFromV<VU8> du8;
1054
1190
  const RepartitionToWide<decltype(du8)> du16;
@@ -1071,13 +1207,42 @@ HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
1071
1207
  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
1072
1208
  }
1073
1209
 
1210
+ template <class VI8, HWY_IF_I8_D(DFromV<VI8>)>
1211
+ HWY_API VFromD<Repartition<int64_t, DFromV<VI8>>> SumsOf8(const VI8 v) {
1212
+ const DFromV<VI8> di8;
1213
+ const RepartitionToWide<decltype(di8)> di16;
1214
+ const RepartitionToWide<decltype(di16)> di32;
1215
+ const RepartitionToWide<decltype(di32)> di64;
1216
+ const RebindToUnsigned<decltype(di32)> du32;
1217
+ const RebindToUnsigned<decltype(di64)> du64;
1218
+ using VI16 = VFromD<decltype(di16)>;
1219
+
1220
+ const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
1221
+ const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
1222
+ const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
1223
+
1224
+ const VI16 sDC_zz_98_zz_54_zz_10_zz =
1225
+ BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
1226
+ const VI16 sFC_xx_B8_xx_74_xx_30_xx =
1227
+ Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
1228
+ const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
1229
+ BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
1230
+ const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
1231
+ Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
1232
+ return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
1233
+ }
1234
+
1074
1235
  // ------------------------------ RotateRight
1075
- template <int kBits, class V>
1236
+ template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
1076
1237
  HWY_API V RotateRight(const V v) {
1238
+ const DFromV<decltype(v)> d;
1239
+ const RebindToUnsigned<decltype(d)> du;
1240
+
1077
1241
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
1078
1242
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
1079
1243
  if (kBits == 0) return v;
1080
- return Or(ShiftRight<kBits>(v),
1244
+
1245
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
1081
1246
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
1082
1247
  }
1083
1248
 
@@ -1158,15 +1323,8 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
1158
1323
 
1159
1324
  // ------------------------------ MulHigh
1160
1325
 
1161
- // Only for internal use (Highway only promises MulHigh for 16-bit inputs).
1162
- // Used by MulEven; vwmul does not work for m8.
1163
- namespace detail {
1164
1326
  HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
1165
1327
  HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
1166
- } // namespace detail
1167
-
1168
- HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
1169
- HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
1170
1328
 
1171
1329
  // ------------------------------ MulFixedPoint15
1172
1330
 
@@ -1184,8 +1342,57 @@ HWY_RVV_FOREACH_I16(HWY_RVV_MUL15, MulFixedPoint15, smul, _ALL)
1184
1342
  #undef HWY_RVV_MUL15
1185
1343
 
1186
1344
  // ------------------------------ Div
1345
+ #ifdef HWY_NATIVE_INT_DIV
1346
+ #undef HWY_NATIVE_INT_DIV
1347
+ #else
1348
+ #define HWY_NATIVE_INT_DIV
1349
+ #endif
1350
+
1351
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Div, divu, _ALL)
1352
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Div, div, _ALL)
1187
1353
  HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)
1188
1354
 
1355
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Mod, remu, _ALL)
1356
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Mod, rem, _ALL)
1357
+
1358
+ // ------------------------------ MaskedAddOr etc.
1359
+
1360
+ #ifdef HWY_NATIVE_MASKED_ARITH
1361
+ #undef HWY_NATIVE_MASKED_ARITH
1362
+ #else
1363
+ #define HWY_NATIVE_MASKED_ARITH
1364
+ #endif
1365
+
1366
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMinOr, minu, _ALL)
1367
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMinOr, min, _ALL)
1368
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMinOr, fmin, _ALL)
1369
+
1370
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, maxu, _ALL)
1371
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, max, _ALL)
1372
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, fmax, _ALL)
1373
+
1374
+ HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedAddOr, add, _ALL)
1375
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedAddOr, fadd, _ALL)
1376
+
1377
+ HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedSubOr, sub, _ALL)
1378
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedSubOr, fsub, _ALL)
1379
+
1380
+ HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedMulOr, mul, _ALL)
1381
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMulOr, fmul, _ALL)
1382
+
1383
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedDivOr, divu, _ALL)
1384
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedDivOr, div, _ALL)
1385
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedDivOr, fdiv, _ALL)
1386
+
1387
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedModOr, remu, _ALL)
1388
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedModOr, rem, _ALL)
1389
+
1390
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, saddu, _ALL)
1391
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, sadd, _ALL)
1392
+
1393
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssubu, _ALL)
1394
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssub, _ALL)
1395
+
1189
1396
  // ------------------------------ ApproximateReciprocal
1190
1397
  #ifdef HWY_NATIVE_F64_APPROX_RECIP
1191
1398
  #undef HWY_NATIVE_F64_APPROX_RECIP
@@ -1247,26 +1454,6 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
1247
1454
  // vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
1248
1455
  // of all bits; SEW=8 / LMUL=4 = half of all bits.
1249
1456
 
1250
- // SFINAE for mapping Simd<> to MLEN (up to 64).
1251
- #define HWY_RVV_IF_MLEN_D(D, MLEN) \
1252
- hwy::EnableIf<MLenFromD(D()) == MLEN>* = nullptr
1253
-
1254
- // Specialized for RVV instead of the generic test_util-inl.h implementation
1255
- // because more efficient, and helps implement MFromD.
1256
-
1257
- #define HWY_RVV_MASK_FALSE(SEW, SHIFT, MLEN, NAME, OP) \
1258
- template <class D, HWY_RVV_IF_MLEN_D(D, MLEN)> \
1259
- HWY_API HWY_RVV_M(MLEN) NAME(D d) { \
1260
- return __riscv_vm##OP##_m_b##MLEN(Lanes(d)); \
1261
- }
1262
-
1263
- HWY_RVV_FOREACH_B(HWY_RVV_MASK_FALSE, MaskFalse, clr)
1264
- #undef HWY_RVV_MASK_FALSE
1265
- #undef HWY_RVV_IF_MLEN_D
1266
-
1267
- template <class D>
1268
- using MFromD = decltype(MaskFalse(D()));
1269
-
1270
1457
  // mask = f(vector, vector)
1271
1458
  #define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1272
1459
  SHIFT, MLEN, NAME, OP) \
@@ -1405,11 +1592,49 @@ HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
1405
1592
  #undef HWY_RVV_IF_THEN_ZERO_ELSE
1406
1593
 
1407
1594
  // ------------------------------ MaskFromVec
1595
+
1596
+ template <class D>
1597
+ using MFromD = decltype(Eq(Zero(D()), Zero(D())));
1598
+
1408
1599
  template <class V>
1409
1600
  HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
1410
1601
  return detail::NeS(v, 0);
1411
1602
  }
1412
1603
 
1604
+ // ------------------------------ IsNegative (MFromD)
1605
+ #ifdef HWY_NATIVE_IS_NEGATIVE
1606
+ #undef HWY_NATIVE_IS_NEGATIVE
1607
+ #else
1608
+ #define HWY_NATIVE_IS_NEGATIVE
1609
+ #endif
1610
+
1611
+ // Generic for all vector lengths
1612
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
1613
+ HWY_API MFromD<DFromV<V>> IsNegative(V v) {
1614
+ const DFromV<decltype(v)> d;
1615
+ const RebindToSigned<decltype(d)> di;
1616
+ using TI = TFromD<decltype(di)>;
1617
+
1618
+ return detail::LtS(BitCast(di, v), static_cast<TI>(0));
1619
+ }
1620
+
1621
+ // ------------------------------ MaskFalse
1622
+
1623
+ // For mask ops including vmclr, elements past VL are tail-agnostic and cannot
1624
+ // be relied upon, so define a variant of the generic_ops-inl implementation of
1625
+ // MaskFalse that ensures all bits are zero as required by mask_test.
1626
+ #ifdef HWY_NATIVE_MASK_FALSE
1627
+ #undef HWY_NATIVE_MASK_FALSE
1628
+ #else
1629
+ #define HWY_NATIVE_MASK_FALSE
1630
+ #endif
1631
+
1632
+ template <class D>
1633
+ HWY_API MFromD<D> MaskFalse(D d) {
1634
+ const DFromV<VFromD<decltype(d)>> d_full;
1635
+ return MaskFromVec(Zero(d_full));
1636
+ }
1637
+
1413
1638
  // ------------------------------ RebindMask
1414
1639
  template <class D, typename MFrom>
1415
1640
  HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
@@ -1427,10 +1652,12 @@ HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
1427
1652
  template <size_t N> \
1428
1653
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1429
1654
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) { \
1430
- const RebindToSigned<decltype(d)> di; \
1655
+ /* MaskFalse requires we set all lanes for capped d and virtual LMUL. */ \
1656
+ const DFromV<VFromD<decltype(d)>> d_full; \
1657
+ const RebindToSigned<decltype(d_full)> di; \
1431
1658
  using TI = TFromD<decltype(di)>; \
1432
- return BitCast( \
1433
- d, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, Lanes(d))); \
1659
+ return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, \
1660
+ Lanes(d_full))); \
1434
1661
  }
1435
1662
 
1436
1663
  HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, VecFromMask, merge_vxm, _ALL_VIRT)
@@ -1448,14 +1675,8 @@ HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
1448
1675
  return IfThenElse(MaskFromVec(mask), yes, no);
1449
1676
  }
1450
1677
 
1451
- // ------------------------------ ZeroIfNegative
1452
- template <class V>
1453
- HWY_API V ZeroIfNegative(const V v) {
1454
- return IfThenZeroElse(detail::LtS(v, 0), v);
1455
- }
1456
-
1457
1678
  // ------------------------------ BroadcastSignBit
1458
- template <class V>
1679
+ template <class V, HWY_IF_SIGNED_V(V)>
1459
1680
  HWY_API V BroadcastSignBit(const V v) {
1460
1681
  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
1461
1682
  }
@@ -1464,11 +1685,7 @@ HWY_API V BroadcastSignBit(const V v) {
1464
1685
  template <class V>
1465
1686
  HWY_API V IfNegativeThenElse(V v, V yes, V no) {
1466
1687
  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
1467
- const DFromV<V> d;
1468
- const RebindToSigned<decltype(d)> di;
1469
-
1470
- MFromD<decltype(d)> m = detail::LtS(BitCast(di, v), 0);
1471
- return IfThenElse(m, yes, no);
1688
+ return IfThenElse(IsNegative(v), yes, no);
1472
1689
  }
1473
1690
 
1474
1691
  // ------------------------------ FindFirstTrue
@@ -1518,6 +1735,38 @@ HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
1518
1735
  HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
1519
1736
  #undef HWY_RVV_COUNT_TRUE
1520
1737
 
1738
+ // ------------------------------ PromoteMaskTo
1739
+
1740
+ #ifdef HWY_NATIVE_PROMOTE_MASK_TO
1741
+ #undef HWY_NATIVE_PROMOTE_MASK_TO
1742
+ #else
1743
+ #define HWY_NATIVE_PROMOTE_MASK_TO
1744
+ #endif
1745
+
1746
+ template <class DTo, class DFrom,
1747
+ HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
1748
+ hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
1749
+ HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
1750
+ MFromD<DFrom> m) {
1751
+ return m;
1752
+ }
1753
+
1754
+ // ------------------------------ DemoteMaskTo
1755
+
1756
+ #ifdef HWY_NATIVE_DEMOTE_MASK_TO
1757
+ #undef HWY_NATIVE_DEMOTE_MASK_TO
1758
+ #else
1759
+ #define HWY_NATIVE_DEMOTE_MASK_TO
1760
+ #endif
1761
+
1762
+ template <class DTo, class DFrom,
1763
+ HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
1764
+ hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
1765
+ HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
1766
+ MFromD<DFrom> m) {
1767
+ return m;
1768
+ }
1769
+
1521
1770
  // ================================================== MEMORY
1522
1771
 
1523
1772
  // ------------------------------ Load
@@ -1528,47 +1777,18 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
1528
1777
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1529
1778
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1530
1779
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1531
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1532
1780
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
1533
- reinterpret_cast<const T*>(p), Lanes(d)); \
1781
+ detail::NativeLanePointer(p), Lanes(d)); \
1534
1782
  }
1535
1783
  HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
1536
1784
  #undef HWY_RVV_LOAD
1537
1785
 
1538
- // There is no native BF16, treat as uint16_t.
1539
- template <size_t N, int kPow2>
1540
- HWY_API VFromD<Simd<int16_t, N, kPow2>> Load(Simd<bfloat16_t, N, kPow2> d,
1541
- const bfloat16_t* HWY_RESTRICT p) {
1542
- return Load(RebindToSigned<decltype(d)>(),
1543
- reinterpret_cast<const int16_t * HWY_RESTRICT>(p));
1544
- }
1545
-
1546
- template <size_t N, int kPow2>
1547
- HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,
1548
- Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
1549
- Store(v, RebindToSigned<decltype(d)>(),
1550
- reinterpret_cast<int16_t * HWY_RESTRICT>(p));
1551
- }
1552
-
1553
- #if !HWY_HAVE_FLOAT16 // Otherwise already defined above.
1554
-
1555
- // NOTE: different type for float16_t than bfloat16_t, see Set().
1556
- template <size_t N, int kPow2>
1557
- HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(Simd<float16_t, N, kPow2> d,
1558
- const float16_t* HWY_RESTRICT p) {
1559
- return Load(RebindToUnsigned<decltype(d)>(),
1560
- reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
1561
- }
1562
-
1563
- template <size_t N, int kPow2>
1564
- HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
1565
- Simd<float16_t, N, kPow2> d, float16_t* HWY_RESTRICT p) {
1566
- Store(v, RebindToUnsigned<decltype(d)>(),
1567
- reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
1786
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
1787
+ HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
1788
+ const RebindToUnsigned<decltype(d)> du;
1789
+ return BitCast(d, Load(du, detail::U16LanePointer(p)));
1568
1790
  }
1569
1791
 
1570
- #endif // !HWY_HAVE_FLOAT16
1571
-
1572
1792
  // ------------------------------ LoadU
1573
1793
  template <class D>
1574
1794
  HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
@@ -1584,23 +1804,37 @@ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1584
1804
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1585
1805
  NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1586
1806
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1587
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1588
1807
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
1589
- m, Zero(d), reinterpret_cast<const T*>(p), Lanes(d)); \
1808
+ m, Zero(d), detail::NativeLanePointer(p), Lanes(d)); \
1590
1809
  } \
1591
1810
  template <size_t N> \
1592
1811
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1593
1812
  NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
1594
1813
  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1595
1814
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1596
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1597
1815
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
1598
- m, v, reinterpret_cast<const T*>(p), Lanes(d)); \
1816
+ m, v, detail::NativeLanePointer(p), Lanes(d)); \
1599
1817
  }
1600
1818
 
1601
1819
  HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
1602
1820
  #undef HWY_RVV_MASKED_LOAD
1603
1821
 
1822
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
1823
+ HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
1824
+ const TFromD<D>* HWY_RESTRICT p) {
1825
+ const RebindToUnsigned<decltype(d)> du;
1826
+ return BitCast(d,
1827
+ MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
1828
+ }
1829
+
1830
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
1831
+ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> no, MFromD<D> m, D d,
1832
+ const TFromD<D>* HWY_RESTRICT p) {
1833
+ const RebindToUnsigned<decltype(d)> du;
1834
+ return BitCast(d, MaskedLoadOr(BitCast(du, no), RebindMask(du, m), du,
1835
+ detail::U16LanePointer(p)));
1836
+ }
1837
+
1604
1838
  // ------------------------------ LoadN
1605
1839
 
1606
1840
  // Native with avl is faster than the generic_ops using FirstN.
@@ -1616,29 +1850,41 @@ HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
1616
1850
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1617
1851
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1618
1852
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
1619
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1620
1853
  /* Use a tail-undisturbed load in LoadN as the tail-undisturbed load */ \
1621
1854
  /* operation below will leave any lanes past the first */ \
1622
1855
  /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes unchanged */ \
1623
1856
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
1624
- Zero(d), reinterpret_cast<const T*>(p), CappedLanes(d, num_lanes)); \
1857
+ Zero(d), detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
1625
1858
  } \
1626
1859
  template <size_t N> \
1627
1860
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME##Or( \
1628
1861
  HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1629
1862
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
1630
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1631
1863
  /* Use a tail-undisturbed load in LoadNOr as the tail-undisturbed load */ \
1632
1864
  /* operation below will set any lanes past the first */ \
1633
1865
  /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes to the */ \
1634
1866
  /* corresponding lanes in no */ \
1635
1867
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
1636
- no, reinterpret_cast<const T*>(p), CappedLanes(d, num_lanes)); \
1868
+ no, detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
1637
1869
  }
1638
1870
 
1639
1871
  HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
1640
1872
  #undef HWY_RVV_LOADN
1641
1873
 
1874
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
1875
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1876
+ size_t num_lanes) {
1877
+ const RebindToUnsigned<D> du;
1878
+ return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
1879
+ }
1880
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
1881
+ HWY_API VFromD<D> LoadNOr(VFromD<D> v, D d, const TFromD<D>* HWY_RESTRICT p,
1882
+ size_t num_lanes) {
1883
+ const RebindToUnsigned<D> du;
1884
+ return BitCast(
1885
+ d, LoadNOr(BitCast(du, v), du, detail::U16LanePointer(p), num_lanes));
1886
+ }
1887
+
1642
1888
  // ------------------------------ Store
1643
1889
 
1644
1890
  #define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
@@ -1647,13 +1893,18 @@ HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
1647
1893
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
1648
1894
  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1649
1895
  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1650
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1651
- return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(reinterpret_cast<T*>(p), \
1652
- v, Lanes(d)); \
1896
+ return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
1897
+ detail::NativeLanePointer(p), v, Lanes(d)); \
1653
1898
  }
1654
1899
  HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
1655
1900
  #undef HWY_RVV_STORE
1656
1901
 
1902
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
1903
+ HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1904
+ const RebindToUnsigned<decltype(d)> du;
1905
+ Store(BitCast(du, v), du, detail::U16LanePointer(p));
1906
+ }
1907
+
1657
1908
  // ------------------------------ BlendedStore
1658
1909
 
1659
1910
  #define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
@@ -1662,13 +1913,20 @@ HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
1662
1913
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
1663
1914
  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1664
1915
  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1665
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1666
1916
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m( \
1667
- m, reinterpret_cast<T*>(p), v, Lanes(d)); \
1917
+ m, detail::NativeLanePointer(p), v, Lanes(d)); \
1668
1918
  }
1669
1919
  HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
1670
1920
  #undef HWY_RVV_BLENDED_STORE
1671
1921
 
1922
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
1923
+ HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1924
+ TFromD<D>* HWY_RESTRICT p) {
1925
+ const RebindToUnsigned<decltype(d)> du;
1926
+ BlendedStore(BitCast(du, v), RebindMask(du, m), du,
1927
+ detail::U16LanePointer(p));
1928
+ }
1929
+
1672
1930
  // ------------------------------ StoreN
1673
1931
 
1674
1932
  namespace detail {
@@ -1679,13 +1937,18 @@ namespace detail {
1679
1937
  HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
1680
1938
  HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
1681
1939
  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1682
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1683
- return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(reinterpret_cast<T*>(p), \
1684
- v, count); \
1940
+ return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
1941
+ detail::NativeLanePointer(p), v, count); \
1685
1942
  }
1686
1943
  HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
1687
1944
  #undef HWY_RVV_STOREN
1688
1945
 
1946
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
1947
+ HWY_API void StoreN(size_t count, VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1948
+ const RebindToUnsigned<decltype(d)> du;
1949
+ StoreN(count, BitCast(du, v), du, detail::U16LanePointer(p));
1950
+ }
1951
+
1689
1952
  } // namespace detail
1690
1953
 
1691
1954
  #ifdef HWY_NATIVE_STORE_N
@@ -1694,13 +1957,12 @@ HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
1694
1957
  #define HWY_NATIVE_STORE_N
1695
1958
  #endif
1696
1959
 
1697
- template <class D, typename T = TFromD<D>,
1698
- hwy::EnableIf<hwy::IsSame<T, TFromV<VFromD<D>>>()>* = nullptr>
1699
- HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1960
+ template <class D>
1961
+ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
1700
1962
  size_t max_lanes_to_store) {
1701
- // NOTE: Need to call Lanes(d) and clamp max_lanes_to_store to Lanes(d), even
1702
- // if MaxLanes(d) >= MaxLanes(DFromV<VFromD<D>>()) is true, as it is possible
1703
- // for detail::StoreN(max_lanes_to_store, v, d, p) to store fewer than
1963
+ // NOTE: Need to clamp max_lanes_to_store to Lanes(d), even if
1964
+ // MaxLanes(d) >= MaxLanes(DFromV<VFromD<D>>()) is true, as it is possible for
1965
+ // detail::StoreN(max_lanes_to_store, v, d, p) to store fewer than
1704
1966
  // Lanes(DFromV<VFromD<D>>()) lanes to p if
1705
1967
  // max_lanes_to_store > Lanes(DFromV<VFromD<D>>()) and
1706
1968
  // max_lanes_to_store < 2 * Lanes(DFromV<VFromD<D>>()) are both true.
@@ -1709,21 +1971,7 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1709
1971
  // if Lanes(d) < Lanes(DFromV<VFromD<D>>()) is true, which is possible if
1710
1972
  // MaxLanes(d) < MaxLanes(DFromV<VFromD<D>>()) or
1711
1973
  // d.Pow2() < DFromV<VFromD<D>>().Pow2() is true.
1712
- const size_t N = Lanes(d);
1713
- detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d, p);
1714
- }
1715
-
1716
- // StoreN for BF16/F16 vectors
1717
- template <class D, typename T = TFromD<D>,
1718
- hwy::EnableIf<!hwy::IsSame<T, TFromV<VFromD<D>>>()>* = nullptr,
1719
- HWY_IF_SPECIAL_FLOAT(T)>
1720
- HWY_API void StoreN(VFromD<D> v, D /*d*/, T* HWY_RESTRICT p,
1721
- size_t max_lanes_to_store) {
1722
- using TStore = TFromV<VFromD<D>>;
1723
- const Rebind<TStore, D> d_store;
1724
- const size_t N = Lanes(d_store);
1725
- detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d_store,
1726
- reinterpret_cast<TStore * HWY_RESTRICT>(p));
1974
+ detail::StoreN(CappedLanes(d, max_lanes_to_store), v, d, p);
1727
1975
  }
1728
1976
 
1729
1977
  // ------------------------------ StoreU
@@ -1747,17 +1995,16 @@ HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
1747
1995
  #define HWY_NATIVE_SCATTER
1748
1996
  #endif
1749
1997
 
1750
- #define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1751
- SHIFT, MLEN, NAME, OP) \
1752
- template <size_t N> \
1753
- HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
1754
- HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1755
- HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
1756
- HWY_RVV_V(int, SEW, LMUL) offset) { \
1757
- const RebindToUnsigned<decltype(d)> du; \
1758
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1759
- return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
1760
- reinterpret_cast<T*>(base), BitCast(du, offset), v, Lanes(d)); \
1998
+ #define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1999
+ SHIFT, MLEN, NAME, OP) \
2000
+ template <size_t N> \
2001
+ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
2002
+ HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
2003
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
2004
+ HWY_RVV_V(int, SEW, LMUL) offset) { \
2005
+ const RebindToUnsigned<decltype(d)> du; \
2006
+ return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
2007
+ detail::NativeLanePointer(base), BitCast(du, offset), v, Lanes(d)); \
1761
2008
  }
1762
2009
  HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
1763
2010
  #undef HWY_RVV_SCATTER
@@ -1772,19 +2019,18 @@ HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
1772
2019
 
1773
2020
  // ------------------------------ MaskedScatterIndex
1774
2021
 
1775
- #define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
1776
- LMULH, SHIFT, MLEN, NAME, OP) \
1777
- template <size_t N> \
1778
- HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
1779
- HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1780
- HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
1781
- HWY_RVV_V(int, SEW, LMUL) indices) { \
1782
- const RebindToUnsigned<decltype(d)> du; \
1783
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1784
- constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>)); \
1785
- return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \
1786
- m, reinterpret_cast<T*>(base), ShiftLeft<kBits>(BitCast(du, indices)), \
1787
- v, Lanes(d)); \
2022
+ #define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
2023
+ LMULH, SHIFT, MLEN, NAME, OP) \
2024
+ template <size_t N> \
2025
+ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
2026
+ HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
2027
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
2028
+ HWY_RVV_V(int, SEW, LMUL) indices) { \
2029
+ const RebindToUnsigned<decltype(d)> du; \
2030
+ constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>)); \
2031
+ return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \
2032
+ m, detail::NativeLanePointer(base), \
2033
+ ShiftLeft<kBits>(BitCast(du, indices)), v, Lanes(d)); \
1788
2034
  }
1789
2035
  HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
1790
2036
  #undef HWY_RVV_MASKED_SCATTER
@@ -1805,9 +2051,8 @@ HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
1805
2051
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
1806
2052
  HWY_RVV_V(int, SEW, LMUL) offset) { \
1807
2053
  const RebindToUnsigned<decltype(d)> du; \
1808
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
1809
2054
  return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
1810
- reinterpret_cast<const T*>(base), BitCast(du, offset), Lanes(d)); \
2055
+ detail::NativeLanePointer(base), BitCast(du, offset), Lanes(d)); \
1811
2056
  }
1812
2057
  HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
1813
2058
  #undef HWY_RVV_GATHER
@@ -1821,25 +2066,34 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
1821
2066
  return GatherOffset(d, base, ShiftLeft<kBits>(index));
1822
2067
  }
1823
2068
 
1824
- // ------------------------------ MaskedGatherIndex
2069
+ // ------------------------------ MaskedGatherIndexOr
1825
2070
 
1826
2071
  #define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1827
2072
  SHIFT, MLEN, NAME, OP) \
1828
2073
  template <size_t N> \
1829
2074
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1830
- NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
2075
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
2076
+ HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1831
2077
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
1832
2078
  HWY_RVV_V(int, SEW, LMUL) indices) { \
1833
2079
  const RebindToUnsigned<decltype(d)> du; \
1834
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
2080
+ const RebindToSigned<decltype(d)> di; \
2081
+ (void)di; /* for HWY_DASSERT */ \
1835
2082
  constexpr size_t kBits = CeilLog2(SEW / 8); \
2083
+ HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
1836
2084
  return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_mu( \
1837
- m, Zero(d), reinterpret_cast<const T*>(base), \
2085
+ m, no, detail::NativeLanePointer(base), \
1838
2086
  ShiftLeft<kBits>(BitCast(du, indices)), Lanes(d)); \
1839
2087
  }
1840
- HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER, MaskedGatherIndex, lux, _ALL_VIRT)
2088
+ HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER, MaskedGatherIndexOr, lux, _ALL_VIRT)
1841
2089
  #undef HWY_RVV_MASKED_GATHER
1842
2090
 
2091
+ template <class D>
2092
+ HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d, const TFromD<D>* base,
2093
+ VFromD<RebindToSigned<D>> indices) {
2094
+ return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
2095
+ }
2096
+
1843
2097
  // ================================================== CONVERT
1844
2098
 
1845
2099
  // ------------------------------ PromoteTo
@@ -1952,52 +2206,38 @@ HWY_API auto PromoteTo(Simd<uint64_t, N, -1> d,
1952
2206
  }
1953
2207
 
1954
2208
  // Unsigned to signed: cast for unsigned promote.
1955
- template <size_t N, int kPow2>
1956
- HWY_API auto PromoteTo(Simd<int16_t, N, kPow2> d,
1957
- VFromD<Rebind<uint8_t, decltype(d)>> v)
1958
- -> VFromD<decltype(d)> {
2209
+ template <class D, HWY_IF_I16_D(D)>
2210
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
1959
2211
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1960
2212
  }
1961
2213
 
1962
- template <size_t N, int kPow2>
1963
- HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1964
- VFromD<Rebind<uint8_t, decltype(d)>> v)
1965
- -> VFromD<decltype(d)> {
2214
+ template <class D, HWY_IF_I32_D(D)>
2215
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
1966
2216
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1967
2217
  }
1968
2218
 
1969
- template <size_t N, int kPow2>
1970
- HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
1971
- VFromD<Rebind<uint16_t, decltype(d)>> v)
1972
- -> VFromD<decltype(d)> {
2219
+ template <class D, HWY_IF_I32_D(D)>
2220
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
1973
2221
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1974
2222
  }
1975
2223
 
1976
- template <size_t N, int kPow2>
1977
- HWY_API auto PromoteTo(Simd<int64_t, N, kPow2> d,
1978
- VFromD<Rebind<uint32_t, decltype(d)>> v)
1979
- -> VFromD<decltype(d)> {
2224
+ template <class D, HWY_IF_I64_D(D)>
2225
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) {
1980
2226
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1981
2227
  }
1982
2228
 
1983
- template <size_t N, int kPow2>
1984
- HWY_API auto PromoteTo(Simd<int64_t, N, kPow2> d,
1985
- VFromD<Rebind<uint16_t, decltype(d)>> v)
1986
- -> VFromD<decltype(d)> {
2229
+ template <class D, HWY_IF_I64_D(D)>
2230
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
1987
2231
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1988
2232
  }
1989
2233
 
1990
- template <size_t N, int kPow2>
1991
- HWY_API auto PromoteTo(Simd<int64_t, N, kPow2> d,
1992
- VFromD<Rebind<uint8_t, decltype(d)>> v)
1993
- -> VFromD<decltype(d)> {
2234
+ template <class D, HWY_IF_I64_D(D)>
2235
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
1994
2236
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
1995
2237
  }
1996
2238
 
1997
- template <size_t N, int kPow2>
1998
- HWY_API auto PromoteTo(Simd<float32_t, N, kPow2> d,
1999
- VFromD<Rebind<bfloat16_t, decltype(d)>> v)
2000
- -> VFromD<decltype(d)> {
2239
+ template <class D, HWY_IF_F32_D(D)>
2240
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<hwy::bfloat16_t, D>> v) {
2001
2241
  const RebindToSigned<decltype(d)> di32;
2002
2242
  const Rebind<uint16_t, decltype(d)> du16;
2003
2243
  return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
@@ -2097,28 +2337,24 @@ HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vuint32m8_t v) {
2097
2337
  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2098
2338
  }
2099
2339
 
2100
- template <size_t N, int kPow2>
2101
- HWY_API VFromD<Simd<uint8_t, N, kPow2>> DemoteTo(
2102
- Simd<uint8_t, N, kPow2> d, VFromD<Simd<int64_t, N, kPow2 + 3>> v) {
2103
- return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 2>(), v));
2340
+ template <class D, HWY_IF_U8_D(D)>
2341
+ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
2342
+ return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
2104
2343
  }
2105
2344
 
2106
- template <size_t N, int kPow2>
2107
- HWY_API VFromD<Simd<uint8_t, N, kPow2>> DemoteTo(
2108
- Simd<uint8_t, N, kPow2> d, VFromD<Simd<uint64_t, N, kPow2 + 3>> v) {
2109
- return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 2>(), v));
2345
+ template <class D, HWY_IF_U8_D(D)>
2346
+ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
2347
+ return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
2110
2348
  }
2111
2349
 
2112
- template <size_t N, int kPow2>
2113
- HWY_API VFromD<Simd<uint16_t, N, kPow2>> DemoteTo(
2114
- Simd<uint16_t, N, kPow2> d, VFromD<Simd<int64_t, N, kPow2 + 2>> v) {
2115
- return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 1>(), v));
2350
+ template <class D, HWY_IF_U16_D(D)>
2351
+ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
2352
+ return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
2116
2353
  }
2117
2354
 
2118
- template <size_t N, int kPow2>
2119
- HWY_API VFromD<Simd<uint16_t, N, kPow2>> DemoteTo(
2120
- Simd<uint16_t, N, kPow2> d, VFromD<Simd<uint64_t, N, kPow2 + 2>> v) {
2121
- return DemoteTo(d, DemoteTo(Simd<uint32_t, N, kPow2 + 1>(), v));
2355
+ template <class D, HWY_IF_U16_D(D)>
2356
+ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
2357
+ return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
2122
2358
  }
2123
2359
 
2124
2360
  HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
@@ -2501,16 +2737,14 @@ HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
2501
2737
  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
2502
2738
  }
2503
2739
 
2504
- template <size_t N, int kPow2>
2505
- HWY_API VFromD<Simd<int8_t, N, kPow2>> DemoteTo(
2506
- Simd<int8_t, N, kPow2> d, VFromD<Simd<int64_t, N, kPow2 + 3>> v) {
2507
- return DemoteTo(d, DemoteTo(Simd<int32_t, N, kPow2 + 2>(), v));
2740
+ template <class D, HWY_IF_I8_D(D)>
2741
+ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
2742
+ return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v));
2508
2743
  }
2509
2744
 
2510
- template <size_t N, int kPow2>
2511
- HWY_API VFromD<Simd<int16_t, N, kPow2>> DemoteTo(
2512
- Simd<int16_t, N, kPow2> d, VFromD<Simd<int64_t, N, kPow2 + 2>> v) {
2513
- return DemoteTo(d, DemoteTo(Simd<int32_t, N, kPow2 + 1>(), v));
2745
+ template <class D, HWY_IF_I16_D(D)>
2746
+ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
2747
+ return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v));
2514
2748
  }
2515
2749
 
2516
2750
  #undef HWY_RVV_DEMOTE
@@ -2527,9 +2761,15 @@ HWY_API VFromD<Simd<int16_t, N, kPow2>> DemoteTo(
  }

  #if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C
- HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_rod_f_f_w_f, _DEMOTE_VIRT)
+ HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT)
  #endif
- HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_rod_f_f_w_f, _DEMOTE_VIRT)
+ HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT)
+
+ namespace detail {
+ HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteToF32WithRoundToOdd,
+ fncvt_rod_f_f_w_f, _DEMOTE_VIRT)
+ } // namespace detail
+
  #undef HWY_RVV_DEMOTE_F

  // TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
@@ -2617,27 +2857,72 @@ HWY_API vfloat32m4_t DemoteTo(Simd<float, N, 2> d, const vuint64m8_t v) {
2617
2857
  return __riscv_vfncvt_f_xu_w_f32m4(v, Lanes(d));
2618
2858
  }
2619
2859
 
2860
+ // Narrows f32 bits to bf16 using round to even.
2620
2861
  // SEW is for the source so we can use _DEMOTE_VIRT.
2621
- #define HWY_RVV_DEMOTE_TO_SHR_16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
2622
- LMULH, SHIFT, MLEN, NAME, OP) \
2862
+ #ifdef HWY_RVV_AVOID_VXRM
2863
+ #define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, \
2864
+ LMULD, LMULH, SHIFT, MLEN, NAME, OP) \
2865
+ template <size_t N> \
2866
+ HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
2867
+ HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2868
+ const auto round = \
2869
+ detail::AddS(detail::AndS(ShiftRight<16>(v), 1u), 0x7FFFu); \
2870
+ v = Add(v, round); \
2871
+ /* The default rounding mode appears to be RNU=0, which adds the LSB. */ \
2872
+ /* Prevent further rounding by clearing the bits we want to truncate. */ \
2873
+ v = detail::AndS(v, 0xFFFF0000u); \
2874
+ return __riscv_v##OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \
2875
+ }
2876
+
2877
+ #else
2878
+ #define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, \
2879
+ LMULD, LMULH, SHIFT, MLEN, NAME, OP) \
2623
2880
  template <size_t N> \
2624
2881
  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
2625
2882
  HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2626
2883
  return __riscv_v##OP##CHAR##SEWH##LMULH( \
2627
- v, 16, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); \
2884
+ v, 16, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNE, Lanes(d))); \
2628
2885
  }
2886
+ #endif // HWY_RVV_AVOID_VXRM
2629
2887
  namespace detail {
2630
- HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_TO_SHR_16, DemoteToShr16, nclipu_wx_,
2631
- _DEMOTE_VIRT)
2888
+ HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_16_NEAREST_EVEN, DemoteTo16NearestEven,
2889
+ nclipu_wx_, _DEMOTE_VIRT)
2632
2890
  }
2633
- #undef HWY_RVV_DEMOTE_TO_SHR_16
2891
+ #undef HWY_RVV_DEMOTE_16_NEAREST_EVEN
2892
+
2893
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
2894
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
2895
+ #else
2896
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
2897
+ #endif
2634
2898
 
2635
- template <size_t N, int kPow2>
2636
- HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> DemoteTo(
2637
- Simd<bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
2899
+ template <class DBF16, HWY_IF_BF16_D(DBF16)>
2900
+ HWY_API VFromD<DBF16> DemoteTo(DBF16 d, VFromD<Rebind<float, DBF16>> v) {
2901
+ const DFromV<decltype(v)> df;
2902
+ const RebindToUnsigned<decltype(df)> du32;
2638
2903
  const RebindToUnsigned<decltype(d)> du16;
2639
- const Rebind<uint32_t, decltype(d)> du32;
2640
- return BitCast(d, detail::DemoteToShr16(du16, BitCast(du32, v)));
2904
+ // Consider an f32 mantissa with the upper 7 bits set, followed by a 1-bit
2905
+ // and at least one other bit set. This will round to 0 and increment the
2906
+ // exponent. If the exponent was already 0xFF (NaN), then the result is -inf;
2907
+ // there no wraparound because nclipu saturates. Note that in this case, the
2908
+ // input cannot have been inf because its mantissa bits are zero. To avoid
2909
+ // converting NaN to inf, we canonicalize the NaN to prevent the rounding.
2910
+ const decltype(v) canonicalized =
2911
+ IfThenElse(Eq(v, v), v, BitCast(df, Set(du32, 0x7F800000)));
2912
+ return BitCast(
2913
+ d, detail::DemoteTo16NearestEven(du16, BitCast(du32, canonicalized)));
2914
+ }
2915
+
2916
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
2917
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
2918
+ #else
2919
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
2920
+ #endif
2921
+
2922
+ template <class D, HWY_IF_F16_D(D)>
2923
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
2924
+ const Rebind<float, decltype(df16)> df32;
2925
+ return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
2641
2926
  }
2642
2927
 
2643
2928
  // ------------------------------ ConvertTo F
@@ -2664,8 +2949,8 @@ HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> DemoteTo(
  HWY_API HWY_RVV_V(uint, SEW, LMUL) ConvertTo( \
  HWY_RVV_D(uint, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
  return __riscv_vfcvt_rtz_xu_f_v_u##SEW##LMUL(v, Lanes(d)); \
- } \
- // API only requires f32 but we provide f64 for internal use.
+ }
+
  HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
  #undef HWY_RVV_CONVERT

@@ -2704,7 +2989,7 @@ HWY_INLINE size_t LanesPerBlock(Simd<T, N, kPow2> d) {

  template <class D, class V>
  HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
- using T = MakeUnsigned<TFromD<D>>;
+ using T = MakeUnsigned<TFromV<V>>;
  return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
  }

@@ -2918,9 +3203,10 @@ HWY_RVV_FOREACH_B(HWY_RVV_SET_AT_OR_AFTER_FIRST, _, _)

  // ------------------------------ InsertLane

- template <class V, HWY_IF_NOT_T_SIZE_V(V, 1)>
- HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
- const DFromV<V> d;
+ // T template arg because TFromV<V> might not match the hwy::float16_t argument.
+ template <class V, typename T, HWY_IF_NOT_T_SIZE_V(V, 1)>
+ HWY_API V InsertLane(const V v, size_t i, T t) {
+ const Rebind<T, DFromV<V>> d;
  const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
  using TU = TFromD<decltype(du)>;
  const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
@@ -2928,9 +3214,9 @@ HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
  }

  // For 8-bit lanes, Iota0 might overflow.
- template <class V, HWY_IF_T_SIZE_V(V, 1)>
- HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
- const DFromV<V> d;
+ template <class V, typename T, HWY_IF_T_SIZE_V(V, 1)>
+ HWY_API V InsertLane(const V v, size_t i, T t) {
+ const Rebind<T, DFromV<V>> d;
  const auto zero = Zero(d);
  const auto one = Set(d, 1);
  const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
@@ -2991,6 +3277,18 @@ HWY_API V DupOdd(const V v) {
  return OddEven(v, down);
  }

+ // ------------------------------ InterleaveEven (OddEven)
+ template <class D>
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return OddEven(detail::Slide1Up(b), a);
+ }
+
+ // ------------------------------ InterleaveOdd (OddEven)
+ template <class D>
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return OddEven(b, detail::Slide1Down(a));
+ }
+
  // ------------------------------ OddEvenBlocks
  template <class V>
  HWY_API V OddEvenBlocks(const V a, const V b) {
@@ -3034,9 +3332,6 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
  }

- // TODO(janwas): avoid using this for 8-bit; wrap in detail namespace.
- // For large 8-bit vectors, index overflow will lead to incorrect results.
- // Reverse already uses TableLookupLanes16 to prevent this.
  #define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
  MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
@@ -3045,12 +3340,14 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
  HWY_RVV_AVL(SEW, SHIFT)); \
  }

+ // TableLookupLanes is supported for all types, but beware that indices are
+ // likely to wrap around for 8-bit lanes. When using TableLookupLanes inside
+ // this file, ensure that it is safe or use TableLookupLanes16 instead.
  HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
  #undef HWY_RVV_TABLE

  namespace detail {

- // Used by I8/U8 Reverse
  #define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
  SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
@@ -3122,17 +3419,78 @@ HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
3122
3419
  return TableLookupLanes(v, idx);
3123
3420
  }
3124
3421
 
3125
- // ------------------------------ Reverse2 (RotateRight, OddEven)
3422
+ // ------------------------------ ResizeBitCast
3126
3423
 
3127
- // Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
3128
- #ifdef HWY_NATIVE_REVERSE2_8
3129
- #undef HWY_NATIVE_REVERSE2_8
3130
- #else
3131
- #define HWY_NATIVE_REVERSE2_8
3132
- #endif
3424
+ // Extends or truncates a vector to match the given d.
3425
+ namespace detail {
3133
3426
 
3134
- // Shifting and adding requires fewer instructions than blending, but casting to
3135
- // u32 only works for LMUL in [1/2, 8].
3427
+ template <class D>
3428
+ HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
3429
+ return v;
3430
+ }
3431
+
3432
+ // Sanity check: when calling ChangeLMUL, the caller (ResizeBitCast) already
3433
+ // BitCast to the same lane type. Note that V may use the native lane type for
3434
+ // f16, so convert D to that before checking.
3435
+ #define HWY_RVV_IF_SAME_T_DV(D, V) \
3436
+ hwy::EnableIf<IsSame<NativeLaneType<TFromD<D>>, TFromV<V>>()>* = nullptr
3437
+
3438
+ // LMUL of VFromD<D> < LMUL of V: need to truncate v
3439
+ template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
3440
+ HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
3441
+ HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
3442
+ const DFromV<V> d_from;
3443
+ const Half<decltype(d_from)> dh_from;
3444
+ static_assert(
3445
+ DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
3446
+ "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
3447
+ static_assert(
3448
+ DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
3449
+ "The LMUL of VFromD<D> must be less than or equal to the LMUL of "
3450
+ "VFromD<decltype(dh_from)>");
3451
+ return ChangeLMUL(d, Trunc(v));
3452
+ }
3453
+
3454
+ // LMUL of VFromD<D> > LMUL of V: need to extend v
3455
+ template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
3456
+ HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
3457
+ HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
3458
+ const DFromV<V> d_from;
3459
+ const Twice<decltype(d_from)> dt_from;
3460
+ static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
3461
+ "The LMUL of VFromD<decltype(dt_from)> must be greater than "
3462
+ "the LMUL of V");
3463
+ static_assert(
3464
+ DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
3465
+ "The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
3466
+ "VFromD<decltype(dt_from)>");
3467
+ return ChangeLMUL(d, Ext(dt_from, v));
3468
+ }
3469
+
3470
+ #undef HWY_RVV_IF_SAME_T_DV
3471
+
3472
+ } // namespace detail
3473
+
3474
+ template <class DTo, class VFrom>
3475
+ HWY_API VFromD<DTo> ResizeBitCast(DTo /*dto*/, VFrom v) {
3476
+ const DFromV<decltype(v)> d_from;
3477
+ const Repartition<uint8_t, decltype(d_from)> du8_from;
3478
+ const DFromV<VFromD<DTo>> d_to;
3479
+ const Repartition<uint8_t, decltype(d_to)> du8_to;
3480
+ return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
3481
+ }
3482
+
3483
+ // ------------------------------ Reverse2 (RotateRight, OddEven)
3484
+
3485
+ // Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
3486
+ #ifdef HWY_NATIVE_REVERSE2_8
3487
+ #undef HWY_NATIVE_REVERSE2_8
3488
+ #else
3489
+ #define HWY_NATIVE_REVERSE2_8
3490
+ #endif
3491
+
3492
+ // Shifting and adding requires fewer instructions than blending, but casting to
3493
+ // u32 only works for LMUL in [1/2, 8].
3136
3494
 
3137
3495
  template <class D, HWY_IF_T_SIZE_D(D, 1)>
3138
3496
  HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
@@ -3307,7 +3665,7 @@ template <class V, class M, class D>
  HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
  TFromD<D>* HWY_RESTRICT unaligned) {
  const size_t count = CountTrue(d, mask);
- detail::StoreN(count, Compress(v, mask), d, unaligned);
+ StoreN(Compress(v, mask), d, unaligned, count);
  return count;
  }

@@ -3409,6 +3767,9 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
  return Combine(d, LowerHalf(dh, hi_even), LowerHalf(dh, lo_even));
  }

+ // ------------------------------ PromoteEvenTo/PromoteOddTo
+ #include "hwy/ops/inside-inl.h"
+
  // ================================================== BLOCKWISE

  // ------------------------------ CombineShiftRightBytes
@@ -3483,50 +3844,6 @@ HWY_API V Shuffle0123(const V v) {
3483
3844
 
3484
3845
  // ------------------------------ TableLookupBytes
3485
3846
 
3486
- // Extends or truncates a vector to match the given d.
3487
- namespace detail {
3488
-
3489
- template <class D>
3490
- HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
3491
- return v;
3492
- }
3493
-
3494
- // LMUL of VFromD<D> < LMUL of V: need to truncate v
3495
- template <class D, class V,
3496
- hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
3497
- HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
3498
- HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
3499
- const DFromV<decltype(v)> d_from;
3500
- const Half<decltype(d_from)> dh_from;
3501
- static_assert(
3502
- DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
3503
- "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
3504
- static_assert(
3505
- DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
3506
- "The LMUL of VFromD<D> must be less than or equal to the LMUL of "
3507
- "VFromD<decltype(dh_from)>");
3508
- return ChangeLMUL(d, Trunc(v));
3509
- }
3510
-
3511
- // LMUL of VFromD<D> > LMUL of V: need to extend v
3512
- template <class D, class V,
3513
- hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
3514
- HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
3515
- HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
3516
- const DFromV<decltype(v)> d_from;
3517
- const Twice<decltype(d_from)> dt_from;
3518
- static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
3519
- "The LMUL of VFromD<decltype(dt_from)> must be greater than "
3520
- "the LMUL of V");
3521
- static_assert(
3522
- DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
3523
- "The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
3524
- "VFromD<decltype(dt_from)>");
3525
- return ChangeLMUL(d, Ext(dt_from, v));
3526
- }
3527
-
3528
- } // namespace detail
3529
-
3530
3847
  template <class VT, class VI>
3531
3848
  HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
3532
3849
  const DFromV<VT> dt; // T=table, I=index.
@@ -3563,7 +3880,8 @@ HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {

  // ------------------------------ TwoTablesLookupLanes

- // TODO(janwas): special-case 8-bit lanes to safely handle VL >= 256
+ // WARNING: 8-bit lanes may lead to unexpected results because idx is the same
+ // size and may overflow.
  template <class D, HWY_IF_POW2_LE_D(D, 2)>
  HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
  VFromD<RebindToUnsigned<D>> idx) {
@@ -3597,11 +3915,50 @@ HWY_API V TwoTablesLookupLanes(V a, V b,
3597
3915
  }
3598
3916
 
3599
3917
  // ------------------------------ Broadcast
3600
- template <int kLane, class V>
3918
+
3919
+ // 8-bit requires 16-bit tables.
3920
+ template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
3921
+ HWY_IF_POW2_LE_D(D, 2)>
3601
3922
  HWY_API V Broadcast(const V v) {
3602
- const DFromV<V> d;
3603
- const RebindToUnsigned<decltype(d)> du;
3923
+ const D d;
3604
3924
  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
3925
+
3926
+ const Rebind<uint16_t, decltype(d)> du16;
3927
+ VFromD<decltype(du16)> idx =
3928
+ detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
3929
+ if (kLane != 0) {
3930
+ idx = detail::AddS(idx, kLane);
3931
+ }
3932
+ return detail::TableLookupLanes16(v, idx);
3933
+ }
3934
+
3935
+ // 8-bit and max LMUL: split into halves.
3936
+ template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
3937
+ HWY_IF_POW2_GT_D(D, 2)>
3938
+ HWY_API V Broadcast(const V v) {
3939
+ const D d;
3940
+ HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
3941
+
3942
+ const Half<decltype(d)> dh;
3943
+ using VH = VFromD<decltype(dh)>;
3944
+ const Rebind<uint16_t, decltype(dh)> du16;
3945
+ VFromD<decltype(du16)> idx =
3946
+ detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
3947
+ if (kLane != 0) {
3948
+ idx = detail::AddS(idx, kLane);
3949
+ }
3950
+ const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
3951
+ const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
3952
+ return Combine(d, hi, lo);
3953
+ }
3954
+
3955
+ template <int kLane, class V, class D = DFromV<V>,
3956
+ HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
3957
+ HWY_API V Broadcast(const V v) {
3958
+ const D d;
3959
+ HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
3960
+
3961
+ const RebindToUnsigned<decltype(d)> du;
3605
3962
  auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du));
3606
3963
  if (kLane != 0) {
3607
3964
  idx = detail::AddS(idx, kLane);
@@ -3778,20 +4135,194 @@ HWY_API V ShiftRightBytes(const D d, const V v) {
3778
4135
  return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
3779
4136
  }
3780
4137
 
3781
- // ------------------------------ InterleaveLower
4138
+ // ------------------------------ InterleaveWholeLower
4139
+ #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
4140
+ #undef HWY_NATIVE_INTERLEAVE_WHOLE
4141
+ #else
4142
+ #define HWY_NATIVE_INTERLEAVE_WHOLE
4143
+ #endif
4144
+
4145
+ namespace detail {
4146
+ // Returns double-length vector with interleaved lanes.
4147
+ template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
4148
+ HWY_IF_POW2_GT_D(D, -3)>
4149
+ HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
4150
+ const RebindToUnsigned<decltype(d)> du;
4151
+ using TW = MakeWide<TFromD<decltype(du)>>;
4152
+ const Rebind<TW, Half<decltype(du)>> dw;
4153
+ const Half<decltype(du)> duh; // cast inputs to unsigned so we zero-extend
4154
+
4155
+ const VFromD<decltype(dw)> aw = PromoteTo(dw, BitCast(duh, a));
4156
+ const VFromD<decltype(dw)> bw = PromoteTo(dw, BitCast(duh, b));
4157
+ return BitCast(d, Or(aw, BitCast(dw, detail::Slide1Up(BitCast(du, bw)))));
4158
+ }
4159
+ // 64-bit: cannot PromoteTo, but can Ext.
4160
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
4161
+ HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
4162
+ const RebindToUnsigned<decltype(d)> du;
4163
+ const auto idx = ShiftRight<1>(detail::Iota0(du));
4164
+ return OddEven(TableLookupLanes(detail::Ext(d, b), idx),
4165
+ TableLookupLanes(detail::Ext(d, a), idx));
4166
+ }
4167
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_GT_D(D, 2)>
4168
+ HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
4169
+ const Half<D> dh;
4170
+ const Half<decltype(dh)> dq;
4171
+ const VFromD<decltype(dh)> i0 =
4172
+ InterleaveWhole(dh, LowerHalf(dq, a), LowerHalf(dq, b));
4173
+ const VFromD<decltype(dh)> i1 =
4174
+ InterleaveWhole(dh, UpperHalf(dq, a), UpperHalf(dq, b));
4175
+ return Combine(d, i1, i0);
4176
+ }
4177
+
4178
+ } // namespace detail
3782
4179
 
3783
- template <class D, class V>
4180
+ template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
4181
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4182
+ const RebindToUnsigned<decltype(d)> du;
4183
+ const detail::AdjustSimdTagToMinVecPow2<RepartitionToWide<decltype(du)>> dw;
4184
+ const RepartitionToNarrow<decltype(dw)> du_src;
4185
+
4186
+ const VFromD<D> aw =
4187
+ ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, a)));
4188
+ const VFromD<D> bw =
4189
+ ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, b)));
4190
+ return Or(aw, detail::Slide1Up(bw));
4191
+ }
4192
+
4193
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
4194
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4195
+ const RebindToUnsigned<decltype(d)> du;
4196
+ const auto idx = ShiftRight<1>(detail::Iota0(du));
4197
+ return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
4198
+ }
4199
+
4200
+ // ------------------------------ InterleaveWholeUpper
4201
+
4202
+ template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
4203
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4204
+ // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
4205
+ // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
4206
+ // true and and as the results of InterleaveWholeUpper are
4207
+ // implementation-defined if Lanes(d) is less than 2.
4208
+ const size_t half_N = Lanes(d) / 2;
4209
+ return InterleaveWholeLower(d, detail::SlideDown(a, half_N),
4210
+ detail::SlideDown(b, half_N));
4211
+ }
4212
+
4213
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
4214
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4215
+ // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
4216
+ // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
4217
+ // true and as the results of InterleaveWholeUpper are implementation-defined
4218
+ // if Lanes(d) is less than 2.
4219
+ const size_t half_N = Lanes(d) / 2;
4220
+ const RebindToUnsigned<decltype(d)> du;
4221
+ const auto idx = detail::AddS(ShiftRight<1>(detail::Iota0(du)),
4222
+ static_cast<uint64_t>(half_N));
4223
+ return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
4224
+ }
4225
+
4226
+ // ------------------------------ InterleaveLower (InterleaveWholeLower)
4227
+
4228
+ namespace detail {
4229
+
4230
+ // Definitely at least 128 bit: match x86 semantics (independent blocks). Using
4231
+ // InterleaveWhole and 64-bit Compress avoids 8-bit overflow.
4232
+ template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
4233
+ HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
4234
+ static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
4235
+ const Twice<D> dt;
4236
+ const RebindToUnsigned<decltype(dt)> dt_u;
4237
+ const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
4238
+ // Keep only even 128-bit blocks. This is faster than u64 ConcatEven
4239
+ // because we only have a single vector.
4240
+ constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
4241
+ const VFromD<decltype(dt_u)> idx_block =
4242
+ ShiftRight<kShift>(detail::Iota0(dt_u));
4243
+ const MFromD<decltype(dt_u)> is_even =
4244
+ detail::EqS(detail::AndS(idx_block, 1), 0);
4245
+ return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_even)));
4246
+ }
4247
+ template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
4248
+ HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
4249
+ const Half<D> dh;
4250
+ const VFromD<decltype(dh)> i0 =
4251
+ InterleaveLowerBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
4252
+ const VFromD<decltype(dh)> i1 =
4253
+ InterleaveLowerBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
4254
+ return Combine(d, i1, i0);
4255
+ }
4256
+
4257
+ // As above, for the upper half of blocks.
4258
+ template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
4259
+ HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
4260
+ static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
4261
+ const Twice<D> dt;
4262
+ const RebindToUnsigned<decltype(dt)> dt_u;
4263
+ const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
4264
+ // Keep only odd 128-bit blocks. This is faster than u64 ConcatEven
4265
+ // because we only have a single vector.
4266
+ constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
4267
+ const VFromD<decltype(dt_u)> idx_block =
4268
+ ShiftRight<kShift>(detail::Iota0(dt_u));
4269
+ const MFromD<decltype(dt_u)> is_odd =
4270
+ detail::EqS(detail::AndS(idx_block, 1), 1);
4271
+ return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_odd)));
4272
+ }
4273
+ template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
4274
+ HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
4275
+ const Half<D> dh;
4276
+ const VFromD<decltype(dh)> i0 =
4277
+ InterleaveUpperBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
4278
+ const VFromD<decltype(dh)> i1 =
4279
+ InterleaveUpperBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
4280
+ return Combine(d, i1, i0);
4281
+ }
4282
+
4283
+ // RVV vectors are at least 128 bit when there is no fractional LMUL nor cap.
4284
+ // Used by functions with per-block behavior such as InterleaveLower.
4285
+ template <typename T, size_t N, int kPow2>
4286
+ constexpr bool IsGE128(Simd<T, N, kPow2> /* d */) {
4287
+ return N * sizeof(T) >= 16 && kPow2 >= 0;
4288
+ }
4289
+
4290
+ // Definitely less than 128-bit only if there is a small cap; fractional LMUL
4291
+ // might not be enough if vectors are large.
4292
+ template <typename T, size_t N, int kPow2>
4293
+ constexpr bool IsLT128(Simd<T, N, kPow2> /* d */) {
4294
+ return N * sizeof(T) < 16;
4295
+ }
4296
+
4297
+ } // namespace detail
4298
+
4299
+ #define HWY_RVV_IF_GE128_D(D) hwy::EnableIf<detail::IsGE128(D())>* = nullptr
4300
+ #define HWY_RVV_IF_LT128_D(D) hwy::EnableIf<detail::IsLT128(D())>* = nullptr
4301
+ #define HWY_RVV_IF_CAN128_D(D) \
4302
+ hwy::EnableIf<!detail::IsLT128(D()) && !detail::IsGE128(D())>* = nullptr
4303
+
4304
+ template <class D, class V, HWY_RVV_IF_GE128_D(D)>
4305
+ HWY_API V InterleaveLower(D d, const V a, const V b) {
4306
+ return detail::InterleaveLowerBlocks(d, a, b);
4307
+ }
4308
+
4309
+ // Single block: interleave without extra Compress.
4310
+ template <class D, class V, HWY_RVV_IF_LT128_D(D)>
3784
4311
  HWY_API V InterleaveLower(D d, const V a, const V b) {
3785
4312
  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
3786
- const RebindToUnsigned<decltype(d)> du;
3787
- using TU = TFromD<decltype(du)>;
3788
- const auto i = detail::Iota0(du);
3789
- const auto idx_mod = ShiftRight<1>(
3790
- detail::AndS(i, static_cast<TU>(detail::LanesPerBlock(du) - 1)));
3791
- const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
3792
- const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
3793
- return IfThenElse(is_even, TableLookupLanes(a, idx),
3794
- TableLookupLanes(b, idx));
4313
+ return InterleaveWholeLower(d, a, b);
4314
+ }
4315
+
4316
+ // Could be either; branch at runtime.
4317
+ template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
4318
+ HWY_API V InterleaveLower(D d, const V a, const V b) {
4319
+ if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
4320
+ return InterleaveWholeLower(d, a, b);
4321
+ }
4322
+ // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
4323
+ const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
4324
+ return ResizeBitCast(d, detail::InterleaveLowerBlocks(
4325
+ d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
3795
4326
  }
3796
4327
 
3797
4328
  template <class V>
@@ -3799,21 +4330,30 @@ HWY_API V InterleaveLower(const V a, const V b) {
3799
4330
  return InterleaveLower(DFromV<V>(), a, b);
3800
4331
  }
3801
4332
 
3802
- // ------------------------------ InterleaveUpper
4333
+ // ------------------------------ InterleaveUpper (Compress)
3803
4334
 
3804
- template <class D, class V>
3805
- HWY_API V InterleaveUpper(const D d, const V a, const V b) {
4335
+ template <class D, class V, HWY_RVV_IF_GE128_D(D)>
4336
+ HWY_API V InterleaveUpper(D d, const V a, const V b) {
4337
+ return detail::InterleaveUpperBlocks(d, a, b);
4338
+ }
4339
+
4340
+ // Single block: interleave without extra Compress.
4341
+ template <class D, class V, HWY_RVV_IF_LT128_D(D)>
4342
+ HWY_API V InterleaveUpper(D d, const V a, const V b) {
3806
4343
  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
3807
- const RebindToUnsigned<decltype(d)> du;
3808
- using TU = TFromD<decltype(du)>;
3809
- const size_t lpb = detail::LanesPerBlock(du);
3810
- const auto i = detail::Iota0(du);
3811
- const auto idx_mod = ShiftRight<1>(detail::AndS(i, static_cast<TU>(lpb - 1)));
3812
- const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
3813
- const auto idx = detail::AddS(idx_lower, static_cast<TU>(lpb / 2));
3814
- const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
3815
- return IfThenElse(is_even, TableLookupLanes(a, idx),
3816
- TableLookupLanes(b, idx));
4344
+ return InterleaveWholeUpper(d, a, b);
4345
+ }
4346
+
4347
+ // Could be either; branch at runtime.
4348
+ template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
4349
+ HWY_API V InterleaveUpper(D d, const V a, const V b) {
4350
+ if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
4351
+ return InterleaveWholeUpper(d, a, b);
4352
+ }
4353
+ // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
4354
+ const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
4355
+ return ResizeBitCast(d, detail::InterleaveUpperBlocks(
4356
+ d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
3817
4357
  }
3818
4358
 
3819
4359
  // ------------------------------ ZipLower
@@ -3840,67 +4380,98 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
3840
4380
 
3841
4381
  // ================================================== REDUCE
3842
4382
 
3843
- // vector = f(vector, zero_m1)
4383
+ // We have ReduceSum, generic_ops-inl.h defines SumOfLanes via Set.
4384
+ #ifdef HWY_NATIVE_REDUCE_SCALAR
4385
+ #undef HWY_NATIVE_REDUCE_SCALAR
4386
+ #else
4387
+ #define HWY_NATIVE_REDUCE_SCALAR
4388
+ #endif
4389
+
4390
+ // scalar = f(vector, zero_m1)
3844
4391
  #define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
3845
4392
  MLEN, NAME, OP) \
3846
- template <class D> \
3847
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3848
- NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \
3849
- return Set(d, \
3850
- GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
3851
- v, v0, Lanes(d)))); \
4393
+ template <size_t N> \
4394
+ HWY_API HWY_RVV_T(BASE, SEW) \
4395
+ NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v, \
4396
+ HWY_RVV_V(BASE, SEW, m1) v0) { \
4397
+ return GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
4398
+ v, v0, Lanes(d))); \
3852
4399
  }
3853
4400
 
3854
- // ------------------------------ SumOfLanes
4401
+ // detail::RedSum, detail::RedMin, and detail::RedMax is more efficient
4402
+ // for N=4 I8/U8 reductions on RVV than the default implementations of the
4403
+ // the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in generic_ops-inl.h
4404
+ #undef HWY_IF_REDUCE_D
4405
+ #define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
4406
+
4407
+ #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
4408
+ #undef HWY_NATIVE_REDUCE_SUM_4_UI8
4409
+ #else
4410
+ #define HWY_NATIVE_REDUCE_SUM_4_UI8
4411
+ #endif
4412
+
4413
+ #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
4414
+ #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
4415
+ #else
4416
+ #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
4417
+ #endif
4418
+
4419
+ // ------------------------------ ReduceSum
3855
4420
 
3856
4421
  namespace detail {
3857
- HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL)
3858
- HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL)
4422
+ HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL_VIRT)
4423
+ HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL_VIRT)
3859
4424
  } // namespace detail
3860
4425
 
3861
- template <class D>
3862
- HWY_API VFromD<D> SumOfLanes(D d, const VFromD<D> v) {
4426
+ template <class D, HWY_IF_REDUCE_D(D)>
4427
+ HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
3863
4428
  const auto v0 = Zero(ScalableTag<TFromD<D>>()); // always m1
3864
4429
  return detail::RedSum(d, v, v0);
3865
4430
  }
3866
4431
 
3867
- template <class D>
3868
- HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
3869
- return GetLane(SumOfLanes(d, v));
3870
- }
3871
-
3872
- // ------------------------------ MinOfLanes
4432
+ // ------------------------------ ReduceMin
3873
4433
  namespace detail {
3874
- HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL)
3875
- HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL)
3876
- HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL)
4434
+ HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL_VIRT)
4435
+ HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL_VIRT)
4436
+ HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL_VIRT)
3877
4437
  } // namespace detail
3878
4438
 
3879
- template <class D>
3880
- HWY_API VFromD<D> MinOfLanes(D d, const VFromD<D> v) {
3881
- using T = TFromD<D>;
4439
+ template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
4440
+ HWY_API T ReduceMin(D d, const VFromD<D> v) {
3882
4441
  const ScalableTag<T> d1; // always m1
3883
- const auto neutral = Set(d1, HighestValue<T>());
3884
- return detail::RedMin(d, v, neutral);
4442
+ return detail::RedMin(d, v, Set(d1, HighestValue<T>()));
3885
4443
  }
3886
4444
 
3887
- // ------------------------------ MaxOfLanes
4445
+ // ------------------------------ ReduceMax
3888
4446
  namespace detail {
3889
- HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL)
3890
- HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL)
3891
- HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL)
4447
+ HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL_VIRT)
4448
+ HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL_VIRT)
4449
+ HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL_VIRT)
3892
4450
  } // namespace detail
3893
4451
 
3894
- template <class D>
3895
- HWY_API VFromD<D> MaxOfLanes(D d, const VFromD<D> v) {
3896
- using T = TFromD<D>;
4452
+ template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
4453
+ HWY_API T ReduceMax(D d, const VFromD<D> v) {
3897
4454
  const ScalableTag<T> d1; // always m1
3898
- const auto neutral = Set(d1, LowestValue<T>());
3899
- return detail::RedMax(d, v, neutral);
4455
+ return detail::RedMax(d, v, Set(d1, LowestValue<T>()));
3900
4456
  }
3901
4457
 
3902
4458
  #undef HWY_RVV_REDUCE
3903
4459
 
4460
+ // ------------------------------ SumOfLanes
4461
+
4462
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
4463
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
4464
+ return Set(d, ReduceSum(d, v));
4465
+ }
4466
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
4467
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
4468
+ return Set(d, ReduceMin(d, v));
4469
+ }
4470
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
4471
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
4472
+ return Set(d, ReduceMax(d, v));
4473
+ }
4474
+
3904
4475
  // ================================================== Ops with dependencies
3905
4476
 
3906
4477
  // ------------------------------ LoadInterleaved2
@@ -4229,15 +4800,87 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
4229
4800
 
4230
4801
  #endif // HWY_HAVE_TUPLE
4231
4802
 
4232
- // ------------------------------ ResizeBitCast
4803
+ // ------------------------------ Dup128VecFromValues (ResizeBitCast)
4233
4804
 
4234
- template <class D, class FromV>
4235
- HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
4236
- const DFromV<decltype(v)> d_from;
4237
- const Repartition<uint8_t, decltype(d_from)> du8_from;
4238
- const DFromV<VFromD<D>> d_to;
4239
- const Repartition<uint8_t, decltype(d_to)> du8_to;
4240
- return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
4805
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
4806
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
4807
+ return Set(d, t0);
4808
+ }
4809
+
4810
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
4811
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
4812
+ const auto even_lanes = Set(d, t0);
4813
+ #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
4814
+ if (__builtin_constant_p(BitCastScalar<uint64_t>(t0) ==
4815
+ BitCastScalar<uint64_t>(t1)) &&
4816
+ (BitCastScalar<uint64_t>(t0) == BitCastScalar<uint64_t>(t1))) {
4817
+ return even_lanes;
4818
+ }
4819
+ #endif
4820
+
4821
+ const auto odd_lanes = Set(d, t1);
4822
+ return OddEven(odd_lanes, even_lanes);
4823
+ }
4824
+
4825
+ namespace detail {
4826
+
4827
+ #pragma pack(push, 1)
4828
+
4829
+ template <class T>
4830
+ struct alignas(8) Vec64ValsWrapper {
4831
+ static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
4832
+ static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
4833
+ T vals[8 / sizeof(T)];
4834
+ };
4835
+
4836
+ #pragma pack(pop)
4837
+
4838
+ } // namespace detail
4839
+
4840
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
4841
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
4842
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
4843
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
4844
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
4845
+ TFromD<D> t11, TFromD<D> t12,
4846
+ TFromD<D> t13, TFromD<D> t14,
4847
+ TFromD<D> t15) {
4848
+ const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
4849
+ return ResizeBitCast(
4850
+ d, Dup128VecFromValues(
4851
+ du64,
4852
+ BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
4853
+ {t0, t1, t2, t3, t4, t5, t6, t7}}),
4854
+ BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
4855
+ {t8, t9, t10, t11, t12, t13, t14, t15}})));
4856
+ }
4857
+
4858
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
4859
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
4860
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
4861
+ TFromD<D> t5, TFromD<D> t6,
4862
+ TFromD<D> t7) {
4863
+ const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
4864
+ return ResizeBitCast(
4865
+ d, Dup128VecFromValues(
4866
+ du64,
4867
+ BitCastScalar<uint64_t>(
4868
+ detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}}),
4869
+ BitCastScalar<uint64_t>(
4870
+ detail::Vec64ValsWrapper<TFromD<D>>{{t4, t5, t6, t7}})));
4871
+ }
4872
+
4873
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
4874
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
4875
+ TFromD<D> t2, TFromD<D> t3) {
4876
+ const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
4877
+ return ResizeBitCast(
4878
+ d,
4879
+ Dup128VecFromValues(du64,
4880
+ BitCastScalar<uint64_t>(
4881
+ detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}}),
4882
+ BitCastScalar<uint64_t>(
4883
+ detail::Vec64ValsWrapper<TFromD<D>>{{t2, t3}})));
4241
4884
  }
4242
4885
 
4243
4886
  // ------------------------------ PopulationCount (ShiftRight)
@@ -4366,34 +5009,276 @@ HWY_API MFromD<D> FirstN(const D d, const size_t n) {
4366
5009
  return Eq(detail::SlideUp(one, zero, n), one);
4367
5010
  }
4368
5011
 
4369
- // ------------------------------ Neg (Sub)
5012
+ // ------------------------------ LowerHalfOfMask/UpperHalfOfMask
4370
5013
 
4371
- template <class V, HWY_IF_SIGNED_V(V)>
4372
- HWY_API V Neg(const V v) {
4373
- return detail::ReverseSubS(v, 0);
5014
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5015
+
5016
+ // Target-specific implementations of LowerHalfOfMask, UpperHalfOfMask,
5017
+ // CombineMasks, OrderedDemote2MasksTo, and Dup128MaskFromMaskBits are possible
5018
+ // on RVV if the __riscv_vreinterpret_v_b*_u8m1 and
5019
+ // __riscv_vreinterpret_v_u8m1_b* intrinsics are available.
5020
+
5021
+ // The __riscv_vreinterpret_v_b*_u8m1 and __riscv_vreinterpret_v_u8m1_b*
5022
+ // intrinsics are available with Clang 17 and later and GCC 14 and later.
5023
+
5024
+ namespace detail {
5025
+
5026
+ HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool1_t m) {
5027
+ return __riscv_vreinterpret_v_b1_u8m1(m);
4374
5028
  }
4375
5029
 
4376
- // vector = f(vector), but argument is repeated
4377
- #define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
4378
- SHIFT, MLEN, NAME, OP) \
4379
- HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
4380
- return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \
4381
- HWY_RVV_AVL(SEW, SHIFT)); \
5030
+ HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool2_t m) {
5031
+ return __riscv_vreinterpret_v_b2_u8m1(m);
5032
+ }
5033
+
5034
+ HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool4_t m) {
5035
+ return __riscv_vreinterpret_v_b4_u8m1(m);
5036
+ }
5037
+
5038
+ HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool8_t m) {
5039
+ return __riscv_vreinterpret_v_b8_u8m1(m);
5040
+ }
5041
+
5042
+ HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool16_t m) {
5043
+ return __riscv_vreinterpret_v_b16_u8m1(m);
5044
+ }
5045
+
5046
+ HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool32_t m) {
5047
+ return __riscv_vreinterpret_v_b32_u8m1(m);
5048
+ }
5049
+
5050
+ HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool64_t m) {
5051
+ return __riscv_vreinterpret_v_b64_u8m1(m);
5052
+ }
5053
+
5054
+ template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool1_t>()>* = nullptr>
5055
+ HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5056
+ return __riscv_vreinterpret_v_u8m1_b1(v);
5057
+ }
5058
+
5059
+ template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool2_t>()>* = nullptr>
5060
+ HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5061
+ return __riscv_vreinterpret_v_u8m1_b2(v);
5062
+ }
5063
+
5064
+ template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool4_t>()>* = nullptr>
5065
+ HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5066
+ return __riscv_vreinterpret_v_u8m1_b4(v);
5067
+ }
5068
+
5069
+ template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool8_t>()>* = nullptr>
5070
+ HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5071
+ return __riscv_vreinterpret_v_u8m1_b8(v);
5072
+ }
5073
+
5074
+ template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool16_t>()>* = nullptr>
5075
+ HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5076
+ return __riscv_vreinterpret_v_u8m1_b16(v);
5077
+ }
5078
+
5079
+ template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool32_t>()>* = nullptr>
5080
+ HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5081
+ return __riscv_vreinterpret_v_u8m1_b32(v);
5082
+ }
5083
+
5084
+ template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool64_t>()>* = nullptr>
5085
+ HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5086
+ return __riscv_vreinterpret_v_u8m1_b64(v);
5087
+ }
5088
+
5089
+ } // namespace detail
5090
+
5091
+ #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
5092
+ #undef HWY_NATIVE_LOWER_HALF_OF_MASK
5093
+ #else
5094
+ #define HWY_NATIVE_LOWER_HALF_OF_MASK
5095
+ #endif
5096
+
5097
+ template <class D>
5098
+ HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
5099
+ return detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(m));
5100
+ }
5101
+
5102
+ #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
5103
+ #undef HWY_NATIVE_UPPER_HALF_OF_MASK
5104
+ #else
5105
+ #define HWY_NATIVE_UPPER_HALF_OF_MASK
5106
+ #endif
5107
+
5108
+ template <class D>
5109
+ HWY_API MFromD<D> UpperHalfOfMask(D d, MFromD<Twice<D>> m) {
5110
+ const size_t N = Lanes(d);
5111
+
5112
+ vuint8m1_t mask_bits = detail::MaskToU8MaskBitsVec(m);
5113
+ mask_bits = ShiftRightSame(mask_bits, static_cast<int>(N & 7));
5114
+ if (HWY_MAX_LANES_D(D) >= 8) {
5115
+ mask_bits = SlideDownLanes(ScalableTag<uint8_t>(), mask_bits, N / 8);
4382
5116
  }
4383
5117
 
4384
- HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
5118
+ return detail::U8MaskBitsVecToMask(d, mask_bits);
5119
+ }
4385
5120
 
4386
- #if !HWY_HAVE_FLOAT16
5121
+ // ------------------------------ CombineMasks
4387
5122
 
4388
- template <class V, HWY_IF_U16_D(DFromV<V>)> // float16_t
4389
- HWY_API V Neg(V v) {
4390
- const DFromV<decltype(v)> d;
4391
- const RebindToUnsigned<decltype(d)> du;
4392
- using TU = TFromD<decltype(du)>;
4393
- return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
5123
+ #ifdef HWY_NATIVE_COMBINE_MASKS
5124
+ #undef HWY_NATIVE_COMBINE_MASKS
5125
+ #else
5126
+ #define HWY_NATIVE_COMBINE_MASKS
5127
+ #endif
5128
+
5129
+ template <class D>
5130
+ HWY_API MFromD<D> CombineMasks(D d, MFromD<Half<D>> hi, MFromD<Half<D>> lo) {
5131
+ const Half<decltype(d)> dh;
5132
+ const size_t half_N = Lanes(dh);
5133
+
5134
+ const auto ext_lo_mask =
5135
+ And(detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(lo)),
5136
+ FirstN(d, half_N));
5137
+ vuint8m1_t hi_mask_bits = detail::MaskToU8MaskBitsVec(hi);
5138
+ hi_mask_bits = ShiftLeftSame(hi_mask_bits, static_cast<int>(half_N & 7));
5139
+ if (HWY_MAX_LANES_D(D) >= 8) {
5140
+ hi_mask_bits =
5141
+ SlideUpLanes(ScalableTag<uint8_t>(), hi_mask_bits, half_N / 8);
5142
+ }
5143
+
5144
+ return Or(ext_lo_mask, detail::U8MaskBitsVecToMask(d, hi_mask_bits));
4394
5145
  }
4395
5146
 
4396
- #endif // !HWY_HAVE_FLOAT16
5147
+ // ------------------------------ OrderedDemote2MasksTo
5148
+
5149
+ #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
5150
+ #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
5151
+ #else
5152
+ #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
5153
+ #endif
5154
+
5155
+ template <class DTo, class DFrom,
5156
+ HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
5157
+ class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
5158
+ hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
5159
+ HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
5160
+ MFromD<DFrom> a, MFromD<DFrom> b) {
5161
+ return CombineMasks(d_to, b, a);
5162
+ }
5163
+
5164
+ #endif // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5165
+
5166
+ // ------------------------------ Dup128MaskFromMaskBits
5167
+
5168
+ namespace detail {
5169
+ // Even though this is only used after checking if (kN < X), this helper
+ // function prevents "shift count exceeded" errors.
+ template <size_t kN, HWY_IF_LANES_LE(kN, 31)>
+ constexpr unsigned MaxMaskBits() {
+ return (1u << kN) - 1;
+ }
+ template <size_t kN, HWY_IF_LANES_GT(kN, 31)>
+ constexpr unsigned MaxMaskBits() {
+ return ~0u;
+ }
+ } // namespace detail
+
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+ constexpr size_t kN = MaxLanes(d);
+ if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
+
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+ return detail::U8MaskBitsVecToMask(
+ d, Set(ScalableTag<uint8_t>(), static_cast<uint8_t>(mask_bits)));
+ #else
+ const RebindToUnsigned<decltype(d)> du8;
+ const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
+ du64;
+
+ const auto bytes = ResizeBitCast(
+ du8, detail::AndS(
+ ResizeBitCast(du64, Set(du8, static_cast<uint8_t>(mask_bits))),
+ uint64_t{0x8040201008040201u}));
+ return detail::NeS(bytes, uint8_t{0});
+ #endif
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+ const ScalableTag<uint8_t> du8;
+ const ScalableTag<uint16_t> du16;
+ // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes.
+ return detail::U8MaskBitsVecToMask(
+ d, BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))));
+ #else
+ // Slow fallback for completeness; the above bits to mask cast is preferred.
+ const RebindToUnsigned<decltype(d)> du8;
+ const Repartition<uint16_t, decltype(du8)> du16;
+ const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
+ du64;
+
+ // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
+ // and then bitcast the replicated mask_bits to a u8 vector
+ const auto bytes = BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
+ // Replicate bytes 8x such that each byte contains the bit that governs it.
+ const auto rep8 = TableLookupLanes(bytes, ShiftRight<3>(detail::Iota0(du8)));
+
+ const auto masked_out_rep8 = ResizeBitCast(
+ du8,
+ detail::AndS(ResizeBitCast(du64, rep8), uint64_t{0x8040201008040201u}));
+ return detail::NeS(masked_out_rep8, uint8_t{0});
+ #endif
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+ constexpr size_t kN = MaxLanes(d);
+ if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
+
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+ const ScalableTag<uint8_t> du8;
+ // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes.
+ return detail::U8MaskBitsVecToMask(d,
+ Set(du8, static_cast<uint8_t>(mask_bits)));
+ #else
+ // Slow fallback for completeness; the above bits to mask cast is preferred.
+ const RebindToUnsigned<D> du;
+ const VFromD<decltype(du)> bits =
+ Shl(Set(du, uint16_t{1}), detail::AndS(detail::Iota0(du), 7));
+ return TestBit(Set(du, static_cast<uint16_t>(mask_bits)), bits);
+ #endif
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+ constexpr size_t kN = MaxLanes(d);
+ if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();
+
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+ const ScalableTag<uint8_t> du8;
+ return detail::U8MaskBitsVecToMask(
+ d, Set(du8, static_cast<uint8_t>(mask_bits * 0x11)));
+ #else
+ // Slow fallback for completeness; the above bits to mask cast is preferred.
+ const RebindToUnsigned<D> du;
+ const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2, 4, 8);
+ return TestBit(Set(du, static_cast<uint32_t>(mask_bits)), bits);
+ #endif
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+ constexpr size_t kN = MaxLanes(d);
+ if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();
+
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+ const ScalableTag<uint8_t> du8;
+ return detail::U8MaskBitsVecToMask(
+ d, Set(du8, static_cast<uint8_t>(mask_bits * 0x55)));
+ #else
+ // Slow fallback for completeness; the above bits to mask cast is preferred.
+ const RebindToUnsigned<D> du;
+ const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2);
+ return TestBit(Set(du, static_cast<uint64_t>(mask_bits)), bits);
+ #endif
+ }

  // ------------------------------ Abs (Max, Neg)

@@ -4452,23 +5337,99 @@ HWY_API V Trunc(const V v) {
  }

  // ------------------------------ Ceil
+ #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) || \
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
+ namespace detail {
+ #define HWY_RVV_CEIL_INT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ HWY_API HWY_RVV_V(int, SEW, LMUL) CeilInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ return __riscv_vfcvt_x_f_v_i##SEW##LMUL##_rm(v, __RISCV_FRM_RUP, \
+ HWY_RVV_AVL(SEW, SHIFT)); \
+ }
+ HWY_RVV_FOREACH_F(HWY_RVV_CEIL_INT, _, _, _ALL)
+ #undef HWY_RVV_CEIL_INT
+
+ } // namespace detail
+
  template <class V>
  HWY_API V Ceil(const V v) {
- asm volatile("fsrm %0" ::"r"(detail::kUp));
- const auto ret = Round(v);
- asm volatile("fsrm %0" ::"r"(detail::kNear));
- return ret;
+ const DFromV<V> df;
+
+ const auto integer = detail::CeilInt(v);
+ const auto int_f = ConvertTo(df, integer);
+
+ return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
  }

+ #else // GCC 13 or earlier or Clang 16 or earlier
+
+ template <class V>
+ HWY_API V Ceil(const V v) {
+ const DFromV<decltype(v)> df;
+ const RebindToSigned<decltype(df)> di;
+
+ using T = TFromD<decltype(df)>;
+
+ const auto integer = ConvertTo(di, v); // round toward 0
+ const auto int_f = ConvertTo(df, integer);
+
+ // Truncating a positive non-integer ends up smaller; if so, add 1.
+ const auto pos1 =
+ IfThenElseZero(Lt(int_f, v), Set(df, ConvertScalarTo<T>(1.0)));
+
+ return IfThenElse(detail::UseInt(v), Add(int_f, pos1), v);
+ }
+
+ #endif // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
+ // (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
+
  // ------------------------------ Floor
+ #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) || \
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
+ namespace detail {
+ #define HWY_RVV_FLOOR_INT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ HWY_API HWY_RVV_V(int, SEW, LMUL) FloorInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ return __riscv_vfcvt_x_f_v_i##SEW##LMUL##_rm(v, __RISCV_FRM_RDN, \
+ HWY_RVV_AVL(SEW, SHIFT)); \
+ }
+ HWY_RVV_FOREACH_F(HWY_RVV_FLOOR_INT, _, _, _ALL)
+ #undef HWY_RVV_FLOOR_INT
+
+ } // namespace detail
+
  template <class V>
  HWY_API V Floor(const V v) {
- asm volatile("fsrm %0" ::"r"(detail::kDown));
- const auto ret = Round(v);
- asm volatile("fsrm %0" ::"r"(detail::kNear));
- return ret;
+ const DFromV<V> df;
+
+ const auto integer = detail::FloorInt(v);
+ const auto int_f = ConvertTo(df, integer);
+
+ return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
  }

+ #else // GCC 13 or earlier or Clang 16 or earlier
+
+ template <class V>
+ HWY_API V Floor(const V v) {
+ const DFromV<decltype(v)> df;
+ const RebindToSigned<decltype(df)> di;
+
+ using T = TFromD<decltype(df)>;
+
+ const auto integer = ConvertTo(di, v); // round toward 0
+ const auto int_f = ConvertTo(df, integer);
+
+ // Truncating a negative non-integer ends up larger; if so, subtract 1.
+ const auto neg1 =
+ IfThenElseZero(Gt(int_f, v), Set(df, ConvertScalarTo<T>(-1.0)));
+
+ return IfThenElse(detail::UseInt(v), Add(int_f, neg1), v);
+ }
+
+ #endif // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
+ // (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
+
  // ------------------------------ Floating-point classification (Ne)

  // vfclass does not help because it would require 3 instructions (to AND and
@@ -4479,6 +5440,14 @@ HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
  return Ne(v, v);
  }

+ // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
+ // We use a fused Set/comparison for IsFinite.
+ #ifdef HWY_NATIVE_ISINF
+ #undef HWY_NATIVE_ISINF
+ #else
+ #define HWY_NATIVE_ISINF
+ #endif
+
  template <class V, class D = DFromV<V>>
  HWY_API MFromD<D> IsInf(const V v) {
  const D d;
@@ -4507,22 +5476,76 @@ HWY_API MFromD<D> IsFinite(const V v) {

  // ------------------------------ Iota (ConvertTo)

- template <class D, HWY_IF_UNSIGNED_D(D)>
- HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
- return detail::AddS(detail::Iota0(d), first);
+ template <class D, typename T2, HWY_IF_UNSIGNED_D(D)>
+ HWY_API VFromD<D> Iota(const D d, T2 first) {
+ return detail::AddS(detail::Iota0(d), static_cast<TFromD<D>>(first));
  }

- template <class D, HWY_IF_SIGNED_D(D)>
- HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
+ template <class D, typename T2, HWY_IF_SIGNED_D(D)>
+ HWY_API VFromD<D> Iota(const D d, T2 first) {
  const RebindToUnsigned<D> du;
- return detail::AddS(BitCast(d, detail::Iota0(du)), first);
+ return detail::AddS(BitCast(d, detail::Iota0(du)),
+ static_cast<TFromD<D>>(first));
  }

- template <class D, HWY_IF_FLOAT_D(D)>
- HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
+ template <class D, typename T2, HWY_IF_FLOAT_D(D)>
+ HWY_API VFromD<D> Iota(const D d, T2 first) {
  const RebindToUnsigned<D> du;
  const RebindToSigned<D> di;
- return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first);
+ return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))),
+ ConvertScalarTo<TFromD<D>>(first));
+ }
+
+ // ------------------------------ BitShuffle (PromoteTo, Rol, SumsOf8)
+
+ // Native implementation required to avoid 8-bit wraparound on long vectors.
+ #ifdef HWY_NATIVE_BITSHUFFLE
+ #undef HWY_NATIVE_BITSHUFFLE
+ #else
+ #define HWY_NATIVE_BITSHUFFLE
+ #endif
+
+ // Cannot handle LMUL=8 because we promote indices.
+ template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
+ HWY_IF_UI64_D(D64), HWY_IF_POW2_LE_D(D64, 2)>
+ HWY_API V64 BitShuffle(V64 values, VI idx) {
+ const RebindToUnsigned<D64> du64;
+ const Repartition<uint8_t, D64> du8;
+ const Rebind<uint16_t, decltype(du8)> du16;
+ using VU8 = VFromD<decltype(du8)>;
+ using VU16 = VFromD<decltype(du16)>;
+ // For each 16-bit (to avoid wraparound for long vectors) index of an output
+ // byte: offset of the u64 lane to which it belongs.
+ const VU16 byte_offsets =
+ detail::AndS(detail::Iota0(du16), static_cast<uint16_t>(~7u));
+ // idx is for a bit; shifting makes that bytes. Promote so we can add
+ // byte_offsets, then we have the u8 lane index within the whole vector.
+ const VU16 idx16 =
+ Add(byte_offsets, PromoteTo(du16, ShiftRight<3>(BitCast(du8, idx))));
+ const VU8 bytes = detail::TableLookupLanes16(BitCast(du8, values), idx16);
+
+ // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
+ // and left by iota & 7 to put it in the correct output bit. To correctly
+ // handle shift counts from -7 to 7, we rotate (unfortunately not natively
+ // supported on RVV).
+ const VU8 rotate_left_bits = Sub(detail::Iota0(du8), BitCast(du8, idx));
+ const VU8 extracted_bits_mask =
+ BitCast(du8, Set(du64, static_cast<uint64_t>(0x8040201008040201u)));
+ const VU8 extracted_bits =
+ And(Rol(bytes, rotate_left_bits), extracted_bits_mask);
+ // Combine bit-sliced (one bit per byte) into one 64-bit sum.
+ return BitCast(D64(), SumsOf8(extracted_bits));
+ }
+
+ template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
+ HWY_IF_UI64_D(D64), HWY_IF_POW2_GT_D(D64, 2)>
+ HWY_API V64 BitShuffle(V64 values, VI idx) {
+ const Half<D64> dh;
+ const Half<DFromV<VI>> dih;
+ using V64H = VFromD<decltype(dh)>;
+ const V64H r0 = BitShuffle(LowerHalf(dh, values), LowerHalf(dih, idx));
+ const V64H r1 = BitShuffle(UpperHalf(dh, values), UpperHalf(dih, idx));
+ return Combine(D64(), r1, r0);
  }

  // ------------------------------ MulEven/Odd (Mul, OddEven)
@@ -4531,7 +5554,7 @@ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
  class D = DFromV<V>, class DW = RepartitionToWide<D>>
  HWY_API VFromD<DW> MulEven(const V a, const V b) {
  const auto lo = Mul(a, b);
- const auto hi = detail::MulHigh(a, b);
+ const auto hi = MulHigh(a, b);
  return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
  }

@@ -4539,7 +5562,7 @@ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
  class D = DFromV<V>, class DW = RepartitionToWide<D>>
  HWY_API VFromD<DW> MulOdd(const V a, const V b) {
  const auto lo = Mul(a, b);
- const auto hi = detail::MulHigh(a, b);
+ const auto hi = MulHigh(a, b);
  return BitCast(DW(), OddEven(hi, detail::Slide1Down(lo)));
  }

@@ -4547,28 +5570,34 @@ HWY_API VFromD<DW> MulOdd(const V a, const V b) {
  template <class V, HWY_IF_T_SIZE_V(V, 8)>
  HWY_INLINE V MulEven(const V a, const V b) {
  const auto lo = Mul(a, b);
- const auto hi = detail::MulHigh(a, b);
+ const auto hi = MulHigh(a, b);
  return OddEven(detail::Slide1Up(hi), lo);
  }

  template <class V, HWY_IF_T_SIZE_V(V, 8)>
  HWY_INLINE V MulOdd(const V a, const V b) {
  const auto lo = Mul(a, b);
- const auto hi = detail::MulHigh(a, b);
+ const auto hi = MulHigh(a, b);
  return OddEven(hi, detail::Slide1Down(lo));
  }

  // ------------------------------ ReorderDemote2To (OddEven, Combine)

- template <size_t N, int kPow2>
- HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> ReorderDemote2To(
- Simd<bfloat16_t, N, kPow2> dbf16,
- VFromD<RepartitionToWide<decltype(dbf16)>> a,
- VFromD<RepartitionToWide<decltype(dbf16)>> b) {
+ template <class D, HWY_IF_BF16_D(D)>
+ HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<RepartitionToWide<D>> a,
+ VFromD<RepartitionToWide<D>> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
+ const Half<decltype(du16)> du16_half;
  const RebindToUnsigned<DFromV<decltype(a)>> du32;
- const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
+ const VFromD<decltype(du32)> a_in_even = PromoteTo(
+ du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, a)));
+ const VFromD<decltype(du32)> b_in_even = PromoteTo(
+ du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, b)));
+ // Equivalent to InterleaveEven, but because the upper 16 bits are zero, we
+ // can OR instead of OddEven.
+ const VFromD<decltype(du16)> a_in_odd =
+ detail::Slide1Up(BitCast(du16, a_in_even));
+ return BitCast(dbf16, Or(a_in_odd, BitCast(du16, b_in_even)));
  }

  // If LMUL is not the max, Combine first to avoid another DemoteTo.
@@ -4618,8 +5647,8 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  }

  // If LMUL is not the max, Combine first to avoid another DemoteTo.
- template <class DN, HWY_IF_BF16_D(DN), HWY_IF_POW2_LE_D(DN, 2), class V,
- HWY_IF_F32_D(DFromV<V>),
+ template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_LE_D(DN, 2),
+ class V, HWY_IF_F32_D(DFromV<V>),
  class V2 = VFromD<Repartition<TFromV<V>, DN>>,
  hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
  HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
@@ -4629,8 +5658,8 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  }

  // Max LMUL: must DemoteTo first, then Combine.
- template <class DN, HWY_IF_BF16_D(DN), HWY_IF_POW2_GT_D(DN, 2), class V,
- HWY_IF_F32_D(DFromV<V>),
+ template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_GT_D(DN, 2),
+ class V, HWY_IF_F32_D(DFromV<V>),
  class V2 = VFromD<Repartition<TFromV<V>, DN>>,
  hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
  HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
@@ -4653,68 +5682,26 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {

  // ------------------------------ WidenMulPairwiseAdd

- template <class D32, HWY_IF_F32_D(D32),
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
- HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
- const RebindToUnsigned<decltype(df32)> du32;
- using VU32 = VFromD<decltype(du32)>;
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
- // Using shift/and instead of Zip leads to the odd/even order that
- // RearrangeToOddPlusEven prefers.
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
- const VU32 ao = And(BitCast(du32, a), odd);
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
- const VU32 bo = And(BitCast(du32, b), odd);
- return MulAdd(BitCast(df32, ae), BitCast(df32, be),
- Mul(BitCast(df32, ao), BitCast(df32, bo)));
- }
-
- template <class D, HWY_IF_I32_D(D), class VI16>
- HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
- using VI32 = VFromD<decltype(d32)>;
- // Manual sign extension requires two shifts for even lanes.
- const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
- const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
- const VI32 ao = ShiftRight<16>(BitCast(d32, a));
- const VI32 bo = ShiftRight<16>(BitCast(d32, b));
- return Add(Mul(ae, be), Mul(ao, bo));
- }
-
- template <class D, HWY_IF_U32_D(D), class VI16>
- HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VI16 a, VI16 b) {
- using VU32 = VFromD<decltype(du32)>;
- // Manual sign extension requires two shifts for even lanes.
- const VU32 ae = detail::AndS(BitCast(du32, a), uint32_t{0x0000FFFFu});
- const VU32 be = detail::AndS(BitCast(du32, b), uint32_t{0x0000FFFFu});
- const VU32 ao = ShiftRight<16>(BitCast(du32, a));
- const VU32 bo = ShiftRight<16>(BitCast(du32, b));
- return Add(Mul(ae, be), Mul(ao, bo));
+ template <class DF, HWY_IF_F32_D(DF),
+ class VBF = VFromD<Repartition<hwy::bfloat16_t, DF>>>
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+ const VFromD<DF> ae = PromoteEvenTo(df, a);
+ const VFromD<DF> be = PromoteEvenTo(df, b);
+ const VFromD<DF> ao = PromoteOddTo(df, a);
+ const VFromD<DF> bo = PromoteOddTo(df, b);
+ return MulAdd(ae, be, Mul(ao, bo));
+ }
+
+ template <class D, HWY_IF_UI32_D(D), class V16 = VFromD<RepartitionToNarrow<D>>>
+ HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
+ Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
  }

  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

  namespace detail {

- // Non-overloaded wrapper function so we can define DF32 in template args.
- template <size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
- class VF32 = VFromD<DF32>,
- class DBF16 = Repartition<bfloat16_t, Simd<float, N, kPow2>>>
- HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
- VFromD<DBF16> a, VFromD<DBF16> b,
- const VF32 sum0, VF32& sum1) {
- const RebindToUnsigned<DF32> du32;
- using VU32 = VFromD<decltype(du32)>;
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
- // Using shift/and instead of Zip leads to the odd/even order that
- // RearrangeToOddPlusEven prefers.
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
- const VU32 ao = And(BitCast(du32, a), odd);
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
- const VU32 bo = And(BitCast(du32, b), odd);
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
- }
-
  #define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
  SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
@@ -4790,21 +5777,15 @@ HWY_API VFromD<D32> ReorderWidenMulAccumulateU16(D32 d32, VFromD<D16> a,

  } // namespace detail

- template <size_t N, int kPow2, class VN, class VW>
- HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b,
- const VW sum0, VW& sum1) {
- return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
- }
-
- template <size_t N, int kPow2, class VN, class VW>
- HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
- const VW sum0, VW& sum1) {
+ template <class D, HWY_IF_I32_D(D), class VN, class VW>
+ HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
+ VW& sum1) {
  return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
  }

- template <size_t N, int kPow2, class VN, class VW>
- HWY_API VW ReorderWidenMulAccumulate(Simd<uint32_t, N, kPow2> d32, VN a, VN b,
- const VW sum0, VW& sum1) {
+ template <class D, HWY_IF_U32_D(D), class VN, class VW>
+ HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
+ VW& sum1) {
  return detail::ReorderWidenMulAccumulateU16(d32, a, b, sum0, sum1);
  }

@@ -4872,6 +5853,40 @@ HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  }

  // ------------------------------ Lt128
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+ template <class D>
+ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+ // The subsequent computations are performed using e8mf8 (8-bit elements with
+ // a fractional LMUL of 1/8) for the following reasons:
+ // 1. It is correct for the possible input vector types e64m<1,2,4,8>. This is
+ // because the resulting mask can occupy at most 1/8 of a full vector when
+ // using e64m8.
+ // 2. It can be more efficient than using a full vector or a vector group.
+ //
+ // The algorithm computes the result as follows:
+ // 1. Compute cH | (=H & cL) in the high bits, where cH and cL represent the
+ // comparison results for the high and low 64-bit elements, respectively.
+ // 2. Shift the result right by 1 to duplicate the comparison results for the
+ // low bits.
+ // 3. Obtain the final result by performing a bitwise OR on the high and low
+ // bits.
+ auto du8mf8 = ScalableTag<uint8_t, -3>{};
+ const vuint8mf8_t ltHL0 =
+ detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Lt(a, b)));
+ const vuint8mf8_t eqHL0 =
+ detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
+ const vuint8mf8_t ltLx0 = Add(ltHL0, ltHL0);
+ const vuint8mf8_t resultHx = detail::AndS(OrAnd(ltHL0, ltLx0, eqHL0), 0xaa);
+ const vuint8mf8_t resultxL = ShiftRight<1>(resultHx);
+ const vuint8mf8_t result = Or(resultHx, resultxL);
+ auto du8m1 = ScalableTag<uint8_t>{};
+ return detail::U8MaskBitsVecToMask(d, detail::ChangeLMUL(du8m1, result));
+ }
+
+ #else
+
  template <class D>
  HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -4897,6 +5912,8 @@ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
  return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
  }

+ #endif // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
  // ------------------------------ Lt128Upper
  template <class D>
  HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
@@ -4994,7 +6011,6 @@ HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
  }

  // ================================================== END MACROS
- namespace detail { // for code folding
  #undef HWY_RVV_AVL
  #undef HWY_RVV_D
  #undef HWY_RVV_FOREACH
@@ -5055,15 +6071,19 @@ namespace detail { // for code folding
  #undef HWY_RVV_FOREACH_UI32
  #undef HWY_RVV_FOREACH_UI3264
  #undef HWY_RVV_FOREACH_UI64
+ #undef HWY_RVV_IF_EMULATED_D
+ #undef HWY_RVV_IF_CAN128_D
+ #undef HWY_RVV_IF_GE128_D
+ #undef HWY_RVV_IF_LT128_D
  #undef HWY_RVV_INSERT_VXRM
  #undef HWY_RVV_M
  #undef HWY_RVV_RETM_ARGM
+ #undef HWY_RVV_RETV_ARGMVV
  #undef HWY_RVV_RETV_ARGV
  #undef HWY_RVV_RETV_ARGVS
  #undef HWY_RVV_RETV_ARGVV
  #undef HWY_RVV_T
  #undef HWY_RVV_V
- } // namespace detail
  // NOLINTNEXTLINE(google-readability-namespace-comments)
  } // namespace HWY_NAMESPACE
  } // namespace hwy