@img/sharp-libvips-dev 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230)
  1. package/include/aom/aom_encoder.h +3 -3
  2. package/include/aom/aomcx.h +17 -8
  3. package/include/expat.h +21 -10
  4. package/include/expat_config.h +11 -5
  5. package/include/ffi.h +12 -25
  6. package/include/fontconfig/fontconfig.h +5 -3
  7. package/include/freetype2/freetype/config/ftoption.h +1 -1
  8. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
  9. package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
  10. package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
  11. package/include/glib-2.0/gio/gappinfo.h +0 -7
  12. package/include/glib-2.0/gio/gapplication.h +6 -0
  13. package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
  14. package/include/glib-2.0/gio/gasyncinitable.h +0 -7
  15. package/include/glib-2.0/gio/gasyncresult.h +0 -6
  16. package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
  17. package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
  18. package/include/glib-2.0/gio/gbytesicon.h +0 -5
  19. package/include/glib-2.0/gio/gcancellable.h +0 -5
  20. package/include/glib-2.0/gio/gconverter.h +0 -7
  21. package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
  22. package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
  23. package/include/glib-2.0/gio/gdatagrambased.h +0 -7
  24. package/include/glib-2.0/gio/gdatainputstream.h +0 -6
  25. package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
  26. package/include/glib-2.0/gio/gdbusinterface.h +0 -8
  27. package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
  28. package/include/glib-2.0/gio/gdbusmessage.h +2 -1
  29. package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
  30. package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
  31. package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
  32. package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
  33. package/include/glib-2.0/gio/gdbusproxy.h +0 -8
  34. package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
  35. package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
  36. package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
  37. package/include/glib-2.0/gio/gemblem.h +0 -5
  38. package/include/glib-2.0/gio/gemblemedicon.h +0 -5
  39. package/include/glib-2.0/gio/gfile.h +0 -10
  40. package/include/glib-2.0/gio/gfileenumerator.h +0 -5
  41. package/include/glib-2.0/gio/gfileicon.h +0 -5
  42. package/include/glib-2.0/gio/gfileinfo.h +0 -5
  43. package/include/glib-2.0/gio/gfileinputstream.h +0 -8
  44. package/include/glib-2.0/gio/gfileiostream.h +0 -8
  45. package/include/glib-2.0/gio/gfilemonitor.h +0 -5
  46. package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
  47. package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
  48. package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
  49. package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
  50. package/include/glib-2.0/gio/gicon.h +0 -5
  51. package/include/glib-2.0/gio/ginitable.h +0 -7
  52. package/include/glib-2.0/gio/ginputstream.h +0 -5
  53. package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
  54. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  55. package/include/glib-2.0/gio/gioenums.h +6 -1
  56. package/include/glib-2.0/gio/giomodule.h +0 -5
  57. package/include/glib-2.0/gio/giostream.h +0 -5
  58. package/include/glib-2.0/gio/giotypes.h +5 -108
  59. package/include/glib-2.0/gio/gloadableicon.h +0 -6
  60. package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
  61. package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
  62. package/include/glib-2.0/gio/gmountoperation.h +0 -6
  63. package/include/glib-2.0/gio/gnetworking.h +4 -0
  64. package/include/glib-2.0/gio/goutputstream.h +0 -9
  65. package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
  66. package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
  67. package/include/glib-2.0/gio/gproxy.h +0 -7
  68. package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
  69. package/include/glib-2.0/gio/gseekable.h +0 -5
  70. package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
  71. package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
  72. package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
  73. package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
  74. package/include/glib-2.0/gio/gsocket.h +13 -0
  75. package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
  76. package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
  77. package/include/glib-2.0/gio/gtask.h +12 -0
  78. package/include/glib-2.0/gio/gthemedicon.h +0 -5
  79. package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
  80. package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
  81. package/include/glib-2.0/gio/gvfs.h +0 -5
  82. package/include/glib-2.0/gio/gvolume.h +2 -2
  83. package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
  84. package/include/glib-2.0/girepository/gi-visibility.h +986 -0
  85. package/include/glib-2.0/girepository/giarginfo.h +100 -0
  86. package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
  87. package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
  88. package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
  89. package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
  90. package/include/glib-2.0/girepository/gienuminfo.h +82 -0
  91. package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
  92. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  93. package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
  94. package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
  95. package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
  96. package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
  97. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
  98. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  99. package/include/glib-2.0/girepository/girepository.h +247 -0
  100. package/include/glib-2.0/girepository/girffi.h +129 -0
  101. package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
  102. package/include/glib-2.0/girepository/gistructinfo.h +102 -0
  103. package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
  104. package/include/glib-2.0/girepository/gitypelib.h +61 -0
  105. package/include/glib-2.0/girepository/gitypes.h +421 -0
  106. package/include/glib-2.0/girepository/giunioninfo.h +105 -0
  107. package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
  108. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  109. package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
  110. package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
  111. package/include/glib-2.0/glib/deprecated/grel.h +0 -23
  112. package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
  113. package/include/glib-2.0/glib/gatomic.h +20 -20
  114. package/include/glib-2.0/glib/gbitlock.h +31 -0
  115. package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
  116. package/include/glib-2.0/glib/gchecksum.h +0 -10
  117. package/include/glib-2.0/glib/gdate.h +0 -9
  118. package/include/glib-2.0/glib/gdatetime.h +33 -1
  119. package/include/glib-2.0/glib/gdir.h +5 -0
  120. package/include/glib-2.0/glib/ghmac.h +0 -9
  121. package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
  122. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  123. package/include/glib-2.0/glib/gmacros.h +1 -0
  124. package/include/glib-2.0/glib/gmessages.h +11 -0
  125. package/include/glib-2.0/glib/gpathbuf.h +0 -7
  126. package/include/glib-2.0/glib/gslice.h +2 -0
  127. package/include/glib-2.0/glib/gstdio.h +1 -1
  128. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  129. package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
  130. package/include/glib-2.0/glib/gtestutils.h +5 -0
  131. package/include/glib-2.0/glib/gthread.h +216 -3
  132. package/include/glib-2.0/glib/gunicode.h +12 -2
  133. package/include/glib-2.0/glib/gvarianttype.h +1 -10
  134. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  135. package/include/glib-2.0/glib/gwin32.h +4 -4
  136. package/include/glib-2.0/glib-unix.h +214 -0
  137. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  138. package/include/glib-2.0/gobject/gbinding.h +0 -8
  139. package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
  140. package/include/glib-2.0/gobject/gclosure.h +1 -9
  141. package/include/glib-2.0/gobject/genums.h +6 -6
  142. package/include/glib-2.0/gobject/glib-types.h +44 -0
  143. package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
  144. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  145. package/include/glib-2.0/gobject/gobject.h +1 -16
  146. package/include/glib-2.0/gobject/gparam.h +3 -12
  147. package/include/glib-2.0/gobject/gsignal.h +16 -6
  148. package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
  149. package/include/glib-2.0/gobject/gtype.h +53 -20
  150. package/include/glib-2.0/gobject/gtypemodule.h +0 -7
  151. package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
  152. package/include/glib-2.0/gobject/gvaluearray.h +0 -7
  153. package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
  154. package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
  155. package/include/hwy/aligned_allocator.h +171 -6
  156. package/include/hwy/base.h +1765 -543
  157. package/include/hwy/cache_control.h +24 -6
  158. package/include/hwy/detect_compiler_arch.h +23 -2
  159. package/include/hwy/detect_targets.h +56 -13
  160. package/include/hwy/foreach_target.h +24 -0
  161. package/include/hwy/highway.h +20 -3
  162. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  163. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  164. package/include/hwy/ops/emu128-inl.h +271 -196
  165. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  166. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  167. package/include/hwy/ops/rvv-inl.h +1043 -311
  168. package/include/hwy/ops/scalar-inl.h +189 -159
  169. package/include/hwy/ops/set_macros-inl.h +66 -6
  170. package/include/hwy/ops/shared-inl.h +175 -56
  171. package/include/hwy/ops/wasm_128-inl.h +153 -136
  172. package/include/hwy/ops/x86_128-inl.h +1647 -646
  173. package/include/hwy/ops/x86_256-inl.h +1003 -370
  174. package/include/hwy/ops/x86_512-inl.h +948 -353
  175. package/include/hwy/per_target.h +4 -0
  176. package/include/hwy/profiler.h +648 -0
  177. package/include/hwy/robust_statistics.h +2 -2
  178. package/include/hwy/targets.h +18 -11
  179. package/include/hwy/timer.h +11 -0
  180. package/include/lcms2.h +46 -7
  181. package/include/lcms2_plugin.h +4 -4
  182. package/include/libheif/heif_version.h +2 -2
  183. package/include/libpng16/png.h +32 -29
  184. package/include/libpng16/pngconf.h +2 -2
  185. package/include/libpng16/pnglibconf.h +7 -2
  186. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  187. package/include/libxml2/libxml/HTMLparser.h +23 -0
  188. package/include/libxml2/libxml/SAX.h +0 -2
  189. package/include/libxml2/libxml/SAX2.h +0 -2
  190. package/include/libxml2/libxml/c14n.h +0 -2
  191. package/include/libxml2/libxml/dict.h +1 -0
  192. package/include/libxml2/libxml/encoding.h +16 -14
  193. package/include/libxml2/libxml/entities.h +4 -0
  194. package/include/libxml2/libxml/globals.h +15 -503
  195. package/include/libxml2/libxml/hash.h +57 -61
  196. package/include/libxml2/libxml/nanoftp.h +2 -2
  197. package/include/libxml2/libxml/parser.h +137 -18
  198. package/include/libxml2/libxml/parserInternals.h +1 -0
  199. package/include/libxml2/libxml/relaxng.h +2 -1
  200. package/include/libxml2/libxml/schemasInternals.h +1 -0
  201. package/include/libxml2/libxml/schematron.h +1 -0
  202. package/include/libxml2/libxml/threads.h +4 -11
  203. package/include/libxml2/libxml/tree.h +68 -20
  204. package/include/libxml2/libxml/uri.h +2 -1
  205. package/include/libxml2/libxml/valid.h +2 -0
  206. package/include/libxml2/libxml/xmlIO.h +65 -13
  207. package/include/libxml2/libxml/xmlerror.h +37 -8
  208. package/include/libxml2/libxml/xmlmemory.h +37 -40
  209. package/include/libxml2/libxml/xmlreader.h +6 -0
  210. package/include/libxml2/libxml/xmlregexp.h +2 -9
  211. package/include/libxml2/libxml/xmlsave.h +9 -0
  212. package/include/libxml2/libxml/xmlschemas.h +3 -0
  213. package/include/libxml2/libxml/xmlversion.h +28 -43
  214. package/include/libxml2/libxml/xpath.h +1 -1
  215. package/include/libxml2/libxml/xpathInternals.h +2 -1
  216. package/include/libxml2/libxml/xpointer.h +5 -4
  217. package/include/pango-1.0/pango/pango-features.h +3 -3
  218. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  219. package/include/pixman-1/pixman-version.h +3 -3
  220. package/include/pixman-1/pixman.h +9 -2
  221. package/include/png.h +32 -29
  222. package/include/pngconf.h +2 -2
  223. package/include/pnglibconf.h +7 -2
  224. package/include/vips/connection.h +9 -3
  225. package/include/vips/util.h +0 -9
  226. package/include/vips/version.h +4 -4
  227. package/include/zconf.h +3 -0
  228. package/include/zlib.h +3 -3
  229. package/package.json +1 -1
  230. package/versions.json +15 -15
@@ -339,8 +339,11 @@ namespace detail { // for code folding
  // Full support for f16 in all ops
  #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
+ // Only BF16 is emulated.
+ #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
  #else
  #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
+ #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
  #endif
  #define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
@@ -389,15 +392,11 @@ namespace detail { // for code folding
  // For all combinations of SEW:
  #define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
- HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
- HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
- HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
+ HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)

  #define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
- HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
- HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
- HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
+ HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)

  #define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
@@ -409,8 +408,7 @@ namespace detail { // for code folding
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)

  #define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
- HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
- HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
+ HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)

  // Assemble types for use in x-macros
@@ -480,18 +478,12 @@ HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)

  HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
  HWY_RVV_FOREACH(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
- // If not already defined via HWY_RVV_FOREACH, define the overloads because
- // they do not require any new instruction.
- #if !HWY_HAVE_FLOAT16
- HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
- HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
- #endif
  #undef HWY_RVV_LANES
  #undef HWY_RVV_LANES_VIRT

- template <size_t N, int kPow2>
- HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag*/) {
- return Lanes(Simd<int16_t, N, kPow2>());
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
+ HWY_API size_t Lanes(D /* tag*/) {
+ return Lanes(RebindToUnsigned<D>());
  }

  // ------------------------------ Common x-macros
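Note: with the new HWY_RVV_IF_EMULATED_D guard, a single Lanes overload covers every emulated special-float type by forwarding to the unsigned 16-bit descriptor. A minimal caller-side sketch; the hn alias and ScalableTag usage are standard Highway, not part of this diff:

  // assumes: #include "hwy/highway.h"  and  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag<hwy::bfloat16_t> d;  // emulated via u16 lanes on RVV
  const size_t n = hn::Lanes(d);             // same count as ScalableTag<uint16_t>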
@@ -525,10 +517,20 @@ HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag*/) {
  HWY_RVV_AVL(SEW, SHIFT)); \
  }

+ // vector = f(vector, mask, vector, vector), e.g. MaskedAddOr
+ #define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
+ HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
+ return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(m, no, a, b, \
+ HWY_RVV_AVL(SEW, SHIFT)); \
+ }
+
  // mask = f(mask)
- #define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
- HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
- return __riscv_vm##OP##_m_b##MLEN(m, ~0ull); \
+ #define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
+ HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
+ return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT)); \
  }

  // ================================================== INIT
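Note: HWY_RVV_RETV_ARGMVV wraps the masked, mask-undisturbed (`_mu`) form of an RVV intrinsic. Roughly, for i32 with LMUL=1 and OP=add it expands to something like the following; this is an approximation for illustration, since the real expansion is driven by the surrounding x-macros and HWY_RVV_AVL:

  HWY_API vint32m1_t MaskedAddOr(vint32m1_t no, vbool32_t m, vint32m1_t a,
                                 vint32m1_t b) {
    // Lanes where m is false keep the corresponding lane of no.
    return __riscv_vadd_vv_i32m1_mu(m, no, a, b, HWY_RVV_AVL(32, 0));
  }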
@@ -550,20 +552,18 @@ HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
  // Treat bfloat16_t as int16_t (using the previously defined Set overloads);
  // required for Zero and VFromD.
  template <size_t N, int kPow2>
- decltype(Set(Simd<int16_t, N, kPow2>(), 0)) Set(Simd<bfloat16_t, N, kPow2> d,
- bfloat16_t arg) {
- return Set(RebindToSigned<decltype(d)>(), arg.bits);
+ decltype(Set(Simd<int16_t, N, kPow2>(), 0)) Set(
+ Simd<hwy::bfloat16_t, N, kPow2> d, hwy::bfloat16_t arg) {
+ return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
  }
  #if !HWY_HAVE_FLOAT16 // Otherwise already defined above.
  // WARNING: returns a different type than emulated bfloat16_t so that we can
  // implement PromoteTo overloads for both bfloat16_t and float16_t, and also
- // provide a Neg(float16_t) overload that coexists with Neg(int16_t).
+ // provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
  template <size_t N, int kPow2>
- decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(Simd<float16_t, N, kPow2> d,
- float16_t arg) {
- uint16_t bits;
- CopySameSize(&arg, &bits);
- return Set(RebindToUnsigned<decltype(d)>(), bits);
+ decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(
+ Simd<hwy::float16_t, N, kPow2> d, hwy::float16_t arg) {
+ return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
  }
  #endif
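Note: both Set overloads now route through BitCastScalar, which reinterprets the 16-bit pattern instead of reaching into .bits or copying via CopySameSize. Illustrative sketch only; BF16FromF32 is an existing hwy helper, and its use here is an assumption:

  const hwy::bfloat16_t x = hwy::BF16FromF32(1.5f);
  const int16_t bits = hwy::BitCastScalar<int16_t>(x);  // same 16 bits, no conversion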
@@ -642,16 +642,7 @@ HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
  HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
  #undef HWY_RVV_EXT_VIRT

- #if !HWY_HAVE_FLOAT16
- template <class D, HWY_IF_F16_D(D)>
- VFromD<D> Ext(D d, VFromD<Half<D>> v) {
- const RebindToUnsigned<decltype(d)> du;
- const Half<decltype(du)> duh;
- return BitCast(d, Ext(du, BitCast(duh, v)));
- }
- #endif
-
- template <class D, HWY_IF_BF16_D(D)>
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
  VFromD<D> Ext(D d, VFromD<Half<D>> v) {
  const RebindToUnsigned<decltype(d)> du;
  const Half<decltype(du)> duh;
@@ -769,7 +760,7 @@ HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
  #else
  template <size_t N, int kPow2>
  HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
- Simd<float16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
+ Simd<hwy::float16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
  }
  #endif
@@ -783,7 +774,8 @@ HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(

  template <size_t N, int kPow2>
  HWY_INLINE VFromD<Simd<int16_t, N, kPow2>> BitCastFromByte(
- Simd<bfloat16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
+ Simd<hwy::bfloat16_t, N, kPow2> /* d */,
+ VFromD<Simd<uint8_t, N, kPow2>> v) {
  return BitCastFromByte(Simd<int16_t, N, kPow2>(), v);
  }

@@ -1048,7 +1040,7 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)
  #undef HWY_RVV_SHIFT

  // ------------------------------ SumsOf8 (ShiftRight, Add)
- template <class VU8>
+ template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
  HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
  const DFromV<VU8> du8;
  const RepartitionToWide<decltype(du8)> du16;
@@ -1071,6 +1063,31 @@ HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
  }

+ template <class VI8, HWY_IF_I8_D(DFromV<VI8>)>
+ HWY_API VFromD<Repartition<int64_t, DFromV<VI8>>> SumsOf8(const VI8 v) {
+ const DFromV<VI8> di8;
+ const RepartitionToWide<decltype(di8)> di16;
+ const RepartitionToWide<decltype(di16)> di32;
+ const RepartitionToWide<decltype(di32)> di64;
+ const RebindToUnsigned<decltype(di32)> du32;
+ const RebindToUnsigned<decltype(di64)> du64;
+ using VI16 = VFromD<decltype(di16)>;
+
+ const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
+ const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
+ const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
+
+ const VI16 sDC_zz_98_zz_54_zz_10_zz =
+ BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
+ const VI16 sFC_xx_B8_xx_74_xx_30_xx =
+ Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
+ const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
+ BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
+ const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
+ Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
+ return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
+ }
+
  // ------------------------------ RotateRight
  template <int kBits, class V>
  HWY_API V RotateRight(const V v) {
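Note: the new i8 overload gives SumsOf8 the same semantics as the u8 path: each 64-bit output lane holds the sum of the eight corresponding 8-bit input lanes. A scalar reference of the intended result; SumsOf8Ref is a hypothetical helper, for illustration only:

  // out[i] == in[8*i] + in[8*i+1] + ... + in[8*i+7] for each output lane i.
  int64_t SumsOf8Ref(const int8_t* in, size_t i) {
    int64_t sum = 0;
    for (size_t j = 0; j < 8; ++j) sum += in[8 * i + j];
    return sum;
  }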
@@ -1184,8 +1201,57 @@ HWY_RVV_FOREACH_I16(HWY_RVV_MUL15, MulFixedPoint15, smul, _ALL)
  #undef HWY_RVV_MUL15

  // ------------------------------ Div
+ #ifdef HWY_NATIVE_INT_DIV
+ #undef HWY_NATIVE_INT_DIV
+ #else
+ #define HWY_NATIVE_INT_DIV
+ #endif
+
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Div, divu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Div, div, _ALL)
  HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)

+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Mod, remu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Mod, rem, _ALL)
+
+ // ------------------------------ MaskedAddOr etc.
+
+ #ifdef HWY_NATIVE_MASKED_ARITH
+ #undef HWY_NATIVE_MASKED_ARITH
+ #else
+ #define HWY_NATIVE_MASKED_ARITH
+ #endif
+
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMinOr, minu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMinOr, min, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMinOr, fmin, _ALL)
+
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, maxu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, max, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, fmax, _ALL)
+
+ HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedAddOr, add, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedAddOr, fadd, _ALL)
+
+ HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedSubOr, sub, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedSubOr, fsub, _ALL)
+
+ HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedMulOr, mul, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMulOr, fmul, _ALL)
+
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedDivOr, divu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedDivOr, div, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedDivOr, fdiv, _ALL)
+
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedModOr, remu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedModOr, rem, _ALL)
+
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, saddu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, sadd, _ALL)
+
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssubu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssub, _ALL)
+
  // ------------------------------ ApproximateReciprocal
  #ifdef HWY_NATIVE_F64_APPROX_RECIP
  #undef HWY_NATIVE_F64_APPROX_RECIP
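Note: defining HWY_NATIVE_MASKED_ARITH suppresses the generic fallbacks, so the whole MaskedXxxOr family maps to single masked intrinsics. A usage sketch under the signature shown above, MaskedAddOr(no, m, a, b):

  // assumes: #include "hwy/highway.h"  and  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag<int32_t> d;
  const auto no = hn::Set(d, -1);
  const auto m = hn::FirstN(d, 2);  // only the first two lanes are active
  const auto a = hn::Iota(d, 0);
  const auto b = hn::Set(d, 10);
  const auto r = hn::MaskedAddOr(no, m, a, b);  // {10, 11, -1, -1, ...}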
@@ -1247,26 +1313,6 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
  // vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
  // of all bits; SEW=8 / LMUL=4 = half of all bits.

- // SFINAE for mapping Simd<> to MLEN (up to 64).
- #define HWY_RVV_IF_MLEN_D(D, MLEN) \
- hwy::EnableIf<MLenFromD(D()) == MLEN>* = nullptr
-
- // Specialized for RVV instead of the generic test_util-inl.h implementation
- // because more efficient, and helps implement MFromD.
-
- #define HWY_RVV_MASK_FALSE(SEW, SHIFT, MLEN, NAME, OP) \
- template <class D, HWY_RVV_IF_MLEN_D(D, MLEN)> \
- HWY_API HWY_RVV_M(MLEN) NAME(D d) { \
- return __riscv_vm##OP##_m_b##MLEN(Lanes(d)); \
- }
-
- HWY_RVV_FOREACH_B(HWY_RVV_MASK_FALSE, MaskFalse, clr)
- #undef HWY_RVV_MASK_FALSE
- #undef HWY_RVV_IF_MLEN_D
-
- template <class D>
- using MFromD = decltype(MaskFalse(D()));
-
  // mask = f(vector, vector)
  #define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
  SHIFT, MLEN, NAME, OP) \
@@ -1405,11 +1451,32 @@ HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
  #undef HWY_RVV_IF_THEN_ZERO_ELSE

  // ------------------------------ MaskFromVec
+
+ template <class D>
+ using MFromD = decltype(Eq(Zero(D()), Zero(D())));
+
  template <class V>
  HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
  return detail::NeS(v, 0);
  }

+ // ------------------------------ MaskFalse
+
+ // For mask ops including vmclr, elements past VL are tail-agnostic and cannot
+ // be relied upon, so define a variant of the generic_ops-inl implementation of
+ // MaskFalse that ensures all bits are zero as required by mask_test.
+ #ifdef HWY_NATIVE_MASK_FALSE
+ #undef HWY_NATIVE_MASK_FALSE
+ #else
+ #define HWY_NATIVE_MASK_FALSE
+ #endif
+
+ template <class D>
+ HWY_API MFromD<D> MaskFalse(D d) {
+ const DFromV<VFromD<decltype(d)>> d_full;
+ return MaskFromVec(Zero(d_full));
+ }
+
  // ------------------------------ RebindMask
  template <class D, typename MFrom>
  HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
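Note: because vmclr leaves bits past VL tail-agnostic, MaskFalse is now built from MaskFromVec(Zero(d_full)) on the full vector type, which guarantees every mask bit is zero. Expected behavior, as a sketch:

  // assumes: #include "hwy/highway.h"  and  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag<int32_t> d;
  const auto none = hn::MaskFalse(d);
  // AllFalse(d, none) holds and CountTrue(d, none) == 0, even for fractional
  // LMUL or capped descriptors, because all bits are cleared.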
@@ -1427,10 +1494,12 @@ HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) { \
- const RebindToSigned<decltype(d)> di; \
+ /* MaskFalse requires we set all lanes for capped d and virtual LMUL. */ \
+ const DFromV<VFromD<decltype(d)>> d_full; \
+ const RebindToSigned<decltype(d_full)> di; \
  using TI = TFromD<decltype(di)>; \
- return BitCast( \
- d, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, Lanes(d))); \
+ return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, \
+ Lanes(d_full))); \
  }

  HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, VecFromMask, merge_vxm, _ALL_VIRT)
@@ -1518,6 +1587,38 @@ HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
  HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
  #undef HWY_RVV_COUNT_TRUE

+ // ------------------------------ PromoteMaskTo
+
+ #ifdef HWY_NATIVE_PROMOTE_MASK_TO
+ #undef HWY_NATIVE_PROMOTE_MASK_TO
+ #else
+ #define HWY_NATIVE_PROMOTE_MASK_TO
+ #endif
+
+ template <class DTo, class DFrom,
+ HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
+ hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
+ HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+ MFromD<DFrom> m) {
+ return m;
+ }
+
+ // ------------------------------ DemoteMaskTo
+
+ #ifdef HWY_NATIVE_DEMOTE_MASK_TO
+ #undef HWY_NATIVE_DEMOTE_MASK_TO
+ #else
+ #define HWY_NATIVE_DEMOTE_MASK_TO
+ #endif
+
+ template <class DTo, class DFrom,
+ HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
+ hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
+ HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+ MFromD<DFrom> m) {
+ return m;
+ }
+
  // ================================================== MEMORY

  // ------------------------------ Load
@@ -1528,47 +1629,18 @@ HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
- reinterpret_cast<const T*>(p), Lanes(d)); \
+ detail::NativeLanePointer(p), Lanes(d)); \
  }
  HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
  #undef HWY_RVV_LOAD

- // There is no native BF16, treat as uint16_t.
- template <size_t N, int kPow2>
- HWY_API VFromD<Simd<int16_t, N, kPow2>> Load(Simd<bfloat16_t, N, kPow2> d,
- const bfloat16_t* HWY_RESTRICT p) {
- return Load(RebindToSigned<decltype(d)>(),
- reinterpret_cast<const int16_t * HWY_RESTRICT>(p));
- }
-
- template <size_t N, int kPow2>
- HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,
- Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
- Store(v, RebindToSigned<decltype(d)>(),
- reinterpret_cast<int16_t * HWY_RESTRICT>(p));
- }
-
- #if !HWY_HAVE_FLOAT16 // Otherwise already defined above.
-
- // NOTE: different type for float16_t than bfloat16_t, see Set().
- template <size_t N, int kPow2>
- HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(Simd<float16_t, N, kPow2> d,
- const float16_t* HWY_RESTRICT p) {
- return Load(RebindToUnsigned<decltype(d)>(),
- reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
- }
-
- template <size_t N, int kPow2>
- HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
- Simd<float16_t, N, kPow2> d, float16_t* HWY_RESTRICT p) {
- Store(v, RebindToUnsigned<decltype(d)>(),
- reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
+ HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, Load(du, detail::U16LanePointer(p)));
  }

- #endif // !HWY_HAVE_FLOAT16
-
  // ------------------------------ LoadU
  template <class D>
  HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
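Note: the four per-type BF16/F16 Load/Store overloads collapse into single HWY_RVV_IF_EMULATED_D-guarded versions that load u16 lanes and BitCast back. Call sites are unchanged; a sketch:

  // assumes: #include "hwy/highway.h"  and  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag<hwy::bfloat16_t> d;
  auto buf = hwy::AllocateAligned<hwy::bfloat16_t>(hn::Lanes(d));
  const auto v = hn::Load(d, buf.get());  // a u16 load plus BitCast underneath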
@@ -1584,23 +1656,37 @@ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
- m, Zero(d), reinterpret_cast<const T*>(p), Lanes(d)); \
+ m, Zero(d), detail::NativeLanePointer(p), Lanes(d)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
- m, v, reinterpret_cast<const T*>(p), Lanes(d)); \
+ m, v, detail::NativeLanePointer(p), Lanes(d)); \
  }

  HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
  #undef HWY_RVV_MASKED_LOAD

+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
+ HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
+ const TFromD<D>* HWY_RESTRICT p) {
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d,
+ MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
+ }
+
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
+ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> no, MFromD<D> m, D d,
+ const TFromD<D>* HWY_RESTRICT p) {
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, MaskedLoadOr(BitCast(du, no), RebindMask(du, m), du,
+ detail::U16LanePointer(p)));
+ }
+
  // ------------------------------ LoadN

  // Native with avl is faster than the generic_ops using FirstN.
@@ -1616,29 +1702,41 @@ HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
  /* Use a tail-undisturbed load in LoadN as the tail-undisturbed load */ \
  /* operation below will leave any lanes past the first */ \
  /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes unchanged */ \
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
- Zero(d), reinterpret_cast<const T*>(p), CappedLanes(d, num_lanes)); \
+ Zero(d), detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME##Or( \
  HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
  /* Use a tail-undisturbed load in LoadNOr as the tail-undisturbed load */ \
  /* operation below will set any lanes past the first */ \
  /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes to the */ \
  /* corresponding lanes in no */ \
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
- no, reinterpret_cast<const T*>(p), CappedLanes(d, num_lanes)); \
+ no, detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
  }

  HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
  #undef HWY_RVV_LOADN

+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
+ size_t num_lanes) {
+ const RebindToUnsigned<D> du;
+ return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
+ }
+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
+ HWY_API VFromD<D> LoadNOr(VFromD<D> v, D d, const TFromD<D>* HWY_RESTRICT p,
+ size_t num_lanes) {
+ const RebindToUnsigned<D> du;
+ return BitCast(
+ d, LoadNOr(BitCast(du, v), du, detail::U16LanePointer(p), num_lanes));
+ }
+
  // ------------------------------ Store

  #define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
@@ -1647,13 +1745,18 @@ HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
- return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(reinterpret_cast<T*>(p), \
- v, Lanes(d)); \
+ return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
+ detail::NativeLanePointer(p), v, Lanes(d)); \
  }
  HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
  #undef HWY_RVV_STORE

+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
+ HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
+ const RebindToUnsigned<decltype(d)> du;
+ Store(BitCast(du, v), du, detail::U16LanePointer(p));
+ }
+
  // ------------------------------ BlendedStore

  #define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
@@ -1662,13 +1765,20 @@ HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
  HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
  return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m( \
- m, reinterpret_cast<T*>(p), v, Lanes(d)); \
+ m, detail::NativeLanePointer(p), v, Lanes(d)); \
  }
  HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
  #undef HWY_RVV_BLENDED_STORE

+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
+ HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
+ TFromD<D>* HWY_RESTRICT p) {
+ const RebindToUnsigned<decltype(d)> du;
+ BlendedStore(BitCast(du, v), RebindMask(du, m), du,
+ detail::U16LanePointer(p));
+ }
+
  // ------------------------------ StoreN

  namespace detail {
@@ -1679,13 +1789,18 @@ namespace detail {
  HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
  HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
  HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
- return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(reinterpret_cast<T*>(p), \
- v, count); \
+ return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
+ detail::NativeLanePointer(p), v, count); \
  }
  HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
  #undef HWY_RVV_STOREN

+ template <class D, HWY_RVV_IF_EMULATED_D(D)>
+ HWY_API void StoreN(size_t count, VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
+ const RebindToUnsigned<decltype(d)> du;
+ StoreN(count, BitCast(du, v), du, detail::U16LanePointer(p));
+ }
+
  } // namespace detail

  #ifdef HWY_NATIVE_STORE_N
@@ -1694,9 +1809,8 @@ HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
  #define HWY_NATIVE_STORE_N
  #endif

- template <class D, typename T = TFromD<D>,
- hwy::EnableIf<hwy::IsSame<T, TFromV<VFromD<D>>>()>* = nullptr>
- HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
+ template <class D>
+ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
  size_t max_lanes_to_store) {
  // NOTE: Need to call Lanes(d) and clamp max_lanes_to_store to Lanes(d), even
  // if MaxLanes(d) >= MaxLanes(DFromV<VFromD<D>>()) is true, as it is possible
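Note: the public StoreN is now a single overload; the special-float variant removed in the next hunk is subsumed by the emulated detail::StoreN path. Usage sketch:

  // assumes: #include "hwy/highway.h"  and  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag<float> d;
  float out[8] = {0.0f};
  const auto v = hn::Iota(d, 1.0f);
  hn::StoreN(v, d, out, 3);  // writes HWY_MIN(3, Lanes(d)) lanes, never more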
@@ -1713,19 +1827,6 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
  detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d, p);
  }

- // StoreN for BF16/F16 vectors
- template <class D, typename T = TFromD<D>,
- hwy::EnableIf<!hwy::IsSame<T, TFromV<VFromD<D>>>()>* = nullptr,
- HWY_IF_SPECIAL_FLOAT(T)>
- HWY_API void StoreN(VFromD<D> v, D /*d*/, T* HWY_RESTRICT p,
- size_t max_lanes_to_store) {
- using TStore = TFromV<VFromD<D>>;
- const Rebind<TStore, D> d_store;
- const size_t N = Lanes(d_store);
- detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d_store,
- reinterpret_cast<TStore * HWY_RESTRICT>(p));
- }
-
  // ------------------------------ StoreU
  template <class V, class D>
  HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
@@ -1747,17 +1848,16 @@ HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
  #define HWY_NATIVE_SCATTER
  #endif

- #define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
- SHIFT, MLEN, NAME, OP) \
- template <size_t N> \
- HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
- HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
- HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
- HWY_RVV_V(int, SEW, LMUL) offset) { \
- const RebindToUnsigned<decltype(d)> du; \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
- return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
- reinterpret_cast<T*>(base), BitCast(du, offset), v, Lanes(d)); \
+ #define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ template <size_t N> \
+ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
+ HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
+ HWY_RVV_V(int, SEW, LMUL) offset) { \
+ const RebindToUnsigned<decltype(d)> du; \
+ return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
+ detail::NativeLanePointer(base), BitCast(du, offset), v, Lanes(d)); \
  }
  HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
  #undef HWY_RVV_SCATTER
@@ -1772,19 +1872,18 @@ HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,

  // ------------------------------ MaskedScatterIndex

- #define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
- LMULH, SHIFT, MLEN, NAME, OP) \
- template <size_t N> \
- HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
- HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
- HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
- HWY_RVV_V(int, SEW, LMUL) indices) { \
- const RebindToUnsigned<decltype(d)> du; \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
- constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>)); \
- return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \
- m, reinterpret_cast<T*>(base), ShiftLeft<kBits>(BitCast(du, indices)), \
- v, Lanes(d)); \
+ #define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+ LMULH, SHIFT, MLEN, NAME, OP) \
+ template <size_t N> \
+ HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
+ HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+ HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
+ HWY_RVV_V(int, SEW, LMUL) indices) { \
+ const RebindToUnsigned<decltype(d)> du; \
+ constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>)); \
+ return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \
+ m, detail::NativeLanePointer(base), \
+ ShiftLeft<kBits>(BitCast(du, indices)), v, Lanes(d)); \
  }
  HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
  #undef HWY_RVV_MASKED_SCATTER
@@ -1805,9 +1904,8 @@ HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
  HWY_RVV_V(int, SEW, LMUL) offset) { \
  const RebindToUnsigned<decltype(d)> du; \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
  return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
- reinterpret_cast<const T*>(base), BitCast(du, offset), Lanes(d)); \
+ detail::NativeLanePointer(base), BitCast(du, offset), Lanes(d)); \
  }
  HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
  #undef HWY_RVV_GATHER
@@ -1821,25 +1919,34 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
  return GatherOffset(d, base, ShiftLeft<kBits>(index));
  }

- // ------------------------------ MaskedGatherIndex
+ // ------------------------------ MaskedGatherIndexOr

  #define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
  SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
- NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
+ HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
  const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
  HWY_RVV_V(int, SEW, LMUL) indices) { \
  const RebindToUnsigned<decltype(d)> du; \
- using T = detail::NativeLaneType<HWY_RVV_T(BASE, SEW)>; \
+ const RebindToSigned<decltype(d)> di; \
+ (void)di; /* for HWY_DASSERT */ \
  constexpr size_t kBits = CeilLog2(SEW / 8); \
+ HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
  return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_mu( \
- m, Zero(d), reinterpret_cast<const T*>(base), \
+ m, no, detail::NativeLanePointer(base), \
  ShiftLeft<kBits>(BitCast(du, indices)), Lanes(d)); \
  }
- HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER, MaskedGatherIndex, lux, _ALL_VIRT)
+ HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER, MaskedGatherIndexOr, lux, _ALL_VIRT)
  #undef HWY_RVV_MASKED_GATHER

+ template <class D>
+ HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d, const TFromD<D>* base,
+ VFromD<RebindToSigned<D>> indices) {
+ return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
+ }
+
  // ================================================== CONVERT

  // ------------------------------ PromoteTo
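Note: the rename adds an explicit `no` vector for inactive lanes (the `_mu` intrinsic leaves them as `no`), and MaskedGatherIndex becomes a wrapper that passes Zero(d); the new HWY_DASSERT documents that indices must be non-negative. A usage sketch:

  // assumes: #include "hwy/highway.h"  and  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag<int32_t> d;
  int32_t table[64] = {0};              // assume at least Lanes(d) valid entries
  const auto indices = hn::Iota(d, 0);  // indices must be non-negative
  const auto m = hn::FirstN(d, 2);
  const auto no = hn::Set(d, -1);
  // Active lanes load table[indices[i]]; inactive lanes take lanes of no.
  const auto r = hn::MaskedGatherIndexOr(no, m, d, table, indices);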
@@ -1996,7 +2103,7 @@ HWY_API auto PromoteTo(Simd<int64_t, N, kPow2> d,

  template <size_t N, int kPow2>
  HWY_API auto PromoteTo(Simd<float32_t, N, kPow2> d,
- VFromD<Rebind<bfloat16_t, decltype(d)>> v)
+ VFromD<Rebind<hwy::bfloat16_t, decltype(d)>> v)
  -> VFromD<decltype(d)> {
  const RebindToSigned<decltype(d)> di32;
  const Rebind<uint16_t, decltype(d)> du16;
@@ -2633,8 +2740,8 @@ HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_TO_SHR_16, DemoteToShr16, nclipu_wx_,
  #undef HWY_RVV_DEMOTE_TO_SHR_16

  template <size_t N, int kPow2>
- HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> DemoteTo(
- Simd<bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
+ HWY_API VFromD<Simd<hwy::bfloat16_t, N, kPow2>> DemoteTo(
+ Simd<hwy::bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
  const RebindToUnsigned<decltype(d)> du16;
  const Rebind<uint32_t, decltype(d)> du32;
  return BitCast(d, detail::DemoteToShr16(du16, BitCast(du32, v)));
@@ -2918,9 +3025,10 @@ HWY_RVV_FOREACH_B(HWY_RVV_SET_AT_OR_AFTER_FIRST, _, _)

  // ------------------------------ InsertLane

- template <class V, HWY_IF_NOT_T_SIZE_V(V, 1)>
- HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
- const DFromV<V> d;
+ // T template arg because TFromV<V> might not match the hwy::float16_t argument.
+ template <class V, typename T, HWY_IF_NOT_T_SIZE_V(V, 1)>
+ HWY_API V InsertLane(const V v, size_t i, T t) {
+ const Rebind<T, DFromV<V>> d;
  const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
  using TU = TFromD<decltype(du)>;
  const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
@@ -2928,9 +3036,9 @@ HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
  }

  // For 8-bit lanes, Iota0 might overflow.
- template <class V, HWY_IF_T_SIZE_V(V, 1)>
- HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
- const DFromV<V> d;
+ template <class V, typename T, HWY_IF_T_SIZE_V(V, 1)>
+ HWY_API V InsertLane(const V v, size_t i, T t) {
+ const Rebind<T, DFromV<V>> d;
  const auto zero = Zero(d);
  const auto one = Set(d, 1);
  const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
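Note: the extra T template parameter lets the scalar argument pick the descriptor via Rebind, so e.g. a hwy::float16_t argument resolves correctly against a u16-backed emulated vector. Call sites look the same:

  // assumes: #include "hwy/highway.h"  and  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag<float> d;
  auto v = hn::Zero(d);
  v = hn::InsertLane(v, 2, 5.0f);  // lane 2 becomes 5 (when Lanes(d) > 2)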
@@ -3034,9 +3142,6 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
  }

- // TODO(janwas): avoid using this for 8-bit; wrap in detail namespace.
- // For large 8-bit vectors, index overflow will lead to incorrect results.
- // Reverse already uses TableLookupLanes16 to prevent this.
  #define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
  MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
@@ -3045,12 +3150,14 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
  HWY_RVV_AVL(SEW, SHIFT)); \
  }

+ // TableLookupLanes is supported for all types, but beware that indices are
+ // likely to wrap around for 8-bit lanes. When using TableLookupLanes inside
+ // this file, ensure that it is safe or use TableLookupLanes16 instead.
  HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
  #undef HWY_RVV_TABLE

  namespace detail {

- // Used by I8/U8 Reverse
  #define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
  SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
@@ -3122,6 +3229,67 @@ HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
  return TableLookupLanes(v, idx);
  }

+ // ------------------------------ ResizeBitCast
+
+ // Extends or truncates a vector to match the given d.
+ namespace detail {
+
+ template <class D>
+ HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
+ return v;
+ }
+
+ // Sanity check: when calling ChangeLMUL, the caller (ResizeBitCast) already
+ // BitCast to the same lane type. Note that V may use the native lane type for
+ // f16, so convert D to that before checking.
+ #define HWY_RVV_IF_SAME_T_DV(D, V) \
+ hwy::EnableIf<IsSame<NativeLaneType<TFromD<D>>, TFromV<V>>()>* = nullptr
+
+ // LMUL of VFromD<D> < LMUL of V: need to truncate v
+ template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
+ HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
+ HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
+ const DFromV<V> d_from;
+ const Half<decltype(d_from)> dh_from;
+ static_assert(
+ DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
+ "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
+ static_assert(
+ DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
+ "The LMUL of VFromD<D> must be less than or equal to the LMUL of "
+ "VFromD<decltype(dh_from)>");
+ return ChangeLMUL(d, Trunc(v));
+ }
+
+ // LMUL of VFromD<D> > LMUL of V: need to extend v
+ template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
+ HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
+ HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
+ const DFromV<V> d_from;
+ const Twice<decltype(d_from)> dt_from;
+ static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
+ "The LMUL of VFromD<decltype(dt_from)> must be greater than "
+ "the LMUL of V");
+ static_assert(
+ DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
+ "The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
+ "VFromD<decltype(dt_from)>");
+ return ChangeLMUL(d, Ext(dt_from, v));
+ }
+
+ #undef HWY_RVV_IF_SAME_T_DV
+
+ } // namespace detail
+
+ template <class DTo, class VFrom>
+ HWY_API VFromD<DTo> ResizeBitCast(DTo /*dto*/, VFrom v) {
+ const DFromV<decltype(v)> d_from;
+ const Repartition<uint8_t, decltype(d_from)> du8_from;
+ const DFromV<VFromD<DTo>> d_to;
+ const Repartition<uint8_t, decltype(d_to)> du8_to;
+ return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
+ }
+
  // ------------------------------ Reverse2 (RotateRight, OddEven)

  // Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
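Note: ResizeBitCast reinterprets to byte vectors, then detail::ChangeLMUL recursively halves (Trunc) or doubles (Ext) the register group until the target size is reached. Usage sketch:

  // assumes: #include "hwy/highway.h"  and  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag<uint32_t> d32;     // full-size vector
  const hn::ScalableTag<uint8_t, -1> d8h;  // half-size vector of bytes
  const auto v = hn::Set(d32, 0x01020304u);
  const auto lo = hn::ResizeBitCast(d8h, v);  // bytes of v, truncated to fit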
@@ -3307,7 +3475,7 @@ template <class V, class M, class D>
3307
3475
  HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
3308
3476
  TFromD<D>* HWY_RESTRICT unaligned) {
3309
3477
  const size_t count = CountTrue(d, mask);
3310
- detail::StoreN(count, Compress(v, mask), d, unaligned);
3478
+ StoreN(Compress(v, mask), d, unaligned, count);
3311
3479
  return count;
3312
3480
  }
3313
3481
 
@@ -3483,50 +3651,6 @@ HWY_API V Shuffle0123(const V v) {
3483
3651
 
3484
3652
  // ------------------------------ TableLookupBytes
3485
3653
 
3486
- // Extends or truncates a vector to match the given d.
3487
- namespace detail {
3488
-
3489
- template <class D>
3490
- HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
3491
- return v;
3492
- }
3493
-
3494
- // LMUL of VFromD<D> < LMUL of V: need to truncate v
3495
- template <class D, class V,
3496
- hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
3497
- HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
3498
- HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
3499
- const DFromV<decltype(v)> d_from;
3500
- const Half<decltype(d_from)> dh_from;
3501
- static_assert(
3502
- DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
3503
- "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
3504
- static_assert(
3505
- DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
3506
- "The LMUL of VFromD<D> must be less than or equal to the LMUL of "
3507
- "VFromD<decltype(dh_from)>");
3508
- return ChangeLMUL(d, Trunc(v));
3509
- }
3510
-
3511
- // LMUL of VFromD<D> > LMUL of V: need to extend v
3512
- template <class D, class V,
3513
- hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
3514
- HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
3515
- HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
3516
- const DFromV<decltype(v)> d_from;
3517
- const Twice<decltype(d_from)> dt_from;
3518
- static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
3519
- "The LMUL of VFromD<decltype(dt_from)> must be greater than "
3520
- "the LMUL of V");
3521
- static_assert(
3522
- DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
3523
- "The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
3524
- "VFromD<decltype(dt_from)>");
3525
- return ChangeLMUL(d, Ext(dt_from, v));
3526
- }
3527
-
3528
- } // namespace detail
3529
-
3530
3654
  template <class VT, class VI>
3531
3655
  HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
3532
3656
  const DFromV<VT> dt; // T=table, I=index.
@@ -3563,7 +3687,8 @@ HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {
 
 // ------------------------------ TwoTablesLookupLanes
 
-// TODO(janwas): special-case 8-bit lanes to safely handle VL >= 256
+// WARNING: 8-bit lanes may lead to unexpected results because idx is the same
+// size and may overflow.
 template <class D, HWY_IF_POW2_LE_D(D, 2)>
 HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
                                        VFromD<RebindToUnsigned<D>> idx) {
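Concretely: with LMUL=4 on a 512-bit implementation, a u8 vector holds 512 * 4 / 8 = 256 lanes, so the concatenation of the two tables spans 512 lanes; a u8 index only reaches 255, leaving the entire second table unreachable, and any wrapped index silently aliases a lane of the first table. Wider lane types are unaffected because the index type widens with them.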
@@ -3597,11 +3722,47 @@ HWY_API V TwoTablesLookupLanes(V a, V b,
 }
 
 // ------------------------------ Broadcast
-template <int kLane, class V>
+
+// 8-bit requires 16-bit tables.
+template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
+          HWY_IF_POW2_LE_D(D, 2)>
+HWY_API V Broadcast(const V v) {
+  const D d;
+  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
+  const Rebind<uint16_t, decltype(d)> du16;
+  VFromD<decltype(du16)> idx =
+      detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
+  if (kLane != 0) {
+    idx = detail::AddS(idx, kLane);
+  }
+  return detail::TableLookupLanes16(v, idx);
+}
+
+// 8-bit and max LMUL: split into halves.
+template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
+          HWY_IF_POW2_GT_D(D, 2)>
 HWY_API V Broadcast(const V v) {
-  const DFromV<V> d;
-  const RebindToUnsigned<decltype(d)> du;
+  const D d;
   HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
+  const Half<decltype(d)> dh;
+  using VH = VFromD<decltype(dh)>;
+  const Rebind<uint16_t, decltype(dh)> du16;
+  VFromD<decltype(du16)> idx =
+      detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
+  if (kLane != 0) {
+    idx = detail::AddS(idx, kLane);
+  }
+  const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
+  const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
+  return Combine(d, lo, hi);
+}
+
+template <int kLane, class V, class D = DFromV<V>,
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
+HWY_API V Broadcast(const V v) {
+  const D d;
+  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
+  const RebindToUnsigned<decltype(d)> du;
   auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du));
   if (kLane != 0) {
     idx = detail::AddS(idx, kLane);
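(Note: the extracted diff placed the HWY_DASSERT before the declaration of d in the new overloads; the lines are shown above in the only order that compiles in debug builds.) The three overloads split Broadcast by lane size: 8-bit lanes go through 16-bit index tables (detail::TableLookupLanes16) so indices cannot overflow u8, with a further halving at maximal LMUL. A hedged usage sketch, not part of the diff:

    namespace hn = hwy::HWY_NAMESPACE;
    const hn::ScalableTag<uint8_t> d;
    const auto v = hn::Iota(d, 0);       // lane i holds i (mod 256)
    const auto b = hn::Broadcast<3>(v);  // per 128-bit block: every lane of
                                         // block k becomes 16 * k + 3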
@@ -3778,20 +3939,194 @@ HWY_API V ShiftRightBytes(const D d, const V v) {
   return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
 }
 
-// ------------------------------ InterleaveLower
+// ------------------------------ InterleaveWholeLower
+#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
+#undef HWY_NATIVE_INTERLEAVE_WHOLE
+#else
+#define HWY_NATIVE_INTERLEAVE_WHOLE
+#endif
 
-template <class D, class V>
+namespace detail {
+// Returns double-length vector with interleaved lanes.
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
+          HWY_IF_POW2_GT_D(D, -3)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  using TW = MakeWide<TFromD<decltype(du)>>;
+  const Rebind<TW, Half<decltype(du)>> dw;
+  const Half<decltype(du)> duh;  // cast inputs to unsigned so we zero-extend
+
+  const VFromD<decltype(dw)> aw = PromoteTo(dw, BitCast(duh, a));
+  const VFromD<decltype(dw)> bw = PromoteTo(dw, BitCast(duh, b));
+  return BitCast(d, Or(aw, BitCast(dw, detail::Slide1Up(BitCast(du, bw)))));
+}
+// 64-bit: cannot PromoteTo, but can Ext.
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  const auto idx = ShiftRight<1>(detail::Iota0(du));
+  return OddEven(TableLookupLanes(detail::Ext(d, b), idx),
+                 TableLookupLanes(detail::Ext(d, a), idx));
+}
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_GT_D(D, 2)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+  const Half<D> dh;
+  const Half<decltype(dh)> dq;
+  const VFromD<decltype(dh)> i0 =
+      InterleaveWhole(dh, LowerHalf(dq, a), LowerHalf(dq, b));
+  const VFromD<decltype(dh)> i1 =
+      InterleaveWhole(dh, UpperHalf(dq, a), UpperHalf(dq, b));
+  return Combine(d, i1, i0);
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  const detail::AdjustSimdTagToMinVecPow2<RepartitionToWide<decltype(du)>> dw;
+  const RepartitionToNarrow<decltype(dw)> du_src;
+
+  const VFromD<D> aw =
+      ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, a)));
+  const VFromD<D> bw =
+      ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, b)));
+  return Or(aw, detail::Slide1Up(bw));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  const auto idx = ShiftRight<1>(detail::Iota0(du));
+  return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
+}
+
+// ------------------------------ InterleaveWholeUpper
+
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
+  // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
+  // true and as the results of InterleaveWholeUpper are implementation-defined
+  // if Lanes(d) is less than 2.
+  const size_t half_N = Lanes(d) / 2;
+  return InterleaveWholeLower(d, detail::SlideDown(a, half_N),
+                              detail::SlideDown(b, half_N));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
+  // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
+  // true and as the results of InterleaveWholeUpper are implementation-defined
+  // if Lanes(d) is less than 2.
+  const size_t half_N = Lanes(d) / 2;
+  const RebindToUnsigned<decltype(d)> du;
+  const auto idx = detail::AddS(ShiftRight<1>(detail::Iota0(du)),
+                                static_cast<uint64_t>(half_N));
+  return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
+}
+
+// ------------------------------ InterleaveLower (InterleaveWholeLower)
+
+namespace detail {
+
+// Definitely at least 128 bit: match x86 semantics (independent blocks). Using
+// InterleaveWhole and 64-bit Compress avoids 8-bit overflow.
+template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
+HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
+  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+  const Twice<D> dt;
+  const RebindToUnsigned<decltype(dt)> dt_u;
+  const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
+  // Keep only even 128-bit blocks. This is faster than u64 ConcatEven
+  // because we only have a single vector.
+  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
+  const VFromD<decltype(dt_u)> idx_block =
+      ShiftRight<kShift>(detail::Iota0(dt_u));
+  const MFromD<decltype(dt_u)> is_even =
+      detail::EqS(detail::AndS(idx_block, 1), 0);
+  return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_even)));
+}
+template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
+HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
+  const Half<D> dh;
+  const VFromD<decltype(dh)> i0 =
+      InterleaveLowerBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
+  const VFromD<decltype(dh)> i1 =
+      InterleaveLowerBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
+  return Combine(d, i1, i0);
+}
+
+// As above, for the upper half of blocks.
+template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
+HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
+  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+  const Twice<D> dt;
+  const RebindToUnsigned<decltype(dt)> dt_u;
+  const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
+  // Keep only odd 128-bit blocks. This is faster than u64 ConcatEven
+  // because we only have a single vector.
+  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
+  const VFromD<decltype(dt_u)> idx_block =
+      ShiftRight<kShift>(detail::Iota0(dt_u));
+  const MFromD<decltype(dt_u)> is_odd =
+      detail::EqS(detail::AndS(idx_block, 1), 1);
+  return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_odd)));
+}
+template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
+HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
+  const Half<D> dh;
+  const VFromD<decltype(dh)> i0 =
+      InterleaveUpperBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
+  const VFromD<decltype(dh)> i1 =
+      InterleaveUpperBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
+  return Combine(d, i1, i0);
+}
+
+// RVV vectors are at least 128 bit when there is no fractional LMUL and no
+// cap. Used by functions with per-block behavior such as InterleaveLower.
+template <typename T, size_t N, int kPow2>
+constexpr bool IsGE128(Simd<T, N, kPow2> /* d */) {
+  return N * sizeof(T) >= 16 && kPow2 >= 0;
+}
+
+// Definitely less than 128-bit only if there is a small cap; fractional LMUL
+// might not be enough if vectors are large.
+template <typename T, size_t N, int kPow2>
+constexpr bool IsLT128(Simd<T, N, kPow2> /* d */) {
+  return N * sizeof(T) < 16;
+}
+
+}  // namespace detail
+
+#define HWY_RVV_IF_GE128_D(D) hwy::EnableIf<detail::IsGE128(D())>* = nullptr
+#define HWY_RVV_IF_LT128_D(D) hwy::EnableIf<detail::IsLT128(D())>* = nullptr
+#define HWY_RVV_IF_CAN128_D(D) \
+  hwy::EnableIf<!detail::IsLT128(D()) && !detail::IsGE128(D())>* = nullptr
+
+template <class D, class V, HWY_RVV_IF_GE128_D(D)>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+  return detail::InterleaveLowerBlocks(d, a, b);
+}
+
+// Single block: interleave without extra Compress.
+template <class D, class V, HWY_RVV_IF_LT128_D(D)>
 HWY_API V InterleaveLower(D d, const V a, const V b) {
   static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-  const auto i = detail::Iota0(du);
-  const auto idx_mod = ShiftRight<1>(
-      detail::AndS(i, static_cast<TU>(detail::LanesPerBlock(du) - 1)));
-  const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
-  const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
-  return IfThenElse(is_even, TableLookupLanes(a, idx),
-                    TableLookupLanes(b, idx));
+  return InterleaveWholeLower(d, a, b);
+}
+
+// Could be either; branch at runtime.
+template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+  if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
+    return InterleaveWholeLower(d, a, b);
+  }
+  // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
+  const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
+  return ResizeBitCast(d, detail::InterleaveLowerBlocks(
+                              d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
 }
 
 template <class V>
@@ -3799,21 +4134,30 @@ HWY_API V InterleaveLower(const V a, const V b) {
   return InterleaveLower(DFromV<V>(), a, b);
 }
 
-// ------------------------------ InterleaveUpper
+// ------------------------------ InterleaveUpper (Compress)
 
-template <class D, class V>
-HWY_API V InterleaveUpper(const D d, const V a, const V b) {
+template <class D, class V, HWY_RVV_IF_GE128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+  return detail::InterleaveUpperBlocks(d, a, b);
+}
+
+// Single block: interleave without extra Compress.
+template <class D, class V, HWY_RVV_IF_LT128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
   static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-  const size_t lpb = detail::LanesPerBlock(du);
-  const auto i = detail::Iota0(du);
-  const auto idx_mod = ShiftRight<1>(detail::AndS(i, static_cast<TU>(lpb - 1)));
-  const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
-  const auto idx = detail::AddS(idx_lower, static_cast<TU>(lpb / 2));
-  const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
-  return IfThenElse(is_even, TableLookupLanes(a, idx),
-                    TableLookupLanes(b, idx));
+  return InterleaveWholeUpper(d, a, b);
+}
+
+// Could be either; branch at runtime.
+template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+  if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
+    return InterleaveWholeUpper(d, a, b);
+  }
+  // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
+  const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
+  return ResizeBitCast(d, detail::InterleaveUpperBlocks(
+                              d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
 }
 
 // ------------------------------ ZipLower
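To make the new naming concrete: InterleaveWholeLower interleaves the lower halves of the whole vectors, while InterleaveLower keeps the x86-style per-128-bit-block semantics implemented by the block helpers above. Illustrative values, not part of the diff, for eight u32 lanes (two 128-bit blocks):

    // a = [a0 a1 a2 a3 a4 a5 a6 a7], b = [b0 b1 b2 b3 b4 b5 b6 b7]
    // InterleaveWholeLower(d, a, b) == [a0 b0 a1 b1 a2 b2 a3 b3]
    // InterleaveLower(d, a, b)      == [a0 b0 a1 b1 a4 b4 a5 b5]  // per block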
@@ -3840,67 +4184,98 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
 
 // ================================================== REDUCE
 
-// vector = f(vector, zero_m1)
+// We have ReduceSum; generic_ops-inl.h defines SumOfLanes via Set.
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+
+// scalar = f(vector, zero_m1)
 #define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                        MLEN, NAME, OP)                                         \
-  template <class D>                                                           \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
-      NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) {   \
-    return Set(d,                                                              \
-               GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
-                   v, v0, Lanes(d))));                                         \
+  template <size_t N>                                                          \
+  HWY_API HWY_RVV_T(BASE, SEW)                                                 \
+      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v,     \
+           HWY_RVV_V(BASE, SEW, m1) v0) {                                      \
+    return GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(     \
+        v, v0, Lanes(d)));                                                     \
   }
 
-// ------------------------------ SumOfLanes
+// detail::RedSum, detail::RedMin, and detail::RedMax are more efficient for
+// N=4 I8/U8 reductions on RVV than the default implementations of the N=4
+// I8/U8 ReduceSum/ReduceMin/ReduceMax operations in generic_ops-inl.h.
+#undef HWY_IF_REDUCE_D
+#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
+
+#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+#undef HWY_NATIVE_REDUCE_SUM_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_SUM_4_UI8
+#endif
+
+#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#endif
+
+// ------------------------------ ReduceSum
 
 namespace detail {
-HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL)
+HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL_VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL_VIRT)
 }  // namespace detail
 
-template <class D>
-HWY_API VFromD<D> SumOfLanes(D d, const VFromD<D> v) {
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
   const auto v0 = Zero(ScalableTag<TFromD<D>>());  // always m1
   return detail::RedSum(d, v, v0);
 }
 
-template <class D>
-HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
-  return GetLane(SumOfLanes(d, v));
-}
-
-// ------------------------------ MinOfLanes
+// ------------------------------ ReduceMin
 namespace detail {
-HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL)
+HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL_VIRT)
+HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL_VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL_VIRT)
 }  // namespace detail
 
-template <class D>
-HWY_API VFromD<D> MinOfLanes(D d, const VFromD<D> v) {
-  using T = TFromD<D>;
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMin(D d, const VFromD<D> v) {
   const ScalableTag<T> d1;  // always m1
-  const auto neutral = Set(d1, HighestValue<T>());
-  return detail::RedMin(d, v, neutral);
+  return detail::RedMin(d, v, Set(d1, HighestValue<T>()));
 }
 
-// ------------------------------ MaxOfLanes
+// ------------------------------ ReduceMax
 namespace detail {
-HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL)
+HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL_VIRT)
+HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL_VIRT)
+HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL_VIRT)
 }  // namespace detail
 
-template <class D>
-HWY_API VFromD<D> MaxOfLanes(D d, const VFromD<D> v) {
-  using T = TFromD<D>;
+template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
+HWY_API T ReduceMax(D d, const VFromD<D> v) {
   const ScalableTag<T> d1;  // always m1
-  const auto neutral = Set(d1, LowestValue<T>());
-  return detail::RedMax(d, v, neutral);
+  return detail::RedMax(d, v, Set(d1, LowestValue<T>()));
 }
 
 #undef HWY_RVV_REDUCE
 
+// ------------------------------ SumOfLanes
+
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceSum(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMin(d, v));
+}
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMax(d, v));
+}
+
 // ================================================== Ops with dependencies
 
 // ------------------------------ LoadInterleaved2
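The reductions now return scalars directly, and SumOfLanes/MinOfLanes/MaxOfLanes become thin Set wrappers. A hedged usage sketch, not part of the diff:

    namespace hn = hwy::HWY_NAMESPACE;
    // Assumes n is a multiple of Lanes(d), for brevity.
    float SumArray(const float* HWY_RESTRICT p, size_t n) {
      const hn::ScalableTag<float> d;
      auto sum = hn::Zero(d);
      for (size_t i = 0; i < n; i += hn::Lanes(d)) {
        sum = hn::Add(sum, hn::Load(d, p + i));
      }
      return hn::ReduceSum(d, sum);  // scalar; was GetLane(SumOfLanes(d, sum))
    }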
@@ -4229,15 +4604,87 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
 
 #endif  // HWY_HAVE_TUPLE
 
-// ------------------------------ ResizeBitCast
+// ------------------------------ Dup128VecFromValues (ResizeBitCast)
 
-template <class D, class FromV>
-HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
-  const DFromV<decltype(v)> d_from;
-  const Repartition<uint8_t, decltype(d_from)> du8_from;
-  const DFromV<VFromD<D>> d_to;
-  const Repartition<uint8_t, decltype(d_to)> du8_to;
-  return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
+  return Set(d, t0);
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
+  const auto even_lanes = Set(d, t0);
+#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
+  if (__builtin_constant_p(BitCastScalar<uint64_t>(t0) ==
+                           BitCastScalar<uint64_t>(t1)) &&
+      (BitCastScalar<uint64_t>(t0) == BitCastScalar<uint64_t>(t1))) {
+    return even_lanes;
+  }
+#endif
+
+  const auto odd_lanes = Set(d, t1);
+  return OddEven(odd_lanes, even_lanes);
+}
+
+namespace detail {
+
+#pragma pack(push, 1)
+
+template <class T>
+struct alignas(8) Vec64ValsWrapper {
+  static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
+  static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
+  T vals[8 / sizeof(T)];
+};
+
+#pragma pack(pop)
+
+}  // namespace detail
+
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
+  return ResizeBitCast(
+      d, Dup128VecFromValues(
+             du64,
+             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
+                 {t0, t1, t2, t3, t4, t5, t6, t7}}),
+             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
+                 {t8, t9, t10, t11, t12, t13, t14, t15}})));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
+  return ResizeBitCast(
+      d, Dup128VecFromValues(
+             du64,
+             BitCastScalar<uint64_t>(
+                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}}),
+             BitCastScalar<uint64_t>(
+                 detail::Vec64ValsWrapper<TFromD<D>>{{t4, t5, t6, t7}})));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
+  return ResizeBitCast(
+      d,
+      Dup128VecFromValues(du64,
+                          BitCastScalar<uint64_t>(
+                              detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}}),
+                          BitCastScalar<uint64_t>(
+                              detail::Vec64ValsWrapper<TFromD<D>>{{t2, t3}})));
 }
 
 // ------------------------------ PopulationCount (ShiftRight)
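Dup128VecFromValues fills every 128-bit block with the given lane values; the sub-64-bit overloads pack groups of lanes into two u64 constants by bit-casting a Vec64ValsWrapper array. A hedged usage sketch, not part of the diff:

    namespace hn = hwy::HWY_NAMESPACE;
    const hn::ScalableTag<uint32_t> d;
    // Every 128-bit block of v holds the pattern [1, 2, 3, 4].
    const auto v = hn::Dup128VecFromValues(d, 1, 2, 3, 4);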
@@ -4366,6 +4813,278 @@ HWY_API MFromD<D> FirstN(const D d, const size_t n) {
   return Eq(detail::SlideUp(one, zero, n), one);
 }
 
+// ------------------------------ LowerHalfOfMask/UpperHalfOfMask
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+// Target-specific implementations of LowerHalfOfMask, UpperHalfOfMask,
+// CombineMasks, OrderedDemote2MasksTo, and Dup128MaskFromMaskBits are possible
+// on RVV if the __riscv_vreinterpret_v_b*_u8m1 and
+// __riscv_vreinterpret_v_u8m1_b* intrinsics are available.
+
+// The __riscv_vreinterpret_v_b*_u8m1 and __riscv_vreinterpret_v_u8m1_b*
+// intrinsics are available with Clang 17 and later and GCC 14 and later.
+
+namespace detail {
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool1_t m) {
+  return __riscv_vreinterpret_v_b1_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool2_t m) {
+  return __riscv_vreinterpret_v_b2_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool4_t m) {
+  return __riscv_vreinterpret_v_b4_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool8_t m) {
+  return __riscv_vreinterpret_v_b8_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool16_t m) {
+  return __riscv_vreinterpret_v_b16_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool32_t m) {
+  return __riscv_vreinterpret_v_b32_u8m1(m);
+}
+
+HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool64_t m) {
+  return __riscv_vreinterpret_v_b64_u8m1(m);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool1_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b1(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool2_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b2(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool4_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b4(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool8_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b8(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool16_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b16(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool32_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b32(v);
+}
+
+template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool64_t>()>* = nullptr>
+HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
+  return __riscv_vreinterpret_v_u8m1_b64(v);
+}
+
+}  // namespace detail
+
+#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
+#undef HWY_NATIVE_LOWER_HALF_OF_MASK
+#else
+#define HWY_NATIVE_LOWER_HALF_OF_MASK
+#endif
+
+template <class D>
+HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
+  return detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(m));
+}
+
+#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
+#undef HWY_NATIVE_UPPER_HALF_OF_MASK
+#else
+#define HWY_NATIVE_UPPER_HALF_OF_MASK
+#endif
+
+template <class D>
+HWY_API MFromD<D> UpperHalfOfMask(D d, MFromD<Twice<D>> m) {
+  const size_t N = Lanes(d);
+
+  vuint8m1_t mask_bits = detail::MaskToU8MaskBitsVec(m);
+  mask_bits = ShiftRightSame(mask_bits, static_cast<int>(N & 7));
+  if (HWY_MAX_LANES_D(D) >= 8) {
+    mask_bits = SlideDownLanes(ScalableTag<uint8_t>(), mask_bits, N / 8);
+  }
+
+  return detail::U8MaskBitsVecToMask(d, mask_bits);
+}
+
+// ------------------------------ CombineMasks
+
+#ifdef HWY_NATIVE_COMBINE_MASKS
+#undef HWY_NATIVE_COMBINE_MASKS
+#else
+#define HWY_NATIVE_COMBINE_MASKS
+#endif
+
+template <class D>
+HWY_API MFromD<D> CombineMasks(D d, MFromD<Half<D>> hi, MFromD<Half<D>> lo) {
+  const Half<decltype(d)> dh;
+  const size_t half_N = Lanes(dh);
+
+  const auto ext_lo_mask =
+      And(detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(lo)),
+          FirstN(d, half_N));
+  vuint8m1_t hi_mask_bits = detail::MaskToU8MaskBitsVec(hi);
+  hi_mask_bits = ShiftLeftSame(hi_mask_bits, static_cast<int>(half_N & 7));
+  if (HWY_MAX_LANES_D(D) >= 8) {
+    hi_mask_bits =
+        SlideUpLanes(ScalableTag<uint8_t>(), hi_mask_bits, half_N / 8);
+  }
+
+  return Or(ext_lo_mask, detail::U8MaskBitsVecToMask(d, hi_mask_bits));
+}
+
+// ------------------------------ OrderedDemote2MasksTo
+
+#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#else
+#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#endif
+
+template <class DTo, class DFrom,
+          HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
+          class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
+          hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
+HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
+                                          MFromD<DFrom> a, MFromD<DFrom> b) {
+  return CombineMasks(d_to, b, a);
+}
+
+#endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+// ------------------------------ Dup128MaskFromMaskBits
+
+namespace detail {
+// Even though this is only used after checking if (kN < X), this helper
+// function prevents "shift count exceeded" errors.
+template <size_t kN, HWY_IF_LANES_LE(kN, 31)>
+constexpr unsigned MaxMaskBits() {
+  return (1u << kN) - 1;
+}
+template <size_t kN, HWY_IF_LANES_GT(kN, 31)>
+constexpr unsigned MaxMaskBits() {
+  return ~0u;
+}
+}  // namespace detail
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  return detail::U8MaskBitsVecToMask(
+      d, Set(ScalableTag<uint8_t>(), static_cast<uint8_t>(mask_bits)));
+#else
+  const RebindToUnsigned<decltype(d)> du8;
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
+      du64;
+
+  const auto bytes = ResizeBitCast(
+      du8, detail::AndS(
+               ResizeBitCast(du64, Set(du8, static_cast<uint8_t>(mask_bits))),
+               uint64_t{0x8040201008040201u}));
+  return detail::NeS(bytes, uint8_t{0});
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  const ScalableTag<uint16_t> du16;
+  // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes.
+  return detail::U8MaskBitsVecToMask(
+      d, BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<decltype(d)> du8;
+  const Repartition<uint16_t, decltype(du8)> du16;
+  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
+      du64;
+
+  // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
+  // and then bitcast the replicated mask_bits to a u8 vector.
+  const auto bytes = BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
+  // Replicate bytes 8x such that each byte contains the bit that governs it.
+  const auto rep8 = TableLookupLanes(bytes, ShiftRight<3>(detail::Iota0(du8)));
+
+  const auto masked_out_rep8 = ResizeBitCast(
+      du8,
+      detail::AndS(ResizeBitCast(du64, rep8), uint64_t{0x8040201008040201u}));
+  return detail::NeS(masked_out_rep8, uint8_t{0});
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes.
+  return detail::U8MaskBitsVecToMask(d,
+                                     Set(du8, static_cast<uint8_t>(mask_bits)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits =
+      Shl(Set(du, uint16_t{1}), Iota(du, uint16_t{0}));
+  return TestBit(Set(du, static_cast<uint16_t>(mask_bits)), bits);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  return detail::U8MaskBitsVecToMask(
+      d, Set(du8, static_cast<uint8_t>(mask_bits * 0x11)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits =
+      Shl(Set(du, uint32_t{1}), Iota(du, uint32_t{0}));
+  return TestBit(Set(du, static_cast<uint32_t>(mask_bits)), bits);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();
+
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  return detail::U8MaskBitsVecToMask(
+      d, Set(du8, static_cast<uint8_t>(mask_bits * 0x55)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 0, 1);
+  return TestBit(Set(du, static_cast<uint64_t>(mask_bits)), bits);
+#endif
+}
+
 // ------------------------------ Neg (Sub)
 
 template <class V, HWY_IF_SIGNED_V(V)>
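In the non-intrinsic fallbacks above, the constant 0x8040201008040201 isolates bit i in byte i once the 8-bit mask pattern has been replicated into every byte of a u64. Worked example, not part of the diff, for mask_bits = 0b10110100:

    // replicated bytes:  0xB4B4B4B4'B4B4B4B4
    // AND 0x80402010'08040201: byte i keeps only bit i of 0xB4
    // -> byte i is nonzero iff mask bit i is set, so NeS(bytes, 0) yields the
    //    lane mask {0,0,1,0,1,1,0,1} for lanes 0..7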
@@ -4385,7 +5104,7 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
 
 #if !HWY_HAVE_FLOAT16
 
-template <class V, HWY_IF_U16_D(DFromV<V>)>  // float16_t
+template <class V, HWY_IF_U16_D(DFromV<V>)>  // hwy::float16_t
 HWY_API V Neg(V v) {
   const DFromV<decltype(v)> d;
   const RebindToUnsigned<decltype(d)> du;
@@ -4479,6 +5198,14 @@ HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
   return Ne(v, v);
 }
 
+// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
+// We use a fused Set/comparison for IsFinite.
+#ifdef HWY_NATIVE_ISINF
+#undef HWY_NATIVE_ISINF
+#else
+#define HWY_NATIVE_ISINF
+#endif
+
 template <class V, class D = DFromV<V>>
 HWY_API MFromD<D> IsInf(const V v) {
   const D d;
@@ -4507,22 +5234,24 @@ HWY_API MFromD<D> IsFinite(const V v) {
 
 // ------------------------------ Iota (ConvertTo)
 
-template <class D, HWY_IF_UNSIGNED_D(D)>
-HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
-  return detail::AddS(detail::Iota0(d), first);
+template <class D, typename T2, HWY_IF_UNSIGNED_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
+  return detail::AddS(detail::Iota0(d), static_cast<TFromD<D>>(first));
 }
 
-template <class D, HWY_IF_SIGNED_D(D)>
-HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
+template <class D, typename T2, HWY_IF_SIGNED_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
   const RebindToUnsigned<D> du;
-  return detail::AddS(BitCast(d, detail::Iota0(du)), first);
+  return detail::AddS(BitCast(d, detail::Iota0(du)),
+                      static_cast<TFromD<D>>(first));
 }
 
-template <class D, HWY_IF_FLOAT_D(D)>
-HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
+template <class D, typename T2, HWY_IF_FLOAT_D(D)>
+HWY_API VFromD<D> Iota(const D d, T2 first) {
   const RebindToUnsigned<D> du;
   const RebindToSigned<D> di;
-  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first);
+  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))),
+                      ConvertScalarTo<TFromD<D>>(first));
 }
 
 // ------------------------------ MulEven/Odd (Mul, OddEven)
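Iota now accepts the start value as an arbitrary arithmetic type T2 and converts it to the lane type internally, so integer literals no longer require a cast for float or narrow-lane tags. A hedged usage sketch, not part of the diff:

    namespace hn = hwy::HWY_NAMESPACE;
    const hn::ScalableTag<float> d;
    const auto v = hn::Iota(d, 3);  // int literal; v = [3.0f, 4.0f, 5.0f, ...]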
@@ -4561,8 +5290,8 @@ HWY_INLINE V MulOdd(const V a, const V b) {
 // ------------------------------ ReorderDemote2To (OddEven, Combine)
 
 template <size_t N, int kPow2>
-HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> ReorderDemote2To(
-    Simd<bfloat16_t, N, kPow2> dbf16,
+HWY_API VFromD<Simd<hwy::bfloat16_t, N, kPow2>> ReorderDemote2To(
+    Simd<hwy::bfloat16_t, N, kPow2> dbf16,
     VFromD<RepartitionToWide<decltype(dbf16)>> a,
     VFromD<RepartitionToWide<decltype(dbf16)>> b) {
   const RebindToUnsigned<decltype(dbf16)> du16;
@@ -4618,8 +5347,8 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
 }
 
 // If LMUL is not the max, Combine first to avoid another DemoteTo.
-template <class DN, HWY_IF_BF16_D(DN), HWY_IF_POW2_LE_D(DN, 2), class V,
-          HWY_IF_F32_D(DFromV<V>),
+template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_LE_D(DN, 2),
+          class V, HWY_IF_F32_D(DFromV<V>),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
@@ -4629,8 +5358,8 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
 }
 
 // Max LMUL: must DemoteTo first, then Combine.
-template <class DN, HWY_IF_BF16_D(DN), HWY_IF_POW2_GT_D(DN, 2), class V,
-          HWY_IF_F32_D(DFromV<V>),
+template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_GT_D(DN, 2),
+          class V, HWY_IF_F32_D(DFromV<V>),
         class V2 = VFromD<Repartition<TFromV<V>, DN>>,
         hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
@@ -4654,7 +5383,7 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
 // ------------------------------ WidenMulPairwiseAdd
 
 template <class D32, HWY_IF_F32_D(D32),
-          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
+          class V16 = VFromD<Repartition<hwy::bfloat16_t, D32>>>
 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
   const RebindToUnsigned<decltype(df32)> du32;
   using VU32 = VFromD<decltype(du32)>;
@@ -4698,7 +5427,7 @@ namespace detail {
 // Non-overloaded wrapper function so we can define DF32 in template args.
 template <size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
           class VF32 = VFromD<DF32>,
-          class DBF16 = Repartition<bfloat16_t, Simd<float, N, kPow2>>>
+          class DBF16 = Repartition<hwy::bfloat16_t, Simd<float, N, kPow2>>>
 HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
                                            VFromD<DBF16> a, VFromD<DBF16> b,
                                            const VF32 sum0, VF32& sum1) {
@@ -4994,7 +5723,6 @@ HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
 }
 
 // ================================================== END MACROS
-namespace detail {  // for code folding
 #undef HWY_RVV_AVL
 #undef HWY_RVV_D
 #undef HWY_RVV_FOREACH
@@ -5055,15 +5783,19 @@ namespace detail {  // for code folding
 #undef HWY_RVV_FOREACH_UI32
 #undef HWY_RVV_FOREACH_UI3264
 #undef HWY_RVV_FOREACH_UI64
+#undef HWY_RVV_IF_EMULATED_D
+#undef HWY_RVV_IF_CAN128_D
+#undef HWY_RVV_IF_GE128_D
+#undef HWY_RVV_IF_LT128_D
 #undef HWY_RVV_INSERT_VXRM
 #undef HWY_RVV_M
 #undef HWY_RVV_RETM_ARGM
+#undef HWY_RVV_RETV_ARGMVV
 #undef HWY_RVV_RETV_ARGV
 #undef HWY_RVV_RETV_ARGVS
 #undef HWY_RVV_RETV_ARGVV
 #undef HWY_RVV_T
 #undef HWY_RVV_V
-}  // namespace detail
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy