@img/sharp-libvips-dev 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230)
  1. package/include/aom/aom_encoder.h +3 -3
  2. package/include/aom/aomcx.h +17 -8
  3. package/include/expat.h +21 -10
  4. package/include/expat_config.h +11 -5
  5. package/include/ffi.h +12 -25
  6. package/include/fontconfig/fontconfig.h +5 -3
  7. package/include/freetype2/freetype/config/ftoption.h +1 -1
  8. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
  9. package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
  10. package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
  11. package/include/glib-2.0/gio/gappinfo.h +0 -7
  12. package/include/glib-2.0/gio/gapplication.h +6 -0
  13. package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
  14. package/include/glib-2.0/gio/gasyncinitable.h +0 -7
  15. package/include/glib-2.0/gio/gasyncresult.h +0 -6
  16. package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
  17. package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
  18. package/include/glib-2.0/gio/gbytesicon.h +0 -5
  19. package/include/glib-2.0/gio/gcancellable.h +0 -5
  20. package/include/glib-2.0/gio/gconverter.h +0 -7
  21. package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
  22. package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
  23. package/include/glib-2.0/gio/gdatagrambased.h +0 -7
  24. package/include/glib-2.0/gio/gdatainputstream.h +0 -6
  25. package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
  26. package/include/glib-2.0/gio/gdbusinterface.h +0 -8
  27. package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
  28. package/include/glib-2.0/gio/gdbusmessage.h +2 -1
  29. package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
  30. package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
  31. package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
  32. package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
  33. package/include/glib-2.0/gio/gdbusproxy.h +0 -8
  34. package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
  35. package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
  36. package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
  37. package/include/glib-2.0/gio/gemblem.h +0 -5
  38. package/include/glib-2.0/gio/gemblemedicon.h +0 -5
  39. package/include/glib-2.0/gio/gfile.h +0 -10
  40. package/include/glib-2.0/gio/gfileenumerator.h +0 -5
  41. package/include/glib-2.0/gio/gfileicon.h +0 -5
  42. package/include/glib-2.0/gio/gfileinfo.h +0 -5
  43. package/include/glib-2.0/gio/gfileinputstream.h +0 -8
  44. package/include/glib-2.0/gio/gfileiostream.h +0 -8
  45. package/include/glib-2.0/gio/gfilemonitor.h +0 -5
  46. package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
  47. package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
  48. package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
  49. package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
  50. package/include/glib-2.0/gio/gicon.h +0 -5
  51. package/include/glib-2.0/gio/ginitable.h +0 -7
  52. package/include/glib-2.0/gio/ginputstream.h +0 -5
  53. package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
  54. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  55. package/include/glib-2.0/gio/gioenums.h +6 -1
  56. package/include/glib-2.0/gio/giomodule.h +0 -5
  57. package/include/glib-2.0/gio/giostream.h +0 -5
  58. package/include/glib-2.0/gio/giotypes.h +5 -108
  59. package/include/glib-2.0/gio/gloadableicon.h +0 -6
  60. package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
  61. package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
  62. package/include/glib-2.0/gio/gmountoperation.h +0 -6
  63. package/include/glib-2.0/gio/gnetworking.h +4 -0
  64. package/include/glib-2.0/gio/goutputstream.h +0 -9
  65. package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
  66. package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
  67. package/include/glib-2.0/gio/gproxy.h +0 -7
  68. package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
  69. package/include/glib-2.0/gio/gseekable.h +0 -5
  70. package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
  71. package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
  72. package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
  73. package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
  74. package/include/glib-2.0/gio/gsocket.h +13 -0
  75. package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
  76. package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
  77. package/include/glib-2.0/gio/gtask.h +12 -0
  78. package/include/glib-2.0/gio/gthemedicon.h +0 -5
  79. package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
  80. package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
  81. package/include/glib-2.0/gio/gvfs.h +0 -5
  82. package/include/glib-2.0/gio/gvolume.h +2 -2
  83. package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
  84. package/include/glib-2.0/girepository/gi-visibility.h +986 -0
  85. package/include/glib-2.0/girepository/giarginfo.h +100 -0
  86. package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
  87. package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
  88. package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
  89. package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
  90. package/include/glib-2.0/girepository/gienuminfo.h +82 -0
  91. package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
  92. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  93. package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
  94. package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
  95. package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
  96. package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
  97. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
  98. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  99. package/include/glib-2.0/girepository/girepository.h +247 -0
  100. package/include/glib-2.0/girepository/girffi.h +129 -0
  101. package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
  102. package/include/glib-2.0/girepository/gistructinfo.h +102 -0
  103. package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
  104. package/include/glib-2.0/girepository/gitypelib.h +61 -0
  105. package/include/glib-2.0/girepository/gitypes.h +421 -0
  106. package/include/glib-2.0/girepository/giunioninfo.h +105 -0
  107. package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
  108. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  109. package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
  110. package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
  111. package/include/glib-2.0/glib/deprecated/grel.h +0 -23
  112. package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
  113. package/include/glib-2.0/glib/gatomic.h +20 -20
  114. package/include/glib-2.0/glib/gbitlock.h +31 -0
  115. package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
  116. package/include/glib-2.0/glib/gchecksum.h +0 -10
  117. package/include/glib-2.0/glib/gdate.h +0 -9
  118. package/include/glib-2.0/glib/gdatetime.h +33 -1
  119. package/include/glib-2.0/glib/gdir.h +5 -0
  120. package/include/glib-2.0/glib/ghmac.h +0 -9
  121. package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
  122. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  123. package/include/glib-2.0/glib/gmacros.h +1 -0
  124. package/include/glib-2.0/glib/gmessages.h +11 -0
  125. package/include/glib-2.0/glib/gpathbuf.h +0 -7
  126. package/include/glib-2.0/glib/gslice.h +2 -0
  127. package/include/glib-2.0/glib/gstdio.h +1 -1
  128. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  129. package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
  130. package/include/glib-2.0/glib/gtestutils.h +5 -0
  131. package/include/glib-2.0/glib/gthread.h +216 -3
  132. package/include/glib-2.0/glib/gunicode.h +12 -2
  133. package/include/glib-2.0/glib/gvarianttype.h +1 -10
  134. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  135. package/include/glib-2.0/glib/gwin32.h +4 -4
  136. package/include/glib-2.0/glib-unix.h +214 -0
  137. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  138. package/include/glib-2.0/gobject/gbinding.h +0 -8
  139. package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
  140. package/include/glib-2.0/gobject/gclosure.h +1 -9
  141. package/include/glib-2.0/gobject/genums.h +6 -6
  142. package/include/glib-2.0/gobject/glib-types.h +44 -0
  143. package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
  144. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  145. package/include/glib-2.0/gobject/gobject.h +1 -16
  146. package/include/glib-2.0/gobject/gparam.h +3 -12
  147. package/include/glib-2.0/gobject/gsignal.h +16 -6
  148. package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
  149. package/include/glib-2.0/gobject/gtype.h +53 -20
  150. package/include/glib-2.0/gobject/gtypemodule.h +0 -7
  151. package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
  152. package/include/glib-2.0/gobject/gvaluearray.h +0 -7
  153. package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
  154. package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
  155. package/include/hwy/aligned_allocator.h +171 -6
  156. package/include/hwy/base.h +1765 -543
  157. package/include/hwy/cache_control.h +24 -6
  158. package/include/hwy/detect_compiler_arch.h +23 -2
  159. package/include/hwy/detect_targets.h +56 -13
  160. package/include/hwy/foreach_target.h +24 -0
  161. package/include/hwy/highway.h +20 -3
  162. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  163. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  164. package/include/hwy/ops/emu128-inl.h +271 -196
  165. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  166. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  167. package/include/hwy/ops/rvv-inl.h +1043 -311
  168. package/include/hwy/ops/scalar-inl.h +189 -159
  169. package/include/hwy/ops/set_macros-inl.h +66 -6
  170. package/include/hwy/ops/shared-inl.h +175 -56
  171. package/include/hwy/ops/wasm_128-inl.h +153 -136
  172. package/include/hwy/ops/x86_128-inl.h +1647 -646
  173. package/include/hwy/ops/x86_256-inl.h +1003 -370
  174. package/include/hwy/ops/x86_512-inl.h +948 -353
  175. package/include/hwy/per_target.h +4 -0
  176. package/include/hwy/profiler.h +648 -0
  177. package/include/hwy/robust_statistics.h +2 -2
  178. package/include/hwy/targets.h +18 -11
  179. package/include/hwy/timer.h +11 -0
  180. package/include/lcms2.h +46 -7
  181. package/include/lcms2_plugin.h +4 -4
  182. package/include/libheif/heif_version.h +2 -2
  183. package/include/libpng16/png.h +32 -29
  184. package/include/libpng16/pngconf.h +2 -2
  185. package/include/libpng16/pnglibconf.h +7 -2
  186. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  187. package/include/libxml2/libxml/HTMLparser.h +23 -0
  188. package/include/libxml2/libxml/SAX.h +0 -2
  189. package/include/libxml2/libxml/SAX2.h +0 -2
  190. package/include/libxml2/libxml/c14n.h +0 -2
  191. package/include/libxml2/libxml/dict.h +1 -0
  192. package/include/libxml2/libxml/encoding.h +16 -14
  193. package/include/libxml2/libxml/entities.h +4 -0
  194. package/include/libxml2/libxml/globals.h +15 -503
  195. package/include/libxml2/libxml/hash.h +57 -61
  196. package/include/libxml2/libxml/nanoftp.h +2 -2
  197. package/include/libxml2/libxml/parser.h +137 -18
  198. package/include/libxml2/libxml/parserInternals.h +1 -0
  199. package/include/libxml2/libxml/relaxng.h +2 -1
  200. package/include/libxml2/libxml/schemasInternals.h +1 -0
  201. package/include/libxml2/libxml/schematron.h +1 -0
  202. package/include/libxml2/libxml/threads.h +4 -11
  203. package/include/libxml2/libxml/tree.h +68 -20
  204. package/include/libxml2/libxml/uri.h +2 -1
  205. package/include/libxml2/libxml/valid.h +2 -0
  206. package/include/libxml2/libxml/xmlIO.h +65 -13
  207. package/include/libxml2/libxml/xmlerror.h +37 -8
  208. package/include/libxml2/libxml/xmlmemory.h +37 -40
  209. package/include/libxml2/libxml/xmlreader.h +6 -0
  210. package/include/libxml2/libxml/xmlregexp.h +2 -9
  211. package/include/libxml2/libxml/xmlsave.h +9 -0
  212. package/include/libxml2/libxml/xmlschemas.h +3 -0
  213. package/include/libxml2/libxml/xmlversion.h +28 -43
  214. package/include/libxml2/libxml/xpath.h +1 -1
  215. package/include/libxml2/libxml/xpathInternals.h +2 -1
  216. package/include/libxml2/libxml/xpointer.h +5 -4
  217. package/include/pango-1.0/pango/pango-features.h +3 -3
  218. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  219. package/include/pixman-1/pixman-version.h +3 -3
  220. package/include/pixman-1/pixman.h +9 -2
  221. package/include/png.h +32 -29
  222. package/include/pngconf.h +2 -2
  223. package/include/pnglibconf.h +7 -2
  224. package/include/vips/connection.h +9 -3
  225. package/include/vips/util.h +0 -9
  226. package/include/vips/version.h +4 -4
  227. package/include/zconf.h +3 -0
  228. package/include/zlib.h +3 -3
  229. package/package.json +1 -1
  230. package/versions.json +15 -15
@@ -33,6 +33,22 @@
33
33
  #define HWY_SVE_HAVE_2 0
34
34
  #endif
35
35
 
36
+ // If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
37
+ // create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
38
+ #if HWY_ARM_HAVE_SCALAR_BF16_TYPE && defined(__ARM_FEATURE_SVE_BF16)
39
+ #define HWY_SVE_HAVE_BF16_FEATURE 1
40
+ #else
41
+ #define HWY_SVE_HAVE_BF16_FEATURE 0
42
+ #endif
43
+
44
+ // HWY_SVE_HAVE_BF16_VEC is defined to 1 if the SVE svbfloat16_t vector type
45
+ // is supported, even if HWY_SVE_HAVE_BF16_FEATURE (= intrinsics) is 0.
46
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_COMPILER_GCC_ACTUAL >= 1000
47
+ #define HWY_SVE_HAVE_BF16_VEC 1
48
+ #else
49
+ #define HWY_SVE_HAVE_BF16_VEC 0
50
+ #endif
51
+
36
52
  HWY_BEFORE_NAMESPACE();
37
53
  namespace hwy {
38
54
  namespace HWY_NAMESPACE {
@@ -76,12 +92,20 @@ namespace detail { // for code folding
76
92
  #define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
77
93
  X_MACRO(float, f, 64, 32, NAME, OP)
78
94
 
79
- #if HWY_SVE_HAVE_BFLOAT16
80
- #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
95
+ #define HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP) \
81
96
  X_MACRO(bfloat, bf, 16, 16, NAME, OP)
97
+
98
+ #if HWY_SVE_HAVE_BF16_FEATURE
99
+ #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
100
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
101
+ // We have both f16 and bf16, so nothing is emulated.
102
+ #define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<false>* = nullptr
103
+ #define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
82
104
  #else
83
105
  #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
84
- #endif
106
+ #define HWY_SVE_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
107
+ #define HWY_SVE_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
108
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
85
109
 
86
110
  // For all element sizes:
87
111
  #define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
@@ -96,12 +120,16 @@ namespace detail { // for code folding
96
120
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \
97
121
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
98
122
 
123
+ #define HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP) \
124
+ HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
125
+ HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
126
+
99
127
  // HWY_SVE_FOREACH_F does not include HWY_SVE_FOREACH_BF16 because SVE lacks
100
128
  // bf16 overloads for some intrinsics (especially less-common arithmetic).
129
+ // However, this does include f16 because SVE supports it unconditionally.
101
130
  #define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
102
131
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
103
- HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
104
- HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
132
+ HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)
105
133
 
106
134
  // Commonly used type categories for a given element size:
107
135
  #define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
@@ -123,8 +151,7 @@ namespace detail { // for code folding
123
151
  #define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
124
152
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
125
153
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
126
- HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
127
- HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
154
+ HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)
128
155
 
129
156
  // Commonly used type categories:
130
157
  #define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
@@ -155,7 +182,9 @@ namespace detail { // for code folding
155
182
  };
156
183
 
157
184
  HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
158
- HWY_SVE_FOREACH_BF16(HWY_SPECIALIZE, _, _)
185
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
186
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
187
+ #endif
159
188
  #undef HWY_SPECIALIZE
160
189
 
161
190
  // Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
@@ -184,15 +213,24 @@ HWY_SVE_FOREACH_BF16(HWY_SPECIALIZE, _, _)
184
213
  }
185
214
 
186
215
  // vector = f(vector, vector), e.g. Add
216
+ #define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
217
+ HWY_API HWY_SVE_V(BASE, BITS) \
218
+ NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
219
+ return sv##OP##_##CHAR##BITS(a, b); \
220
+ }
221
+ // All-true mask
187
222
  #define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP) \
188
223
  HWY_API HWY_SVE_V(BASE, BITS) \
189
224
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
190
225
  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
191
226
  }
192
- #define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
193
- HWY_API HWY_SVE_V(BASE, BITS) \
194
- NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
195
- return sv##OP##_##CHAR##BITS(a, b); \
227
+ // User-specified mask. Mask=false value is undefined and must be set by caller
228
+ // because SVE instructions take it from one of the two inputs, whereas
229
+ // AVX-512, RVV and Highway allow a third argument.
230
+ #define HWY_SVE_RETV_ARGMVV(BASE, CHAR, BITS, HALF, NAME, OP) \
231
+ HWY_API HWY_SVE_V(BASE, BITS) \
232
+ NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
233
+ return sv##OP##_##CHAR##BITS##_x(m, a, b); \
196
234
  }
197
235
 
198
236
  #define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
@@ -266,24 +304,15 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
266
304
  HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
267
305
  HWY_SVE_FOREACH_BF16(HWY_SVE_FIRSTN, FirstN, whilelt)
268
306
 
269
- #undef HWY_SVE_FIRSTN
270
-
271
- template <class D>
272
- using MFromD = decltype(FirstN(D(), 0));
273
-
274
- #if !HWY_HAVE_FLOAT16
275
- template <class D, HWY_IF_F16_D(D)>
276
- MFromD<RebindToUnsigned<D>> FirstN(D /* tag */, size_t count) {
307
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
308
+ svbool_t FirstN(D /* tag */, size_t count) {
277
309
  return FirstN(RebindToUnsigned<D>(), count);
278
310
  }
279
- #endif // !HWY_HAVE_FLOAT16
280
311
 
281
- #if !HWY_SVE_HAVE_BFLOAT16
282
- template <class D, HWY_IF_BF16_D(D)>
283
- MFromD<RebindToUnsigned<D>> FirstN(D /* tag */, size_t count) {
284
- return FirstN(RebindToUnsigned<D>(), count);
285
- }
286
- #endif // !HWY_SVE_HAVE_BFLOAT16
312
+ #undef HWY_SVE_FIRSTN
313
+
314
+ template <class D>
315
+ using MFromD = svbool_t;
287
316
 
288
317
  namespace detail {
289
318
 
@@ -314,6 +343,17 @@ svbool_t MakeMask(D d) {
314
343
 
315
344
  } // namespace detail
316
345
 
346
+ #ifdef HWY_NATIVE_MASK_FALSE
347
+ #undef HWY_NATIVE_MASK_FALSE
348
+ #else
349
+ #define HWY_NATIVE_MASK_FALSE
350
+ #endif
351
+
352
+ template <class D>
353
+ HWY_API svbool_t MaskFalse(const D /*d*/) {
354
+ return detail::PFalse();
355
+ }
356
+
317
357
  // ================================================== INIT
318
358
 
319
359
  // ------------------------------ Set
@@ -326,14 +366,23 @@ svbool_t MakeMask(D d) {
326
366
  }
327
367
 
328
368
  HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
369
+ #if HWY_SVE_HAVE_BF16_FEATURE // for if-elif chain
329
370
  HWY_SVE_FOREACH_BF16(HWY_SVE_SET, Set, dup_n)
330
- #if !HWY_SVE_HAVE_BFLOAT16
371
+ #elif HWY_SVE_HAVE_BF16_VEC
331
372
  // Required for Zero and VFromD
332
- template <size_t N, int kPow2>
333
- svuint16_t Set(Simd<bfloat16_t, N, kPow2> d, bfloat16_t arg) {
334
- return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
373
+ template <class D, HWY_IF_BF16_D(D)>
374
+ HWY_API svbfloat16_t Set(D d, bfloat16_t arg) {
375
+ return svreinterpret_bf16_u16(
376
+ Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg)));
377
+ }
378
+ #else // neither bf16 feature nor vector: emulate with u16
379
+ // Required for Zero and VFromD
380
+ template <class D, HWY_IF_BF16_D(D)>
381
+ HWY_API svuint16_t Set(D d, bfloat16_t arg) {
382
+ const RebindToUnsigned<decltype(d)> du;
383
+ return Set(du, BitCastScalar<uint16_t>(arg));
335
384
  }
336
- #endif // HWY_SVE_HAVE_BFLOAT16
385
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
337
386
  #undef HWY_SVE_SET
338
387
 
339
388
  template <class D>
@@ -350,17 +399,6 @@ VFromD<D> Zero(D d) {
350
399
  return BitCast(d, Set(du, 0));
351
400
  }
352
401
 
353
- // ------------------------------ Undefined
354
-
355
- #define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
356
- template <size_t N, int kPow2> \
357
- HWY_API HWY_SVE_V(BASE, BITS) \
358
- NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
359
- return sv##OP##_##CHAR##BITS(); \
360
- }
361
-
362
- HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
363
-
364
402
  // ------------------------------ BitCast
365
403
 
366
404
  namespace detail {
@@ -387,24 +425,37 @@ namespace detail {
387
425
  return sv##OP##_##CHAR##BITS##_u8(v); \
388
426
  }
389
427
 
428
+ // U08 is special-cased, hence do not use FOREACH.
390
429
  HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
391
430
  HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
392
431
  HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
393
432
  HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
394
433
  HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
395
434
  HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)
396
- HWY_SVE_FOREACH_BF16(HWY_SVE_CAST, _, reinterpret)
397
435
 
398
436
  #undef HWY_SVE_CAST_NOP
399
437
  #undef HWY_SVE_CAST
400
438
 
401
- #if !HWY_SVE_HAVE_BFLOAT16
402
- template <size_t N, int kPow2>
403
- HWY_INLINE VBF16 BitCastFromByte(Simd<bfloat16_t, N, kPow2> /* d */,
404
- svuint8_t v) {
405
- return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
439
+ template <class V, HWY_SVE_IF_EMULATED_D(DFromV<V>)>
440
+ HWY_INLINE svuint8_t BitCastToByte(V v) {
441
+ #if HWY_SVE_HAVE_BF16_VEC
442
+ return svreinterpret_u8_bf16(v);
443
+ #else
444
+ const RebindToUnsigned<DFromV<V>> du;
445
+ return BitCastToByte(BitCast(du, v));
446
+ #endif
447
+ }
448
+
449
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
450
+ HWY_INLINE VFromD<D> BitCastFromByte(D d, svuint8_t v) {
451
+ #if HWY_SVE_HAVE_BF16_VEC
452
+ (void)d;
453
+ return svreinterpret_bf16_u8(v);
454
+ #else
455
+ const RebindToUnsigned<decltype(d)> du;
456
+ return BitCastFromByte(du, v);
457
+ #endif
406
458
  }
407
- #endif // !HWY_SVE_HAVE_BFLOAT16
408
459
 
409
460
  } // namespace detail
410
461
 
@@ -413,6 +464,23 @@ HWY_API VFromD<D> BitCast(D d, FromV v) {
413
464
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
414
465
  }
415
466
 
467
+ // ------------------------------ Undefined
468
+
469
+ #define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
470
+ template <size_t N, int kPow2> \
471
+ HWY_API HWY_SVE_V(BASE, BITS) \
472
+ NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
473
+ return sv##OP##_##CHAR##BITS(); \
474
+ }
475
+
476
+ HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
477
+
478
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
479
+ VFromD<D> Undefined(D d) {
480
+ const RebindToUnsigned<D> du;
481
+ return BitCast(d, Undefined(du));
482
+ }
483
+
416
484
  // ------------------------------ Tuple
417
485
 
418
486
  // tuples = f(d, v..), e.g. Create2
@@ -495,6 +563,102 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
495
563
  return BitCast(d, v);
496
564
  }
497
565
 
566
+ // ------------------------------ Dup128VecFromValues
567
+
568
+ template <class D, HWY_IF_I8_D(D)>
569
+ HWY_API svint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
570
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
571
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
572
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
573
+ TFromD<D> t11, TFromD<D> t12,
574
+ TFromD<D> t13, TFromD<D> t14,
575
+ TFromD<D> t15) {
576
+ return svdupq_n_s8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
577
+ t14, t15);
578
+ }
579
+
580
+ template <class D, HWY_IF_U8_D(D)>
581
+ HWY_API svuint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
582
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
583
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
584
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
585
+ TFromD<D> t11, TFromD<D> t12,
586
+ TFromD<D> t13, TFromD<D> t14,
587
+ TFromD<D> t15) {
588
+ return svdupq_n_u8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
589
+ t14, t15);
590
+ }
591
+
592
+ template <class D, HWY_IF_I16_D(D)>
593
+ HWY_API svint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
594
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
595
+ TFromD<D> t5, TFromD<D> t6,
596
+ TFromD<D> t7) {
597
+ return svdupq_n_s16(t0, t1, t2, t3, t4, t5, t6, t7);
598
+ }
599
+
600
+ template <class D, HWY_IF_U16_D(D)>
601
+ HWY_API svuint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
602
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
603
+ TFromD<D> t5, TFromD<D> t6,
604
+ TFromD<D> t7) {
605
+ return svdupq_n_u16(t0, t1, t2, t3, t4, t5, t6, t7);
606
+ }
607
+
608
+ template <class D, HWY_IF_F16_D(D)>
609
+ HWY_API svfloat16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
610
+ TFromD<D> t2, TFromD<D> t3,
611
+ TFromD<D> t4, TFromD<D> t5,
612
+ TFromD<D> t6, TFromD<D> t7) {
613
+ return svdupq_n_f16(t0, t1, t2, t3, t4, t5, t6, t7);
614
+ }
615
+
616
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
617
+ HWY_API VBF16 Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, TFromD<D> t2,
618
+ TFromD<D> t3, TFromD<D> t4, TFromD<D> t5,
619
+ TFromD<D> t6, TFromD<D> t7) {
620
+ const RebindToUnsigned<decltype(d)> du;
621
+ return BitCast(
622
+ d, Dup128VecFromValues(
623
+ du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
624
+ BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
625
+ BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
626
+ BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
627
+ }
628
+
629
+ template <class D, HWY_IF_I32_D(D)>
630
+ HWY_API svint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
631
+ TFromD<D> t2, TFromD<D> t3) {
632
+ return svdupq_n_s32(t0, t1, t2, t3);
633
+ }
634
+
635
+ template <class D, HWY_IF_U32_D(D)>
636
+ HWY_API svuint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
637
+ TFromD<D> t2, TFromD<D> t3) {
638
+ return svdupq_n_u32(t0, t1, t2, t3);
639
+ }
640
+
641
+ template <class D, HWY_IF_F32_D(D)>
642
+ HWY_API svfloat32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
643
+ TFromD<D> t2, TFromD<D> t3) {
644
+ return svdupq_n_f32(t0, t1, t2, t3);
645
+ }
646
+
647
+ template <class D, HWY_IF_I64_D(D)>
648
+ HWY_API svint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
649
+ return svdupq_n_s64(t0, t1);
650
+ }
651
+
652
+ template <class D, HWY_IF_U64_D(D)>
653
+ HWY_API svuint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
654
+ return svdupq_n_u64(t0, t1);
655
+ }
656
+
657
+ template <class D, HWY_IF_F64_D(D)>
658
+ HWY_API svfloat64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
659
+ return svdupq_n_f64(t0, t1);
660
+ }
661
+
498
662
  // ================================================== LOGICAL
499
663
 
500
664
  // detail::*N() functions accept a scalar argument to avoid extra Set().
@@ -632,9 +796,37 @@ HWY_API VBF16 Neg(VBF16 v) {
632
796
  return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
633
797
  }
634
798
 
799
+ // ------------------------------ SaturatedNeg
800
+ #if HWY_SVE_HAVE_2
801
+ #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
802
+ #undef HWY_NATIVE_SATURATED_NEG_8_16_32
803
+ #else
804
+ #define HWY_NATIVE_SATURATED_NEG_8_16_32
805
+ #endif
806
+
807
+ #ifdef HWY_NATIVE_SATURATED_NEG_64
808
+ #undef HWY_NATIVE_SATURATED_NEG_64
809
+ #else
810
+ #define HWY_NATIVE_SATURATED_NEG_64
811
+ #endif
812
+
813
+ HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedNeg, qneg)
814
+ #endif // HWY_SVE_HAVE_2
815
+
635
816
  // ------------------------------ Abs
636
817
  HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)
637
818
 
819
+ // ------------------------------ SaturatedAbs
820
+ #if HWY_SVE_HAVE_2
821
+ #ifdef HWY_NATIVE_SATURATED_ABS
822
+ #undef HWY_NATIVE_SATURATED_ABS
823
+ #else
824
+ #define HWY_NATIVE_SATURATED_ABS
825
+ #endif
826
+
827
+ HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedAbs, qabs)
828
+ #endif // HWY_SVE_HAVE_2
829
+
638
830
  // ================================================== ARITHMETIC
639
831
 
640
832
  // Per-target flags to prevent generic_ops-inl.h defining Add etc.
@@ -676,13 +868,107 @@ HWY_API svuint64_t SumsOf8(const svuint8_t v) {
676
868
 
677
869
  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
678
870
  // Compute pairwise sum of u32 and extend to u64.
679
- // TODO(janwas): on SVE2, we can instead use svaddp.
871
+
872
+ #if HWY_SVE_HAVE_2
873
+ return svadalp_u64_x(pg, Zero(du64), sums_of_4);
874
+ #else
680
875
  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
681
876
  // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
682
877
  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
683
878
  return Add(hi, lo);
879
+ #endif
880
+ }
881
+
882
+ HWY_API svint64_t SumsOf8(const svint8_t v) {
883
+ const ScalableTag<int32_t> di32;
884
+ const ScalableTag<int64_t> di64;
885
+ const svbool_t pg = detail::PTrue(di64);
886
+
887
+ const svint32_t sums_of_4 = svdot_n_s32(Zero(di32), v, 1);
888
+ #if HWY_SVE_HAVE_2
889
+ return svadalp_s64_x(pg, Zero(di64), sums_of_4);
890
+ #else
891
+ const svint64_t hi = svasr_n_s64_x(pg, BitCast(di64, sums_of_4), 32);
892
+ // Isolate the lower 32 bits (to be added to the upper 32 and sign-extended)
893
+ const svint64_t lo = svextw_s64_x(pg, BitCast(di64, sums_of_4));
894
+ return Add(hi, lo);
895
+ #endif
896
+ }
897
+
898
+ // ------------------------------ SumsOf2
899
+ #if HWY_SVE_HAVE_2
900
+ namespace detail {
901
+
902
+ HWY_INLINE svint16_t SumsOf2(hwy::SignedTag /*type_tag*/,
903
+ hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
904
+ const ScalableTag<int16_t> di16;
905
+ const svbool_t pg = detail::PTrue(di16);
906
+ return svadalp_s16_x(pg, Zero(di16), v);
907
+ }
908
+
909
+ HWY_INLINE svuint16_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
910
+ hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
911
+ const ScalableTag<uint16_t> du16;
912
+ const svbool_t pg = detail::PTrue(du16);
913
+ return svadalp_u16_x(pg, Zero(du16), v);
914
+ }
915
+
916
+ HWY_INLINE svint32_t SumsOf2(hwy::SignedTag /*type_tag*/,
917
+ hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
918
+ const ScalableTag<int32_t> di32;
919
+ const svbool_t pg = detail::PTrue(di32);
920
+ return svadalp_s32_x(pg, Zero(di32), v);
921
+ }
922
+
923
+ HWY_INLINE svuint32_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
924
+ hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
925
+ const ScalableTag<uint32_t> du32;
926
+ const svbool_t pg = detail::PTrue(du32);
927
+ return svadalp_u32_x(pg, Zero(du32), v);
928
+ }
929
+
930
+ HWY_INLINE svint64_t SumsOf2(hwy::SignedTag /*type_tag*/,
931
+ hwy::SizeTag<4> /*lane_size_tag*/, svint32_t v) {
932
+ const ScalableTag<int64_t> di64;
933
+ const svbool_t pg = detail::PTrue(di64);
934
+ return svadalp_s64_x(pg, Zero(di64), v);
935
+ }
936
+
937
+ HWY_INLINE svuint64_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
938
+ hwy::SizeTag<4> /*lane_size_tag*/, svuint32_t v) {
939
+ const ScalableTag<uint64_t> du64;
940
+ const svbool_t pg = detail::PTrue(du64);
941
+ return svadalp_u64_x(pg, Zero(du64), v);
684
942
  }
685
943
 
944
+ } // namespace detail
945
+ #endif // HWY_SVE_HAVE_2
946
+
947
+ // ------------------------------ SumsOf4
948
+ namespace detail {
949
+
950
+ HWY_INLINE svint32_t SumsOf4(hwy::SignedTag /*type_tag*/,
951
+ hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
952
+ return svdot_n_s32(Zero(ScalableTag<int32_t>()), v, 1);
953
+ }
954
+
955
+ HWY_INLINE svuint32_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
956
+ hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
957
+ return svdot_n_u32(Zero(ScalableTag<uint32_t>()), v, 1);
958
+ }
959
+
960
+ HWY_INLINE svint64_t SumsOf4(hwy::SignedTag /*type_tag*/,
961
+ hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
962
+ return svdot_n_s64(Zero(ScalableTag<int64_t>()), v, 1);
963
+ }
964
+
965
+ HWY_INLINE svuint64_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
966
+ hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
967
+ return svdot_n_u64(Zero(ScalableTag<uint64_t>()), v, 1);
968
+ }
969
+
970
+ } // namespace detail
971
+
686
972
  // ------------------------------ SaturatedAdd
687
973
 
688
974
  #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
@@ -830,6 +1116,14 @@ HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
830
1116
  }
831
1117
 
832
1118
  // ------------------------------ Div
1119
+ #ifdef HWY_NATIVE_INT_DIV
1120
+ #undef HWY_NATIVE_INT_DIV
1121
+ #else
1122
+ #define HWY_NATIVE_INT_DIV
1123
+ #endif
1124
+
1125
+ HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, Div, div)
1126
+ HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGPVV, Div, div)
833
1127
  HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)
834
1128
 
835
1129
  // ------------------------------ ApproximateReciprocal
@@ -983,16 +1277,37 @@ HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
983
1277
  HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
984
1278
  #undef HWY_SVE_IF_THEN_ELSE
985
1279
 
1280
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
1281
+ HWY_API V IfThenElse(const svbool_t mask, V yes, V no) {
1282
+ const RebindToUnsigned<D> du;
1283
+ return BitCast(
1284
+ D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
1285
+ }
1286
+
986
1287
  // ------------------------------ IfThenElseZero
987
- template <class V>
1288
+
1289
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
988
1290
  HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
989
- return IfThenElse(mask, yes, Zero(DFromV<V>()));
1291
+ return IfThenElse(mask, yes, Zero(D()));
1292
+ }
1293
+
1294
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
1295
+ HWY_API V IfThenElseZero(const svbool_t mask, V yes) {
1296
+ const RebindToUnsigned<D> du;
1297
+ return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
990
1298
  }
991
1299
 
992
1300
  // ------------------------------ IfThenZeroElse
993
- template <class V>
1301
+
1302
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
994
1303
  HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
995
- return IfThenElse(mask, Zero(DFromV<V>()), no);
1304
+ return IfThenElse(mask, Zero(D()), no);
1305
+ }
1306
+
1307
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
1308
+ HWY_API V IfThenZeroElse(const svbool_t mask, V no) {
1309
+ const RebindToUnsigned<D> du;
1310
+ return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
996
1311
  }
997
1312
 
998
1313
  // ------------------------------ Additional mask logical operations
@@ -1016,6 +1331,162 @@ HWY_API svbool_t SetAtOrAfterFirst(svbool_t m) {
1016
1331
  return Not(SetBeforeFirst(m));
1017
1332
  }
1018
1333
 
1334
+ // ------------------------------ PromoteMaskTo
1335
+
1336
+ #ifdef HWY_NATIVE_PROMOTE_MASK_TO
1337
+ #undef HWY_NATIVE_PROMOTE_MASK_TO
1338
+ #else
1339
+ #define HWY_NATIVE_PROMOTE_MASK_TO
1340
+ #endif
1341
+
1342
+ template <class DTo, class DFrom,
1343
+ HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) * 2)>
1344
+ HWY_API svbool_t PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
1345
+ return svunpklo_b(m);
1346
+ }
1347
+
1348
+ template <class DTo, class DFrom,
1349
+ HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>) * 2)>
1350
+ HWY_API svbool_t PromoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
1351
+ using TFrom = TFromD<DFrom>;
1352
+ using TWFrom = MakeWide<MakeUnsigned<TFrom>>;
1353
+ static_assert(sizeof(TWFrom) > sizeof(TFrom),
1354
+ "sizeof(TWFrom) > sizeof(TFrom) must be true");
1355
+
1356
+ const Rebind<TWFrom, decltype(d_from)> dw_from;
1357
+ return PromoteMaskTo(d_to, dw_from, PromoteMaskTo(dw_from, d_from, m));
1358
+ }
1359
+
1360
+ // ------------------------------ DemoteMaskTo
1361
+
1362
+ #ifdef HWY_NATIVE_DEMOTE_MASK_TO
1363
+ #undef HWY_NATIVE_DEMOTE_MASK_TO
1364
+ #else
1365
+ #define HWY_NATIVE_DEMOTE_MASK_TO
1366
+ #endif
1367
+
1368
+ template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 1),
1369
+ HWY_IF_T_SIZE_D(DFrom, 2)>
1370
+ HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
1371
+ return svuzp1_b8(m, m);
1372
+ }
1373
+
1374
+ template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 2),
1375
+ HWY_IF_T_SIZE_D(DFrom, 4)>
1376
+ HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
1377
+ return svuzp1_b16(m, m);
1378
+ }
1379
+
1380
+ template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 4),
1381
+ HWY_IF_T_SIZE_D(DFrom, 8)>
1382
+ HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
1383
+ return svuzp1_b32(m, m);
1384
+ }
1385
+
1386
+ template <class DTo, class DFrom,
1387
+ HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) / 4)>
1388
+ HWY_API svbool_t DemoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
1389
+ using TFrom = TFromD<DFrom>;
1390
+ using TNFrom = MakeNarrow<MakeUnsigned<TFrom>>;
1391
+ static_assert(sizeof(TNFrom) < sizeof(TFrom),
1392
+ "sizeof(TNFrom) < sizeof(TFrom) must be true");
1393
+
1394
+ const Rebind<TNFrom, decltype(d_from)> dn_from;
1395
+ return DemoteMaskTo(d_to, dn_from, DemoteMaskTo(dn_from, d_from, m));
1396
+ }
1397
+
1398
+ // ------------------------------ LowerHalfOfMask
1399
+ #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
1400
+ #undef HWY_NATIVE_LOWER_HALF_OF_MASK
1401
+ #else
1402
+ #define HWY_NATIVE_LOWER_HALF_OF_MASK
1403
+ #endif
1404
+
1405
+ template <class D>
1406
+ HWY_API svbool_t LowerHalfOfMask(D /*d*/, svbool_t m) {
1407
+ return m;
1408
+ }
1409
+
1410
+ // ------------------------------ MaskedAddOr etc. (IfThenElse)
1411
+
1412
+ #ifdef HWY_NATIVE_MASKED_ARITH
1413
+ #undef HWY_NATIVE_MASKED_ARITH
1414
+ #else
1415
+ #define HWY_NATIVE_MASKED_ARITH
1416
+ #endif
1417
+
1418
+ namespace detail {
1419
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMin, min)
1420
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMax, max)
1421
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedAdd, add)
1422
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedSub, sub)
1423
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMul, mul)
1424
+ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
1425
+ HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
1426
+ HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
1427
+ #if HWY_SVE_HAVE_2
1428
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatAdd, qadd)
1429
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatSub, qsub)
1430
+ #endif
1431
+ } // namespace detail
1432
+
1433
+ template <class V, class M>
1434
+ HWY_API V MaskedMinOr(V no, M m, V a, V b) {
1435
+ return IfThenElse(m, detail::MaskedMin(m, a, b), no);
1436
+ }
1437
+
1438
+ template <class V, class M>
1439
+ HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
1440
+ return IfThenElse(m, detail::MaskedMax(m, a, b), no);
1441
+ }
1442
+
1443
+ template <class V, class M>
1444
+ HWY_API V MaskedAddOr(V no, M m, V a, V b) {
1445
+ return IfThenElse(m, detail::MaskedAdd(m, a, b), no);
1446
+ }
1447
+
1448
+ template <class V, class M>
1449
+ HWY_API V MaskedSubOr(V no, M m, V a, V b) {
1450
+ return IfThenElse(m, detail::MaskedSub(m, a, b), no);
1451
+ }
1452
+
1453
+ template <class V, class M>
1454
+ HWY_API V MaskedMulOr(V no, M m, V a, V b) {
1455
+ return IfThenElse(m, detail::MaskedMul(m, a, b), no);
1456
+ }
1457
+
1458
+ template <class V, class M,
1459
+ HWY_IF_T_SIZE_ONE_OF_V(
1460
+ V, (hwy::IsSame<TFromV<V>, hwy::float16_t>() ? (1 << 2) : 0) |
1461
+ (1 << 4) | (1 << 8))>
1462
+ HWY_API V MaskedDivOr(V no, M m, V a, V b) {
1463
+ return IfThenElse(m, detail::MaskedDiv(m, a, b), no);
1464
+ }
1465
+
1466
+ // I8/U8/I16/U16 MaskedDivOr is implemented after I8/U8/I16/U16 Div
1467
+
1468
+ #if HWY_SVE_HAVE_2
1469
+ template <class V, class M>
1470
+ HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
1471
+ return IfThenElse(m, detail::MaskedSatAdd(m, a, b), no);
1472
+ }
1473
+
1474
+ template <class V, class M>
1475
+ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
1476
+ return IfThenElse(m, detail::MaskedSatSub(m, a, b), no);
1477
+ }
1478
+ #else
1479
+ template <class V, class M>
1480
+ HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
1481
+ return IfThenElse(m, SaturatedAdd(a, b), no);
1482
+ }
1483
+
1484
+ template <class V, class M>
1485
+ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
1486
+ return IfThenElse(m, SaturatedSub(a, b), no);
1487
+ }
1488
+ #endif
1489
+
1019
1490
  // ================================================== COMPARE
1020
1491
 
1021
1492
  // mask = f(vector, vector)
@@ -1078,7 +1549,8 @@ HWY_API svbool_t TestBit(const V a, const V bit) {
1078
1549
  // ------------------------------ MaskFromVec (Ne)
1079
1550
  template <class V>
1080
1551
  HWY_API svbool_t MaskFromVec(const V v) {
1081
- return detail::NeN(v, static_cast<TFromV<V>>(0));
1552
+ using T = TFromV<V>;
1553
+ return detail::NeN(v, ConvertScalarTo<T>(0));
1082
1554
  }
1083
1555
 
1084
1556
  // ------------------------------ VecFromMask
@@ -1159,14 +1631,27 @@ HWY_API svbool_t IsNaN(const V v) {
1159
1631
  return Ne(v, v); // could also use cmpuo
1160
1632
  }
1161
1633
 
1634
+ // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
1635
+ // We use a fused Set/comparison for IsFinite.
1636
+ #ifdef HWY_NATIVE_ISINF
1637
+ #undef HWY_NATIVE_ISINF
1638
+ #else
1639
+ #define HWY_NATIVE_ISINF
1640
+ #endif
1641
+
1162
1642
  template <class V>
1163
1643
  HWY_API svbool_t IsInf(const V v) {
1164
1644
  using T = TFromV<V>;
1165
1645
  const DFromV<decltype(v)> d;
1646
+ const RebindToUnsigned<decltype(d)> du;
1166
1647
  const RebindToSigned<decltype(d)> di;
1167
- const VFromD<decltype(di)> vi = BitCast(di, v);
1168
- // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1169
- return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
1648
+
1649
+ // 'Shift left' to clear the sign bit
1650
+ const VFromD<decltype(du)> vu = BitCast(du, v);
1651
+ const VFromD<decltype(du)> v2 = Add(vu, vu);
1652
+ // Check for exponent=max and mantissa=0.
1653
+ const VFromD<decltype(di)> max2 = Set(di, hwy::MaxExponentTimes2<T>());
1654
+ return RebindMask(d, Eq(v2, BitCast(du, max2)));
1170
1655
  }
1171
1656
 
1172
1657
  // Returns whether normal/subnormal/zero.
@@ -1187,147 +1672,135 @@ HWY_API svbool_t IsFinite(const V v) {
1187
1672
 
1188
1673
  // ================================================== MEMORY
1189
1674
 
1190
- // ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream
1675
+ // ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream
1191
1676
 
1192
- #define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
1193
- template <size_t N, int kPow2> \
1194
- HWY_API HWY_SVE_V(BASE, BITS) \
1195
- NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1196
- const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1197
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1198
- return sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
1199
- reinterpret_cast<const T*>(p)); \
1677
+ #define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \
1678
+ template <size_t N, int kPow2> \
1679
+ HWY_API HWY_SVE_V(BASE, BITS) \
1680
+ LoadU(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1681
+ const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1682
+ return svld1_##CHAR##BITS(detail::MakeMask(d), \
1683
+ detail::NativeLanePointer(p)); \
1684
+ } \
1685
+ template <size_t N, int kPow2> \
1686
+ HWY_API HWY_SVE_V(BASE, BITS) \
1687
+ MaskedLoad(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1688
+ const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1689
+ return svld1_##CHAR##BITS(m, detail::NativeLanePointer(p)); \
1690
+ } \
1691
+ template <size_t N, int kPow2> \
1692
+ HWY_API void StoreU(HWY_SVE_V(BASE, BITS) v, \
1693
+ HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1694
+ HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1695
+ svst1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), v); \
1696
+ } \
1697
+ template <size_t N, int kPow2> \
1698
+ HWY_API void Stream(HWY_SVE_V(BASE, BITS) v, \
1699
+ HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1700
+ HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1701
+ svstnt1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), \
1702
+ v); \
1703
+ } \
1704
+ template <size_t N, int kPow2> \
1705
+ HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1706
+ HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1707
+ HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1708
+ svst1_##CHAR##BITS(m, detail::NativeLanePointer(p), v); \
1200
1709
  }
1201
1710
 
1202
- #define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
1203
- template <size_t N, int kPow2> \
1204
- HWY_API HWY_SVE_V(BASE, BITS) \
1205
- NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1206
- const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1207
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1208
- return sv##OP##_##CHAR##BITS(m, reinterpret_cast<const T*>(p)); \
1209
- }
1711
+ HWY_SVE_FOREACH(HWY_SVE_MEM, _, _)
1712
+ HWY_SVE_FOREACH_BF16(HWY_SVE_MEM, _, _)
1210
1713
 
1211
- #define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
1212
- template <size_t N, int kPow2> \
1213
- HWY_API HWY_SVE_V(BASE, BITS) \
1214
- NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1215
- const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1216
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1217
- /* All-true predicate to load all 128 bits. */ \
1218
- return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), \
1219
- reinterpret_cast<const T*>(p)); \
1220
- }
1714
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1715
+ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1716
+ const RebindToUnsigned<decltype(d)> du;
1717
+ return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
1718
+ }
1221
1719
 
1222
- #define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
1223
- template <size_t N, int kPow2> \
1224
- HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
1225
- HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1226
- HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1227
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1228
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), reinterpret_cast<T*>(p), v); \
1229
- }
1720
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1721
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1722
+ const RebindToUnsigned<decltype(d)> du;
1723
+ StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
1724
+ }
1230
1725
 
1231
- #define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
1232
- template <size_t N, int kPow2> \
1233
- HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1234
- HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1235
- HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1236
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1237
- sv##OP##_##CHAR##BITS(m, reinterpret_cast<T*>(p), v); \
1238
- }
1726
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1727
+ HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
1728
+ const TFromD<D>* HWY_RESTRICT p) {
1729
+ const RebindToUnsigned<decltype(d)> du;
1730
+ return BitCast(d,
1731
+ MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
1732
+ }
1239
1733
 
1240
- HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1)
1241
- HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
1242
- HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1)
1243
- HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1)
1244
- HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1)
1734
+ // MaskedLoadOr is generic and does not require emulation.
1245
1735
 
1246
- HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD, Load, ld1)
1247
- HWY_SVE_FOREACH_BF16(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
1248
- HWY_SVE_FOREACH_BF16(HWY_SVE_STORE, Store, st1)
1249
- HWY_SVE_FOREACH_BF16(HWY_SVE_STORE, Stream, stnt1)
1250
- HWY_SVE_FOREACH_BF16(HWY_SVE_BLENDED_STORE, BlendedStore, st1)
1736
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1737
+ HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1738
+ TFromD<D>* HWY_RESTRICT p) {
1739
+ const RebindToUnsigned<decltype(d)> du;
1740
+ BlendedStore(BitCast(du, v), RebindMask(du, m), du,
1741
+ detail::U16LanePointer(p));
1742
+ }
1743
+
1744
+ #undef HWY_SVE_MEM
1251
1745
 
1252
1746
  #if HWY_TARGET != HWY_SVE2_128
1253
1747
  namespace detail {
1254
- HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
1255
- } // namespace detail
1256
- #endif // HWY_TARGET != HWY_SVE2_128
1257
-
1258
- #undef HWY_SVE_LOAD
1259
- #undef HWY_SVE_MASKED_LOAD
1260
- #undef HWY_SVE_LOAD_DUP128
1261
- #undef HWY_SVE_STORE
1262
- #undef HWY_SVE_BLENDED_STORE
1748
+ #define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
1749
+ template <size_t N, int kPow2> \
1750
+ HWY_API HWY_SVE_V(BASE, BITS) \
1751
+ NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1752
+ const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1753
+ /* All-true predicate to load all 128 bits. */ \
1754
+ return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), \
1755
+ detail::NativeLanePointer(p)); \
1756
+ }
1263
1757
 
1264
- #if !HWY_SVE_HAVE_BFLOAT16
1758
+ HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
1759
+ HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
1265
1760
 
1266
- template <size_t N, int kPow2>
1267
- HWY_API VBF16 Load(Simd<bfloat16_t, N, kPow2> d,
1268
- const bfloat16_t* HWY_RESTRICT p) {
1269
- return Load(RebindToUnsigned<decltype(d)>(),
1270
- reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
1761
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1762
+ HWY_API VFromD<D> LoadDupFull128(D d, const TFromD<D>* HWY_RESTRICT p) {
1763
+ const RebindToUnsigned<decltype(d)> du;
1764
+ return BitCast(d, LoadDupFull128(du, detail::U16LanePointer(p)));
1271
1765
  }
1272
1766
 
1273
- #endif // !HWY_SVE_HAVE_BFLOAT16
1767
+ } // namespace detail
1768
+ #endif // HWY_TARGET != HWY_SVE2_128
1274
1769
 
1275
1770
  #if HWY_TARGET == HWY_SVE2_128
1276
- // On the HWY_SVE2_128 target, LoadDup128 is the same as Load since vectors
1771
+ // On the HWY_SVE2_128 target, LoadDup128 is the same as LoadU since vectors
1277
1772
  // cannot exceed 16 bytes on the HWY_SVE2_128 target.
1278
1773
  template <class D>
1279
1774
  HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1280
- return Load(d, p);
1775
+ return LoadU(d, p);
1281
1776
  }
1282
1777
  #else // HWY_TARGET != HWY_SVE2_128
1283
- // If D().MaxBytes() <= 16 is true, simply do a Load operation.
1778
+ // If D().MaxBytes() <= 16 is true, simply do a LoadU operation.
1284
1779
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1285
1780
  HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1286
- return Load(d, p);
1781
+ return LoadU(d, p);
1287
1782
  }
1288
1783
 
1289
1784
  // If D().MaxBytes() > 16 is true, need to load the vector using ld1rq
1290
- template <class D, HWY_IF_V_SIZE_GT_D(D, 16),
1291
- hwy::EnableIf<!IsSame<TFromD<D>, bfloat16_t>()>* = nullptr>
1785
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
1292
1786
  HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1293
1787
  return detail::LoadDupFull128(d, p);
1294
1788
  }
1295
1789
 
1296
- #if !HWY_SVE_HAVE_BFLOAT16
1297
-
1298
- template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_BF16_D(D)>
1299
- HWY_API VBF16 LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) {
1300
- return detail::LoadDupFull128(
1301
- RebindToUnsigned<decltype(d)>(),
1302
- reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
1303
- }
1304
- #endif // !HWY_SVE_HAVE_BFLOAT16
1305
-
1306
1790
  #endif // HWY_TARGET != HWY_SVE2_128
1307
1791
 
1308
- #if !HWY_SVE_HAVE_BFLOAT16
1309
-
1310
- template <size_t N, int kPow2>
1311
- HWY_API void Store(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
1312
- bfloat16_t* HWY_RESTRICT p) {
1313
- Store(v, RebindToUnsigned<decltype(d)>(),
1314
- reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
1315
- }
1316
-
1317
- #endif
1318
-
1319
- // ------------------------------ Load/StoreU
1792
+ // ------------------------------ Load/Store
1320
1793
 
1321
1794
  // SVE only requires lane alignment, not natural alignment of the entire
1322
- // vector.
1795
+ // vector, so Load/Store are the same as LoadU/StoreU.
1323
1796
  template <class D>
1324
- HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1325
- return Load(d, p);
1797
+ HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
1798
+ return LoadU(d, p);
1326
1799
  }
1327
1800
 
1328
1801
  template <class V, class D>
1329
- HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1330
- Store(v, d, p);
1802
+ HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1803
+ StoreU(v, d, p);
1331
1804
  }
1332
1805
 
1333
1806
  // ------------------------------ MaskedLoadOr
@@ -1362,8 +1835,8 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
1362
1835
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1363
1836
  HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, \
1364
1837
  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1365
- HWY_SVE_V(int, BITS) index) { \
1366
- sv##OP##_s##BITS##index_##CHAR##BITS(m, base, index, v); \
1838
+ HWY_SVE_V(int, BITS) indices) { \
1839
+ sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices, v); \
1367
1840
  }
1368
1841
 
1369
1842
  HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
@@ -1398,10 +1871,13 @@ HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
1398
1871
  #define HWY_SVE_MASKED_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
1399
1872
  template <size_t N, int kPow2> \
1400
1873
  HWY_API HWY_SVE_V(BASE, BITS) \
1401
- NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, \
1874
+ NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1402
1875
  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1403
- HWY_SVE_V(int, BITS) index) { \
1404
- return sv##OP##_s##BITS##index_##CHAR##BITS(m, base, index); \
1876
+ HWY_SVE_V(int, BITS) indices) { \
1877
+ const RebindToSigned<decltype(d)> di; \
1878
+ (void)di; /* for HWY_DASSERT */ \
1879
+ HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
1880
+ return sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices); \
1405
1881
  }
1406
1882
 
1407
1883
  HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
@@ -1410,6 +1886,13 @@ HWY_SVE_FOREACH_UIF3264(HWY_SVE_MASKED_GATHER_INDEX, MaskedGatherIndex,
1410
1886
  #undef HWY_SVE_GATHER_OFFSET
1411
1887
  #undef HWY_SVE_MASKED_GATHER_INDEX
1412
1888
 
1889
+ template <class D>
1890
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, svbool_t m, D d,
1891
+ const TFromD<D>* HWY_RESTRICT p,
1892
+ VFromD<RebindToSigned<D>> indices) {
1893
+ return IfThenElse(m, MaskedGatherIndex(m, d, p, indices), no);
1894
+ }
1895
+
1413
1896
  template <class D>
1414
1897
  HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
1415
1898
  VFromD<RebindToSigned<D>> indices) {
@@ -1430,8 +1913,8 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
1430
1913
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1431
1914
  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1432
1915
  HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
1433
- const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = \
1434
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
1916
+ const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = sv##OP##_##CHAR##BITS( \
1917
+ detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
1435
1918
  v0 = svget2(tuple, 0); \
1436
1919
  v1 = svget2(tuple, 1); \
1437
1920
  }
@@ -1447,8 +1930,8 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
1447
1930
  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1448
1931
  HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
1449
1932
  HWY_SVE_V(BASE, BITS) & v2) { \
1450
- const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = \
1451
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
1933
+ const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = sv##OP##_##CHAR##BITS( \
1934
+ detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
1452
1935
  v0 = svget3(tuple, 0); \
1453
1936
  v1 = svget3(tuple, 1); \
1454
1937
  v2 = svget3(tuple, 2); \
@@ -1465,8 +1948,8 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
1465
1948
  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1466
1949
  HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
1467
1950
  HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
1468
- const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = \
1469
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
1951
+ const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = sv##OP##_##CHAR##BITS( \
1952
+ detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
1470
1953
  v0 = svget4(tuple, 0); \
1471
1954
  v1 = svget4(tuple, 1); \
1472
1955
  v2 = svget4(tuple, 2); \
@@ -1478,12 +1961,14 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
1478
1961
 
1479
1962
  // ------------------------------ StoreInterleaved2
1480
1963
 
1481
- #define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
1482
- template <size_t N, int kPow2> \
1483
- HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
1484
- HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1485
- HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1486
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, Create2(d, v0, v1)); \
1964
+ #define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
1965
+ template <size_t N, int kPow2> \
1966
+ HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
1967
+ HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1968
+ HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1969
+ sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
1970
+ detail::NativeLanePointer(unaligned), \
1971
+ Create2(d, v0, v1)); \
1487
1972
  }
1488
1973
  HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
1489
1974
 
@@ -1497,7 +1982,8 @@ HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
1497
1982
  HWY_SVE_V(BASE, BITS) v2, \
1498
1983
  HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1499
1984
  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1500
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, \
1985
+ sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
1986
+ detail::NativeLanePointer(unaligned), \
1501
1987
  Create3(d, v0, v1, v2)); \
1502
1988
  }
1503
1989
  HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
@@ -1512,7 +1998,8 @@ HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
1512
1998
  HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
1513
1999
  HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1514
2000
  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1515
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, \
2001
+ sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2002
+ detail::NativeLanePointer(unaligned), \
1516
2003
  Create4(d, v0, v1, v2, v3)); \
1517
2004
  }
1518
2005
  HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)
@@ -1602,6 +2089,22 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
1602
2089
  return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
1603
2090
  }
1604
2091
 
2092
+ #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
2093
+ #undef HWY_NATIVE_PROMOTE_F16_TO_F64
2094
+ #else
2095
+ #define HWY_NATIVE_PROMOTE_F16_TO_F64
2096
+ #endif
2097
+
2098
+ template <size_t N, int kPow2>
2099
+ HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
2100
+ const svfloat16_t v) {
2101
+ // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
2102
+ // first replicate each lane once.
2103
+ const svfloat16_t vv = detail::ZipLowerSame(v, v);
2104
+ return svcvt_f64_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()),
2105
+ detail::ZipLowerSame(vv, vv));
2106
+ }
2107
+
1605
2108
  template <size_t N, int kPow2>
1606
2109
  HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
1607
2110
  const svfloat32_t v) {
@@ -1637,19 +2140,43 @@ HWY_API svuint64_t PromoteTo(Simd<uint64_t, N, kPow2> /* d */,
1637
2140
  return svcvt_u64_f32_x(detail::PTrue(Simd<float, N, kPow2>()), vv);
1638
2141
  }
1639
2142
 
1640
- // For 16-bit Compress
2143
+ // ------------------------------ PromoteUpperTo
2144
+
1641
2145
  namespace detail {
2146
+ HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
1642
2147
  HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
2148
+ HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
1643
2149
  #undef HWY_SVE_PROMOTE_TO
2150
+ } // namespace detail
1644
2151
 
1645
- template <size_t N, int kPow2>
1646
- HWY_API svfloat32_t PromoteUpperTo(Simd<float, N, kPow2> df, svfloat16_t v) {
1647
- const RebindToUnsigned<decltype(df)> du;
1648
- const RepartitionToNarrow<decltype(du)> dn;
1649
- return BitCast(df, PromoteUpperTo(du, BitCast(dn, v)));
2152
+ #ifdef HWY_NATIVE_PROMOTE_UPPER_TO
2153
+ #undef HWY_NATIVE_PROMOTE_UPPER_TO
2154
+ #else
2155
+ #define HWY_NATIVE_PROMOTE_UPPER_TO
2156
+ #endif
2157
+
2158
+ // Unsigned->Unsigned or Signed->Signed
2159
+ template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
2160
+ hwy::EnableIf<IsInteger<TD>() && IsInteger<TV>() &&
2161
+ (IsSigned<TD>() == IsSigned<TV>())>* = nullptr>
2162
+ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
2163
+ if (detail::IsFull(d)) {
2164
+ return detail::PromoteUpperTo(d, v);
2165
+ }
2166
+ const Rebind<TFromV<V>, decltype(d)> dh;
2167
+ return PromoteTo(d, UpperHalf(dh, v));
1650
2168
  }
1651
2169
 
1652
- } // namespace detail
2170
+ // Differing signs or either is float
2171
+ template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
2172
+ hwy::EnableIf<!IsInteger<TD>() || !IsInteger<TV>() ||
2173
+ (IsSigned<TD>() != IsSigned<TV>())>* = nullptr>
2174
+ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
2175
+ // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
2176
+ // because it cannot be deduced from D (could be either bf16 or f16).
2177
+ const Rebind<TFromV<V>, decltype(d)> dh;
2178
+ return PromoteTo(d, UpperHalf(dh, v));
2179
+ }
1653
2180
 
1654
2181
  // ------------------------------ DemoteTo U
1655
2182
 
@@ -1972,9 +2499,13 @@ namespace detail {
1972
2499
  }
1973
2500
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
1974
2501
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
2502
+ HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
2503
+ HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
1975
2504
  #if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
1976
2505
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
1977
2506
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
2507
+ HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
2508
+ HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
1978
2509
  #endif
1979
2510
  #undef HWY_SVE_CONCAT_EVERY_SECOND
1980
2511
 
@@ -1986,6 +2517,16 @@ HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
1986
2517
  return sv##OP##_##CHAR##BITS(mask, lo, hi); \
1987
2518
  }
1988
2519
  HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
2520
+ #if HWY_SVE_HAVE_BF16_FEATURE
2521
+ HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice)
2522
+ #else
2523
+ template <class V, HWY_IF_BF16_D(DFromV<V>)>
2524
+ HWY_INLINE V Splice(V hi, V lo, svbool_t mask) {
2525
+ const DFromV<V> d;
2526
+ const RebindToUnsigned<decltype(d)> du;
2527
+ return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask));
2528
+ }
2529
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
1989
2530
  #undef HWY_SVE_SPLICE
1990
2531
 
1991
2532
  } // namespace detail
@@ -2021,6 +2562,20 @@ HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
2021
2562
  in_even); // lower half
2022
2563
  }
2023
2564
 
2565
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
2566
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
2567
+ #else
2568
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
2569
+ #endif
2570
+
2571
+ template <size_t N, int kPow2>
2572
+ HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat64_t v) {
2573
+ const svfloat16_t in_lo16 = svcvt_f16_f64_x(detail::PTrue(d), v);
2574
+ const svfloat16_t in_even = detail::ConcatEvenFull(in_lo16, in_lo16);
2575
+ return detail::ConcatEvenFull(in_even,
2576
+ in_even); // lower half
2577
+ }
2578
+
2024
2579
  template <size_t N, int kPow2>
2025
2580
  HWY_API VBF16 DemoteTo(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t v) {
2026
2581
  const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
@@ -2103,20 +2658,22 @@ HWY_API VFromD<DI> NearestInt(VF v) {
2103
2658
 
2104
2659
  // ------------------------------ Iota (Add, ConvertTo)
2105
2660
 
2106
- #define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
2107
- template <size_t N, int kPow2> \
2108
- HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
2109
- HWY_SVE_T(BASE, BITS) first) { \
2110
- return sv##OP##_##CHAR##BITS(first, 1); \
2661
+ #define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
2662
+ template <size_t N, int kPow2, typename T2> \
2663
+ HWY_API HWY_SVE_V(BASE, BITS) \
2664
+ NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, T2 first) { \
2665
+ return sv##OP##_##CHAR##BITS( \
2666
+ ConvertScalarTo<HWY_SVE_T(BASE, BITS)>(first), 1); \
2111
2667
  }
2112
2668
 
2113
2669
  HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
2114
2670
  #undef HWY_SVE_IOTA
2115
2671
 
2116
- template <class D, HWY_IF_FLOAT_D(D)>
2117
- HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2672
+ template <class D, typename T2, HWY_IF_FLOAT_D(D)>
2673
+ HWY_API VFromD<D> Iota(const D d, T2 first) {
2118
2674
  const RebindToSigned<D> di;
2119
- return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
2675
+ return detail::AddN(ConvertTo(d, Iota(di, 0)),
2676
+ ConvertScalarTo<TFromD<D>>(first));
2120
2677
  }
2121
2678
 
2122
2679
  // ------------------------------ InterleaveLower
@@ -2147,12 +2704,10 @@ HWY_API V InterleaveLower(const V a, const V b) {
2147
2704
 
2148
2705
  // Only use zip2 if vector are a powers of two, otherwise getting the actual
2149
2706
  // "upper half" requires MaskUpperHalf.
2150
- #if HWY_TARGET == HWY_SVE2_128
2151
2707
  namespace detail {
2152
2708
  // Unlike Highway's ZipUpper, this returns the same type.
2153
2709
  HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
2154
2710
  } // namespace detail
2155
- #endif
2156
2711
 
2157
2712
  // Full vector: guaranteed to have at least one block
2158
2713
  template <class D, class V = VFromD<D>,
@@ -2184,6 +2739,30 @@ HWY_API V InterleaveUpper(D d, const V a, const V b) {
2184
2739
  return InterleaveUpper(DFromV<V>(), a, b);
2185
2740
  }
2186
2741
 
2742
+ // ------------------------------ InterleaveWholeLower
2743
+ #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
2744
+ #undef HWY_NATIVE_INTERLEAVE_WHOLE
2745
+ #else
2746
+ #define HWY_NATIVE_INTERLEAVE_WHOLE
2747
+ #endif
2748
+
2749
+ template <class D>
2750
+ HWY_API VFromD<D> InterleaveWholeLower(D /*d*/, VFromD<D> a, VFromD<D> b) {
2751
+ return detail::ZipLowerSame(a, b);
2752
+ }
2753
+
2754
+ // ------------------------------ InterleaveWholeUpper
2755
+
2756
+ template <class D>
2757
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
2758
+ if (HWY_SVE_IS_POW2 && detail::IsFull(d)) {
2759
+ return detail::ZipUpperSame(a, b);
2760
+ }
2761
+
2762
+ const Half<decltype(d)> d2;
2763
+ return InterleaveWholeLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
2764
+ }
2765
+
2187
2766
  // ------------------------------ Per4LaneBlockShuffle
2188
2767
 
2189
2768
  namespace detail {
@@ -2432,7 +3011,13 @@ HWY_API V UpperHalf(const DH dh, const V v) {
2432
3011
 
2433
3012
  // ================================================== REDUCE
2434
3013
 
2435
- // These return T, whereas the Highway op returns a broadcasted vector.
3014
+ #ifdef HWY_NATIVE_REDUCE_SCALAR
3015
+ #undef HWY_NATIVE_REDUCE_SCALAR
3016
+ #else
3017
+ #define HWY_NATIVE_REDUCE_SCALAR
3018
+ #endif
3019
+
3020
+ // These return T, suitable for ReduceSum.
2436
3021
  namespace detail {
2437
3022
  #define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
2438
3023
  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
@@ -2462,24 +3047,53 @@ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
2462
3047
  #undef HWY_SVE_REDUCE_ADD
2463
3048
  } // namespace detail
2464
3049
 
2465
- template <class D, class V>
2466
- V SumOfLanes(D d, V v) {
2467
- return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v));
2468
- }
3050
+ // detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM is more
3051
+ // efficient for N=4 I8/U8 reductions on SVE than the default implementations
3052
+ // of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in
3053
+ // generic_ops-inl.h
3054
+ #undef HWY_IF_REDUCE_D
3055
+ #define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
2469
3056
 
2470
- template <class D, class V>
2471
- TFromV<V> ReduceSum(D d, V v) {
3057
+ #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
3058
+ #undef HWY_NATIVE_REDUCE_SUM_4_UI8
3059
+ #else
3060
+ #define HWY_NATIVE_REDUCE_SUM_4_UI8
3061
+ #endif
3062
+
3063
+ #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
3064
+ #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
3065
+ #else
3066
+ #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
3067
+ #endif
3068
+
3069
+ template <class D, HWY_IF_REDUCE_D(D)>
3070
+ HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
2472
3071
  return detail::SumOfLanesM(detail::MakeMask(d), v);
2473
3072
  }
2474
3073
 
2475
- template <class D, class V>
2476
- V MinOfLanes(D d, V v) {
2477
- return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v));
3074
+ template <class D, HWY_IF_REDUCE_D(D)>
3075
+ HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
3076
+ return detail::MinOfLanesM(detail::MakeMask(d), v);
2478
3077
  }
2479
3078
 
2480
- template <class D, class V>
2481
- V MaxOfLanes(D d, V v) {
2482
- return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v));
3079
+ template <class D, HWY_IF_REDUCE_D(D)>
3080
+ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
3081
+ return detail::MaxOfLanesM(detail::MakeMask(d), v);
3082
+ }
3083
+
3084
+ // ------------------------------ SumOfLanes
3085
+
3086
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
3087
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
3088
+ return Set(d, ReduceSum(d, v));
3089
+ }
3090
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
3091
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
3092
+ return Set(d, ReduceMin(d, v));
3093
+ }
3094
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
3095
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
3096
+ return Set(d, ReduceMax(d, v));
2483
3097
  }
2484
3098
 
2485
3099
  // ================================================== SWIZZLE
@@ -2513,7 +3127,9 @@ HWY_API TFromV<V> ExtractLane(V v, size_t i) {
2513
3127
  template <class V>
2514
3128
  HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
2515
3129
  const DFromV<V> d;
2516
- const auto is_i = detail::EqN(Iota(d, 0), static_cast<TFromV<V>>(i));
3130
+ const RebindToSigned<decltype(d)> di;
3131
+ using TI = TFromD<decltype(di)>;
3132
+ const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
2517
3133
  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
2518
3134
  }
2519
3135
 
@@ -2623,6 +3239,7 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
2623
3239
  }
2624
3240
 
2625
3241
  HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
3242
+ HWY_SVE_FOREACH_BF16(HWY_SVE_TABLE, TableLookupLanes, tbl)
2626
3243
  #undef HWY_SVE_TABLE
2627
3244
 
2628
3245
  #if HWY_SVE_HAVE_2
@@ -2634,6 +3251,7 @@ namespace detail {
2634
3251
  }
2635
3252
 
2636
3253
  HWY_SVE_FOREACH(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
3254
+ HWY_SVE_FOREACH_BF16(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
2637
3255
  #undef HWY_SVE_TABLE
2638
3256
  } // namespace detail
2639
3257
  #endif // HWY_SVE_HAVE_2
@@ -2705,6 +3323,7 @@ namespace detail {
2705
3323
  }
2706
3324
 
2707
3325
  HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
3326
+ HWY_SVE_FOREACH_BF16(HWY_SVE_REVERSE, ReverseFull, rev)
2708
3327
  #undef HWY_SVE_REVERSE
2709
3328
 
2710
3329
  } // namespace detail
@@ -2775,14 +3394,14 @@ HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { // 3210
2775
3394
  template <class D, HWY_IF_T_SIZE_D(D, 1)>
2776
3395
  HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2777
3396
  const RebindToUnsigned<decltype(d)> du;
2778
- const RepartitionToWide<RepartitionToWide<decltype(du)>> du32;
3397
+ const RepartitionToWideX2<decltype(du)> du32;
2779
3398
  return BitCast(d, svrevb_u32_x(detail::PTrue(d), BitCast(du32, v)));
2780
3399
  }
2781
3400
 
2782
3401
  template <class D, HWY_IF_T_SIZE_D(D, 2)>
2783
3402
  HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2784
3403
  const RebindToUnsigned<decltype(d)> du;
2785
- const RepartitionToWide<RepartitionToWide<decltype(du)>> du64;
3404
+ const RepartitionToWideX2<decltype(du)> du64;
2786
3405
  return BitCast(d, svrevh_u64_x(detail::PTrue(d), BitCast(du64, v)));
2787
3406
  }
2788
3407
 
@@ -2943,20 +3562,23 @@ HWY_API V BroadcastBlock(V v) {
2943
3562
  static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
2944
3563
  "Invalid block index");
2945
3564
 
3565
+ const RebindToUnsigned<decltype(d)> du; // for bfloat16_t
3566
+ using VU = VFromD<decltype(du)>;
3567
+ const VU vu = BitCast(du, v);
3568
+
2946
3569
  #if HWY_TARGET == HWY_SVE_256
2947
- return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
2948
- : ConcatUpperUpper(d, v, v);
3570
+ return BitCast(d, (kBlockIdx == 0) ? ConcatLowerLower(du, vu, vu)
3571
+ : ConcatUpperUpper(du, vu, vu));
2949
3572
  #else
2950
- const RebindToUnsigned<decltype(d)> du;
2951
3573
  using TU = TFromD<decltype(du)>;
2952
3574
  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
2953
3575
  constexpr size_t kBlockOffset =
2954
3576
  static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
2955
3577
 
2956
- const auto idx = detail::AddN(
3578
+ const VU idx = detail::AddN(
2957
3579
  detail::AndN(Iota(du, TU{0}), static_cast<TU>(kLanesPerBlock - 1)),
2958
3580
  static_cast<TU>(kBlockOffset));
2959
- return TableLookupLanes(v, idx);
3581
+ return BitCast(d, TableLookupLanes(vu, idx));
2960
3582
  #endif
2961
3583
  }
2962
3584
 
@@ -3462,6 +4084,126 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32, VBF16 v) {
3462
4084
  return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), BitCast(du16, v)));
3463
4085
  }
3464
4086
 
4087
+ // ------------------------------ PromoteEvenTo/PromoteOddTo (ConcatOddFull)
4088
+
4089
+ namespace detail {
4090
+
4091
+ // Signed to signed PromoteEvenTo
4092
+ template <class D>
4093
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4094
+ hwy::SizeTag<2> /*to_lane_size_tag*/,
4095
+ hwy::SignedTag /*from_type_tag*/, D d_to,
4096
+ svint8_t v) {
4097
+ return svextb_s16_x(detail::PTrue(d_to), BitCast(d_to, v));
4098
+ }
4099
+
4100
+ template <class D>
4101
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4102
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
4103
+ hwy::SignedTag /*from_type_tag*/, D d_to,
4104
+ svint16_t v) {
4105
+ return svexth_s32_x(detail::PTrue(d_to), BitCast(d_to, v));
4106
+ }
4107
+
4108
+ template <class D>
4109
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4110
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4111
+ hwy::SignedTag /*from_type_tag*/, D d_to,
4112
+ svint32_t v) {
4113
+ return svextw_s64_x(detail::PTrue(d_to), BitCast(d_to, v));
4114
+ }
4115
+
4116
+ // F16->F32 PromoteEvenTo
4117
+ template <class D>
4118
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4119
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
4120
+ hwy::FloatTag /*from_type_tag*/, D d_to,
4121
+ svfloat16_t v) {
4122
+ const Repartition<float, decltype(d_to)> d_from;
4123
+ return svcvt_f32_f16_x(detail::PTrue(d_from), v);
4124
+ }
4125
+
4126
+ // F32->F64 PromoteEvenTo
4127
+ template <class D>
4128
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4129
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4130
+ hwy::FloatTag /*from_type_tag*/, D d_to,
4131
+ svfloat32_t v) {
4132
+ const Repartition<float, decltype(d_to)> d_from;
4133
+ return svcvt_f64_f32_x(detail::PTrue(d_from), v);
4134
+ }
4135
+
4136
+ // I32->F64 PromoteEvenTo
4137
+ template <class D>
4138
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4139
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4140
+ hwy::SignedTag /*from_type_tag*/, D d_to,
4141
+ svint32_t v) {
4142
+ const Repartition<float, decltype(d_to)> d_from;
4143
+ return svcvt_f64_s32_x(detail::PTrue(d_from), v);
4144
+ }
4145
+
4146
+ // U32->F64 PromoteEvenTo
4147
+ template <class D>
4148
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4149
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4150
+ hwy::UnsignedTag /*from_type_tag*/, D d_to,
4151
+ svuint32_t v) {
4152
+ const Repartition<float, decltype(d_to)> d_from;
4153
+ return svcvt_f64_u32_x(detail::PTrue(d_from), v);
4154
+ }
4155
+
4156
+ // F32->I64 PromoteEvenTo
4157
+ template <class D>
4158
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4159
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4160
+ hwy::FloatTag /*from_type_tag*/, D d_to,
4161
+ svfloat32_t v) {
4162
+ const Repartition<float, decltype(d_to)> d_from;
4163
+ return svcvt_s64_f32_x(detail::PTrue(d_from), v);
4164
+ }
4165
+
4166
+ // F32->U64 PromoteEvenTo
4167
+ template <class D>
4168
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
4169
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4170
+ hwy::FloatTag /*from_type_tag*/, D d_to,
4171
+ svfloat32_t v) {
4172
+ const Repartition<float, decltype(d_to)> d_from;
4173
+ return svcvt_u64_f32_x(detail::PTrue(d_from), v);
4174
+ }
4175
+
4176
+ // F16->F32 PromoteOddTo
4177
+ template <class D>
4178
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag to_type_tag,
4179
+ hwy::SizeTag<4> to_lane_size_tag,
4180
+ hwy::FloatTag from_type_tag, D d_to,
4181
+ svfloat16_t v) {
4182
+ return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4183
+ DupOdd(v));
4184
+ }
4185
+
4186
+ // I32/U32/F32->F64 PromoteOddTo
4187
+ template <class FromTypeTag, class D, class V>
4188
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag to_type_tag,
4189
+ hwy::SizeTag<8> to_lane_size_tag,
4190
+ FromTypeTag from_type_tag, D d_to, V v) {
4191
+ return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4192
+ DupOdd(v));
4193
+ }
4194
+
4195
+ // F32->I64/U64 PromoteOddTo
4196
+ template <class ToTypeTag, class D, HWY_IF_UI64_D(D)>
4197
+ HWY_INLINE VFromD<D> PromoteOddTo(ToTypeTag to_type_tag,
4198
+ hwy::SizeTag<8> to_lane_size_tag,
4199
+ hwy::FloatTag from_type_tag, D d_to,
4200
+ svfloat32_t v) {
4201
+ return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4202
+ DupOdd(v));
4203
+ }
4204
+
4205
+ } // namespace detail
4206
+
3465
4207
  // ------------------------------ ReorderDemote2To (OddEven)
3466
4208
 
3467
4209
  template <size_t N, int kPow2>
@@ -3618,15 +4360,45 @@ HWY_API VFromD<D> OrderedDemote2To(D dn, V a, V b) {
3618
4360
  return Combine(dn, demoted_b, demoted_a);
3619
4361
  }
3620
4362
 
3621
- template <class D, HWY_IF_BF16_D(D)>
3622
- HWY_API VBF16 OrderedDemote2To(D dn, svfloat32_t a, svfloat32_t b) {
4363
+ template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
4364
+ HWY_API VFromD<D> OrderedDemote2To(D dn, svfloat32_t a, svfloat32_t b) {
3623
4365
  const Half<decltype(dn)> dnh;
3624
- const RebindToUnsigned<decltype(dn)> dn_u;
3625
- const RebindToUnsigned<decltype(dnh)> dnh_u;
3626
- const auto demoted_a = DemoteTo(dnh, a);
3627
- const auto demoted_b = DemoteTo(dnh, b);
3628
- return BitCast(
3629
- dn, Combine(dn_u, BitCast(dnh_u, demoted_b), BitCast(dnh_u, demoted_a)));
4366
+ return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
4367
+ }
4368
+
4369
+ // ------------------------------ I8/U8/I16/U16 Div
4370
+
4371
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4372
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
4373
+ HWY_API V Div(V a, V b) {
4374
+ const DFromV<decltype(a)> d;
4375
+ const Half<decltype(d)> dh;
4376
+ const RepartitionToWide<decltype(d)> dw;
4377
+
4378
+ const auto q_lo =
4379
+ Div(PromoteTo(dw, LowerHalf(dh, a)), PromoteTo(dw, LowerHalf(dh, b)));
4380
+ const auto q_hi = Div(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b));
4381
+
4382
+ return OrderedDemote2To(d, q_lo, q_hi);
4383
+ }
4384
+
4385
+ // ------------------------------ I8/U8/I16/U16 MaskedDivOr
4386
+ template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
4387
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4388
+ HWY_API V MaskedDivOr(V no, M m, V a, V b) {
4389
+ return IfThenElse(m, Div(a, b), no);
4390
+ }
4391
+
4392
+ // ------------------------------ Mod (Div, NegMulAdd)
4393
+ template <class V>
4394
+ HWY_API V Mod(V a, V b) {
4395
+ return NegMulAdd(Div(a, b), b, a);
4396
+ }
4397
+
4398
+ // ------------------------------ MaskedModOr (Mod)
4399
+ template <class V, class M>
4400
+ HWY_API V MaskedModOr(V no, M m, V a, V b) {
4401
+ return IfThenElse(m, Mod(a, b), no);
3630
4402
  }
3631
4403
 
3632
4404
  // ------------------------------ ZeroIfNegative (Lt, IfThenElse)
@@ -3735,6 +4507,84 @@ HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
3735
4507
  return TestBit(vbits, bit);
3736
4508
  }
3737
4509
 
4510
+ // ------------------------------ Dup128MaskFromMaskBits
4511
+
4512
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)>
4513
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4514
+ const RebindToUnsigned<decltype(d)> du;
4515
+
4516
+ constexpr size_t kN = MaxLanes(d);
4517
+ if (kN < 8) mask_bits &= (1u << kN) - 1;
4518
+
4519
+ // Replicate the lower 8 bits of mask_bits to each u8 lane
4520
+ const svuint8_t bytes = BitCast(du, Set(du, static_cast<uint8_t>(mask_bits)));
4521
+
4522
+ const svuint8_t bit =
4523
+ svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4524
+ return TestBit(bytes, bit);
4525
+ }
4526
+
4527
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
4528
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4529
+ const RebindToUnsigned<decltype(d)> du;
4530
+ const Repartition<uint16_t, decltype(du)> du16;
4531
+
4532
+ // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
4533
+ // and then bitcast the replicated mask_bits to a u8 vector
4534
+ const svuint8_t bytes =
4535
+ BitCast(du, Set(du16, static_cast<uint16_t>(mask_bits)));
4536
+ // Replicate bytes 8x such that each byte contains the bit that governs it.
4537
+ const svuint8_t rep8 = svtbl_u8(bytes, ShiftRight<3>(Iota(du, 0)));
4538
+
4539
+ const svuint8_t bit =
4540
+ svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4541
+ return TestBit(rep8, bit);
4542
+ }
4543
+
4544
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
4545
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4546
+ const RebindToUnsigned<decltype(d)> du;
4547
+ const Repartition<uint8_t, decltype(d)> du8;
4548
+
4549
+ constexpr size_t kN = MaxLanes(d);
4550
+ if (kN < 8) mask_bits &= (1u << kN) - 1;
4551
+
4552
+ // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
4553
+ const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
4554
+
4555
+ const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
4556
+ return TestBit(BitCast(du, bytes), bit);
4557
+ }
4558
+
4559
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
4560
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4561
+ const RebindToUnsigned<decltype(d)> du;
4562
+ const Repartition<uint8_t, decltype(d)> du8;
4563
+
4564
+ constexpr size_t kN = MaxLanes(d);
4565
+ if (kN < 4) mask_bits &= (1u << kN) - 1;
4566
+
4567
+ // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
4568
+ const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
4569
+
4570
+ const svuint32_t bit = svdupq_n_u32(1, 2, 4, 8);
4571
+ return TestBit(BitCast(du, bytes), bit);
4572
+ }
4573
+
4574
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
4575
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4576
+ const RebindToUnsigned<decltype(d)> du;
4577
+ const Repartition<uint8_t, decltype(d)> du8;
4578
+
4579
+ if (MaxLanes(d) < 2) mask_bits &= 1u;
4580
+
4581
+ // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
4582
+ const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
4583
+
4584
+ const svuint64_t bit = svdupq_n_u64(1, 2);
4585
+ return TestBit(BitCast(du, bytes), bit);
4586
+ }
4587
+
3738
4588
  // ------------------------------ StoreMaskBits
3739
4589
 
3740
4590
  namespace detail {
@@ -4100,12 +4950,13 @@ HWY_INLINE VFromD<DU> LaneIndicesFromByteIndices(D, svuint8_t idx) {
4100
4950
  template <class V>
4101
4951
  HWY_INLINE V ExpandLoop(V v, svbool_t mask) {
4102
4952
  const DFromV<V> d;
4953
+ using T = TFromV<V>;
4103
4954
  uint8_t mask_bytes[256 / 8];
4104
4955
  StoreMaskBits(d, mask, mask_bytes);
4105
4956
 
4106
4957
  // ShiftLeftLanes is expensive, so we're probably better off storing to memory
4107
4958
  // and loading the final result.
4108
- alignas(16) TFromV<V> out[2 * MaxLanes(d)];
4959
+ alignas(16) T out[2 * MaxLanes(d)];
4109
4960
 
4110
4961
  svbool_t next = svpfalse_b();
4111
4962
  size_t input_consumed = 0;
@@ -4117,7 +4968,7 @@ HWY_INLINE V ExpandLoop(V v, svbool_t mask) {
4117
4968
  // instruction for variable-shift-reg, but we can splice.
4118
4969
  const V vH = detail::Splice(v, v, next);
4119
4970
  input_consumed += PopCount(mask_bits);
4120
- next = detail::GeN(iota, static_cast<TFromV<V>>(input_consumed));
4971
+ next = detail::GeN(iota, ConvertScalarTo<T>(input_consumed));
4121
4972
 
4122
4973
  const auto idx = detail::LaneIndicesFromByteIndices(
4123
4974
  d, detail::IndicesForExpandFromBits(mask_bits));
@@ -4611,7 +5462,7 @@ HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
4611
5462
  template <size_t N, int kPow2>
4612
5463
  HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df32, VBF16 a,
4613
5464
  VBF16 b) {
4614
- #if HWY_SVE_HAVE_BFLOAT16
5465
+ #if HWY_SVE_HAVE_BF16_FEATURE
4615
5466
  const svfloat32_t even = svbfmlalb_f32(Zero(df32), a, b);
4616
5467
  return svbfmlalt_f32(even, a, b);
4617
5468
  #else
@@ -4626,7 +5477,7 @@ HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df32, VBF16 a,
4626
5477
  const VU32 bo = And(BitCast(du32, b), odd);
4627
5478
  return MulAdd(BitCast(df32, ae), BitCast(df32, be),
4628
5479
  Mul(BitCast(df32, ao), BitCast(df32, bo)));
4629
- #endif // HWY_SVE_HAVE_BFLOAT16
5480
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
4630
5481
  }
4631
5482
 
4632
5483
  template <size_t N, int kPow2>
@@ -4672,7 +5523,7 @@ HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
4672
5523
  VBF16 a, VBF16 b,
4673
5524
  const svfloat32_t sum0,
4674
5525
  svfloat32_t& sum1) {
4675
- #if HWY_SVE_HAVE_BFLOAT16
5526
+ #if HWY_SVE_HAVE_BF16_FEATURE
4676
5527
  (void)df32;
4677
5528
  sum1 = svbfmlalt_f32(sum1, a, b);
4678
5529
  return svbfmlalb_f32(sum0, a, b);
@@ -4688,7 +5539,7 @@ HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
4688
5539
  const VU32 bo = And(BitCast(du32, b), odd);
4689
5540
  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
4690
5541
  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
4691
- #endif // HWY_SVE_HAVE_BFLOAT16
5542
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
4692
5543
  }
4693
5544
 
4694
5545
  template <size_t N, int kPow2>
@@ -4817,8 +5668,10 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(DU64 /*du64*/, svuint16_t a,
4817
5668
 
4818
5669
  // ------------------------------ AESRound / CLMul
4819
5670
 
5671
+ // Static dispatch with -march=armv8-a+sve2+aes, or dynamic dispatch WITHOUT a
5672
+ // baseline, in which case we check for AES support at runtime.
4820
5673
  #if defined(__ARM_FEATURE_SVE2_AES) || \
4821
- (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH)
5674
+ (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH && HWY_BASELINE_SVE2 == 0)
4822
5675
 
4823
5676
  // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4824
5677
  #ifdef HWY_NATIVE_AES
@@ -5059,14 +5912,15 @@ HWY_API V HighestSetBitIndex(V v) {
5059
5912
  }
5060
5913
 
5061
5914
  // ================================================== END MACROS
5062
- namespace detail { // for code folding
5063
5915
  #undef HWY_SVE_ALL_PTRUE
5064
5916
  #undef HWY_SVE_D
5065
5917
  #undef HWY_SVE_FOREACH
5066
5918
  #undef HWY_SVE_FOREACH_BF16
5919
+ #undef HWY_SVE_FOREACH_BF16_UNCONDITIONAL
5067
5920
  #undef HWY_SVE_FOREACH_F
5068
5921
  #undef HWY_SVE_FOREACH_F16
5069
5922
  #undef HWY_SVE_FOREACH_F32
5923
+ #undef HWY_SVE_FOREACH_F3264
5070
5924
  #undef HWY_SVE_FOREACH_F64
5071
5925
  #undef HWY_SVE_FOREACH_I
5072
5926
  #undef HWY_SVE_FOREACH_I08
@@ -5086,7 +5940,10 @@ namespace detail { // for code folding
5086
5940
  #undef HWY_SVE_FOREACH_UI64
5087
5941
  #undef HWY_SVE_FOREACH_UIF3264
5088
5942
  #undef HWY_SVE_HAVE_2
5943
+ #undef HWY_SVE_IF_EMULATED_D
5944
+ #undef HWY_SVE_IF_NOT_EMULATED_D
5089
5945
  #undef HWY_SVE_PTRUE
5946
+ #undef HWY_SVE_RETV_ARGMVV
5090
5947
  #undef HWY_SVE_RETV_ARGPV
5091
5948
  #undef HWY_SVE_RETV_ARGPVN
5092
5949
  #undef HWY_SVE_RETV_ARGPVV
@@ -5098,7 +5955,6 @@ namespace detail { // for code folding
5098
5955
  #undef HWY_SVE_UNDEFINED
5099
5956
  #undef HWY_SVE_V
5100
5957
 
5101
- } // namespace detail
5102
5958
  // NOLINTNEXTLINE(google-readability-namespace-comments)
5103
5959
  } // namespace HWY_NAMESPACE
5104
5960
  } // namespace hwy