@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
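To reproduce this comparison locally, a minimal sketch using npm's built-in diff command (assuming npm 7 or later, which provides npm diff; the package name and versions are taken from the header above):

    npm diff --diff=@img/sharp-libvips-dev@1.0.1 --diff=@img/sharp-libvips-dev@1.0.3

This prints a unified diff of the two published tarballs, covering the same per-file changes listed below.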
Files changed (169)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
package/include/hwy/ops/arm_sve-inl.h
@@ -33,6 +33,33 @@
33
33
  #define HWY_SVE_HAVE_2 0
34
34
  #endif
35
35
 
36
+ // If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
37
+ // create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
38
+ #if HWY_ARM_HAVE_SCALAR_BF16_TYPE && defined(__ARM_FEATURE_SVE_BF16)
39
+ #define HWY_SVE_HAVE_BF16_FEATURE 1
40
+ #else
41
+ #define HWY_SVE_HAVE_BF16_FEATURE 0
42
+ #endif
43
+
44
+ // HWY_SVE_HAVE_BF16_VEC is defined to 1 if the SVE svbfloat16_t vector type
45
+ // is supported, even if HWY_SVE_HAVE_BF16_FEATURE (= intrinsics) is 0.
46
+ #if HWY_SVE_HAVE_BF16_FEATURE || \
47
+ (HWY_COMPILER_CLANG >= 1200 && defined(__ARM_FEATURE_SVE_BF16)) || \
48
+ HWY_COMPILER_GCC_ACTUAL >= 1000
49
+ #define HWY_SVE_HAVE_BF16_VEC 1
50
+ #else
51
+ #define HWY_SVE_HAVE_BF16_VEC 0
52
+ #endif
53
+
54
+ // HWY_SVE_HAVE_F32_TO_BF16C is defined to 1 if the SVE svcvt_bf16_f32_x
55
+ // and svcvtnt_bf16_f32_x intrinsics are available, even if the __bf16 type
56
+ // is disabled
57
+ #if HWY_SVE_HAVE_BF16_VEC && defined(__ARM_FEATURE_SVE_BF16)
58
+ #define HWY_SVE_HAVE_F32_TO_BF16C 1
59
+ #else
60
+ #define HWY_SVE_HAVE_F32_TO_BF16C 0
61
+ #endif
62
+
36
63
  HWY_BEFORE_NAMESPACE();
37
64
  namespace hwy {
38
65
  namespace HWY_NAMESPACE {
@@ -76,12 +103,26 @@ namespace detail { // for code folding
76
103
  #define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
77
104
  X_MACRO(float, f, 64, 32, NAME, OP)
78
105
 
79
- #if HWY_SVE_HAVE_BFLOAT16
80
- #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
106
+ #define HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP) \
81
107
  X_MACRO(bfloat, bf, 16, 16, NAME, OP)
108
+
109
+ #if HWY_SVE_HAVE_BF16_FEATURE
110
+ #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
111
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
112
+ // We have both f16 and bf16, so nothing is emulated.
113
+
114
+ // NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
115
+ // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
116
+ // !hwy::IsSame<D, D>() is always false and as !hwy::IsSame<D, D>() will cause
117
+ // SFINAE to occur instead of a hard error due to a dependency on the D template
118
+ // argument
119
+ #define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
120
+ #define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
82
121
  #else
83
122
  #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
84
- #endif
123
+ #define HWY_SVE_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
124
+ #define HWY_SVE_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
125
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
85
126
 
86
127
  // For all element sizes:
87
128
  #define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
@@ -96,12 +137,16 @@ namespace detail { // for code folding
96
137
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \
97
138
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
98
139
 
140
+ #define HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP) \
141
+ HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
142
+ HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
143
+
99
144
  // HWY_SVE_FOREACH_F does not include HWY_SVE_FOREACH_BF16 because SVE lacks
100
145
  // bf16 overloads for some intrinsics (especially less-common arithmetic).
146
+ // However, this does include f16 because SVE supports it unconditionally.
101
147
  #define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
102
148
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
103
- HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
104
- HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
149
+ HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)
105
150
 
106
151
  // Commonly used type categories for a given element size:
107
152
  #define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
@@ -123,8 +168,7 @@ namespace detail { // for code folding
123
168
  #define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
124
169
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
125
170
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
126
- HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
127
- HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
171
+ HWY_SVE_FOREACH_F3264(X_MACRO, NAME, OP)
128
172
 
129
173
  // Commonly used type categories:
130
174
  #define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
@@ -155,7 +199,9 @@ namespace detail { // for code folding
155
199
  };
156
200
 
157
201
  HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
158
- HWY_SVE_FOREACH_BF16(HWY_SPECIALIZE, _, _)
202
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
203
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
204
+ #endif
159
205
  #undef HWY_SPECIALIZE
160
206
 
161
207
  // Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
@@ -184,15 +230,24 @@ HWY_SVE_FOREACH_BF16(HWY_SPECIALIZE, _, _)
184
230
  }
185
231
 
186
232
  // vector = f(vector, vector), e.g. Add
233
+ #define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
234
+ HWY_API HWY_SVE_V(BASE, BITS) \
235
+ NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
236
+ return sv##OP##_##CHAR##BITS(a, b); \
237
+ }
238
+ // All-true mask
187
239
  #define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP) \
188
240
  HWY_API HWY_SVE_V(BASE, BITS) \
189
241
  NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
190
242
  return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
191
243
  }
192
- #define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
193
- HWY_API HWY_SVE_V(BASE, BITS) \
194
- NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
195
- return sv##OP##_##CHAR##BITS(a, b); \
244
+ // User-specified mask. Mask=false value is undefined and must be set by caller
245
+ // because SVE instructions take it from one of the two inputs, whereas
246
+ // AVX-512, RVV and Highway allow a third argument.
247
+ #define HWY_SVE_RETV_ARGMVV(BASE, CHAR, BITS, HALF, NAME, OP) \
248
+ HWY_API HWY_SVE_V(BASE, BITS) \
249
+ NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
250
+ return sv##OP##_##CHAR##BITS##_x(m, a, b); \
196
251
  }
197
252
 
198
253
  #define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
@@ -264,26 +319,19 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
264
319
  return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit)); \
265
320
  }
266
321
  HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
267
- HWY_SVE_FOREACH_BF16(HWY_SVE_FIRSTN, FirstN, whilelt)
268
-
269
- #undef HWY_SVE_FIRSTN
270
-
271
- template <class D>
272
- using MFromD = decltype(FirstN(D(), 0));
322
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
323
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_FIRSTN, FirstN, whilelt)
324
+ #endif
273
325
 
274
- #if !HWY_HAVE_FLOAT16
275
- template <class D, HWY_IF_F16_D(D)>
276
- MFromD<RebindToUnsigned<D>> FirstN(D /* tag */, size_t count) {
326
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
327
+ svbool_t FirstN(D /* tag */, size_t count) {
277
328
  return FirstN(RebindToUnsigned<D>(), count);
278
329
  }
279
- #endif // !HWY_HAVE_FLOAT16
280
330
 
281
- #if !HWY_SVE_HAVE_BFLOAT16
282
- template <class D, HWY_IF_BF16_D(D)>
283
- MFromD<RebindToUnsigned<D>> FirstN(D /* tag */, size_t count) {
284
- return FirstN(RebindToUnsigned<D>(), count);
285
- }
286
- #endif // !HWY_SVE_HAVE_BFLOAT16
331
+ #undef HWY_SVE_FIRSTN
332
+
333
+ template <class D>
334
+ using MFromD = svbool_t;
287
335
 
288
336
  namespace detail {
289
337
 
@@ -298,7 +346,7 @@ namespace detail {
298
346
  }
299
347
 
300
348
  HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) // return all-true
301
- HWY_SVE_FOREACH_BF16(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)
349
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)
302
350
  #undef HWY_SVE_WRAP_PTRUE
303
351
 
304
352
  HWY_API svbool_t PFalse() { return svpfalse_b(); }
@@ -314,6 +362,17 @@ svbool_t MakeMask(D d) {
314
362
 
315
363
  } // namespace detail
316
364
 
365
+ #ifdef HWY_NATIVE_MASK_FALSE
366
+ #undef HWY_NATIVE_MASK_FALSE
367
+ #else
368
+ #define HWY_NATIVE_MASK_FALSE
369
+ #endif
370
+
371
+ template <class D>
372
+ HWY_API svbool_t MaskFalse(const D /*d*/) {
373
+ return detail::PFalse();
374
+ }
375
+
317
376
  // ================================================== INIT
318
377
 
319
378
  // ------------------------------ Set
@@ -326,14 +385,23 @@ svbool_t MakeMask(D d) {
326
385
  }
327
386
 
328
387
  HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
388
+ #if HWY_SVE_HAVE_BF16_FEATURE // for if-elif chain
329
389
  HWY_SVE_FOREACH_BF16(HWY_SVE_SET, Set, dup_n)
330
- #if !HWY_SVE_HAVE_BFLOAT16
390
+ #elif HWY_SVE_HAVE_BF16_VEC
331
391
  // Required for Zero and VFromD
332
- template <size_t N, int kPow2>
333
- svuint16_t Set(Simd<bfloat16_t, N, kPow2> d, bfloat16_t arg) {
334
- return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
392
+ template <class D, HWY_IF_BF16_D(D)>
393
+ HWY_API svbfloat16_t Set(D d, bfloat16_t arg) {
394
+ return svreinterpret_bf16_u16(
395
+ Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg)));
335
396
  }
336
- #endif // HWY_SVE_HAVE_BFLOAT16
397
+ #else // neither bf16 feature nor vector: emulate with u16
398
+ // Required for Zero and VFromD
399
+ template <class D, HWY_IF_BF16_D(D)>
400
+ HWY_API svuint16_t Set(D d, bfloat16_t arg) {
401
+ const RebindToUnsigned<decltype(d)> du;
402
+ return Set(du, BitCastScalar<uint16_t>(arg));
403
+ }
404
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
337
405
  #undef HWY_SVE_SET
338
406
 
339
407
  template <class D>
@@ -350,17 +418,6 @@ VFromD<D> Zero(D d) {
350
418
  return BitCast(d, Set(du, 0));
351
419
  }
352
420
 
353
- // ------------------------------ Undefined
354
-
355
- #define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
356
- template <size_t N, int kPow2> \
357
- HWY_API HWY_SVE_V(BASE, BITS) \
358
- NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
359
- return sv##OP##_##CHAR##BITS(); \
360
- }
361
-
362
- HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
363
-
364
421
  // ------------------------------ BitCast
365
422
 
366
423
  namespace detail {
@@ -387,24 +444,32 @@ namespace detail {
387
444
  return sv##OP##_##CHAR##BITS##_u8(v); \
388
445
  }
389
446
 
447
+ // U08 is special-cased, hence do not use FOREACH.
390
448
  HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
391
449
  HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
392
450
  HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
393
451
  HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
394
452
  HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
395
453
  HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)
396
- HWY_SVE_FOREACH_BF16(HWY_SVE_CAST, _, reinterpret)
397
454
 
398
- #undef HWY_SVE_CAST_NOP
399
- #undef HWY_SVE_CAST
455
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
456
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CAST, _, reinterpret)
457
+ #else // !(HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC)
458
+ template <class V, HWY_SVE_IF_EMULATED_D(DFromV<V>)>
459
+ HWY_INLINE svuint8_t BitCastToByte(V v) {
460
+ const RebindToUnsigned<DFromV<V>> du;
461
+ return BitCastToByte(BitCast(du, v));
462
+ }
400
463
 
401
- #if !HWY_SVE_HAVE_BFLOAT16
402
- template <size_t N, int kPow2>
403
- HWY_INLINE VBF16 BitCastFromByte(Simd<bfloat16_t, N, kPow2> /* d */,
404
- svuint8_t v) {
405
- return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
464
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
465
+ HWY_INLINE VFromD<D> BitCastFromByte(D d, svuint8_t v) {
466
+ const RebindToUnsigned<decltype(d)> du;
467
+ return BitCastFromByte(du, v);
406
468
  }
407
- #endif // !HWY_SVE_HAVE_BFLOAT16
469
+ #endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
470
+
471
+ #undef HWY_SVE_CAST_NOP
472
+ #undef HWY_SVE_CAST
408
473
 
409
474
  } // namespace detail
410
475
 
@@ -413,6 +478,26 @@ HWY_API VFromD<D> BitCast(D d, FromV v) {
413
478
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
414
479
  }
415
480
 
481
+ // ------------------------------ Undefined
482
+
483
+ #define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
484
+ template <size_t N, int kPow2> \
485
+ HWY_API HWY_SVE_V(BASE, BITS) \
486
+ NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
487
+ return sv##OP##_##CHAR##BITS(); \
488
+ }
489
+
490
+ HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
491
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
492
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_UNDEFINED, Undefined, undef)
493
+ #endif
494
+
495
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
496
+ VFromD<D> Undefined(D d) {
497
+ const RebindToUnsigned<D> du;
498
+ return BitCast(d, Undefined(du));
499
+ }
500
+
416
501
  // ------------------------------ Tuple
417
502
 
418
503
  // tuples = f(d, v..), e.g. Create2
@@ -438,7 +523,9 @@ HWY_API VFromD<D> BitCast(D d, FromV v) {
438
523
  }
439
524
 
440
525
  HWY_SVE_FOREACH(HWY_SVE_CREATE, Create, create)
441
- HWY_SVE_FOREACH_BF16(HWY_SVE_CREATE, Create, create)
526
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
527
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CREATE, Create, create)
528
+ #endif
442
529
  #undef HWY_SVE_CREATE
443
530
 
444
531
  template <class D>
@@ -463,7 +550,9 @@ using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D())));
463
550
  }
464
551
 
465
552
  HWY_SVE_FOREACH(HWY_SVE_GET, Get, get)
466
- HWY_SVE_FOREACH_BF16(HWY_SVE_GET, Get, get)
553
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
554
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_GET, Get, get)
555
+ #endif
467
556
  #undef HWY_SVE_GET
468
557
 
469
558
  #define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
@@ -484,7 +573,9 @@ HWY_SVE_FOREACH_BF16(HWY_SVE_GET, Get, get)
484
573
  }
485
574
 
486
575
  HWY_SVE_FOREACH(HWY_SVE_SET, Set, set)
487
- HWY_SVE_FOREACH_BF16(HWY_SVE_SET, Set, set)
576
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
577
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_SET, Set, set)
578
+ #endif
488
579
  #undef HWY_SVE_SET
489
580
 
490
581
  // ------------------------------ ResizeBitCast
@@ -495,6 +586,107 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
495
586
  return BitCast(d, v);
496
587
  }
497
588
 
589
+ // ------------------------------ Dup128VecFromValues
590
+
591
+ template <class D, HWY_IF_I8_D(D)>
592
+ HWY_API svint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
593
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
594
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
595
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
596
+ TFromD<D> t11, TFromD<D> t12,
597
+ TFromD<D> t13, TFromD<D> t14,
598
+ TFromD<D> t15) {
599
+ return svdupq_n_s8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
600
+ t14, t15);
601
+ }
602
+
603
+ template <class D, HWY_IF_U8_D(D)>
604
+ HWY_API svuint8_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
605
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
606
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
607
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
608
+ TFromD<D> t11, TFromD<D> t12,
609
+ TFromD<D> t13, TFromD<D> t14,
610
+ TFromD<D> t15) {
611
+ return svdupq_n_u8(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13,
612
+ t14, t15);
613
+ }
614
+
615
+ template <class D, HWY_IF_I16_D(D)>
616
+ HWY_API svint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
617
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
618
+ TFromD<D> t5, TFromD<D> t6,
619
+ TFromD<D> t7) {
620
+ return svdupq_n_s16(t0, t1, t2, t3, t4, t5, t6, t7);
621
+ }
622
+
623
+ template <class D, HWY_IF_U16_D(D)>
624
+ HWY_API svuint16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
625
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
626
+ TFromD<D> t5, TFromD<D> t6,
627
+ TFromD<D> t7) {
628
+ return svdupq_n_u16(t0, t1, t2, t3, t4, t5, t6, t7);
629
+ }
630
+
631
+ template <class D, HWY_IF_F16_D(D)>
632
+ HWY_API svfloat16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
633
+ TFromD<D> t2, TFromD<D> t3,
634
+ TFromD<D> t4, TFromD<D> t5,
635
+ TFromD<D> t6, TFromD<D> t7) {
636
+ return svdupq_n_f16(t0, t1, t2, t3, t4, t5, t6, t7);
637
+ }
638
+
639
+ template <class D, HWY_IF_BF16_D(D)>
640
+ HWY_API VBF16 Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, TFromD<D> t2,
641
+ TFromD<D> t3, TFromD<D> t4, TFromD<D> t5,
642
+ TFromD<D> t6, TFromD<D> t7) {
643
+ #if HWY_SVE_HAVE_BF16_FEATURE
644
+ (void)d;
645
+ return svdupq_n_bf16(t0, t1, t2, t3, t4, t5, t6, t7);
646
+ #else
647
+ const RebindToUnsigned<decltype(d)> du;
648
+ return BitCast(
649
+ d, Dup128VecFromValues(
650
+ du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
651
+ BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
652
+ BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
653
+ BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
654
+ #endif
655
+ }
656
+
657
+ template <class D, HWY_IF_I32_D(D)>
658
+ HWY_API svint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
659
+ TFromD<D> t2, TFromD<D> t3) {
660
+ return svdupq_n_s32(t0, t1, t2, t3);
661
+ }
662
+
663
+ template <class D, HWY_IF_U32_D(D)>
664
+ HWY_API svuint32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
665
+ TFromD<D> t2, TFromD<D> t3) {
666
+ return svdupq_n_u32(t0, t1, t2, t3);
667
+ }
668
+
669
+ template <class D, HWY_IF_F32_D(D)>
670
+ HWY_API svfloat32_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
671
+ TFromD<D> t2, TFromD<D> t3) {
672
+ return svdupq_n_f32(t0, t1, t2, t3);
673
+ }
674
+
675
+ template <class D, HWY_IF_I64_D(D)>
676
+ HWY_API svint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
677
+ return svdupq_n_s64(t0, t1);
678
+ }
679
+
680
+ template <class D, HWY_IF_U64_D(D)>
681
+ HWY_API svuint64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
682
+ return svdupq_n_u64(t0, t1);
683
+ }
684
+
685
+ template <class D, HWY_IF_F64_D(D)>
686
+ HWY_API svfloat64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
687
+ return svdupq_n_f64(t0, t1);
688
+ }
689
+
498
690
  // ================================================== LOGICAL
499
691
 
500
692
  // detail::*N() functions accept a scalar argument to avoid extra Set().
@@ -519,6 +711,10 @@ HWY_API V And(const V a, const V b) {
519
711
 
520
712
  // ------------------------------ Or
521
713
 
714
+ namespace detail {
715
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, OrN, orr_n)
716
+ } // namespace detail
717
+
522
718
  HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)
523
719
 
524
720
  template <class V, HWY_IF_FLOAT_V(V)>
@@ -632,9 +828,37 @@ HWY_API VBF16 Neg(VBF16 v) {
632
828
  return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
633
829
  }
634
830
 
831
+ // ------------------------------ SaturatedNeg
832
+ #if HWY_SVE_HAVE_2
833
+ #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
834
+ #undef HWY_NATIVE_SATURATED_NEG_8_16_32
835
+ #else
836
+ #define HWY_NATIVE_SATURATED_NEG_8_16_32
837
+ #endif
838
+
839
+ #ifdef HWY_NATIVE_SATURATED_NEG_64
840
+ #undef HWY_NATIVE_SATURATED_NEG_64
841
+ #else
842
+ #define HWY_NATIVE_SATURATED_NEG_64
843
+ #endif
844
+
845
+ HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedNeg, qneg)
846
+ #endif // HWY_SVE_HAVE_2
847
+
635
848
  // ------------------------------ Abs
636
849
  HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)
637
850
 
851
+ // ------------------------------ SaturatedAbs
852
+ #if HWY_SVE_HAVE_2
853
+ #ifdef HWY_NATIVE_SATURATED_ABS
854
+ #undef HWY_NATIVE_SATURATED_ABS
855
+ #else
856
+ #define HWY_NATIVE_SATURATED_ABS
857
+ #endif
858
+
859
+ HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedAbs, qabs)
860
+ #endif // HWY_SVE_HAVE_2
861
+
638
862
  // ================================================== ARITHMETIC
639
863
 
640
864
  // Per-target flags to prevent generic_ops-inl.h defining Add etc.
@@ -676,13 +900,107 @@ HWY_API svuint64_t SumsOf8(const svuint8_t v) {
676
900
 
677
901
  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
678
902
  // Compute pairwise sum of u32 and extend to u64.
679
- // TODO(janwas): on SVE2, we can instead use svaddp.
903
+
904
+ #if HWY_SVE_HAVE_2
905
+ return svadalp_u64_x(pg, Zero(du64), sums_of_4);
906
+ #else
680
907
  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
681
908
  // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
682
909
  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
683
910
  return Add(hi, lo);
911
+ #endif
912
+ }
913
+
914
+ HWY_API svint64_t SumsOf8(const svint8_t v) {
915
+ const ScalableTag<int32_t> di32;
916
+ const ScalableTag<int64_t> di64;
917
+ const svbool_t pg = detail::PTrue(di64);
918
+
919
+ const svint32_t sums_of_4 = svdot_n_s32(Zero(di32), v, 1);
920
+ #if HWY_SVE_HAVE_2
921
+ return svadalp_s64_x(pg, Zero(di64), sums_of_4);
922
+ #else
923
+ const svint64_t hi = svasr_n_s64_x(pg, BitCast(di64, sums_of_4), 32);
924
+ // Isolate the lower 32 bits (to be added to the upper 32 and sign-extended)
925
+ const svint64_t lo = svextw_s64_x(pg, BitCast(di64, sums_of_4));
926
+ return Add(hi, lo);
927
+ #endif
928
+ }
929
+
930
+ // ------------------------------ SumsOf2
931
+ #if HWY_SVE_HAVE_2
932
+ namespace detail {
933
+
934
+ HWY_INLINE svint16_t SumsOf2(hwy::SignedTag /*type_tag*/,
935
+ hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
936
+ const ScalableTag<int16_t> di16;
937
+ const svbool_t pg = detail::PTrue(di16);
938
+ return svadalp_s16_x(pg, Zero(di16), v);
939
+ }
940
+
941
+ HWY_INLINE svuint16_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
942
+ hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
943
+ const ScalableTag<uint16_t> du16;
944
+ const svbool_t pg = detail::PTrue(du16);
945
+ return svadalp_u16_x(pg, Zero(du16), v);
946
+ }
947
+
948
+ HWY_INLINE svint32_t SumsOf2(hwy::SignedTag /*type_tag*/,
949
+ hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
950
+ const ScalableTag<int32_t> di32;
951
+ const svbool_t pg = detail::PTrue(di32);
952
+ return svadalp_s32_x(pg, Zero(di32), v);
953
+ }
954
+
955
+ HWY_INLINE svuint32_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
956
+ hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
957
+ const ScalableTag<uint32_t> du32;
958
+ const svbool_t pg = detail::PTrue(du32);
959
+ return svadalp_u32_x(pg, Zero(du32), v);
960
+ }
961
+
962
+ HWY_INLINE svint64_t SumsOf2(hwy::SignedTag /*type_tag*/,
963
+ hwy::SizeTag<4> /*lane_size_tag*/, svint32_t v) {
964
+ const ScalableTag<int64_t> di64;
965
+ const svbool_t pg = detail::PTrue(di64);
966
+ return svadalp_s64_x(pg, Zero(di64), v);
967
+ }
968
+
969
+ HWY_INLINE svuint64_t SumsOf2(hwy::UnsignedTag /*type_tag*/,
970
+ hwy::SizeTag<4> /*lane_size_tag*/, svuint32_t v) {
971
+ const ScalableTag<uint64_t> du64;
972
+ const svbool_t pg = detail::PTrue(du64);
973
+ return svadalp_u64_x(pg, Zero(du64), v);
974
+ }
975
+
976
+ } // namespace detail
977
+ #endif // HWY_SVE_HAVE_2
978
+
979
+ // ------------------------------ SumsOf4
980
+ namespace detail {
981
+
982
+ HWY_INLINE svint32_t SumsOf4(hwy::SignedTag /*type_tag*/,
983
+ hwy::SizeTag<1> /*lane_size_tag*/, svint8_t v) {
984
+ return svdot_n_s32(Zero(ScalableTag<int32_t>()), v, 1);
985
+ }
986
+
987
+ HWY_INLINE svuint32_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
988
+ hwy::SizeTag<1> /*lane_size_tag*/, svuint8_t v) {
989
+ return svdot_n_u32(Zero(ScalableTag<uint32_t>()), v, 1);
990
+ }
991
+
992
+ HWY_INLINE svint64_t SumsOf4(hwy::SignedTag /*type_tag*/,
993
+ hwy::SizeTag<2> /*lane_size_tag*/, svint16_t v) {
994
+ return svdot_n_s64(Zero(ScalableTag<int64_t>()), v, 1);
995
+ }
996
+
997
+ HWY_INLINE svuint64_t SumsOf4(hwy::UnsignedTag /*type_tag*/,
998
+ hwy::SizeTag<2> /*lane_size_tag*/, svuint16_t v) {
999
+ return svdot_n_u64(Zero(ScalableTag<uint64_t>()), v, 1);
684
1000
  }
685
1001
 
1002
+ } // namespace detail
1003
+
686
1004
  // ------------------------------ SaturatedAdd
687
1005
 
688
1006
  #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
@@ -726,14 +1044,15 @@ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)
726
1044
 
727
1045
  // ------------------------------ ShiftLeft[Same]
728
1046
 
729
- #define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
730
- template <int kBits> \
731
- HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
732
- return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
733
- } \
734
- HWY_API HWY_SVE_V(BASE, BITS) \
735
- NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
736
- return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \
1047
+ #define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
1048
+ template <int kBits> \
1049
+ HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1050
+ return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
1051
+ } \
1052
+ HWY_API HWY_SVE_V(BASE, BITS) \
1053
+ NAME##Same(HWY_SVE_V(BASE, BITS) v, int bits) { \
1054
+ return sv##OP##_##CHAR##BITS##_x( \
1055
+ HWY_SVE_PTRUE(BITS), v, static_cast<HWY_SVE_T(uint, BITS)>(bits)); \
737
1056
  }
738
1057
 
739
1058
  HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)
@@ -747,15 +1066,35 @@ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)
747
1066
 
748
1067
  // ------------------------------ RotateRight
749
1068
 
750
- // TODO(janwas): svxar on SVE2
751
- template <int kBits, class V>
1069
+ #if HWY_SVE_HAVE_2
1070
+
1071
+ #define HWY_SVE_ROTATE_RIGHT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
1072
+ template <int kBits> \
1073
+ HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1074
+ if (kBits == 0) return v; \
1075
+ return sv##OP##_##CHAR##BITS(v, Zero(DFromV<decltype(v)>()), \
1076
+ HWY_MAX(kBits, 1)); \
1077
+ }
1078
+
1079
+ HWY_SVE_FOREACH_U(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
1080
+ HWY_SVE_FOREACH_I(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
1081
+
1082
+ #undef HWY_SVE_ROTATE_RIGHT_N
1083
+
1084
+ #else // !HWY_SVE_HAVE_2
1085
+ template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
752
1086
  HWY_API V RotateRight(const V v) {
1087
+ const DFromV<decltype(v)> d;
1088
+ const RebindToUnsigned<decltype(d)> du;
1089
+
753
1090
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
754
1091
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
755
1092
  if (kBits == 0) return v;
756
- return Or(ShiftRight<kBits>(v),
1093
+
1094
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
757
1095
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
758
1096
  }
1097
+ #endif
759
1098
 
760
1099
  // ------------------------------ Shl/r
761
1100
 
@@ -803,11 +1142,7 @@ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
803
1142
  HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Mul, mul)
804
1143
 
805
1144
  // ------------------------------ MulHigh
806
- HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
807
- // Not part of API, used internally:
808
- HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
809
- HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
810
- HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
1145
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
811
1146
 
812
1147
  // ------------------------------ MulFixedPoint15
813
1148
  HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
@@ -830,6 +1165,14 @@ HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
830
1165
  }
831
1166
 
832
1167
  // ------------------------------ Div
1168
+ #ifdef HWY_NATIVE_INT_DIV
1169
+ #undef HWY_NATIVE_INT_DIV
1170
+ #else
1171
+ #define HWY_NATIVE_INT_DIV
1172
+ #endif
1173
+
1174
+ HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, Div, div)
1175
+ HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGPVV, Div, div)
833
1176
  HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)
834
1177
 
835
1178
  // ------------------------------ ApproximateReciprocal
@@ -981,18 +1324,40 @@ HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
981
1324
  }
982
1325
 
983
1326
  HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
1327
+ HWY_SVE_FOREACH_BF16(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
984
1328
  #undef HWY_SVE_IF_THEN_ELSE
985
1329
 
1330
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
1331
+ HWY_API V IfThenElse(const svbool_t mask, V yes, V no) {
1332
+ const RebindToUnsigned<D> du;
1333
+ return BitCast(
1334
+ D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
1335
+ }
1336
+
986
1337
  // ------------------------------ IfThenElseZero
987
- template <class V>
1338
+
1339
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
988
1340
  HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
989
- return IfThenElse(mask, yes, Zero(DFromV<V>()));
1341
+ return IfThenElse(mask, yes, Zero(D()));
1342
+ }
1343
+
1344
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
1345
+ HWY_API V IfThenElseZero(const svbool_t mask, V yes) {
1346
+ const RebindToUnsigned<D> du;
1347
+ return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
990
1348
  }
991
1349
 
992
1350
  // ------------------------------ IfThenZeroElse
993
- template <class V>
1351
+
1352
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_NOT_EMULATED_D(D)>
994
1353
  HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
995
- return IfThenElse(mask, Zero(DFromV<V>()), no);
1354
+ return IfThenElse(mask, Zero(D()), no);
1355
+ }
1356
+
1357
+ template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
1358
+ HWY_API V IfThenZeroElse(const svbool_t mask, V no) {
1359
+ const RebindToUnsigned<D> du;
1360
+ return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
996
1361
  }
997
1362
 
998
1363
  // ------------------------------ Additional mask logical operations
@@ -1016,6 +1381,162 @@ HWY_API svbool_t SetAtOrAfterFirst(svbool_t m) {
1016
1381
  return Not(SetBeforeFirst(m));
1017
1382
  }
1018
1383
 
1384
+ // ------------------------------ PromoteMaskTo
1385
+
1386
+ #ifdef HWY_NATIVE_PROMOTE_MASK_TO
1387
+ #undef HWY_NATIVE_PROMOTE_MASK_TO
1388
+ #else
1389
+ #define HWY_NATIVE_PROMOTE_MASK_TO
1390
+ #endif
1391
+
1392
+ template <class DTo, class DFrom,
1393
+ HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) * 2)>
1394
+ HWY_API svbool_t PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
1395
+ return svunpklo_b(m);
1396
+ }
1397
+
1398
+ template <class DTo, class DFrom,
1399
+ HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>) * 2)>
1400
+ HWY_API svbool_t PromoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
1401
+ using TFrom = TFromD<DFrom>;
1402
+ using TWFrom = MakeWide<MakeUnsigned<TFrom>>;
1403
+ static_assert(sizeof(TWFrom) > sizeof(TFrom),
1404
+ "sizeof(TWFrom) > sizeof(TFrom) must be true");
1405
+
1406
+ const Rebind<TWFrom, decltype(d_from)> dw_from;
1407
+ return PromoteMaskTo(d_to, dw_from, PromoteMaskTo(dw_from, d_from, m));
1408
+ }
1409
+
1410
+ // ------------------------------ DemoteMaskTo
1411
+
1412
+ #ifdef HWY_NATIVE_DEMOTE_MASK_TO
1413
+ #undef HWY_NATIVE_DEMOTE_MASK_TO
1414
+ #else
1415
+ #define HWY_NATIVE_DEMOTE_MASK_TO
1416
+ #endif
1417
+
1418
+ template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 1),
1419
+ HWY_IF_T_SIZE_D(DFrom, 2)>
1420
+ HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
1421
+ return svuzp1_b8(m, m);
1422
+ }
1423
+
1424
+ template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 2),
1425
+ HWY_IF_T_SIZE_D(DFrom, 4)>
1426
+ HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
1427
+ return svuzp1_b16(m, m);
1428
+ }
1429
+
1430
+ template <class DTo, class DFrom, HWY_IF_T_SIZE_D(DTo, 4),
1431
+ HWY_IF_T_SIZE_D(DFrom, 8)>
1432
+ HWY_API svbool_t DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, svbool_t m) {
1433
+ return svuzp1_b32(m, m);
1434
+ }
1435
+
1436
+ template <class DTo, class DFrom,
1437
+ HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) / 4)>
1438
+ HWY_API svbool_t DemoteMaskTo(DTo d_to, DFrom d_from, svbool_t m) {
1439
+ using TFrom = TFromD<DFrom>;
1440
+ using TNFrom = MakeNarrow<MakeUnsigned<TFrom>>;
1441
+ static_assert(sizeof(TNFrom) < sizeof(TFrom),
1442
+ "sizeof(TNFrom) < sizeof(TFrom) must be true");
1443
+
1444
+ const Rebind<TNFrom, decltype(d_from)> dn_from;
1445
+ return DemoteMaskTo(d_to, dn_from, DemoteMaskTo(dn_from, d_from, m));
1446
+ }
1447
+
1448
+ // ------------------------------ LowerHalfOfMask
1449
+ #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
1450
+ #undef HWY_NATIVE_LOWER_HALF_OF_MASK
1451
+ #else
1452
+ #define HWY_NATIVE_LOWER_HALF_OF_MASK
1453
+ #endif
1454
+
1455
+ template <class D>
1456
+ HWY_API svbool_t LowerHalfOfMask(D /*d*/, svbool_t m) {
1457
+ return m;
1458
+ }
1459
+
1460
+ // ------------------------------ MaskedAddOr etc. (IfThenElse)
1461
+
1462
+ #ifdef HWY_NATIVE_MASKED_ARITH
1463
+ #undef HWY_NATIVE_MASKED_ARITH
1464
+ #else
1465
+ #define HWY_NATIVE_MASKED_ARITH
1466
+ #endif
1467
+
1468
+ namespace detail {
1469
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMin, min)
1470
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMax, max)
1471
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedAdd, add)
1472
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedSub, sub)
1473
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMul, mul)
1474
+ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
1475
+ HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
1476
+ HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
1477
+ #if HWY_SVE_HAVE_2
1478
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatAdd, qadd)
1479
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatSub, qsub)
1480
+ #endif
1481
+ } // namespace detail
1482
+
1483
+ template <class V, class M>
1484
+ HWY_API V MaskedMinOr(V no, M m, V a, V b) {
1485
+ return IfThenElse(m, detail::MaskedMin(m, a, b), no);
1486
+ }
1487
+
1488
+ template <class V, class M>
1489
+ HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
1490
+ return IfThenElse(m, detail::MaskedMax(m, a, b), no);
1491
+ }
1492
+
1493
+ template <class V, class M>
1494
+ HWY_API V MaskedAddOr(V no, M m, V a, V b) {
1495
+ return IfThenElse(m, detail::MaskedAdd(m, a, b), no);
1496
+ }
1497
+
1498
+ template <class V, class M>
1499
+ HWY_API V MaskedSubOr(V no, M m, V a, V b) {
1500
+ return IfThenElse(m, detail::MaskedSub(m, a, b), no);
1501
+ }
1502
+
1503
+ template <class V, class M>
1504
+ HWY_API V MaskedMulOr(V no, M m, V a, V b) {
1505
+ return IfThenElse(m, detail::MaskedMul(m, a, b), no);
1506
+ }
1507
+
1508
+ template <class V, class M,
1509
+ HWY_IF_T_SIZE_ONE_OF_V(
1510
+ V, (hwy::IsSame<TFromV<V>, hwy::float16_t>() ? (1 << 2) : 0) |
1511
+ (1 << 4) | (1 << 8))>
1512
+ HWY_API V MaskedDivOr(V no, M m, V a, V b) {
1513
+ return IfThenElse(m, detail::MaskedDiv(m, a, b), no);
1514
+ }
1515
+
1516
+ // I8/U8/I16/U16 MaskedDivOr is implemented after I8/U8/I16/U16 Div
1517
+
1518
+ #if HWY_SVE_HAVE_2
1519
+ template <class V, class M>
1520
+ HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
1521
+ return IfThenElse(m, detail::MaskedSatAdd(m, a, b), no);
1522
+ }
1523
+
1524
+ template <class V, class M>
1525
+ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
1526
+ return IfThenElse(m, detail::MaskedSatSub(m, a, b), no);
1527
+ }
1528
+ #else
1529
+ template <class V, class M>
1530
+ HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
1531
+ return IfThenElse(m, SaturatedAdd(a, b), no);
1532
+ }
1533
+
1534
+ template <class V, class M>
1535
+ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
1536
+ return IfThenElse(m, SaturatedSub(a, b), no);
1537
+ }
1538
+ #endif
1539
+
1019
1540
  // ================================================== COMPARE
1020
1541
 
1021
1542
  // mask = f(vector, vector)
@@ -1078,7 +1599,8 @@ HWY_API svbool_t TestBit(const V a, const V bit) {
1078
1599
  // ------------------------------ MaskFromVec (Ne)
1079
1600
  template <class V>
1080
1601
  HWY_API svbool_t MaskFromVec(const V v) {
1081
- return detail::NeN(v, static_cast<TFromV<V>>(0));
1602
+ using T = TFromV<V>;
1603
+ return detail::NeN(v, ConvertScalarTo<T>(0));
1082
1604
  }
1083
1605
 
1084
1606
  // ------------------------------ VecFromMask
@@ -1090,6 +1612,22 @@ HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
1090
1612
  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
1091
1613
  }
1092
1614
 
1615
+ // ------------------------------ IsNegative (Lt)
1616
+ #ifdef HWY_NATIVE_IS_NEGATIVE
1617
+ #undef HWY_NATIVE_IS_NEGATIVE
1618
+ #else
1619
+ #define HWY_NATIVE_IS_NEGATIVE
1620
+ #endif
1621
+
1622
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
1623
+ HWY_API svbool_t IsNegative(V v) {
1624
+ const DFromV<decltype(v)> d;
1625
+ const RebindToSigned<decltype(d)> di;
1626
+ using TI = TFromD<decltype(di)>;
1627
+
1628
+ return detail::LtN(BitCast(di, v), static_cast<TI>(0));
1629
+ }
1630
+
1093
1631
  // ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)
1094
1632
 
1095
1633
  #if HWY_SVE_HAVE_2
@@ -1159,14 +1697,27 @@ HWY_API svbool_t IsNaN(const V v) {
1159
1697
  return Ne(v, v); // could also use cmpuo
1160
1698
  }
1161
1699
 
1700
+ // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
1701
+ // We use a fused Set/comparison for IsFinite.
1702
+ #ifdef HWY_NATIVE_ISINF
1703
+ #undef HWY_NATIVE_ISINF
1704
+ #else
1705
+ #define HWY_NATIVE_ISINF
1706
+ #endif
1707
+
1162
1708
  template <class V>
1163
1709
  HWY_API svbool_t IsInf(const V v) {
1164
1710
  using T = TFromV<V>;
1165
1711
  const DFromV<decltype(v)> d;
1712
+ const RebindToUnsigned<decltype(d)> du;
1166
1713
  const RebindToSigned<decltype(d)> di;
1167
- const VFromD<decltype(di)> vi = BitCast(di, v);
1168
- // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1169
- return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
1714
+
1715
+ // 'Shift left' to clear the sign bit
1716
+ const VFromD<decltype(du)> vu = BitCast(du, v);
1717
+ const VFromD<decltype(du)> v2 = Add(vu, vu);
1718
+ // Check for exponent=max and mantissa=0.
1719
+ const VFromD<decltype(di)> max2 = Set(di, hwy::MaxExponentTimes2<T>());
1720
+ return RebindMask(d, Eq(v2, BitCast(du, max2)));
1170
1721
  }
1171
1722
 
1172
1723
  // Returns whether normal/subnormal/zero.
@@ -1187,147 +1738,135 @@ HWY_API svbool_t IsFinite(const V v) {
1187
1738
 
1188
1739
  // ================================================== MEMORY
1189
1740
 
1190
- // ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream
1741
+ // ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream
1191
1742
 
1192
- #define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
1193
- template <size_t N, int kPow2> \
1194
- HWY_API HWY_SVE_V(BASE, BITS) \
1195
- NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1196
- const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1197
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1198
- return sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
1199
- reinterpret_cast<const T*>(p)); \
1743
+ #define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \
1744
+ template <size_t N, int kPow2> \
1745
+ HWY_API HWY_SVE_V(BASE, BITS) \
1746
+ LoadU(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1747
+ const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1748
+ return svld1_##CHAR##BITS(detail::MakeMask(d), \
1749
+ detail::NativeLanePointer(p)); \
1750
+ } \
1751
+ template <size_t N, int kPow2> \
1752
+ HWY_API HWY_SVE_V(BASE, BITS) \
1753
+ MaskedLoad(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1754
+ const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1755
+ return svld1_##CHAR##BITS(m, detail::NativeLanePointer(p)); \
1756
+ } \
1757
+ template <size_t N, int kPow2> \
1758
+ HWY_API void StoreU(HWY_SVE_V(BASE, BITS) v, \
1759
+ HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1760
+ HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1761
+ svst1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), v); \
1762
+ } \
1763
+ template <size_t N, int kPow2> \
1764
+ HWY_API void Stream(HWY_SVE_V(BASE, BITS) v, \
1765
+ HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1766
+ HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1767
+ svstnt1_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), \
1768
+ v); \
1769
+ } \
1770
+ template <size_t N, int kPow2> \
1771
+ HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1772
+ HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1773
+ HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1774
+ svst1_##CHAR##BITS(m, detail::NativeLanePointer(p), v); \
1200
1775
  }
1201
1776
 
1202
- #define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
1203
- template <size_t N, int kPow2> \
1204
- HWY_API HWY_SVE_V(BASE, BITS) \
1205
- NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1206
- const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1207
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1208
- return sv##OP##_##CHAR##BITS(m, reinterpret_cast<const T*>(p)); \
1209
- }
1777
+ HWY_SVE_FOREACH(HWY_SVE_MEM, _, _)
1778
+ HWY_SVE_FOREACH_BF16(HWY_SVE_MEM, _, _)
1210
1779
 
1211
- #define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
1212
- template <size_t N, int kPow2> \
1213
- HWY_API HWY_SVE_V(BASE, BITS) \
1214
- NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1215
- const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1216
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1217
- /* All-true predicate to load all 128 bits. */ \
1218
- return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), \
1219
- reinterpret_cast<const T*>(p)); \
1220
- }
1780
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1781
+ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1782
+ const RebindToUnsigned<decltype(d)> du;
1783
+ return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
1784
+ }
1221
1785
 
1222
- #define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
1223
- template <size_t N, int kPow2> \
1224
- HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
1225
- HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1226
- HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1227
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1228
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), reinterpret_cast<T*>(p), v); \
1229
- }
1786
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1787
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1788
+ const RebindToUnsigned<decltype(d)> du;
1789
+ StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
1790
+ }
1230
1791
 
1231
- #define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
1232
- template <size_t N, int kPow2> \
1233
- HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1234
- HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1235
- HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1236
- using T = detail::NativeLaneType<HWY_SVE_T(BASE, BITS)>; \
1237
- sv##OP##_##CHAR##BITS(m, reinterpret_cast<T*>(p), v); \
1238
- }
1792
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1793
+ HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
1794
+ const TFromD<D>* HWY_RESTRICT p) {
1795
+ const RebindToUnsigned<decltype(d)> du;
1796
+ return BitCast(d,
1797
+ MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
1798
+ }
1799
+
1800
+ // MaskedLoadOr is generic and does not require emulation.
1239
1801
 
1240
- HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1)
1241
- HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
1242
- HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1)
1243
- HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1)
1244
- HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1)
1802
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1803
+ HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1804
+ TFromD<D>* HWY_RESTRICT p) {
1805
+ const RebindToUnsigned<decltype(d)> du;
1806
+ BlendedStore(BitCast(du, v), RebindMask(du, m), du,
1807
+ detail::U16LanePointer(p));
1808
+ }
1245
1809
 
1246
- HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD, Load, ld1)
1247
- HWY_SVE_FOREACH_BF16(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
1248
- HWY_SVE_FOREACH_BF16(HWY_SVE_STORE, Store, st1)
1249
- HWY_SVE_FOREACH_BF16(HWY_SVE_STORE, Stream, stnt1)
1250
- HWY_SVE_FOREACH_BF16(HWY_SVE_BLENDED_STORE, BlendedStore, st1)
1810
+ #undef HWY_SVE_MEM
1251
1811
 
1252
1812
  #if HWY_TARGET != HWY_SVE2_128
1253
1813
  namespace detail {
1254
- HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
1255
- } // namespace detail
1256
- #endif // HWY_TARGET != HWY_SVE2_128
1257
-
1258
- #undef HWY_SVE_LOAD
1259
- #undef HWY_SVE_MASKED_LOAD
1260
- #undef HWY_SVE_LOAD_DUP128
1261
- #undef HWY_SVE_STORE
1262
- #undef HWY_SVE_BLENDED_STORE
1814
+ #define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
1815
+ template <size_t N, int kPow2> \
1816
+ HWY_API HWY_SVE_V(BASE, BITS) \
1817
+ NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
1818
+ const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
1819
+ /* All-true predicate to load all 128 bits. */ \
1820
+ return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), \
1821
+ detail::NativeLanePointer(p)); \
1822
+ }
1263
1823
 
1264
- #if !HWY_SVE_HAVE_BFLOAT16
1824
+ HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
1825
+ HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
1265
1826
 
1266
- template <size_t N, int kPow2>
1267
- HWY_API VBF16 Load(Simd<bfloat16_t, N, kPow2> d,
1268
- const bfloat16_t* HWY_RESTRICT p) {
1269
- return Load(RebindToUnsigned<decltype(d)>(),
1270
- reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
1827
+ template <class D, HWY_SVE_IF_EMULATED_D(D)>
1828
+ HWY_API VFromD<D> LoadDupFull128(D d, const TFromD<D>* HWY_RESTRICT p) {
1829
+ const RebindToUnsigned<decltype(d)> du;
1830
+ return BitCast(d, LoadDupFull128(du, detail::U16LanePointer(p)));
1271
1831
  }
1272
1832
 
1273
- #endif // !HWY_SVE_HAVE_BFLOAT16
1833
+ } // namespace detail
1834
+ #endif // HWY_TARGET != HWY_SVE2_128
1274
1835
 
1275
1836
  #if HWY_TARGET == HWY_SVE2_128
1276
- // On the HWY_SVE2_128 target, LoadDup128 is the same as Load since vectors
1837
+ // On the HWY_SVE2_128 target, LoadDup128 is the same as LoadU since vectors
1277
1838
  // cannot exceed 16 bytes on the HWY_SVE2_128 target.
1278
1839
  template <class D>
1279
1840
  HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1280
- return Load(d, p);
1841
+ return LoadU(d, p);
1281
1842
  }
1282
1843
  #else // HWY_TARGET != HWY_SVE2_128
1283
- // If D().MaxBytes() <= 16 is true, simply do a Load operation.
1844
+ // If D().MaxBytes() <= 16 is true, simply do a LoadU operation.
1284
1845
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
1285
1846
  HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1286
- return Load(d, p);
1847
+ return LoadU(d, p);
1287
1848
  }
1288
1849
 
1289
1850
  // If D().MaxBytes() > 16 is true, need to load the vector using ld1rq
1290
- template <class D, HWY_IF_V_SIZE_GT_D(D, 16),
1291
- hwy::EnableIf<!IsSame<TFromD<D>, bfloat16_t>()>* = nullptr>
1851
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
1292
1852
  HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
1293
1853
  return detail::LoadDupFull128(d, p);
1294
1854
  }
1295
1855
 
1296
- #if !HWY_SVE_HAVE_BFLOAT16
1297
-
1298
- template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_BF16_D(D)>
1299
- HWY_API VBF16 LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) {
1300
- return detail::LoadDupFull128(
1301
- RebindToUnsigned<decltype(d)>(),
1302
- reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
1303
- }
1304
- #endif // !HWY_SVE_HAVE_BFLOAT16
1305
-
1306
1856
  #endif // HWY_TARGET != HWY_SVE2_128
1307
1857
 
1308
- #if !HWY_SVE_HAVE_BFLOAT16
1309
-
1310
- template <size_t N, int kPow2>
1311
- HWY_API void Store(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
1312
- bfloat16_t* HWY_RESTRICT p) {
1313
- Store(v, RebindToUnsigned<decltype(d)>(),
1314
- reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
1315
- }
1316
-
1317
- #endif
1318
-
1319
- // ------------------------------ Load/StoreU
1858
+ // ------------------------------ Load/Store
1320
1859
 
1321
1860
  // SVE only requires lane alignment, not natural alignment of the entire
1322
- // vector.
1861
+ // vector, so Load/Store are the same as LoadU/StoreU.
1323
1862
  template <class D>
1324
- HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1325
- return Load(d, p);
1863
+ HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
1864
+ return LoadU(d, p);
1326
1865
  }
1327
1866
 
1328
1867
  template <class V, class D>
1329
- HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1330
- Store(v, d, p);
1868
+ HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1869
+ StoreU(v, d, p);
1331
1870
  }
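// Minimal sketch (assumes "hwy/highway.h", HWY_NAMESPACE scope, and that
// count is a multiple of Lanes(d); the helper name is illustrative): because
// SVE only requires lane alignment, Load/Store and LoadU/StoreU can be mixed
// freely on this target.
void CopyF32(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
             size_t count) {
  const ScalableTag<float> d;
  for (size_t i = 0; i < count; i += Lanes(d)) {
    Store(Load(d, in + i), d, out + i);  // same behavior as LoadU/StoreU here
  }
}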
1332
1871
 
1333
1872
  // ------------------------------ MaskedLoadOr
@@ -1362,8 +1901,8 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
1362
1901
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
1363
1902
  HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, \
1364
1903
  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1365
- HWY_SVE_V(int, BITS) index) { \
1366
- sv##OP##_s##BITS##index_##CHAR##BITS(m, base, index, v); \
1904
+ HWY_SVE_V(int, BITS) indices) { \
1905
+ sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices, v); \
1367
1906
  }
1368
1907
 
1369
1908
  HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
@@ -1398,10 +1937,13 @@ HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
1398
1937
  #define HWY_SVE_MASKED_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
1399
1938
  template <size_t N, int kPow2> \
1400
1939
  HWY_API HWY_SVE_V(BASE, BITS) \
1401
- NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, \
1940
+ NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1402
1941
  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
1403
- HWY_SVE_V(int, BITS) index) { \
1404
- return sv##OP##_s##BITS##index_##CHAR##BITS(m, base, index); \
1942
+ HWY_SVE_V(int, BITS) indices) { \
1943
+ const RebindToSigned<decltype(d)> di; \
1944
+ (void)di; /* for HWY_DASSERT */ \
1945
+ HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
1946
+ return sv##OP##_s##BITS##index_##CHAR##BITS(m, base, indices); \
1405
1947
  }
1406
1948
 
1407
1949
  HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
@@ -1410,6 +1952,13 @@ HWY_SVE_FOREACH_UIF3264(HWY_SVE_MASKED_GATHER_INDEX, MaskedGatherIndex,
1410
1952
  #undef HWY_SVE_GATHER_OFFSET
1411
1953
  #undef HWY_SVE_MASKED_GATHER_INDEX
1412
1954
 
1955
+ template <class D>
1956
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, svbool_t m, D d,
1957
+ const TFromD<D>* HWY_RESTRICT p,
1958
+ VFromD<RebindToSigned<D>> indices) {
1959
+ return IfThenElse(m, MaskedGatherIndex(m, d, p, indices), no);
1960
+ }
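// Minimal sketch of MaskedGatherIndexOr (assumes "hwy/highway.h",
// HWY_NAMESPACE scope, and non-negative indices as required by the assertion
// above; names are illustrative): inactive lanes receive the fallback value
// instead of loading.
void GatherHalf(const int32_t* HWY_RESTRICT base,
                const int32_t* HWY_RESTRICT idx, int32_t* HWY_RESTRICT out) {
  const ScalableTag<int32_t> d;
  const auto indices = LoadU(d, idx);
  const svbool_t m = FirstN(d, Lanes(d) / 2);  // gather only the first half
  StoreU(MaskedGatherIndexOr(Set(d, -1), m, d, base, indices), d, out);
}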
1961
+
1413
1962
  template <class D>
1414
1963
  HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
1415
1964
  VFromD<RebindToSigned<D>> indices) {
@@ -1430,8 +1979,8 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
1430
1979
  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1431
1980
  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1432
1981
  HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
1433
- const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = \
1434
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
1982
+ const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = sv##OP##_##CHAR##BITS( \
1983
+ detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
1435
1984
  v0 = svget2(tuple, 0); \
1436
1985
  v1 = svget2(tuple, 1); \
1437
1986
  }
@@ -1447,8 +1996,8 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
1447
1996
  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1448
1997
  HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
1449
1998
  HWY_SVE_V(BASE, BITS) & v2) { \
1450
- const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = \
1451
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
1999
+ const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = sv##OP##_##CHAR##BITS( \
2000
+ detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
1452
2001
  v0 = svget3(tuple, 0); \
1453
2002
  v1 = svget3(tuple, 1); \
1454
2003
  v2 = svget3(tuple, 2); \
@@ -1465,8 +2014,8 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
1465
2014
  const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \
1466
2015
  HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
1467
2016
  HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
1468
- const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = \
1469
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \
2017
+ const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = sv##OP##_##CHAR##BITS( \
2018
+ detail::MakeMask(d), detail::NativeLanePointer(unaligned)); \
1470
2019
  v0 = svget4(tuple, 0); \
1471
2020
  v1 = svget4(tuple, 1); \
1472
2021
  v2 = svget4(tuple, 2); \
@@ -1478,12 +2027,14 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
1478
2027
 
1479
2028
  // ------------------------------ StoreInterleaved2
1480
2029
 
1481
- #define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
1482
- template <size_t N, int kPow2> \
1483
- HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
1484
- HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1485
- HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1486
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, Create2(d, v0, v1)); \
2030
+ #define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \
2031
+ template <size_t N, int kPow2> \
2032
+ HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
2033
+ HWY_SVE_D(BASE, BITS, N, kPow2) d, \
2034
+ HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
2035
+ sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2036
+ detail::NativeLanePointer(unaligned), \
2037
+ Create2(d, v0, v1)); \
1487
2038
  }
1488
2039
  HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
1489
2040
 
@@ -1497,7 +2048,8 @@ HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
1497
2048
  HWY_SVE_V(BASE, BITS) v2, \
1498
2049
  HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1499
2050
  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1500
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, \
2051
+ sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2052
+ detail::NativeLanePointer(unaligned), \
1501
2053
  Create3(d, v0, v1, v2)); \
1502
2054
  }
1503
2055
  HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
@@ -1512,7 +2064,8 @@ HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
1512
2064
  HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
1513
2065
  HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1514
2066
  HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1515
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, \
2067
+ sv##OP##_##CHAR##BITS(detail::MakeMask(d), \
2068
+ detail::NativeLanePointer(unaligned), \
1516
2069
  Create4(d, v0, v1, v2, v3)); \
1517
2070
  }
1518
2071
  HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)
@@ -1602,6 +2155,22 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
1602
2155
  return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
1603
2156
  }
1604
2157
 
2158
+ #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
2159
+ #undef HWY_NATIVE_PROMOTE_F16_TO_F64
2160
+ #else
2161
+ #define HWY_NATIVE_PROMOTE_F16_TO_F64
2162
+ #endif
2163
+
2164
+ template <size_t N, int kPow2>
2165
+ HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
2166
+ const svfloat16_t v) {
2167
+ // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
2168
+ // first replicate each lane once.
2169
+ const svfloat16_t vv = detail::ZipLowerSame(v, v);
2170
+ return svcvt_f64_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()),
2171
+ detail::ZipLowerSame(vv, vv));
2172
+ }
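// Minimal usage sketch (assumes "hwy/highway.h" and HWY_NAMESPACE scope; the
// helper name is illustrative): the overload above enables a direct
// f16 -> f64 promotion. `in` must hold at least Lanes(d64) float16_t values.
void WidenF16ToF64(const float16_t* HWY_RESTRICT in,
                   double* HWY_RESTRICT out) {
  const ScalableTag<double> d64;
  const Rebind<float16_t, decltype(d64)> d16;  // same lane count, f16 lanes
  StoreU(PromoteTo(d64, LoadU(d16, in)), d64, out);
}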
2173
+
1605
2174
  template <size_t N, int kPow2>
1606
2175
  HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
1607
2176
  const svfloat32_t v) {
@@ -1637,19 +2206,43 @@ HWY_API svuint64_t PromoteTo(Simd<uint64_t, N, kPow2> /* d */,
1637
2206
  return svcvt_u64_f32_x(detail::PTrue(Simd<float, N, kPow2>()), vv);
1638
2207
  }
1639
2208
 
1640
- // For 16-bit Compress
2209
+ // ------------------------------ PromoteUpperTo
2210
+
1641
2211
  namespace detail {
2212
+ HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
1642
2213
  HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
2214
+ HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
1643
2215
  #undef HWY_SVE_PROMOTE_TO
2216
+ } // namespace detail
1644
2217
 
1645
- template <size_t N, int kPow2>
1646
- HWY_API svfloat32_t PromoteUpperTo(Simd<float, N, kPow2> df, svfloat16_t v) {
1647
- const RebindToUnsigned<decltype(df)> du;
1648
- const RepartitionToNarrow<decltype(du)> dn;
1649
- return BitCast(df, PromoteUpperTo(du, BitCast(dn, v)));
2218
+ #ifdef HWY_NATIVE_PROMOTE_UPPER_TO
2219
+ #undef HWY_NATIVE_PROMOTE_UPPER_TO
2220
+ #else
2221
+ #define HWY_NATIVE_PROMOTE_UPPER_TO
2222
+ #endif
2223
+
2224
+ // Unsigned->Unsigned or Signed->Signed
2225
+ template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
2226
+ hwy::EnableIf<IsInteger<TD>() && IsInteger<TV>() &&
2227
+ (IsSigned<TD>() == IsSigned<TV>())>* = nullptr>
2228
+ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
2229
+ if (detail::IsFull(d)) {
2230
+ return detail::PromoteUpperTo(d, v);
2231
+ }
2232
+ const Rebind<TFromV<V>, decltype(d)> dh;
2233
+ return PromoteTo(d, UpperHalf(dh, v));
1650
2234
  }
1651
2235
 
1652
- } // namespace detail
2236
+ // Differing signs or either is float
2237
+ template <class D, class V, typename TD = TFromD<D>, typename TV = TFromV<V>,
2238
+ hwy::EnableIf<!IsInteger<TD>() || !IsInteger<TV>() ||
2239
+ (IsSigned<TD>() != IsSigned<TV>())>* = nullptr>
2240
+ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
2241
+ // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
2242
+ // because it cannot be deduced from D (could be either bf16 or f16).
2243
+ const Rebind<TFromV<V>, decltype(d)> dh;
2244
+ return PromoteTo(d, UpperHalf(dh, v));
2245
+ }
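// Minimal usage sketch (assumes "hwy/highway.h" and HWY_NAMESPACE scope; the
// helper name is illustrative): widen the upper half of a u8 vector to u16 in
// one op instead of an explicit UpperHalf followed by PromoteTo.
void WidenUpperHalf(const uint8_t* HWY_RESTRICT in,
                    uint16_t* HWY_RESTRICT out) {
  const ScalableTag<uint8_t> d8;
  const RepartitionToWide<decltype(d8)> d16;  // half as many lanes, u16
  StoreU(PromoteUpperTo(d16, LoadU(d8, in)), d16, out);
}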
1653
2246
 
1654
2247
  // ------------------------------ DemoteTo U
1655
2248
 
@@ -1959,6 +2552,29 @@ HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint64_t v) {
1959
2552
  return TruncateTo(dn, vn);
1960
2553
  }
1961
2554
 
2555
+ // ------------------------------ Unsigned to signed demotions
2556
+
2557
+ // Disable the default unsigned to signed DemoteTo/ReorderDemote2To
2558
+ // implementations in generic_ops-inl.h on SVE/SVE2 as the SVE/SVE2 targets have
2559
+ // target-specific implementations of the unsigned to signed DemoteTo and
2560
+ // ReorderDemote2To ops
2561
+
2562
+ // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
2563
+ // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
2564
+ // !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
2565
+ // SFINAE to occur instead of a hard error due to a dependency on the V template
2566
+ // argument
2567
+ #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
2568
+ #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
2569
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
2570
+
2571
+ template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
2572
+ HWY_IF_T_SIZE_LE_D(D, sizeof(TFromV<V>) - 1)>
2573
+ HWY_API VFromD<D> DemoteTo(D dn, V v) {
2574
+ const RebindToUnsigned<D> dn_u;
2575
+ return BitCast(dn, TruncateTo(dn_u, detail::SaturateU<TFromD<D>>(v)));
2576
+ }
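// Minimal usage sketch (assumes "hwy/highway.h" and HWY_NAMESPACE scope; the
// helper name is illustrative): saturating u16 -> i8 demotion, so values
// above 127 clamp to 127.
void DemoteU16ToI8(const uint16_t* HWY_RESTRICT in, int8_t* HWY_RESTRICT out) {
  const ScalableTag<uint16_t> d16;
  const Rebind<int8_t, decltype(d16)> d8;  // same lane count, narrower lanes
  StoreU(DemoteTo(d8, LoadU(d16, in)), d8, out);
}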
2577
+
1962
2578
  // ------------------------------ ConcatEven/ConcatOdd
1963
2579
 
1964
2580
  // WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
@@ -1972,10 +2588,22 @@ namespace detail {
1972
2588
  }
1973
2589
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
1974
2590
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
2591
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2592
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull,
2593
+ uzp1)
2594
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull,
2595
+ uzp2)
2596
+ #endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
1975
2597
  #if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
1976
2598
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
1977
2599
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
1978
- #endif
2600
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2601
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
2602
+ ConcatEvenBlocks, uzp1q)
2603
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks,
2604
+ uzp2q)
2605
+ #endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2606
+ #endif // defined(__ARM_FEATURE_SVE_MATMUL_FP64)
1979
2607
  #undef HWY_SVE_CONCAT_EVERY_SECOND
1980
2608
 
1981
2609
  // Used to slide up / shift whole register left; mask indicates which range
@@ -1986,6 +2614,16 @@ HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
1986
2614
  return sv##OP##_##CHAR##BITS(mask, lo, hi); \
1987
2615
  }
1988
2616
  HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
2617
+ #if HWY_SVE_HAVE_BF16_FEATURE
2618
+ HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice)
2619
+ #else
2620
+ template <class V, HWY_IF_BF16_D(DFromV<V>)>
2621
+ HWY_INLINE V Splice(V hi, V lo, svbool_t mask) {
2622
+ const DFromV<V> d;
2623
+ const RebindToUnsigned<decltype(d)> du;
2624
+ return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask));
2625
+ }
2626
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
1989
2627
  #undef HWY_SVE_SPLICE
1990
2628
 
1991
2629
  } // namespace detail
@@ -2010,6 +2648,18 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
2010
2648
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
2011
2649
  }
2012
2650
 
2651
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
2652
+
2653
+ // Signed to signed PromoteEvenTo: 1 instruction instead of 2 in generic-inl.h.
2654
+ // Might as well also enable unsigned to unsigned, though it is just an And.
2655
+ namespace detail {
2656
+ HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, extb)
2657
+ HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, exth)
2658
+ HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, extw)
2659
+ } // namespace detail
2660
+
2661
+ #include "hwy/ops/inside-inl.h"
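// Minimal usage sketch of PromoteEvenTo (assumes "hwy/highway.h" and
// HWY_NAMESPACE scope; the helper name is illustrative): sign-extend the even
// i16 lanes to i32 in place; odd input lanes are ignored.
void EvenI16ToI32(const int16_t* HWY_RESTRICT in, int32_t* HWY_RESTRICT out) {
  const ScalableTag<int32_t> d32;
  const Repartition<int16_t, decltype(d32)> d16;  // twice as many i16 lanes
  StoreU(PromoteEvenTo(d32, LoadU(d16, in)), d32, out);
}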
2662
+
2013
2663
  // ------------------------------ DemoteTo F
2014
2664
 
2015
2665
  // We already toggled HWY_NATIVE_F16C above.
@@ -2021,10 +2671,60 @@ HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
2021
2671
  in_even); // lower half
2022
2672
  }
2023
2673
 
2674
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
2675
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
2676
+ #else
2677
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
2678
+ #endif
2679
+
2680
+ template <size_t N, int kPow2>
2681
+ HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat64_t v) {
2682
+ const svfloat16_t in_lo16 = svcvt_f16_f64_x(detail::PTrue(d), v);
2683
+ const svfloat16_t in_even = detail::ConcatEvenFull(in_lo16, in_lo16);
2684
+ return detail::ConcatEvenFull(in_even,
2685
+ in_even); // lower half
2686
+ }
2687
+
2688
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
2689
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
2690
+ #else
2691
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
2692
+ #endif
2693
+
2694
+ #if !HWY_SVE_HAVE_F32_TO_BF16C
2695
+ namespace detail {
2696
+
2697
+ // Round a F32 value to the nearest BF16 value, with the result returned as the
2698
+ // rounded F32 value bitcasted to an U32
2699
+
2700
+ // RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
2701
+ // NaN F32 values from being converted to an infinity
2702
+ HWY_INLINE svuint32_t RoundF32ForDemoteToBF16(svfloat32_t v) {
2703
+ const DFromV<decltype(v)> df32;
2704
+ const RebindToUnsigned<decltype(df32)> du32;
2705
+
2706
+ const auto is_non_nan = Eq(v, v);
2707
+ const auto bits32 = BitCast(du32, v);
2708
+
2709
+ const auto round_incr =
2710
+ detail::AddN(detail::AndN(ShiftRight<16>(bits32), 1u), 0x7FFFu);
2711
+ return MaskedAddOr(detail::OrN(bits32, 0x00400000u), is_non_nan, bits32,
2712
+ round_incr);
2713
+ }
2714
+
2715
+ } // namespace detail
2716
+ #endif // !HWY_SVE_HAVE_F32_TO_BF16C
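// Scalar sketch of the same rounding rule used above (round to nearest, ties
// to even, with NaN inputs quieted instead of rounded); assumes <cstdint> and
// <cstring> and is for illustration only, not part of the library.
static inline uint16_t F32ToBF16Bits(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  if ((bits & 0x7FFFFFFFu) > 0x7F800000u) {
    bits |= 0x00400000u;                    // NaN: set the quiet bit, no round
  } else {
    bits += 0x7FFFu + ((bits >> 16) & 1u);  // round to nearest, ties to even
  }
  return static_cast<uint16_t>(bits >> 16);  // keep the high (BF16) half
}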
2717
+
2024
2718
  template <size_t N, int kPow2>
2025
2719
  HWY_API VBF16 DemoteTo(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t v) {
2026
- const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
2027
- return BitCast(dbf16, detail::ConcatOddFull(in_even, in_even)); // lower half
2720
+ #if HWY_SVE_HAVE_F32_TO_BF16C
2721
+ const VBF16 in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), v);
2722
+ return detail::ConcatEvenFull(in_even, in_even);
2723
+ #else
2724
+ const svuint16_t in_odd =
2725
+ BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(v));
2726
+ return BitCast(dbf16, detail::ConcatOddFull(in_odd, in_odd)); // lower half
2727
+ #endif
2028
2728
  }
2029
2729
 
2030
2730
  template <size_t N, int kPow2>
@@ -2065,32 +2765,31 @@ HWY_API svfloat32_t DemoteTo(Simd<float, N, kPow2> d, const svuint64_t v) {
2065
2765
  // ------------------------------ ConvertTo F
2066
2766
 
2067
2767
  #define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
2068
- /* signed integers */ \
2768
+ /* Float from signed */ \
2069
2769
  template <size_t N, int kPow2> \
2070
2770
  HWY_API HWY_SVE_V(BASE, BITS) \
2071
2771
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
2072
2772
  return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2073
2773
  } \
2074
- /* unsigned integers */ \
2774
+ /* Float from unsigned */ \
2075
2775
  template <size_t N, int kPow2> \
2076
2776
  HWY_API HWY_SVE_V(BASE, BITS) \
2077
2777
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
2078
2778
  return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2079
2779
  } \
2080
- /* Truncates (rounds toward zero). */ \
2780
+ /* Signed from float, rounding toward zero */ \
2081
2781
  template <size_t N, int kPow2> \
2082
2782
  HWY_API HWY_SVE_V(int, BITS) \
2083
2783
  NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
2084
2784
  return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2085
2785
  } \
2086
- /* Truncates to unsigned (rounds toward zero). */ \
2786
+ /* Unsigned from float, rounding toward zero */ \
2087
2787
  template <size_t N, int kPow2> \
2088
2788
  HWY_API HWY_SVE_V(uint, BITS) \
2089
2789
  NAME(HWY_SVE_D(uint, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
2090
2790
  return sv##OP##_u##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2091
2791
  }
2092
2792
 
2093
- // API only requires f32 but we provide f64 for use by Iota.
2094
2793
  HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
2095
2794
  #undef HWY_SVE_CONVERT
2096
2795
 
@@ -2103,20 +2802,22 @@ HWY_API VFromD<DI> NearestInt(VF v) {
2103
2802
 
2104
2803
  // ------------------------------ Iota (Add, ConvertTo)
2105
2804
 
2106
- #define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
2107
- template <size_t N, int kPow2> \
2108
- HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
2109
- HWY_SVE_T(BASE, BITS) first) { \
2110
- return sv##OP##_##CHAR##BITS(first, 1); \
2805
+ #define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
2806
+ template <size_t N, int kPow2, typename T2> \
2807
+ HWY_API HWY_SVE_V(BASE, BITS) \
2808
+ NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, T2 first) { \
2809
+ return sv##OP##_##CHAR##BITS( \
2810
+ ConvertScalarTo<HWY_SVE_T(BASE, BITS)>(first), 1); \
2111
2811
  }
2112
2812
 
2113
2813
  HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
2114
2814
  #undef HWY_SVE_IOTA
2115
2815
 
2116
- template <class D, HWY_IF_FLOAT_D(D)>
2117
- HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
2816
+ template <class D, typename T2, HWY_IF_FLOAT_D(D)>
2817
+ HWY_API VFromD<D> Iota(const D d, T2 first) {
2118
2818
  const RebindToSigned<D> di;
2119
- return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
2819
+ return detail::AddN(ConvertTo(d, Iota(di, 0)),
2820
+ ConvertScalarTo<TFromD<D>>(first));
2120
2821
  }
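// Minimal usage sketch (assumes "hwy/highway.h" and HWY_NAMESPACE scope; the
// helper name is illustrative): the widened Iota overload accepts any
// arithmetic starting value, so an integer literal now works for float lanes.
void FillRamp(float* HWY_RESTRICT out) {
  const ScalableTag<float> d;
  StoreU(Iota(d, 1), d, out);  // 1.0f, 2.0f, 3.0f, ...
}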
2121
2822
 
2122
2823
  // ------------------------------ InterleaveLower
@@ -2147,12 +2848,10 @@ HWY_API V InterleaveLower(const V a, const V b) {
2147
2848
 
2148
2849
  // Only use zip2 if vector are a powers of two, otherwise getting the actual
2149
2850
  // "upper half" requires MaskUpperHalf.
2150
- #if HWY_TARGET == HWY_SVE2_128
2151
2851
  namespace detail {
2152
2852
  // Unlike Highway's ZipUpper, this returns the same type.
2153
2853
  HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
2154
2854
  } // namespace detail
2155
- #endif
2156
2855
 
2157
2856
  // Full vector: guaranteed to have at least one block
2158
2857
  template <class D, class V = VFromD<D>,
@@ -2184,6 +2883,30 @@ HWY_API V InterleaveUpper(D d, const V a, const V b) {
2184
2883
  return InterleaveUpper(DFromV<V>(), a, b);
2185
2884
  }
2186
2885
 
2886
+ // ------------------------------ InterleaveWholeLower
2887
+ #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
2888
+ #undef HWY_NATIVE_INTERLEAVE_WHOLE
2889
+ #else
2890
+ #define HWY_NATIVE_INTERLEAVE_WHOLE
2891
+ #endif
2892
+
2893
+ template <class D>
2894
+ HWY_API VFromD<D> InterleaveWholeLower(D /*d*/, VFromD<D> a, VFromD<D> b) {
2895
+ return detail::ZipLowerSame(a, b);
2896
+ }
2897
+
2898
+ // ------------------------------ InterleaveWholeUpper
2899
+
2900
+ template <class D>
2901
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
2902
+ if (HWY_SVE_IS_POW2 && detail::IsFull(d)) {
2903
+ return detail::ZipUpperSame(a, b);
2904
+ }
2905
+
2906
+ const Half<decltype(d)> d2;
2907
+ return InterleaveWholeLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
2908
+ }
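// Minimal usage sketch (assumes "hwy/highway.h" and HWY_NAMESPACE scope; the
// helper name is illustrative): unlike InterleaveLower/Upper, these interleave
// across the whole vector rather than within each 128-bit block.
void InterleaveBytes(const uint8_t* HWY_RESTRICT a,
                     const uint8_t* HWY_RESTRICT b, uint8_t* HWY_RESTRICT out) {
  const ScalableTag<uint8_t> d;
  const auto va = LoadU(d, a);
  const auto vb = LoadU(d, b);
  StoreU(InterleaveWholeLower(d, va, vb), d, out);             // a0,b0,a1,b1..
  StoreU(InterleaveWholeUpper(d, va, vb), d, out + Lanes(d));  // upper halves
}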
2909
+
2187
2910
  // ------------------------------ Per4LaneBlockShuffle
2188
2911
 
2189
2912
  namespace detail {
@@ -2432,7 +3155,13 @@ HWY_API V UpperHalf(const DH dh, const V v) {
2432
3155
 
2433
3156
  // ================================================== REDUCE
2434
3157
 
2435
- // These return T, whereas the Highway op returns a broadcasted vector.
3158
+ #ifdef HWY_NATIVE_REDUCE_SCALAR
3159
+ #undef HWY_NATIVE_REDUCE_SCALAR
3160
+ #else
3161
+ #define HWY_NATIVE_REDUCE_SCALAR
3162
+ #endif
3163
+
3164
+ // These return T, suitable for ReduceSum.
2436
3165
  namespace detail {
2437
3166
  #define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
2438
3167
  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
@@ -2462,24 +3191,53 @@ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
2462
3191
  #undef HWY_SVE_REDUCE_ADD
2463
3192
  } // namespace detail
2464
3193
 
2465
- template <class D, class V>
2466
- V SumOfLanes(D d, V v) {
2467
- return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v));
2468
- }
3194
+ // detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM is more
3195
+ // efficient for N=4 I8/U8 reductions on SVE than the default implementations
3196
+ // of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in
3197
+ // generic_ops-inl.h
3198
+ #undef HWY_IF_REDUCE_D
3199
+ #define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
2469
3200
 
2470
- template <class D, class V>
2471
- TFromV<V> ReduceSum(D d, V v) {
3201
+ #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
3202
+ #undef HWY_NATIVE_REDUCE_SUM_4_UI8
3203
+ #else
3204
+ #define HWY_NATIVE_REDUCE_SUM_4_UI8
3205
+ #endif
3206
+
3207
+ #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
3208
+ #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
3209
+ #else
3210
+ #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
3211
+ #endif
3212
+
3213
+ template <class D, HWY_IF_REDUCE_D(D)>
3214
+ HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
2472
3215
  return detail::SumOfLanesM(detail::MakeMask(d), v);
2473
3216
  }
2474
3217
 
2475
- template <class D, class V>
2476
- V MinOfLanes(D d, V v) {
2477
- return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v));
3218
+ template <class D, HWY_IF_REDUCE_D(D)>
3219
+ HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
3220
+ return detail::MinOfLanesM(detail::MakeMask(d), v);
2478
3221
  }
2479
3222
 
2480
- template <class D, class V>
2481
- V MaxOfLanes(D d, V v) {
2482
- return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v));
3223
+ template <class D, HWY_IF_REDUCE_D(D)>
3224
+ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
3225
+ return detail::MaxOfLanesM(detail::MakeMask(d), v);
3226
+ }
3227
+
3228
+ // ------------------------------ SumOfLanes
3229
+
3230
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
3231
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
3232
+ return Set(d, ReduceSum(d, v));
3233
+ }
3234
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
3235
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
3236
+ return Set(d, ReduceMin(d, v));
3237
+ }
3238
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
3239
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
3240
+ return Set(d, ReduceMax(d, v));
2483
3241
  }
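// Minimal usage sketch (assumes "hwy/highway.h", HWY_NAMESPACE scope, and
// count a multiple of Lanes(d); the helper name is illustrative): ReduceSum
// returns a scalar directly, while SumOfLanes broadcasts that scalar into
// every lane. A vector accumulator would be faster; this only shows the API.
float SumArray(const float* HWY_RESTRICT in, size_t count) {
  const ScalableTag<float> d;
  float total = 0.0f;
  for (size_t i = 0; i < count; i += Lanes(d)) {
    total += ReduceSum(d, LoadU(d, in + i));
  }
  return total;
}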
2484
3242
 
2485
3243
  // ================================================== SWIZZLE
@@ -2510,11 +3268,15 @@ HWY_API TFromV<V> ExtractLane(V v, size_t i) {
2510
3268
  }
2511
3269
 
2512
3270
  // ------------------------------ InsertLane (IfThenElse)
2513
- template <class V>
2514
- HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
3271
+ template <class V, typename T>
3272
+ HWY_API V InsertLane(const V v, size_t i, T t) {
3273
+ static_assert(sizeof(TFromV<V>) == sizeof(T), "Lane size mismatch");
2515
3274
  const DFromV<V> d;
2516
- const auto is_i = detail::EqN(Iota(d, 0), static_cast<TFromV<V>>(i));
2517
- return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
3275
+ const RebindToSigned<decltype(d)> di;
3276
+ using TI = TFromD<decltype(di)>;
3277
+ const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
3278
+ return IfThenElse(RebindMask(d, is_i),
3279
+ Set(d, hwy::ConvertScalarTo<TFromV<V>>(t)), v);
2518
3280
  }
2519
3281
 
2520
3282
  // ------------------------------ DupEven
@@ -2569,6 +3331,18 @@ HWY_API V OddEven(const V odd, const V even) {
2569
3331
 
2570
3332
  #endif // HWY_TARGET
2571
3333
 
3334
+ // ------------------------------ InterleaveEven
3335
+ template <class D>
3336
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3337
+ return detail::InterleaveEven(a, b);
3338
+ }
3339
+
3340
+ // ------------------------------ InterleaveOdd
3341
+ template <class D>
3342
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3343
+ return detail::InterleaveOdd(a, b);
3344
+ }
3345
+
2572
3346
  // ------------------------------ OddEvenBlocks
2573
3347
  template <class V>
2574
3348
  HWY_API V OddEvenBlocks(const V odd, const V even) {
@@ -2623,6 +3397,9 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
2623
3397
  }
2624
3398
 
2625
3399
  HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
3400
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3401
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE, TableLookupLanes, tbl)
3402
+ #endif
2626
3403
  #undef HWY_SVE_TABLE
2627
3404
 
2628
3405
  #if HWY_SVE_HAVE_2
@@ -2634,6 +3411,10 @@ namespace detail {
2634
3411
  }
2635
3412
 
2636
3413
  HWY_SVE_FOREACH(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
3414
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3415
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE2, NativeTwoTableLookupLanes,
3416
+ tbl2)
3417
+ #endif
2637
3418
  #undef HWY_SVE_TABLE
2638
3419
  } // namespace detail
2639
3420
  #endif // HWY_SVE_HAVE_2
@@ -2705,6 +3486,9 @@ namespace detail {
2705
3486
  }
2706
3487
 
2707
3488
  HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
3489
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3490
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_REVERSE, ReverseFull, rev)
3491
+ #endif
2708
3492
  #undef HWY_SVE_REVERSE
2709
3493
 
2710
3494
  } // namespace detail
@@ -2775,14 +3559,14 @@ HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { // 3210
2775
3559
  template <class D, HWY_IF_T_SIZE_D(D, 1)>
2776
3560
  HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2777
3561
  const RebindToUnsigned<decltype(d)> du;
2778
- const RepartitionToWide<RepartitionToWide<decltype(du)>> du32;
3562
+ const RepartitionToWideX2<decltype(du)> du32;
2779
3563
  return BitCast(d, svrevb_u32_x(detail::PTrue(d), BitCast(du32, v)));
2780
3564
  }
2781
3565
 
2782
3566
  template <class D, HWY_IF_T_SIZE_D(D, 2)>
2783
3567
  HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
2784
3568
  const RebindToUnsigned<decltype(d)> du;
2785
- const RepartitionToWide<RepartitionToWide<decltype(du)>> du64;
3569
+ const RepartitionToWideX2<decltype(du)> du64;
2786
3570
  return BitCast(d, svrevh_u64_x(detail::PTrue(d), BitCast(du64, v)));
2787
3571
  }
2788
3572
 
@@ -2943,20 +3727,23 @@ HWY_API V BroadcastBlock(V v) {
2943
3727
  static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
2944
3728
  "Invalid block index");
2945
3729
 
3730
+ const RebindToUnsigned<decltype(d)> du; // for bfloat16_t
3731
+ using VU = VFromD<decltype(du)>;
3732
+ const VU vu = BitCast(du, v);
3733
+
2946
3734
  #if HWY_TARGET == HWY_SVE_256
2947
- return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
2948
- : ConcatUpperUpper(d, v, v);
3735
+ return BitCast(d, (kBlockIdx == 0) ? ConcatLowerLower(du, vu, vu)
3736
+ : ConcatUpperUpper(du, vu, vu));
2949
3737
  #else
2950
- const RebindToUnsigned<decltype(d)> du;
2951
3738
  using TU = TFromD<decltype(du)>;
2952
3739
  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
2953
3740
  constexpr size_t kBlockOffset =
2954
3741
  static_cast<size_t>(kBlockIdx) * kLanesPerBlock;
2955
3742
 
2956
- const auto idx = detail::AddN(
3743
+ const VU idx = detail::AddN(
2957
3744
  detail::AndN(Iota(du, TU{0}), static_cast<TU>(kLanesPerBlock - 1)),
2958
3745
  static_cast<TU>(kBlockOffset));
2959
- return TableLookupLanes(v, idx);
3746
+ return BitCast(d, TableLookupLanes(vu, idx));
2960
3747
  #endif
2961
3748
  }
2962
3749
 
@@ -3455,6 +4242,95 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
3455
4242
 
3456
4243
  // ================================================== Ops with dependencies
3457
4244
 
4245
+ // ------------------------------ AddSub (Reverse2)
4246
+
4247
+ // NOTE: svcadd_f*_x(HWY_SVE_PTRUE(BITS), a, b, 90) computes a[i] - b[i + 1] in
4248
+ // the even lanes and a[i] + b[i - 1] in the odd lanes.
4249
+
4250
+ #define HWY_SVE_ADDSUB_F(BASE, CHAR, BITS, HALF, NAME, OP) \
4251
+ HWY_API HWY_SVE_V(BASE, BITS) \
4252
+ NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
4253
+ const DFromV<decltype(b)> d; \
4254
+ return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, Reverse2(d, b), \
4255
+ 90); \
4256
+ }
4257
+
4258
+ HWY_SVE_FOREACH_F(HWY_SVE_ADDSUB_F, AddSub, cadd)
4259
+
4260
+ #undef HWY_SVE_ADDSUB_F
4261
+
4262
+ // NOTE: svcadd_s*(a, b, 90) and svcadd_u*(a, b, 90) compute a[i] - b[i + 1] in
4263
+ // the even lanes and a[i] + b[i - 1] in the odd lanes.
4264
+
4265
+ #if HWY_SVE_HAVE_2
4266
+ #define HWY_SVE_ADDSUB_UI(BASE, CHAR, BITS, HALF, NAME, OP) \
4267
+ HWY_API HWY_SVE_V(BASE, BITS) \
4268
+ NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
4269
+ const DFromV<decltype(b)> d; \
4270
+ return sv##OP##_##CHAR##BITS(a, Reverse2(d, b), 90); \
4271
+ }
4272
+
4273
+ HWY_SVE_FOREACH_UI(HWY_SVE_ADDSUB_UI, AddSub, cadd)
4274
+
4275
+ #undef HWY_SVE_ADDSUB_UI
4276
+
4277
+ // Disable the default implementation of AddSub in generic_ops-inl.h on SVE2
4278
+ #undef HWY_IF_ADDSUB_V
4279
+ #define HWY_IF_ADDSUB_V(V) \
4280
+ HWY_IF_LANES_GT_D(DFromV<V>, 1), \
4281
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
4282
+
4283
+ #else // !HWY_SVE_HAVE_2
4284
+
4285
+ // Disable the default implementation of AddSub in generic_ops-inl.h for
4286
+ // floating-point vectors on SVE, but enable the default implementation of
4287
+ // AddSub in generic_ops-inl.h for integer vectors on SVE that do not support
4288
+ // SVE2
4289
+ #undef HWY_IF_ADDSUB_V
4290
+ #define HWY_IF_ADDSUB_V(V) \
4291
+ HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
4292
+
4293
+ #endif // HWY_SVE_HAVE_2
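// Minimal usage sketch (assumes "hwy/highway.h" and HWY_NAMESPACE scope; the
// helper name is illustrative): AddSub returns a[i] - b[i] in even lanes and
// a[i] + b[i] in odd lanes, the building block of complex-multiply patterns.
void AddSubF32(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
               float* HWY_RESTRICT out) {
  const ScalableTag<float> d;
  StoreU(AddSub(LoadU(d, a), LoadU(d, b)), d, out);
}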
4294
+
4295
+ // ------------------------------ MulAddSub (AddSub)
4296
+
4297
+ template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_FLOAT_V(V)>
4298
+ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4299
+ using T = TFromV<V>;
4300
+
4301
+ const DFromV<V> d;
4302
+ const T neg_zero = ConvertScalarTo<T>(-0.0f);
4303
+
4304
+ return MulAdd(mul, x, AddSub(Set(d, neg_zero), sub_or_add));
4305
+ }
4306
+
4307
+ #if HWY_SVE_HAVE_2
4308
+
4309
+ // Disable the default implementation of MulAddSub in generic_ops-inl.h on SVE2
4310
+ #undef HWY_IF_MULADDSUB_V
4311
+ #define HWY_IF_MULADDSUB_V(V) \
4312
+ HWY_IF_LANES_GT_D(DFromV<V>, 1), \
4313
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
4314
+
4315
+ template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
4316
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4317
+ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4318
+ const DFromV<V> d;
4319
+ return MulAdd(mul, x, AddSub(Zero(d), sub_or_add));
4320
+ }
4321
+
4322
+ #else // !HWY_SVE_HAVE_2
4323
+
4324
+ // Disable the default implementation of MulAddSub in generic_ops-inl.h for
4325
+ // floating-point vectors on SVE, but enable the default implementation of
4326
+ // AddSub in generic_ops-inl.h for integer vectors on SVE targets that do not
4327
+ // support SVE2
4328
+ #undef HWY_IF_MULADDSUB_V
4329
+ #define HWY_IF_MULADDSUB_V(V) \
4330
+ HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
4331
+
4332
+ #endif // HWY_SVE_HAVE_2
4333
+
3458
4334
  // ------------------------------ PromoteTo bfloat16 (ZipLower)
3459
4335
  template <size_t N, int kPow2>
3460
4336
  HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32, VBF16 v) {
@@ -3462,15 +4338,142 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32, VBF16 v) {
3462
4338
  return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), BitCast(du16, v)));
3463
4339
  }
3464
4340
 
4341
+ // ------------------------------ PromoteEvenTo/PromoteOddTo (ConcatOddFull)
4342
+
4343
+ namespace detail {
4344
+
4345
+ // Signed to signed PromoteEvenTo
4346
+ template <class D>
4347
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4348
+ hwy::SizeTag<2> /*to_lane_size_tag*/,
4349
+ hwy::SignedTag /*from_type_tag*/, D d_to,
4350
+ svint8_t v) {
4351
+ return svextb_s16_x(detail::PTrue(d_to), BitCast(d_to, v));
4352
+ }
4353
+
4354
+ template <class D>
4355
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4356
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
4357
+ hwy::SignedTag /*from_type_tag*/, D d_to,
4358
+ svint16_t v) {
4359
+ return svexth_s32_x(detail::PTrue(d_to), BitCast(d_to, v));
4360
+ }
4361
+
4362
+ template <class D>
4363
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4364
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4365
+ hwy::SignedTag /*from_type_tag*/, D d_to,
4366
+ svint32_t v) {
4367
+ return svextw_s64_x(detail::PTrue(d_to), BitCast(d_to, v));
4368
+ }
4369
+
4370
+ // F16->F32 PromoteEvenTo
4371
+ template <class D>
4372
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4373
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
4374
+ hwy::FloatTag /*from_type_tag*/, D d_to,
4375
+ svfloat16_t v) {
4376
+ const Repartition<float, decltype(d_to)> d_from;
4377
+ return svcvt_f32_f16_x(detail::PTrue(d_from), v);
4378
+ }
4379
+
4380
+ // F32->F64 PromoteEvenTo
4381
+ template <class D>
4382
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4383
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4384
+ hwy::FloatTag /*from_type_tag*/, D d_to,
4385
+ svfloat32_t v) {
4386
+ const Repartition<float, decltype(d_to)> d_from;
4387
+ return svcvt_f64_f32_x(detail::PTrue(d_from), v);
4388
+ }
4389
+
4390
+ // I32->F64 PromoteEvenTo
4391
+ template <class D>
4392
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4393
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4394
+ hwy::SignedTag /*from_type_tag*/, D d_to,
4395
+ svint32_t v) {
4396
+ const Repartition<float, decltype(d_to)> d_from;
4397
+ return svcvt_f64_s32_x(detail::PTrue(d_from), v);
4398
+ }
4399
+
4400
+ // U32->F64 PromoteEvenTo
4401
+ template <class D>
4402
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
4403
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4404
+ hwy::UnsignedTag /*from_type_tag*/, D d_to,
4405
+ svuint32_t v) {
4406
+ const Repartition<float, decltype(d_to)> d_from;
4407
+ return svcvt_f64_u32_x(detail::PTrue(d_from), v);
4408
+ }
4409
+
4410
+ // F32->I64 PromoteEvenTo
4411
+ template <class D>
4412
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
4413
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4414
+ hwy::FloatTag /*from_type_tag*/, D d_to,
4415
+ svfloat32_t v) {
4416
+ const Repartition<float, decltype(d_to)> d_from;
4417
+ return svcvt_s64_f32_x(detail::PTrue(d_from), v);
4418
+ }
4419
+
4420
+ // F32->U64 PromoteEvenTo
4421
+ template <class D>
4422
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
4423
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
4424
+ hwy::FloatTag /*from_type_tag*/, D d_to,
4425
+ svfloat32_t v) {
4426
+ const Repartition<float, decltype(d_to)> d_from;
4427
+ return svcvt_u64_f32_x(detail::PTrue(d_from), v);
4428
+ }
4429
+
4430
+ // F16->F32 PromoteOddTo
4431
+ template <class D>
4432
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag to_type_tag,
4433
+ hwy::SizeTag<4> to_lane_size_tag,
4434
+ hwy::FloatTag from_type_tag, D d_to,
4435
+ svfloat16_t v) {
4436
+ return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4437
+ DupOdd(v));
4438
+ }
4439
+
4440
+ // I32/U32/F32->F64 PromoteOddTo
4441
+ template <class FromTypeTag, class D, class V>
4442
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag to_type_tag,
4443
+ hwy::SizeTag<8> to_lane_size_tag,
4444
+ FromTypeTag from_type_tag, D d_to, V v) {
4445
+ return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4446
+ DupOdd(v));
4447
+ }
4448
+
4449
+ // F32->I64/U64 PromoteOddTo
4450
+ template <class ToTypeTag, class D, HWY_IF_UI64_D(D)>
4451
+ HWY_INLINE VFromD<D> PromoteOddTo(ToTypeTag to_type_tag,
4452
+ hwy::SizeTag<8> to_lane_size_tag,
4453
+ hwy::FloatTag from_type_tag, D d_to,
4454
+ svfloat32_t v) {
4455
+ return PromoteEvenTo(to_type_tag, to_lane_size_tag, from_type_tag, d_to,
4456
+ DupOdd(v));
4457
+ }
4458
+
4459
+ } // namespace detail
4460
+
3465
4461
  // ------------------------------ ReorderDemote2To (OddEven)
3466
4462
 
3467
4463
  template <size_t N, int kPow2>
3468
4464
  HWY_API VBF16 ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
3469
4465
  svfloat32_t b) {
3470
- const RebindToUnsigned<decltype(dbf16)> du16;
3471
- const Repartition<uint32_t, decltype(dbf16)> du32;
3472
- const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
3473
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4466
+ #if HWY_SVE_HAVE_F32_TO_BF16C
4467
+ const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
4468
+ return svcvtnt_bf16_f32_x(b_in_even, detail::PTrue(dbf16), a);
4469
+ #else
4470
+ (void)dbf16;
4471
+ const auto a_in_odd =
4472
+ BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(a));
4473
+ const auto b_in_odd =
4474
+ BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(b));
4475
+ return BitCast(dbf16, detail::InterleaveOdd(b_in_odd, a_in_odd));
4476
+ #endif
3474
4477
  }
3475
4478
 
3476
4479
  template <size_t N, int kPow2>
@@ -3608,6 +4611,14 @@ HWY_API svuint32_t ReorderDemote2To(Simd<uint32_t, N, kPow2> d32, svuint64_t a,
3608
4611
  #endif
3609
4612
  }
3610
4613
 
4614
+ template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
4615
+ HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>) / 2)>
4616
+ HWY_API VFromD<D> ReorderDemote2To(D dn, V a, V b) {
4617
+ const auto clamped_a = BitCast(dn, detail::SaturateU<TFromD<D>>(a));
4618
+ const auto clamped_b = BitCast(dn, detail::SaturateU<TFromD<D>>(b));
4619
+ return detail::InterleaveEven(clamped_a, clamped_b);
4620
+ }
4621
+
3611
4622
  template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
3612
4623
  HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
3613
4624
  HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2)>
@@ -3618,21 +4629,55 @@ HWY_API VFromD<D> OrderedDemote2To(D dn, V a, V b) {
3618
4629
  return Combine(dn, demoted_b, demoted_a);
3619
4630
  }
3620
4631
 
3621
- template <class D, HWY_IF_BF16_D(D)>
3622
- HWY_API VBF16 OrderedDemote2To(D dn, svfloat32_t a, svfloat32_t b) {
3623
- const Half<decltype(dn)> dnh;
3624
- const RebindToUnsigned<decltype(dn)> dn_u;
3625
- const RebindToUnsigned<decltype(dnh)> dnh_u;
3626
- const auto demoted_a = DemoteTo(dnh, a);
3627
- const auto demoted_b = DemoteTo(dnh, b);
3628
- return BitCast(
3629
- dn, Combine(dn_u, BitCast(dnh_u, demoted_b), BitCast(dnh_u, demoted_a)));
4632
+ template <size_t N, int kPow2>
4633
+ HWY_API VBF16 OrderedDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
4634
+ svfloat32_t b) {
4635
+ #if HWY_SVE_HAVE_F32_TO_BF16C
4636
+ (void)dbf16;
4637
+ const VBF16 a_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), a);
4638
+ const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
4639
+ return ConcatEven(dbf16, b_in_even, a_in_even);
4640
+ #else
4641
+ const RebindToUnsigned<decltype(dbf16)> du16;
4642
+ const svuint16_t a_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
4643
+ const svuint16_t b_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
4644
+ return BitCast(dbf16, ConcatOdd(du16, b_in_odd, a_in_odd)); // lower half
4645
+ #endif
4646
+ }
4647
+
4648
+ // ------------------------------ I8/U8/I16/U16 Div
4649
+
4650
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4651
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
4652
+ HWY_API V Div(V a, V b) {
4653
+ const DFromV<decltype(a)> d;
4654
+ const Half<decltype(d)> dh;
4655
+ const RepartitionToWide<decltype(d)> dw;
4656
+
4657
+ const auto q_lo =
4658
+ Div(PromoteTo(dw, LowerHalf(dh, a)), PromoteTo(dw, LowerHalf(dh, b)));
4659
+ const auto q_hi = Div(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b));
4660
+
4661
+ return OrderedDemote2To(d, q_lo, q_hi);
4662
+ }
4663
+
4664
+ // ------------------------------ I8/U8/I16/U16 MaskedDivOr
4665
+ template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
4666
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4667
+ HWY_API V MaskedDivOr(V no, M m, V a, V b) {
4668
+ return IfThenElse(m, Div(a, b), no);
3630
4669
  }
3631
4670
 
3632
- // ------------------------------ ZeroIfNegative (Lt, IfThenElse)
4671
+ // ------------------------------ Mod (Div, NegMulAdd)
3633
4672
  template <class V>
3634
- HWY_API V ZeroIfNegative(const V v) {
3635
- return IfThenZeroElse(detail::LtN(v, 0), v);
4673
+ HWY_API V Mod(V a, V b) {
4674
+ return NegMulAdd(Div(a, b), b, a);
4675
+ }
4676
+
4677
+ // ------------------------------ MaskedModOr (Mod)
4678
+ template <class V, class M>
4679
+ HWY_API V MaskedModOr(V no, M m, V a, V b) {
4680
+ return IfThenElse(m, Mod(a, b), no);
3636
4681
  }
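// Minimal usage sketch (assumes "hwy/highway.h", HWY_NAMESPACE scope, count a
// multiple of Lanes(d), and no zero divisors; names are illustrative):
// element-wise quotient and remainder for i16 lanes, where Mod(a, b)
// evaluates a - (a / b) * b via NegMulAdd.
void DivMod16(const int16_t* HWY_RESTRICT a, const int16_t* HWY_RESTRICT b,
              int16_t* HWY_RESTRICT q, int16_t* HWY_RESTRICT r, size_t count) {
  const ScalableTag<int16_t> d;
  for (size_t i = 0; i < count; i += Lanes(d)) {
    const auto va = LoadU(d, a + i);
    const auto vb = LoadU(d, b + i);
    StoreU(Div(va, vb), d, q + i);
    StoreU(Mod(va, vb), d, r + i);
  }
}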
3637
4682
 
3638
4683
  // ------------------------------ BroadcastSignBit (ShiftRight)
@@ -3645,11 +4690,7 @@ HWY_API V BroadcastSignBit(const V v) {
3645
4690
  template <class V>
3646
4691
  HWY_API V IfNegativeThenElse(V v, V yes, V no) {
3647
4692
  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
3648
- const DFromV<V> d;
3649
- const RebindToSigned<decltype(d)> di;
3650
-
3651
- const svbool_t m = detail::LtN(BitCast(di, v), 0);
3652
- return IfThenElse(m, yes, no);
4693
+ return IfThenElse(IsNegative(v), yes, no);
3653
4694
  }
3654
4695
 
3655
4696
  // ------------------------------ AverageRound (ShiftRight)
@@ -3735,6 +4776,84 @@ HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
3735
4776
  return TestBit(vbits, bit);
3736
4777
  }
3737
4778
 
4779
+ // ------------------------------ Dup128MaskFromMaskBits
4780
+
4781
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)>
4782
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4783
+ const RebindToUnsigned<decltype(d)> du;
4784
+
4785
+ constexpr size_t kN = MaxLanes(d);
4786
+ if (kN < 8) mask_bits &= (1u << kN) - 1;
4787
+
4788
+ // Replicate the lower 8 bits of mask_bits to each u8 lane
4789
+ const svuint8_t bytes = BitCast(du, Set(du, static_cast<uint8_t>(mask_bits)));
4790
+
4791
+ const svuint8_t bit =
4792
+ svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4793
+ return TestBit(bytes, bit);
4794
+ }
4795
+
4796
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
4797
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4798
+ const RebindToUnsigned<decltype(d)> du;
4799
+ const Repartition<uint16_t, decltype(du)> du16;
4800
+
4801
+ // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
4802
+ // and then bitcast the replicated mask_bits to a u8 vector
4803
+ const svuint8_t bytes =
4804
+ BitCast(du, Set(du16, static_cast<uint16_t>(mask_bits)));
4805
+ // Replicate bytes 8x such that each byte contains the bit that governs it.
4806
+ const svuint8_t rep8 = svtbl_u8(bytes, ShiftRight<3>(Iota(du, 0)));
4807
+
4808
+ const svuint8_t bit =
4809
+ svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
4810
+ return TestBit(rep8, bit);
4811
+ }
4812
+
4813
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
4814
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4815
+ const RebindToUnsigned<decltype(d)> du;
4816
+ const Repartition<uint8_t, decltype(d)> du8;
4817
+
4818
+ constexpr size_t kN = MaxLanes(d);
4819
+ if (kN < 8) mask_bits &= (1u << kN) - 1;
4820
+
4821
+ // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
4822
+ const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
4823
+
4824
+ const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
4825
+ return TestBit(BitCast(du, bytes), bit);
4826
+ }
4827
+
4828
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
4829
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4830
+ const RebindToUnsigned<decltype(d)> du;
4831
+ const Repartition<uint8_t, decltype(d)> du8;
4832
+
4833
+ constexpr size_t kN = MaxLanes(d);
4834
+ if (kN < 4) mask_bits &= (1u << kN) - 1;
4835
+
4836
+ // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
4837
+ const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
4838
+
4839
+ const svuint32_t bit = svdupq_n_u32(1, 2, 4, 8);
4840
+ return TestBit(BitCast(du, bytes), bit);
4841
+ }
4842
+
4843
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
4844
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4845
+ const RebindToUnsigned<decltype(d)> du;
4846
+ const Repartition<uint8_t, decltype(d)> du8;
4847
+
4848
+ if (MaxLanes(d) < 2) mask_bits &= 1u;
4849
+
4850
+ // Set all of the u8 lanes of bytes to the lower 8 bits of mask_bits
4851
+ const svuint8_t bytes = Set(du8, static_cast<uint8_t>(mask_bits));
4852
+
4853
+ const svuint64_t bit = svdupq_n_u64(1, 2);
4854
+ return TestBit(BitCast(du, bytes), bit);
4855
+ }
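// Minimal usage sketch (assumes "hwy/highway.h" and HWY_NAMESPACE scope; the
// helper name is illustrative): the same bit pattern governs every 128-bit
// block, here selecting lanes 0 and 2 of each block of u32 lanes.
void BlendBlocks(const uint32_t* HWY_RESTRICT a, const uint32_t* HWY_RESTRICT b,
                 uint32_t* HWY_RESTRICT out) {
  const ScalableTag<uint32_t> d;
  const auto m = Dup128MaskFromMaskBits(d, 0x5u);  // 0b0101: lanes 0 and 2
  StoreU(IfThenElse(m, LoadU(d, a), LoadU(d, b)), d, out);
}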
4856
+
3738
4857
  // ------------------------------ StoreMaskBits
3739
4858
 
3740
4859
  namespace detail {
@@ -4100,12 +5219,13 @@ HWY_INLINE VFromD<DU> LaneIndicesFromByteIndices(D, svuint8_t idx) {
4100
5219
  template <class V>
4101
5220
  HWY_INLINE V ExpandLoop(V v, svbool_t mask) {
4102
5221
  const DFromV<V> d;
5222
+ using T = TFromV<V>;
4103
5223
  uint8_t mask_bytes[256 / 8];
4104
5224
  StoreMaskBits(d, mask, mask_bytes);
4105
5225
 
4106
5226
  // ShiftLeftLanes is expensive, so we're probably better off storing to memory
4107
5227
  // and loading the final result.
4108
- alignas(16) TFromV<V> out[2 * MaxLanes(d)];
5228
+ alignas(16) T out[2 * MaxLanes(d)];
4109
5229
 
4110
5230
  svbool_t next = svpfalse_b();
4111
5231
  size_t input_consumed = 0;
@@ -4117,7 +5237,7 @@ HWY_INLINE V ExpandLoop(V v, svbool_t mask) {
4117
5237
  // instruction for variable-shift-reg, but we can splice.
4118
5238
  const V vH = detail::Splice(v, v, next);
4119
5239
  input_consumed += PopCount(mask_bits);
4120
- next = detail::GeN(iota, static_cast<TFromV<V>>(input_consumed));
5240
+ next = detail::GeN(iota, ConvertScalarTo<T>(input_consumed));
4121
5241
 
4122
5242
  const auto idx = detail::LaneIndicesFromByteIndices(
4123
5243
  d, detail::IndicesForExpandFromBits(mask_bits));
@@ -4594,12 +5714,24 @@ HWY_API VFromD<DW> MulOdd(const V a, const V b) {
4594
5714
  #endif
4595
5715
  }
4596
5716
 
5717
+ HWY_API svint64_t MulEven(const svint64_t a, const svint64_t b) {
5718
+ const auto lo = Mul(a, b);
5719
+ const auto hi = MulHigh(a, b);
5720
+ return detail::InterleaveEven(lo, hi);
5721
+ }
5722
+
4597
5723
  HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
4598
5724
  const auto lo = Mul(a, b);
4599
5725
  const auto hi = MulHigh(a, b);
4600
5726
  return detail::InterleaveEven(lo, hi);
4601
5727
  }
4602
5728
 
5729
+ HWY_API svint64_t MulOdd(const svint64_t a, const svint64_t b) {
5730
+ const auto lo = Mul(a, b);
5731
+ const auto hi = MulHigh(a, b);
5732
+ return detail::InterleaveOdd(lo, hi);
5733
+ }
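// Minimal usage sketch (assumes "hwy/highway.h" and HWY_NAMESPACE scope; the
// helper name is illustrative): for 64-bit lanes, MulEven/MulOdd return the
// full 128-bit product of each selected lane pair, low half in the even lane
// and high half in the odd lane.
void FullProducts(const int64_t* HWY_RESTRICT a, const int64_t* HWY_RESTRICT b,
                  int64_t* HWY_RESTRICT out) {
  const ScalableTag<int64_t> d;
  const auto prod = MulEven(LoadU(d, a), LoadU(d, b));  // {lo0, hi0, lo2, ...}
  StoreU(prod, d, out);
}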
5734
+
4603
5735
  HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
4604
5736
  const auto lo = Mul(a, b);
4605
5737
  const auto hi = MulHigh(a, b);
@@ -4609,24 +5741,15 @@ HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
4609
5741
  // ------------------------------ WidenMulPairwiseAdd
4610
5742
 
4611
5743
  template <size_t N, int kPow2>
4612
- HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df32, VBF16 a,
5744
+ HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df, VBF16 a,
4613
5745
  VBF16 b) {
4614
- #if HWY_SVE_HAVE_BFLOAT16
4615
- const svfloat32_t even = svbfmlalb_f32(Zero(df32), a, b);
5746
+ #if HWY_SVE_HAVE_F32_TO_BF16C
5747
+ const svfloat32_t even = svbfmlalb_f32(Zero(df), a, b);
4616
5748
  return svbfmlalt_f32(even, a, b);
4617
5749
  #else
4618
- const RebindToUnsigned<decltype(df32)> du32;
4619
- // Using shift/and instead of Zip leads to the odd/even order that
4620
- // RearrangeToOddPlusEven prefers.
4621
- using VU32 = VFromD<decltype(du32)>;
4622
- const VU32 odd = Set(du32, 0xFFFF0000u);
4623
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
4624
- const VU32 ao = And(BitCast(du32, a), odd);
4625
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
4626
- const VU32 bo = And(BitCast(du32, b), odd);
4627
- return MulAdd(BitCast(df32, ae), BitCast(df32, be),
4628
- Mul(BitCast(df32, ao), BitCast(df32, bo)));
4629
- #endif // HWY_SVE_HAVE_BFLOAT16
5750
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
5751
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
5752
+ #endif // HWY_SVE_HAVE_F32_TO_BF16C
4630
5753
  }
4631
5754
 
4632
5755
  template <size_t N, int kPow2>
@@ -4636,14 +5759,8 @@ HWY_API svint32_t WidenMulPairwiseAdd(Simd<int32_t, N, kPow2> d32, svint16_t a,
4636
5759
  (void)d32;
4637
5760
  return svmlalt_s32(svmullb_s32(a, b), a, b);
4638
5761
  #else
4639
- const svbool_t pg = detail::PTrue(d32);
4640
- // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
4641
- // Fortunately SVE has sign-extension for the even lanes.
4642
- const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a));
4643
- const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
4644
- const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
4645
- const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
4646
- return svmla_s32_x(pg, svmul_s32_x(pg, ao, bo), ae, be);
5762
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
5763
+ Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
4647
5764
  #endif
4648
5765
  }
4649
5766
 
@@ -4654,43 +5771,59 @@ HWY_API svuint32_t WidenMulPairwiseAdd(Simd<uint32_t, N, kPow2> d32,
4654
5771
  (void)d32;
4655
5772
  return svmlalt_u32(svmullb_u32(a, b), a, b);
4656
5773
  #else
4657
- const svbool_t pg = detail::PTrue(d32);
4658
- // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
4659
- // Fortunately SVE has sign-extension for the even lanes.
4660
- const svuint32_t ae = svexth_u32_x(pg, BitCast(d32, a));
4661
- const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b));
4662
- const svuint32_t ao = ShiftRight<16>(BitCast(d32, a));
4663
- const svuint32_t bo = ShiftRight<16>(BitCast(d32, b));
4664
- return svmla_u32_x(pg, svmul_u32_x(pg, ao, bo), ae, be);
5774
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
5775
+ Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
5776
+ #endif
5777
+ }
5778
+
5779
+ // ------------------------------ SatWidenMulAccumFixedPoint
5780
+
5781
+ #if HWY_SVE_HAVE_2
5782
+
5783
+ #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5784
+ #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5785
+ #else
5786
+ #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
4665
5787
  #endif
5788
+
5789
+ template <class DI32, HWY_IF_I32_D(DI32)>
5790
+ HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
5791
+ VFromD<Rebind<int16_t, DI32>> a,
5792
+ VFromD<Rebind<int16_t, DI32>> b,
5793
+ VFromD<DI32> sum) {
5794
+ return svqdmlalb_s32(sum, detail::ZipLowerSame(a, a),
5795
+ detail::ZipLowerSame(b, b));
4666
5796
  }
4667
5797
 
5798
+ #endif // HWY_SVE_HAVE_2
5799
+
4668
5800
  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4669
5801
 
4670
- template <size_t N, int kPow2>
4671
- HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
4672
- VBF16 a, VBF16 b,
4673
- const svfloat32_t sum0,
4674
- svfloat32_t& sum1) {
4675
- #if HWY_SVE_HAVE_BFLOAT16
4676
- (void)df32;
4677
- sum1 = svbfmlalt_f32(sum1, a, b);
4678
- return svbfmlalb_f32(sum0, a, b);
5802
+ #if HWY_SVE_HAVE_BF16_FEATURE
5803
+
5804
+ // NOTE: we currently do not use SVE BFDOT for bf16 ReorderWidenMulAccumulate
5805
+ // because, apparently unlike NEON, it uses round to odd unless the additional
5806
+ // FEAT_EBF16 feature is available and enabled.
5807
+ #ifdef HWY_NATIVE_MUL_EVEN_BF16
5808
+ #undef HWY_NATIVE_MUL_EVEN_BF16
4679
5809
  #else
4680
- const RebindToUnsigned<decltype(df32)> du32;
4681
- // Using shift/and instead of Zip leads to the odd/even order that
4682
- // RearrangeToOddPlusEven prefers.
4683
- using VU32 = VFromD<decltype(du32)>;
4684
- const VU32 odd = Set(du32, 0xFFFF0000u);
4685
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
4686
- const VU32 ao = And(BitCast(du32, a), odd);
4687
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
4688
- const VU32 bo = And(BitCast(du32, b), odd);
4689
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
4690
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
4691
- #endif // HWY_SVE_HAVE_BFLOAT16
5810
+ #define HWY_NATIVE_MUL_EVEN_BF16
5811
+ #endif
5812
+
5813
+ template <size_t N, int kPow2>
5814
+ HWY_API svfloat32_t MulEvenAdd(Simd<float, N, kPow2> /* d */, VBF16 a, VBF16 b,
5815
+ const svfloat32_t c) {
5816
+ return svbfmlalb_f32(c, a, b);
5817
+ }
5818
+
5819
+ template <size_t N, int kPow2>
5820
+ HWY_API svfloat32_t MulOddAdd(Simd<float, N, kPow2> /* d */, VBF16 a, VBF16 b,
5821
+ const svfloat32_t c) {
5822
+ return svbfmlalt_f32(c, a, b);
4692
5823
  }
4693
5824
 
5825
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
5826
+
4694
5827
  template <size_t N, int kPow2>
4695
5828
  HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
4696
5829
  svint16_t a, svint16_t b,
@@ -4701,15 +5834,10 @@ HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
4701
5834
  sum1 = svmlalt_s32(sum1, a, b);
4702
5835
  return svmlalb_s32(sum0, a, b);
4703
5836
  #else
4704
- const svbool_t pg = detail::PTrue(d32);
4705
- // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
4706
- // Fortunately SVE has sign-extension for the even lanes.
4707
- const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a));
4708
- const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
4709
- const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
4710
- const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
4711
- sum1 = svmla_s32_x(pg, sum1, ao, bo);
4712
- return svmla_s32_x(pg, sum0, ae, be);
5837
+ // Lane order within sum0/1 is undefined, hence we can avoid the
5838
+ // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
5839
+ sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
5840
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
4713
5841
  #endif
4714
5842
  }
4715
5843
 
@@ -4723,15 +5851,10 @@ HWY_API svuint32_t ReorderWidenMulAccumulate(Simd<uint32_t, N, kPow2> d32,
4723
5851
  sum1 = svmlalt_u32(sum1, a, b);
4724
5852
  return svmlalb_u32(sum0, a, b);
4725
5853
  #else
4726
- const svbool_t pg = detail::PTrue(d32);
4727
- // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
4728
- // Fortunately SVE has sign-extension for the even lanes.
4729
- const svuint32_t ae = svexth_u32_x(pg, BitCast(d32, a));
4730
- const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b));
4731
- const svuint32_t ao = ShiftRight<16>(BitCast(d32, a));
4732
- const svuint32_t bo = ShiftRight<16>(BitCast(d32, b));
4733
- sum1 = svmla_u32_x(pg, sum1, ao, bo);
4734
- return svmla_u32_x(pg, sum0, ae, be);
5854
+ // Lane order within sum0/1 is undefined, hence we can avoid the
5855
+ // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
5856
+ sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
5857
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
4735
5858
  #endif
4736
5859
  }
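// Minimal usage sketch (assumes "hwy/highway.h", HWY_NAMESPACE scope, and
// count a multiple of Lanes(d16); names are illustrative): an i16 dot product.
// The lane order within sum0/sum1 is unspecified, so RearrangeToOddPlusEven
// combines them before the final reduction.
int32_t DotI16(const int16_t* HWY_RESTRICT a, const int16_t* HWY_RESTRICT b,
               size_t count) {
  const ScalableTag<int16_t> d16;
  const RepartitionToWide<decltype(d16)> d32;
  auto sum0 = Zero(d32);
  auto sum1 = Zero(d32);
  for (size_t i = 0; i < count; i += Lanes(d16)) {
    sum0 = ReorderWidenMulAccumulate(d32, LoadU(d16, a + i), LoadU(d16, b + i),
                                     sum0, sum1);
  }
  return ReduceSum(d32, RearrangeToOddPlusEven(sum0, sum1));
}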
4737
5860
 
@@ -4817,8 +5940,10 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(DU64 /*du64*/, svuint16_t a,
4817
5940
 
4818
5941
  // ------------------------------ AESRound / CLMul
4819
5942
 
5943
+ // Static dispatch with -march=armv8-a+sve2+aes, or dynamic dispatch WITHOUT a
5944
+ // baseline, in which case we check for AES support at runtime.
4820
5945
  #if defined(__ARM_FEATURE_SVE2_AES) || \
4821
- (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH)
5946
+ (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH && HWY_BASELINE_SVE2 == 0)
4822
5947
 
4823
5948
  // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4824
5949
  #ifdef HWY_NATIVE_AES
@@ -5059,14 +6184,15 @@ HWY_API V HighestSetBitIndex(V v) {
5059
6184
  }
5060
6185
 
5061
6186
  // ================================================== END MACROS
5062
- namespace detail { // for code folding
5063
6187
  #undef HWY_SVE_ALL_PTRUE
5064
6188
  #undef HWY_SVE_D
5065
6189
  #undef HWY_SVE_FOREACH
5066
6190
  #undef HWY_SVE_FOREACH_BF16
6191
+ #undef HWY_SVE_FOREACH_BF16_UNCONDITIONAL
5067
6192
  #undef HWY_SVE_FOREACH_F
5068
6193
  #undef HWY_SVE_FOREACH_F16
5069
6194
  #undef HWY_SVE_FOREACH_F32
6195
+ #undef HWY_SVE_FOREACH_F3264
5070
6196
  #undef HWY_SVE_FOREACH_F64
5071
6197
  #undef HWY_SVE_FOREACH_I
5072
6198
  #undef HWY_SVE_FOREACH_I08
@@ -5086,7 +6212,10 @@ namespace detail { // for code folding
5086
6212
  #undef HWY_SVE_FOREACH_UI64
5087
6213
  #undef HWY_SVE_FOREACH_UIF3264
5088
6214
  #undef HWY_SVE_HAVE_2
6215
+ #undef HWY_SVE_IF_EMULATED_D
6216
+ #undef HWY_SVE_IF_NOT_EMULATED_D
5089
6217
  #undef HWY_SVE_PTRUE
6218
+ #undef HWY_SVE_RETV_ARGMVV
5090
6219
  #undef HWY_SVE_RETV_ARGPV
5091
6220
  #undef HWY_SVE_RETV_ARGPVN
5092
6221
  #undef HWY_SVE_RETV_ARGPVV
@@ -5098,7 +6227,6 @@ namespace detail { // for code folding
5098
6227
  #undef HWY_SVE_UNDEFINED
5099
6228
  #undef HWY_SVE_V
5100
6229
 
5101
- } // namespace detail
5102
6230
  // NOLINTNEXTLINE(google-readability-namespace-comments)
5103
6231
  } // namespace HWY_NAMESPACE
5104
6232
  } // namespace hwy