@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/include/aom/aom_decoder.h +1 -1
  2. package/include/aom/aom_encoder.h +2 -0
  3. package/include/aom/aomcx.h +106 -25
  4. package/include/ffi.h +3 -3
  5. package/include/freetype2/freetype/config/ftconfig.h +1 -1
  6. package/include/freetype2/freetype/config/ftheader.h +1 -1
  7. package/include/freetype2/freetype/config/ftoption.h +37 -12
  8. package/include/freetype2/freetype/config/ftstdlib.h +1 -1
  9. package/include/freetype2/freetype/config/integer-types.h +29 -2
  10. package/include/freetype2/freetype/config/mac-support.h +1 -1
  11. package/include/freetype2/freetype/config/public-macros.h +3 -3
  12. package/include/freetype2/freetype/freetype.h +51 -47
  13. package/include/freetype2/freetype/ftadvanc.h +1 -1
  14. package/include/freetype2/freetype/ftbbox.h +1 -1
  15. package/include/freetype2/freetype/ftbdf.h +1 -1
  16. package/include/freetype2/freetype/ftbitmap.h +1 -1
  17. package/include/freetype2/freetype/ftbzip2.h +1 -1
  18. package/include/freetype2/freetype/ftcache.h +1 -1
  19. package/include/freetype2/freetype/ftcid.h +1 -1
  20. package/include/freetype2/freetype/ftcolor.h +13 -4
  21. package/include/freetype2/freetype/ftdriver.h +3 -3
  22. package/include/freetype2/freetype/fterrdef.h +1 -1
  23. package/include/freetype2/freetype/fterrors.h +1 -1
  24. package/include/freetype2/freetype/ftfntfmt.h +1 -1
  25. package/include/freetype2/freetype/ftgasp.h +1 -1
  26. package/include/freetype2/freetype/ftglyph.h +1 -1
  27. package/include/freetype2/freetype/ftgxval.h +1 -1
  28. package/include/freetype2/freetype/ftgzip.h +1 -1
  29. package/include/freetype2/freetype/ftimage.h +6 -2
  30. package/include/freetype2/freetype/ftincrem.h +1 -1
  31. package/include/freetype2/freetype/ftlcdfil.h +1 -1
  32. package/include/freetype2/freetype/ftlist.h +1 -1
  33. package/include/freetype2/freetype/ftlogging.h +184 -0
  34. package/include/freetype2/freetype/ftlzw.h +1 -1
  35. package/include/freetype2/freetype/ftmac.h +1 -1
  36. package/include/freetype2/freetype/ftmm.h +159 -103
  37. package/include/freetype2/freetype/ftmodapi.h +1 -1
  38. package/include/freetype2/freetype/ftmoderr.h +1 -1
  39. package/include/freetype2/freetype/ftotval.h +1 -1
  40. package/include/freetype2/freetype/ftoutln.h +1 -1
  41. package/include/freetype2/freetype/ftparams.h +1 -1
  42. package/include/freetype2/freetype/ftpfr.h +1 -1
  43. package/include/freetype2/freetype/ftrender.h +1 -1
  44. package/include/freetype2/freetype/ftsizes.h +1 -1
  45. package/include/freetype2/freetype/ftsnames.h +1 -1
  46. package/include/freetype2/freetype/ftstroke.h +1 -1
  47. package/include/freetype2/freetype/ftsynth.h +1 -1
  48. package/include/freetype2/freetype/ftsystem.h +1 -1
  49. package/include/freetype2/freetype/fttrigon.h +1 -1
  50. package/include/freetype2/freetype/fttypes.h +1 -1
  51. package/include/freetype2/freetype/ftwinfnt.h +2 -3
  52. package/include/freetype2/freetype/otsvg.h +1 -1
  53. package/include/freetype2/freetype/t1tables.h +1 -1
  54. package/include/freetype2/freetype/ttnameid.h +129 -129
  55. package/include/freetype2/freetype/tttables.h +8 -5
  56. package/include/freetype2/freetype/tttags.h +1 -1
  57. package/include/freetype2/ft2build.h +1 -1
  58. package/include/glib-2.0/gio/gdbuserror.h +9 -8
  59. package/include/glib-2.0/gio/ginetaddress.h +12 -0
  60. package/include/glib-2.0/gio/gioenums.h +9 -2
  61. package/include/glib-2.0/glib/gstring.h +2 -2
  62. package/include/glib-2.0/glib/gunicode.h +1 -1
  63. package/include/glib-2.0/gobject/glib-types.h +1 -1
  64. package/include/glib-2.0/gobject/gparam.h +1 -1
  65. package/include/glib-2.0/gobject/gvalue.h +78 -35
  66. package/include/harfbuzz/hb-script-list.h +12 -0
  67. package/include/harfbuzz/hb-version.h +3 -3
  68. package/include/hwy/abort.h +2 -19
  69. package/include/hwy/aligned_allocator.h +11 -7
  70. package/include/hwy/auto_tune.h +504 -0
  71. package/include/hwy/base.h +425 -104
  72. package/include/hwy/cache_control.h +16 -0
  73. package/include/hwy/detect_compiler_arch.h +32 -1
  74. package/include/hwy/detect_targets.h +251 -67
  75. package/include/hwy/foreach_target.h +35 -0
  76. package/include/hwy/highway.h +185 -76
  77. package/include/hwy/nanobenchmark.h +1 -19
  78. package/include/hwy/ops/arm_neon-inl.h +969 -458
  79. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  80. package/include/hwy/ops/emu128-inl.h +97 -11
  81. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  82. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  83. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  84. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  85. package/include/hwy/ops/rvv-inl.h +546 -51
  86. package/include/hwy/ops/scalar-inl.h +77 -22
  87. package/include/hwy/ops/set_macros-inl.h +138 -17
  88. package/include/hwy/ops/shared-inl.h +50 -10
  89. package/include/hwy/ops/wasm_128-inl.h +137 -92
  90. package/include/hwy/ops/x86_128-inl.h +773 -214
  91. package/include/hwy/ops/x86_256-inl.h +712 -255
  92. package/include/hwy/ops/x86_512-inl.h +429 -753
  93. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  94. package/include/hwy/per_target.h +2 -1
  95. package/include/hwy/profiler.h +622 -486
  96. package/include/hwy/targets.h +62 -20
  97. package/include/hwy/timer-inl.h +8 -160
  98. package/include/hwy/timer.h +170 -3
  99. package/include/hwy/x86_cpuid.h +81 -0
  100. package/include/libheif/heif_cxx.h +25 -5
  101. package/include/libheif/heif_regions.h +5 -5
  102. package/include/libheif/heif_version.h +2 -2
  103. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  104. package/include/libxml2/libxml/valid.h +0 -3
  105. package/include/libxml2/libxml/xmlerror.h +1 -1
  106. package/include/libxml2/libxml/xmlversion.h +4 -4
  107. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  108. package/include/pango-1.0/pango/pango-features.h +3 -3
  109. package/include/pango-1.0/pango/pango-font.h +30 -0
  110. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  111. package/include/vips/connection.h +4 -4
  112. package/include/vips/version.h +4 -4
  113. package/include/zlib.h +3 -3
  114. package/package.json +1 -1
  115. package/versions.json +13 -13
package/include/hwy/ops/arm_sve-inl.h

@@ -117,10 +117,13 @@ namespace detail { // for code folding
  // SFINAE to occur instead of a hard error due to a dependency on the D template
  // argument
  #define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
+ #define HWY_GENERIC_IF_EMULATED_D(D) \
+   hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
  #define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
  #else
  #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
  #define HWY_SVE_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+ #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
  #define HWY_SVE_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
  #endif  // HWY_SVE_HAVE_BF16_FEATURE

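A note on the `!hwy::IsSame<D, D>()` guard duplicated into the new `HWY_GENERIC_IF_EMULATED_D`: the expression is always false, but because it depends on the template parameter `D`, overloads constrained by it are silently removed via SFINAE instead of triggering a hard error, as the comment above says. A minimal self-contained sketch of the same trick (hypothetical names, not Highway code):

    #include <type_traits>

    // Always false, but dependent on T, so the compiler cannot reject the
    // declaration eagerly; substitution fails quietly per overload (SFINAE).
    template <typename T>
    constexpr bool kDependentFalse = !std::is_same<T, T>::value;

    template <typename T, std::enable_if_t<kDependentFalse<T>>* = nullptr>
    int Pick(T) { return 1; }  // never viable, yet well-formed

    template <typename T, std::enable_if_t<!kDependentFalse<T>>* = nullptr>
    int Pick(T) { return 2; }  // always selected

    int main() { return Pick(42) == 2 ? 0 : 1; }
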
@@ -216,6 +219,19 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v); \
  }
+ #define HWY_SVE_RETV_ARGMV_M(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(HWY_SVE_V(BASE, BITS) no, svbool_t m, HWY_SVE_V(BASE, BITS) a) { \
+     return sv##OP##_##CHAR##BITS##_m(no, m, a); \
+   }
+ #define HWY_SVE_RETV_ARGMV(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \
+     return sv##OP##_##CHAR##BITS##_x(m, v); \
+   }
+ #define HWY_SVE_RETV_ARGMV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a) { \
+     return sv##OP##_##CHAR##BITS##_z(m, a); \
+   }

  // vector = f(vector, scalar), e.g. detail::AddN
  #define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \
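For readers less familiar with the ACLE suffixes these three new wrappers encode: `_m` merges (inactive lanes keep the value of the first operand), `_x` leaves inactive lanes unspecified ("don't care", usually cheapest), and `_z` zeroes them. As a sketch of what one instantiation produces — not the literal preprocessor output — `HWY_SVE_RETV_ARGMV_Z(float, f, 32, 16, MaskedAbs, abs)` yields roughly:

    // BASE=float, CHAR=f, BITS=32: HWY_SVE_V(float, 32) is svfloat32_t.
    HWY_API svfloat32_t MaskedAbs(svbool_t m, svfloat32_t a) {
      return svabs_f32_z(m, a);  // _z: lanes where m is false become zero
    }
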
@@ -249,6 +265,12 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
  NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(m, a, b); \
  }
+ // User-specified mask. Mask=false value is zero.
+ #define HWY_SVE_RETV_ARGMVV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
+     return sv##OP##_##CHAR##BITS##_z(m, a, b); \
+   }

  #define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
@@ -256,6 +278,18 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _)
  HWY_SVE_V(BASE, BITS) c) { \
    return sv##OP##_##CHAR##BITS(a, b, c); \
  }
+ #define HWY_SVE_RETV_ARGMVVV(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \
+            HWY_SVE_V(BASE, BITS) c) { \
+     return sv##OP##_##CHAR##BITS##_x(m, a, b, c); \
+   }
+ #define HWY_SVE_RETV_ARGMVVV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(svbool_t m, HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
+            HWY_SVE_V(BASE, BITS) add) { \
+     return sv##OP##_##CHAR##BITS##_z(m, x, mul, add); \
+   }

  // ------------------------------ Lanes

@@ -409,6 +443,27 @@ using VFromD = decltype(Set(D(), TFromD<D>()));

  using VBF16 = VFromD<ScalableTag<bfloat16_t>>;

+ // ------------------------------ MaskedSetOr/MaskedSet
+
+ #define HWY_SVE_MASKED_SET_OR(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(HWY_SVE_V(BASE, BITS) no, svbool_t m, HWY_SVE_T(BASE, BITS) op) { \
+     return sv##OP##_##CHAR##BITS##_m(no, m, op); \
+   }
+
+ HWY_SVE_FOREACH(HWY_SVE_MASKED_SET_OR, MaskedSetOr, dup_n)
+ #undef HWY_SVE_MASKED_SET_OR
+
+ #define HWY_SVE_MASKED_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
+   template <size_t N, int kPow2> \
+   HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
+                                      svbool_t m, HWY_SVE_T(BASE, BITS) op) { \
+     return sv##OP##_##CHAR##BITS##_z(m, op); \
+   }
+
+ HWY_SVE_FOREACH(HWY_SVE_MASKED_SET, MaskedSet, dup_n)
+ #undef HWY_SVE_MASKED_SET
+
  // ------------------------------ Zero

  template <class D>
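My reading of the two `dup_n` wrappers above: `MaskedSet(d, m, op)` broadcasts the scalar into active lanes and zeroes the rest (`svdup_n_*_z`), while `MaskedSetOr(no, m, op)` takes `no` in inactive lanes (`svdup_n_*_m`). A hedged usage sketch, assuming the usual `namespace hn = hwy::HWY_NAMESPACE` alias inside a Highway translation unit:

    const hn::ScalableTag<int32_t> d;
    const auto m = hn::FirstN(d, 2);           // lanes 0 and 1 active
    const auto no = hn::Set(d, -1);
    const auto a = hn::MaskedSet(d, m, 7);     // {7, 7, 0, 0, ...}
    const auto b = hn::MaskedSetOr(no, m, 7);  // {7, 7, -1, -1, ...}
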
@@ -687,6 +742,25 @@ HWY_API svfloat64_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return svdupq_n_f64(t0, t1);
  }

+ // ------------------------------ GetLane
+
+ namespace detail {
+ #define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_INLINE HWY_SVE_T(BASE, BITS) \
+       NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
+     return sv##OP##_##CHAR##BITS(mask, v); \
+   }
+
+ HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
+ HWY_SVE_FOREACH(HWY_SVE_GET_LANE, ExtractLastMatchingLaneM, lastb)
+ #undef HWY_SVE_GET_LANE
+ }  // namespace detail
+
+ template <class V>
+ HWY_API TFromV<V> GetLane(V v) {
+   return detail::GetLaneM(v, detail::PFalse());
+ }
+
  // ================================================== LOGICAL

  // detail::*N() functions accept a scalar argument to avoid extra Set().
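Why `PFalse()` works here: `svlasta` (the `lasta` op above) returns the element *after* the last active lane, wrapping around, and falls back to lane 0 when no lane is active — so an all-false predicate extracts lane 0, which is exactly `GetLane`. A scalar model of that behavior, for illustration only:

    #include <cstddef>
    #include <cstdint>

    // Models svlasta for one vector of n lanes: the element after the last
    // active lane (wrapping), or lane 0 if no lane is active.
    int32_t LastAModel(const int32_t* lanes, const bool* active, size_t n) {
      size_t next = 0;  // stays 0 for an all-false predicate -> lane 0
      for (size_t i = 0; i < n; ++i) {
        if (active[i]) next = i + 1;
      }
      return lanes[next % n];
    }
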
@@ -724,6 +798,9 @@ HWY_API V Or(const V a, const V b) {
  return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
  }

+ // ------------------------------ MaskedOr
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV_Z, MaskedOr, orr)
+
  // ------------------------------ Xor

  namespace detail {
@@ -845,20 +922,6 @@ HWY_API VBF16 Neg(VBF16 v) {
  HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedNeg, qneg)
  #endif  // HWY_SVE_HAVE_2

- // ------------------------------ Abs
- HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)
-
- // ------------------------------ SaturatedAbs
- #if HWY_SVE_HAVE_2
- #ifdef HWY_NATIVE_SATURATED_ABS
- #undef HWY_NATIVE_SATURATED_ABS
- #else
- #define HWY_NATIVE_SATURATED_ABS
- #endif
-
- HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedAbs, qabs)
- #endif  // HWY_SVE_HAVE_2
-
  // ================================================== ARITHMETIC

  // Per-target flags to prevent generic_ops-inl.h defining Add etc.
@@ -1064,6 +1127,35 @@ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)

  #undef HWY_SVE_SHIFT_N

+ // ------------------------------ MaskedShift[Left/Right]
+
+ #define HWY_SVE_SHIFT_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+   template <int kBits> \
+   HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \
+     auto shifts = static_cast<HWY_SVE_T(uint, BITS)>(kBits); \
+     return sv##OP##_##CHAR##BITS##_z(m, v, shifts); \
+   }
+ HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_Z, MaskedShiftLeft, lsl_n)
+ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_Z, MaskedShiftRight, asr_n)
+ HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_Z, MaskedShiftRight, lsr_n)
+
+ #undef HWY_SVE_SHIFT_Z
+
+ // ------------------------------ MaskedShiftRightOr
+
+ #define HWY_SVE_SHIFT_OR(BASE, CHAR, BITS, HALF, NAME, OP) \
+   template <int kBits> \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(HWY_SVE_V(BASE, BITS) no, svbool_t m, HWY_SVE_V(BASE, BITS) v) { \
+     auto shifts = static_cast<HWY_SVE_T(uint, BITS)>(kBits); \
+     return svsel##_##CHAR##BITS(m, sv##OP##_##CHAR##BITS##_z(m, v, shifts), \
+                                 no); \
+   }
+ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_OR, MaskedShiftRightOr, asr_n)
+ HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_OR, MaskedShiftRightOr, lsr_n)
+
+ #undef HWY_SVE_SHIFT_OR
+
  // ------------------------------ RotateRight

  #if HWY_SVE_HAVE_2
@@ -1096,7 +1188,7 @@ HWY_API V RotateRight(const V v) {
  }
  #endif

- // ------------------------------ Shl/r
+ // ------------------------------ Shl, Shr

  #define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
@@ -1113,17 +1205,95 @@ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr)

  #undef HWY_SVE_SHIFT

- // ------------------------------ Min/Max
+ // ------------------------------ RoundingShiftRight[Same]/RoundingShr

- HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min)
- HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
- HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
- HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)
+ #if HWY_SVE_HAVE_2

- namespace detail {
- HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
- HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
- }  // namespace detail
+ #ifdef HWY_NATIVE_ROUNDING_SHR
+ #undef HWY_NATIVE_ROUNDING_SHR
+ #else
+ #define HWY_NATIVE_ROUNDING_SHR
+ #endif
+
+ #define HWY_SVE_ROUNDING_SHR_N(BASE, CHAR, BITS, HALF, NAME, OP) \
+   template <int kBits> \
+   HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
+     HWY_IF_CONSTEXPR(kBits == 0) { return v; } \
+ \
+     return sv##OP##_##CHAR##BITS##_x( \
+         HWY_SVE_PTRUE(BITS), v, static_cast<uint64_t>(HWY_MAX(kBits, 1))); \
+   }
+
+ HWY_SVE_FOREACH_UI(HWY_SVE_ROUNDING_SHR_N, RoundingShiftRight, rshr_n)
+
+ #undef HWY_SVE_ROUNDING_SHR_N
+
+ #define HWY_SVE_ROUNDING_SHR(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
+     const RebindToSigned<DFromV<decltype(v)>> di; \
+     return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \
+                                      Neg(BitCast(di, bits))); \
+   }
+
+ HWY_SVE_FOREACH_UI(HWY_SVE_ROUNDING_SHR, RoundingShr, rshl)
+
+ #undef HWY_SVE_ROUNDING_SHR
+
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+ HWY_API V RoundingShiftRightSame(V v, int bits) {
+   const DFromV<V> d;
+   using T = TFromD<decltype(d)>;
+   return RoundingShr(v, Set(d, static_cast<T>(bits)));
+ }
+
+ #endif  // HWY_SVE_HAVE_2
+
+ // ------------------------------ BroadcastSignBit (ShiftRight)
+ template <class V>
+ HWY_API V BroadcastSignBit(const V v) {
+   return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
+ }
+
+ // ------------------------------ Abs (ShiftRight, Add, Xor, AndN)
+
+ // Workaround for incorrect results with `svabs`.
+ #if HWY_COMPILER_CLANG
+ template <class V, HWY_IF_SIGNED_V(V)>
+ HWY_API V Abs(V v) {
+   const V sign = BroadcastSignBit(v);
+   return Xor(Add(v, sign), sign);
+ }
+
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+ HWY_NOINLINE V Abs(V v) {
+   const DFromV<V> d;
+   const RebindToUnsigned<decltype(d)> du;
+   using TU = MakeUnsigned<TFromD<decltype(d)>>;
+   return BitCast(
+       d, detail::AndN(BitCast(du, v), static_cast<TU>(~SignMask<TU>())));
+ }
+
+ #else
+ HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)
+ #endif
+
+ // ------------------------------ SaturatedAbs
+ #if HWY_SVE_HAVE_2
+ #ifdef HWY_NATIVE_SATURATED_ABS
+ #undef HWY_NATIVE_SATURATED_ABS
+ #else
+ #define HWY_NATIVE_SATURATED_ABS
+ #endif
+
+ HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPV, SaturatedAbs, qabs)
+ #endif  // HWY_SVE_HAVE_2
+
+ // ------------------------------ MaskedAbsOr
+ HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGMV_M, MaskedAbsOr, abs)
+
+ // ------------------------------ MaskedAbs
+ HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGMV_Z, MaskedAbs, abs)

  // ------------------------------ Mul

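The clang workaround above computes signed-integer `Abs` without `svabs`: with `sign = v >> (bits - 1)` (arithmetic shift, so all ones for negative `v`, zero otherwise), `(v + sign) ^ sign` is `v` for non-negative inputs and `~(v - 1) == -v` for negative ones; the float overload instead just clears the IEEE sign bit via `AndN`. A scalar check of the integer identity:

    #include <cassert>
    #include <cstdint>

    int32_t AbsViaSignBit(int32_t v) {
      const int32_t sign = v >> 31;  // 0 for v >= 0, -1 (all ones) for v < 0
      return (v + sign) ^ sign;      // v >= 0: unchanged; v < 0: ~(v - 1) == -v
    }

    int main() {
      assert(AbsViaSignBit(5) == 5);
      assert(AbsViaSignBit(-5) == 5);
      assert(AbsViaSignBit(0) == 0);
      return 0;
    }

(As with most two's-complement abs tricks, INT32_MIN maps to itself.)
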
@@ -1187,6 +1357,15 @@ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe)
  // ------------------------------ Sqrt
  HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt)

+ // ------------------------------ MaskedSqrt
+ #ifdef HWY_NATIVE_MASKED_SQRT
+ #undef HWY_NATIVE_MASKED_SQRT
+ #else
+ #define HWY_NATIVE_MASKED_SQRT
+ #endif
+
+ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV_Z, MaskedSqrt, sqrt)
+
  // ------------------------------ ApproximateReciprocalSqrt
  #ifdef HWY_NATIVE_F64_APPROX_RSQRT
  #undef HWY_NATIVE_F64_APPROX_RSQRT
@@ -1466,14 +1645,17 @@ HWY_API svbool_t LowerHalfOfMask(D /*d*/, svbool_t m) {
  #endif

  namespace detail {
- HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMin, min)
- HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMax, max)
+ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedMin, minnm)
+ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedMax, maxnm)
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedMin, min)
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedMax, max)
  HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedAdd, add)
  HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedSub, sub)
  HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMul, mul)
  HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
  HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
  HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV, MaskedDiv, div)
+ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV, MaskedSqrt, sqrt)
  #if HWY_SVE_HAVE_2
  HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatAdd, qadd)
  HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatSub, qsub)
@@ -1537,6 +1719,187 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
  }
  #endif

+ // ------------------------------ MaskedMulAddOr
+ namespace detail {
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVVV, MaskedMulAdd, mad)
+ }
+
+ // Per-target flag to prevent generic_ops-inl.h from defining int
+ // MaskedMulAddOr.
+ #ifdef HWY_NATIVE_MASKED_INT_FMA
+ #undef HWY_NATIVE_MASKED_INT_FMA
+ #else
+ #define HWY_NATIVE_MASKED_INT_FMA
+ #endif
+
+ template <class V, class M>
+ HWY_API V MaskedMulAddOr(V no, M m, V mul, V x, V add) {
+   return IfThenElse(m, detail::MaskedMulAdd(m, mul, x, add), no);
+ }
+
+ template <class V, HWY_IF_FLOAT_V(V), class M>
+ HWY_API V MaskedSqrtOr(V no, M m, V v) {
+   return IfThenElse(m, detail::MaskedSqrt(m, v), no);
+ }
+
+ // ================================================== REDUCE
+
+ #ifdef HWY_NATIVE_REDUCE_SCALAR
+ #undef HWY_NATIVE_REDUCE_SCALAR
+ #else
+ #define HWY_NATIVE_REDUCE_SCALAR
+ #endif
+
+ // These return T, suitable for ReduceSum.
+ namespace detail {
+ #define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
+     /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \
+     using T = HWY_SVE_T(BASE, BITS); \
+     using TU = MakeUnsigned<T>; \
+     constexpr uint64_t kMask = LimitsMax<TU>(); \
+     return static_cast<T>(static_cast<TU>( \
+         static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \
+   }
+
+ #define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
+     return sv##OP##_##CHAR##BITS(pg, v); \
+   }
+
+ // TODO: Remove SumOfLanesM in favor of using MaskedReduceSum
+ HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
+ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
+
+ HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
+ HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
+ // NaN if all are
+ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
+ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
+
+ #undef HWY_SVE_REDUCE
+ #undef HWY_SVE_REDUCE_ADD
+ }  // namespace detail
+
+ // detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM are more
+ // efficient for N=4 I8/U8 reductions on SVE than the default implementations
+ // of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in
+ // generic_ops-inl.h
+ #undef HWY_IF_REDUCE_D
+ #define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
+
+ #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+ #undef HWY_NATIVE_REDUCE_SUM_4_UI8
+ #else
+ #define HWY_NATIVE_REDUCE_SUM_4_UI8
+ #endif
+
+ #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+ #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+ #else
+ #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
+ #endif
+
+ template <class D, HWY_IF_REDUCE_D(D)>
+ HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
+   return detail::SumOfLanesM(detail::MakeMask(d), v);
+ }
+
+ template <class D, HWY_IF_REDUCE_D(D)>
+ HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
+   return detail::MinOfLanesM(detail::MakeMask(d), v);
+ }
+
+ template <class D, HWY_IF_REDUCE_D(D)>
+ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
+   return detail::MaxOfLanesM(detail::MakeMask(d), v);
+ }
+
+ #ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR
+ #undef HWY_NATIVE_MASKED_REDUCE_SCALAR
+ #else
+ #define HWY_NATIVE_MASKED_REDUCE_SCALAR
+ #endif
+
+ template <class D, class M>
+ HWY_API TFromD<D> MaskedReduceSum(D /*d*/, M m, VFromD<D> v) {
+   return detail::SumOfLanesM(m, v);
+ }
+ template <class D, class M>
+ HWY_API TFromD<D> MaskedReduceMin(D /*d*/, M m, VFromD<D> v) {
+   return detail::MinOfLanesM(m, v);
+ }
+ template <class D, class M>
+ HWY_API TFromD<D> MaskedReduceMax(D /*d*/, M m, VFromD<D> v) {
+   return detail::MaxOfLanesM(m, v);
+ }
+
+ // ------------------------------ SumOfLanes
+
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+   return Set(d, ReduceSum(d, v));
+ }
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+   return Set(d, ReduceMin(d, v));
+ }
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+   return Set(d, ReduceMax(d, v));
+ }
+
+ // ------------------------------ MaskedAdd etc. (IfThenElse)
+
+ #ifdef HWY_NATIVE_ZERO_MASKED_ARITH
+ #undef HWY_NATIVE_ZERO_MASKED_ARITH
+ #else
+ #define HWY_NATIVE_ZERO_MASKED_ARITH
+ #endif
+
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_Z, MaskedMax, max)
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_Z, MaskedAdd, add)
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_Z, MaskedSub, sub)
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_Z, MaskedMul, mul)
+ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV_Z, MaskedDiv, div)
+ HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV_Z, MaskedDiv, div)
+ HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV_Z, MaskedDiv, div)
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVVV_Z, MaskedMulAdd, mad)
+ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVVV_Z, MaskedNegMulAdd, msb)
+
+ // I8/U8/I16/U16 MaskedDiv is implemented after I8/U8/I16/U16 Div
+
+ #if HWY_SVE_HAVE_2
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV_Z, MaskedSaturatedAdd, qadd)
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV_Z, MaskedSaturatedSub, qsub)
+ #else
+ template <class V, class M>
+ HWY_API V MaskedSaturatedAdd(M m, V a, V b) {
+   return IfThenElseZero(m, SaturatedAdd(a, b));
+ }
+
+ template <class V, class M>
+ HWY_API V MaskedSaturatedSub(M m, V a, V b) {
+   return IfThenElseZero(m, SaturatedSub(a, b));
+ }
+ #endif
+
+ template <class V, class M, typename D = DFromV<V>, HWY_IF_I16_D(D)>
+ HWY_API V MaskedMulFixedPoint15(M m, V a, V b) {
+   return IfThenElseZero(m, MulFixedPoint15(a, b));
+ }
+
+ template <class D, class M, HWY_IF_UI32_D(D),
+           class V16 = VFromD<RepartitionToNarrow<D>>>
+ HWY_API VFromD<D> MaskedWidenMulPairwiseAdd(D d32, M m, V16 a, V16 b) {
+   return IfThenElseZero(m, WidenMulPairwiseAdd(d32, a, b));
+ }
+
+ template <class DF, class M, HWY_IF_F32_D(DF), class VBF>
+ HWY_API VFromD<DF> MaskedWidenMulPairwiseAdd(DF df, M m, VBF a, VBF b) {
+   return IfThenElseZero(m, WidenMulPairwiseAdd(df, a, b));
+ }
+
  // ================================================== COMPARE

  // mask = f(vector, vector)
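Since the block above defines `HWY_NATIVE_MASKED_REDUCE_SCALAR`, these masked reductions map straight onto SVE's predicated reduction instructions (`svaddv`, `svminv`, ...) rather than a generic mask-then-reduce fallback. A hedged usage sketch, again assuming `namespace hn = hwy::HWY_NAMESPACE`:

    // Sum only the lanes selected by the mask; inactive lanes contribute nothing.
    const hn::ScalableTag<float> d;
    const auto v = hn::Iota(d, 1.0f);                // {1, 2, 3, 4, ...}
    const auto m = hn::FirstN(d, 3);                 // first three lanes active
    const float sum = hn::MaskedReduceSum(d, m, v);  // 1 + 2 + 3 = 6
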
@@ -1596,8 +1959,122 @@ HWY_API svbool_t TestBit(const V a, const V bit) {
  return detail::NeN(And(a, bit), 0);
  }

- // ------------------------------ MaskFromVec (Ne)
- template <class V>
+ // ------------------------------ Min/Max (Lt, IfThenElse)
+
+ HWY_SVE_FOREACH_U(HWY_SVE_RETV_ARGPVV, Min, min)
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
+ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)
+
+ // Workaround for incorrect results with `svmin`.
+ #if HWY_COMPILER_CLANG
+ template <class V, HWY_IF_SIGNED_V(V)>
+ HWY_API V Min(V a, V b) {
+   return IfThenElse(Lt(a, b), a, b);
+ }
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+ HWY_API V Min(V a, V b) {
+   return IfThenElse(Or(Lt(a, b), Ne(b, b)), a, b);
+ }
+ #else
+ HWY_SVE_FOREACH_I(HWY_SVE_RETV_ARGPVV, Min, min)
+ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
+ #endif
+
+ namespace detail {
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
+ }  // namespace detail
+
+ // ================================================== SWIZZLE
+
+ // ------------------------------ ConcatEven/ConcatOdd
+
+ // WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
+ // full vector length, not rounded down to a power of two as we require).
+ namespace detail {
+
+ #define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_INLINE HWY_SVE_V(BASE, BITS) \
+       NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
+     return sv##OP##_##CHAR##BITS(lo, hi); \
+   }
+ HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
+ HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull,
+                                    uzp1)
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull,
+                                    uzp2)
+ #endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+ #if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
+ HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
+ HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
+                                    ConcatEvenBlocks, uzp1q)
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks,
+                                    uzp2q)
+ #endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
+ #endif  // defined(__ARM_FEATURE_SVE_MATMUL_FP64)
+ #undef HWY_SVE_CONCAT_EVERY_SECOND
+
+ // Used to slide up / shift whole register left; mask indicates which range
+ // to take from lo, and the rest is filled from hi starting at its lowest.
+ #define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) NAME( \
+       HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
+     return sv##OP##_##CHAR##BITS(mask, lo, hi); \
+   }
+ HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
+ #if HWY_SVE_HAVE_BF16_FEATURE
+ HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice)
+ #else
+ template <class V, HWY_IF_BF16_D(DFromV<V>)>
+ HWY_INLINE V Splice(V hi, V lo, svbool_t mask) {
+   const DFromV<V> d;
+   const RebindToUnsigned<decltype(d)> du;
+   return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask));
+ }
+ #endif  // HWY_SVE_HAVE_BF16_FEATURE
+ #undef HWY_SVE_SPLICE
+
+ }  // namespace detail
+
+ template <class D>
+ HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
+ #if HWY_SVE_IS_POW2
+   if (detail::IsFull(d)) return detail::ConcatOddFull(hi, lo);
+ #endif
+   const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
+   const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
+   return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
+ }
+
+ template <class D>
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+ #if HWY_SVE_IS_POW2
+   if (detail::IsFull(d)) return detail::ConcatEvenFull(hi, lo);
+ #endif
+   const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
+   const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
+   return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
+ }
+
+ HWY_API svuint8_t U8FromU32(const svuint32_t v) {
+   const DFromV<svuint32_t> du32;
+   const RepartitionToNarrow<decltype(du32)> du16;
+   const RepartitionToNarrow<decltype(du16)> du8;
+
+   const svuint16_t cast16 = BitCast(du16, v);
+   const svuint16_t x2 = svuzp1_u16(cast16, cast16);
+   const svuint8_t cast8 = BitCast(du8, x2);
+   return svuzp1_u8(cast8, cast8);
+ }
+
+ // ================================================== MASK
+
+ // ------------------------------ MaskFromVec (Ne)
+ template <class V>
  HWY_API svbool_t MaskFromVec(const V v) {
    using T = TFromV<V>;
    return detail::NeN(v, ConvertScalarTo<T>(0));
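Worth noting for the clang `Min` fallback above: `IfThenElse(Or(Lt(a, b), Ne(b, b)), a, b)` reproduces `svminnm`-style NaN handling — when exactly one operand is NaN, the number is returned. A scalar model of that logic:

    // Scalar model of the float Min fallback: Ne(b, b) is true only for NaN b.
    float MinModel(float a, float b) {
      const bool b_is_nan = (b != b);
      // b NaN -> take a; a NaN and b a number -> both tests false -> take b.
      return (a < b) || b_is_nan ? a : b;
    }
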
@@ -1612,6 +2089,87 @@ HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
  }

+ // ------------------------------ BitsFromMask (AndN, Shl, ReduceSum, GetLane,
+ // ConcatEvenFull, U8FromU32)
+
+ namespace detail {
+
+ // For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
+ HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
+   return svdup_n_u8_z(m, 1);
+ }
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
+ HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
+   const ScalableTag<uint8_t> d8;
+   const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
+   return detail::ConcatEvenFull(b16, b16);  // lower half
+ }
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
+ HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
+   return U8FromU32(svdup_n_u32_z(m, 1));
+ }
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
+   const ScalableTag<uint32_t> d32;
+   const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
+   return U8FromU32(detail::ConcatEvenFull(b64, b64));  // lower half
+ }
+
+ // Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
+ HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
+   const ScalableTag<uint8_t> d8;
+   const ScalableTag<uint16_t> d16;
+   const ScalableTag<uint32_t> d32;
+   const ScalableTag<uint64_t> d64;
+   // TODO(janwas): could use SVE2 BDEP, but it's optional.
+   x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
+   x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
+   x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
+   return BitCast(d64, x);
+ }
+
+ }  // namespace detail
+
+ // BitsFromMask is required if `HWY_MAX_BYTES <= 64`, which is true for the
+ // fixed-size SVE targets.
+ #if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE_256
+ template <class D>
+ HWY_API uint64_t BitsFromMask(D d, svbool_t mask) {
+   const Repartition<uint64_t, D> du64;
+   svuint64_t bits_in_u64 = detail::BitsFromBool(detail::BoolFromMask<D>(mask));
+
+   constexpr size_t N = MaxLanes(d);
+   static_assert(N < 64, "SVE2_128 and SVE_256 are only 128 or 256 bits");
+   const uint64_t valid = (1ull << N) - 1;
+   HWY_IF_CONSTEXPR(N <= 8) {
+     // Upper bits are undefined even if N == 8, hence mask.
+     return GetLane(bits_in_u64) & valid;
+   }
+
+   // Up to 8 of the least-significant bits of each u64 lane are valid.
+   bits_in_u64 = detail::AndN(bits_in_u64, 0xFF);
+
+   // 128-bit vector: only two u64, so avoid ReduceSum.
+   HWY_IF_CONSTEXPR(HWY_TARGET == HWY_SVE2_128) {
+     alignas(16) uint64_t lanes[2];
+     Store(bits_in_u64, du64, lanes);
+     // lanes[0] is always valid because we know N > 8, but lanes[1] might
+     // not be - we may mask it out below.
+     const uint64_t result = lanes[0] + (lanes[1] << 8);
+     // 8-bit lanes, no further masking
+     HWY_IF_CONSTEXPR(N == 16) return result;
+     return result & valid;
+   }
+
+   // Shift the 8-bit groups into place in each u64 lane.
+   alignas(32) uint64_t kShifts[4] = {0 * 8, 1 * 8, 2 * 8, 3 * 8};
+   bits_in_u64 = Shl(bits_in_u64, Load(du64, kShifts));
+   return ReduceSum(du64, bits_in_u64) & valid;
+ }
+
+ #endif  // HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE_256
+
  // ------------------------------ IsNegative (Lt)
  #ifdef HWY_NATIVE_IS_NEGATIVE
  #undef HWY_NATIVE_IS_NEGATIVE
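How the `BitsFromBool` ladder works: each step ORs the vector with a copy shifted right by 7, then 14, then 28 bits (viewed as successively wider lanes), so the eight 0/1 bytes b0..b7 of each u64 lane end up as bits b0..b7 of its low byte; bit i receives exactly original bit 8i, since 8k - 7m lands in [0, 7] only for m = k. A scalar check for one u64 lane:

    #include <cassert>
    #include <cstdint>

    // Scalar model of detail::BitsFromBool for one u64 lane whose eight bytes
    // each hold 0 or 1.
    uint64_t BitsFromBoolModel(uint64_t x) {
      x |= x >> 7;   // pairs of bytes -> 2 bits
      x |= x >> 14;  // pairs of pairs -> 4 bits
      x |= x >> 28;  // nibbles -> 8 bits
      return x & 0xFF;
    }

    int main() {
      // Bytes b0..b7 = 1,0,1,1,0,0,0,1 -> 0b10001101 = 0x8D.
      assert(BitsFromBoolModel(0x0100000001010001ull) == 0x8D);
      return 0;
    }
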
@@ -1736,6 +2294,56 @@ HWY_API svbool_t IsFinite(const V v) {
  return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField<T>()));
  }

+ // ------------------------------ MulByPow2/MulByFloorPow2
+
+ #define HWY_SVE_MUL_BY_POW2(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(int, BITS) exp) { \
+     return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, exp); \
+   }
+
+ HWY_SVE_FOREACH_F(HWY_SVE_MUL_BY_POW2, MulByPow2, scale)
+
+ #undef HWY_SVE_MUL_BY_POW2
+
+ // ------------------------------ MaskedEq etc.
+ #ifdef HWY_NATIVE_MASKED_COMP
+ #undef HWY_NATIVE_MASKED_COMP
+ #else
+ #define HWY_NATIVE_MASKED_COMP
+ #endif
+
+ // mask = f(mask, vector, vector)
+ #define HWY_SVE_COMPARE_Z(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API svbool_t NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, \
+                         HWY_SVE_V(BASE, BITS) b) { \
+     return sv##OP##_##CHAR##BITS(m, a, b); \
+   }
+
+ HWY_SVE_FOREACH(HWY_SVE_COMPARE_Z, MaskedEq, cmpeq)
+ HWY_SVE_FOREACH(HWY_SVE_COMPARE_Z, MaskedNe, cmpne)
+ HWY_SVE_FOREACH(HWY_SVE_COMPARE_Z, MaskedLt, cmplt)
+ HWY_SVE_FOREACH(HWY_SVE_COMPARE_Z, MaskedLe, cmple)
+
+ #undef HWY_SVE_COMPARE_Z
+
+ template <class V, class M, class D = DFromV<V>>
+ HWY_API MFromD<D> MaskedGt(M m, V a, V b) {
+   // Swap args to reverse comparison
+   return MaskedLt(m, b, a);
+ }
+
+ template <class V, class M, class D = DFromV<V>>
+ HWY_API MFromD<D> MaskedGe(M m, V a, V b) {
+   // Swap args to reverse comparison
+   return MaskedLe(m, b, a);
+ }
+
+ template <class V, class M, class D = DFromV<V>>
+ HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
+   return MaskedNe(m, v, v);
+ }
+
  // ================================================== MEMORY

  // ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream
@@ -1855,6 +2463,38 @@ HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {

  #endif  // HWY_TARGET != HWY_SVE2_128

+ // Truncate to smaller size and store
+ #ifdef HWY_NATIVE_STORE_TRUNCATED
+ #undef HWY_NATIVE_STORE_TRUNCATED
+ #else
+ #define HWY_NATIVE_STORE_TRUNCATED
+ #endif
+
+ #define HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, TO_BITS) \
+   template <size_t N, int kPow2> \
+   HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
+                     const HWY_SVE_D(BASE, BITS, N, kPow2) d, \
+                     HWY_SVE_T(BASE, TO_BITS) * HWY_RESTRICT p) { \
+     sv##OP##_##CHAR##BITS(detail::MakeMask(d), detail::NativeLanePointer(p), \
+                           v); \
+   }
+
+ #define HWY_SVE_STORE_TRUNCATED_BYTE(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 8)
+ #define HWY_SVE_STORE_TRUNCATED_HALF(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 16)
+ #define HWY_SVE_STORE_TRUNCATED_WORD(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 32)
+
+ HWY_SVE_FOREACH_UI16(HWY_SVE_STORE_TRUNCATED_BYTE, TruncateStore, st1b)
+ HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_BYTE, TruncateStore, st1b)
+ HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_BYTE, TruncateStore, st1b)
+ HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_HALF, TruncateStore, st1h)
+ HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_HALF, TruncateStore, st1h)
+ HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_WORD, TruncateStore, st1w)
+
+ #undef HWY_SVE_STORE_TRUNCATED
+
  // ------------------------------ Load/Store

  // SVE only requires lane alignment, not natural alignment of the entire
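`TruncateStore` maps onto SVE's truncating stores (`st1b`/`st1h`/`st1w`), which write only the low `TO_BITS` of each lane directly to memory — no separate demote step and no saturation; high bits are simply dropped. A hedged usage sketch, assuming `namespace hn = hwy::HWY_NAMESPACE`:

    // Write the low byte of each u32 lane straight into a byte array.
    const hn::ScalableTag<uint32_t> d32;
    uint8_t out[HWY_MAX_BYTES];  // comfortably large enough for any target
    const auto v = hn::Set(d32, 0x1234u);
    hn::TruncateStore(v, d32, out);  // first Lanes(d32) bytes are all 0x34
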
@@ -1985,6 +2625,7 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT p,
  v1 = svget2(tuple, 1); \
  }
  HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
+ HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD2, LoadInterleaved2, ld2)

  #undef HWY_SVE_LOAD2

@@ -2003,6 +2644,7 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
  v2 = svget3(tuple, 2); \
  }
  HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
+ HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD3, LoadInterleaved3, ld3)

  #undef HWY_SVE_LOAD3

@@ -2022,6 +2664,7 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
  v3 = svget4(tuple, 3); \
  }
  HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
+ HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD4, LoadInterleaved4, ld4)

  #undef HWY_SVE_LOAD4

@@ -2037,6 +2680,7 @@ HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
  Create2(d, v0, v1)); \
  }
  HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
+ HWY_SVE_FOREACH_BF16(HWY_SVE_STORE2, StoreInterleaved2, st2)

  #undef HWY_SVE_STORE2

@@ -2053,6 +2697,7 @@ HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
  Create3(d, v0, v1, v2)); \
  }
  HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
+ HWY_SVE_FOREACH_BF16(HWY_SVE_STORE3, StoreInterleaved3, st3)

  #undef HWY_SVE_STORE3

@@ -2069,9 +2714,13 @@ HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
  Create4(d, v0, v1, v2, v3)); \
  }
  HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)
+ HWY_SVE_FOREACH_BF16(HWY_SVE_STORE4, StoreInterleaved4, st4)

  #undef HWY_SVE_STORE4

+ // Fall back on generic Load/StoreInterleaved[234] for any emulated types.
+ // Requires HWY_GENERIC_IF_EMULATED_D to mirror HWY_SVE_IF_EMULATED_D.
+
  // ================================================== CONVERT

  // ------------------------------ PromoteTo
@@ -2312,17 +2961,6 @@ HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint32_t v) {
  return svuzp1_u8(x2, x2);
  }

- HWY_API svuint8_t U8FromU32(const svuint32_t v) {
-   const DFromV<svuint32_t> du32;
-   const RepartitionToNarrow<decltype(du32)> du16;
-   const RepartitionToNarrow<decltype(du16)> du8;
-
-   const svuint16_t cast16 = BitCast(du16, v);
-   const svuint16_t x2 = svuzp1_u16(cast16, cast16);
-   const svuint8_t cast8 = BitCast(du8, x2);
-   return svuzp1_u8(cast8, cast8);
- }
-
  template <size_t N, int kPow2>
  HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint16_t v) {
  #if HWY_SVE_HAVE_2
@@ -2575,79 +3213,6 @@ HWY_API VFromD<D> DemoteTo(D dn, V v) {
  return BitCast(dn, TruncateTo(dn_u, detail::SaturateU<TFromD<D>>(v)));
  }

- // ------------------------------ ConcatEven/ConcatOdd
-
- // WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
- // full vector length, not rounded down to a power of two as we require).
- namespace detail {
-
- #define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
-   HWY_INLINE HWY_SVE_V(BASE, BITS) \
-       NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
-     return sv##OP##_##CHAR##BITS(lo, hi); \
-   }
- HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
- HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
- #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
- HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull,
-                                    uzp1)
- HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull,
-                                    uzp2)
- #endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
- #if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
- HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
- HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
- #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
- HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
-                                    ConcatEvenBlocks, uzp1q)
- HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks,
-                                    uzp2q)
- #endif  // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
- #endif  // defined(__ARM_FEATURE_SVE_MATMUL_FP64)
- #undef HWY_SVE_CONCAT_EVERY_SECOND
-
- // Used to slide up / shift whole register left; mask indicates which range
- // to take from lo, and the rest is filled from hi starting at its lowest.
- #define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
-   HWY_API HWY_SVE_V(BASE, BITS) NAME( \
-       HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
-     return sv##OP##_##CHAR##BITS(mask, lo, hi); \
-   }
- HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
- #if HWY_SVE_HAVE_BF16_FEATURE
- HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice)
- #else
- template <class V, HWY_IF_BF16_D(DFromV<V>)>
- HWY_INLINE V Splice(V hi, V lo, svbool_t mask) {
-   const DFromV<V> d;
-   const RebindToUnsigned<decltype(d)> du;
-   return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask));
- }
- #endif  // HWY_SVE_HAVE_BF16_FEATURE
- #undef HWY_SVE_SPLICE
-
- }  // namespace detail
-
- template <class D>
- HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
- #if HWY_SVE_IS_POW2
-   if (detail::IsFull(d)) return detail::ConcatOddFull(hi, lo);
- #endif
-   const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
-   const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
-   return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
- }
-
- template <class D>
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
- #if HWY_SVE_IS_POW2
-   if (detail::IsFull(d)) return detail::ConcatEvenFull(hi, lo);
- #endif
-   const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
-   const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
-   return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
- }
-
  // ------------------------------ PromoteEvenTo/PromoteOddTo

  // Signed to signed PromoteEvenTo: 1 instruction instead of 2 in generic-inl.h.
@@ -2793,6 +3358,41 @@ HWY_API svfloat32_t DemoteTo(Simd<float, N, kPow2> d, const svuint64_t v) {
  HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
  #undef HWY_SVE_CONVERT

+ // ------------------------------ MaskedConvertTo F
+
+ #define HWY_SVE_MASKED_CONVERT_TO_OR_ZERO(BASE, CHAR, BITS, HALF, NAME, OP) \
+   /* Float from signed */ \
+   template <size_t N, int kPow2> \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
+            HWY_SVE_V(int, BITS) v) { \
+     return sv##OP##_##CHAR##BITS##_s##BITS##_z(m, v); \
+   } \
+   /* Float from unsigned */ \
+   template <size_t N, int kPow2> \
+   HWY_API HWY_SVE_V(BASE, BITS) \
+       NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
+            HWY_SVE_V(uint, BITS) v) { \
+     return sv##OP##_##CHAR##BITS##_u##BITS##_z(m, v); \
+   } \
+   /* Signed from float, rounding toward zero */ \
+   template <size_t N, int kPow2> \
+   HWY_API HWY_SVE_V(int, BITS) \
+       NAME(svbool_t m, HWY_SVE_D(int, BITS, N, kPow2) /* d */, \
+            HWY_SVE_V(BASE, BITS) v) { \
+     return sv##OP##_s##BITS##_##CHAR##BITS##_z(m, v); \
+   } \
+   /* Unsigned from float, rounding toward zero */ \
+   template <size_t N, int kPow2> \
+   HWY_API HWY_SVE_V(uint, BITS) \
+       NAME(svbool_t m, HWY_SVE_D(uint, BITS, N, kPow2) /* d */, \
+            HWY_SVE_V(BASE, BITS) v) { \
+     return sv##OP##_u##BITS##_##CHAR##BITS##_z(m, v); \
+   }
+
+ HWY_SVE_FOREACH_F(HWY_SVE_MASKED_CONVERT_TO_OR_ZERO, MaskedConvertTo, cvt)
+ #undef HWY_SVE_MASKED_CONVERT_TO_OR_ZERO
+
  // ------------------------------ NearestInt (Round, ConvertTo)
  template <class VF, class DI = RebindToSigned<DFromV<VF>>>
@@ -2800,7 +3400,14 @@ HWY_API VFromD<DI> NearestInt(VF v) {
  return ConvertTo(DI(), Round(v));
  }

- // ------------------------------ Iota (Add, ConvertTo)
+ template <class DI32, HWY_IF_I32_D(DI32)>
+ HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
+                                         VFromD<Rebind<double, DI32>> v) {
+   // No single instruction; round, then demote.
+   return DemoteTo(di32, Round(v));
+ }
+
+ // ------------------------------ Iota (AddN, ConvertTo)

  #define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2, typename T2> \
@@ -2813,13 +3420,64 @@ HWY_API VFromD<DI> NearestInt(VF v) {
  HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
  #undef HWY_SVE_IOTA

- template <class D, typename T2, HWY_IF_FLOAT_D(D)>
+ template <class D, typename T = TFromD<D>, typename T2, HWY_IF_FLOAT(T)>
  HWY_API VFromD<D> Iota(const D d, T2 first) {
    const RebindToSigned<D> di;
-   return detail::AddN(ConvertTo(d, Iota(di, 0)),
-                       ConvertScalarTo<TFromD<D>>(first));
+   const T first_f = ConvertScalarTo<T>(first);
+   const VFromD<D> iota_f = ConvertTo(d, Iota(di, 0));
+   return detail::AddN(iota_f, first_f);
+ }
+
+ // ================================================== LANE ACCESS
+
+ // ------------------------------ ExtractLane (GetLaneM, FirstN)
+ template <class V>
+ HWY_API TFromV<V> ExtractLane(V v, size_t i) {
+   return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
+ }
+
+ // ------------------------------ InsertLane (IfThenElse, EqN)
+ template <class V, typename T>
+ HWY_API V InsertLane(const V v, size_t i, T t) {
+   static_assert(sizeof(TFromV<V>) == sizeof(T), "Lane size mismatch");
+   const DFromV<V> d;
+   const RebindToSigned<decltype(d)> di;
+   using TI = TFromD<decltype(di)>;
+   const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
+   // The actual type may be int16_t for special floats; copy, not cast.
+   TFromV<V> t_bits;
+   hwy::CopySameSize(&t, &t_bits);
+   return IfThenElse(RebindMask(d, is_i), Set(d, t_bits), v);
  }

+ // ------------------------------ GetExponent
+
+ #if HWY_SVE_HAVE_2 || HWY_IDE
+ #ifdef HWY_NATIVE_GET_EXPONENT
+ #undef HWY_NATIVE_GET_EXPONENT
+ #else
+ #define HWY_NATIVE_GET_EXPONENT
+ #endif
+
+ namespace detail {
+ #define HWY_SVE_GET_EXP(BASE, CHAR, BITS, HALF, NAME, OP) \
+   HWY_API HWY_SVE_V(int, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
+     return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
+   }
+ HWY_SVE_FOREACH_F(HWY_SVE_GET_EXP, GetExponent, logb)
+ #undef HWY_SVE_GET_EXP
+ }  // namespace detail
+
+ template <class V, HWY_IF_FLOAT_V(V)>
+ HWY_API V GetExponent(V v) {
+   const DFromV<V> d;
+   const RebindToSigned<decltype(d)> di;
+   const VFromD<decltype(di)> exponent_int = detail::GetExponent(v);
+   // convert integer to original type
+   return ConvertTo(d, exponent_int);
+ }
+ #endif  // HWY_SVE_HAVE_2

  // ------------------------------ InterleaveLower

  template <class D, class V>
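`ExtractLane` composes two pieces from earlier in this diff: `FirstN(d, i)` activates lanes [0, i), and `detail::GetLaneM` (svlasta) returns the element after the last active lane — i.e. lane i; for i == 0 the all-false predicate falls back to lane 0, which is also correct. A hedged usage sketch, assuming `namespace hn = hwy::HWY_NAMESPACE`:

    const hn::ScalableTag<int32_t> d;
    const auto v = hn::Iota(d, 100);                         // {100, 101, 102, ...}
    const int32_t third = hn::ExtractLane(v, 2);             // 102
    const auto patched = hn::InsertLane(v, 2, int32_t{7});   // {100, 101, 7, ...}
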
@@ -2945,10 +3603,10 @@ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/,

  namespace detail {

- #if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+ #if (HWY_TARGET == HWY_SVE_256 && HWY_HAVE_CONSTEXPR_LANES) || HWY_IDE
  template <class D, HWY_IF_T_SIZE_D(D, 1)>
  svbool_t MaskLowerHalf(D d) {
-   switch (Lanes(d)) {
+   switch (MaxLanes(d)) {
      case 32:
        return svptrue_pat_b8(SV_VL16);
      case 16:
@@ -2963,7 +3621,7 @@ svbool_t MaskLowerHalf(D d) {
  }
  template <class D, HWY_IF_T_SIZE_D(D, 2)>
  svbool_t MaskLowerHalf(D d) {
-   switch (Lanes(d)) {
+   switch (MaxLanes(d)) {
      case 16:
        return svptrue_pat_b16(SV_VL8);
      case 8:
@@ -2976,7 +3634,7 @@ svbool_t MaskLowerHalf(D d) {
  }
  template <class D, HWY_IF_T_SIZE_D(D, 4)>
  svbool_t MaskLowerHalf(D d) {
-   switch (Lanes(d)) {
+   switch (MaxLanes(d)) {
      case 8:
        return svptrue_pat_b32(SV_VL4);
      case 4:
@@ -2987,7 +3645,7 @@ svbool_t MaskLowerHalf(D d) {
  }
  template <class D, HWY_IF_T_SIZE_D(D, 8)>
  svbool_t MaskLowerHalf(D d) {
-   switch (Lanes(d)) {
+   switch (MaxLanes(d)) {
      case 4:
        return svptrue_pat_b64(SV_VL2);
      default:
@@ -2995,10 +3653,10 @@ svbool_t MaskLowerHalf(D d) {
    }
  }
  #endif
- #if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
+ #if (HWY_TARGET == HWY_SVE2_128 && HWY_HAVE_CONSTEXPR_LANES) || HWY_IDE
  template <class D, HWY_IF_T_SIZE_D(D, 1)>
  svbool_t MaskLowerHalf(D d) {
-   switch (Lanes(d)) {
+   switch (MaxLanes(d)) {
      case 16:
        return svptrue_pat_b8(SV_VL8);
      case 8:
@@ -3013,7 +3671,7 @@ svbool_t MaskLowerHalf(D d) {
  }
  template <class D, HWY_IF_T_SIZE_D(D, 2)>
  svbool_t MaskLowerHalf(D d) {
-   switch (Lanes(d)) {
+   switch (MaxLanes(d)) {
      case 8:
        return svptrue_pat_b16(SV_VL4);
      case 4:
@@ -3026,14 +3684,15 @@ svbool_t MaskLowerHalf(D d) {
  }
  template <class D, HWY_IF_T_SIZE_D(D, 4)>
  svbool_t MaskLowerHalf(D d) {
-   return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1);
+   return svptrue_pat_b32(MaxLanes(d) == 4 ? SV_VL2 : SV_VL1);
  }
  template <class D, HWY_IF_T_SIZE_D(D, 8)>
  svbool_t MaskLowerHalf(D /*d*/) {
    return svptrue_pat_b64(SV_VL1);
  }
  #endif  // HWY_TARGET == HWY_SVE2_128
- #if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
+ #if (HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128) || \
+     !HWY_HAVE_CONSTEXPR_LANES
  template <class D>
  svbool_t MaskLowerHalf(D d) {
    return FirstN(d, Lanes(d) / 2);
@@ -3089,7 +3748,7 @@ HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
  // ------------------------------ ConcatLowerUpper
  template <class D, class V>
  HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
- #if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128  // constexpr Lanes
+ #if HWY_HAVE_CONSTEXPR_LANES
    if (detail::IsFull(d)) {
      return detail::Ext<Lanes(d) / 2>(hi, lo);
    }
@@ -3135,150 +3794,26 @@ HWY_API V LowerHalf(D2 /* tag */, const V v) {
  }

  template <class V>
- HWY_API V LowerHalf(const V v) {
- return v;
- }
-
- template <class DH, class V>
- HWY_API V UpperHalf(const DH dh, const V v) {
- const Twice<decltype(dh)> d;
- // Cast so that we support bfloat16_t.
- const RebindToUnsigned<decltype(d)> du;
- const VFromD<decltype(du)> vu = BitCast(du, v);
- #if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
- return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
- #else
- const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
- return BitCast(d, detail::Splice(vu, vu, mask));
- #endif
- }
-
- // ================================================== REDUCE
-
- #ifdef HWY_NATIVE_REDUCE_SCALAR
- #undef HWY_NATIVE_REDUCE_SCALAR
- #else
- #define HWY_NATIVE_REDUCE_SCALAR
- #endif
-
- // These return T, suitable for ReduceSum.
- namespace detail {
- #define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
- HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
- /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \
- using T = HWY_SVE_T(BASE, BITS); \
- using TU = MakeUnsigned<T>; \
- constexpr uint64_t kMask = LimitsMax<TU>(); \
- return static_cast<T>(static_cast<TU>( \
- static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \
- }
-
- #define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
- HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
- return sv##OP##_##CHAR##BITS(pg, v); \
- }
-
- HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
- HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
-
- HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
- HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
- // NaN if all are
- HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
- HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
-
- #undef HWY_SVE_REDUCE
- #undef HWY_SVE_REDUCE_ADD
- } // namespace detail
-
- // detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM is more
- // efficient for N=4 I8/U8 reductions on SVE than the default implementations
- // of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in
- // generic_ops-inl.h
- #undef HWY_IF_REDUCE_D
- #define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
-
- #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
- #undef HWY_NATIVE_REDUCE_SUM_4_UI8
- #else
- #define HWY_NATIVE_REDUCE_SUM_4_UI8
- #endif
-
- #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
- #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
- #else
- #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
- #endif
-
- template <class D, HWY_IF_REDUCE_D(D)>
- HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
- return detail::SumOfLanesM(detail::MakeMask(d), v);
- }
-
- template <class D, HWY_IF_REDUCE_D(D)>
- HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
- return detail::MinOfLanesM(detail::MakeMask(d), v);
- }
-
- template <class D, HWY_IF_REDUCE_D(D)>
- HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
- return detail::MaxOfLanesM(detail::MakeMask(d), v);
- }
-
- // ------------------------------ SumOfLanes
-
- template <class D, HWY_IF_LANES_GT_D(D, 1)>
- HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
- return Set(d, ReduceSum(d, v));
- }
- template <class D, HWY_IF_LANES_GT_D(D, 1)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, ReduceMin(d, v));
- }
- template <class D, HWY_IF_LANES_GT_D(D, 1)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, ReduceMax(d, v));
- }
-
- // ================================================== SWIZZLE
-
- // ------------------------------ GetLane
-
- namespace detail {
- #define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
- HWY_INLINE HWY_SVE_T(BASE, BITS) \
- NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
- return sv##OP##_##CHAR##BITS(mask, v); \
- }
-
- HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
- HWY_SVE_FOREACH(HWY_SVE_GET_LANE, ExtractLastMatchingLaneM, lastb)
- #undef HWY_SVE_GET_LANE
- } // namespace detail
-
- template <class V>
- HWY_API TFromV<V> GetLane(V v) {
- return detail::GetLaneM(v, detail::PFalse());
- }
-
- // ------------------------------ ExtractLane
- template <class V>
- HWY_API TFromV<V> ExtractLane(V v, size_t i) {
- return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
+ HWY_API V LowerHalf(const V v) {
+ return v;
  }

- // ------------------------------ InsertLane (IfThenElse)
- template <class V, typename T>
- HWY_API V InsertLane(const V v, size_t i, T t) {
- static_assert(sizeof(TFromV<V>) == sizeof(T), "Lane size mismatch");
- const DFromV<V> d;
- const RebindToSigned<decltype(d)> di;
- using TI = TFromD<decltype(di)>;
- const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
- return IfThenElse(RebindMask(d, is_i),
- Set(d, hwy::ConvertScalarTo<TFromV<V>>(t)), v);
+ template <class DH, class V>
+ HWY_API V UpperHalf(const DH dh, const V v) {
+ const Twice<decltype(dh)> d;
+ // Cast so that we support bfloat16_t.
+ const RebindToUnsigned<decltype(d)> du;
+ const VFromD<decltype(du)> vu = BitCast(du, v);
+ #if HWY_HAVE_CONSTEXPR_LANES
+ return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
+ #else
+ const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
+ return BitCast(d, detail::Splice(vu, vu, mask));
+ #endif
  }

+ // ================================================== SWIZZLE
+
  // ------------------------------ DupEven

  namespace detail {
@@ -3447,6 +3982,40 @@ HWY_API V TwoTablesLookupLanes(V a, V b,
  return TwoTablesLookupLanes(d, a, b, idx);
  }

+ // ------------------------------ SlideUpLanes (FirstN)
+ template <class D>
+ HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
+ return detail::Splice(v, Zero(d), FirstN(d, amt));
+ }
+
+ // ------------------------------ Slide1Up
+
+ #ifdef HWY_NATIVE_SLIDE1_UP_DOWN
+ #undef HWY_NATIVE_SLIDE1_UP_DOWN
+ #else
+ #define HWY_NATIVE_SLIDE1_UP_DOWN
+ #endif
+
+ template <class D>
+ HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
+ return SlideUpLanes(d, v, 1);
+ }
+
+ // ------------------------------ SlideDownLanes (TableLookupLanes)
+ template <class D>
+ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+ const RebindToUnsigned<decltype(d)> du;
+ using TU = TFromD<decltype(du)>;
+ const auto idx = Iota(du, static_cast<TU>(amt));
+ return IfThenElseZero(FirstN(d, Lanes(d) - amt), TableLookupLanes(v, idx));
+ }
+
+ // ------------------------------ Slide1Down
+ template <class D>
+ HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
+ return SlideDownLanes(d, v, 1);
+ }
+
  // ------------------------------ SwapAdjacentBlocks (TableLookupLanes)

  namespace detail {
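
The Slide* ops inserted above (relocated from later in the file; see the matching removal below) derive SlideUpLanes from Splice with a FirstN mask, and SlideDownLanes from a shifted Iota index table plus zeroing of the vacated tail. A plain-array sketch of the intended lane movement (model functions are illustrative, not part of Highway):

    #include <cstddef>
    #include <vector>

    // SlideUp: lanes move toward higher indices, zeros fill the bottom.
    std::vector<int> SlideUpModel(const std::vector<int>& v, size_t amt) {
      std::vector<int> r(v.size(), 0);
      for (size_t i = amt; i < v.size(); ++i) r[i] = v[i - amt];
      return r;
    }

    // SlideDown: lanes move toward lower indices, zeros fill the tail.
    std::vector<int> SlideDownModel(const std::vector<int>& v, size_t amt) {
      std::vector<int> r(v.size(), 0);
      for (size_t i = 0; i + amt < v.size(); ++i) r[i] = v[i + amt];
      return r;
    }
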
@@ -3476,6 +4045,40 @@ HWY_API V SwapAdjacentBlocks(const V v) {
  #endif
  }

+ // ------------------------------ InterleaveEvenBlocks
+ // (ConcatLowerLower, SlideUpLanes, OddEvenBlocks)
+
+ template <class D, class V = VFromD<D>>
+ HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
+ #if HWY_TARGET == HWY_SVE_256
+ return ConcatLowerLower(d, b, a);
+ #elif HWY_TARGET == HWY_SVE2_128
+ (void)d;
+ (void)b;
+ return a;
+ #else
+ constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
+ return OddEvenBlocks(SlideUpLanes(d, b, kLanesPerBlock), a);
+ #endif
+ }
+
+ // ------------------------------ InterleaveOddBlocks
+ // (ConcatUpperUpper, SlideDownLanes, OddEvenBlocks)
+
+ template <class D, class V = VFromD<D>>
+ HWY_API V InterleaveOddBlocks(D d, V a, V b) {
+ #if HWY_TARGET == HWY_SVE_256
+ return ConcatUpperUpper(d, b, a);
+ #elif HWY_TARGET == HWY_SVE2_128
+ (void)d;
+ (void)b;
+ return a;
+ #else
+ constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
+ return OddEvenBlocks(b, SlideDownLanes(d, a, kLanesPerBlock));
+ #endif
+ }
+
  // ------------------------------ Reverse

  namespace detail {
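
InterleaveEvenBlocks/InterleaveOddBlocks work on 16-byte blocks: the result interleaves the even (respectively odd) blocks of a and b, degenerating to ConcatLowerLower/ConcatUpperUpper on the two-block HWY_SVE_256 target and to a itself on single-block HWY_SVE2_128. A block-level sketch (each element stands for one block; the helper is illustrative):

    #include <cstddef>
    #include <vector>

    // Result blocks: a0, b0, a2, b2, ...; with a single block, the result
    // is simply a.
    std::vector<int> InterleaveEvenBlocksModel(const std::vector<int>& a,
                                               const std::vector<int>& b) {
      std::vector<int> r(a.size());
      if (r.size() == 1) {
        r[0] = a[0];
        return r;
      }
      for (size_t i = 0; i + 1 < r.size(); i += 2) {
        r[i] = a[i];      // even block of a
        r[i + 1] = b[i];  // same-index block of b
      }
      return r;
    }
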
@@ -3630,43 +4233,6 @@ HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
  HWY_SVE_FOREACH_UI(HWY_SVE_REVERSE_BITS, ReverseBits, rbit)
  #undef HWY_SVE_REVERSE_BITS

- // ------------------------------ SlideUpLanes
-
- template <class D>
- HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
- return detail::Splice(v, Zero(d), FirstN(d, amt));
- }
-
- // ------------------------------ Slide1Up
-
- #ifdef HWY_NATIVE_SLIDE1_UP_DOWN
- #undef HWY_NATIVE_SLIDE1_UP_DOWN
- #else
- #define HWY_NATIVE_SLIDE1_UP_DOWN
- #endif
-
- template <class D>
- HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
- return SlideUpLanes(d, v, 1);
- }
-
- // ------------------------------ SlideDownLanes (TableLookupLanes)
-
- template <class D>
- HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
- const RebindToUnsigned<decltype(d)> du;
- using TU = TFromD<decltype(du)>;
- const auto idx = Iota(du, static_cast<TU>(amt));
- return IfThenElseZero(FirstN(d, Lanes(d) - amt), TableLookupLanes(v, idx));
- }
-
- // ------------------------------ Slide1Down
-
- template <class D>
- HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
- return SlideDownLanes(d, v, 1);
- }
-
  // ------------------------------ Block insert/extract/broadcast ops
  #if HWY_TARGET != HWY_SVE2_128

@@ -4668,6 +5234,12 @@ HWY_API V MaskedDivOr(V no, M m, V a, V b) {
  return IfThenElse(m, Div(a, b), no);
  }

+ template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+ HWY_API V MaskedDiv(M m, V a, V b) {
+ return IfThenElseZero(m, Div(a, b));
+ }
+
  // ------------------------------ Mod (Div, NegMulAdd)
  template <class V>
  HWY_API V Mod(V a, V b) {
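
The new MaskedDiv follows the zeroing-mask convention: IfThenElseZero yields 0 in inactive lanes, unlike MaskedDivOr, which substitutes the no operand. A per-lane model (illustrative, scalars standing in for lanes):

    // Active lanes compute a / b; inactive lanes become zero.
    int MaskedDivLaneModel(bool active, int a, int b) {
      return active ? a / b : 0;
    }
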
@@ -4680,28 +5252,50 @@ HWY_API V MaskedModOr(V no, M m, V a, V b) {
  return IfThenElse(m, Mod(a, b), no);
  }

- // ------------------------------ BroadcastSignBit (ShiftRight)
- template <class V>
- HWY_API V BroadcastSignBit(const V v) {
- return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
- }
-
  // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
  template <class V>
  HWY_API V IfNegativeThenElse(V v, V yes, V no) {
  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
  return IfThenElse(IsNegative(v), yes, no);
  }
+ // ------------------------------ IfNegativeThenNegOrUndefIfZero
+
+ #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+ #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+ #else
+ #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+ #endif
+
+ #define HWY_SVE_NEG_IF(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) v) { \
+ return sv##OP##_##CHAR##BITS##_m(v, IsNegative(mask), v); \
+ }
+
+ HWY_SVE_FOREACH_IF(HWY_SVE_NEG_IF, IfNegativeThenNegOrUndefIfZero, neg)
+
+ #undef HWY_SVE_NEG_IF

  // ------------------------------ AverageRound (ShiftRight)

+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI32
+ #else
+ #define HWY_NATIVE_AVERAGE_ROUND_UI32
+ #endif
+
+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI64
+ #else
+ #define HWY_NATIVE_AVERAGE_ROUND_UI64
+ #endif
+
  #if HWY_SVE_HAVE_2
- HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
- HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
  #else
- template <class V>
- V AverageRound(const V a, const V b) {
- return ShiftRight<1>(detail::AddN(Add(a, b), 1));
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+ HWY_API V AverageRound(const V a, const V b) {
+ return Sub(Or(a, b), ShiftRight<1>(Xor(a, b)));
  }
  #endif // HWY_SVE_HAVE_2
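
The replaced AverageRound fallback computed ShiftRight<1>(AddN(Add(a, b), 1)), which wraps when a + b exceeds the lane type's range; the new form relies on the overflow-free identity (a + b + 1) / 2 == (a | b) - ((a ^ b) >> 1). A standalone exhaustive check of that identity over all u8 pairs (assumes assertions are enabled):

    #include <cassert>
    #include <cstdint>

    uint8_t AverageRoundModel(uint8_t a, uint8_t b) {
      // Integer promotion makes the subtraction exact; nothing wraps.
      return static_cast<uint8_t>((a | b) - ((a ^ b) >> 1));
    }

    int main() {
      for (int a = 0; a < 256; ++a) {
        for (int b = 0; b < 256; ++b) {
          // Reference uses wide arithmetic, so a + b + 1 cannot overflow.
          assert(AverageRoundModel(a, b) == (a + b + 1) / 2);
        }
      }
    }
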
@@ -4710,6 +5304,12 @@ V AverageRound(const V a, const V b) {
  // `p` points to at least 8 readable bytes, not all of which need be valid.
  template <class D, HWY_IF_T_SIZE_D(D, 1)>
  HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
+ #if HWY_COMPILER_CLANG >= 1901 || HWY_COMPILER_GCC_ACTUAL >= 1200
+ typedef svbool_t UnalignedSveMaskT
+ __attribute__((__aligned__(1), __may_alias__));
+ (void)d;
+ return *reinterpret_cast<const UnalignedSveMaskT*>(bits);
+ #else
  // TODO(janwas): with SVE2.1, load to vector, then PMOV
  const RebindToUnsigned<D> du;
  const svuint8_t iota = Iota(du, 0);
@@ -4722,6 +5322,7 @@ HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  const svuint8_t bit =
  svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
  return TestBit(rep8, bit);
+ #endif
  }

  template <class D, HWY_IF_T_SIZE_D(D, 2)>
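
The new LoadMaskBits fast path exploits the fact that an SVE predicate for 8-bit elements holds exactly one bit per lane, so the packed mask bytes in memory already have the predicate's layout; the __aligned__(1) plus __may_alias__ typedef makes the unaligned, type-punned load well defined on the listed GCC/Clang versions. The same pattern demonstrated with a standard type (illustrative; memcpy is the fully portable alternative):

    #include <cstdint>
    #include <cstring>

    // GCC/Clang extension: an aliasing-safe, alignment-1 view of uint32_t.
    typedef uint32_t UnalignedU32 __attribute__((__aligned__(1), __may_alias__));

    uint32_t LoadU32(const uint8_t* p) {
      return *reinterpret_cast<const UnalignedU32*>(p);
    }

    uint32_t LoadU32Portable(const uint8_t* p) {
      uint32_t x;
      std::memcpy(&x, p, sizeof(x));  // compilers lower this to a plain load
      return x;
    }
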
@@ -4854,57 +5455,31 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  return TestBit(BitCast(du, bytes), bit);
  }

- // ------------------------------ StoreMaskBits
-
- namespace detail {
-
- // For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
- template <class T, HWY_IF_T_SIZE(T, 1)>
- HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
- return svdup_n_u8_z(m, 1);
- }
- template <class T, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
- const ScalableTag<uint8_t> d8;
- const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
- return detail::ConcatEvenFull(b16, b16); // lower half
- }
- template <class T, HWY_IF_T_SIZE(T, 4)>
- HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
- return U8FromU32(svdup_n_u32_z(m, 1));
- }
- template <class T, HWY_IF_T_SIZE(T, 8)>
- HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
- const ScalableTag<uint32_t> d32;
- const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
- return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half
- }
-
- // Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
- HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
- const ScalableTag<uint8_t> d8;
- const ScalableTag<uint16_t> d16;
- const ScalableTag<uint32_t> d32;
- const ScalableTag<uint64_t> d64;
- // TODO(janwas): could use SVE2 BDEP, but it's optional.
- x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
- x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
- x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
- return BitCast(d64, x);
- }
-
- } // namespace detail
+ // ------------------------------ StoreMaskBits (BitsFromMask)

  // `p` points to at least 8 writable bytes.
- // TODO(janwas): specialize for HWY_SVE_256
  // TODO(janwas): with SVE2.1, use PMOV to store to vector, then StoreU
  template <class D>
  HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
- svuint64_t bits_in_u64 =
- detail::BitsFromBool(detail::BoolFromMask<TFromD<D>>(m));
+ #if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+ constexpr size_t N = MaxLanes(d);
+ const uint64_t bits64 = BitsFromMask(d, m);
+ HWY_IF_CONSTEXPR(N < 8) {
+ // BitsFromMask guarantees upper bits are zero, hence no masking.
+ bits[0] = static_cast<uint8_t>(bits64);
+ }
+ else {
+ static_assert(N % 8 == 0, "N is pow2 >= 8, hence divisible");
+ static_assert(HWY_IS_LITTLE_ENDIAN, "");
+ hwy::CopyBytes<N / 8>(&bits64, bits);
+ }
+ constexpr size_t num_bytes = hwy::DivCeil(N, size_t{8});
+ return num_bytes;
+ #else
+ svuint64_t bits_in_u64 = detail::BitsFromBool(detail::BoolFromMask<D>(m));

  const size_t num_bits = Lanes(d);
- const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below
+ const size_t num_bytes = hwy::DivCeil(num_bits, size_t{8});

  // Truncate each u64 to 8 bits and store to u8.
  svst1b_u64(FirstN(ScalableTag<uint64_t>(), num_bytes), bits, bits_in_u64);
@@ -4918,6 +5493,7 @@ HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
  // Else: we wrote full bytes because num_bits is a power of two >= 8.

  return num_bytes;
+ #endif // HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
  }

  // ------------------------------ CompressBits (LoadMaskBits)
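
On targets without a constant lane count, StoreMaskBits still packs per-lane 0/1 bytes into bits via the shift-and-Or cascade in detail::BitsFromBool: shifts of 7, 14 and 28, each optionally taken, sum to the 7*i displacement that moves byte i's bit to bit position i. The same compaction applied to a plain uint64_t whose bytes are all 0 or 1 (standalone sketch, not the lane-wise SVE version):

    #include <cassert>
    #include <cstdint>

    uint8_t PackBoolBytes(uint64_t x) {  // every byte of x must be 0 or 1
      x |= x >> 7;   // byte pairs merge into bit pairs
      x |= x >> 14;  // bit pairs merge into nibbles
      x |= x >> 28;  // nibbles merge into one byte
      return static_cast<uint8_t>(x);
    }

    int main() {
      // Bytes, least significant first: 1,0,1,1,0,0,0,1 -> 0b10001101.
      assert(PackBoolBytes(0x0100000001010001ull) == 0b10001101);
    }
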
@@ -5738,6 +6314,38 @@ HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
  return detail::InterleaveOdd(lo, hi);
  }

+ // ------------------------------ PairwiseAdd/PairwiseSub
+ #if HWY_TARGET != HWY_SCALAR
+ #if HWY_SVE_HAVE_2 || HWY_IDE
+
+ #ifdef HWY_NATIVE_PAIRWISE_ADD
+ #undef HWY_NATIVE_PAIRWISE_ADD
+ #else
+ #define HWY_NATIVE_PAIRWISE_ADD
+ #endif
+
+ namespace detail {
+ #define HWY_SVE_SV_PAIRWISE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
+ template <size_t N, int kPow2> \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, HWY_SVE_V(BASE, BITS) a, \
+ HWY_SVE_V(BASE, BITS) b) { \
+ return sv##OP##_##CHAR##BITS##_m(HWY_SVE_PTRUE(BITS), a, b); \
+ }
+
+ HWY_SVE_FOREACH(HWY_SVE_SV_PAIRWISE_ADD, PairwiseAdd, addp)
+ #undef HWY_SVE_SV_PAIRWISE_ADD
+ } // namespace detail
+
+ // Pairwise add returning interleaved output of a and b
+ template <class D, class V, HWY_IF_LANES_GT_D(D, 1)>
+ HWY_API V PairwiseAdd(D d, V a, V b) {
+ return detail::PairwiseAdd(d, a, b);
+ }
+
+ #endif // HWY_SVE_HAVE_2
+ #endif // HWY_TARGET != HWY_SCALAR
+
  // ------------------------------ WidenMulPairwiseAdd

  template <size_t N, int kPow2>
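
svaddp produces an interleaved result, as the comment above notes: even output lanes hold the sums of adjacent pairs of a, odd lanes those of b. A plain-vector sketch of that lane layout (helper name is illustrative):

    #include <cstddef>
    #include <vector>

    std::vector<int> PairwiseAddModel(const std::vector<int>& a,
                                      const std::vector<int>& b) {
      std::vector<int> r(a.size());
      for (size_t i = 0; i < r.size() / 2; ++i) {
        r[2 * i] = a[2 * i] + a[2 * i + 1];      // pair sums of a: even lanes
        r[2 * i + 1] = b[2 * i] + b[2 * i + 1];  // pair sums of b: odd lanes
      }
      return r;
    }
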
@@ -5776,6 +6384,29 @@ HWY_API svuint32_t WidenMulPairwiseAdd(Simd<uint32_t, N, kPow2> d32,
  #endif
  }

+ // ------------------------------ SatWidenMulPairwiseAccumulate
+ #if HWY_SVE_HAVE_2
+ #define HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2(BASE, CHAR, BITS, HALF, NAME, OP) \
+ template <size_t N, int kPow2> \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(HWY_SVE_D(BASE, BITS, N, kPow2) dw, HWY_SVE_V(BASE, HALF) a, \
+ HWY_SVE_V(BASE, HALF) b, HWY_SVE_V(BASE, BITS) sum) { \
+ auto product = svmlalt_##CHAR##BITS(svmullb_##CHAR##BITS(a, b), a, b); \
+ const auto mul_overflow = IfThenElseZero( \
+ Eq(product, Set(dw, LimitsMin<int##BITS##_t>())), Set(dw, -1)); \
+ return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)), \
+ Add(product, mul_overflow)); \
+ }
+ HWY_SVE_FOREACH_UI16(HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2,
+ SatWidenMulPairwiseAccumulate, _)
+ HWY_SVE_FOREACH_UI32(HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2,
+ SatWidenMulPairwiseAccumulate, _)
+ HWY_SVE_FOREACH_UI64(HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2,
+ SatWidenMulPairwiseAccumulate, _)
+
+ #undef HWY_SVE_SAT_MUL_WIDEN_PW_ACC_SVE_2
+ #endif
+
  // ------------------------------ SatWidenMulAccumFixedPoint

  #if HWY_SVE_HAVE_2
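
For two int16 pairs, the widened product a0*b0 + a1*b1 overflows int32 only when all four inputs are -32768: both products are then 2^30 and their sum 2^31 wraps to LimitsMin, which is exactly the case the mul_overflow correction above detects and patches before the saturated add. A scalar reference that instead uses a wide intermediate (illustrative; the SVE code deliberately avoids the 64-bit widening):

    #include <cstdint>
    #include <limits>

    int32_t SatDot2AccModel(int16_t a0, int16_t a1, int16_t b0, int16_t b1,
                            int32_t sum) {
      const int64_t wide = int64_t{a0} * b0 + int64_t{a1} * b1 + sum;
      const int64_t lo = std::numeric_limits<int32_t>::min();
      const int64_t hi = std::numeric_limits<int32_t>::max();
      return static_cast<int32_t>(wide < lo ? lo : (wide > hi ? hi : wide));
    }
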
@@ -5938,6 +6569,130 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(DU64 /*du64*/, svuint16_t a,
  return svdot_u64(sum, a, b);
  }

+ // ------------------------------ MulComplex* / MaskedMulComplex*
+
+ // Per-target flag to prevent generic_ops-inl.h from defining MulComplex*.
+ #ifdef HWY_NATIVE_CPLX
+ #undef HWY_NATIVE_CPLX
+ #else
+ #define HWY_NATIVE_CPLX
+ #endif
+
+ template <class V, HWY_IF_NOT_UNSIGNED(TFromV<V>)>
+ HWY_API V ComplexConj(V a) {
+ return OddEven(Neg(a), a);
+ }
+
+ namespace detail {
+ #define HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, ROT) \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME##ROT(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \
+ HWY_SVE_V(BASE, BITS) c) { \
+ return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b, c, ROT); \
+ } \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME##Z##ROT(svbool_t m, HWY_SVE_V(BASE, BITS) a, \
+ HWY_SVE_V(BASE, BITS) b, HWY_SVE_V(BASE, BITS) c) { \
+ return sv##OP##_##CHAR##BITS##_z(m, a, b, c, ROT); \
+ }
+
+ #define HWY_SVE_CPLX_FMA(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 0) \
+ HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 90) \
+ HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 180) \
+ HWY_SVE_CPLX_FMA_ROT(BASE, CHAR, BITS, HALF, NAME, OP, 270)
+
+ // Only SVE2 has complex multiply add for integer types
+ // and these do not include masked variants
+ HWY_SVE_FOREACH_F(HWY_SVE_CPLX_FMA, ComplexMulAdd, cmla)
+ #undef HWY_SVE_CPLX_FMA
+ #undef HWY_SVE_CPLX_FMA_ROT
+ } // namespace detail
+
+ template <class V, class M, HWY_IF_FLOAT_V(V)>
+ HWY_API V MaskedMulComplexConjAdd(M mask, V a, V b, V c) {
+ const V t = detail::ComplexMulAddZ0(mask, c, b, a);
+ return detail::ComplexMulAddZ270(mask, t, b, a);
+ }
+
+ template <class V, class M, HWY_IF_FLOAT_V(V)>
+ HWY_API V MaskedMulComplexConj(M mask, V a, V b) {
+ return MaskedMulComplexConjAdd(mask, a, b, Zero(DFromV<V>()));
+ }
+
+ template <class V, HWY_IF_FLOAT_V(V)>
+ HWY_API V MulComplexAdd(V a, V b, V c) {
+ return detail::ComplexMulAdd90(detail::ComplexMulAdd0(c, a, b), a, b);
+ }
+
+ template <class V, HWY_IF_FLOAT_V(V)>
+ HWY_API V MulComplex(V a, V b) {
+ return MulComplexAdd(a, b, Zero(DFromV<V>()));
+ }
+
+ template <class V, class M, HWY_IF_FLOAT_V(V)>
+ HWY_API V MaskedMulComplexOr(V no, M mask, V a, V b) {
+ return IfThenElse(mask, MulComplex(a, b), no);
+ }
+
+ template <class V, HWY_IF_FLOAT_V(V)>
+ HWY_API V MulComplexConjAdd(V a, V b, V c) {
+ return detail::ComplexMulAdd270(detail::ComplexMulAdd0(c, b, a), b, a);
+ }
+
+ template <class V, HWY_IF_FLOAT_V(V)>
+ HWY_API V MulComplexConj(V a, V b) {
+ return MulComplexConjAdd(a, b, Zero(DFromV<V>()));
+ }
+
+ // TODO SVE2 does have intrinsics for integers but not masked variants
+ template <class V, HWY_IF_NOT_FLOAT_V(V)>
+ HWY_API V MulComplex(V a, V b) {
+ // a = u + iv, b = x + iy
+ const auto u = DupEven(a);
+ const auto v = DupOdd(a);
+ const auto x = DupEven(b);
+ const auto y = DupOdd(b);
+
+ return OddEven(MulAdd(u, y, Mul(v, x)), Sub(Mul(u, x), Mul(v, y)));
+ }
+
+ template <class V, HWY_IF_NOT_FLOAT_V(V)>
+ HWY_API V MulComplexConj(V a, V b) {
+ // a = u + iv, b = x + iy
+ const auto u = DupEven(a);
+ const auto v = DupOdd(a);
+ const auto x = DupEven(b);
+ const auto y = DupOdd(b);
+
+ return OddEven(Sub(Mul(v, x), Mul(u, y)), MulAdd(u, x, Mul(v, y)));
+ }
+
+ template <class V, HWY_IF_NOT_FLOAT_V(V)>
+ HWY_API V MulComplexAdd(V a, V b, V c) {
+ return Add(MulComplex(a, b), c);
+ }
+
+ template <class V, HWY_IF_NOT_FLOAT_V(V)>
+ HWY_API V MulComplexConjAdd(V a, V b, V c) {
+ return Add(MulComplexConj(a, b), c);
+ }
+
+ template <class V, class M, HWY_IF_NOT_FLOAT_V(V)>
+ HWY_API V MaskedMulComplexConjAdd(M mask, V a, V b, V c) {
+ return IfThenElseZero(mask, MulComplexConjAdd(a, b, c));
+ }
+
+ template <class V, class M, HWY_IF_NOT_FLOAT_V(V)>
+ HWY_API V MaskedMulComplexConj(M mask, V a, V b) {
+ return IfThenElseZero(mask, MulComplexConj(a, b));
+ }
+
+ template <class V, class M, HWY_IF_NOT_FLOAT_V(V)>
+ HWY_API V MaskedMulComplexOr(V no, M mask, V a, V b) {
+ return IfThenElse(mask, MulComplex(a, b), no);
+ }
+
  // ------------------------------ AESRound / CLMul

  // Static dispatch with -march=armv8-a+sve2+aes, or dynamic dispatch WITHOUT a
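
The integer MulComplex above evaluates (u + iv)(x + iy) = (ux - vy) + i(uy + vx) on interleaved (real, imaginary) lanes: DupEven/DupOdd broadcast u, v, x and y, and OddEven merges the real part into even lanes and the imaginary part into odd lanes. A scalar check of the identity against std::complex:

    #include <cassert>
    #include <complex>

    int main() {
      const std::complex<double> a(2.0, 3.0), b(4.0, -1.0);
      const double u = a.real(), v = a.imag(), x = b.real(), y = b.imag();
      const std::complex<double> manual(u * x - v * y, u * y + v * x);
      assert(manual == a * b);  // both are (11, 10); products are exact
    }
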
@@ -6183,6 +6938,22 @@ HWY_API V HighestSetBitIndex(V v) {
  return BitCast(d, Sub(Set(d, T{sizeof(T) * 8 - 1}), LeadingZeroCount(v)));
  }

+ #ifdef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+ #undef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+ #else
+ #define HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+ #endif
+
+ #define HWY_SVE_MASKED_LEADING_ZERO_COUNT(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \
+ const DFromV<decltype(v)> d; \
+ return BitCast(d, sv##OP##_##CHAR##BITS##_z(m, v)); \
+ }
+
+ HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, MaskedLeadingZeroCount,
+ clz)
+ #undef HWY_SVE_MASKED_LEADING_ZERO_COUNT
+
  // ================================================== END MACROS
  #undef HWY_SVE_ALL_PTRUE
  #undef HWY_SVE_D
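
MaskedLeadingZeroCount uses zeroing predication (the _z intrinsic form): active lanes receive clz(v), inactive lanes become 0. A per-lane model using C++20 std::countl_zero, which, like the SVE instruction, returns the full bit width for a zero input (illustrative):

    #include <bit>
    #include <cstdint>

    uint32_t MaskedClzLaneModel(bool active, uint32_t v) {
      return active ? static_cast<uint32_t>(std::countl_zero(v)) : 0u;
    }
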
@@ -6216,13 +6987,20 @@ HWY_API V HighestSetBitIndex(V v) {
  #undef HWY_SVE_IF_NOT_EMULATED_D
  #undef HWY_SVE_PTRUE
  #undef HWY_SVE_RETV_ARGMVV
+ #undef HWY_SVE_RETV_ARGMVV_Z
+ #undef HWY_SVE_RETV_ARGMV_Z
+ #undef HWY_SVE_RETV_ARGMV
+ #undef HWY_SVE_RETV_ARGMVV_Z
  #undef HWY_SVE_RETV_ARGPV
  #undef HWY_SVE_RETV_ARGPVN
  #undef HWY_SVE_RETV_ARGPVV
  #undef HWY_SVE_RETV_ARGV
  #undef HWY_SVE_RETV_ARGVN
+ #undef HWY_SVE_RETV_ARGMV_M
  #undef HWY_SVE_RETV_ARGVV
  #undef HWY_SVE_RETV_ARGVVV
+ #undef HWY_SVE_RETV_ARGMVVV_Z
+ #undef HWY_SVE_RETV_ARGMVVV
  #undef HWY_SVE_T
  #undef HWY_SVE_UNDEFINED
  #undef HWY_SVE_V