@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
package/include/hwy/ops/ppc_vsx-inl.h

@@ -13,9 +13,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// 128-bit vectors for VSX
+// 128-bit vectors for VSX/Z14
 // External include guard in highway.h - see comment there.
 
+#if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
+#define HWY_S390X_HAVE_Z14 1
+#else
+#define HWY_S390X_HAVE_Z14 0
+#endif
+
 #pragma push_macro("vector")
 #pragma push_macro("pixel")
 #pragma push_macro("bool")
@@ -24,7 +30,11 @@
 #undef pixel
 #undef bool
 
+#if HWY_S390X_HAVE_Z14
+#include <vecintrin.h>
+#else
 #include <altivec.h>
+#endif
 
 #pragma pop_macro("vector")
 #pragma pop_macro("pixel")
@@ -37,20 +47,26 @@
 // This means we can only use POWER10-specific intrinsics in static dispatch
 // mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
 // On other compilers, the usual target check is sufficient.
-#if HWY_TARGET <= HWY_PPC9 && \
+#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
     (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
 #define HWY_PPC_HAVE_9 1
 #else
 #define HWY_PPC_HAVE_9 0
 #endif
 
-#if HWY_TARGET <= HWY_PPC10 && \
+#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
     (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
 #define HWY_PPC_HAVE_10 1
 #else
 #define HWY_PPC_HAVE_10 0
 #endif
 
+#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
+#define HWY_S390X_HAVE_Z15 1
+#else
+#define HWY_S390X_HAVE_Z15 0
+#endif
+
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
@@ -125,6 +141,9 @@ class Vec128 {
   HWY_INLINE Vec128& operator-=(const Vec128 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec128& operator%=(const Vec128 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec128& operator&=(const Vec128 other) {
     return *this = (*this & other);
   }
@@ -180,9 +199,6 @@ HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
 template <class D>
 using VFromD = decltype(Zero(D()));
 
-// ------------------------------ Tuple (VFromD)
-#include "hwy/ops/tuple-inl.h"
-
 // ------------------------------ BitCast
 
 template <class D, typename FromT>
@@ -215,6 +231,12 @@ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
   return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
 }
 
+template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
+HWY_API VFromD<D> Set(D d, TFromD<D> t) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
+}
+
 // Returns a vector with uninitialized elements.
 template <class D>
 HWY_API VFromD<D> Undefined(D d) {
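The special-float Set overload added above splats f16/bf16 values by reinterpreting their 16-bit payload as an unsigned integer, splatting that integer, and bit-casting the vector back. A minimal scalar sketch of the BitCastScalar step, using an illustrative Float16 wrapper (the wrapper is an assumption for this sketch, not Highway's actual type):

    #include <cstdint>
    #include <cstring>

    struct Float16 { uint16_t bits; };  // storage-only 16-bit float stand-in

    // Reinterpret the bytes of one value as another type of the same size,
    // which is what BitCastScalar does for lane types.
    template <typename To, typename From>
    To BitCastScalar(const From& v) {
      static_assert(sizeof(To) == sizeof(From), "size mismatch");
      To to;
      std::memcpy(&to, &v, sizeof(to));
      return to;
    }

    int main() {
      const Float16 h{0x3C00};  // 1.0 in IEEE binary16
      return BitCastScalar<uint16_t>(h) == 0x3C00 ? 0 : 1;  // splat this u16
    }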
@@ -222,6 +244,8 @@ HWY_API VFromD<D> Undefined(D d) {
   // Suppressing maybe-uninitialized both here and at the caller does not work,
   // so initialize.
   return Zero(d);
+#elif HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
+  return VFromD<D>{__builtin_nondeterministic_value(Zero(d).raw)};
 #else
   HWY_DIAGNOSTICS(push)
   HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
@@ -240,6 +264,58 @@ HWY_API T GetLane(Vec128<T, N> v) {
   return static_cast<T>(v.raw[0]);
 }
 
+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+  const typename detail::Raw128<TFromD<D>>::type raw = {
+      t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
+  return VFromD<D>{raw};
+}
+
+template <class D, HWY_IF_UI16_D(D)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,
+                                                        t4, t5, t6, t7};
+  return VFromD<D>{raw};
+}
+
+template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(
+      d, Dup128VecFromValues(
+             du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
+             BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
+             BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
+             BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};
+  return VFromD<D>{raw};
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
+  return VFromD<D>{raw};
+}
+
 // ================================================== LOGICAL
 
 // ------------------------------ And
@@ -249,7 +325,11 @@ HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
   const DFromV<decltype(a)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
+#if HWY_S390X_HAVE_Z14
+  return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
+#else
   return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
+#endif
 }
 
 // ------------------------------ AndNot
@@ -271,7 +351,11 @@ HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
   const DFromV<decltype(a)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
+#if HWY_S390X_HAVE_Z14
+  return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
+#else
   return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
+#endif
 }
 
 // ------------------------------ Xor
@@ -281,7 +365,11 @@ HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
   const DFromV<decltype(a)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
+#if HWY_S390X_HAVE_Z14
+  return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
+#else
   return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
+#endif
 }
 
 // ------------------------------ Not
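On the Z14 paths above, And/Or/Xor apply plain &, |, ^ directly to the raw vector values rather than calling AltiVec-style intrinsics; GCC and Clang define these operators lanewise for vector types. A standalone sketch of that compiler extension (illustrative, not Highway code):

    #include <cstdint>

    // GCC/Clang vector extension: a 16-byte vector of four uint32_t lanes.
    typedef uint32_t U32x4 __attribute__((vector_size(16)));

    int main() {
      const U32x4 a = {1u, 2u, 3u, 4u};
      const U32x4 mask = {0xFu, 0xFu, 0xFu, 0xFu};
      const U32x4 anded = a & mask;  // lanewise AND, no intrinsic required
      return anded[2] == 3u ? 0 : 1;
    }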
@@ -476,9 +564,21 @@ HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
 
 // ------------------------------ Neg
 
-template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
-HWY_INLINE Vec128<T, N> Neg(Vec128<T, N> v) {
+template <typename T, size_t N, HWY_IF_SIGNED(T)>
+HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
+  // If T is a signed integer type, use Zero(d) - v instead of vec_neg to
+  // avoid undefined behavior in the case where v[i] == LimitsMin<T>()
+  const DFromV<decltype(v)> d;
+  return Zero(d) - v;
+}
+
+template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
+HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
+#if HWY_S390X_HAVE_Z14
+  return Xor(v, SignBit(DFromV<decltype(v)>()));
+#else
   return Vec128<T, N>{vec_neg(v.raw)};
+#endif
 }
 
 template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
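The new integer Neg computes Zero(d) - v instead of per-lane unary minus: negating the most negative value (e.g. INT32_MIN) is signed overflow, hence undefined behavior in C++, whereas the subtraction is routed through unsigned lanes where wraparound is well defined. A scalar sketch of the same trick:

    #include <cstdint>
    #include <limits>

    // Wrapping negation: maps INT32_MIN to itself rather than invoking UB.
    int32_t WrappingNeg(int32_t v) {
      // 0u - v wraps modulo 2^32, so the subtraction is always defined.
      return static_cast<int32_t>(0u - static_cast<uint32_t>(v));
    }

    int main() {
      const int32_t min = std::numeric_limits<int32_t>::min();
      return WrappingNeg(min) == min ? 0 : 1;  // INT32_MIN negates to itself
    }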
@@ -489,13 +589,40 @@ HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
 // ------------------------------ Abs
 
 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
-template <class T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
+template <class T, size_t N, HWY_IF_SIGNED(T)>
+HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
+  // If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to
+  // avoid undefined behavior in the case where v[i] == LimitsMin<T>().
+  return Max(v, Neg(v));
+}
+
+template <class T, size_t N, HWY_IF_FLOAT3264(T)>
 HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
   return Vec128<T, N>{vec_abs(v.raw)};
 }
 
 // ------------------------------ CopySign
 
+#if HWY_S390X_HAVE_Z14
+template <class V>
+HWY_API V CopySign(const V magn, const V sign) {
+  static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
+
+  const DFromV<decltype(magn)> d;
+  const auto msb = SignBit(d);
+
+  // Truth table for msb, magn, sign | bitwise msb ? sign : magn
+  //                 0     0     0  |  0
+  //                 0     0     1  |  0
+  //                 0     1     0  |  1
+  //                 0     1     1  |  1
+  //                 1     0     0  |  0
+  //                 1     0     1  |  1
+  //                 1     1     0  |  0
+  //                 1     1     1  |  1
+  return BitwiseIfThenElse(msb, sign, magn);
+}
+#else  // VSX
 template <size_t N>
 HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
                                   Vec128<float, N> sign) {
@@ -525,6 +652,7 @@ HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
   return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
 #endif
 }
+#endif  // HWY_S390X_HAVE_Z14
 
 template <typename T, size_t N>
 HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
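The Z14 CopySign is a pure bit-select: wherever the sign-bit mask is 1, take the bit from sign; elsewhere take it from magn. The classic formulation is (mask & sign) | (~mask & magn), which is what BitwiseIfThenElse computes. A scalar sketch:

    #include <cstdint>
    #include <cstring>

    // Where mask bits are 1 take b, else a (the BitwiseIfThenElse pattern).
    uint32_t BitSelect(uint32_t mask, uint32_t b, uint32_t a) {
      return (mask & b) | (~mask & a);
    }

    float CopySignF32(float magn, float sign) {
      uint32_t m, s;
      std::memcpy(&m, &magn, 4);
      std::memcpy(&s, &sign, 4);
      const uint32_t r = BitSelect(0x80000000u, s, m);  // sign bit from sign
      float out;
      std::memcpy(&out, &r, 4);
      return out;
    }

    int main() { return CopySignF32(3.0f, -1.0f) == -3.0f ? 0 : 1; }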
@@ -542,10 +670,21 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
 HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
+  // Suppress the -Wignored-attributes warning that is generated by
+  // HWY_RCAST_ALIGNED(const LoadRaw*, aligned) with GCC
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+#endif
+
   using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
-  const LoadRaw* HWY_RESTRICT p = reinterpret_cast<const LoadRaw*>(aligned);
+  const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
   using ResultRaw = typename detail::Raw128<T>::type;
   return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
+
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(pop)
+#endif
 }
 
 // Any <= 64 bit
@@ -598,19 +737,13 @@ HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
 // mask ? yes : 0
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
-  const DFromV<decltype(yes)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d,
-                 VFromD<decltype(du)>{vec_and(BitCast(du, yes).raw, mask.raw)});
+  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
 }
 
 // mask ? 0 : no
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
-  const DFromV<decltype(no)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d,
-                 VFromD<decltype(du)>{vec_andc(BitCast(du, no).raw, mask.raw)});
+  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
 }
 
 // ------------------------------ Mask logical
@@ -622,7 +755,11 @@ HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
 
 template <typename T, size_t N>
 HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
+#if HWY_S390X_HAVE_Z14
+  return Mask128<T, N>{a.raw & b.raw};
+#else
   return Mask128<T, N>{vec_and(a.raw, b.raw)};
+#endif
 }
 
 template <typename T, size_t N>
@@ -632,12 +769,20 @@ HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
 
 template <typename T, size_t N>
 HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
+#if HWY_S390X_HAVE_Z14
+  return Mask128<T, N>{a.raw | b.raw};
+#else
   return Mask128<T, N>{vec_or(a.raw, b.raw)};
+#endif
 }
 
 template <typename T, size_t N>
 HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
+#if HWY_S390X_HAVE_Z14
+  return Mask128<T, N>{a.raw ^ b.raw};
+#else
   return Mask128<T, N>{vec_xor(a.raw, b.raw)};
+#endif
 }
 
 template <typename T, size_t N>
@@ -645,36 +790,24 @@ HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
   return Mask128<T, N>{vec_nor(a.raw, b.raw)};
 }
 
-// ------------------------------ BroadcastSignBit
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> BroadcastSignBit(Vec128<int8_t, N> v) {
-  return Vec128<int8_t, N>{
-      vec_sra(v.raw, vec_splats(static_cast<unsigned char>(7)))};
-}
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> BroadcastSignBit(Vec128<int16_t, N> v) {
-  return Vec128<int16_t, N>{
-      vec_sra(v.raw, vec_splats(static_cast<unsigned short>(15)))};
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> BroadcastSignBit(Vec128<int32_t, N> v) {
-  return Vec128<int32_t, N>{vec_sra(v.raw, vec_splats(31u))};
-}
-
-template <size_t N>
-HWY_API Vec128<int64_t, N> BroadcastSignBit(Vec128<int64_t, N> v) {
-  return Vec128<int64_t, N>{vec_sra(v.raw, vec_splats(63ULL))};
-}
-
 // ------------------------------ ShiftLeftSame
 
 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
-  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
-  return Vec128<T, N>{vec_sl(v.raw, vec_splats(static_cast<TU>(bits)))};
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using TU = TFromD<decltype(du)>;
+
+#if HWY_S390X_HAVE_Z14
+  return BitCast(d,
+                 VFromD<decltype(du)>{BitCast(du, v).raw
+                                      << Set(du, static_cast<TU>(bits)).raw});
+#else
+  // Do an unsigned vec_sl operation to avoid undefined behavior
+  return BitCast(
+      d, VFromD<decltype(du)>{
+             vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});
+#endif
 }
 
 // ------------------------------ ShiftRightSame
@@ -682,13 +815,22 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
 template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
 HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
   using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
+#if HWY_S390X_HAVE_Z14
+  return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
+#else
   return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
+#endif
 }
 
 template <typename T, size_t N, HWY_IF_SIGNED(T)>
 HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
+#if HWY_S390X_HAVE_Z14
+  using TI = typename detail::Raw128<T>::RawT;
+  return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
+#else
   using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
   return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
+#endif
 }
 
 // ------------------------------ ShiftLeft
@@ -707,6 +849,13 @@ HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
   return ShiftRightSame(v, kBits);
 }
 
+// ------------------------------ BroadcastSignBit
+
+template <typename T, size_t N, HWY_IF_SIGNED(T)>
+HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
+  return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
+}
+
 // ================================================== SWIZZLE (1)
 
 // ------------------------------ TableLookupBytes
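The four per-type BroadcastSignBit overloads collapse into one template: an arithmetic right shift by (lane bits - 1) replicates the sign bit, yielding all-zeros for non-negative lanes and all-ones for negative ones. A scalar sketch (right-shifting a negative value is arithmetic on all mainstream compilers, and guaranteed since C++20):

    #include <cstdint>

    // 0 if v >= 0, -1 (all bits set) if v < 0.
    int32_t BroadcastSignBit32(int32_t v) { return v >> 31; }

    int main() {
      return (BroadcastSignBit32(-5) == -1 && BroadcastSignBit32(7) == 0) ? 0
                                                                          : 1;
    }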
@@ -1003,7 +1152,7 @@ HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) {
   return LoadU(d, p);
 }
 
-#if HWY_PPC_HAVE_9
+#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
 #ifdef HWY_NATIVE_LOAD_N
 #undef HWY_NATIVE_LOAD_N
 #else
@@ -1027,11 +1176,20 @@ HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
   const size_t num_of_bytes_to_load =
       HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
   const Repartition<uint8_t, decltype(d)> du8;
+#if HWY_S390X_HAVE_Z14
+  return (num_of_bytes_to_load > 0)
+             ? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
+                              const_cast<unsigned char*>(
+                                  reinterpret_cast<const unsigned char*>(p)),
+                              static_cast<unsigned>(num_of_bytes_to_load - 1))})
+             : Zero(d);
+#else
   return BitCast(
       d,
       VFromD<decltype(du8)>{vec_xl_len(
          const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
          num_of_bytes_to_load)});
+#endif
 }
 
 template <class D, typename T = TFromD<D>>
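Note the off-by-one convention in the Z14 path: vec_load_len appears to take the index of the last byte to load rather than a byte count (hence num_of_bytes_to_load - 1 and the separate zero-length guard), while VSX's vec_xl_len takes a plain count. A scalar model of the assumed vec_load_len semantics, based on the underlying s390x VLL instruction (an assumption, not Highway API):

    #include <algorithm>
    #include <cstring>

    // Model: load bytes 0..min(len, 15) into a zeroed 16-byte vector
    // (assumed VLL behavior; illustrative only).
    void LoadLenModel(const unsigned char* p, unsigned len,
                      unsigned char out[16]) {
      std::memset(out, 0, 16);
      std::memcpy(out, p, std::min(len, 15u) + 1);  // len is a last-byte index
    }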
@@ -1048,18 +1206,11 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
 }
 #endif
 
-  const size_t num_of_bytes_to_load =
-      HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
-  const Repartition<uint8_t, decltype(d)> du8;
-  const VFromD<D> v = BitCast(
-      d,
-      VFromD<decltype(du8)>{vec_xl_len(
-         const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
-         num_of_bytes_to_load)});
-  return IfThenElse(FirstN(d, max_lanes_to_load), v, no);
+  return IfThenElse(FirstN(d, max_lanes_to_load),
+                    LoadN(d, p, max_lanes_to_load), no);
 }
 
-#endif  // HWY_PPC_HAVE_9
+#endif  // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
 
 // Returns a vector with lane i=[0, N) set to "first" + i.
 namespace detail {
@@ -1134,8 +1285,19 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
 
 template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
 HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
+  // Suppress the -Wignored-attributes warning that is generated by
+  // HWY_RCAST_ALIGNED(StoreRaw*, aligned) with GCC
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+#endif
+
   using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
-  *reinterpret_cast<StoreRaw*>(aligned) = reinterpret_cast<StoreRaw>(v.raw);
+  *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
+
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(pop)
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
@@ -1159,7 +1321,7 @@ HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
   Store(v, d, p);
 }
 
-#if HWY_PPC_HAVE_9
+#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
 
 #ifdef HWY_NATIVE_STORE_N
 #undef HWY_NATIVE_STORE_N
@@ -1185,8 +1347,15 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
   const size_t num_of_bytes_to_store =
       HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
   const Repartition<uint8_t, decltype(d)> du8;
+#if HWY_S390X_HAVE_Z14
+  if (num_of_bytes_to_store > 0) {
+    vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
+                  static_cast<unsigned>(num_of_bytes_to_store - 1));
+  }
+#else
   vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
               num_of_bytes_to_store);
+#endif
 }
 #endif
 
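In the next hunk, BlendedStore drops its per-lane scalar loop in favor of a vector read-modify-write: load the existing destination, blend via IfThenElse, store the whole vector back. That is far fewer operations, but it now writes even the unselected lanes (with their old values), so the full vector range must be writable and not concurrently modified. A scalar model of the new behavior:

    #include <cstddef>
    #include <cstdint>

    // Read all lanes, blend, write all lanes back (4-lane i32 example).
    void BlendedStoreModel(const int32_t v[4], const bool m[4], int32_t p[4]) {
      int32_t merged[4];
      for (size_t i = 0; i < 4; ++i) merged[i] = m[i] ? v[i] : p[i];
      for (size_t i = 0; i < 4; ++i) p[i] = merged[i];  // rewrites old values
    }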
@@ -1195,180 +1364,104 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
 template <class D>
 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                           TFromD<D>* HWY_RESTRICT p) {
-  const RebindToSigned<decltype(d)> di;  // for testing mask if T=bfloat16_t.
-  using TI = TFromD<decltype(di)>;
-  alignas(16) TI buf[MaxLanes(d)];
-  alignas(16) TI mask[MaxLanes(d)];
-  Store(BitCast(di, v), di, buf);
-  Store(BitCast(di, VecFromMask(d, m)), di, mask);
-  for (size_t i = 0; i < MaxLanes(d); ++i) {
-    if (mask[i]) {
-      CopySameSize(buf + i, p + i);
-    }
-  }
+  const VFromD<D> old = LoadU(d, p);
+  StoreU(IfThenElse(RebindMask(d, m), v, old), d, p);
 }
 
 // ================================================== ARITHMETIC
 
+namespace detail {
+// If TFromD<D> is an integer type, detail::RebindToUnsignedIfNotFloat<D>
+// rebinds D to MakeUnsigned<TFromD<D>>.
+
+// Otherwise, if TFromD<D> is a floating-point type (including F16 and BF16),
+// detail::RebindToUnsignedIfNotFloat<D> is the same as D.
+template <class D>
+using RebindToUnsignedIfNotFloat =
+    hwy::If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
+            RebindToUnsigned<D>, D>;
+}  // namespace detail
+
 // ------------------------------ Addition
 
 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
 HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{vec_add(a.raw, b.raw)};
+  const DFromV<decltype(a)> d;
+  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
+
+  // If T is an integer type, do an unsigned vec_add to avoid undefined behavior
+#if HWY_S390X_HAVE_Z14
+  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
+                                              BitCast(d_arith, b).raw});
+#else
+  return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
+                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
+#endif
 }
 
 // ------------------------------ Subtraction
 
 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
 HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{vec_sub(a.raw, b.raw)};
+  const DFromV<decltype(a)> d;
+  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
+
+  // If T is an integer type, do an unsigned vec_sub to avoid undefined behavior
+#if HWY_S390X_HAVE_Z14
+  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
+                                              BitCast(d_arith, b).raw});
+#else
+  return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
+                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
+#endif
 }
 
-// ------------------------------ SumsOf8
-namespace detail {
-
-// Casts nominally int32_t result to D.
-template <class D>
-HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
-                                     __vector signed int b) {
-  const Repartition<int32_t, D> di32;
-#ifdef __OPTIMIZE__
-  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
-    const int64_t sum0 =
-        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
-        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
-        static_cast<int64_t>(b[0]);
-    const int64_t sum1 =
-        static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
-        static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
-        static_cast<int64_t>(b[1]);
-    const int64_t sum2 =
-        static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
-        static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
-        static_cast<int64_t>(b[2]);
-    const int64_t sum3 =
-        static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
-        static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
-        static_cast<int64_t>(b[3]);
-    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
-    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
-    const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
-    const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
-    using Raw = typename detail::Raw128<int32_t>::type;
-    return BitCast(
-        d,
-        VFromD<decltype(di32)>{Raw{
-            (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
-                                    : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
-            (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
-                                    : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
-            (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
-                                    : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
-            (sign3 == (sum3 >> 31))
-                ? static_cast<int32_t>(sum3)
-                : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
-  } else  // NOLINT
-#endif
-  {
-    return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
-  }
-}
-
-// Casts nominally uint32_t result to D.
-template <class D>
-HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
-                                     __vector unsigned int b) {
-  const Repartition<uint32_t, D> du32;
-#ifdef __OPTIMIZE__
-  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
-    const uint64_t sum0 =
-        static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
-        static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
-        static_cast<uint64_t>(b[0]);
-    const uint64_t sum1 =
-        static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
-        static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
-        static_cast<uint64_t>(b[1]);
-    const uint64_t sum2 =
-        static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
-        static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
-        static_cast<uint64_t>(b[2]);
-    const uint64_t sum3 =
-        static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
-        static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
-        static_cast<uint64_t>(b[3]);
-    return BitCast(
-        d,
-        VFromD<decltype(du32)>{(__vector unsigned int){
-            static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
-            static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
-            static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
-            static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
-                                                          : 0xFFFFFFFFu)}});
-  } else  // NOLINT
-#endif
-  {
-    return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
-  }
-}
-
-// Casts nominally int32_t result to D.
-template <class D>
-HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
-                                     __vector signed int b) {
-  const Repartition<int32_t, D> di32;
-#ifdef __OPTIMIZE__
-  const Repartition<uint64_t, D> du64;
-  constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
-  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
-      __builtin_constant_p(b[kDestLaneOffset + 2])) {
-    const int64_t sum0 = static_cast<int64_t>(a[0]) +
-                         static_cast<int64_t>(a[1]) +
-                         static_cast<int64_t>(b[kDestLaneOffset]);
-    const int64_t sum1 = static_cast<int64_t>(a[2]) +
-                         static_cast<int64_t>(a[3]) +
-                         static_cast<int64_t>(b[kDestLaneOffset + 2]);
-    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
-    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
-    return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
-                          (sign0 == (sum0 >> 31))
-                              ? static_cast<uint32_t>(sum0)
-                              : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
-                          (sign1 == (sum1 >> 31))
-                              ? static_cast<uint32_t>(sum1)
-                              : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
-  } else  // NOLINT
-#endif
-  {
-    __vector signed int sum;
-
-    // Inline assembly is used for vsum2sws to avoid unnecessary shuffling
-    // on little-endian PowerPC targets as the result of the vsum2sws
-    // instruction will already be in the correct lanes on little-endian
-    // PowerPC targets.
-    __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
-
-    return BitCast(d, VFromD<decltype(di32)>{sum});
-  }
-}
-
-}  // namespace detail
-
-template <size_t N>
-HWY_API Vec128<uint64_t, N / 8> SumsOf8(Vec128<uint8_t, N> v) {
-  const Repartition<uint64_t, DFromV<decltype(v)>> du64;
-  const Repartition<int32_t, decltype(du64)> di32;
-  const RebindToUnsigned<decltype(di32)> du32;
-
-  return detail::AltivecVsum2sws(
-      du64, detail::AltivecVsum4ubs(di32, v.raw, Zero(du32).raw).raw,
-      Zero(di32).raw);
+// ------------------------------ SumsOf8
+template <class V, HWY_IF_U8(TFromV<V>)>
+HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
+  return SumsOf2(SumsOf4(v));
+}
+
+template <class V, HWY_IF_I8(TFromV<V>)>
+HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
+#if HWY_S390X_HAVE_Z14
+  const DFromV<decltype(v)> di8;
+  const RebindToUnsigned<decltype(di8)> du8;
+  const RepartitionToWideX3<decltype(di8)> di64;
+
+  return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) +
+         Set(di64, int64_t{-1024});
+#else
+  return SumsOf2(SumsOf4(v));
+#endif
 }
 
 // ------------------------------ SaturatedAdd
 
 // Returns a + b clamped to the destination range.
 
+#if HWY_S390X_HAVE_Z14
+// Z14/Z15/Z16 do not have I8/U8/I16/U16 SaturatedAdd instructions, unlike most
+// other integer SIMD instruction sets.
+
+template <typename T, size_t N, HWY_IF_UNSIGNED(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
+HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
+  return Add(a, Min(b, Not(a)));
+}
+
+template <typename T, size_t N, HWY_IF_SIGNED(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
+HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const auto sum = Add(a, b);
+  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
+  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
+  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
+}
+
+#else  // VSX
+
 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
 #undef HWY_NATIVE_I32_SATURATED_ADDSUB
 #else
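The signed SaturatedAdd emulation above uses the standard two's-complement overflow test: overflow occurred iff a and b share a sign while the wrapped sum's sign differs from a's, i.e. AndNot(Xor(a, b), Xor(a, sum)) has its sign bit set; the saturated value LimitsMax ^ BroadcastSignBit(a) is MAX for non-negative a and MIN for negative a. A scalar sketch of the same logic:

    #include <cstdint>
    #include <limits>

    int16_t SaturatedAddI16(int16_t a, int16_t b) {
      const int16_t sum = static_cast<int16_t>(int32_t{a} + int32_t{b});  // wraps
      // Overflow iff operands share a sign but the sum's sign differs from a's.
      const bool overflow = ((a ^ b) >= 0) && ((a ^ sum) < 0);
      if (!overflow) return sum;
      return (a >= 0) ? std::numeric_limits<int16_t>::max()   // clamp high
                      : std::numeric_limits<int16_t>::min();  // clamp low
    }

    int main() { return SaturatedAddI16(30000, 30000) == 32767 ? 0 : 1; }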
@@ -1386,6 +1479,7 @@ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
 HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
   return Vec128<T, N>{vec_adds(a.raw, b.raw)};
 }
+#endif  // HWY_S390X_HAVE_Z14
 
 #if HWY_PPC_HAVE_10
 
@@ -1412,11 +1506,34 @@ HWY_API V SaturatedAdd(V a, V b) {
 
 // Returns a - b clamped to the destination range.
 
+#if HWY_S390X_HAVE_Z14
+// Z14/Z15/Z16 do not have I8/U8/I16/U16 SaturatedSub instructions, unlike most
+// other integer SIMD instruction sets.
+
+template <typename T, size_t N, HWY_IF_UNSIGNED(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
+HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
+  return Sub(a, Min(a, b));
+}
+
+template <typename T, size_t N, HWY_IF_SIGNED(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
+HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const auto diff = Sub(a, b);
+  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
+  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
+  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
+}
+
+#else  // VSX
+
 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
           HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
 HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
   return Vec128<T, N>{vec_subs(a.raw, b.raw)};
 }
+#endif  // HWY_S390X_HAVE_Z14
 
 #if HWY_PPC_HAVE_10
 
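For unsigned lanes, Sub(a, Min(a, b)) computes a - min(a, b), which equals max(a - b, 0) with no compare-and-select on the result. A scalar sketch:

    #include <cstdint>

    // Unsigned saturating subtraction: clamps at zero instead of wrapping.
    uint8_t SaturatedSubU8(uint8_t a, uint8_t b) {
      const uint8_t m = b < a ? b : a;     // min(a, b)
      return static_cast<uint8_t>(a - m);  // == max(a - b, 0)
    }

    int main() { return SaturatedSubU8(3, 10) == 0 ? 0 : 1; }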
@@ -1459,32 +1576,96 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
 
 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
 HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{a.raw * b.raw};
+  const DFromV<decltype(a)> d;
+  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
+
+  // If T is an integer type, do an unsigned vec_mul to avoid undefined behavior
+#if HWY_S390X_HAVE_Z14
+  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
+                                              BitCast(d_arith, b).raw});
+#else
+  return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
+                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
+#endif
+}
+
+// Returns the upper sizeof(T)*8 bits of a * b in each lane.
+
+#if HWY_S390X_HAVE_Z14
+#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
+  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
+#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
+  hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
+#elif HWY_PPC_HAVE_10
+#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
+  HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
+#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
+  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
+#else
+#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
+  hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
+#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
+  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
+#endif
+
+#if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
+template <typename T, size_t N, HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
+}
+#endif
+
+template <typename T, HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
+  const auto p_even = MulEven(a, b);
+
+#if HWY_IS_LITTLE_ENDIAN
+  const auto p_even_full = ResizeBitCast(Full128<T>(), p_even);
+  return Vec128<T, 1>{
+      vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};
+#else
+  const DFromV<decltype(a)> d;
+  return ResizeBitCast(d, p_even);
+#endif
 }
 
-// Returns the upper 16 bits of a * b in each lane.
-template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_FLOAT(T)>
+template <typename T, size_t N,
+          HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), HWY_IF_LANES_GT(N, 1)>
 HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
   const DFromV<decltype(a)> d;
-  const RepartitionToWide<decltype(d)> dw;
-  const VFromD<decltype(dw)> p1{vec_mule(a.raw, b.raw)};
-  const VFromD<decltype(dw)> p2{vec_mulo(a.raw, b.raw)};
+
+  const auto p_even = BitCast(d, MulEven(a, b));
+  const auto p_odd = BitCast(d, MulOdd(a, b));
+
 #if HWY_IS_LITTLE_ENDIAN
-  const __vector unsigned char kShuffle = {2, 3, 18, 19, 6, 7, 22, 23,
-                                           10, 11, 26, 27, 14, 15, 30, 31};
+  return InterleaveOdd(d, p_even, p_odd);
 #else
-  const __vector unsigned char kShuffle = {0, 1, 16, 17, 4, 5, 20, 21,
-                                           8, 9, 24, 25, 12, 13, 28, 29};
+  return InterleaveEven(d, p_even, p_odd);
 #endif
-  return BitCast(d, VFromD<decltype(dw)>{vec_perm(p1.raw, p2.raw, kShuffle)});
 }
 
-template <size_t N>
-HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
-                                           Vec128<int16_t, N> b) {
-  const Vec128<int16_t> zero = Zero(Full128<int16_t>());
-  return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
+#if !HWY_PPC_HAVE_10
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
+  T p_hi;
+  Mul128(GetLane(a), GetLane(b), &p_hi);
+  return Set(Full64<T>(), p_hi);
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
+  const DFromV<decltype(a)> d;
+  const Half<decltype(d)> dh;
+  return Combine(d, MulHigh(UpperHalf(dh, a), UpperHalf(dh, b)),
+                 MulHigh(LowerHalf(dh, a), LowerHalf(dh, b)));
 }
+#endif  // !HWY_PPC_HAVE_10
+
+#undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
+#undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH
 
 // Multiplies even lanes (0, 2, ..) and places the double-wide result into
 // even and the upper half into its odd neighbor lane.
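Where no vec_mulh instruction exists, MulHigh is assembled from the widening MulEven/MulOdd products plus an interleave that keeps only the upper half of each double-wide result. The scalar equivalent is a widening multiply followed by a shift:

    #include <cstdint>

    // Upper 16 bits of the exact 32-bit product: what MulHigh returns per lane.
    int16_t MulHighI16(int16_t a, int16_t b) {
      const int32_t wide = int32_t{a} * int32_t{b};  // exact widening multiply
      return static_cast<int16_t>(wide >> 16);       // keep the high half
    }

    int main() { return MulHighI16(0x4000, 4) == 1 ? 0 : 1; }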
@@ -1506,24 +1687,83 @@ HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
   return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
 }
 
+// ------------------------------ Rol/Ror
+
+#ifdef HWY_NATIVE_ROL_ROR_8
+#undef HWY_NATIVE_ROL_ROR_8
+#else
+#define HWY_NATIVE_ROL_ROR_8
+#endif
+
+#ifdef HWY_NATIVE_ROL_ROR_16
+#undef HWY_NATIVE_ROL_ROR_16
+#else
+#define HWY_NATIVE_ROL_ROR_16
+#endif
+
+#ifdef HWY_NATIVE_ROL_ROR_32_64
+#undef HWY_NATIVE_ROL_ROR_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_32_64
+#endif
+
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(
+      d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)});
+}
+
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  return Rol(a, BitCast(d, Neg(BitCast(di, b))));
+}
+
 // ------------------------------ RotateRight
-template <int kBits, typename T, size_t N>
+template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
   const DFromV<decltype(v)> d;
   constexpr size_t kSizeInBits = sizeof(T) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
-  if (kBits == 0) return v;
-  return Vec128<T, N>{vec_rl(v.raw, Set(d, kSizeInBits - kBits).raw)};
+
+  return (kBits == 0)
+             ? v
+             : Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
+                                            kBits)));
 }
 
-// ------------------------------ ZeroIfNegative (BroadcastSignBit)
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only works for float");
+// ------------------------------ RotateLeftSame/RotateRightSame
+#ifdef HWY_NATIVE_ROL_ROR_SAME_8
+#undef HWY_NATIVE_ROL_ROR_SAME_8
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_8
+#endif
+
+#ifdef HWY_NATIVE_ROL_ROR_SAME_16
+#undef HWY_NATIVE_ROL_ROR_SAME_16
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_16
+#endif
+
+#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
+#undef HWY_NATIVE_ROL_ROR_SAME_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_32_64
+#endif
+
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> RotateLeftSame(Vec128<T, N> v, int bits) {
   const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
-  return IfThenElse(mask, Zero(d), v);
+  return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
+}
+
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> RotateRightSame(Vec128<T, N> v, int bits) {
+  const DFromV<decltype(v)> d;
+  return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
 }
 
 // ------------------------------ IfNegativeThenElse
@@ -1541,10 +1781,35 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                         BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
 #else
   const RebindToSigned<decltype(d)> di;
-  return IfThenElse(MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))),
-                    yes, no);
+  return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
+#endif
+}
+
+#if HWY_PPC_HAVE_10
+#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+#else
+#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
 #endif
+
+#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+#else
+#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+#endif
+
+template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+HWY_API V IfNegativeThenElseZero(V v, V yes) {
+  const DFromV<decltype(v)> d;
+  return IfNegativeThenElse(v, yes, Zero(d));
+}
+
+template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+HWY_API V IfNegativeThenZeroElse(V v, V no) {
+  const DFromV<decltype(v)> d;
+  return IfNegativeThenElse(v, Zero(d), no);
 }
+#endif
 
 // generic_ops takes care of integer T.
 template <typename T, size_t N, HWY_IF_FLOAT(T)>
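Ror is implemented as Rol with a negated count, which is valid because vector rotate instructions take the count modulo the lane width: rotating left by width - k is the same as rotating right by k. A scalar model with the same modulo behavior:

    #include <cstdint>

    // Rotate left with the count taken modulo 32, as the vector rotate does.
    uint32_t Rol32(uint32_t v, uint32_t bits) {
      const uint32_t k = bits & 31u;
      return (v << k) | (v >> ((32u - k) & 31u));  // masks avoid shift-by-32 UB
    }

    // Rotate right = rotate left by the (modulo-32) negated count.
    uint32_t Ror32(uint32_t v, uint32_t bits) { return Rol32(v, 0u - bits); }

    int main() { return Ror32(0x80000001u, 1) == 0xC0000000u ? 0 : 1; }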
@@ -1598,17 +1863,42 @@ HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
 #endif
 
 template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
-  return Vec128<T, N>{vec_re(v.raw)};
+HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
+#if HWY_S390X_HAVE_Z14
+  return Vec128<T, N>{a.raw / b.raw};
+#else
+  return Vec128<T, N>{vec_div(a.raw, b.raw)};
+#endif
 }
 
 template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{vec_div(a.raw, b.raw)};
+HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
+#if HWY_S390X_HAVE_Z14
+  const DFromV<decltype(v)> d;
+  return Set(d, T(1.0)) / v;
+#else
+  return Vec128<T, N>{vec_re(v.raw)};
+#endif
 }
 
 // ------------------------------ Floating-point square root
 
+#if HWY_S390X_HAVE_Z14
+// Approximate reciprocal square root
+template <size_t N>
+HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto half = v * Set(d, 0.5f);
+  // Initial guess based on log2(f)
+  const auto guess = BitCast(
+      d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
+  // One Newton-Raphson iteration
+  return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
+}
+#else  // VSX
+
 #ifdef HWY_NATIVE_F64_APPROX_RSQRT
 #undef HWY_NATIVE_F64_APPROX_RSQRT
 #else
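Z14 has no reciprocal-square-root estimate instruction, so the code above falls back to the well-known fast inverse square root: reinterpreting the float's bits as an integer makes 0x5F3759DF - (bits >> 1) a rough approximation of 1/sqrt(x), refined by one Newton-Raphson step y * (1.5 - 0.5*x*y*y). A scalar version of the same computation:

    #include <cstdint>
    #include <cstring>

    // Bit-trick initial guess plus one Newton-Raphson refinement.
    float FastRsqrt(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, 4);
      bits = 0x5F3759DFu - (bits >> 1);  // initial guess based on log2(x)
      float y;
      std::memcpy(&y, &bits, 4);
      return y * (1.5f - 0.5f * x * y * y);  // one Newton-Raphson iteration
    }

    int main() {
      const float r = FastRsqrt(4.0f);          // exact answer is 0.5
      return (r > 0.49f && r < 0.51f) ? 0 : 1;  // within ~0.2% after one step
    }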
@@ -1620,6 +1910,7 @@ template <class T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
   return Vec128<T, N>{vec_rsqrte(v.raw)};
 }
+#endif  // HWY_S390X_HAVE_Z14
 
 // Full precision square root
 template <class T, size_t N, HWY_IF_FLOAT(T)>
@@ -1668,6 +1959,167 @@ HWY_API V AbsDiff(const V a, const V b) {
1668
1959
 
1669
1960
  #endif // HWY_PPC_HAVE_9
1670
1961
 
1962
+ // ------------------------------ Integer Div for PPC10
1963
+ #if HWY_PPC_HAVE_10
1964
+ #ifdef HWY_NATIVE_INT_DIV
1965
+ #undef HWY_NATIVE_INT_DIV
1966
+ #else
1967
+ #define HWY_NATIVE_INT_DIV
1968
+ #endif
1969
+
1970
+ template <size_t N>
1971
+ HWY_API Vec128<int32_t, N> operator/(Vec128<int32_t, N> a,
1972
+ Vec128<int32_t, N> b) {
1973
+ // Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid
1974
+ // undefined behavior if b[i] == 0 or
1975
+ // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
1976
+
1977
+ // Clang will also optimize out I32 vec_div on PPC10 if optimizations are
1978
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1979
+ // lanes of a partial vector)
1980
+ __vector signed int raw_result;
1981
+ __asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1982
+ return Vec128<int32_t, N>{raw_result};
1983
+ }
1984
+
1985
+ template <size_t N>
1986
+ HWY_API Vec128<uint32_t, N> operator/(Vec128<uint32_t, N> a,
1987
+ Vec128<uint32_t, N> b) {
1988
+ // Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid
1989
+ // undefined behavior if b[i] == 0
1990
+
1991
+ // Clang will also optimize out U32 vec_div on PPC10 if optimizations are
1992
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1993
+ // lanes of a partial vector)
1994
+ __vector unsigned int raw_result;
1995
+ __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1996
+ return Vec128<uint32_t, N>{raw_result};
1997
+ }
1998
+
1999
+ template <size_t N>
2000
+ HWY_API Vec128<int64_t, N> operator/(Vec128<int64_t, N> a,
2001
+ Vec128<int64_t, N> b) {
2002
+ // Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid
2003
+ // undefined behavior if b[i] == 0 or
2004
+ // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
2005
+
2006
+ // Clang will also optimize out I64 vec_div on PPC10 if optimizations are
2007
+ // enabled and any of the lanes of b are known to be zero (even in the unused
2008
+ // lanes of a partial vector)
2009
+ __vector signed long long raw_result;
2010
+ __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2011
+ return Vec128<int64_t, N>{raw_result};
2012
+ }
2013
+
2014
+ template <size_t N>
2015
+ HWY_API Vec128<uint64_t, N> operator/(Vec128<uint64_t, N> a,
2016
+ Vec128<uint64_t, N> b) {
2017
+ // Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid
2018
+ // undefined behavior if b[i] == 0
2019
+
2020
+ // Clang will also optimize out U64 vec_div on PPC10 if optimizations are
2021
+ // enabled and any of the lanes of b are known to be zero (even in the unused
2022
+ // lanes of a partial vector)
2023
+ __vector unsigned long long raw_result;
2024
+ __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2025
+ return Vec128<uint64_t, N>{raw_result};
2026
+ }
2027
+
2028
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
2029
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
2030
+ HWY_API Vec128<T> operator/(Vec128<T> a, Vec128<T> b) {
2031
+ const DFromV<decltype(a)> d;
2032
+ const RepartitionToWide<decltype(d)> dw;
2033
+ return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
2034
+ PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
2035
+ }
2036
+
2037
+ template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
2038
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
2039
+ HWY_IF_V_SIZE_LE(T, N, 8)>
2040
+ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
2041
+ const DFromV<decltype(a)> d;
2042
+ const Rebind<MakeWide<T>, decltype(d)> dw;
2043
+ return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
2044
+ }
2045
+
2046
+ template <size_t N>
2047
+ HWY_API Vec128<int32_t, N> operator%(Vec128<int32_t, N> a,
2048
+ Vec128<int32_t, N> b) {
2049
+ // Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid
2050
+ // undefined behavior if b[i] == 0 or
2051
+ // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
2052
+
2053
+ // Clang will also optimize out I32 vec_mod on PPC10 if optimizations are
2054
+ // enabled and any of the lanes of b are known to be zero (even in the unused
2055
+ // lanes of a partial vector)
2056
+ __vector signed int raw_result;
2057
+ __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2058
+ return Vec128<int32_t, N>{raw_result};
2059
+ }
2060
+
2061
+ template <size_t N>
2062
+ HWY_API Vec128<uint32_t, N> operator%(Vec128<uint32_t, N> a,
2063
+ Vec128<uint32_t, N> b) {
2064
+ // Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid
2065
+ // undefined behavior if b[i] == 0
2066
+
2067
+ // Clang will also optimize out U32 vec_mod on PPC10 if optimizations are
2068
+ // enabled and any of the lanes of b are known to be zero (even in the unused
2069
+ // lanes of a partial vector)
2070
+ __vector unsigned int raw_result;
2071
+ __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2072
+ return Vec128<uint32_t, N>{raw_result};
2073
+ }
2074
+
2075
+ template <size_t N>
2076
+ HWY_API Vec128<int64_t, N> operator%(Vec128<int64_t, N> a,
2077
+ Vec128<int64_t, N> b) {
2078
+ // Inline assembly is used instead of vec_mod for I64 Mod on PPC10 to avoid
2079
+ // undefined behavior if b[i] == 0 or
2080
+ // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
2081
+
2082
+ // Clang will also optimize out I64 vec_mod on PPC10 if optimizations are
2083
+ // enabled and any of the lanes of b are known to be zero (even in the unused
2084
+ // lanes of a partial vector)
2085
+ __vector signed long long raw_result;
2086
+ __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2087
+ return Vec128<int64_t, N>{raw_result};
2088
+ }
2089
+
2090
+ template <size_t N>
2091
+ HWY_API Vec128<uint64_t, N> operator%(Vec128<uint64_t, N> a,
2092
+ Vec128<uint64_t, N> b) {
2093
+ // Inline assembly is used instead of vec_mod for U64 Mod on PPC10 to avoid
2094
+ // undefined behavior if b[i] == 0
2095
+
2096
+ // Clang will also optimize out U64 vec_mod on PPC10 if optimizations are
2097
+ // enabled and any of the lanes of b are known to be zero (even in the unused
2098
+ // lanes of a partial vector)
2099
+ __vector unsigned long long raw_result;
2100
+ __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
2101
+ return Vec128<uint64_t, N>{raw_result};
2102
+ }
2103
+
2104
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
2105
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
2106
+ HWY_API Vec128<T> operator%(Vec128<T> a, Vec128<T> b) {
2107
+ const DFromV<decltype(a)> d;
2108
+ const RepartitionToWide<decltype(d)> dw;
2109
+ return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b),
2110
+ PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b));
2111
+ }
2112
+
2113
+ template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
2114
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
2115
+ HWY_IF_V_SIZE_LE(T, N, 8)>
2116
+ HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
2117
+ const DFromV<decltype(a)> d;
2118
+ const Rebind<MakeWide<T>, decltype(d)> dw;
2119
+ return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b));
2120
+ }
2121
+ #endif
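// [Editor's aside - hypothetical usage sketch, not from the packaged headers.]
// With HWY_NATIVE_INT_DIV defined, the overloads above make / and % work
// lane-wise on integer vectors; 8- and 16-bit lanes are widened, divided, and
// demoted back. Assuming the usual Highway namespace alias:
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<int32_t> d;
//   const auto q = hn::Load(d, a) / hn::Load(d, b);  // per-lane quotient
//   const auto r = hn::Load(d, a) % hn::Load(d, b);  // per-lane remainder
// [End of editor's aside.]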
+
  // ================================================== MEMORY (3)

  // ------------------------------ Non-temporal stores
@@ -1800,7 +2252,7 @@ template <typename T, size_t N>
  HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
  #if HWY_IS_LITTLE_ENDIAN
  typename detail::Raw128<T>::type raw_result = v.raw;
- raw_result[i] = t;
+ raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
  return Vec128<T, N>{raw_result};
  #else
  // On ppc64be without this, mul_test fails, but swizzle_test passes.
@@ -2070,7 +2522,7 @@ HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {

  // ------------------------------- ReverseLaneBytes

- #if HWY_PPC_HAVE_9 && \
+ #if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
  (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)

  // Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
@@ -2111,7 +2563,7 @@ HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  return BitCast(d, ReverseLaneBytes(BitCast(du64, v)));
  }

- #endif // HWY_PPC_HAVE_9
+ #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14

  template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
  HWY_API Vec16<T> Reverse(D d, Vec16<T> v) {
@@ -2268,11 +2720,15 @@ HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
  Set(Full128<uint32_t>(),
  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));

+ #if HWY_S390X_HAVE_Z14
+ return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
+ #else // VSX
  #if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
  #else
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
- #endif
+ #endif // HWY_IS_LITTLE_ENDIAN
+ #endif // HWY_S390X_HAVE_Z14
  }

  // ------------------------------ SlideDownLanes
@@ -2300,11 +2756,15 @@ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  Set(Full128<uint32_t>(),
  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));

+ #if HWY_S390X_HAVE_Z14
+ return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
+ #else // VSX
  #if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
  #else
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
- #endif
+ #endif // HWY_IS_LITTLE_ENDIAN
+ #endif // HWY_S390X_HAVE_Z14
  }

  // ================================================== COMBINE
@@ -2637,7 +3097,15 @@ HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {

  template <typename T, HWY_IF_T_SIZE(T, 4)>
  HWY_API Vec128<T> DupEven(Vec128<T> v) {
+ #if HWY_S390X_HAVE_Z14
+ const DFromV<decltype(v)> d;
+ const Repartition<uint8_t, decltype(d)> du8;
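// [Editor's note] The byte indices below copy lanes 0 and 2 into both lanes of
// each 64-bit half, i.e. {v0, v0, v2, v2}, emulating the vec_mergee(v, v) used
// in the #else branch with a byte shuffle.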
+ return TableLookupBytes(
+ v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
+ 11, 8, 9, 10, 11)));
+ #else
  return Vec128<T>{vec_mergee(v.raw, v.raw)};
+ #endif
  }

  // ------------------------------ DupOdd (InterleaveUpper)
@@ -2662,7 +3130,15 @@ HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {

  template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
+ #if HWY_S390X_HAVE_Z14
+ const DFromV<decltype(v)> d;
+ const Repartition<uint8_t, decltype(d)> du8;
+ return TableLookupBytes(
+ v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
+ 15, 12, 13, 14, 15)));
+ #else
  return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
+ #endif
  }

  template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
@@ -2706,6 +3182,96 @@ HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
  }

+ // ------------------------------ InterleaveEven
+
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+ const Full128<TFromD<D>> d_full;
+ const Indices128<TFromD<D>> idx{
+ Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
+ 10, 26, 12, 28, 14, 30)
+ .raw};
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+ ResizeBitCast(d_full, b), idx));
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+ const Full128<TFromD<D>> d_full;
+ const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
+ 16, 17, 4, 5, 20, 21, 8,
+ 9, 24, 25, 12, 13, 28, 29)
+ .raw};
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+ ResizeBitCast(d_full, b), idx));
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+ #if HWY_S390X_HAVE_Z14
+ const Full128<TFromD<D>> d_full;
+ const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
+ 2, 3, 16, 17, 18, 19, 8,
+ 9, 10, 11, 24, 25, 26, 27)
+ .raw};
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+ ResizeBitCast(d_full, b), idx));
+ #else
+ (void)d;
+ return VFromD<D>{vec_mergee(a.raw, b.raw)};
+ #endif
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return InterleaveLower(a, b);
+ }
+
+ // ------------------------------ InterleaveOdd
+
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+ const Full128<TFromD<D>> d_full;
+ const Indices128<TFromD<D>> idx{
+ Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
+ 11, 27, 13, 29, 15, 31)
+ .raw};
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+ ResizeBitCast(d_full, b), idx));
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+ const Full128<TFromD<D>> d_full;
+ const Indices128<TFromD<D>> idx{
+ Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
+ 11, 26, 27, 14, 15, 30, 31)
+ .raw};
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+ ResizeBitCast(d_full, b), idx));
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+ #if HWY_S390X_HAVE_Z14
+ const Full128<TFromD<D>> d_full;
+ const Indices128<TFromD<D>> idx{
+ Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
+ 13, 14, 15, 28, 29, 30, 31)
+ .raw};
+ return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+ ResizeBitCast(d_full, b), idx));
+ #else
+ (void)d;
+ return VFromD<D>{vec_mergeo(a.raw, b.raw)};
+ #endif
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+ return InterleaveUpper(d, a, b);
+ }
+
  // ------------------------------ OddEvenBlocks
  template <typename T, size_t N>
  HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
@@ -2719,26 +3285,64 @@ HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
  }

- // ------------------------------ Shl
+ // ------------------------------ MulFixedPoint15 (OddEven)

- namespace detail {
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
- Vec128<T, N> bits) {
- return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
- }
+ #if HWY_S390X_HAVE_Z14
+ HWY_API Vec16<int16_t> MulFixedPoint15(Vec16<int16_t> a, Vec16<int16_t> b) {
+ const DFromV<decltype(a)> di16;
+ const RepartitionToWide<decltype(di16)> di32;

- // Signed left shift is the same as unsigned.
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
- Vec128<T, N> bits) {
- const DFromV<decltype(v)> di;
- const RebindToUnsigned<decltype(di)> du;
- return BitCast(di,
- Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
- }
+ const auto round_up_incr = Set(di32, 0x4000);
+ const auto i32_product = MulEven(a, b) + round_up_incr;

- } // namespace detail
+ return ResizeBitCast(di16, ShiftLeft<1>(i32_product));
+ }
+ template <size_t N, HWY_IF_LANES_GT(N, 1)>
+ HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
+ Vec128<int16_t, N> b) {
+ const DFromV<decltype(a)> di16;
+ const RepartitionToWide<decltype(di16)> di32;
+
+ const auto round_up_incr = Set(di32, 0x4000);
+ const auto even_product = MulEven(a, b) + round_up_incr;
+ const auto odd_product = MulOdd(a, b) + round_up_incr;
+
+ return OddEven(BitCast(di16, ShiftRight<15>(odd_product)),
+ BitCast(di16, ShiftLeft<1>(even_product)));
+ }
+ #else
+ template <size_t N>
+ HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
+ Vec128<int16_t, N> b) {
+ const Vec128<int16_t> zero = Zero(Full128<int16_t>());
+ return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
+ }
+ #endif
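// [Editor's aside - illustrative sketch, not from the packaged headers.]
// Per lane, MulFixedPoint15 computes the rounded Q15 product
//   result = (a * b + 0x4000) >> 15,
// e.g. a = 0x4000 (0.5) and b = 0x2000 (0.25) yield 0x1000 (0.125). A scalar
// reference (note the vec_mradds path additionally saturates the
// a = b = -32768 case to 32767):
#include <cstdint>
static inline int16_t MulFixedPoint15Scalar(int16_t a, int16_t b) {
  const int32_t product = static_cast<int32_t>(a) * static_cast<int32_t>(b);
  return static_cast<int16_t>((product + 0x4000) >> 15);  // round, then narrow
}
// [End of editor's aside.]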
+
+ // ------------------------------ Shl
+
+ namespace detail {
+ template <typename T, size_t N>
+ HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
+ Vec128<T, N> bits) {
+ #if HWY_S390X_HAVE_Z14
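// [Editor's note] On Z14, .raw is a GNU vector-extension type, for which <<
// shifts each lane by the corresponding lane of bits, so no intrinsic is
// needed.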
+ return Vec128<T, N>{v.raw << bits.raw};
+ #else
+ return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
+ #endif
+ }
+
+ // Signed left shift is the same as unsigned.
+ template <typename T, size_t N>
+ HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
+ Vec128<T, N> bits) {
+ const DFromV<decltype(v)> di;
+ const RebindToUnsigned<decltype(di)> du;
+ return BitCast(di,
+ Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
+ }
+
+ } // namespace detail

  template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
  HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
@@ -2751,15 +3355,23 @@ namespace detail {
  template <typename T, size_t N>
  HWY_API Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
  Vec128<T, N> bits) {
+ #if HWY_S390X_HAVE_Z14
+ return Vec128<T, N>{v.raw >> bits.raw};
+ #else
  return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
+ #endif
  }

  template <typename T, size_t N>
  HWY_API Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v,
  Vec128<T, N> bits) {
+ #if HWY_S390X_HAVE_Z14
+ return Vec128<T, N>{v.raw >> bits.raw};
+ #else
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
+ #endif
  }

  } // namespace detail
@@ -2771,100 +3383,85 @@ HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {

  // ------------------------------ MulEven/Odd 64x64 (UpperHalf)

- HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
+ template <class T, HWY_IF_UI64(T)>
+ HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
  #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
- using VU64 = __vector unsigned long long;
- const VU64 mul128_result = reinterpret_cast<VU64>(vec_mule(a.raw, b.raw));
+ using V64 = typename detail::Raw128<T>::type;
+ const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
  #if HWY_IS_LITTLE_ENDIAN
- return Vec128<uint64_t>{mul128_result};
+ return Vec128<T>{mul128_result};
  #else
  // Need to swap the two halves of mul128_result on big-endian targets as
  // the upper 64 bits of the product are in lane 0 of mul128_result and
  // the lower 64 bits of the product are in lane 1 of mul128_result
- return Vec128<uint64_t>{vec_sld(mul128_result, mul128_result, 8)};
+ return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
  #endif
  #else
- alignas(16) uint64_t mul[2];
+ alignas(16) T mul[2];
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
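// [Editor's note] Mul128 returns the low 64 bits of the full 128-bit product
// and stores the high 64 bits in mul[1]; the Load below then places both
// halves in one vector.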
- return Load(Full128<uint64_t>(), mul);
+ return Load(Full128<T>(), mul);
  #endif
  }

- HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
+ template <class T, HWY_IF_UI64(T)>
+ HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
  #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
- using VU64 = __vector unsigned long long;
- const VU64 mul128_result = reinterpret_cast<VU64>(vec_mulo(a.raw, b.raw));
+ using V64 = typename detail::Raw128<T>::type;
+ const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
  #if HWY_IS_LITTLE_ENDIAN
- return Vec128<uint64_t>{mul128_result};
+ return Vec128<T>{mul128_result};
  #else
  // Need to swap the two halves of mul128_result on big-endian targets as
  // the upper 64 bits of the product are in lane 0 of mul128_result and
  // the lower 64 bits of the product are in lane 1 of mul128_result
- return Vec128<uint64_t>{vec_sld(mul128_result, mul128_result, 8)};
+ return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
  #endif
  #else
- alignas(16) uint64_t mul[2];
- const Full64<uint64_t> d2;
+ alignas(16) T mul[2];
+ const Full64<T> d2;
  mul[0] =
  Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
- return Load(Full128<uint64_t>(), mul);
+ return Load(Full128<T>(), mul);
  #endif
  }

+ // ------------------------------ PromoteEvenTo/PromoteOddTo
+ #include "hwy/ops/inside-inl.h"
+
  // ------------------------------ WidenMulPairwiseAdd

- template <class D32, HWY_IF_F32_D(D32),
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
- HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
- const RebindToUnsigned<decltype(df32)> du32;
- // Lane order within sum0/1 is undefined, hence we can avoid the
- // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
- // leads to the odd/even order that RearrangeToOddPlusEven prefers.
- using VU32 = VFromD<decltype(du32)>;
- const VU32 odd = Set(du32, 0xFFFF0000u);
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
- const VU32 ao = And(BitCast(du32, a), odd);
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
- const VU32 bo = And(BitCast(du32, b), odd);
- return MulAdd(BitCast(df32, ae), BitCast(df32, be),
- Mul(BitCast(df32, ao), BitCast(df32, bo)));
+ template <class DF, HWY_IF_F32_D(DF),
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
  }
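// [Editor's note] Lane-wise this computes out[i] = a[2i] * b[2i] +
// a[2i+1] * b[2i+1] after widening each bfloat16 lane to float; the
// PromoteEvenTo/PromoteOddTo pair replaces the earlier shift/mask bit
// manipulation of the u32 representation.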

  // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
  template <class D32, HWY_IF_UI32_D(D32),
  class V16 = VFromD<RepartitionToNarrow<D32>>>
  HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
+ #if HWY_S390X_HAVE_Z14
+ (void)d32;
+ return MulEven(a, b) + MulOdd(a, b);
+ #else
  return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(d32).raw)};
+ #endif
  }

  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

- template <class D32, HWY_IF_F32_D(D32),
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
- VFromD<D32> sum0,
- VFromD<D32>& sum1) {
- const RebindToUnsigned<decltype(df32)> du32;
- // Lane order within sum0/1 is undefined, hence we can avoid the
- // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
- // leads to the odd/even order that RearrangeToOddPlusEven prefers.
- using VU32 = VFromD<decltype(du32)>;
- const VU32 odd = Set(du32, 0xFFFF0000u);
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
- const VU32 ao = And(BitCast(du32, a), odd);
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
- const VU32 bo = And(BitCast(du32, b), odd);
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
- }
-
  // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
  template <class D32, HWY_IF_UI32_D(D32),
  class V16 = VFromD<RepartitionToNarrow<D32>>>
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /* tag */, V16 a, V16 b,
+ HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*d32*/, V16 a, V16 b,
  VFromD<D32> sum0,
  VFromD<D32>& /*sum1*/) {
+ #if HWY_S390X_HAVE_Z14
+ return MulEven(a, b) + MulOdd(a, b) + sum0;
+ #else
  return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
+ #endif
  }

  // ------------------------------ RearrangeToOddPlusEven
@@ -2885,7 +3482,27 @@ HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  return Add(sum0, sum1);
  }

+ // ------------------------------ SatWidenMulPairwiseAccumulate
+ #if !HWY_S390X_HAVE_Z14
+
+ #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
+ #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
+ #else
+ #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
+ #endif
+
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
+ HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
+ DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
+ VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
+ return VFromD<DI32>{vec_msums(a.raw, b.raw, sum.raw)};
+ }
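// [Editor's note] vec_msums maps to vmsumshs: each i32 lane becomes
// a[2i] * b[2i] + a[2i+1] * b[2i+1] + sum[i] with signed 32-bit saturation,
// which is exactly the saturating widening pairwise multiply-accumulate.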
+
+ #endif // !HWY_S390X_HAVE_Z14
+
  // ------------------------------ SumOfMulQuadAccumulate
+ #if !HWY_S390X_HAVE_Z14
+
  #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
  #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
  #else
@@ -2925,11 +3542,12 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,

  const auto result_sum_0 =
  SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum);
- const auto result_sum_1 = ShiftLeft<8>(detail::AltivecVsum4sbs(
- di32, And(b, BroadcastSignBit(a)).raw, Zero(di32).raw));
+ const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a))));
  return result_sum_0 - result_sum_1;
  }

+ #endif // !HWY_S390X_HAVE_Z14
+
  // ================================================== CONVERT

  // ------------------------------ Promotions (part w/ narrow lanes -> full)
@@ -3018,29 +3636,59 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  }

  template <class D, HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
+ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<int32_t, D>> v) {
+ #if HWY_S390X_HAVE_Z14
+ const RebindToSigned<decltype(df64)> di64;
+ return ConvertTo(df64, PromoteTo(di64, v));
+ #else // VSX
+ (void)df64;
  const __vector signed int raw_v = InterleaveLower(v, v).raw;
  #if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
  #else
  return VFromD<D>{vec_doublee(raw_v)};
  #endif
+ #endif // HWY_S390X_HAVE_Z14
  }

  template <class D, HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
+ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
+ #if HWY_S390X_HAVE_Z14
+ const RebindToUnsigned<decltype(df64)> du64;
+ return ConvertTo(df64, PromoteTo(du64, v));
+ #else // VSX
+ (void)df64;
  const __vector unsigned int raw_v = InterleaveLower(v, v).raw;
  #if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
  #else
  return VFromD<D>{vec_doublee(raw_v)};
  #endif
+ #endif // HWY_S390X_HAVE_Z14
  }

+ #if !HWY_S390X_HAVE_Z14
+ namespace detail {
+
+ template <class V>
+ static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
+ #if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)
+ // Workaround for QEMU 7/8 VSX float to int conversion bug
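// [Editor's note] (v == v) is false only in NaN lanes, so IfThenElseZero
// replaces NaN inputs with zero before conversion; finite lanes pass through.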
+ return IfThenElseZero(v == v, v);
+ #else
+ return v;
+ #endif
+ }
+
+ } // namespace detail
+ #endif // !HWY_S390X_HAVE_Z14
+
  template <class D, HWY_IF_I64_D(D)>
  HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
- #if HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)
- const __vector float raw_v = InterleaveLower(v, v).raw;
+ #if !HWY_S390X_HAVE_Z14 && \
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
+ const __vector float raw_v =
+ detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
  return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
  #else
  const RebindToFloat<decltype(di64)> df64;
@@ -3050,8 +3698,10 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {

  template <class D, HWY_IF_U64_D(D)>
  HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
- #if HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)
- const __vector float raw_v = InterleaveLower(v, v).raw;
+ #if !HWY_S390X_HAVE_Z14 && \
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
+ const __vector float raw_v =
+ detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
  return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
  __builtin_vsx_xvcvspuxds(raw_v))};
  #else
@@ -3123,7 +3773,12 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<float> v) {
  }

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<int32_t> v) {
+ HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<int32_t> v) {
+ #if HWY_S390X_HAVE_Z14
+ const RebindToSigned<decltype(df64)> di64;
+ return ConvertTo(df64, PromoteUpperTo(di64, v));
+ #else // VSX
+ (void)df64;
  const __vector signed int raw_v =
  InterleaveUpper(Full128<int32_t>(), v, v).raw;
  #if HWY_IS_LITTLE_ENDIAN
@@ -3131,10 +3786,16 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<int32_t> v) {
  #else
  return VFromD<D>{vec_doublee(raw_v)};
  #endif
+ #endif // HWY_S390X_HAVE_Z14
  }

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<uint32_t> v) {
+ HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
+ #if HWY_S390X_HAVE_Z14
+ const RebindToUnsigned<decltype(df64)> du64;
+ return ConvertTo(df64, PromoteUpperTo(du64, v));
+ #else // VSX
+ (void)df64;
  const __vector unsigned int raw_v =
  InterleaveUpper(Full128<uint32_t>(), v, v).raw;
  #if HWY_IS_LITTLE_ENDIAN
@@ -3142,12 +3803,16 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<uint32_t> v) {
  #else
  return VFromD<D>{vec_doublee(raw_v)};
  #endif
+ #endif // HWY_S390X_HAVE_Z14
  }

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
  HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
- #if HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)
- const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
+ #if !HWY_S390X_HAVE_Z14 && \
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
+ const __vector float raw_v =
+ detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
+ .raw;
  return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
  #else
  const RebindToFloat<decltype(di64)> df64;
@@ -3157,8 +3822,11 @@ HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
  HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
- #if HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)
- const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
+ #if !HWY_S390X_HAVE_Z14 && \
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
+ const __vector float raw_v =
+ detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
+ .raw;
  return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
  __builtin_vsx_xvcvspuxds(raw_v))};
  #else
@@ -3174,6 +3842,219 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  return PromoteTo(d, UpperHalf(dh, v));
  }

+ // ------------------------------ PromoteEvenTo/PromoteOddTo
+
+ namespace detail {
+
+ // Signed to Signed PromoteEvenTo/PromoteOddTo for PPC9/PPC10
+ #if HWY_PPC_HAVE_9 && \
+ (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)
+
+ #if HWY_IS_LITTLE_ENDIAN
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
+ hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
+ V v) {
+ return VFromD<D>{vec_signexti(v.raw)};
+ }
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
+ V v) {
+ return VFromD<D>{vec_signextll(v.raw)};
+ }
+ #else
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
+ hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
+ V v) {
+ return VFromD<D>{vec_signexti(v.raw)};
+ }
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
+ V v) {
+ return VFromD<D>{vec_signextll(v.raw)};
+ }
+ #endif // HWY_IS_LITTLE_ENDIAN
+
+ #endif // HWY_PPC_HAVE_9
+
+ // I32/U32/F32->F64 PromoteEvenTo
+ #if HWY_S390X_HAVE_Z14
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::FloatTag /*from_type_tag*/, D /*d_to*/,
+ V v) {
+ return VFromD<D>{vec_doublee(v.raw)};
+ }
+ template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
+ const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
+ return ConvertTo(d_to, PromoteEvenTo(dw, v));
+ }
+ #else // VSX
+ template <class D, class V, class FromTypeTag>
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ FromTypeTag /*from_type_tag*/, D /*d_to*/,
+ V v) {
+ return VFromD<D>{vec_doublee(v.raw)};
+ }
+ #endif // HWY_S390X_HAVE_Z14
+
+ // F32->I64 PromoteEvenTo
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::FloatTag /*from_type_tag*/, D d_to,
+ V v) {
+ #if !HWY_S390X_HAVE_Z14 && \
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
+ (void)d_to;
+ const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
+ #if HWY_IS_LITTLE_ENDIAN
+ // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
+ // on little-endian PPC, and the vec_sld operation below will shift the even
+ // lanes of normalized_v into the odd lanes.
+ return VFromD<D>{
+ __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
+ #else
+ // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
+ // on big-endian PPC.
+ return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
+ #endif
+ #else
+ const RebindToFloat<decltype(d_to)> df64;
+ return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
+ hwy::FloatTag(), df64, v));
+ #endif
+ }
+
+ // F32->U64 PromoteEvenTo
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::FloatTag /*from_type_tag*/, D d_to,
+ V v) {
+ #if !HWY_S390X_HAVE_Z14 && \
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
+ (void)d_to;
+ const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
+ #if HWY_IS_LITTLE_ENDIAN
+ // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
+ // on little-endian PPC, and the vec_sld operation below will shift the even
+ // lanes of normalized_v into the odd lanes.
+ return VFromD<D>{
+ reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
+ vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
+ #else
+ // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
+ // on big-endian PPC.
+ return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
+ __builtin_vsx_xvcvspuxds(normalized_v.raw))};
+ #endif
+ #else
+ const RebindToFloat<decltype(d_to)> df64;
+ return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
+ hwy::FloatTag(), df64, v));
+ #endif
+ }
+
+ // I32/U32/F32->F64 PromoteOddTo
+ #if HWY_S390X_HAVE_Z14
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::FloatTag /*from_type_tag*/, D d_to,
+ V v) {
+ return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(),
+ d_to, V{vec_sld(v.raw, v.raw, 4)});
+ }
+ template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
+ const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
+ return ConvertTo(d_to, PromoteOddTo(dw, v));
+ }
+ #else
+ template <class D, class V, class FromTypeTag>
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ FromTypeTag /*from_type_tag*/, D /*d_to*/,
+ V v) {
+ return VFromD<D>{vec_doubleo(v.raw)};
+ }
+ #endif
+
+ // F32->I64 PromoteOddTo
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::FloatTag /*from_type_tag*/, D d_to,
+ V v) {
+ #if !HWY_S390X_HAVE_Z14 && \
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
+ (void)d_to;
+ const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
+ #if HWY_IS_LITTLE_ENDIAN
+ // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
+ // on little-endian PPC
+ return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
+ #else
+ // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
+ // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
+ // of normalized_v into the even lanes.
+ return VFromD<D>{
+ __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
+ #endif
+ #else
+ const RebindToFloat<decltype(d_to)> df64;
+ return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
+ hwy::FloatTag(), df64, v));
+ #endif
+ }
+
+ // F32->U64 PromoteOddTo
+ template <class D, class V>
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::FloatTag /*from_type_tag*/, D d_to,
+ V v) {
+ #if !HWY_S390X_HAVE_Z14 && \
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
+ (void)d_to;
+ const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
+ #if HWY_IS_LITTLE_ENDIAN
+ // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
+ // on little-endian PPC
+ return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
+ __builtin_vsx_xvcvspuxds(normalized_v.raw))};
+ #else
+ // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
+ // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
+ // of normalized_v into the even lanes.
+ return VFromD<D>{
+ reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
+ vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
+ #endif
+ #else
+ const RebindToFloat<decltype(d_to)> df64;
+ return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
+ hwy::FloatTag(), df64, v));
+ #endif
+ }
+
+ } // namespace detail
+
  // ------------------------------ Demotions (full -> part w/ narrow lanes)

  template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
@@ -3254,29 +4135,138 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {

  #endif // HWY_PPC_HAVE_9

- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
- HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
- const Rebind<uint16_t, decltype(dbf16)> du16;
- const auto bits_in_32 = ShiftRight<16>(BitCast(du32, v));
- return BitCast(dbf16, TruncateTo(du16, bits_in_32));
+ #if HWY_PPC_HAVE_9
+
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
+ #else
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
+ #endif
+
+ namespace detail {
+
+ // On big-endian PPC9, VsxXscvdphp converts vf64[0] to a F16, returned as an U64
+ // vector with the resulting F16 bits in the lower 16 bits of U64 lane 0
+
+ // On little-endian PPC9, VsxXscvdphp converts vf64[1] to a F16, returned as
+ // an U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1
+ static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
+ // Inline assembly is needed for the PPC9 xscvdphp instruction as there is
+ // currently no intrinsic available for the PPC9 xscvdphp instruction
+ __vector unsigned long long raw_result;
+ __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
+ return Vec128<uint64_t>{raw_result};
  }

- template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
- const RebindToUnsigned<decltype(dbf16)> du16;
- const Repartition<uint32_t, decltype(dbf16)> du32;
+ } // namespace detail
+
+ template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+ const RebindToUnsigned<decltype(df16)> du16;
+ const Rebind<uint64_t, decltype(df16)> du64;
+
+ const Full128<double> df64_full;
  #if HWY_IS_LITTLE_ENDIAN
- const auto a_in_odd = a;
- const auto b_in_even = ShiftRight<16>(BitCast(du32, b));
+ const auto bits16_as_u64 =
+ UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
  #else
- const auto a_in_odd = ShiftRight<16>(BitCast(du32, a));
- const auto b_in_even = b;
+ const auto bits16_as_u64 =
+ LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
  #endif
- return BitCast(dbf16,
- OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even)));
+
+ return BitCast(df16, TruncateTo(du16, bits16_as_u64));
+ }
+
+ template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+ const RebindToUnsigned<decltype(df16)> du16;
+ const Rebind<uint64_t, decltype(df16)> du64;
+ const Rebind<double, decltype(df16)> df64;
+
+ #if HWY_IS_LITTLE_ENDIAN
+ const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
+ const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
+ const auto bits64_as_u64 =
+ InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
+ #else
+ const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
+ const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
+ const auto bits64_as_u64 =
+ InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
+ #endif
+
+ return BitCast(df16, TruncateTo(du16, bits64_as_u64));
+ }
+
+ #elif HWY_S390X_HAVE_Z14
+
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
+ #else
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
+ #endif
+
+ namespace detail {
+
+ template <class DF32, HWY_IF_F32_D(DF32)>
+ static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
+ DF32 df32, VFromD<Rebind<double, DF32>> v) {
+ const Twice<DF32> dt_f32;
+
+ __vector float raw_f32_in_even;
+ __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
+
+ const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
+ return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
+ }
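// [Editor's note] The final "3" in vledb selects round-to-odd. The F64->F16
// demotion below rounds twice (F64 -> F32 -> F16); rounding the first step to
// odd keeps the intermediate F32 off the F16 rounding boundaries, which
// prevents double-rounding errors.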
+
+ } // namespace detail
+
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+ const Rebind<float, decltype(df16)> df32;
+ return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
  }

+ #endif // HWY_PPC_HAVE_9
+
+ #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
+
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
+ #else
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
+ #endif
+
+ namespace detail {
+
+ // VsxXvcvspbf16 converts a F32 vector to a BF16 vector, bitcasted to an U32
+ // vector with the resulting BF16 bits in the lower 16 bits of each U32 lane
+ template <class D, HWY_IF_BF16_D(D)>
+ static HWY_INLINE VFromD<Rebind<uint32_t, D>> VsxXvcvspbf16(
+ D dbf16, VFromD<Rebind<float, D>> v) {
+ const Rebind<uint32_t, decltype(dbf16)> du32;
+ const Repartition<uint8_t, decltype(du32)> du32_as_du8;
+
+ using VU32 = __vector unsigned int;
+
+ // Even though the __builtin_vsx_xvcvspbf16 builtin performs a F32 to BF16
+ // conversion, the __builtin_vsx_xvcvspbf16 intrinsic expects a
+ // __vector unsigned char argument (at least as of GCC 13 and Clang 17)
+ return VFromD<Rebind<uint32_t, D>>{reinterpret_cast<VU32>(
+ __builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
+ }
+
+ } // namespace detail
+
+ template <class D, HWY_IF_BF16_D(D)>
+ HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
+ const RebindToUnsigned<decltype(dbf16)> du16;
+ return BitCast(dbf16, TruncateTo(du16, detail::VsxXvcvspbf16(dbf16, v)));
+ }
+
+ #endif // HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
+
  // Specializations for partial vectors because vec_packs sets lanes above 2*N.
  template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
  HWY_IF_SIGNED_V(V),
@@ -3368,6 +4358,18 @@ HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
  return VFromD<DN>{vec_packs(a.raw, b.raw)};
  }

+ #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
+ template <class D, class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>),
+ HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)>
+ HWY_API VFromD<D> ReorderDemote2To(D dbf16, V a, V b) {
+ const RebindToUnsigned<decltype(dbf16)> du16;
+ const Half<decltype(dbf16)> dh_bf16;
+ return BitCast(dbf16,
+ OrderedTruncate2To(du16, detail::VsxXvcvspbf16(dh_bf16, a),
+ detail::VsxXvcvspbf16(dh_bf16, b)));
+ }
+ #endif
+
  template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
  HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
  HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
@@ -3376,15 +4378,13 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
  }

- template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
- HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
- const RebindToUnsigned<decltype(dbf16)> du16;
- #if HWY_IS_LITTLE_ENDIAN
- return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
- #else
- return BitCast(dbf16, ConcatEven(du16, BitCast(du16, b), BitCast(du16, a)));
- #endif
+ #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
+ template <class D, HWY_IF_BF16_D(D), class V, HWY_IF_F32(TFromV<V>),
+ HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
+ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
+ return ReorderDemote2To(d, a, b);
  }
+ #endif

  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
  HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
@@ -3393,90 +4393,164 @@ HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {

  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
  HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
- #if HWY_IS_LITTLE_ENDIAN
+ #if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
  const Vec128<float> f64_to_f32{vec_floate(v.raw)};
  #else
  const Vec128<float> f64_to_f32{vec_floato(v.raw)};
  #endif

+ #if HWY_S390X_HAVE_Z14
+ const Twice<decltype(d)> dt;
+ return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32));
+ #else
  const RebindToUnsigned<D> du;
  const Rebind<uint64_t, D> du64;
  return Vec64<float>{
  BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw};
+ #endif
  }

  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
- HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<double> v) {
- return Vec32<int32_t>{vec_signede(v.raw)};
+ HWY_API Vec32<int32_t> DemoteTo(D di32, Vec64<double> v) {
+ #if HWY_S390X_HAVE_Z14
+ const Rebind<int64_t, decltype(di32)> di64;
+ return DemoteTo(di32, ConvertTo(di64, v));
+ #else
+ (void)di32;
+ return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
+ #endif
  }

  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
- HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<double> v) {
+ HWY_API Vec64<int32_t> DemoteTo(D di32, Vec128<double> v) {
+ #if HWY_S390X_HAVE_Z14
+ const Rebind<int64_t, decltype(di32)> di64;
+ return DemoteTo(di32, ConvertTo(di64, v));
+ #else
+ (void)di32;
+
  #if HWY_IS_LITTLE_ENDIAN
- const Vec128<int32_t> f64_to_i32{vec_signede(v.raw)};
+ const Vec128<int32_t> f64_to_i32{
+ vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
  #else
- const Vec128<int32_t> f64_to_i32{vec_signedo(v.raw)};
+ const Vec128<int32_t> f64_to_i32{
+ vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)};
  #endif

  const Rebind<int64_t, D> di64;
  const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32);
  return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)};
+ #endif
  }

  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
- HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<double> v) {
- return Vec32<uint32_t>{vec_unsignede(v.raw)};
+ HWY_API Vec32<uint32_t> DemoteTo(D du32, Vec64<double> v) {
+ #if HWY_S390X_HAVE_Z14
+ const Rebind<uint64_t, decltype(du32)> du64;
+ return DemoteTo(du32, ConvertTo(du64, v));
+ #else
+ (void)du32;
+ return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
+ #endif
  }

  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
- HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<double> v) {
+ HWY_API Vec64<uint32_t> DemoteTo(D du32, Vec128<double> v) {
+ #if HWY_S390X_HAVE_Z14
+ const Rebind<uint64_t, decltype(du32)> du64;
+ return DemoteTo(du32, ConvertTo(du64, v));
+ #else
+ (void)du32;
  #if HWY_IS_LITTLE_ENDIAN
- const Vec128<uint32_t> f64_to_u32{vec_unsignede(v.raw)};
+ const Vec128<uint32_t> f64_to_u32{
+ vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
  #else
- const Vec128<uint32_t> f64_to_u32{vec_unsignedo(v.raw)};
+ const Vec128<uint32_t> f64_to_u32{
+ vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)};
  #endif

  const Rebind<uint64_t, D> du64;
  const Vec128<uint64_t> vu64 = BitCast(du64, f64_to_u32);
  return Vec64<uint32_t>{vec_pack(vu64.raw, vu64.raw)};
+ #endif
+ }
+
+ #if HWY_S390X_HAVE_Z14
+ namespace detail {
+
+ template <class V, HWY_IF_I64(TFromV<V>)>
+ HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
+ __vector double raw_result;
+ // Use inline assembly to do a round-to-odd I64->F64 conversion on Z14
+ __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
+ return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
+ }
+
+ template <class V, HWY_IF_U64(TFromV<V>)>
+ HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
+ __vector double raw_result;
+ // Use inline assembly to do a round-to-odd U64->F64 conversion on Z14
+ __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
+ return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
  }

+ } // namespace detail
+ #endif // HWY_S390X_HAVE_Z14
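// [Editor's note] As with vledb above, modifier 3 selects round-to-odd, so the
// I64/U64 -> F32 demotions below round correctly: the intermediate F64 cannot
// land exactly on an F32 rounding boundary unless the input converts exactly.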
4499
+
3444
4500
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
3445
- HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<int64_t> v) {
4501
+ HWY_API Vec32<float> DemoteTo(D df32, Vec64<int64_t> v) {
4502
+ #if HWY_S390X_HAVE_Z14
4503
+ return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
4504
+ #else // VSX
4505
+ (void)df32;
3446
4506
  return Vec32<float>{vec_floate(v.raw)};
4507
+ #endif
3447
4508
  }
3448
4509
 
3449
4510
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
3450
- HWY_API Vec64<float> DemoteTo(D d, Vec128<int64_t> v) {
4511
+ HWY_API Vec64<float> DemoteTo(D df32, Vec128<int64_t> v) {
4512
+ #if HWY_S390X_HAVE_Z14
4513
+ return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
4514
+ #else // VSX
3451
4515
  #if HWY_IS_LITTLE_ENDIAN
3452
4516
  const Vec128<float> i64_to_f32{vec_floate(v.raw)};
3453
4517
  #else
3454
4518
  const Vec128<float> i64_to_f32{vec_floato(v.raw)};
3455
4519
  #endif
3456
4520
 
3457
- const RebindToUnsigned<D> du;
3458
- const Rebind<uint64_t, D> du64;
4521
+ const RebindToUnsigned<decltype(df32)> du32;
4522
+ const Rebind<uint64_t, decltype(df32)> du64;
3459
4523
  return Vec64<float>{
3460
- BitCast(d, TruncateTo(du, BitCast(du64, i64_to_f32))).raw};
4524
+ BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw};
4525
+ #endif
3461
4526
  }
3462
4527
 
3463
4528
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
3464
- HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<uint64_t> v) {
4529
+ HWY_API Vec32<float> DemoteTo(D df32, Vec64<uint64_t> v) {
4530
+ #if HWY_S390X_HAVE_Z14
4531
+ return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
4532
+ #else // VSX
4533
+ (void)df32;
3465
4534
  return Vec32<float>{vec_floate(v.raw)};
4535
+ #endif
3466
4536
  }
3467
4537
 
3468
4538
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
3469
- HWY_API Vec64<float> DemoteTo(D d, Vec128<uint64_t> v) {
4539
+ HWY_API Vec64<float> DemoteTo(D df32, Vec128<uint64_t> v) {
4540
+ #if HWY_S390X_HAVE_Z14
4541
+ return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
4542
+ #else // VSX
3470
4543
  #if HWY_IS_LITTLE_ENDIAN
3471
4544
  const Vec128<float> u64_to_f32{vec_floate(v.raw)};
3472
4545
  #else
3473
4546
  const Vec128<float> u64_to_f32{vec_floato(v.raw)};
3474
4547
  #endif
3475
4548
 
3476
- const RebindToUnsigned<D> du;
3477
- const Rebind<uint64_t, D> du64;
4549
+ const RebindToUnsigned<decltype(df32)> du;
4550
+ const Rebind<uint64_t, decltype(df32)> du64;
3478
4551
  return Vec64<float>{
3479
- BitCast(d, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
4552
+ BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
4553
+ #endif
3480
4554
  }
3481
4555
 
3482
4556
  // For already range-limited input [0, 255].
@@ -3491,17 +4565,39 @@ HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
3491
4565
  // Note: altivec.h vec_ct* currently contain C casts which trigger
3492
4566
  // -Wdeprecate-lax-vec-conv-all warnings, so disable them.
3493
4567
 
3494
- template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_NOT_FLOAT(FromT),
3495
- HWY_IF_T_SIZE_D(D, sizeof(FromT))>
4568
+ #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4569
+ template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
4570
+ HWY_IF_V_SIZE_LE_D(D, 8)>
4571
+ HWY_API VFromD<D> ConvertTo(D df32,
4572
+ Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4573
+ const Rebind<double, decltype(df32)> df64;
4574
+ return DemoteTo(df32, PromoteTo(df64, v));
4575
+ }
4576
+ template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
4577
+ HWY_IF_V_SIZE_D(D, 16)>
4578
+ HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
4579
+ const RepartitionToWide<decltype(df32)> df64;
4580
+
4581
+ const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
4582
+ const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
4583
+ return ConcatEven(df32, vf32_hi, vf32_lo);
4584
+ }
4585
+ #else // Z15 or VSX
4586
+ template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
3496
4587
  HWY_API VFromD<D> ConvertTo(D /* tag */,
3497
4588
  Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
3498
4589
  HWY_DIAGNOSTICS(push)
3499
4590
  #if HWY_COMPILER_CLANG
3500
4591
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
3501
4592
  #endif
4593
+ #if HWY_S390X_HAVE_Z15
4594
+ return VFromD<D>{vec_float(v.raw)};
4595
+ #else
3502
4596
  return VFromD<D>{vec_ctf(v.raw, 0)};
4597
+ #endif
3503
4598
  HWY_DIAGNOSTICS(pop)
3504
4599
  }
4600
+ #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
3505
4601
 
3506
4602
  template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
3507
4603
  HWY_IF_T_SIZE_D(D, sizeof(FromT))>
@@ -3511,38 +4607,195 @@ HWY_API VFromD<D> ConvertTo(D /* tag */,
3511
4607
  }
3512
4608
 
3513
4609
  // Truncates (rounds toward zero).
3514
- template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_FLOAT(FromT),
3515
- HWY_IF_T_SIZE_D(D, sizeof(FromT))>
4610
+ #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4611
+ template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
4612
+ HWY_API VFromD<D> ConvertTo(D di32,
4613
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4614
+ const Rebind<int64_t, decltype(di32)> di64;
4615
+ return DemoteTo(di32, PromoteTo(di64, v));
4616
+ }
4617
+ template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
4618
+ HWY_API VFromD<D> ConvertTo(D di32,
4619
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4620
+ const RepartitionToWide<decltype(di32)> di64;
4621
+ return OrderedDemote2To(di32, PromoteLowerTo(di64, v),
4622
+ PromoteUpperTo(di64, v));
4623
+ }
4624
+ #else // Z15 or VSX
4625
+ template <class D, HWY_IF_I32_D(D)>
3516
4626
  HWY_API VFromD<D> ConvertTo(D /* tag */,
3517
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4627
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4628
+ #if defined(__OPTIMIZE__)
4629
+ if (detail::IsConstantRawAltivecVect(v.raw)) {
4630
+ constexpr int32_t kMinI32 = LimitsMin<int32_t>();
4631
+ constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
4632
+ return Dup128VecFromValues(
4633
+ D(),
4634
+ (v.raw[0] >= -2147483648.0f)
4635
+ ? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
4636
+ : kMaxI32)
4637
+ : ((v.raw[0] < 0) ? kMinI32 : 0),
4638
+ (v.raw[1] >= -2147483648.0f)
4639
+ ? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
4640
+ : kMaxI32)
4641
+ : ((v.raw[1] < 0) ? kMinI32 : 0),
4642
+ (v.raw[2] >= -2147483648.0f)
4643
+ ? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
4644
+ : kMaxI32)
4645
+ : ((v.raw[2] < 0) ? kMinI32 : 0),
4646
+ (v.raw[3] >= -2147483648.0f)
4647
+ ? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
4648
+ : kMaxI32)
4649
+ : ((v.raw[3] < 0) ? kMinI32 : 0));
4650
+ }
4651
+ #endif
4652
+
4653
+ #if HWY_S390X_HAVE_Z15
4654
+ // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
4655
+ // the range of an int32_t
4656
+ __vector signed int raw_result;
4657
+ __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4658
+ return VFromD<D>{raw_result};
4659
+ #else
3518
4660
  HWY_DIAGNOSTICS(push)
3519
4661
  #if HWY_COMPILER_CLANG
3520
4662
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
3521
4663
  #endif
3522
4664
  return VFromD<D>{vec_cts(v.raw, 0)};
3523
4665
  HWY_DIAGNOSTICS(pop)
4666
+ #endif // HWY_S390X_HAVE_Z15
3524
4667
  }
4668
+ #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
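Both branches above implement the same observable contract for F32->I32: truncate toward zero and saturate instead of invoking undefined behavior on out-of-range inputs. A hedged usage sketch (the hn alias and sample values are assumptions for illustration):

    namespace hn = hwy::HWY_NAMESPACE;

    const hn::Full128<float> df;
    const hn::RebindToSigned<decltype(df)> di;
    const auto v = hn::Dup128VecFromValues(df, 1.9f, -1.9f, 3e9f, -3e9f);
    const auto i = hn::ConvertTo(di, v);  // {1, -1, INT32_MAX, INT32_MIN}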
3525
4669
 
3526
- template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_FLOAT(FromT),
3527
- HWY_IF_T_SIZE_D(D, sizeof(FromT))>
4670
+ template <class D, HWY_IF_I64_D(D)>
3528
4671
  HWY_API VFromD<D> ConvertTo(D /* tag */,
3529
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4672
+ Vec128<double, Rebind<double, D>().MaxLanes()> v) {
4673
+ #if defined(__OPTIMIZE__)
4674
+ if (detail::IsConstantRawAltivecVect(v.raw)) {
4675
+ constexpr int64_t kMinI64 = LimitsMin<int64_t>();
4676
+ constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
4677
+ return Dup128VecFromValues(D(),
4678
+ (v.raw[0] >= -9223372036854775808.0)
4679
+ ? ((v.raw[0] < 9223372036854775808.0)
4680
+ ? static_cast<int64_t>(v.raw[0])
4681
+ : kMaxI64)
4682
+ : ((v.raw[0] < 0) ? kMinI64 : 0LL),
4683
+ (v.raw[1] >= -9223372036854775808.0)
4684
+ ? ((v.raw[1] < 9223372036854775808.0)
4685
+ ? static_cast<int64_t>(v.raw[1])
4686
+ : kMaxI64)
4687
+ : ((v.raw[1] < 0) ? kMinI64 : 0LL));
4688
+ }
4689
+ #endif
4690
+
4691
+ // Use inline assembly to avoid undefined behavior if v[i] is not within the
4692
+ // range of an int64_t
4693
+ __vector signed long long raw_result;
4694
+ #if HWY_S390X_HAVE_Z14
4695
+ __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4696
+ #else
4697
+ __asm__("xvcvdpsxds %x0,%x1"
4698
+ : "=wa"(raw_result)
4699
+ : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
4700
+ #endif
4701
+ return VFromD<D>{raw_result};
4702
+ }
4703
+
4704
+ #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4705
+ template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
4706
+ HWY_API VFromD<D> ConvertTo(D du32,
4707
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4708
+ const Rebind<uint64_t, decltype(du32)> du64;
4709
+ return DemoteTo(du32, PromoteTo(du64, v));
4710
+ }
4711
+ template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
4712
+ HWY_API VFromD<D> ConvertTo(D du32,
4713
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4714
+ const RepartitionToWide<decltype(du32)> du64;
4715
+ return OrderedDemote2To(du32, PromoteLowerTo(du64, v),
4716
+ PromoteUpperTo(du64, v));
4717
+ }
4718
+ #else // Z15 or VSX
4719
+ template <class D, HWY_IF_U32_D(D)>
4720
+ HWY_API VFromD<D> ConvertTo(D /* tag */,
4721
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4722
+ #if defined(__OPTIMIZE__)
4723
+ if (detail::IsConstantRawAltivecVect(v.raw)) {
4724
+ constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
4725
+ return Dup128VecFromValues(
4726
+ D(),
4727
+ (v.raw[0] >= 0.0f)
4728
+ ? ((v.raw[0] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[0])
4729
+ : kMaxU32)
4730
+ : 0,
4731
+ (v.raw[1] >= 0.0f)
4732
+ ? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
4733
+ : kMaxU32)
4734
+ : 0,
4735
+ (v.raw[2] >= 0.0f)
4736
+ ? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
4737
+ : kMaxU32)
4738
+ : 0,
4739
+ (v.raw[3] >= 0.0f)
4740
+ ? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
4741
+ : kMaxU32)
4742
+ : 0);
4743
+ }
4744
+ #endif
4745
+
4746
+ #if HWY_S390X_HAVE_Z15
4747
+ // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
4748
+ // the range of a uint32_t
4749
+ __vector unsigned int raw_result;
4750
+ __asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4751
+ return VFromD<D>{raw_result};
4752
+ #else // VSX
3530
4753
  HWY_DIAGNOSTICS(push)
3531
4754
  #if HWY_COMPILER_CLANG
3532
4755
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
3533
4756
  #endif
3534
- return VFromD<D>{vec_ctu(ZeroIfNegative(v).raw, 0)};
4757
+ VFromD<D> result{vec_ctu(v.raw, 0)};
3535
4758
  HWY_DIAGNOSTICS(pop)
4759
+ return result;
4760
+ #endif // HWY_S390X_HAVE_Z15
3536
4761
  }
4762
+ #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
3537
4763
 
3538
- template <size_t N>
3539
- HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
4764
+ template <class D, HWY_IF_U64_D(D)>
4765
+ HWY_API VFromD<D> ConvertTo(D /* tag */,
4766
+ Vec128<double, Rebind<double, D>().MaxLanes()> v) {
3540
4767
  HWY_DIAGNOSTICS(push)
3541
4768
  #if HWY_COMPILER_CLANG
3542
4769
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
3543
4770
  #endif
3544
- return Vec128<int32_t, N>{vec_cts(vec_round(v.raw), 0)};
3545
- HWY_DIAGNOSTICS(pop)
4771
+
4772
+ #if defined(__OPTIMIZE__)
4773
+ if (detail::IsConstantRawAltivecVect(v.raw)) {
4774
+ constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
4775
+ return Dup128VecFromValues(
4776
+ D(),
4777
+ (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
4778
+ ? static_cast<uint64_t>(v.raw[0])
4779
+ : kMaxU64)
4780
+ : 0,
4781
+ (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
4782
+ ? static_cast<uint64_t>(v.raw[1])
4783
+ : kMaxU64)
4784
+ : 0);
4785
+ }
4786
+ #endif
4787
+
4788
+ // Use inline assembly to avoid undefined behavior if v[i] is not within the
4789
+ // range of a uint64_t
4790
+ __vector unsigned long long raw_result;
4791
+ #if HWY_S390X_HAVE_Z14
4792
+ __asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4793
+ #else // VSX
4794
+ __asm__("xvcvdpuxds %x0,%x1"
4795
+ : "=wa"(raw_result)
4796
+ : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
4797
+ #endif
4798
+ return VFromD<D>{raw_result};
3546
4799
  }
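As with the signed variants, the unsigned conversions clamp rather than overflow: negative inputs go to zero and values at or above 2^64 go to UINT64_MAX, matching the constant-folded branch. An illustrative sketch under the same assumptions:

    namespace hn = hwy::HWY_NAMESPACE;

    const hn::Full128<double> dd;
    const hn::Rebind<uint64_t, decltype(dd)> du;
    const auto v = hn::Dup128VecFromValues(dd, -1.0, 2e19);
    const auto u = hn::ConvertTo(du, v);  // {0, UINT64_MAX}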
3547
4800
 
3548
4801
  // ------------------------------ Floating-point rounding (ConvertTo)
@@ -3555,7 +4808,18 @@ HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
3555
4808
 
3556
4809
  template <size_t N>
3557
4810
  HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
4811
+ #if HWY_S390X_HAVE_Z14
4812
+ return Vec128<double, N>{vec_round(v.raw)};
4813
+ #else
3558
4814
  return Vec128<double, N>{vec_rint(v.raw)};
4815
+ #endif
4816
+ }
4817
+
4818
+ template <size_t N>
4819
+ HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
4820
+ const DFromV<decltype(v)> d;
4821
+ const RebindToSigned<decltype(d)> di;
4822
+ return ConvertTo(di, Round(v));
3559
4823
  }
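Routing NearestInt through Round and ConvertTo means it inherits Round's rounding and ConvertTo's saturation. A hedged example of the resulting semantics:

    namespace hn = hwy::HWY_NAMESPACE;

    const hn::Full128<float> df;
    const hn::RebindToSigned<decltype(df)> di;
    const auto v = hn::Dup128VecFromValues(df, 2.5f, -2.5f, 3.5f, 1e30f);
    // {2, -2, 4, INT32_MAX}, assuming Round's ties-to-even contract
    const auto i = hn::NearestInt(v);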
3560
4824
 
3561
4825
  // Toward zero, aka truncate
@@ -3613,7 +4877,7 @@ HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
3613
4877
 
3614
4878
  // ================================================== CRYPTO
3615
4879
 
3616
- #if !defined(HWY_DISABLE_PPC8_CRYPTO)
4880
+ #if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)
3617
4881
 
3618
4882
  // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
3619
4883
  #ifdef HWY_NATIVE_AES
@@ -3918,6 +5182,15 @@ struct CompressIsPartition {
3918
5182
  enum { value = (sizeof(T) != 1) };
3919
5183
  };
3920
5184
 
5185
+ // ------------------------------ Dup128MaskFromMaskBits
5186
+
5187
+ template <class D>
5188
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
5189
+ constexpr size_t kN = MaxLanes(d);
5190
+ if (kN < 8) mask_bits &= (1u << kN) - 1;
5191
+ return detail::LoadMaskBits128(d, mask_bits);
5192
+ }
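Bit i of mask_bits controls lane i within each 128-bit block, with bits past the lane count cleared first. Illustrative usage (alias assumed as elsewhere in these notes):

    namespace hn = hwy::HWY_NAMESPACE;

    const hn::Full128<uint32_t> d;  // 4 lanes
    const auto m = hn::Dup128MaskFromMaskBits(d, 0b0101u);
    // Lanes 0 and 2 are true: CountTrue(d, m) == 2.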
5193
+
3921
5194
  // ------------------------------ StoreMaskBits
3922
5195
 
3923
5196
  namespace detail {
@@ -3930,37 +5203,45 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
3930
5203
  // clang POWER8 and 9 targets appear to differ in their return type of
3931
5204
  // vec_vbpermq: unsigned or signed, so cast to avoid a warning.
3932
5205
  using VU64 = detail::Raw128<uint64_t>::type;
5206
+ #if HWY_S390X_HAVE_Z14
5207
+ const Vec128<uint64_t> extracted{
5208
+ reinterpret_cast<VU64>(vec_bperm_u128(sign_bits.raw, bit_shuffle))};
5209
+ #else
3933
5210
  const Vec128<uint64_t> extracted{
3934
5211
  reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
5212
+ #endif
3935
5213
  return extracted.raw[HWY_IS_LITTLE_ENDIAN];
3936
5214
  }
3937
5215
 
3938
- #endif // !HWY_PPC_HAVE_10
5216
+ #endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
3939
5217
 
3940
5218
  template <typename T, size_t N>
3941
5219
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
3942
5220
  const DFromM<decltype(mask)> d;
3943
5221
  const Repartition<uint8_t, decltype(d)> du8;
3944
5222
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5223
+
3945
5224
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
3946
5225
  return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
3947
- #else
5226
+ #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
3948
5227
  const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
3949
5228
  56, 48, 40, 32, 24, 16, 8, 0};
3950
5229
  return ExtractSignBits(sign_bits, kBitShuffle);
3951
- #endif // HWY_PPC_HAVE_10
5230
+ #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
3952
5231
  }
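The kBitShuffle constants feed vbpermq (vec_bperm_u128 on Z14) the bit index of each lane's sign bit, which is what ultimately backs the public StoreMaskBits. A sketch of the observable behavior (illustrative):

    namespace hn = hwy::HWY_NAMESPACE;

    const hn::Full128<uint8_t> d;  // 16 lanes -> 2 mask bytes
    uint8_t bits[2];
    hn::StoreMaskBits(d, hn::FirstN(d, 3), bits);
    // bits[0] == 0b00000111, bits[1] == 0: lane i maps to bit i.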
3953
5232
 
3954
5233
  template <typename T, size_t N>
3955
5234
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
3956
5235
  const DFromM<decltype(mask)> d;
5236
+ const RebindToUnsigned<decltype(d)> du;
5237
+
3957
5238
  const Repartition<uint8_t, decltype(d)> du8;
3958
5239
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
3959
5240
 
3960
5241
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
3961
- const RebindToUnsigned<decltype(d)> du;
3962
5242
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
3963
- #else
5243
+ #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5244
+ (void)du;
3964
5245
  #if HWY_IS_LITTLE_ENDIAN
3965
5246
  const __vector unsigned char kBitShuffle = {
3966
5247
  112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
@@ -3975,12 +5256,15 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
3975
5256
  template <typename T, size_t N>
3976
5257
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
3977
5258
  const DFromM<decltype(mask)> d;
5259
+ const RebindToUnsigned<decltype(d)> du;
5260
+
3978
5261
  const Repartition<uint8_t, decltype(d)> du8;
3979
5262
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5263
+
3980
5264
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
3981
- const RebindToUnsigned<decltype(d)> du;
3982
5265
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
3983
- #else
5266
+ #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5267
+ (void)du;
3984
5268
  #if HWY_IS_LITTLE_ENDIAN
3985
5269
  const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
3986
5270
  128, 128, 128, 128, 128, 128,
@@ -3997,12 +5281,15 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
3997
5281
  template <typename T, size_t N>
3998
5282
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
3999
5283
  const DFromM<decltype(mask)> d;
5284
+ const RebindToUnsigned<decltype(d)> du;
5285
+
4000
5286
  const Repartition<uint8_t, decltype(d)> du8;
4001
5287
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5288
+
4002
5289
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
4003
- const RebindToUnsigned<decltype(d)> du;
4004
5290
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
4005
- #else
5291
+ #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
5292
+ (void)du;
4006
5293
  #if HWY_IS_LITTLE_ENDIAN
4007
5294
  const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
4008
5295
  128, 128, 128, 128, 128, 128,
@@ -4076,31 +5363,32 @@ HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
4076
5363
  template <class D, HWY_IF_V_SIZE_D(D, 16)>
4077
5364
  HWY_API bool AllFalse(D d, MFromD<D> mask) {
4078
5365
  const RebindToUnsigned<decltype(d)> du;
4079
- return static_cast<bool>(vec_all_eq(RebindMask(du, mask).raw, Zero(du).raw));
5366
+ return static_cast<bool>(
5367
+ vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Zero(du).raw));
4080
5368
  }
4081
5369
 
4082
5370
  template <class D, HWY_IF_V_SIZE_D(D, 16)>
4083
5371
  HWY_API bool AllTrue(D d, MFromD<D> mask) {
4084
5372
  const RebindToUnsigned<decltype(d)> du;
4085
5373
  using TU = TFromD<decltype(du)>;
4086
- return static_cast<bool>(
4087
- vec_all_eq(RebindMask(du, mask).raw, Set(du, hwy::LimitsMax<TU>()).raw));
5374
+ return static_cast<bool>(vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw,
5375
+ Set(du, hwy::LimitsMax<TU>()).raw));
4088
5376
  }
4089
5377
 
4090
5378
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
4091
5379
  HWY_API bool AllFalse(D d, MFromD<D> mask) {
4092
5380
  const Full128<TFromD<D>> d_full;
4093
5381
  constexpr size_t kN = MaxLanes(d);
4094
- return AllFalse(d_full, MFromD<decltype(d_full)>{
4095
- vec_and(mask.raw, FirstN(d_full, kN).raw)});
5382
+ return AllFalse(d_full,
5383
+ And(MFromD<decltype(d_full)>{mask.raw}, FirstN(d_full, kN)));
4096
5384
  }
4097
5385
 
4098
5386
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
4099
5387
  HWY_API bool AllTrue(D d, MFromD<D> mask) {
4100
5388
  const Full128<TFromD<D>> d_full;
4101
5389
  constexpr size_t kN = MaxLanes(d);
4102
- return AllTrue(d_full, MFromD<decltype(d_full)>{
4103
- vec_or(mask.raw, Not(FirstN(d_full, kN)).raw)});
5390
+ return AllTrue(
5391
+ d_full, Or(MFromD<decltype(d_full)>{mask.raw}, Not(FirstN(d_full, kN))));
4104
5392
  }
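For these partial vectors the raw lanes beyond MaxLanes(d) are unspecified, so the mask is first combined with FirstN: And() forces the padding lanes false before testing AllFalse, and Or() forces them true before testing AllTrue. A small sketch of the hazard this avoids (illustrative):

    namespace hn = hwy::HWY_NAMESPACE;

    const hn::Full64<uint32_t> d;  // 2 lanes of a 128-bit register
    const auto m = hn::MaskFromVec(hn::Zero(d));
    // Without And(mask, FirstN(d_full, 2)), garbage in raw lanes 2..3
    // could make the full-vector comparison report a true lane;
    // AllFalse(d, m) must still return true here.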
4105
5393
 
4106
5394
  template <class D>
@@ -4222,7 +5510,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
4222
5510
  __asm__("xxgenpcvbm %x0, %1, %2"
4223
5511
  : "=wa"(idx)
4224
5512
  : "v"(mask.raw), "i"(kGenPcvmMode));
4225
- return VFromD<D>{idx};
5513
+ return VFromD<decltype(d)>{idx};
4226
5514
  }
4227
5515
  template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
4228
5516
  HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
@@ -4235,7 +5523,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
4235
5523
  __asm__("xxgenpcvhm %x0, %1, %2"
4236
5524
  : "=wa"(idx)
4237
5525
  : "v"(mask.raw), "i"(kGenPcvmMode));
4238
- return VFromD<D>{idx};
5526
+ return VFromD<decltype(d)>{idx};
4239
5527
  }
4240
5528
  template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
4241
5529
  HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
@@ -4248,7 +5536,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
4248
5536
  __asm__("xxgenpcvwm %x0, %1, %2"
4249
5537
  : "=wa"(idx)
4250
5538
  : "v"(mask.raw), "i"(kGenPcvmMode));
4251
- return VFromD<D>{idx};
5539
+ return VFromD<decltype(d)>{idx};
4252
5540
  }
4253
5541
  #endif
4254
5542
 
@@ -4821,7 +6109,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
4821
6109
 
4822
6110
  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
4823
6111
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
4824
- #if HWY_PPC_HAVE_9
6112
+ #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
4825
6113
  StoreN(compressed, d, unaligned, count);
4826
6114
  #else
4827
6115
  BlendedStore(compressed, FirstN(d, count), d, unaligned);
@@ -4939,7 +6227,11 @@ HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
4939
6227
 
4940
6228
  template <class V>
4941
6229
  HWY_INLINE V I128Subtract(V a, V b) {
4942
- #if defined(__SIZEOF_INT128__)
6230
+ #if HWY_S390X_HAVE_Z14
6231
+ const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
6232
+ vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
6233
+ reinterpret_cast<__vector unsigned char>(b.raw)))};
6234
+ #elif defined(__SIZEOF_INT128__)
4943
6235
  using VU128 = __vector unsigned __int128;
4944
6236
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
4945
6237
  vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
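Both intrinsics compute a full 128-bit difference, i.e. the borrow propagates across the 64-bit halves. For reference, an equivalent portable scalar formulation (purely illustrative, not one of the code paths above):

    #include <cstdint>

    // Subtract two u128 values given as (hi, lo) pairs of u64.
    inline void Sub128(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi,
                       uint64_t b_lo, uint64_t* hi, uint64_t* lo) {
      *lo = a_lo - b_lo;  // wraps mod 2^64
      const uint64_t borrow = (a_lo < b_lo) ? 1u : 0u;
      *hi = a_hi - b_hi - borrow;
    }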
@@ -5067,84 +6359,133 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
5067
6359
  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
5068
6360
  }
5069
6361
 
5070
- // ------------------------------ Reductions
5071
-
6362
+ // ------------------------------ SumsOf2 and SumsOf4
5072
6363
  namespace detail {
5073
6364
 
5074
- // N=1 for any T: no-op
5075
- template <typename T>
5076
- HWY_INLINE Vec128<T, 1> SumOfLanes(Vec128<T, 1> v) {
5077
- return v;
5078
- }
5079
- template <typename T>
5080
- HWY_INLINE Vec128<T, 1> MinOfLanes(Vec128<T, 1> v) {
5081
- return v;
5082
- }
5083
- template <typename T>
5084
- HWY_INLINE Vec128<T, 1> MaxOfLanes(Vec128<T, 1> v) {
5085
- return v;
6365
+ #if !HWY_S390X_HAVE_Z14
6366
+ // Casts nominally int32_t result to D.
6367
+ template <class D>
6368
+ HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
6369
+ __vector signed int b) {
6370
+ const Repartition<int32_t, D> di32;
6371
+ #ifdef __OPTIMIZE__
6372
+ if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
6373
+ const int64_t sum0 =
6374
+ static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
6375
+ static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
6376
+ static_cast<int64_t>(b[0]);
6377
+ const int64_t sum1 =
6378
+ static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
6379
+ static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
6380
+ static_cast<int64_t>(b[1]);
6381
+ const int64_t sum2 =
6382
+ static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
6383
+ static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
6384
+ static_cast<int64_t>(b[2]);
6385
+ const int64_t sum3 =
6386
+ static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
6387
+ static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
6388
+ static_cast<int64_t>(b[3]);
6389
+ const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
6390
+ const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
6391
+ const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
6392
+ const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
6393
+ using Raw = typename detail::Raw128<int32_t>::type;
6394
+ return BitCast(
6395
+ d,
6396
+ VFromD<decltype(di32)>{Raw{
6397
+ (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
6398
+ : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
6399
+ (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
6400
+ : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
6401
+ (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
6402
+ : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
6403
+ (sign3 == (sum3 >> 31))
6404
+ ? static_cast<int32_t>(sum3)
6405
+ : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
6406
+ } else // NOLINT
6407
+ #endif
6408
+ {
6409
+ return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
6410
+ }
5086
6411
  }
5087
6412
 
5088
- // u32/i32/f32:
5089
-
5090
- // N=2
5091
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5092
- HWY_INLINE Vec128<T, 2> SumOfLanes(Vec128<T, 2> v10) {
5093
- // NOTE: AltivecVsum2sws cannot be used here as AltivecVsum2sws
5094
- // computes the signed saturated sum of the lanes.
5095
- return v10 + Shuffle2301(v10);
5096
- }
5097
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5098
- HWY_INLINE Vec128<T, 2> MinOfLanes(Vec128<T, 2> v10) {
5099
- return Min(v10, Shuffle2301(v10));
5100
- }
5101
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5102
- HWY_INLINE Vec128<T, 2> MaxOfLanes(Vec128<T, 2> v10) {
5103
- return Max(v10, Shuffle2301(v10));
6413
+ // Casts nominally uint32_t result to D.
6414
+ template <class D>
6415
+ HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
6416
+ __vector unsigned int b) {
6417
+ const Repartition<uint32_t, D> du32;
6418
+ #ifdef __OPTIMIZE__
6419
+ if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
6420
+ const uint64_t sum0 =
6421
+ static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
6422
+ static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
6423
+ static_cast<uint64_t>(b[0]);
6424
+ const uint64_t sum1 =
6425
+ static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
6426
+ static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
6427
+ static_cast<uint64_t>(b[1]);
6428
+ const uint64_t sum2 =
6429
+ static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
6430
+ static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
6431
+ static_cast<uint64_t>(b[2]);
6432
+ const uint64_t sum3 =
6433
+ static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
6434
+ static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
6435
+ static_cast<uint64_t>(b[3]);
6436
+ return BitCast(
6437
+ d,
6438
+ VFromD<decltype(du32)>{(__vector unsigned int){
6439
+ static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
6440
+ static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
6441
+ static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
6442
+ static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
6443
+ : 0xFFFFFFFFu)}});
6444
+ } else // NOLINT
6445
+ #endif
6446
+ {
6447
+ return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
6448
+ }
5104
6449
  }
5105
6450
 
5106
- // N=4 (full)
5107
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5108
- HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v3210) {
5109
- // NOTE: AltivecVsumsws cannot be used here as AltivecVsumsws
5110
- // computes the signed saturated sum of the lanes.
5111
- const Vec128<T> v1032 = Shuffle1032(v3210);
5112
- const Vec128<T> v31_20_31_20 = v3210 + v1032;
5113
- const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5114
- return v20_31_20_31 + v31_20_31_20;
5115
- }
5116
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5117
- HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v3210) {
5118
- const Vec128<T> v1032 = Shuffle1032(v3210);
5119
- const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
5120
- const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5121
- return Min(v20_31_20_31, v31_20_31_20);
5122
- }
5123
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5124
- HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v3210) {
5125
- const Vec128<T> v1032 = Shuffle1032(v3210);
5126
- const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
5127
- const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5128
- return Max(v20_31_20_31, v31_20_31_20);
5129
- }
6451
+ // Casts nominally int32_t result to D.
6452
+ template <class D>
6453
+ HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
6454
+ __vector signed int b) {
6455
+ const Repartition<int32_t, D> di32;
6456
+ #ifdef __OPTIMIZE__
6457
+ const Repartition<uint64_t, D> du64;
6458
+ constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
6459
+ if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
6460
+ __builtin_constant_p(b[kDestLaneOffset + 2])) {
6461
+ const int64_t sum0 = static_cast<int64_t>(a[0]) +
6462
+ static_cast<int64_t>(a[1]) +
6463
+ static_cast<int64_t>(b[kDestLaneOffset]);
6464
+ const int64_t sum1 = static_cast<int64_t>(a[2]) +
6465
+ static_cast<int64_t>(a[3]) +
6466
+ static_cast<int64_t>(b[kDestLaneOffset + 2]);
6467
+ const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
6468
+ const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
6469
+ return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
6470
+ (sign0 == (sum0 >> 31))
6471
+ ? static_cast<uint32_t>(sum0)
6472
+ : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
6473
+ (sign1 == (sum1 >> 31))
6474
+ ? static_cast<uint32_t>(sum1)
6475
+ : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
6476
+ } else // NOLINT
6477
+ #endif
6478
+ {
6479
+ __vector signed int sum;
5130
6480
 
5131
- // u64/i64/f64:
6481
+ // Inline assembly is used for vsum2sws to avoid unnecessary shuffling on
6482
+ // little-endian PowerPC targets, because the result of the vsum2sws
6483
+ // instruction is already in the correct lanes there.
6485
+ __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
5132
6486
 
5133
- // N=2 (full)
5134
- template <typename T, HWY_IF_T_SIZE(T, 8)>
5135
- HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v10) {
5136
- const Vec128<T> v01 = Shuffle01(v10);
5137
- return v10 + v01;
5138
- }
5139
- template <typename T, HWY_IF_T_SIZE(T, 8)>
5140
- HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v10) {
5141
- const Vec128<T> v01 = Shuffle01(v10);
5142
- return Min(v10, v01);
5143
- }
5144
- template <typename T, HWY_IF_T_SIZE(T, 8)>
5145
- HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v10) {
5146
- const Vec128<T> v01 = Shuffle01(v10);
5147
- return Max(v10, v01);
6487
+ return BitCast(d, VFromD<decltype(di32)>{sum});
6488
+ }
5148
6489
  }
5149
6490
 
5150
6491
  // Casts nominally int32_t result to D.
@@ -5238,275 +6579,419 @@ HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
5238
6579
  return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
5239
6580
  Set(di32, 65536).raw);
5240
6581
  }
6582
+ #endif // !HWY_S390X_HAVE_Z14
6583
+
6584
+ // U16->U32 SumsOf2
6585
+ template <class V>
6586
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6587
+ hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6588
+ const DFromV<V> d;
6589
+ const RepartitionToWide<decltype(d)> dw;
6590
+
6591
+ #if HWY_S390X_HAVE_Z14
6592
+ return VFromD<decltype(dw)>{vec_sum4(v.raw, Zero(d).raw)};
6593
+ #else
6594
+ return BitCast(dw, AltivecU16SumsOf2(v));
6595
+ #endif
6596
+ }
6597
+
6598
+ // I16->I32 SumsOf2
6599
+ template <class V>
6600
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6601
+ hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6602
+ const DFromV<V> d;
6603
+ const RepartitionToWide<decltype(d)> dw;
6604
+
6605
+ #if HWY_S390X_HAVE_Z14
6606
+ const RebindToUnsigned<decltype(d)> du;
6607
+ return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(),
6608
+ BitCast(du, Xor(v, SignBit(d))))) +
6609
+ Set(dw, int32_t{-65536});
6610
+ #else
6611
+ return AltivecVsum4shs(dw, v.raw, Zero(dw).raw);
6612
+ #endif
6613
+ }
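The Z14 signed paths reuse the unsigned sums by flipping sign bits: XOR with the sign bit maps each int16 x to x + 32768 when reinterpreted as uint16, so the sum of two lanes overshoots by exactly 65536, which the trailing Set(dw, -65536) removes. A compile-time check of the bias identity (illustrative):

    #include <cstdint>

    constexpr uint16_t BiasI16(int16_t x) {
      // Flipping the sign bit re-biases two's complement: x -> x + 32768.
      return static_cast<uint16_t>(static_cast<uint16_t>(x) ^ 0x8000u);
    }
    static_assert(BiasI16(-5) == 32768 - 5, "x + 32768 for negative x");
    static_assert(BiasI16(7) == 32768 + 7, "x + 32768 for positive x");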
6614
+
6615
+ #if HWY_S390X_HAVE_Z14
6616
+ // U32->U64 SumsOf2
6617
+ template <class V>
6618
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6619
+ hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
6620
+ const DFromV<V> d;
6621
+ const RepartitionToWide<decltype(d)> dw;
6622
+ return VFromD<decltype(dw)>{vec_sum2(v.raw, Zero(d).raw)};
6623
+ }
5241
6624
 
5242
- HWY_API Vec32<uint16_t> SumOfLanes(Vec32<uint16_t> v) {
6625
+ // I32->I64 SumsOf2
6626
+ template <class V>
6627
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6628
+ hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
6629
+ const DFromV<V> d;
6630
+ const RepartitionToWide<decltype(d)> dw;
6631
+ const RebindToUnsigned<decltype(d)> du;
6632
+
6633
+ return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
6634
+ BitCast(du, Xor(v, SignBit(d))))) +
6635
+ Set(dw, int64_t{-4294967296LL});
6636
+ }
6637
+ #endif
6638
+
6639
+ // U8->U32 SumsOf4
6640
+ template <class V>
6641
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6642
+ hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
6643
+ const DFromV<V> d;
6644
+ const RepartitionToWideX2<decltype(d)> dw2;
6645
+
6646
+ #if HWY_S390X_HAVE_Z14
6647
+ return VFromD<decltype(dw2)>{vec_sum4(v.raw, Zero(d).raw)};
6648
+ #else
6649
+ return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw);
6650
+ #endif
6651
+ }
6652
+
6653
+ // I8->I32 SumsOf4
6654
+ template <class V>
6655
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6656
+ hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
6657
+ const DFromV<V> d;
6658
+ const RepartitionToWideX2<decltype(d)> dw2;
6659
+
6660
+ #if HWY_S390X_HAVE_Z14
6661
+ const RebindToUnsigned<decltype(d)> du;
6662
+ return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(),
6663
+ BitCast(du, Xor(v, SignBit(d))))) +
6664
+ Set(dw2, int32_t{-512});
6665
+ #else
6666
+ return AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw);
6667
+ #endif
6668
+ }
6669
+
6670
+ // U16->U64 SumsOf4
6671
+ template <class V>
6672
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6673
+ hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6674
+ const DFromV<V> d;
6675
+ const RepartitionToWide<decltype(d)> dw;
6676
+ const RepartitionToWide<decltype(dw)> dw2;
6677
+
6678
+ #if HWY_S390X_HAVE_Z14
6679
+ return VFromD<decltype(dw2)>{vec_sum2(v.raw, Zero(d).raw)};
6680
+ #else
6681
+ const RebindToSigned<decltype(dw)> dw_i;
6682
+ return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw);
6683
+ #endif
6684
+ }
6685
+
6686
+ // I16->I64 SumsOf4
6687
+ template <class V>
6688
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6689
+ hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6690
+ const DFromV<V> d;
6691
+ const RepartitionToWide<decltype(d)> dw;
6692
+ const RepartitionToWide<decltype(dw)> dw2;
6693
+
6694
+ #if HWY_S390X_HAVE_Z14
6695
+ const RebindToUnsigned<decltype(d)> du;
6696
+ return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(),
6697
+ BitCast(du, Xor(v, SignBit(d))))) +
6698
+ Set(dw2, int64_t{-131072});
6699
+ #else // VSX
6700
+ const auto sums_of_4_in_lo32 =
6701
+ AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw);
6702
+
6703
+ #if HWY_IS_LITTLE_ENDIAN
6704
+ return PromoteEvenTo(dw2, sums_of_4_in_lo32);
6705
+ #else
6706
+ return PromoteOddTo(dw2, sums_of_4_in_lo32);
6707
+ #endif // HWY_IS_LITTLE_ENDIAN
6708
+ #endif // HWY_S390X_HAVE_Z14
6709
+ }
6710
+
6711
+ } // namespace detail
6712
+
6713
+ // ------------------------------ SumOfLanes
6714
+
6715
+ // We define SumOfLanes for 8/16-bit types (and I32/U32/I64/U64 on Z14/Z15/Z16);
6716
+ // enable the generic implementation for the rest.
6717
+ #undef HWY_IF_SUM_OF_LANES_D
6718
+ #if HWY_S390X_HAVE_Z14
6719
+ #define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
6720
+ #else
6721
+ #define HWY_IF_SUM_OF_LANES_D(D) \
6722
+ HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
6723
+ #endif
6724
+
6725
+ #if HWY_S390X_HAVE_Z14
6726
+ namespace detail {
6727
+
6728
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
6729
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
6730
+ HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
6731
+ const DFromV<decltype(v)> d;
6732
+ const RebindToUnsigned<decltype(d)> du;
6733
+ return BitCast(
6734
+ d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
6735
+ }
6736
+
6737
+ } // namespace detail
6738
+
6739
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
6740
+ HWY_API VFromD<D> SumOfLanes(D /*d64*/, VFromD<D> v) {
6741
+ return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v));
6742
+ }
6743
+ #endif
6744
+
6745
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
6746
+ HWY_API Vec32<uint16_t> SumOfLanes(D du16, Vec32<uint16_t> v) {
5243
6747
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
5244
- DFromV<decltype(v)> du16;
5245
- return Broadcast<kSumLaneIdx>(BitCast(du16, AltivecU16SumsOf2(v)));
6748
+ return Broadcast<kSumLaneIdx>(
6749
+ BitCast(du16, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
5246
6750
  }
5247
6751
 
5248
- HWY_API Vec64<uint16_t> SumOfLanes(Vec64<uint16_t> v) {
6752
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6753
+ HWY_API Vec64<uint16_t> SumOfLanes(D du16, Vec64<uint16_t> v) {
5249
6754
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5250
- const Full64<uint16_t> du16;
5251
- const auto zero = Zero(Full128<int32_t>());
5252
6755
  return Broadcast<kSumLaneIdx>(
5253
- AltivecVsum2sws(du16, AltivecU16SumsOf2(v).raw, zero.raw));
6756
+ BitCast(du16, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
5254
6757
  }
5255
6758
 
5256
- HWY_API Vec128<uint16_t> SumOfLanes(Vec128<uint16_t> v) {
6759
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
6760
+ HWY_API Vec128<uint16_t> SumOfLanes(D du16, Vec128<uint16_t> v) {
5257
6761
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
5258
- const Full128<uint16_t> du16;
6762
+ #if HWY_S390X_HAVE_Z14
6763
+ return Broadcast<kSumLaneIdx>(
6764
+ BitCast(du16, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
6765
+ hwy::UnsignedTag(), hwy::SizeTag<2>(), v))));
6766
+ #else // VSX
5259
6767
  const auto zero = Zero(Full128<int32_t>());
5260
6768
  return Broadcast<kSumLaneIdx>(
5261
- AltivecVsumsws(du16, AltivecU16SumsOf2(v).raw, zero.raw));
6769
+ detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
6770
+ #endif
5262
6771
  }
5263
6772
 
5264
- HWY_API Vec32<int16_t> SumOfLanes(Vec32<int16_t> v) {
6773
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
6774
+ HWY_API Vec32<int16_t> SumOfLanes(D di16, Vec32<int16_t> v) {
6775
+ #if HWY_S390X_HAVE_Z14
6776
+ const RebindToUnsigned<decltype(di16)> du16;
6777
+ return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
6778
+ #else
5265
6779
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
5266
- const Full32<int16_t> di16;
5267
- const auto zero = Zero(Full128<int32_t>());
5268
- return Broadcast<kSumLaneIdx>(AltivecVsum4shs(di16, v.raw, zero.raw));
6780
+ return Broadcast<kSumLaneIdx>(
6781
+ BitCast(di16, detail::SumsOf2(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
6782
+ #endif
5269
6783
  }
5270
6784
 
5271
- HWY_API Vec64<int16_t> SumOfLanes(Vec64<int16_t> v) {
6785
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
6786
+ HWY_API Vec64<int16_t> SumOfLanes(D di16, Vec64<int16_t> v) {
6787
+ #if HWY_S390X_HAVE_Z14
6788
+ const RebindToUnsigned<decltype(di16)> du16;
6789
+ return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
6790
+ #else
5272
6791
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5273
- const Full128<int32_t> di32;
5274
- const Full64<int16_t> di16;
5275
- const auto zero = Zero(di32);
5276
- return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
5277
- di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
6792
+ return Broadcast<kSumLaneIdx>(
6793
+ BitCast(di16, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
6794
+ #endif
5278
6795
  }
5279
6796
 
5280
- HWY_API Vec128<int16_t> SumOfLanes(Vec128<int16_t> v) {
6797
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
6798
+ HWY_API Vec128<int16_t> SumOfLanes(D di16, Vec128<int16_t> v) {
6799
+ #if HWY_S390X_HAVE_Z14
6800
+ const RebindToUnsigned<decltype(di16)> du16;
6801
+ return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
6802
+ #else
5281
6803
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
5282
- const Full128<int16_t> di16;
5283
6804
  const Full128<int32_t> di32;
5284
6805
  const auto zero = Zero(di32);
5285
- return Broadcast<kSumLaneIdx>(AltivecVsumsws(
5286
- di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
6806
+ return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6807
+ di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
6808
+ #endif
5287
6809
  }
5288
6810
 
5289
- // u8, N=2, N=4, N=8, N=16:
5290
- HWY_API Vec16<uint8_t> SumOfLanes(Vec16<uint8_t> v) {
6811
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6812
+ HWY_API Vec32<uint8_t> SumOfLanes(D du8, Vec32<uint8_t> v) {
5291
6813
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5292
- const Full16<uint8_t> du8;
5293
- const Full16<uint16_t> du16;
5294
- const Twice<decltype(du8)> dt_u8;
5295
- const Twice<decltype(du16)> dt_u16;
5296
- const Full128<uint32_t> du32;
5297
- return LowerHalf(Broadcast<kSumLaneIdx>(AltivecVsum4ubs(
5298
- dt_u8, BitCast(dt_u8, Combine(dt_u16, Zero(du16), BitCast(du16, v))).raw,
5299
- Zero(du32).raw)));
6814
+ return Broadcast<kSumLaneIdx>(
6815
+ BitCast(du8, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v)));
5300
6816
  }
5301
6817
 
5302
- HWY_API Vec32<uint8_t> SumOfLanes(Vec32<uint8_t> v) {
5303
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5304
- const Full128<uint32_t> du32;
5305
- const Full32<uint8_t> du8;
5306
- return Broadcast<kSumLaneIdx>(AltivecVsum4ubs(du8, v.raw, Zero(du32).raw));
6818
+ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
6819
+ HWY_API Vec16<uint8_t> SumOfLanes(D du8, Vec16<uint8_t> v) {
6820
+ const Twice<decltype(du8)> dt_u8;
6821
+ return LowerHalf(du8, SumOfLanes(dt_u8, Combine(dt_u8, Zero(du8), v)));
5307
6822
  }
5308
6823
 
5309
- HWY_API Vec64<uint8_t> SumOfLanes(Vec64<uint8_t> v) {
6824
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6825
+ HWY_API Vec64<uint8_t> SumOfLanes(D du8, Vec64<uint8_t> v) {
5310
6826
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
5311
- const Full64<uint8_t> du8;
5312
6827
  return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v)));
5313
6828
  }
5314
6829
 
5315
- HWY_API Vec128<uint8_t> SumOfLanes(Vec128<uint8_t> v) {
6830
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
6831
+ HWY_API Vec128<uint8_t> SumOfLanes(D du8, Vec128<uint8_t> v) {
5316
6832
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
5317
6833
 
6834
+ #if HWY_S390X_HAVE_Z14
6835
+ return Broadcast<kSumLaneIdx>(
6836
+ BitCast(du8, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
6837
+ hwy::UnsignedTag(), hwy::SizeTag<1>(), v))));
6838
+ #else
5318
6839
  const Full128<uint32_t> du32;
5319
6840
  const RebindToSigned<decltype(du32)> di32;
5320
- const Full128<uint8_t> du8;
5321
6841
  const Vec128<uint32_t> zero = Zero(du32);
5322
- return Broadcast<kSumLaneIdx>(
5323
- AltivecVsumsws(du8, AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
5324
- BitCast(di32, zero).raw));
6842
+ return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6843
+ du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
6844
+ BitCast(di32, zero).raw));
6845
+ #endif
5325
6846
  }
5326
6847
 
5327
- HWY_API Vec16<int8_t> SumOfLanes(Vec16<int8_t> v) {
6848
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
6849
+ HWY_API Vec32<int8_t> SumOfLanes(D di8, Vec32<int8_t> v) {
6850
+ #if HWY_S390X_HAVE_Z14
6851
+ const RebindToUnsigned<decltype(di8)> du8;
6852
+ return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
6853
+ #else
5328
6854
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5329
-
5330
- const Full128<uint16_t> du16;
5331
- const Repartition<int32_t, decltype(du16)> di32;
5332
- const Repartition<int8_t, decltype(du16)> di8;
5333
- const Vec128<int8_t> zzvv = BitCast(
5334
- di8, InterleaveLower(BitCast(du16, Vec128<int8_t>{v.raw}), Zero(du16)));
5335
- return Vec16<int8_t>{
5336
- Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, zzvv.raw, Zero(di32).raw))
5337
- .raw};
6855
+ return Broadcast<kSumLaneIdx>(
6856
+ BitCast(di8, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<1>(), v)));
6857
+ #endif
5338
6858
  }
5339
6859
 
5340
- HWY_API Vec32<int8_t> SumOfLanes(Vec32<int8_t> v) {
5341
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5342
- const Full32<int8_t> di8;
5343
- const Vec128<int32_t> zero = Zero(Full128<int32_t>());
5344
- return Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, v.raw, zero.raw));
6860
+ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
6861
+ HWY_API Vec16<int8_t> SumOfLanes(D di8, Vec16<int8_t> v) {
6862
+ const Twice<decltype(di8)> dt_i8;
6863
+ return LowerHalf(di8, SumOfLanes(dt_i8, Combine(dt_i8, Zero(di8), v)));
5345
6864
  }
5346
6865
 
5347
- HWY_API Vec64<int8_t> SumOfLanes(Vec64<int8_t> v) {
6866
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
6867
+ HWY_API Vec64<int8_t> SumOfLanes(D di8, Vec64<int8_t> v) {
6868
+ #if HWY_S390X_HAVE_Z14
6869
+ const RebindToUnsigned<decltype(di8)> du8;
6870
+ return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
6871
+ #else
5348
6872
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
5349
- const Full128<int32_t> di32;
5350
- const Vec128<int32_t> zero = Zero(di32);
5351
- const Full64<int8_t> di8;
5352
- return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
5353
- di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
6873
+ return Broadcast<kSumLaneIdx>(BitCast(di8, SumsOf8(v)));
6874
+ #endif
5354
6875
  }
5355
6876
 
5356
- HWY_API Vec128<int8_t> SumOfLanes(Vec128<int8_t> v) {
6877
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
6878
+ HWY_API Vec128<int8_t> SumOfLanes(D di8, Vec128<int8_t> v) {
6879
+ #if HWY_S390X_HAVE_Z14
6880
+ const RebindToUnsigned<decltype(di8)> du8;
6881
+ return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
6882
+ #else
5357
6883
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
5358
- const Full128<int8_t> di8;
5359
6884
  const Full128<int32_t> di32;
5360
6885
  const Vec128<int32_t> zero = Zero(di32);
5361
- return Broadcast<kSumLaneIdx>(AltivecVsumsws(
5362
- di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
6886
+ return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6887
+ di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
6888
+ #endif
5363
6889
  }
5364
6890
 
5365
- template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
5366
- HWY_API Vec128<uint8_t, N> MaxOfLanes(Vec128<uint8_t, N> v) {
5367
- const DFromV<decltype(v)> d;
5368
- const RepartitionToWide<decltype(d)> d16;
5369
- const RepartitionToWide<decltype(d16)> d32;
5370
- Vec128<uint8_t, N> vm = Max(v, Reverse2(d, v));
5371
- vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
5372
- vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
5373
- if (N > 8) {
5374
- const RepartitionToWide<decltype(d32)> d64;
5375
- vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
5376
- }
5377
- return vm;
6891
+ #if HWY_S390X_HAVE_Z14
6892
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
6893
+ HWY_API VFromD<D> SumOfLanes(D d32, VFromD<D> v) {
6894
+ const RebindToUnsigned<decltype(d32)> du32;
6895
+ return Broadcast<1>(
6896
+ BitCast(d32, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
6897
+ BitCast(du32, v))));
5378
6898
  }
5379
6899
 
5380
- template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
5381
- HWY_API Vec128<uint8_t, N> MinOfLanes(Vec128<uint8_t, N> v) {
5382
- const DFromV<decltype(v)> d;
5383
- const RepartitionToWide<decltype(d)> d16;
5384
- const RepartitionToWide<decltype(d16)> d32;
5385
- Vec128<uint8_t, N> vm = Min(v, Reverse2(d, v));
5386
- vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
5387
- vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
5388
- if (N > 8) {
5389
- const RepartitionToWide<decltype(d32)> d64;
5390
- vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
5391
- }
5392
- return vm;
6900
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
6901
+ HWY_API VFromD<D> SumOfLanes(D /*d32*/, VFromD<D> v) {
6902
+ return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v));
5393
6903
  }
6904
+ #endif
5394
6905
 
5395
- template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
5396
- HWY_API Vec128<int8_t, N> MaxOfLanes(Vec128<int8_t, N> v) {
5397
- const DFromV<decltype(v)> d;
5398
- const RepartitionToWide<decltype(d)> d16;
5399
- const RepartitionToWide<decltype(d16)> d32;
5400
- Vec128<int8_t, N> vm = Max(v, Reverse2(d, v));
5401
- vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
5402
- vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
5403
- if (N > 8) {
5404
- const RepartitionToWide<decltype(d32)> d64;
5405
- vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
5406
- }
5407
- return vm;
5408
- }
6906
+ // generic_ops defines MinOfLanes and MaxOfLanes.
5409
6907
 
5410
- template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
5411
- HWY_API Vec128<int8_t, N> MinOfLanes(Vec128<int8_t, N> v) {
5412
- const DFromV<decltype(v)> d;
5413
- const RepartitionToWide<decltype(d)> d16;
5414
- const RepartitionToWide<decltype(d16)> d32;
5415
- Vec128<int8_t, N> vm = Min(v, Reverse2(d, v));
5416
- vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
5417
- vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
5418
- if (N > 8) {
5419
- const RepartitionToWide<decltype(d32)> d64;
5420
- vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
5421
- }
5422
- return vm;
5423
- }
6908
+ // ------------------------------ ReduceSum for N=4 I8/U8
5424
6909
 
5425
- template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)>
5426
- HWY_API Vec128<uint16_t, N> MinOfLanes(Vec128<uint16_t, N> v) {
5427
- const Simd<uint16_t, N, 0> d;
5428
- const RepartitionToWide<decltype(d)> d32;
5429
- #if HWY_IS_LITTLE_ENDIAN
5430
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5431
- const auto odd = ShiftRight<16>(BitCast(d32, v));
5432
- #else
5433
- const auto even = ShiftRight<16>(BitCast(d32, v));
5434
- const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
5435
- #endif
5436
- const auto min = MinOfLanes(Min(even, odd));
5437
- // Also broadcast into odd lanes on little-endian and into even lanes
5438
- // on big-endian
5439
- return Vec128<uint16_t, N>{vec_pack(min.raw, min.raw)};
5440
- }
5441
- template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)>
5442
- HWY_API Vec128<int16_t, N> MinOfLanes(Vec128<int16_t, N> v) {
5443
- const Simd<int16_t, N, 0> d;
5444
- const RepartitionToWide<decltype(d)> d32;
5445
- // Sign-extend
5446
- #if HWY_IS_LITTLE_ENDIAN
5447
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5448
- const auto odd = ShiftRight<16>(BitCast(d32, v));
6910
+ // GetLane(SumsOf4(v)) is more efficient on PPC/Z14 than the default N=4
6911
+ // I8/U8 ReduceSum implementation in generic_ops-inl.h
6912
+ #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
6913
+ #undef HWY_NATIVE_REDUCE_SUM_4_UI8
5449
6914
  #else
5450
- const auto even = ShiftRight<16>(BitCast(d32, v));
5451
- const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
6915
+ #define HWY_NATIVE_REDUCE_SUM_4_UI8
5452
6916
  #endif
5453
- const auto min = MinOfLanes(Min(even, odd));
5454
- // Also broadcast into odd lanes on little-endian and into even lanes
5455
- // on big-endian
5456
- return Vec128<int16_t, N>{vec_pack(min.raw, min.raw)};
6917
+
6918
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
6919
+ HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
6920
+ return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
5457
6921
  }
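Note the return type is the lane type, so the exact 32-bit sum produced by SumsOf4 is truncated back to 8 bits. Illustrative usage:

    namespace hn = hwy::HWY_NAMESPACE;

    const hn::Full32<uint8_t> d;  // four u8 lanes
    const uint8_t lanes[4] = {1, 2, 3, 250};
    const uint8_t sum = hn::ReduceSum(d, hn::LoadU(d, lanes));
    // (1 + 2 + 3 + 250) & 0xFF == 0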
5458
6922
 
5459
- template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)>
5460
- HWY_API Vec128<uint16_t, N> MaxOfLanes(Vec128<uint16_t, N> v) {
5461
- const Simd<uint16_t, N, 0> d;
5462
- const RepartitionToWide<decltype(d)> d32;
5463
- #if HWY_IS_LITTLE_ENDIAN
5464
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5465
- const auto odd = ShiftRight<16>(BitCast(d32, v));
6923
+ // ------------------------------ BitShuffle
6924
+
6925
+ #ifdef HWY_NATIVE_BITSHUFFLE
6926
+ #undef HWY_NATIVE_BITSHUFFLE
5466
6927
  #else
5467
- const auto even = ShiftRight<16>(BitCast(d32, v));
5468
- const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
6928
+ #define HWY_NATIVE_BITSHUFFLE
5469
6929
  #endif
5470
- const auto max = MaxOfLanes(Max(even, odd));
5471
- // Also broadcast into odd lanes.
5472
- return Vec128<uint16_t, N>{vec_pack(max.raw, max.raw)};
5473
- }
5474
- template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)>
5475
- HWY_API Vec128<int16_t, N> MaxOfLanes(Vec128<int16_t, N> v) {
5476
- const Simd<int16_t, N, 0> d;
5477
- const RepartitionToWide<decltype(d)> d32;
5478
- // Sign-extend
6930
+
6931
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
6932
+ HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
6933
+ HWY_API V BitShuffle(V v, VI idx) {
6934
+ const DFromV<decltype(v)> d64;
6935
+ const RebindToUnsigned<decltype(d64)> du64;
6936
+ const Repartition<uint8_t, decltype(d64)> du8;
6937
+
6938
+ const Full128<TFromD<decltype(du64)>> d_full_u64;
6939
+ const Full128<TFromD<decltype(du8)>> d_full_u8;
6940
+
6941
+ using RawVU64 = __vector unsigned long long;
6942
+
6943
+ #if HWY_PPC_HAVE_9
6944
+
5479
6945
  #if HWY_IS_LITTLE_ENDIAN
5480
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5481
- const auto odd = ShiftRight<16>(BitCast(d32, v));
6946
+ (void)d_full_u64;
6947
+ auto bit_idx = ResizeBitCast(d_full_u8, idx);
5482
6948
  #else
5483
- const auto even = ShiftRight<16>(BitCast(d32, v));
5484
- const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
6949
+ auto bit_idx =
6950
+ BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx)));
5485
6951
  #endif
5486
- const auto max = MaxOfLanes(Max(even, odd));
5487
- // Also broadcast into odd lanes on little-endian and into even lanes
5488
- // on big-endian
5489
- return Vec128<int16_t, N>{vec_pack(max.raw, max.raw)};
5490
- }
5491
6952
 
5492
- } // namespace detail
6953
+ bit_idx = Xor(bit_idx, Set(d_full_u8, uint8_t{0x3F}));
5493
6954
 
5494
- // Supported for u/i/f 32/64. Returns the same value in each lane.
5495
- template <class D>
5496
- HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
5497
- return detail::SumOfLanes(v);
5498
- }
5499
- template <class D>
5500
- HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
5501
- return GetLane(detail::SumOfLanes(v));
5502
- }
5503
- template <class D>
5504
- HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
5505
- return detail::MinOfLanes(v);
5506
- }
5507
- template <class D>
5508
- HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
5509
- return detail::MaxOfLanes(v);
6955
+ return BitCast(d64, VFromD<decltype(du64)>{reinterpret_cast<RawVU64>(
6956
+ vec_bperm(BitCast(du64, v).raw, bit_idx.raw))});
6957
+ #else // !HWY_PPC_HAVE_9
6958
+
6959
+ #if HWY_IS_LITTLE_ENDIAN
6960
+ const auto bit_idx_xor_mask = BitCast(
6961
+ d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x7F7F7F7F7F7F7F7Fu},
6962
+ uint64_t{0x3F3F3F3F3F3F3F3Fu}));
6963
+ const auto bit_idx = Xor(ResizeBitCast(d_full_u8, idx), bit_idx_xor_mask);
6964
+ constexpr int kBitShufResultByteShrAmt = 8;
6965
+ #else
6966
+ const auto bit_idx_xor_mask = BitCast(
6967
+ d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x3F3F3F3F3F3F3F3Fu},
6968
+ uint64_t{0x7F7F7F7F7F7F7F7Fu}));
6969
+ const auto bit_idx =
6970
+ Xor(BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx))),
6971
+ bit_idx_xor_mask);
6972
+ constexpr int kBitShufResultByteShrAmt = 6;
6973
+ #endif
6974
+
6975
+ #if HWY_S390X_HAVE_Z14
6976
+ const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
6977
+ vec_bperm_u128(BitCast(du8, v).raw, bit_idx.raw))};
6978
+ #elif defined(__SIZEOF_INT128__)
6979
+ using RawVU128 = __vector unsigned __int128;
6980
+ const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
6981
+ vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
6982
+ #else
6983
+ using RawVU128 = __vector unsigned char;
6984
+ const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
6985
+ vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
6986
+ #endif
6987
+
6988
+ return ResizeBitCast(
6989
+ d64, PromoteTo(d_full_u64,
6990
+ ResizeBitCast(
6991
+ Rebind<uint8_t, decltype(d_full_u64)>(),
6992
+ CombineShiftRightBytes<kBitShufResultByteShrAmt>(
6993
+ d_full_u64, bit_shuf_result, bit_shuf_result))));
6994
+ #endif // HWY_PPC_HAVE_9
5510
6995
  }
5511
6996
 
5512
6997
  // ------------------------------ Lt128
@@ -5672,7 +7157,20 @@ HWY_API V Max128Upper(D d, const V a, const V b) {
5672
7157
 
5673
7158
  template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5674
7159
  HWY_API V LeadingZeroCount(V v) {
7160
+ #if HWY_S390X_HAVE_Z14
7161
+ const DFromV<decltype(v)> d;
7162
+ const RebindToUnsigned<decltype(d)> du;
7163
+
7164
+ #if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
7165
+ // Workaround for a GCC compiler bug in vec_cntlz on Z14/Z15 if v[i] is a
7166
+ // constant
7167
+ __asm__("" : "+v"(v.raw));
7168
+ #endif
7169
+
7170
+ return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
7171
+ #else
5675
7172
  return V{vec_cntlz(v.raw)};
7173
+ #endif
5676
7174
  }
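The empty asm only defeats GCC's constant folding; the per-lane semantics are the usual leading-zero count. A quick sketch (same illustrative alias):

    namespace hn = hwy::HWY_NAMESPACE;

    const hn::Full128<uint32_t> d;
    const auto lz = hn::LeadingZeroCount(hn::Set(d, 1u));
    // Every lane is 31: one set bit, in the least significant position.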
5677
7175
 
5678
7176
  template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
@@ -5682,14 +7180,27 @@ HWY_API V HighestSetBitIndex(V v) {
5682
7180
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
5683
7181
  }
5684
7182
 
5685
- #if HWY_PPC_HAVE_9
7183
+ #if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
5686
7184
  template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5687
7185
  HWY_API V TrailingZeroCount(V v) {
5688
7186
  #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
5689
7187
  return V{vec_vctz(v.raw)};
5690
7188
  #else
5691
- return V{vec_cnttz(v.raw)};
7189
+ #if HWY_S390X_HAVE_Z14
7190
+ const DFromV<decltype(v)> d;
7191
+ const RebindToUnsigned<decltype(d)> du;
7192
+
7193
+ #if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
7194
+ // Workaround for a GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
7195
+ // constant
7196
+ __asm__("" : "+v"(v.raw));
5692
7197
  #endif
7198
+
7199
+ return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
7200
+ #else
7201
+ return V{vec_cnttz(v.raw)};
7202
+ #endif // HWY_S390X_HAVE_Z14
7203
+ #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
5693
7204
  }
5694
7205
  #else
5695
7206
  template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
@@ -5709,6 +7220,8 @@ HWY_API V TrailingZeroCount(V v) {
5709
7220
 
5710
7221
  #undef HWY_PPC_HAVE_9
5711
7222
  #undef HWY_PPC_HAVE_10
7223
+ #undef HWY_S390X_HAVE_Z14
7224
+ #undef HWY_S390X_HAVE_Z15
5712
7225
 
5713
7226
  // NOLINTNEXTLINE(google-readability-namespace-comments)
5714
7227
  } // namespace HWY_NAMESPACE