@img/sharp-libvips-dev 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230):
  1. package/include/aom/aom_encoder.h +3 -3
  2. package/include/aom/aomcx.h +17 -8
  3. package/include/expat.h +21 -10
  4. package/include/expat_config.h +11 -5
  5. package/include/ffi.h +12 -25
  6. package/include/fontconfig/fontconfig.h +5 -3
  7. package/include/freetype2/freetype/config/ftoption.h +1 -1
  8. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
  9. package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
  10. package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
  11. package/include/glib-2.0/gio/gappinfo.h +0 -7
  12. package/include/glib-2.0/gio/gapplication.h +6 -0
  13. package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
  14. package/include/glib-2.0/gio/gasyncinitable.h +0 -7
  15. package/include/glib-2.0/gio/gasyncresult.h +0 -6
  16. package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
  17. package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
  18. package/include/glib-2.0/gio/gbytesicon.h +0 -5
  19. package/include/glib-2.0/gio/gcancellable.h +0 -5
  20. package/include/glib-2.0/gio/gconverter.h +0 -7
  21. package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
  22. package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
  23. package/include/glib-2.0/gio/gdatagrambased.h +0 -7
  24. package/include/glib-2.0/gio/gdatainputstream.h +0 -6
  25. package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
  26. package/include/glib-2.0/gio/gdbusinterface.h +0 -8
  27. package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
  28. package/include/glib-2.0/gio/gdbusmessage.h +2 -1
  29. package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
  30. package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
  31. package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
  32. package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
  33. package/include/glib-2.0/gio/gdbusproxy.h +0 -8
  34. package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
  35. package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
  36. package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
  37. package/include/glib-2.0/gio/gemblem.h +0 -5
  38. package/include/glib-2.0/gio/gemblemedicon.h +0 -5
  39. package/include/glib-2.0/gio/gfile.h +0 -10
  40. package/include/glib-2.0/gio/gfileenumerator.h +0 -5
  41. package/include/glib-2.0/gio/gfileicon.h +0 -5
  42. package/include/glib-2.0/gio/gfileinfo.h +0 -5
  43. package/include/glib-2.0/gio/gfileinputstream.h +0 -8
  44. package/include/glib-2.0/gio/gfileiostream.h +0 -8
  45. package/include/glib-2.0/gio/gfilemonitor.h +0 -5
  46. package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
  47. package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
  48. package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
  49. package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
  50. package/include/glib-2.0/gio/gicon.h +0 -5
  51. package/include/glib-2.0/gio/ginitable.h +0 -7
  52. package/include/glib-2.0/gio/ginputstream.h +0 -5
  53. package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
  54. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  55. package/include/glib-2.0/gio/gioenums.h +6 -1
  56. package/include/glib-2.0/gio/giomodule.h +0 -5
  57. package/include/glib-2.0/gio/giostream.h +0 -5
  58. package/include/glib-2.0/gio/giotypes.h +5 -108
  59. package/include/glib-2.0/gio/gloadableicon.h +0 -6
  60. package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
  61. package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
  62. package/include/glib-2.0/gio/gmountoperation.h +0 -6
  63. package/include/glib-2.0/gio/gnetworking.h +4 -0
  64. package/include/glib-2.0/gio/goutputstream.h +0 -9
  65. package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
  66. package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
  67. package/include/glib-2.0/gio/gproxy.h +0 -7
  68. package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
  69. package/include/glib-2.0/gio/gseekable.h +0 -5
  70. package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
  71. package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
  72. package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
  73. package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
  74. package/include/glib-2.0/gio/gsocket.h +13 -0
  75. package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
  76. package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
  77. package/include/glib-2.0/gio/gtask.h +12 -0
  78. package/include/glib-2.0/gio/gthemedicon.h +0 -5
  79. package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
  80. package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
  81. package/include/glib-2.0/gio/gvfs.h +0 -5
  82. package/include/glib-2.0/gio/gvolume.h +2 -2
  83. package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
  84. package/include/glib-2.0/girepository/gi-visibility.h +986 -0
  85. package/include/glib-2.0/girepository/giarginfo.h +100 -0
  86. package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
  87. package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
  88. package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
  89. package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
  90. package/include/glib-2.0/girepository/gienuminfo.h +82 -0
  91. package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
  92. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  93. package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
  94. package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
  95. package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
  96. package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
  97. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
  98. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  99. package/include/glib-2.0/girepository/girepository.h +247 -0
  100. package/include/glib-2.0/girepository/girffi.h +129 -0
  101. package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
  102. package/include/glib-2.0/girepository/gistructinfo.h +102 -0
  103. package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
  104. package/include/glib-2.0/girepository/gitypelib.h +61 -0
  105. package/include/glib-2.0/girepository/gitypes.h +421 -0
  106. package/include/glib-2.0/girepository/giunioninfo.h +105 -0
  107. package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
  108. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  109. package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
  110. package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
  111. package/include/glib-2.0/glib/deprecated/grel.h +0 -23
  112. package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
  113. package/include/glib-2.0/glib/gatomic.h +20 -20
  114. package/include/glib-2.0/glib/gbitlock.h +31 -0
  115. package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
  116. package/include/glib-2.0/glib/gchecksum.h +0 -10
  117. package/include/glib-2.0/glib/gdate.h +0 -9
  118. package/include/glib-2.0/glib/gdatetime.h +33 -1
  119. package/include/glib-2.0/glib/gdir.h +5 -0
  120. package/include/glib-2.0/glib/ghmac.h +0 -9
  121. package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
  122. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  123. package/include/glib-2.0/glib/gmacros.h +1 -0
  124. package/include/glib-2.0/glib/gmessages.h +11 -0
  125. package/include/glib-2.0/glib/gpathbuf.h +0 -7
  126. package/include/glib-2.0/glib/gslice.h +2 -0
  127. package/include/glib-2.0/glib/gstdio.h +1 -1
  128. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  129. package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
  130. package/include/glib-2.0/glib/gtestutils.h +5 -0
  131. package/include/glib-2.0/glib/gthread.h +216 -3
  132. package/include/glib-2.0/glib/gunicode.h +12 -2
  133. package/include/glib-2.0/glib/gvarianttype.h +1 -10
  134. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  135. package/include/glib-2.0/glib/gwin32.h +4 -4
  136. package/include/glib-2.0/glib-unix.h +214 -0
  137. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  138. package/include/glib-2.0/gobject/gbinding.h +0 -8
  139. package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
  140. package/include/glib-2.0/gobject/gclosure.h +1 -9
  141. package/include/glib-2.0/gobject/genums.h +6 -6
  142. package/include/glib-2.0/gobject/glib-types.h +44 -0
  143. package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
  144. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  145. package/include/glib-2.0/gobject/gobject.h +1 -16
  146. package/include/glib-2.0/gobject/gparam.h +3 -12
  147. package/include/glib-2.0/gobject/gsignal.h +16 -6
  148. package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
  149. package/include/glib-2.0/gobject/gtype.h +53 -20
  150. package/include/glib-2.0/gobject/gtypemodule.h +0 -7
  151. package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
  152. package/include/glib-2.0/gobject/gvaluearray.h +0 -7
  153. package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
  154. package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
  155. package/include/hwy/aligned_allocator.h +171 -6
  156. package/include/hwy/base.h +1765 -543
  157. package/include/hwy/cache_control.h +24 -6
  158. package/include/hwy/detect_compiler_arch.h +23 -2
  159. package/include/hwy/detect_targets.h +56 -13
  160. package/include/hwy/foreach_target.h +24 -0
  161. package/include/hwy/highway.h +20 -3
  162. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  163. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  164. package/include/hwy/ops/emu128-inl.h +271 -196
  165. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  166. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  167. package/include/hwy/ops/rvv-inl.h +1043 -311
  168. package/include/hwy/ops/scalar-inl.h +189 -159
  169. package/include/hwy/ops/set_macros-inl.h +66 -6
  170. package/include/hwy/ops/shared-inl.h +175 -56
  171. package/include/hwy/ops/wasm_128-inl.h +153 -136
  172. package/include/hwy/ops/x86_128-inl.h +1647 -646
  173. package/include/hwy/ops/x86_256-inl.h +1003 -370
  174. package/include/hwy/ops/x86_512-inl.h +948 -353
  175. package/include/hwy/per_target.h +4 -0
  176. package/include/hwy/profiler.h +648 -0
  177. package/include/hwy/robust_statistics.h +2 -2
  178. package/include/hwy/targets.h +18 -11
  179. package/include/hwy/timer.h +11 -0
  180. package/include/lcms2.h +46 -7
  181. package/include/lcms2_plugin.h +4 -4
  182. package/include/libheif/heif_version.h +2 -2
  183. package/include/libpng16/png.h +32 -29
  184. package/include/libpng16/pngconf.h +2 -2
  185. package/include/libpng16/pnglibconf.h +7 -2
  186. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  187. package/include/libxml2/libxml/HTMLparser.h +23 -0
  188. package/include/libxml2/libxml/SAX.h +0 -2
  189. package/include/libxml2/libxml/SAX2.h +0 -2
  190. package/include/libxml2/libxml/c14n.h +0 -2
  191. package/include/libxml2/libxml/dict.h +1 -0
  192. package/include/libxml2/libxml/encoding.h +16 -14
  193. package/include/libxml2/libxml/entities.h +4 -0
  194. package/include/libxml2/libxml/globals.h +15 -503
  195. package/include/libxml2/libxml/hash.h +57 -61
  196. package/include/libxml2/libxml/nanoftp.h +2 -2
  197. package/include/libxml2/libxml/parser.h +137 -18
  198. package/include/libxml2/libxml/parserInternals.h +1 -0
  199. package/include/libxml2/libxml/relaxng.h +2 -1
  200. package/include/libxml2/libxml/schemasInternals.h +1 -0
  201. package/include/libxml2/libxml/schematron.h +1 -0
  202. package/include/libxml2/libxml/threads.h +4 -11
  203. package/include/libxml2/libxml/tree.h +68 -20
  204. package/include/libxml2/libxml/uri.h +2 -1
  205. package/include/libxml2/libxml/valid.h +2 -0
  206. package/include/libxml2/libxml/xmlIO.h +65 -13
  207. package/include/libxml2/libxml/xmlerror.h +37 -8
  208. package/include/libxml2/libxml/xmlmemory.h +37 -40
  209. package/include/libxml2/libxml/xmlreader.h +6 -0
  210. package/include/libxml2/libxml/xmlregexp.h +2 -9
  211. package/include/libxml2/libxml/xmlsave.h +9 -0
  212. package/include/libxml2/libxml/xmlschemas.h +3 -0
  213. package/include/libxml2/libxml/xmlversion.h +28 -43
  214. package/include/libxml2/libxml/xpath.h +1 -1
  215. package/include/libxml2/libxml/xpathInternals.h +2 -1
  216. package/include/libxml2/libxml/xpointer.h +5 -4
  217. package/include/pango-1.0/pango/pango-features.h +3 -3
  218. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  219. package/include/pixman-1/pixman-version.h +3 -3
  220. package/include/pixman-1/pixman.h +9 -2
  221. package/include/png.h +32 -29
  222. package/include/pngconf.h +2 -2
  223. package/include/pnglibconf.h +7 -2
  224. package/include/vips/connection.h +9 -3
  225. package/include/vips/util.h +0 -9
  226. package/include/vips/version.h +4 -4
  227. package/include/zconf.h +3 -0
  228. package/include/zlib.h +3 -3
  229. package/package.json +1 -1
  230. package/versions.json +15 -15
@@ -13,9 +13,15 @@
13
13
  // See the License for the specific language governing permissions and
14
14
  // limitations under the License.
15
15
 
16
- // 128-bit vectors for VSX
16
+ // 128-bit vectors for VSX/Z14
17
17
  // External include guard in highway.h - see comment there.
18
18
 
19
+ #if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
20
+ #define HWY_S390X_HAVE_Z14 1
21
+ #else
22
+ #define HWY_S390X_HAVE_Z14 0
23
+ #endif
24
+
19
25
  #pragma push_macro("vector")
20
26
  #pragma push_macro("pixel")
21
27
  #pragma push_macro("bool")
@@ -24,7 +30,11 @@
24
30
  #undef pixel
25
31
  #undef bool
26
32
 
33
+ #if HWY_S390X_HAVE_Z14
34
+ #include <vecintrin.h>
35
+ #else
27
36
  #include <altivec.h>
37
+ #endif
28
38
 
29
39
  #pragma pop_macro("vector")
30
40
  #pragma pop_macro("pixel")
@@ -37,20 +47,26 @@
37
47
  // This means we can only use POWER10-specific intrinsics in static dispatch
38
48
  // mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
39
49
  // On other compilers, the usual target check is sufficient.
40
- #if HWY_TARGET <= HWY_PPC9 && \
50
+ #if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
41
51
  (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
42
52
  #define HWY_PPC_HAVE_9 1
43
53
  #else
44
54
  #define HWY_PPC_HAVE_9 0
45
55
  #endif
46
56
 
47
- #if HWY_TARGET <= HWY_PPC10 && \
57
+ #if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
48
58
  (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
49
59
  #define HWY_PPC_HAVE_10 1
50
60
  #else
51
61
  #define HWY_PPC_HAVE_10 0
52
62
  #endif
53
63
 
64
+ #if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
65
+ #define HWY_S390X_HAVE_Z15 1
66
+ #else
67
+ #define HWY_S390X_HAVE_Z15 0
68
+ #endif
69
+
54
70
  HWY_BEFORE_NAMESPACE();
55
71
  namespace hwy {
56
72
  namespace HWY_NAMESPACE {
@@ -125,6 +141,9 @@ class Vec128 {
125
141
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
126
142
  return *this = (*this - other);
127
143
  }
144
+ HWY_INLINE Vec128& operator%=(const Vec128 other) {
145
+ return *this = (*this % other);
146
+ }
128
147
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
129
148
  return *this = (*this & other);
130
149
  }
@@ -215,6 +234,12 @@ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
215
234
  return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
216
235
  }
217
236
 
237
+ template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
238
+ HWY_API VFromD<D> Set(D d, TFromD<D> t) {
239
+ const RebindToUnsigned<decltype(d)> du;
240
+ return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
241
+ }
242
+
218
243
  // Returns a vector with uninitialized elements.
219
244
  template <class D>
220
245
  HWY_API VFromD<D> Undefined(D d) {
@@ -240,6 +265,58 @@ HWY_API T GetLane(Vec128<T, N> v) {
240
265
  return static_cast<T>(v.raw[0]);
241
266
  }
242
267
 
268
+ // ------------------------------ Dup128VecFromValues
269
+
270
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
271
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
272
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
273
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
274
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
275
+ TFromD<D> t11, TFromD<D> t12,
276
+ TFromD<D> t13, TFromD<D> t14,
277
+ TFromD<D> t15) {
278
+ const typename detail::Raw128<TFromD<D>>::type raw = {
279
+ t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
280
+ return VFromD<D>{raw};
281
+ }
282
+
283
+ template <class D, HWY_IF_UI16_D(D)>
284
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
285
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
286
+ TFromD<D> t5, TFromD<D> t6,
287
+ TFromD<D> t7) {
288
+ const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,
289
+ t4, t5, t6, t7};
290
+ return VFromD<D>{raw};
291
+ }
292
+
293
+ template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
294
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
295
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
296
+ TFromD<D> t5, TFromD<D> t6,
297
+ TFromD<D> t7) {
298
+ const RebindToUnsigned<decltype(d)> du;
299
+ return BitCast(
300
+ d, Dup128VecFromValues(
301
+ du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
302
+ BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
303
+ BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
304
+ BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
305
+ }
306
+
307
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
308
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
309
+ TFromD<D> t2, TFromD<D> t3) {
310
+ const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};
311
+ return VFromD<D>{raw};
312
+ }
313
+
314
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
315
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
316
+ const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
317
+ return VFromD<D>{raw};
318
+ }
319
+
243
320
  // ================================================== LOGICAL
244
321
 
245
322
  // ------------------------------ And
@@ -249,7 +326,11 @@ HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
249
326
  const DFromV<decltype(a)> d;
250
327
  const RebindToUnsigned<decltype(d)> du;
251
328
  using VU = VFromD<decltype(du)>;
329
+ #if HWY_S390X_HAVE_Z14
330
+ return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
331
+ #else
252
332
  return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
333
+ #endif
253
334
  }
254
335
 
255
336
  // ------------------------------ AndNot
@@ -271,7 +352,11 @@ HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
271
352
  const DFromV<decltype(a)> d;
272
353
  const RebindToUnsigned<decltype(d)> du;
273
354
  using VU = VFromD<decltype(du)>;
355
+ #if HWY_S390X_HAVE_Z14
356
+ return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
357
+ #else
274
358
  return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
359
+ #endif
275
360
  }
276
361
 
277
362
  // ------------------------------ Xor
@@ -281,7 +366,11 @@ HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
281
366
  const DFromV<decltype(a)> d;
282
367
  const RebindToUnsigned<decltype(d)> du;
283
368
  using VU = VFromD<decltype(du)>;
369
+ #if HWY_S390X_HAVE_Z14
370
+ return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
371
+ #else
284
372
  return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
373
+ #endif
285
374
  }
286
375
 
287
376
  // ------------------------------ Not
@@ -476,9 +565,21 @@ HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
476
565
 
477
566
  // ------------------------------ Neg
478
567
 
479
- template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
480
- HWY_INLINE Vec128<T, N> Neg(Vec128<T, N> v) {
568
+ template <typename T, size_t N, HWY_IF_SIGNED(T)>
569
+ HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
570
+ // If T is an signed integer type, use Zero(d) - v instead of vec_neg to
571
+ // avoid undefined behavior in the case where v[i] == LimitsMin<T>()
572
+ const DFromV<decltype(v)> d;
573
+ return Zero(d) - v;
574
+ }
575
+
576
+ template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
577
+ HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
578
+ #if HWY_S390X_HAVE_Z14
579
+ return Xor(v, SignBit(DFromV<decltype(v)>()));
580
+ #else
481
581
  return Vec128<T, N>{vec_neg(v.raw)};
582
+ #endif
482
583
  }
483
584
 
484
585
  template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
@@ -489,13 +590,40 @@ HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
489
590
  // ------------------------------ Abs
490
591
 
491
592
  // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
492
- template <class T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
593
+ template <class T, size_t N, HWY_IF_SIGNED(T)>
594
+ HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
595
+ // If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to
596
+ // avoid undefined behavior in the case where v[i] == LimitsMin<T>().
597
+ return Max(v, Neg(v));
598
+ }
599
+
600
+ template <class T, size_t N, HWY_IF_FLOAT3264(T)>
493
601
  HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
494
602
  return Vec128<T, N>{vec_abs(v.raw)};
495
603
  }
496
604
 
497
605
  // ------------------------------ CopySign
498
606
 
607
+ #if HWY_S390X_HAVE_Z14
608
+ template <class V>
609
+ HWY_API V CopySign(const V magn, const V sign) {
610
+ static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
611
+
612
+ const DFromV<decltype(magn)> d;
613
+ const auto msb = SignBit(d);
614
+
615
+ // Truth table for msb, magn, sign | bitwise msb ? sign : mag
616
+ // 0 0 0 | 0
617
+ // 0 0 1 | 0
618
+ // 0 1 0 | 1
619
+ // 0 1 1 | 1
620
+ // 1 0 0 | 0
621
+ // 1 0 1 | 1
622
+ // 1 1 0 | 0
623
+ // 1 1 1 | 1
624
+ return BitwiseIfThenElse(msb, sign, magn);
625
+ }
626
+ #else // VSX
499
627
  template <size_t N>
500
628
  HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
501
629
  Vec128<float, N> sign) {
@@ -525,6 +653,7 @@ HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
525
653
  return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
526
654
  #endif
527
655
  }
656
+ #endif // HWY_S390X_HAVE_Z14
528
657
 
529
658
  template <typename T, size_t N>
530
659
  HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
@@ -543,7 +672,7 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
543
672
  template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
544
673
  HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
545
674
  using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
546
- const LoadRaw* HWY_RESTRICT p = reinterpret_cast<const LoadRaw*>(aligned);
675
+ const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
547
676
  using ResultRaw = typename detail::Raw128<T>::type;
548
677
  return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
549
678
  }
@@ -598,19 +727,13 @@ HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
598
727
  // mask ? yes : 0
599
728
  template <typename T, size_t N>
600
729
  HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
601
- const DFromV<decltype(yes)> d;
602
- const RebindToUnsigned<decltype(d)> du;
603
- return BitCast(d,
604
- VFromD<decltype(du)>{vec_and(BitCast(du, yes).raw, mask.raw)});
730
+ return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
605
731
  }
606
732
 
607
733
  // mask ? 0 : no
608
734
  template <typename T, size_t N>
609
735
  HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
610
- const DFromV<decltype(no)> d;
611
- const RebindToUnsigned<decltype(d)> du;
612
- return BitCast(d,
613
- VFromD<decltype(du)>{vec_andc(BitCast(du, no).raw, mask.raw)});
736
+ return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
614
737
  }
615
738
 
616
739
  // ------------------------------ Mask logical
@@ -622,7 +745,11 @@ HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
622
745
 
623
746
  template <typename T, size_t N>
624
747
  HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
748
+ #if HWY_S390X_HAVE_Z14
749
+ return Mask128<T, N>{a.raw & b.raw};
750
+ #else
625
751
  return Mask128<T, N>{vec_and(a.raw, b.raw)};
752
+ #endif
626
753
  }
627
754
 
628
755
  template <typename T, size_t N>
@@ -632,12 +759,20 @@ HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
632
759
 
633
760
  template <typename T, size_t N>
634
761
  HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
762
+ #if HWY_S390X_HAVE_Z14
763
+ return Mask128<T, N>{a.raw | b.raw};
764
+ #else
635
765
  return Mask128<T, N>{vec_or(a.raw, b.raw)};
766
+ #endif
636
767
  }
637
768
 
638
769
  template <typename T, size_t N>
639
770
  HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
771
+ #if HWY_S390X_HAVE_Z14
772
+ return Mask128<T, N>{a.raw ^ b.raw};
773
+ #else
640
774
  return Mask128<T, N>{vec_xor(a.raw, b.raw)};
775
+ #endif
641
776
  }
642
777
 
643
778
  template <typename T, size_t N>
@@ -645,36 +780,24 @@ HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
645
780
  return Mask128<T, N>{vec_nor(a.raw, b.raw)};
646
781
  }
647
782
 
648
- // ------------------------------ BroadcastSignBit
649
-
650
- template <size_t N>
651
- HWY_API Vec128<int8_t, N> BroadcastSignBit(Vec128<int8_t, N> v) {
652
- return Vec128<int8_t, N>{
653
- vec_sra(v.raw, vec_splats(static_cast<unsigned char>(7)))};
654
- }
655
-
656
- template <size_t N>
657
- HWY_API Vec128<int16_t, N> BroadcastSignBit(Vec128<int16_t, N> v) {
658
- return Vec128<int16_t, N>{
659
- vec_sra(v.raw, vec_splats(static_cast<unsigned short>(15)))};
660
- }
661
-
662
- template <size_t N>
663
- HWY_API Vec128<int32_t, N> BroadcastSignBit(Vec128<int32_t, N> v) {
664
- return Vec128<int32_t, N>{vec_sra(v.raw, vec_splats(31u))};
665
- }
666
-
667
- template <size_t N>
668
- HWY_API Vec128<int64_t, N> BroadcastSignBit(Vec128<int64_t, N> v) {
669
- return Vec128<int64_t, N>{vec_sra(v.raw, vec_splats(63ULL))};
670
- }
671
-
672
783
  // ------------------------------ ShiftLeftSame
673
784
 
674
785
  template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
675
786
  HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
676
- using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
677
- return Vec128<T, N>{vec_sl(v.raw, vec_splats(static_cast<TU>(bits)))};
787
+ const DFromV<decltype(v)> d;
788
+ const RebindToUnsigned<decltype(d)> du;
789
+ using TU = TFromD<decltype(du)>;
790
+
791
+ #if HWY_S390X_HAVE_Z14
792
+ return BitCast(d,
793
+ VFromD<decltype(du)>{BitCast(du, v).raw
794
+ << Set(du, static_cast<TU>(bits)).raw});
795
+ #else
796
+ // Do an unsigned vec_sl operation to avoid undefined behavior
797
+ return BitCast(
798
+ d, VFromD<decltype(du)>{
799
+ vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});
800
+ #endif
678
801
  }
679
802
 
680
803
  // ------------------------------ ShiftRightSame
@@ -682,13 +805,22 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
682
805
  template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
683
806
  HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
684
807
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
808
+ #if HWY_S390X_HAVE_Z14
809
+ return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
810
+ #else
685
811
  return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
812
+ #endif
686
813
  }
687
814
 
688
815
  template <typename T, size_t N, HWY_IF_SIGNED(T)>
689
816
  HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
817
+ #if HWY_S390X_HAVE_Z14
818
+ using TI = typename detail::Raw128<T>::RawT;
819
+ return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
820
+ #else
690
821
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
691
822
  return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
823
+ #endif
692
824
  }
693
825
 
694
826
  // ------------------------------ ShiftLeft
@@ -707,6 +839,13 @@ HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
707
839
  return ShiftRightSame(v, kBits);
708
840
  }
709
841
 
842
+ // ------------------------------ BroadcastSignBit
843
+
844
+ template <typename T, size_t N, HWY_IF_SIGNED(T)>
845
+ HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
846
+ return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
847
+ }
848
+
710
849
  // ================================================== SWIZZLE (1)
711
850
 
712
851
  // ------------------------------ TableLookupBytes
@@ -1003,7 +1142,7 @@ HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) {
1003
1142
  return LoadU(d, p);
1004
1143
  }
1005
1144
 
1006
- #if HWY_PPC_HAVE_9
1145
+ #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
1007
1146
  #ifdef HWY_NATIVE_LOAD_N
1008
1147
  #undef HWY_NATIVE_LOAD_N
1009
1148
  #else
@@ -1027,11 +1166,20 @@ HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
1027
1166
  const size_t num_of_bytes_to_load =
1028
1167
  HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
1029
1168
  const Repartition<uint8_t, decltype(d)> du8;
1169
+ #if HWY_S390X_HAVE_Z14
1170
+ return (num_of_bytes_to_load > 0)
1171
+ ? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
1172
+ const_cast<unsigned char*>(
1173
+ reinterpret_cast<const unsigned char*>(p)),
1174
+ static_cast<unsigned>(num_of_bytes_to_load - 1))})
1175
+ : Zero(d);
1176
+ #else
1030
1177
  return BitCast(
1031
1178
  d,
1032
1179
  VFromD<decltype(du8)>{vec_xl_len(
1033
1180
  const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
1034
1181
  num_of_bytes_to_load)});
1182
+ #endif
1035
1183
  }
1036
1184
 
1037
1185
  template <class D, typename T = TFromD<D>>
@@ -1048,18 +1196,11 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
1048
1196
  }
1049
1197
  #endif
1050
1198
 
1051
- const size_t num_of_bytes_to_load =
1052
- HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
1053
- const Repartition<uint8_t, decltype(d)> du8;
1054
- const VFromD<D> v = BitCast(
1055
- d,
1056
- VFromD<decltype(du8)>{vec_xl_len(
1057
- const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
1058
- num_of_bytes_to_load)});
1059
- return IfThenElse(FirstN(d, max_lanes_to_load), v, no);
1199
+ return IfThenElse(FirstN(d, max_lanes_to_load),
1200
+ LoadN(d, p, max_lanes_to_load), no);
1060
1201
  }
1061
1202
 
1062
- #endif // HWY_PPC_HAVE_9
1203
+ #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
1063
1204
 
1064
1205
  // Returns a vector with lane i=[0, N) set to "first" + i.
1065
1206
  namespace detail {
@@ -1135,7 +1276,7 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
1135
1276
  template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
1136
1277
  HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
1137
1278
  using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
1138
- *reinterpret_cast<StoreRaw*>(aligned) = reinterpret_cast<StoreRaw>(v.raw);
1279
+ *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
1139
1280
  }
1140
1281
 
1141
1282
  template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
@@ -1159,7 +1300,7 @@ HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
1159
1300
  Store(v, d, p);
1160
1301
  }
1161
1302
 
1162
- #if HWY_PPC_HAVE_9
1303
+ #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
1163
1304
 
1164
1305
  #ifdef HWY_NATIVE_STORE_N
1165
1306
  #undef HWY_NATIVE_STORE_N
@@ -1185,8 +1326,15 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1185
1326
  const size_t num_of_bytes_to_store =
1186
1327
  HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
1187
1328
  const Repartition<uint8_t, decltype(d)> du8;
1329
+ #if HWY_S390X_HAVE_Z14
1330
+ if (num_of_bytes_to_store > 0) {
1331
+ vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
1332
+ static_cast<unsigned>(num_of_bytes_to_store - 1));
1333
+ }
1334
+ #else
1188
1335
  vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
1189
1336
  num_of_bytes_to_store);
1337
+ #endif
1190
1338
  }
1191
1339
  #endif
1192
1340
 
@@ -1210,164 +1358,97 @@ HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1210
1358
 
1211
1359
  // ================================================== ARITHMETIC
1212
1360
 
1361
+ namespace detail {
1362
+ // If TFromD<D> is an integer type, detail::RebindToUnsignedIfNotFloat<D>
1363
+ // rebinds D to MakeUnsigned<TFromD<D>>.
1364
+
1365
+ // Otherwise, if TFromD<D> is a floating-point type (including F16 and BF16),
1366
+ // detail::RebindToUnsignedIfNotFloat<D> is the same as D.
1367
+ template <class D>
1368
+ using RebindToUnsignedIfNotFloat =
1369
+ hwy::If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
1370
+ RebindToUnsigned<D>, D>;
1371
+ } // namespace detail
1372
+
1213
1373
  // ------------------------------ Addition
1214
1374
 
1215
1375
  template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1216
1376
  HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
1217
- return Vec128<T, N>{vec_add(a.raw, b.raw)};
1377
+ const DFromV<decltype(a)> d;
1378
+ const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
1379
+
1380
+ // If T is an integer type, do an unsigned vec_add to avoid undefined behavior
1381
+ #if HWY_S390X_HAVE_Z14
1382
+ return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
1383
+ BitCast(d_arith, b).raw});
1384
+ #else
1385
+ return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
1386
+ BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
1387
+ #endif
1218
1388
  }
1219
1389
 
1220
1390
  // ------------------------------ Subtraction
1221
1391
 
1222
1392
  template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1223
1393
  HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
1224
- return Vec128<T, N>{vec_sub(a.raw, b.raw)};
1225
- }
1226
-
1227
- // ------------------------------ SumsOf8
1228
- namespace detail {
1394
+ const DFromV<decltype(a)> d;
1395
+ const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
1229
1396
 
1230
- // Casts nominally int32_t result to D.
1231
- template <class D>
1232
- HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
1233
- __vector signed int b) {
1234
- const Repartition<int32_t, D> di32;
1235
- #ifdef __OPTIMIZE__
1236
- if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
1237
- const int64_t sum0 =
1238
- static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
1239
- static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
1240
- static_cast<int64_t>(b[0]);
1241
- const int64_t sum1 =
1242
- static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
1243
- static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
1244
- static_cast<int64_t>(b[1]);
1245
- const int64_t sum2 =
1246
- static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
1247
- static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
1248
- static_cast<int64_t>(b[2]);
1249
- const int64_t sum3 =
1250
- static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
1251
- static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
1252
- static_cast<int64_t>(b[3]);
1253
- const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
1254
- const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
1255
- const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
1256
- const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
1257
- using Raw = typename detail::Raw128<int32_t>::type;
1258
- return BitCast(
1259
- d,
1260
- VFromD<decltype(di32)>{Raw{
1261
- (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
1262
- : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
1263
- (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
1264
- : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
1265
- (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
1266
- : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
1267
- (sign3 == (sum3 >> 31))
1268
- ? static_cast<int32_t>(sum3)
1269
- : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
1270
- } else // NOLINT
1397
+ // If T is an integer type, do an unsigned vec_sub to avoid undefined behavior
1398
+ #if HWY_S390X_HAVE_Z14
1399
+ return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
1400
+ BitCast(d_arith, b).raw});
1401
+ #else
1402
+ return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
1403
+ BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
1271
1404
  #endif
1272
- {
1273
- return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
1274
- }
1275
1405
  }
1276
1406
 
1277
- // Casts nominally uint32_t result to D.
1278
- template <class D>
1279
- HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
1280
- __vector unsigned int b) {
1281
- const Repartition<uint32_t, D> du32;
1282
- #ifdef __OPTIMIZE__
1283
- if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
1284
- const uint64_t sum0 =
1285
- static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
1286
- static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
1287
- static_cast<uint64_t>(b[0]);
1288
- const uint64_t sum1 =
1289
- static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
1290
- static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
1291
- static_cast<uint64_t>(b[1]);
1292
- const uint64_t sum2 =
1293
- static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
1294
- static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
1295
- static_cast<uint64_t>(b[2]);
1296
- const uint64_t sum3 =
1297
- static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
1298
- static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
1299
- static_cast<uint64_t>(b[3]);
1300
- return BitCast(
1301
- d,
1302
- VFromD<decltype(du32)>{(__vector unsigned int){
1303
- static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
1304
- static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
1305
- static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
1306
- static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
1307
- : 0xFFFFFFFFu)}});
1308
- } else // NOLINT
1309
- #endif
1310
- {
1311
- return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
1312
- }
1407
+ // ------------------------------ SumsOf8
1408
+ template <class V, HWY_IF_U8(TFromV<V>)>
1409
+ HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
1410
+ return SumsOf2(SumsOf4(v));
1313
1411
  }
1314
1412
 
1315
- // Casts nominally int32_t result to D.
1316
- template <class D>
1317
- HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
1318
- __vector signed int b) {
1319
- const Repartition<int32_t, D> di32;
1320
- #ifdef __OPTIMIZE__
1321
- const Repartition<uint64_t, D> du64;
1322
- constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
1323
- if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
1324
- __builtin_constant_p(b[kDestLaneOffset + 2])) {
1325
- const int64_t sum0 = static_cast<int64_t>(a[0]) +
1326
- static_cast<int64_t>(a[1]) +
1327
- static_cast<int64_t>(b[kDestLaneOffset]);
1328
- const int64_t sum1 = static_cast<int64_t>(a[2]) +
1329
- static_cast<int64_t>(a[3]) +
1330
- static_cast<int64_t>(b[kDestLaneOffset + 2]);
1331
- const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
1332
- const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
1333
- return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
1334
- (sign0 == (sum0 >> 31))
1335
- ? static_cast<uint32_t>(sum0)
1336
- : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
1337
- (sign1 == (sum1 >> 31))
1338
- ? static_cast<uint32_t>(sum1)
1339
- : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
1340
- } else // NOLINT
1341
- #endif
1342
- {
1343
- __vector signed int sum;
1344
-
1345
- // Inline assembly is used for vsum2sws to avoid unnecessary shuffling
1346
- // on little-endian PowerPC targets as the result of the vsum2sws
1347
- // instruction will already be in the correct lanes on little-endian
1348
- // PowerPC targets.
1349
- __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
1413
+ template <class V, HWY_IF_I8(TFromV<V>)>
1414
+ HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
1415
+ #if HWY_S390X_HAVE_Z14
1416
+ const DFromV<decltype(v)> di8;
1417
+ const RebindToUnsigned<decltype(di8)> du8;
1418
+ const RepartitionToWideX3<decltype(di8)> di64;
1350
1419
 
1351
- return BitCast(d, VFromD<decltype(di32)>{sum});
1352
- }
1420
+ return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) +
1421
+ Set(di64, int64_t{-1024});
1422
+ #else
1423
+ return SumsOf2(SumsOf4(v));
1424
+ #endif
1353
1425
  }
1354
1426
 
1355
- } // namespace detail
1427
+ // ------------------------------ SaturatedAdd
1356
1428
 
1357
- template <size_t N>
1358
- HWY_API Vec128<uint64_t, N / 8> SumsOf8(Vec128<uint8_t, N> v) {
1359
- const Repartition<uint64_t, DFromV<decltype(v)>> du64;
1360
- const Repartition<int32_t, decltype(du64)> di32;
1361
- const RebindToUnsigned<decltype(di32)> du32;
1429
+ // Returns a + b clamped to the destination range.
1430
+
1431
+ #if HWY_S390X_HAVE_Z14
1432
+ // Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedAdd instructions unlike most
1433
+ // other integer SIMD instruction sets
1362
1434
 
1363
- return detail::AltivecVsum2sws(
1364
- du64, detail::AltivecVsum4ubs(di32, v.raw, Zero(du32).raw).raw,
1365
- Zero(di32).raw);
1435
+ template <typename T, size_t N, HWY_IF_UNSIGNED(T),
1436
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1437
+ HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
1438
+ return Add(a, Min(b, Not(a)));
1366
1439
  }
1367
1440
 
1368
- // ------------------------------ SaturatedAdd
1441
+ template <typename T, size_t N, HWY_IF_SIGNED(T),
1442
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1443
+ HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
1444
+ const DFromV<decltype(a)> d;
1445
+ const auto sum = Add(a, b);
1446
+ const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
1447
+ const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
1448
+ return IfNegativeThenElse(overflow_mask, overflow_result, sum);
1449
+ }
1369
1450
 
1370
- // Returns a + b clamped to the destination range.
1451
+ #else // VSX
1371
1452
 
1372
1453
  #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
1373
1454
  #undef HWY_NATIVE_I32_SATURATED_ADDSUB
@@ -1386,6 +1467,7 @@ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
1386
1467
  HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
1387
1468
  return Vec128<T, N>{vec_adds(a.raw, b.raw)};
1388
1469
  }
1470
+ #endif // HWY_S390X_HAVE_Z14
1389
1471
 
1390
1472
  #if HWY_PPC_HAVE_10
1391
1473
 
@@ -1412,14 +1494,37 @@ HWY_API V SaturatedAdd(V a, V b) {
1412
1494
 
1413
1495
  // Returns a - b clamped to the destination range.
1414
1496
 
1415
- template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
1416
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
1497
+ #if HWY_S390X_HAVE_Z14
1498
+ // Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedSub instructions unlike most
1499
+ // other integer SIMD instruction sets
1500
+
1501
+ template <typename T, size_t N, HWY_IF_UNSIGNED(T),
1502
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1417
1503
  HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
1418
- return Vec128<T, N>{vec_subs(a.raw, b.raw)};
1504
+ return Sub(a, Min(a, b));
1419
1505
  }
1420
1506
 
1421
- #if HWY_PPC_HAVE_10
1422
-
1507
+ template <typename T, size_t N, HWY_IF_SIGNED(T),
1508
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1509
+ HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
1510
+ const DFromV<decltype(a)> d;
1511
+ const auto diff = Sub(a, b);
1512
+ const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
1513
+ const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
1514
+ return IfNegativeThenElse(overflow_mask, overflow_result, diff);
1515
+ }
1516
+
1517
+ #else // VSX
1518
+
1519
+ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
1520
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
1521
+ HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
1522
+ return Vec128<T, N>{vec_subs(a.raw, b.raw)};
1523
+ }
1524
+ #endif // HWY_S390X_HAVE_Z14
1525
+
1526
+ #if HWY_PPC_HAVE_10
1527
+
1423
1528
  template <class V, HWY_IF_I64_D(DFromV<V>)>
1424
1529
  HWY_API V SaturatedSub(V a, V b) {
1425
1530
  const DFromV<decltype(a)> d;
@@ -1459,12 +1564,25 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
1459
1564
 
1460
1565
  template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
1461
1566
  HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
1462
- return Vec128<T, N>{a.raw * b.raw};
1567
+ const DFromV<decltype(a)> d;
1568
+ const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
1569
+
1570
+ // If T is an integer type, do an unsigned vec_mul to avoid undefined behavior
1571
+ #if HWY_S390X_HAVE_Z14
1572
+ return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
1573
+ BitCast(d_arith, b).raw});
1574
+ #else
1575
+ return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
1576
+ BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
1577
+ #endif
1463
1578
  }
1464
1579
 
1465
1580
  // Returns the upper 16 bits of a * b in each lane.
1466
1581
  template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_NOT_FLOAT(T)>
1467
1582
  HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
1583
+ #if HWY_S390X_HAVE_Z14
1584
+ return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
1585
+ #else
1468
1586
  const DFromV<decltype(a)> d;
1469
1587
  const RepartitionToWide<decltype(d)> dw;
1470
1588
  const VFromD<decltype(dw)> p1{vec_mule(a.raw, b.raw)};
@@ -1477,13 +1595,7 @@ HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
1477
1595
  8, 9, 24, 25, 12, 13, 28, 29};
1478
1596
  #endif
1479
1597
  return BitCast(d, VFromD<decltype(dw)>{vec_perm(p1.raw, p2.raw, kShuffle)});
1480
- }
1481
-
1482
- template <size_t N>
1483
- HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
1484
- Vec128<int16_t, N> b) {
1485
- const Vec128<int16_t> zero = Zero(Full128<int16_t>());
1486
- return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
1598
+ #endif
1487
1599
  }
1488
1600
 
1489
1601
  // Multiplies even lanes (0, 2, ..) and places the double-wide result into
@@ -1510,10 +1622,15 @@ HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
1510
1622
  template <int kBits, typename T, size_t N>
1511
1623
  HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
1512
1624
  const DFromV<decltype(v)> d;
1625
+ const RebindToUnsigned<decltype(d)> du;
1513
1626
  constexpr size_t kSizeInBits = sizeof(T) * 8;
1514
1627
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
1628
+
1515
1629
  if (kBits == 0) return v;
1516
- return Vec128<T, N>{vec_rl(v.raw, Set(d, kSizeInBits - kBits).raw)};
1630
+
1631
+ // Do an unsigned vec_rl operation to avoid undefined behavior
1632
+ return BitCast(d, VFromD<decltype(du)>{vec_rl(
1633
+ BitCast(du, v).raw, Set(du, kSizeInBits - kBits).raw)});
1517
1634
  }
1518
1635
 
1519
1636
  // ------------------------------ ZeroIfNegative (BroadcastSignBit)
@@ -1541,8 +1658,7 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
1541
1658
  BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
1542
1659
  #else
1543
1660
  const RebindToSigned<decltype(d)> di;
1544
- return IfThenElse(MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))),
1545
- yes, no);
1661
+ return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
1546
1662
  #endif
1547
1663
  }
1548
1664
 
@@ -1598,17 +1714,42 @@ HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
1598
1714
  #endif
1599
1715
 
1600
1716
  template <typename T, size_t N, HWY_IF_FLOAT(T)>
1601
- HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
1602
- return Vec128<T, N>{vec_re(v.raw)};
1717
+ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
1718
+ #if HWY_S390X_HAVE_Z14
1719
+ return Vec128<T, N>{a.raw / b.raw};
1720
+ #else
1721
+ return Vec128<T, N>{vec_div(a.raw, b.raw)};
1722
+ #endif
1603
1723
  }
1604
1724
 
1605
1725
  template <typename T, size_t N, HWY_IF_FLOAT(T)>
1606
- HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
1607
- return Vec128<T, N>{vec_div(a.raw, b.raw)};
1726
+ HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
1727
+ #if HWY_S390X_HAVE_Z14
1728
+ const DFromV<decltype(v)> d;
1729
+ return Set(d, T(1.0)) / v;
1730
+ #else
1731
+ return Vec128<T, N>{vec_re(v.raw)};
1732
+ #endif
1608
1733
  }
1609
1734
 
1610
1735
  // ------------------------------ Floating-point square root
1611
1736
 
1737
+ #if HWY_S390X_HAVE_Z14
1738
+ // Approximate reciprocal square root
1739
+ template <size_t N>
1740
+ HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
1741
+ const DFromV<decltype(v)> d;
1742
+ const RebindToUnsigned<decltype(d)> du;
1743
+
1744
+ const auto half = v * Set(d, 0.5f);
1745
+ // Initial guess based on log2(f)
1746
+ const auto guess = BitCast(
1747
+ d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
1748
+ // One Newton-Raphson iteration
1749
+ return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
1750
+ }
1751
+ #else // VSX
1752
+
1612
1753
  #ifdef HWY_NATIVE_F64_APPROX_RSQRT
1613
1754
  #undef HWY_NATIVE_F64_APPROX_RSQRT
1614
1755
  #else
@@ -1620,6 +1761,7 @@ template <class T, size_t N, HWY_IF_FLOAT(T)>
1620
1761
  HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
1621
1762
  return Vec128<T, N>{vec_rsqrte(v.raw)};
1622
1763
  }
1764
+ #endif // HWY_S390X_HAVE_Z14
1623
1765
 
1624
1766
  // Full precision square root
1625
1767
  template <class T, size_t N, HWY_IF_FLOAT(T)>
@@ -1668,6 +1810,167 @@ HWY_API V AbsDiff(const V a, const V b) {
1668
1810
 
1669
1811
  #endif // HWY_PPC_HAVE_9
1670
1812
 
1813
+ // ------------------------------ Integer Div for PPC10
1814
+ #if HWY_PPC_HAVE_10
1815
+ #ifdef HWY_NATIVE_INT_DIV
1816
+ #undef HWY_NATIVE_INT_DIV
1817
+ #else
1818
+ #define HWY_NATIVE_INT_DIV
1819
+ #endif
1820
+
1821
+ template <size_t N>
1822
+ HWY_API Vec128<int32_t, N> operator/(Vec128<int32_t, N> a,
1823
+ Vec128<int32_t, N> b) {
1824
+ // Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid
1825
+ // undefined behavior if b[i] == 0 or
1826
+ // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
1827
+
1828
+ // Clang will also optimize out I32 vec_div on PPC10 if optimizations are
1829
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1830
+ // lanes of a partial vector)
1831
+ __vector signed int raw_result;
1832
+ __asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1833
+ return Vec128<int32_t, N>{raw_result};
1834
+ }
1835
+
1836
+ template <size_t N>
1837
+ HWY_API Vec128<uint32_t, N> operator/(Vec128<uint32_t, N> a,
1838
+ Vec128<uint32_t, N> b) {
1839
+ // Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid
1840
+ // undefined behavior if b[i] == 0
1841
+
1842
+ // Clang will also optimize out U32 vec_div on PPC10 if optimizations are
1843
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1844
+ // lanes of a partial vector)
1845
+ __vector unsigned int raw_result;
1846
+ __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1847
+ return Vec128<uint32_t, N>{raw_result};
1848
+ }
1849
+
1850
+ template <size_t N>
1851
+ HWY_API Vec128<int64_t, N> operator/(Vec128<int64_t, N> a,
1852
+ Vec128<int64_t, N> b) {
1853
+ // Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid
1854
+ // undefined behavior if b[i] == 0 or
1855
+ // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
1856
+
1857
+ // Clang will also optimize out I64 vec_div on PPC10 if optimizations are
1858
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1859
+ // lanes of a partial vector)
1860
+ __vector signed long long raw_result;
1861
+ __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1862
+ return Vec128<int64_t, N>{raw_result};
1863
+ }
1864
+
1865
+ template <size_t N>
1866
+ HWY_API Vec128<uint64_t, N> operator/(Vec128<uint64_t, N> a,
1867
+ Vec128<uint64_t, N> b) {
1868
+ // Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid
1869
+ // undefined behavior if b[i] == 0
1870
+
1871
+ // Clang will also optimize out U64 vec_div on PPC10 if optimizations are
1872
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1873
+ // lanes of a partial vector)
1874
+ __vector unsigned long long raw_result;
1875
+ __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1876
+ return Vec128<uint64_t, N>{raw_result};
1877
+ }
1878
+
1879
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
1880
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1881
+ HWY_API Vec128<T> operator/(Vec128<T> a, Vec128<T> b) {
1882
+ const DFromV<decltype(a)> d;
1883
+ const RepartitionToWide<decltype(d)> dw;
1884
+ return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
1885
+ PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
1886
+ }
1887
+
1888
+ template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
1889
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
1890
+ HWY_IF_V_SIZE_LE(T, N, 8)>
1891
+ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
1892
+ const DFromV<decltype(a)> d;
1893
+ const Rebind<MakeWide<T>, decltype(d)> dw;
1894
+ return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
1895
+ }
1896
+
1897
+ template <size_t N>
1898
+ HWY_API Vec128<int32_t, N> operator%(Vec128<int32_t, N> a,
1899
+ Vec128<int32_t, N> b) {
1900
+ // Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid
1901
+ // undefined behavior if b[i] == 0 or
1902
+ // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
1903
+
1904
+ // Clang will also optimize out I32 vec_mod on PPC10 if optimizations are
1905
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1906
+ // lanes of a partial vector)
1907
+ __vector signed int raw_result;
1908
+ __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1909
+ return Vec128<int32_t, N>{raw_result};
1910
+ }
1911
+
1912
+ template <size_t N>
1913
+ HWY_API Vec128<uint32_t, N> operator%(Vec128<uint32_t, N> a,
1914
+ Vec128<uint32_t, N> b) {
1915
+ // Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid
1916
+ // undefined behavior if b[i] == 0
1917
+
1918
+ // Clang will also optimize out U32 vec_mod on PPC10 if optimizations are
1919
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1920
+ // lanes of a partial vector)
1921
+ __vector unsigned int raw_result;
1922
+ __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1923
+ return Vec128<uint32_t, N>{raw_result};
1924
+ }
1925
+
1926
+ template <size_t N>
1927
+ HWY_API Vec128<int64_t, N> operator%(Vec128<int64_t, N> a,
1928
+ Vec128<int64_t, N> b) {
1929
+ // Inline assembly is used instead of vec_mod for I64 Mod on PPC10 to avoid
1930
+ // undefined behavior if b[i] == 0 or
1931
+ // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
1932
+
1933
+ // Clang will also optimize out I64 vec_mod on PPC10 if optimizations are
1934
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1935
+ // lanes of a partial vector)
1936
+ __vector signed long long raw_result;
1937
+ __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1938
+ return Vec128<int64_t, N>{raw_result};
1939
+ }
1940
+
1941
+ template <size_t N>
1942
+ HWY_API Vec128<uint64_t, N> operator%(Vec128<uint64_t, N> a,
1943
+ Vec128<uint64_t, N> b) {
1944
+ // Inline assembly is used instead of vec_mod for U64 Mod on PPC10 to avoid
1945
+ // undefined behavior if b[i] == 0
1946
+
1947
+ // Clang will also optimize out U64 vec_mod on PPC10 if optimizations are
1948
+ // enabled and any of the lanes of b are known to be zero (even in the unused
1949
+ // lanes of a partial vector)
1950
+ __vector unsigned long long raw_result;
1951
+ __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
1952
+ return Vec128<uint64_t, N>{raw_result};
1953
+ }
1954
+
1955
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
1956
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
1957
+ HWY_API Vec128<T> operator%(Vec128<T> a, Vec128<T> b) {
1958
+ const DFromV<decltype(a)> d;
1959
+ const RepartitionToWide<decltype(d)> dw;
1960
+ return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b),
1961
+ PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b));
1962
+ }
1963
+
1964
+ template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
1965
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
1966
+ HWY_IF_V_SIZE_LE(T, N, 8)>
1967
+ HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
1968
+ const DFromV<decltype(a)> d;
1969
+ const Rebind<MakeWide<T>, decltype(d)> dw;
1970
+ return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b));
1971
+ }
1972
+ #endif
1973
+
1671
1974
  // ================================================== MEMORY (3)
1672
1975
 
1673
1976
  // ------------------------------ Non-temporal stores
@@ -1800,7 +2103,7 @@ template <typename T, size_t N>
1800
2103
  HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
1801
2104
  #if HWY_IS_LITTLE_ENDIAN
1802
2105
  typename detail::Raw128<T>::type raw_result = v.raw;
1803
- raw_result[i] = t;
2106
+ raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
1804
2107
  return Vec128<T, N>{raw_result};
1805
2108
  #else
1806
2109
  // On ppc64be without this, mul_test fails, but swizzle_test passes.
@@ -2070,7 +2373,7 @@ HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
2070
2373
 
2071
2374
  // ------------------------------- ReverseLaneBytes
2072
2375
 
2073
- #if HWY_PPC_HAVE_9 && \
2376
+ #if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
2074
2377
  (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)
2075
2378
 
2076
2379
  // Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
@@ -2111,7 +2414,7 @@ HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
2111
2414
  return BitCast(d, ReverseLaneBytes(BitCast(du64, v)));
2112
2415
  }
2113
2416
 
2114
- #endif // HWY_PPC_HAVE_9
2417
+ #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
2115
2418
 
2116
2419
  template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
2117
2420
  HWY_API Vec16<T> Reverse(D d, Vec16<T> v) {
@@ -2268,11 +2571,15 @@ HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
2268
2571
  Set(Full128<uint32_t>(),
2269
2572
  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
2270
2573
 
2574
+ #if HWY_S390X_HAVE_Z14
2575
+ return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
2576
+ #else // VSX
2271
2577
  #if HWY_IS_LITTLE_ENDIAN
2272
2578
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
2273
2579
  #else
2274
2580
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
2275
- #endif
2581
+ #endif // HWY_IS_LITTLE_ENDIAN
2582
+ #endif // HWY_S390X_HAVE_Z14
2276
2583
  }
2277
2584
 
2278
2585
  // ------------------------------ SlideDownLanes
@@ -2300,11 +2607,15 @@ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
2300
2607
  Set(Full128<uint32_t>(),
2301
2608
  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
2302
2609
 
2610
+ #if HWY_S390X_HAVE_Z14
2611
+ return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
2612
+ #else // VSX
2303
2613
  #if HWY_IS_LITTLE_ENDIAN
2304
2614
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
2305
2615
  #else
2306
2616
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
2307
- #endif
2617
+ #endif // HWY_IS_LITTLE_ENDIAN
2618
+ #endif // HWY_S390X_HAVE_Z14
2308
2619
  }
2309
2620
 
2310
2621
  // ================================================== COMBINE
@@ -2637,7 +2948,15 @@ HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
2637
2948
 
2638
2949
  template <typename T, HWY_IF_T_SIZE(T, 4)>
2639
2950
  HWY_API Vec128<T> DupEven(Vec128<T> v) {
2951
+ #if HWY_S390X_HAVE_Z14
2952
+ const DFromV<decltype(v)> d;
2953
+ const Repartition<uint8_t, decltype(d)> du8;
2954
+ return TableLookupBytes(
2955
+ v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
2956
+ 11, 8, 9, 10, 11)));
2957
+ #else
2640
2958
  return Vec128<T>{vec_mergee(v.raw, v.raw)};
2959
+ #endif
2641
2960
  }
2642
2961
 
2643
2962
  // ------------------------------ DupOdd (InterleaveUpper)
@@ -2662,7 +2981,15 @@ HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
2662
2981
 
2663
2982
  template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
2664
2983
  HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
2984
+ #if HWY_S390X_HAVE_Z14
2985
+ const DFromV<decltype(v)> d;
2986
+ const Repartition<uint8_t, decltype(d)> du8;
2987
+ return TableLookupBytes(
2988
+ v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
2989
+ 15, 12, 13, 14, 15)));
2990
+ #else
2665
2991
  return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
2992
+ #endif
2666
2993
  }
2667
2994
 
2668
2995
  template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
@@ -2719,13 +3046,51 @@ HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
2719
3046
  return v;
2720
3047
  }
2721
3048
 
3049
+ // ------------------------------ MulFixedPoint15 (OddEven)
3050
+
3051
+ #if HWY_S390X_HAVE_Z14
3052
+ HWY_API Vec16<int16_t> MulFixedPoint15(Vec16<int16_t> a, Vec16<int16_t> b) {
3053
+ const DFromV<decltype(a)> di16;
3054
+ const RepartitionToWide<decltype(di16)> di32;
3055
+
3056
+ const auto round_up_incr = Set(di32, 0x4000);
3057
+ const auto i32_product = MulEven(a, b) + round_up_incr;
3058
+
3059
+ return ResizeBitCast(di16, ShiftLeft<1>(i32_product));
3060
+ }
3061
+ template <size_t N, HWY_IF_LANES_GT(N, 1)>
3062
+ HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
3063
+ Vec128<int16_t, N> b) {
3064
+ const DFromV<decltype(a)> di16;
3065
+ const RepartitionToWide<decltype(di16)> di32;
3066
+
3067
+ const auto round_up_incr = Set(di32, 0x4000);
3068
+ const auto even_product = MulEven(a, b) + round_up_incr;
3069
+ const auto odd_product = MulOdd(a, b) + round_up_incr;
3070
+
3071
+ return OddEven(BitCast(di16, ShiftRight<15>(odd_product)),
3072
+ BitCast(di16, ShiftLeft<1>(even_product)));
3073
+ }
3074
+ #else
3075
+ template <size_t N>
3076
+ HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
3077
+ Vec128<int16_t, N> b) {
3078
+ const Vec128<int16_t> zero = Zero(Full128<int16_t>());
3079
+ return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
3080
+ }
3081
+ #endif
3082
+
2722
3083
  // ------------------------------ Shl
2723
3084
 
2724
3085
  namespace detail {
2725
3086
  template <typename T, size_t N>
2726
3087
  HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
2727
3088
  Vec128<T, N> bits) {
3089
+ #if HWY_S390X_HAVE_Z14
3090
+ return Vec128<T, N>{v.raw << bits.raw};
3091
+ #else
2728
3092
  return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
3093
+ #endif
2729
3094
  }
2730
3095
 
2731
3096
  // Signed left shift is the same as unsigned.
@@ -2751,15 +3116,23 @@ namespace detail {
2751
3116
  template <typename T, size_t N>
2752
3117
  HWY_API Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
2753
3118
  Vec128<T, N> bits) {
3119
+ #if HWY_S390X_HAVE_Z14
3120
+ return Vec128<T, N>{v.raw >> bits.raw};
3121
+ #else
2754
3122
  return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
3123
+ #endif
2755
3124
  }
2756
3125
 
2757
3126
  template <typename T, size_t N>
2758
3127
  HWY_API Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v,
2759
3128
  Vec128<T, N> bits) {
3129
+ #if HWY_S390X_HAVE_Z14
3130
+ return Vec128<T, N>{v.raw >> bits.raw};
3131
+ #else
2760
3132
  const DFromV<decltype(v)> di;
2761
3133
  const RebindToUnsigned<decltype(di)> du;
2762
3134
  return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
3135
+ #endif
2763
3136
  }
2764
3137
 
2765
3138
  } // namespace detail
@@ -2834,7 +3207,12 @@ HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
2834
3207
  template <class D32, HWY_IF_UI32_D(D32),
2835
3208
  class V16 = VFromD<RepartitionToNarrow<D32>>>
2836
3209
  HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
3210
+ #if HWY_S390X_HAVE_Z14
3211
+ (void)d32;
3212
+ return MulEven(a, b) + MulOdd(a, b);
3213
+ #else
2837
3214
  return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(d32).raw)};
3215
+ #endif
2838
3216
  }
2839
3217
 
2840
3218
  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
@@ -2861,10 +3239,14 @@ HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
2861
3239
  // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
2862
3240
  template <class D32, HWY_IF_UI32_D(D32),
2863
3241
  class V16 = VFromD<RepartitionToNarrow<D32>>>
2864
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /* tag */, V16 a, V16 b,
3242
+ HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*d32*/, V16 a, V16 b,
2865
3243
  VFromD<D32> sum0,
2866
3244
  VFromD<D32>& /*sum1*/) {
3245
+ #if HWY_S390X_HAVE_Z14
3246
+ return MulEven(a, b) + MulOdd(a, b) + sum0;
3247
+ #else
2867
3248
  return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
3249
+ #endif
2868
3250
  }
2869
3251
 
2870
3252
  // ------------------------------ RearrangeToOddPlusEven
@@ -2886,6 +3268,8 @@ HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
2886
3268
  }
2887
3269
 
2888
3270
  // ------------------------------ SumOfMulQuadAccumulate
3271
+ #if !HWY_S390X_HAVE_Z14
3272
+
2889
3273
  #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
2890
3274
  #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
2891
3275
  #else
@@ -2925,11 +3309,12 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
2925
3309
 
2926
3310
  const auto result_sum_0 =
2927
3311
  SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum);
2928
- const auto result_sum_1 = ShiftLeft<8>(detail::AltivecVsum4sbs(
2929
- di32, And(b, BroadcastSignBit(a)).raw, Zero(di32).raw));
3312
+ const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a))));
2930
3313
  return result_sum_0 - result_sum_1;
2931
3314
  }
2932
3315
 
3316
+ #endif // !HWY_S390X_HAVE_Z14
3317
+
2933
3318
  // ================================================== CONVERT
2934
3319
 
2935
3320
  // ------------------------------ Promotions (part w/ narrow lanes -> full)
@@ -3018,29 +3403,59 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
3018
3403
  }
3019
3404
 
3020
3405
  template <class D, HWY_IF_F64_D(D)>
3021
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
3406
+ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<int32_t, D>> v) {
3407
+ #if HWY_S390X_HAVE_Z14
3408
+ const RebindToSigned<decltype(df64)> di64;
3409
+ return ConvertTo(df64, PromoteTo(di64, v));
3410
+ #else // VSX
3411
+ (void)df64;
3022
3412
  const __vector signed int raw_v = InterleaveLower(v, v).raw;
3023
3413
  #if HWY_IS_LITTLE_ENDIAN
3024
3414
  return VFromD<D>{vec_doubleo(raw_v)};
3025
3415
  #else
3026
3416
  return VFromD<D>{vec_doublee(raw_v)};
3027
3417
  #endif
3418
+ #endif // HWY_S390X_HAVE_Z14
3028
3419
  }
3029
3420
 
3030
3421
  template <class D, HWY_IF_F64_D(D)>
3031
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
3422
+ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
3423
+ #if HWY_S390X_HAVE_Z14
3424
+ const RebindToUnsigned<decltype(df64)> du64;
3425
+ return ConvertTo(df64, PromoteTo(du64, v));
3426
+ #else // VSX
3427
+ (void)df64;
3032
3428
  const __vector unsigned int raw_v = InterleaveLower(v, v).raw;
3033
3429
  #if HWY_IS_LITTLE_ENDIAN
3034
3430
  return VFromD<D>{vec_doubleo(raw_v)};
3035
3431
  #else
3036
3432
  return VFromD<D>{vec_doublee(raw_v)};
3037
3433
  #endif
3434
+ #endif // HWY_S390X_HAVE_Z14
3435
+ }
3436
+
3437
+ #if !HWY_S390X_HAVE_Z14
3438
+ namespace detail {
3439
+
3440
+ template <class V>
3441
+ static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
3442
+ #if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)
3443
+ // Workaround for QEMU 7/8 VSX float to int conversion bug
3444
+ return IfThenElseZero(v == v, v);
3445
+ #else
3446
+ return v;
3447
+ #endif
3038
3448
  }
3039
3449
 
3450
+ } // namespace detail
3451
+ #endif // !HWY_S390X_HAVE_Z14
3452
+
3040
3453
  template <class D, HWY_IF_I64_D(D)>
3041
3454
  HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
3042
- #if HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)
3043
- const __vector float raw_v = InterleaveLower(v, v).raw;
3455
+ #if !HWY_S390X_HAVE_Z14 && \
3456
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
3457
+ const __vector float raw_v =
3458
+ detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
3044
3459
  return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
3045
3460
  #else
3046
3461
  const RebindToFloat<decltype(di64)> df64;
@@ -3050,8 +3465,10 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
3050
3465
 
3051
3466
  template <class D, HWY_IF_U64_D(D)>
3052
3467
  HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
3053
- #if HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)
3054
- const __vector float raw_v = InterleaveLower(v, v).raw;
3468
+ #if !HWY_S390X_HAVE_Z14 && \
3469
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
3470
+ const __vector float raw_v =
3471
+ detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
3055
3472
  return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
3056
3473
  __builtin_vsx_xvcvspuxds(raw_v))};
3057
3474
  #else
@@ -3123,7 +3540,12 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<float> v) {
3123
3540
  }
3124
3541
 
3125
3542
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3126
- HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<int32_t> v) {
3543
+ HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<int32_t> v) {
3544
+ #if HWY_S390X_HAVE_Z14
3545
+ const RebindToSigned<decltype(df64)> di64;
3546
+ return ConvertTo(df64, PromoteUpperTo(di64, v));
3547
+ #else // VSX
3548
+ (void)df64;
3127
3549
  const __vector signed int raw_v =
3128
3550
  InterleaveUpper(Full128<int32_t>(), v, v).raw;
3129
3551
  #if HWY_IS_LITTLE_ENDIAN
@@ -3131,10 +3553,16 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<int32_t> v) {
3131
3553
  #else
3132
3554
  return VFromD<D>{vec_doublee(raw_v)};
3133
3555
  #endif
3556
+ #endif // HWY_S390X_HAVE_Z14
3134
3557
  }
3135
3558
 
3136
3559
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
3137
- HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<uint32_t> v) {
3560
+ HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
3561
+ #if HWY_S390X_HAVE_Z14
3562
+ const RebindToUnsigned<decltype(df64)> du64;
3563
+ return ConvertTo(df64, PromoteUpperTo(du64, v));
3564
+ #else // VSX
3565
+ (void)df64;
3138
3566
  const __vector unsigned int raw_v =
3139
3567
  InterleaveUpper(Full128<uint32_t>(), v, v).raw;
3140
3568
  #if HWY_IS_LITTLE_ENDIAN
@@ -3142,12 +3570,16 @@ HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<uint32_t> v) {
3142
3570
  #else
3143
3571
  return VFromD<D>{vec_doublee(raw_v)};
3144
3572
  #endif
3573
+ #endif // HWY_S390X_HAVE_Z14
3145
3574
  }
3146
3575
 
3147
3576
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
3148
3577
  HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
3149
- #if HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)
3150
- const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
3578
+ #if !HWY_S390X_HAVE_Z14 && \
3579
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
3580
+ const __vector float raw_v =
3581
+ detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
3582
+ .raw;
3151
3583
  return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
3152
3584
  #else
3153
3585
  const RebindToFloat<decltype(di64)> df64;
@@ -3157,8 +3589,11 @@ HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
3157
3589
 
3158
3590
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
3159
3591
  HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
3160
- #if HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)
3161
- const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
3592
+ #if !HWY_S390X_HAVE_Z14 && \
3593
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
3594
+ const __vector float raw_v =
3595
+ detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
3596
+ .raw;
3162
3597
  return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
3163
3598
  __builtin_vsx_xvcvspuxds(raw_v))};
3164
3599
  #else
@@ -3174,6 +3609,219 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
3174
3609
  return PromoteTo(d, UpperHalf(dh, v));
3175
3610
  }
3176
3611
 
3612
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
3613
+
3614
+ namespace detail {
3615
+
3616
+ // Signed to Signed PromoteEvenTo/PromoteOddTo for PPC9/PPC10
3617
+ #if HWY_PPC_HAVE_9 && \
3618
+ (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)
3619
+
3620
+ #if HWY_IS_LITTLE_ENDIAN
3621
+ template <class D, class V>
3622
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
3623
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
3624
+ hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
3625
+ V v) {
3626
+ return VFromD<D>{vec_signexti(v.raw)};
3627
+ }
3628
+ template <class D, class V>
3629
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
3630
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3631
+ hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
3632
+ V v) {
3633
+ return VFromD<D>{vec_signextll(v.raw)};
3634
+ }
3635
+ #else
3636
+ template <class D, class V>
3637
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
3638
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
3639
+ hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
3640
+ V v) {
3641
+ return VFromD<D>{vec_signexti(v.raw)};
3642
+ }
3643
+ template <class D, class V>
3644
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
3645
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3646
+ hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
3647
+ V v) {
3648
+ return VFromD<D>{vec_signextll(v.raw)};
3649
+ }
3650
+ #endif // HWY_IS_LITTLE_ENDIAN
3651
+
3652
+ #endif // HWY_PPC_HAVE_9
3653
+
3654
+ // I32/U32/F32->F64 PromoteEvenTo
3655
+ #if HWY_S390X_HAVE_Z14
3656
+ template <class D, class V>
3657
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
3658
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3659
+ hwy::FloatTag /*from_type_tag*/, D /*d_to*/,
3660
+ V v) {
3661
+ return VFromD<D>{vec_doublee(v.raw)};
3662
+ }
3663
+ template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
3664
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
3665
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3666
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
3667
+ const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
3668
+ return ConvertTo(d_to, PromoteEvenTo(dw, v));
3669
+ }
3670
+ #else // VSX
3671
+ template <class D, class V, class FromTypeTag>
3672
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
3673
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3674
+ FromTypeTag /*from_type_tag*/, D /*d_to*/,
3675
+ V v) {
3676
+ return VFromD<D>{vec_doublee(v.raw)};
3677
+ }
3678
+ #endif // HWY_S390X_HAVE_Z14
3679
+
3680
+ // F32->I64 PromoteEvenTo
3681
+ template <class D, class V>
3682
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
3683
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3684
+ hwy::FloatTag /*from_type_tag*/, D d_to,
3685
+ V v) {
3686
+ #if !HWY_S390X_HAVE_Z14 && \
3687
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
3688
+ (void)d_to;
3689
+ const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
3690
+ #if HWY_IS_LITTLE_ENDIAN
3691
+ // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
3692
+ // on little-endian PPC, and the vec_sld operation below will shift the even
3693
+ // lanes of normalized_v into the odd lanes.
3694
+ return VFromD<D>{
3695
+ __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
3696
+ #else
3697
+ // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
3698
+ // on big-endian PPC.
3699
+ return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
3700
+ #endif
3701
+ #else
3702
+ const RebindToFloat<decltype(d_to)> df64;
3703
+ return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
3704
+ hwy::FloatTag(), df64, v));
3705
+ #endif
3706
+ }
3707
+
3708
+ // F32->U64 PromoteEvenTo
3709
+ template <class D, class V>
3710
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
3711
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3712
+ hwy::FloatTag /*from_type_tag*/, D d_to,
3713
+ V v) {
3714
+ #if !HWY_S390X_HAVE_Z14 && \
3715
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
3716
+ (void)d_to;
3717
+ const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
3718
+ #if HWY_IS_LITTLE_ENDIAN
3719
+ // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
3720
+ // on little-endian PPC, and the vec_sld operation below will shift the even
3721
+ // lanes of normalized_v into the odd lanes.
3722
+ return VFromD<D>{
3723
+ reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
3724
+ vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
3725
+ #else
3726
+ // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
3727
+ // on big-endian PPC.
3728
+ return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
3729
+ __builtin_vsx_xvcvspuxds(normalized_v.raw))};
3730
+ #endif
3731
+ #else
3732
+ const RebindToFloat<decltype(d_to)> df64;
3733
+ return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
3734
+ hwy::FloatTag(), df64, v));
3735
+ #endif
3736
+ }
3737
+
3738
+ // I32/U32/F32->F64 PromoteOddTo
3739
+ #if HWY_S390X_HAVE_Z14
3740
+ template <class D, class V>
3741
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
3742
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3743
+ hwy::FloatTag /*from_type_tag*/, D d_to,
3744
+ V v) {
3745
+ return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(),
3746
+ d_to, V{vec_sld(v.raw, v.raw, 4)});
3747
+ }
3748
+ template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
3749
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
3750
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3751
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
3752
+ const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
3753
+ return ConvertTo(d_to, PromoteOddTo(dw, v));
3754
+ }
3755
+ #else
3756
+ template <class D, class V, class FromTypeTag>
3757
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
3758
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3759
+ FromTypeTag /*from_type_tag*/, D /*d_to*/,
3760
+ V v) {
3761
+ return VFromD<D>{vec_doubleo(v.raw)};
3762
+ }
3763
+ #endif
3764
+
3765
+ // F32->I64 PromoteOddTo
3766
+ template <class D, class V>
3767
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
3768
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3769
+ hwy::FloatTag /*from_type_tag*/, D d_to,
3770
+ V v) {
3771
+ #if !HWY_S390X_HAVE_Z14 && \
3772
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
3773
+ (void)d_to;
3774
+ const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
3775
+ #if HWY_IS_LITTLE_ENDIAN
3776
+ // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
3777
+ // on little-endian PPC
3778
+ return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
3779
+ #else
3780
+ // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
3781
+ // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
3782
+ // of normalized_v into the even lanes.
3783
+ return VFromD<D>{
3784
+ __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
3785
+ #endif
3786
+ #else
3787
+ const RebindToFloat<decltype(d_to)> df64;
3788
+ return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
3789
+ hwy::FloatTag(), df64, v));
3790
+ #endif
3791
+ }
3792
+
3793
+ // F32->U64 PromoteOddTo
3794
+ template <class D, class V>
3795
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
3796
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
3797
+ hwy::FloatTag /*from_type_tag*/, D d_to,
3798
+ V v) {
3799
+ #if !HWY_S390X_HAVE_Z14 && \
3800
+ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
3801
+ (void)d_to;
3802
+ const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
3803
+ #if HWY_IS_LITTLE_ENDIAN
3804
+ // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
3805
+ // on little-endian PPC
3806
+ return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
3807
+ __builtin_vsx_xvcvspuxds(normalized_v.raw))};
3808
+ #else
3809
+ // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
3810
+ // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
3811
+ // of normalized_v into the even lanes.
3812
+ return VFromD<D>{
3813
+ reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
3814
+ vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
3815
+ #endif
3816
+ #else
3817
+ const RebindToFloat<decltype(d_to)> df64;
3818
+ return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
3819
+ hwy::FloatTag(), df64, v));
3820
+ #endif
3821
+ }
3822
+
3823
+ } // namespace detail
3824
+
3177
3825
  // ------------------------------ Demotions (full -> part w/ narrow lanes)
3178
3826
 
3179
3827
  template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
@@ -3254,6 +3902,101 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
3254
3902
 
3255
3903
  #endif // HWY_PPC_HAVE_9
3256
3904
 
3905
+ #if HWY_PPC_HAVE_9
3906
+
3907
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
3908
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
3909
+ #else
3910
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
3911
+ #endif
3912
+
3913
+ namespace detail {
3914
+
3915
+ // On big-endian PPC9, VsxXscvdphp converts vf64[0] to a F16, returned as an U64
3916
+ // vector with the resulting F16 bits in the lower 16 bits of U64 lane 0
3917
+
3918
+ // On little-endian PPC9, VsxXscvdphp converts vf64[1] to a F16, returned as
3919
+ // an U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1
3920
+ static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
3921
+ // Inline assembly is needed for the PPC9 xscvdphp instruction as there is
3922
+ // currently no intrinsic available for the PPC9 xscvdphp instruction
3923
+ __vector unsigned long long raw_result;
3924
+ __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
3925
+ return Vec128<uint64_t>{raw_result};
3926
+ }
3927
+
3928
+ } // namespace detail
3929
+
3930
+ template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
3931
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
3932
+ const RebindToUnsigned<decltype(df16)> du16;
3933
+ const Rebind<uint64_t, decltype(df16)> du64;
3934
+
3935
+ const Full128<double> df64_full;
3936
+ #if HWY_IS_LITTLE_ENDIAN
3937
+ const auto bits16_as_u64 =
3938
+ UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
3939
+ #else
3940
+ const auto bits16_as_u64 =
3941
+ LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
3942
+ #endif
3943
+
3944
+ return BitCast(df16, TruncateTo(du16, bits16_as_u64));
3945
+ }
3946
+
3947
+ template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
3948
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
3949
+ const RebindToUnsigned<decltype(df16)> du16;
3950
+ const Rebind<uint64_t, decltype(df16)> du64;
3951
+ const Rebind<double, decltype(df16)> df64;
3952
+
3953
+ #if HWY_IS_LITTLE_ENDIAN
3954
+ const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
3955
+ const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
3956
+ const auto bits64_as_u64 =
3957
+ InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
3958
+ #else
3959
+ const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
3960
+ const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
3961
+ const auto bits64_as_u64 =
3962
+ InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
3963
+ #endif
3964
+
3965
+ return BitCast(df16, TruncateTo(du16, bits64_as_u64));
3966
+ }
3967
+
3968
+ #elif HWY_S390X_HAVE_Z14
3969
+
3970
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
3971
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
3972
+ #else
3973
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
3974
+ #endif
3975
+
3976
+ namespace detail {
3977
+
3978
+ template <class DF32, HWY_IF_F32_D(DF32)>
3979
+ static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
3980
+ DF32 df32, VFromD<Rebind<double, DF32>> v) {
3981
+ const Twice<DF32> dt_f32;
3982
+
3983
+ __vector float raw_f32_in_even;
3984
+ __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
3985
+
3986
+ const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
3987
+ return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
3988
+ }
3989
+
3990
+ } // namespace detail
3991
+
3992
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
3993
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
3994
+ const Rebind<float, decltype(df16)> df32;
3995
+ return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
3996
+ }
3997
+
3998
+ #endif // HWY_PPC_HAVE_9
3999
+
3257
4000
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
3258
4001
  HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
3259
4002
  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
@@ -3393,90 +4136,164 @@ HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
3393
4136
 
3394
4137
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
3395
4138
  HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
3396
- #if HWY_IS_LITTLE_ENDIAN
4139
+ #if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
3397
4140
  const Vec128<float> f64_to_f32{vec_floate(v.raw)};
3398
4141
  #else
3399
4142
  const Vec128<float> f64_to_f32{vec_floato(v.raw)};
3400
4143
  #endif
3401
4144
 
4145
+ #if HWY_S390X_HAVE_Z14
4146
+ const Twice<decltype(d)> dt;
4147
+ return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32));
4148
+ #else
3402
4149
  const RebindToUnsigned<D> du;
3403
4150
  const Rebind<uint64_t, D> du64;
3404
4151
  return Vec64<float>{
3405
4152
  BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw};
4153
+ #endif
3406
4154
  }
3407
4155
 
3408
4156
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
3409
- HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<double> v) {
3410
- return Vec32<int32_t>{vec_signede(v.raw)};
4157
+ HWY_API Vec32<int32_t> DemoteTo(D di32, Vec64<double> v) {
4158
+ #if HWY_S390X_HAVE_Z14
4159
+ const Rebind<int64_t, decltype(di32)> di64;
4160
+ return DemoteTo(di32, ConvertTo(di64, v));
4161
+ #else
4162
+ (void)di32;
4163
+ return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
4164
+ #endif
3411
4165
  }
3412
4166
 
3413
4167
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
3414
- HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<double> v) {
4168
+ HWY_API Vec64<int32_t> DemoteTo(D di32, Vec128<double> v) {
4169
+ #if HWY_S390X_HAVE_Z14
4170
+ const Rebind<int64_t, decltype(di32)> di64;
4171
+ return DemoteTo(di32, ConvertTo(di64, v));
4172
+ #else
4173
+ (void)di32;
4174
+
3415
4175
  #if HWY_IS_LITTLE_ENDIAN
3416
- const Vec128<int32_t> f64_to_i32{vec_signede(v.raw)};
4176
+ const Vec128<int32_t> f64_to_i32{
4177
+ vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
3417
4178
  #else
3418
- const Vec128<int32_t> f64_to_i32{vec_signedo(v.raw)};
4179
+ const Vec128<int32_t> f64_to_i32{
4180
+ vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)};
3419
4181
  #endif
3420
4182
 
3421
4183
  const Rebind<int64_t, D> di64;
3422
4184
  const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32);
3423
4185
  return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)};
4186
+ #endif
3424
4187
  }
3425
4188
 
3426
4189
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
3427
- HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<double> v) {
3428
- return Vec32<uint32_t>{vec_unsignede(v.raw)};
4190
+ HWY_API Vec32<uint32_t> DemoteTo(D du32, Vec64<double> v) {
4191
+ #if HWY_S390X_HAVE_Z14
4192
+ const Rebind<uint64_t, decltype(du32)> du64;
4193
+ return DemoteTo(du32, ConvertTo(du64, v));
4194
+ #else
4195
+ (void)du32;
4196
+ return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
4197
+ #endif
3429
4198
  }
3430
4199
 
3431
4200
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
3432
- HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<double> v) {
4201
+ HWY_API Vec64<uint32_t> DemoteTo(D du32, Vec128<double> v) {
4202
+ #if HWY_S390X_HAVE_Z14
4203
+ const Rebind<uint64_t, decltype(du32)> du64;
4204
+ return DemoteTo(du32, ConvertTo(du64, v));
4205
+ #else
4206
+ (void)du32;
3433
4207
  #if HWY_IS_LITTLE_ENDIAN
3434
- const Vec128<uint32_t> f64_to_u32{vec_unsignede(v.raw)};
4208
+ const Vec128<uint32_t> f64_to_u32{
4209
+ vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
3435
4210
  #else
3436
- const Vec128<uint32_t> f64_to_u32{vec_unsignedo(v.raw)};
4211
+ const Vec128<uint32_t> f64_to_u32{
4212
+ vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)};
3437
4213
  #endif
3438
4214
 
3439
4215
  const Rebind<uint64_t, D> du64;
3440
4216
  const Vec128<uint64_t> vu64 = BitCast(du64, f64_to_u32);
3441
4217
  return Vec64<uint32_t>{vec_pack(vu64.raw, vu64.raw)};
4218
+ #endif
4219
+ }
4220
+
4221
+ #if HWY_S390X_HAVE_Z14
4222
+ namespace detail {
4223
+
4224
+ template <class V, HWY_IF_I64(TFromV<V>)>
4225
+ HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
4226
+ __vector double raw_result;
4227
+ // Use inline assembly to do a round-to-odd I64->F64 conversion on Z14
4228
+ __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
4229
+ return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
4230
+ }
4231
+
4232
+ template <class V, HWY_IF_U64(TFromV<V>)>
4233
+ HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
4234
+ __vector double raw_result;
4235
+ // Use inline assembly to do a round-to-odd U64->F64 conversion on Z14
4236
+ __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
4237
+ return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
3442
4238
  }
3443
4239
 
4240
+ } // namespace detail
4241
+ #endif // HWY_S390X_HAVE_Z14
4242
+
3444
4243
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
3445
- HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<int64_t> v) {
4244
+ HWY_API Vec32<float> DemoteTo(D df32, Vec64<int64_t> v) {
4245
+ #if HWY_S390X_HAVE_Z14
4246
+ return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
4247
+ #else // VSX
4248
+ (void)df32;
3446
4249
  return Vec32<float>{vec_floate(v.raw)};
4250
+ #endif
3447
4251
  }
3448
4252
 
3449
4253
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
3450
- HWY_API Vec64<float> DemoteTo(D d, Vec128<int64_t> v) {
4254
+ HWY_API Vec64<float> DemoteTo(D df32, Vec128<int64_t> v) {
4255
+ #if HWY_S390X_HAVE_Z14
4256
+ return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
4257
+ #else // VSX
3451
4258
  #if HWY_IS_LITTLE_ENDIAN
3452
4259
  const Vec128<float> i64_to_f32{vec_floate(v.raw)};
3453
4260
  #else
3454
4261
  const Vec128<float> i64_to_f32{vec_floato(v.raw)};
3455
4262
  #endif
3456
4263
 
3457
- const RebindToUnsigned<D> du;
3458
- const Rebind<uint64_t, D> du64;
4264
+ const RebindToUnsigned<decltype(df32)> du32;
4265
+ const Rebind<uint64_t, decltype(df32)> du64;
3459
4266
  return Vec64<float>{
3460
- BitCast(d, TruncateTo(du, BitCast(du64, i64_to_f32))).raw};
4267
+ BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw};
4268
+ #endif
3461
4269
  }
3462
4270
 
3463
4271
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
3464
- HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<uint64_t> v) {
4272
+ HWY_API Vec32<float> DemoteTo(D df32, Vec64<uint64_t> v) {
4273
+ #if HWY_S390X_HAVE_Z14
4274
+ return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
4275
+ #else // VSX
4276
+ (void)df32;
3465
4277
  return Vec32<float>{vec_floate(v.raw)};
4278
+ #endif
3466
4279
  }
3467
4280
 
3468
4281
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
3469
- HWY_API Vec64<float> DemoteTo(D d, Vec128<uint64_t> v) {
4282
+ HWY_API Vec64<float> DemoteTo(D df32, Vec128<uint64_t> v) {
4283
+ #if HWY_S390X_HAVE_Z14
4284
+ return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
4285
+ #else // VSX
3470
4286
  #if HWY_IS_LITTLE_ENDIAN
3471
4287
  const Vec128<float> u64_to_f32{vec_floate(v.raw)};
3472
4288
  #else
3473
4289
  const Vec128<float> u64_to_f32{vec_floato(v.raw)};
3474
4290
  #endif
3475
4291
 
3476
- const RebindToUnsigned<D> du;
3477
- const Rebind<uint64_t, D> du64;
4292
+ const RebindToUnsigned<decltype(df32)> du;
4293
+ const Rebind<uint64_t, decltype(df32)> du64;
3478
4294
  return Vec64<float>{
3479
- BitCast(d, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
4295
+ BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
4296
+ #endif
3480
4297
  }
3481
4298
 
3482
4299
  // For already range-limited input [0, 255].
@@ -3491,17 +4308,39 @@ HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
3491
4308
  // Note: altivec.h vec_ct* currently contain C casts which triggers
3492
4309
  // -Wdeprecate-lax-vec-conv-all warnings, so disable them.
3493
4310
 
3494
- template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_NOT_FLOAT(FromT),
3495
- HWY_IF_T_SIZE_D(D, sizeof(FromT))>
4311
+ #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4312
+ template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
4313
+ HWY_IF_V_SIZE_LE_D(D, 8)>
4314
+ HWY_API VFromD<D> ConvertTo(D df32,
4315
+ Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4316
+ const Rebind<double, decltype(df32)> df64;
4317
+ return DemoteTo(df32, PromoteTo(df64, v));
4318
+ }
4319
+ template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
4320
+ HWY_IF_V_SIZE_D(D, 16)>
4321
+ HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
4322
+ const RepartitionToWide<decltype(df32)> df64;
4323
+
4324
+ const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
4325
+ const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
4326
+ return ConcatEven(df32, vf32_hi, vf32_lo);
4327
+ }
4328
+ #else // Z15 or PPC
4329
+ template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
3496
4330
  HWY_API VFromD<D> ConvertTo(D /* tag */,
3497
4331
  Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
3498
4332
  HWY_DIAGNOSTICS(push)
3499
4333
  #if HWY_COMPILER_CLANG
3500
4334
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
3501
4335
  #endif
4336
+ #if HWY_S390X_HAVE_Z15
4337
+ return VFromD<D>{vec_float(v.raw)};
4338
+ #else
3502
4339
  return VFromD<D>{vec_ctf(v.raw, 0)};
4340
+ #endif
3503
4341
  HWY_DIAGNOSTICS(pop)
3504
4342
  }
4343
+ #endif // HWY_TARGET == HWY_Z14
3505
4344
 
3506
4345
  template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
3507
4346
  HWY_IF_T_SIZE_D(D, sizeof(FromT))>
@@ -3511,38 +4350,195 @@ HWY_API VFromD<D> ConvertTo(D /* tag */,
3511
4350
  }
3512
4351
 
3513
4352
  // Truncates (rounds toward zero).
3514
- template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_FLOAT(FromT),
3515
- HWY_IF_T_SIZE_D(D, sizeof(FromT))>
4353
+ #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4354
+ template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
4355
+ HWY_API VFromD<D> ConvertTo(D di32,
4356
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4357
+ const Rebind<int64_t, decltype(di32)> di64;
4358
+ return DemoteTo(di32, PromoteTo(di64, v));
4359
+ }
4360
+ template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
4361
+ HWY_API VFromD<D> ConvertTo(D di32,
4362
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4363
+ const RepartitionToWide<decltype(di32)> di64;
4364
+ return OrderedDemote2To(di32, PromoteLowerTo(di64, v),
4365
+ PromoteUpperTo(di64, v));
4366
+ }
4367
+ #else // Z15 or PPC
4368
+ template <class D, HWY_IF_I32_D(D)>
3516
4369
  HWY_API VFromD<D> ConvertTo(D /* tag */,
3517
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4370
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4371
+ #if defined(__OPTIMIZE__)
4372
+ if (detail::IsConstantRawAltivecVect(v.raw)) {
4373
+ constexpr int32_t kMinI32 = LimitsMin<int32_t>();
4374
+ constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
4375
+ return Dup128VecFromValues(
4376
+ D(),
4377
+ (v.raw[0] >= -2147483648.0f)
4378
+ ? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
4379
+ : kMaxI32)
4380
+ : ((v.raw[0] < 0) ? kMinI32 : 0),
4381
+ (v.raw[1] >= -2147483648.0f)
4382
+ ? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
4383
+ : kMaxI32)
4384
+ : ((v.raw[1] < 0) ? kMinI32 : 0),
4385
+ (v.raw[2] >= -2147483648.0f)
4386
+ ? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
4387
+ : kMaxI32)
4388
+ : ((v.raw[2] < 0) ? kMinI32 : 0),
4389
+ (v.raw[3] >= -2147483648.0f)
4390
+ ? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
4391
+ : kMaxI32)
4392
+ : ((v.raw[3] < 0) ? kMinI32 : 0));
4393
+ }
4394
+ #endif
4395
+
4396
+ #if HWY_S390X_HAVE_Z15
4397
+ // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
4398
+ // the range of an int32_t
4399
+ __vector signed int raw_result;
4400
+ __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4401
+ return VFromD<D>{raw_result};
4402
+ #else
3518
4403
  HWY_DIAGNOSTICS(push)
3519
4404
  #if HWY_COMPILER_CLANG
3520
4405
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
3521
4406
  #endif
3522
4407
  return VFromD<D>{vec_cts(v.raw, 0)};
3523
4408
  HWY_DIAGNOSTICS(pop)
4409
+ #endif // HWY_S390X_HAVE_Z15
3524
4410
  }
4411
+ #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
3525
4412
 
3526
- template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_FLOAT(FromT),
3527
- HWY_IF_T_SIZE_D(D, sizeof(FromT))>
4413
+ template <class D, HWY_IF_I64_D(D)>
3528
4414
  HWY_API VFromD<D> ConvertTo(D /* tag */,
3529
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
4415
+ Vec128<double, Rebind<double, D>().MaxLanes()> v) {
4416
+ #if defined(__OPTIMIZE__)
4417
+ if (detail::IsConstantRawAltivecVect(v.raw)) {
4418
+ constexpr int64_t kMinI64 = LimitsMin<int64_t>();
4419
+ constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
4420
+ return Dup128VecFromValues(D(),
4421
+ (v.raw[0] >= -9223372036854775808.0)
4422
+ ? ((v.raw[0] < 9223372036854775808.0)
4423
+ ? static_cast<int64_t>(v.raw[0])
4424
+ : kMaxI64)
4425
+ : ((v.raw[0] < 0) ? kMinI64 : 0LL),
4426
+ (v.raw[1] >= -9223372036854775808.0)
4427
+ ? ((v.raw[1] < 9223372036854775808.0)
4428
+ ? static_cast<int64_t>(v.raw[1])
4429
+ : kMaxI64)
4430
+ : ((v.raw[1] < 0) ? kMinI64 : 0LL));
4431
+ }
4432
+ #endif
4433
+
4434
+ // Use inline assembly to avoid undefined behavior if v[i] is not within the
4435
+ // range of an int64_t
4436
+ __vector signed long long raw_result;
4437
+ #if HWY_S390X_HAVE_Z14
4438
+ __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4439
+ #else
4440
+ __asm__("xvcvdpsxds %x0,%x1"
4441
+ : "=wa"(raw_result)
4442
+ : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
4443
+ #endif
4444
+ return VFromD<D>{raw_result};
4445
+ }
4446
+
4447
+ #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
4448
+ template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
4449
+ HWY_API VFromD<D> ConvertTo(D du32,
4450
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4451
+ const Rebind<uint64_t, decltype(du32)> du64;
4452
+ return DemoteTo(du32, PromoteTo(du64, v));
4453
+ }
4454
+ template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
4455
+ HWY_API VFromD<D> ConvertTo(D du32,
4456
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4457
+ const RepartitionToWide<decltype(du32)> du64;
4458
+ return OrderedDemote2To(du32, PromoteLowerTo(du64, v),
4459
+ PromoteUpperTo(du64, v));
4460
+ }
4461
+ #else // Z15 or VSX
4462
+ template <class D, HWY_IF_U32_D(D)>
4463
+ HWY_API VFromD<D> ConvertTo(D /* tag */,
4464
+ Vec128<float, Rebind<float, D>().MaxLanes()> v) {
4465
+ #if defined(__OPTIMIZE__)
4466
+ if (detail::IsConstantRawAltivecVect(v.raw)) {
4467
+ constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
4468
+ return Dup128VecFromValues(
4469
+ D(),
4470
+ (v.raw[0] >= 0.0f)
4471
+ ? ((v.raw[0] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[0])
4472
+ : kMaxU32)
4473
+ : 0,
4474
+ (v.raw[1] >= 0.0f)
4475
+ ? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
4476
+ : kMaxU32)
4477
+ : 0,
4478
+ (v.raw[2] >= 0.0f)
4479
+ ? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
4480
+ : kMaxU32)
4481
+ : 0,
4482
+ (v.raw[3] >= 0.0f)
4483
+ ? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
4484
+ : kMaxU32)
4485
+ : 0);
4486
+ }
4487
+ #endif
4488
+
4489
+ #if HWY_S390X_HAVE_Z15
4490
+ // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
4491
+ // the range of an uint32_t
4492
+ __vector unsigned int raw_result;
4493
+ __asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4494
+ return VFromD<D>{raw_result};
4495
+ #else // VSX
3530
4496
  HWY_DIAGNOSTICS(push)
3531
4497
  #if HWY_COMPILER_CLANG
3532
4498
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
3533
4499
  #endif
3534
- return VFromD<D>{vec_ctu(ZeroIfNegative(v).raw, 0)};
4500
+ VFromD<D> result{vec_ctu(v.raw, 0)};
3535
4501
  HWY_DIAGNOSTICS(pop)
4502
+ return result;
4503
+ #endif // HWY_S390X_HAVE_Z15
3536
4504
  }
4505
+ #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
3537
4506
 
3538
- template <size_t N>
3539
- HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
4507
+ template <class D, HWY_IF_U64_D(D)>
4508
+ HWY_API VFromD<D> ConvertTo(D /* tag */,
4509
+ Vec128<double, Rebind<double, D>().MaxLanes()> v) {
3540
4510
  HWY_DIAGNOSTICS(push)
3541
4511
  #if HWY_COMPILER_CLANG
3542
4512
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
3543
4513
  #endif
3544
- return Vec128<int32_t, N>{vec_cts(vec_round(v.raw), 0)};
3545
- HWY_DIAGNOSTICS(pop)
4514
+
4515
+ #if defined(__OPTIMIZE__)
4516
+ if (detail::IsConstantRawAltivecVect(v.raw)) {
4517
+ constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
4518
+ return Dup128VecFromValues(
4519
+ D(),
4520
+ (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
4521
+ ? static_cast<uint64_t>(v.raw[0])
4522
+ : kMaxU64)
4523
+ : 0,
4524
+ (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
4525
+ ? static_cast<uint64_t>(v.raw[1])
4526
+ : kMaxU64)
4527
+ : 0);
4528
+ }
4529
+ #endif
4530
+
4531
+ // Use inline assembly to avoid undefined behavior if v[i] is not within the
4532
+ // range of an uint64_t
4533
+ __vector unsigned long long raw_result;
4534
+ #if HWY_S390X_HAVE_Z14
4535
+ __asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
4536
+ #else // VSX
4537
+ __asm__("xvcvdpuxds %x0,%x1"
4538
+ : "=wa"(raw_result)
4539
+ : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
4540
+ #endif
4541
+ return VFromD<D>{raw_result};
3546
4542
  }
3547
4543
 
3548
4544
  // ------------------------------ Floating-point rounding (ConvertTo)
@@ -3555,7 +4551,18 @@ HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
3555
4551
 
3556
4552
  template <size_t N>
3557
4553
  HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
4554
+ #if HWY_S390X_HAVE_Z14
4555
+ return Vec128<double, N>{vec_round(v.raw)};
4556
+ #else
3558
4557
  return Vec128<double, N>{vec_rint(v.raw)};
4558
+ #endif
4559
+ }
4560
+
4561
+ template <size_t N>
4562
+ HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
4563
+ const DFromV<decltype(v)> d;
4564
+ const RebindToSigned<decltype(d)> di;
4565
+ return ConvertTo(di, Round(v));
3559
4566
  }
3560
4567
 
3561
4568
  // Toward zero, aka truncate
@@ -3613,7 +4620,7 @@ HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
3613
4620
 
3614
4621
  // ================================================== CRYPTO
3615
4622
 
3616
- #if !defined(HWY_DISABLE_PPC8_CRYPTO)
4623
+ #if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)
3617
4624
 
3618
4625
  // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
3619
4626
  #ifdef HWY_NATIVE_AES
@@ -3918,11 +4925,20 @@ struct CompressIsPartition {
3918
4925
  enum { value = (sizeof(T) != 1) };
3919
4926
  };
3920
4927
 
4928
+ // ------------------------------ Dup128MaskFromMaskBits
4929
+
4930
+ template <class D>
4931
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4932
+ constexpr size_t kN = MaxLanes(d);
4933
+ if (kN < 8) mask_bits &= (1u << kN) - 1;
4934
+ return detail::LoadMaskBits128(d, mask_bits);
4935
+ }
4936
+
3921
4937
  // ------------------------------ StoreMaskBits
3922
4938
 
3923
4939
  namespace detail {
3924
4940
 
3925
- #if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
4941
+ #if !HWY_S390X_HAVE_Z14 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
3926
4942
  // fallback for missing vec_extractm
3927
4943
  template <size_t N>
3928
4944
  HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
@@ -3935,32 +4951,70 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
3935
4951
  return extracted.raw[HWY_IS_LITTLE_ENDIAN];
3936
4952
  }
3937
4953
 
3938
- #endif // !HWY_PPC_HAVE_10
4954
+ #endif // !HWY_S390X_HAVE_Z14 && !HWY_PPC_HAVE_10
4955
+
4956
+ #if HWY_S390X_HAVE_Z14
4957
+ template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
4958
+ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
4959
+ const DFromM<decltype(mask)> d;
4960
+ const Repartition<uint8_t, decltype(d)> du8;
4961
+ const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
4962
+
4963
+ return ReduceSum(
4964
+ du8, And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128,
4965
+ 1, 2, 4, 8, 16, 32, 64, 128)));
4966
+ }
4967
+
4968
+ template <typename T>
4969
+ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
4970
+ const DFromM<decltype(mask)> d;
4971
+ const Repartition<uint8_t, decltype(d)> du8;
4972
+ const Repartition<uint64_t, decltype(d)> du64;
4973
+ const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
4974
+
4975
+ const auto mask_bytes = SumsOf8(
4976
+ And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
4977
+ 4, 8, 16, 32, 64, 128)));
3939
4978
 
4979
+ const Rebind<uint8_t, decltype(du64)> du8_2;
4980
+ const Repartition<uint16_t, decltype(du8_2)> du16_1;
4981
+ return GetLane(
4982
+ BitCast(du16_1, TruncateTo(du8_2, Reverse2(du64, mask_bytes))));
4983
+ }
4984
+ #else
3940
4985
  template <typename T, size_t N>
3941
4986
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
3942
4987
  const DFromM<decltype(mask)> d;
3943
4988
  const Repartition<uint8_t, decltype(d)> du8;
3944
4989
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
4990
+
3945
4991
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
3946
4992
  return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
3947
- #else
4993
+ #else // PPC8, PPC9, or big-endian PPC10
3948
4994
  const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
3949
4995
  56, 48, 40, 32, 24, 16, 8, 0};
3950
4996
  return ExtractSignBits(sign_bits, kBitShuffle);
3951
- #endif // HWY_PPC_HAVE_10
4997
+ #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
3952
4998
  }
4999
+ #endif // HWY_S390X_HAVE_Z14
3953
5000
 
3954
5001
  template <typename T, size_t N>
3955
5002
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
3956
5003
  const DFromM<decltype(mask)> d;
5004
+ const RebindToUnsigned<decltype(d)> du;
5005
+
5006
+ #if HWY_S390X_HAVE_Z14
5007
+ const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
5008
+ return ReduceSum(
5009
+ du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8, 16, 32, 64, 128)));
5010
+ #else // VSX
3957
5011
  const Repartition<uint8_t, decltype(d)> du8;
3958
5012
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
3959
5013
 
3960
5014
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
3961
- const RebindToUnsigned<decltype(d)> du;
3962
5015
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
3963
- #else
5016
+ #else // PPC8, PPC9, or big-endian PPC10
5017
+ (void)du;
3964
5018
  #if HWY_IS_LITTLE_ENDIAN
3965
5019
  const __vector unsigned char kBitShuffle = {
3966
5020
  112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
@@ -3970,17 +5024,25 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
3970
5024
  #endif
3971
5025
  return ExtractSignBits(sign_bits, kBitShuffle);
3972
5026
  #endif // HWY_PPC_HAVE_10
5027
+ #endif // HWY_S390X_HAVE_Z14
3973
5028
  }
3974
5029
 
3975
5030
  template <typename T, size_t N>
3976
5031
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
3977
5032
  const DFromM<decltype(mask)> d;
5033
+ const RebindToUnsigned<decltype(d)> du;
5034
+
5035
+ #if HWY_S390X_HAVE_Z14
5036
+ const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
5037
+ return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8)));
5038
+ #else // VSX
3978
5039
  const Repartition<uint8_t, decltype(d)> du8;
3979
5040
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5041
+
3980
5042
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
3981
- const RebindToUnsigned<decltype(d)> du;
3982
5043
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
3983
- #else
5044
+ #else // PPC8, PPC9, or big-endian PPC10
5045
+ (void)du;
3984
5046
  #if HWY_IS_LITTLE_ENDIAN
3985
5047
  const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
3986
5048
  128, 128, 128, 128, 128, 128,
@@ -3992,17 +5054,25 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
3992
5054
  #endif
3993
5055
  return ExtractSignBits(sign_bits, kBitShuffle);
3994
5056
  #endif // HWY_PPC_HAVE_10
5057
+ #endif // HWY_S390X_HAVE_Z14
3995
5058
  }
3996
5059
 
3997
5060
  template <typename T, size_t N>
3998
5061
  HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
3999
5062
  const DFromM<decltype(mask)> d;
5063
+ const RebindToUnsigned<decltype(d)> du;
5064
+
5065
+ #if HWY_S390X_HAVE_Z14
5066
+ const VFromD<decltype(du)> sign_bits = BitCast(du, VecFromMask(d, mask));
5067
+ return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2)));
5068
+ #else // VSX
4000
5069
  const Repartition<uint8_t, decltype(d)> du8;
4001
5070
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
5071
+
4002
5072
  #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
4003
- const RebindToUnsigned<decltype(d)> du;
4004
5073
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
4005
5074
  #else
5075
+ (void)du;
4006
5076
  #if HWY_IS_LITTLE_ENDIAN
4007
5077
  const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
4008
5078
  128, 128, 128, 128, 128, 128,
@@ -4014,6 +5084,7 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
4014
5084
  #endif
4015
5085
  return ExtractSignBits(sign_bits, kBitShuffle);
4016
5086
  #endif // HWY_PPC_HAVE_10
5087
+ #endif // HWY_S390X_HAVE_Z14
4017
5088
  }
4018
5089
 
4019
5090
  // Returns the lowest N of the mask bits.
@@ -4076,31 +5147,32 @@ HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
4076
5147
  template <class D, HWY_IF_V_SIZE_D(D, 16)>
4077
5148
  HWY_API bool AllFalse(D d, MFromD<D> mask) {
4078
5149
  const RebindToUnsigned<decltype(d)> du;
4079
- return static_cast<bool>(vec_all_eq(RebindMask(du, mask).raw, Zero(du).raw));
5150
+ return static_cast<bool>(
5151
+ vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Zero(du).raw));
4080
5152
  }
4081
5153
 
4082
5154
  template <class D, HWY_IF_V_SIZE_D(D, 16)>
4083
5155
  HWY_API bool AllTrue(D d, MFromD<D> mask) {
4084
5156
  const RebindToUnsigned<decltype(d)> du;
4085
5157
  using TU = TFromD<decltype(du)>;
4086
- return static_cast<bool>(
4087
- vec_all_eq(RebindMask(du, mask).raw, Set(du, hwy::LimitsMax<TU>()).raw));
5158
+ return static_cast<bool>(vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw,
5159
+ Set(du, hwy::LimitsMax<TU>()).raw));
4088
5160
  }
4089
5161
 
4090
5162
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
4091
5163
  HWY_API bool AllFalse(D d, MFromD<D> mask) {
4092
5164
  const Full128<TFromD<D>> d_full;
4093
5165
  constexpr size_t kN = MaxLanes(d);
4094
- return AllFalse(d_full, MFromD<decltype(d_full)>{
4095
- vec_and(mask.raw, FirstN(d_full, kN).raw)});
5166
+ return AllFalse(d_full,
5167
+ And(MFromD<decltype(d_full)>{mask.raw}, FirstN(d_full, kN)));
4096
5168
  }
4097
5169
 
4098
5170
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
4099
5171
  HWY_API bool AllTrue(D d, MFromD<D> mask) {
4100
5172
  const Full128<TFromD<D>> d_full;
4101
5173
  constexpr size_t kN = MaxLanes(d);
4102
- return AllTrue(d_full, MFromD<decltype(d_full)>{
4103
- vec_or(mask.raw, Not(FirstN(d_full, kN)).raw)});
5174
+ return AllTrue(
5175
+ d_full, Or(MFromD<decltype(d_full)>{mask.raw}, Not(FirstN(d_full, kN))));
4104
5176
  }
4105
5177
 
4106
5178
  template <class D>
@@ -4222,7 +5294,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
4222
5294
  __asm__("xxgenpcvbm %x0, %1, %2"
4223
5295
  : "=wa"(idx)
4224
5296
  : "v"(mask.raw), "i"(kGenPcvmMode));
4225
- return VFromD<D>{idx};
5297
+ return VFromD<decltype(d)>{idx};
4226
5298
  }
4227
5299
  template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
4228
5300
  HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
@@ -4235,7 +5307,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
4235
5307
  __asm__("xxgenpcvhm %x0, %1, %2"
4236
5308
  : "=wa"(idx)
4237
5309
  : "v"(mask.raw), "i"(kGenPcvmMode));
4238
- return VFromD<D>{idx};
5310
+ return VFromD<decltype(d)>{idx};
4239
5311
  }
4240
5312
  template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
4241
5313
  HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
@@ -4248,7 +5320,7 @@ HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
4248
5320
  __asm__("xxgenpcvwm %x0, %1, %2"
4249
5321
  : "=wa"(idx)
4250
5322
  : "v"(mask.raw), "i"(kGenPcvmMode));
4251
- return VFromD<D>{idx};
5323
+ return VFromD<decltype(d)>{idx};
4252
5324
  }
4253
5325
  #endif
4254
5326
 
@@ -4821,7 +5893,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
4821
5893
 
4822
5894
  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
4823
5895
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
4824
- #if HWY_PPC_HAVE_9
5896
+ #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
4825
5897
  StoreN(compressed, d, unaligned, count);
4826
5898
  #else
4827
5899
  BlendedStore(compressed, FirstN(d, count), d, unaligned);
@@ -4939,7 +6011,11 @@ HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
4939
6011
 
4940
6012
  template <class V>
4941
6013
  HWY_INLINE V I128Subtract(V a, V b) {
4942
- #if defined(__SIZEOF_INT128__)
6014
+ #if HWY_S390X_HAVE_Z14
6015
+ const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
6016
+ vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
6017
+ reinterpret_cast<__vector unsigned char>(b.raw)))};
6018
+ #elif defined(__SIZEOF_INT128__)
4943
6019
  using VU128 = __vector unsigned __int128;
4944
6020
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
4945
6021
  vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
@@ -5067,84 +6143,133 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
5067
6143
  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
5068
6144
  }
5069
6145
 
5070
- // ------------------------------ Reductions
5071
-
6146
+ // ------------------------------ SumsOf2 and SumsOf4
5072
6147
  namespace detail {
5073
6148
 
5074
- // N=1 for any T: no-op
5075
- template <typename T>
5076
- HWY_INLINE Vec128<T, 1> SumOfLanes(Vec128<T, 1> v) {
5077
- return v;
5078
- }
5079
- template <typename T>
5080
- HWY_INLINE Vec128<T, 1> MinOfLanes(Vec128<T, 1> v) {
5081
- return v;
5082
- }
5083
- template <typename T>
5084
- HWY_INLINE Vec128<T, 1> MaxOfLanes(Vec128<T, 1> v) {
5085
- return v;
6149
+ #if !HWY_S390X_HAVE_Z14
6150
+ // Casts nominally int32_t result to D.
6151
+ template <class D>
6152
+ HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
6153
+ __vector signed int b) {
6154
+ const Repartition<int32_t, D> di32;
6155
+ #ifdef __OPTIMIZE__
6156
+ if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
6157
+ const int64_t sum0 =
6158
+ static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
6159
+ static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
6160
+ static_cast<int64_t>(b[0]);
6161
+ const int64_t sum1 =
6162
+ static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
6163
+ static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
6164
+ static_cast<int64_t>(b[1]);
6165
+ const int64_t sum2 =
6166
+ static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
6167
+ static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
6168
+ static_cast<int64_t>(b[2]);
6169
+ const int64_t sum3 =
6170
+ static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
6171
+ static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
6172
+ static_cast<int64_t>(b[3]);
6173
+ const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
6174
+ const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
6175
+ const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
6176
+ const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
6177
+ using Raw = typename detail::Raw128<int32_t>::type;
6178
+ return BitCast(
6179
+ d,
6180
+ VFromD<decltype(di32)>{Raw{
6181
+ (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
6182
+ : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
6183
+ (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
6184
+ : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
6185
+ (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
6186
+ : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
6187
+ (sign3 == (sum3 >> 31))
6188
+ ? static_cast<int32_t>(sum3)
6189
+ : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
6190
+ } else // NOLINT
6191
+ #endif
6192
+ {
6193
+ return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
6194
+ }
5086
6195
  }
5087
6196
 
5088
- // u32/i32/f32:
5089
-
5090
- // N=2
5091
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5092
- HWY_INLINE Vec128<T, 2> SumOfLanes(Vec128<T, 2> v10) {
5093
- // NOTE: AltivecVsum2sws cannot be used here as AltivecVsum2sws
5094
- // computes the signed saturated sum of the lanes.
5095
- return v10 + Shuffle2301(v10);
5096
- }
5097
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5098
- HWY_INLINE Vec128<T, 2> MinOfLanes(Vec128<T, 2> v10) {
5099
- return Min(v10, Shuffle2301(v10));
5100
- }
5101
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5102
- HWY_INLINE Vec128<T, 2> MaxOfLanes(Vec128<T, 2> v10) {
5103
- return Max(v10, Shuffle2301(v10));
6197
+ // Casts nominally uint32_t result to D.
6198
+ template <class D>
6199
+ HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
6200
+ __vector unsigned int b) {
6201
+ const Repartition<uint32_t, D> du32;
6202
+ #ifdef __OPTIMIZE__
6203
+ if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
6204
+ const uint64_t sum0 =
6205
+ static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
6206
+ static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
6207
+ static_cast<uint64_t>(b[0]);
6208
+ const uint64_t sum1 =
6209
+ static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
6210
+ static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
6211
+ static_cast<uint64_t>(b[1]);
6212
+ const uint64_t sum2 =
6213
+ static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
6214
+ static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
6215
+ static_cast<uint64_t>(b[2]);
6216
+ const uint64_t sum3 =
6217
+ static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
6218
+ static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
6219
+ static_cast<uint64_t>(b[3]);
6220
+ return BitCast(
6221
+ d,
6222
+ VFromD<decltype(du32)>{(__vector unsigned int){
6223
+ static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
6224
+ static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
6225
+ static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
6226
+ static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
6227
+ : 0xFFFFFFFFu)}});
6228
+ } else // NOLINT
6229
+ #endif
6230
+ {
6231
+ return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
6232
+ }
5104
6233
  }
5105
6234
 
5106
- // N=4 (full)
5107
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5108
- HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v3210) {
5109
- // NOTE: AltivecVsumsws cannot be used here as AltivecVsumsws
5110
- // computes the signed saturated sum of the lanes.
5111
- const Vec128<T> v1032 = Shuffle1032(v3210);
5112
- const Vec128<T> v31_20_31_20 = v3210 + v1032;
5113
- const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5114
- return v20_31_20_31 + v31_20_31_20;
5115
- }
5116
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5117
- HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v3210) {
5118
- const Vec128<T> v1032 = Shuffle1032(v3210);
5119
- const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
5120
- const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5121
- return Min(v20_31_20_31, v31_20_31_20);
5122
- }
5123
- template <typename T, HWY_IF_T_SIZE(T, 4)>
5124
- HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v3210) {
5125
- const Vec128<T> v1032 = Shuffle1032(v3210);
5126
- const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
5127
- const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5128
- return Max(v20_31_20_31, v31_20_31_20);
5129
- }
6235
+ // Casts nominally int32_t result to D.
6236
+ template <class D>
6237
+ HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
6238
+ __vector signed int b) {
6239
+ const Repartition<int32_t, D> di32;
6240
+ #ifdef __OPTIMIZE__
6241
+ const Repartition<uint64_t, D> du64;
6242
+ constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
6243
+ if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
6244
+ __builtin_constant_p(b[kDestLaneOffset + 2])) {
6245
+ const int64_t sum0 = static_cast<int64_t>(a[0]) +
6246
+ static_cast<int64_t>(a[1]) +
6247
+ static_cast<int64_t>(b[kDestLaneOffset]);
6248
+ const int64_t sum1 = static_cast<int64_t>(a[2]) +
6249
+ static_cast<int64_t>(a[3]) +
6250
+ static_cast<int64_t>(b[kDestLaneOffset + 2]);
6251
+ const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
6252
+ const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
6253
+ return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
6254
+ (sign0 == (sum0 >> 31))
6255
+ ? static_cast<uint32_t>(sum0)
6256
+ : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
6257
+ (sign1 == (sum1 >> 31))
6258
+ ? static_cast<uint32_t>(sum1)
6259
+ : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
6260
+ } else // NOLINT
6261
+ #endif
6262
+ {
6263
+ __vector signed int sum;
5130
6264
 
5131
- // u64/i64/f64:
6265
+ // Inline assembly is used for vsum2sws to avoid unnecessary shuffling
6266
+ // on little-endian PowerPC targets as the result of the vsum2sws
6267
+ // instruction will already be in the correct lanes on little-endian
6268
+ // PowerPC targets.
6269
+ __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
5132
6270
 
5133
- // N=2 (full)
5134
- template <typename T, HWY_IF_T_SIZE(T, 8)>
5135
- HWY_INLINE Vec128<T> SumOfLanes(Vec128<T> v10) {
5136
- const Vec128<T> v01 = Shuffle01(v10);
5137
- return v10 + v01;
5138
- }
5139
- template <typename T, HWY_IF_T_SIZE(T, 8)>
5140
- HWY_INLINE Vec128<T> MinOfLanes(Vec128<T> v10) {
5141
- const Vec128<T> v01 = Shuffle01(v10);
5142
- return Min(v10, v01);
5143
- }
5144
- template <typename T, HWY_IF_T_SIZE(T, 8)>
5145
- HWY_INLINE Vec128<T> MaxOfLanes(Vec128<T> v10) {
5146
- const Vec128<T> v01 = Shuffle01(v10);
5147
- return Max(v10, v01);
6271
+ return BitCast(d, VFromD<decltype(di32)>{sum});
6272
+ }
5148
6273
  }
5149
6274
 
5150
6275
  // Casts nominally int32_t result to D.
@@ -5238,275 +6363,345 @@ HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
5238
6363
  return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
5239
6364
  Set(di32, 65536).raw);
5240
6365
  }
6366
+ #endif // !HWY_S390X_HAVE_Z14
6367
+
6368
+ // U16->U32 SumsOf2
6369
+ template <class V>
6370
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6371
+ hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6372
+ const DFromV<V> d;
6373
+ const RepartitionToWide<decltype(d)> dw;
6374
+
6375
+ #if HWY_S390X_HAVE_Z14
6376
+ return VFromD<decltype(dw)>{vec_sum4(v.raw, Zero(d).raw)};
6377
+ #else
6378
+ return BitCast(dw, AltivecU16SumsOf2(v));
6379
+ #endif
6380
+ }
6381
+
6382
+ // I16->I32 SumsOf2
6383
+ template <class V>
6384
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6385
+ hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6386
+ const DFromV<V> d;
6387
+ const RepartitionToWide<decltype(d)> dw;
6388
+
6389
+ #if HWY_S390X_HAVE_Z14
6390
+ const RebindToUnsigned<decltype(d)> du;
6391
+ return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(),
6392
+ BitCast(du, Xor(v, SignBit(d))))) +
6393
+ Set(dw, int32_t{-65536});
6394
+ #else
6395
+ return AltivecVsum4shs(dw, v.raw, Zero(dw).raw);
6396
+ #endif
6397
+ }
6398
+
6399
+ #if HWY_S390X_HAVE_Z14
6400
+ // U32->U64 SumsOf2
6401
+ template <class V>
6402
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6403
+ hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
6404
+ const DFromV<V> d;
6405
+ const RepartitionToWide<decltype(d)> dw;
6406
+ return VFromD<decltype(dw)>{vec_sum2(v.raw, Zero(d).raw)};
6407
+ }
6408
+
6409
+ // I32->I64 SumsOf2
6410
+ template <class V>
6411
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
6412
+ hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
6413
+ const DFromV<V> d;
6414
+ const RepartitionToWide<decltype(d)> dw;
6415
+ const RebindToUnsigned<decltype(d)> du;
6416
+
6417
+ return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
6418
+ BitCast(du, Xor(v, SignBit(d))))) +
6419
+ Set(dw, int64_t{-4294967296LL});
6420
+ }
6421
+ #endif
6422
+
6423
+ // U8->U32 SumsOf4
6424
+ template <class V>
6425
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6426
+ hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
6427
+ const DFromV<V> d;
6428
+ const RepartitionToWideX2<decltype(d)> dw2;
6429
+
6430
+ #if HWY_S390X_HAVE_Z14
6431
+ return VFromD<decltype(dw2)>{vec_sum4(v.raw, Zero(d).raw)};
6432
+ #else
6433
+ return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw);
6434
+ #endif
6435
+ }
6436
+
6437
+ // I8->I32 SumsOf4
6438
+ template <class V>
6439
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6440
+ hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
6441
+ const DFromV<V> d;
6442
+ const RepartitionToWideX2<decltype(d)> dw2;
6443
+
6444
+ #if HWY_S390X_HAVE_Z14
6445
+ const RebindToUnsigned<decltype(d)> du;
6446
+ return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(),
6447
+ BitCast(du, Xor(v, SignBit(d))))) +
6448
+ Set(dw2, int32_t{-512});
6449
+ #else
6450
+ return AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw);
6451
+ #endif
6452
+ }
6453
+
6454
+ // U16->U64 SumsOf4
6455
+ template <class V>
6456
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6457
+ hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6458
+ const DFromV<V> d;
6459
+ const RepartitionToWide<decltype(d)> dw;
6460
+ const RepartitionToWide<decltype(dw)> dw2;
6461
+
6462
+ #if HWY_S390X_HAVE_Z14
6463
+ return VFromD<decltype(dw2)>{vec_sum2(v.raw, Zero(d).raw)};
6464
+ #else
6465
+ const RebindToSigned<decltype(dw)> dw_i;
6466
+ return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw);
6467
+ #endif
6468
+ }
6469
+
6470
+ // I16->I64 SumsOf4
6471
+ template <class V>
6472
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
6473
+ hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
6474
+ const DFromV<V> d;
6475
+ const RepartitionToWide<decltype(d)> dw;
6476
+ const RepartitionToWide<decltype(dw)> dw2;
6477
+
6478
+ #if HWY_S390X_HAVE_Z14
6479
+ const RebindToUnsigned<decltype(d)> du;
6480
+ return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(),
6481
+ BitCast(du, Xor(v, SignBit(d))))) +
6482
+ Set(dw2, int64_t{-131072});
6483
+ #else // VSX
6484
+ const auto sums_of_4_in_lo32 =
6485
+ AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw);
6486
+
6487
+ #if HWY_IS_LITTLE_ENDIAN
6488
+ return PromoteEvenTo(dw2, sums_of_4_in_lo32);
6489
+ #else
6490
+ return PromoteOddTo(dw2, sums_of_4_in_lo32);
6491
+ #endif // HWY_IS_LITTLE_ENDIAN
6492
+ #endif // HWY_S390X_HAVE_Z14
6493
+ }
6494
+
6495
+ } // namespace detail
6496
+
6497
+ // ------------------------------ SumOfLanes
6498
+
6499
+ // We define SumOfLanes for 8/16-bit types (and I32/U32/I64/U64 on Z14/Z15/Z16);
6500
+ // enable generic for the rest.
6501
+ #undef HWY_IF_SUM_OF_LANES_D
6502
+ #if HWY_S390X_HAVE_Z14
6503
+ #define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
6504
+ #else
6505
+ #define HWY_IF_SUM_OF_LANES_D(D) \
6506
+ HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
6507
+ #endif
6508
+
6509
+ #if HWY_S390X_HAVE_Z14
6510
+ namespace detail {
6511
+
6512
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
6513
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
6514
+ HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
6515
+ const DFromV<decltype(v)> d;
6516
+ const RebindToUnsigned<decltype(d)> du;
6517
+ return BitCast(
6518
+ d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
6519
+ }
5241
6520
 
5242
- HWY_API Vec32<uint16_t> SumOfLanes(Vec32<uint16_t> v) {
6521
+ } // namespace detail
6522
+
6523
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
6524
+ HWY_API VFromD<D> SumOfLanes(D /*d64*/, VFromD<D> v) {
6525
+ return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v));
6526
+ }
6527
+ #endif
6528
+
6529
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
6530
+ HWY_API Vec32<uint16_t> SumOfLanes(D du16, Vec32<uint16_t> v) {
5243
6531
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
5244
- DFromV<decltype(v)> du16;
5245
- return Broadcast<kSumLaneIdx>(BitCast(du16, AltivecU16SumsOf2(v)));
6532
+ return Broadcast<kSumLaneIdx>(
6533
+ BitCast(du16, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
5246
6534
  }
5247
6535
 
5248
- HWY_API Vec64<uint16_t> SumOfLanes(Vec64<uint16_t> v) {
6536
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
6537
+ HWY_API Vec64<uint16_t> SumOfLanes(D du16, Vec64<uint16_t> v) {
5249
6538
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5250
- const Full64<uint16_t> du16;
5251
- const auto zero = Zero(Full128<int32_t>());
5252
6539
  return Broadcast<kSumLaneIdx>(
5253
- AltivecVsum2sws(du16, AltivecU16SumsOf2(v).raw, zero.raw));
6540
+ BitCast(du16, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
5254
6541
  }
5255
6542
 
5256
- HWY_API Vec128<uint16_t> SumOfLanes(Vec128<uint16_t> v) {
6543
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
6544
+ HWY_API Vec128<uint16_t> SumOfLanes(D du16, Vec128<uint16_t> v) {
5257
6545
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
5258
- const Full128<uint16_t> du16;
6546
+ #if HWY_S390X_HAVE_Z14
6547
+ return Broadcast<kSumLaneIdx>(
6548
+ BitCast(du16, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
6549
+ hwy::UnsignedTag(), hwy::SizeTag<2>(), v))));
6550
+ #else // VSX
5259
6551
  const auto zero = Zero(Full128<int32_t>());
5260
6552
  return Broadcast<kSumLaneIdx>(
5261
- AltivecVsumsws(du16, AltivecU16SumsOf2(v).raw, zero.raw));
6553
+ detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
6554
+ #endif
5262
6555
  }
5263
6556
 
5264
- HWY_API Vec32<int16_t> SumOfLanes(Vec32<int16_t> v) {
6557
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
6558
+ HWY_API Vec32<int16_t> SumOfLanes(D di16, Vec32<int16_t> v) {
6559
+ #if HWY_S390X_HAVE_Z14
6560
+ const RebindToUnsigned<decltype(di16)> du16;
6561
+ return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
6562
+ #else
5265
6563
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
5266
- const Full32<int16_t> di16;
5267
- const auto zero = Zero(Full128<int32_t>());
5268
- return Broadcast<kSumLaneIdx>(AltivecVsum4shs(di16, v.raw, zero.raw));
6564
+ return Broadcast<kSumLaneIdx>(
6565
+ BitCast(di16, detail::SumsOf2(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
6566
+ #endif
5269
6567
  }
5270
6568
 
5271
- HWY_API Vec64<int16_t> SumOfLanes(Vec64<int16_t> v) {
6569
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
6570
+ HWY_API Vec64<int16_t> SumOfLanes(D di16, Vec64<int16_t> v) {
6571
+ #if HWY_S390X_HAVE_Z14
6572
+ const RebindToUnsigned<decltype(di16)> du16;
6573
+ return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
6574
+ #else
5272
6575
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5273
- const Full128<int32_t> di32;
5274
- const Full64<int16_t> di16;
5275
- const auto zero = Zero(di32);
5276
- return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
5277
- di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
6576
+ return Broadcast<kSumLaneIdx>(
6577
+ BitCast(di16, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
6578
+ #endif
5278
6579
  }
5279
6580
 
5280
- HWY_API Vec128<int16_t> SumOfLanes(Vec128<int16_t> v) {
6581
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
6582
+ HWY_API Vec128<int16_t> SumOfLanes(D di16, Vec128<int16_t> v) {
6583
+ #if HWY_S390X_HAVE_Z14
6584
+ const RebindToUnsigned<decltype(di16)> du16;
6585
+ return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
6586
+ #else
5281
6587
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
5282
- const Full128<int16_t> di16;
5283
6588
  const Full128<int32_t> di32;
5284
6589
  const auto zero = Zero(di32);
5285
- return Broadcast<kSumLaneIdx>(AltivecVsumsws(
5286
- di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
6590
+ return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6591
+ di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
6592
+ #endif
5287
6593
  }
5288
6594
 
5289
- // u8, N=2, N=4, N=8, N=16:
5290
- HWY_API Vec16<uint8_t> SumOfLanes(Vec16<uint8_t> v) {
6595
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
6596
+ HWY_API Vec32<uint8_t> SumOfLanes(D du8, Vec32<uint8_t> v) {
5291
6597
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5292
- const Full16<uint8_t> du8;
5293
- const Full16<uint16_t> du16;
5294
- const Twice<decltype(du8)> dt_u8;
5295
- const Twice<decltype(du16)> dt_u16;
5296
- const Full128<uint32_t> du32;
5297
- return LowerHalf(Broadcast<kSumLaneIdx>(AltivecVsum4ubs(
5298
- dt_u8, BitCast(dt_u8, Combine(dt_u16, Zero(du16), BitCast(du16, v))).raw,
5299
- Zero(du32).raw)));
6598
+ return Broadcast<kSumLaneIdx>(
6599
+ BitCast(du8, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v)));
5300
6600
  }
5301
6601
 
5302
- HWY_API Vec32<uint8_t> SumOfLanes(Vec32<uint8_t> v) {
5303
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5304
- const Full128<uint32_t> du32;
5305
- const Full32<uint8_t> du8;
5306
- return Broadcast<kSumLaneIdx>(AltivecVsum4ubs(du8, v.raw, Zero(du32).raw));
6602
+ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
6603
+ HWY_API Vec16<uint8_t> SumOfLanes(D du8, Vec16<uint8_t> v) {
6604
+ const Twice<decltype(du8)> dt_u8;
6605
+ return LowerHalf(du8, SumOfLanes(dt_u8, Combine(dt_u8, Zero(du8), v)));
5307
6606
  }
5308
6607
 
5309
- HWY_API Vec64<uint8_t> SumOfLanes(Vec64<uint8_t> v) {
6608
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
6609
+ HWY_API Vec64<uint8_t> SumOfLanes(D du8, Vec64<uint8_t> v) {
5310
6610
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
5311
- const Full64<uint8_t> du8;
5312
6611
  return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v)));
5313
6612
  }
5314
6613
 
5315
- HWY_API Vec128<uint8_t> SumOfLanes(Vec128<uint8_t> v) {
6614
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
6615
+ HWY_API Vec128<uint8_t> SumOfLanes(D du8, Vec128<uint8_t> v) {
5316
6616
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
5317
6617
 
6618
+ #if HWY_S390X_HAVE_Z14
6619
+ return Broadcast<kSumLaneIdx>(
6620
+ BitCast(du8, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
6621
+ hwy::UnsignedTag(), hwy::SizeTag<1>(), v))));
6622
+ #else
5318
6623
  const Full128<uint32_t> du32;
5319
6624
  const RebindToSigned<decltype(du32)> di32;
5320
- const Full128<uint8_t> du8;
5321
6625
  const Vec128<uint32_t> zero = Zero(du32);
5322
- return Broadcast<kSumLaneIdx>(
5323
- AltivecVsumsws(du8, AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
5324
- BitCast(di32, zero).raw));
6626
+ return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6627
+ du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
6628
+ BitCast(di32, zero).raw));
6629
+ #endif
5325
6630
  }
5326
6631
 
5327
- HWY_API Vec16<int8_t> SumOfLanes(Vec16<int8_t> v) {
6632
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
6633
+ HWY_API Vec32<int8_t> SumOfLanes(D di8, Vec32<int8_t> v) {
6634
+ #if HWY_S390X_HAVE_Z14
6635
+ const RebindToUnsigned<decltype(di8)> du8;
6636
+ return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
6637
+ #else
5328
6638
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5329
-
5330
- const Full128<uint16_t> du16;
5331
- const Repartition<int32_t, decltype(du16)> di32;
5332
- const Repartition<int8_t, decltype(du16)> di8;
5333
- const Vec128<int8_t> zzvv = BitCast(
5334
- di8, InterleaveLower(BitCast(du16, Vec128<int8_t>{v.raw}), Zero(du16)));
5335
- return Vec16<int8_t>{
5336
- Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, zzvv.raw, Zero(di32).raw))
5337
- .raw};
6639
+ return Broadcast<kSumLaneIdx>(
6640
+ BitCast(di8, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<1>(), v)));
6641
+ #endif
5338
6642
  }
5339
6643
 
5340
- HWY_API Vec32<int8_t> SumOfLanes(Vec32<int8_t> v) {
5341
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
5342
- const Full32<int8_t> di8;
5343
- const Vec128<int32_t> zero = Zero(Full128<int32_t>());
5344
- return Broadcast<kSumLaneIdx>(AltivecVsum4sbs(di8, v.raw, zero.raw));
6644
+ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
6645
+ HWY_API Vec16<int8_t> SumOfLanes(D di8, Vec16<int8_t> v) {
6646
+ const Twice<decltype(di8)> dt_i8;
6647
+ return LowerHalf(di8, SumOfLanes(dt_i8, Combine(dt_i8, Zero(di8), v)));
5345
6648
  }
5346
6649
 
5347
- HWY_API Vec64<int8_t> SumOfLanes(Vec64<int8_t> v) {
6650
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
6651
+ HWY_API Vec64<int8_t> SumOfLanes(D di8, Vec64<int8_t> v) {
6652
+ #if HWY_S390X_HAVE_Z14
6653
+ const RebindToUnsigned<decltype(di8)> du8;
6654
+ return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
6655
+ #else
5348
6656
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
5349
- const Full128<int32_t> di32;
5350
- const Vec128<int32_t> zero = Zero(di32);
5351
- const Full64<int8_t> di8;
5352
- return Broadcast<kSumLaneIdx>(AltivecVsum2sws(
5353
- di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
6657
+ return Broadcast<kSumLaneIdx>(BitCast(di8, SumsOf8(v)));
6658
+ #endif
5354
6659
  }
5355
6660
 
5356
- HWY_API Vec128<int8_t> SumOfLanes(Vec128<int8_t> v) {
6661
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
6662
+ HWY_API Vec128<int8_t> SumOfLanes(D di8, Vec128<int8_t> v) {
6663
+ #if HWY_S390X_HAVE_Z14
6664
+ const RebindToUnsigned<decltype(di8)> du8;
6665
+ return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
6666
+ #else
5357
6667
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
5358
- const Full128<int8_t> di8;
5359
6668
  const Full128<int32_t> di32;
5360
6669
  const Vec128<int32_t> zero = Zero(di32);
5361
- return Broadcast<kSumLaneIdx>(AltivecVsumsws(
5362
- di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
5363
- }
5364
-
5365
- template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
5366
- HWY_API Vec128<uint8_t, N> MaxOfLanes(Vec128<uint8_t, N> v) {
5367
- const DFromV<decltype(v)> d;
5368
- const RepartitionToWide<decltype(d)> d16;
5369
- const RepartitionToWide<decltype(d16)> d32;
5370
- Vec128<uint8_t, N> vm = Max(v, Reverse2(d, v));
5371
- vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
5372
- vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
5373
- if (N > 8) {
5374
- const RepartitionToWide<decltype(d32)> d64;
5375
- vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
5376
- }
5377
- return vm;
6670
+ return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
6671
+ di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
6672
+ #endif
5378
6673
  }
5379
6674
 
5380
- template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
5381
- HWY_API Vec128<uint8_t, N> MinOfLanes(Vec128<uint8_t, N> v) {
5382
- const DFromV<decltype(v)> d;
5383
- const RepartitionToWide<decltype(d)> d16;
5384
- const RepartitionToWide<decltype(d16)> d32;
5385
- Vec128<uint8_t, N> vm = Min(v, Reverse2(d, v));
5386
- vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
5387
- vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
5388
- if (N > 8) {
5389
- const RepartitionToWide<decltype(d32)> d64;
5390
- vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
5391
- }
5392
- return vm;
6675
+ #if HWY_S390X_HAVE_Z14
6676
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
6677
+ HWY_API VFromD<D> SumOfLanes(D d32, VFromD<D> v) {
6678
+ const RebindToUnsigned<decltype(d32)> du32;
6679
+ return Broadcast<1>(
6680
+ BitCast(d32, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
6681
+ BitCast(du32, v))));
5393
6682
  }
5394
6683
 
5395
- template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
5396
- HWY_API Vec128<int8_t, N> MaxOfLanes(Vec128<int8_t, N> v) {
5397
- const DFromV<decltype(v)> d;
5398
- const RepartitionToWide<decltype(d)> d16;
5399
- const RepartitionToWide<decltype(d16)> d32;
5400
- Vec128<int8_t, N> vm = Max(v, Reverse2(d, v));
5401
- vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
5402
- vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
5403
- if (N > 8) {
5404
- const RepartitionToWide<decltype(d32)> d64;
5405
- vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
5406
- }
5407
- return vm;
6684
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
6685
+ HWY_API VFromD<D> SumOfLanes(D /*d32*/, VFromD<D> v) {
6686
+ return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v));
5408
6687
  }
6688
+ #endif
5409
6689
 
5410
- template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
5411
- HWY_API Vec128<int8_t, N> MinOfLanes(Vec128<int8_t, N> v) {
5412
- const DFromV<decltype(v)> d;
5413
- const RepartitionToWide<decltype(d)> d16;
5414
- const RepartitionToWide<decltype(d16)> d32;
5415
- Vec128<int8_t, N> vm = Min(v, Reverse2(d, v));
5416
- vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
5417
- vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
5418
- if (N > 8) {
5419
- const RepartitionToWide<decltype(d32)> d64;
5420
- vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
5421
- }
5422
- return vm;
5423
- }
6690
+ // generic_ops defines MinOfLanes and MaxOfLanes.
5424
6691
 
5425
- template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)>
5426
- HWY_API Vec128<uint16_t, N> MinOfLanes(Vec128<uint16_t, N> v) {
5427
- const Simd<uint16_t, N, 0> d;
5428
- const RepartitionToWide<decltype(d)> d32;
5429
- #if HWY_IS_LITTLE_ENDIAN
5430
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5431
- const auto odd = ShiftRight<16>(BitCast(d32, v));
5432
- #else
5433
- const auto even = ShiftRight<16>(BitCast(d32, v));
5434
- const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
5435
- #endif
5436
- const auto min = MinOfLanes(Min(even, odd));
5437
- // Also broadcast into odd lanes on little-endian and into even lanes
5438
- // on big-endian
5439
- return Vec128<uint16_t, N>{vec_pack(min.raw, min.raw)};
5440
- }
5441
- template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)>
5442
- HWY_API Vec128<int16_t, N> MinOfLanes(Vec128<int16_t, N> v) {
5443
- const Simd<int16_t, N, 0> d;
5444
- const RepartitionToWide<decltype(d)> d32;
5445
- // Sign-extend
5446
- #if HWY_IS_LITTLE_ENDIAN
5447
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5448
- const auto odd = ShiftRight<16>(BitCast(d32, v));
5449
- #else
5450
- const auto even = ShiftRight<16>(BitCast(d32, v));
5451
- const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5452
- #endif
5453
- const auto min = MinOfLanes(Min(even, odd));
5454
- // Also broadcast into odd lanes on little-endian and into even lanes
5455
- // on big-endian
5456
- return Vec128<int16_t, N>{vec_pack(min.raw, min.raw)};
5457
- }
6692
+ // ------------------------------ ReduceSum for N=4 I8/U8
5458
6693
 
5459
- template <size_t N, HWY_IF_V_SIZE_GT(uint16_t, N, 2)>
5460
- HWY_API Vec128<uint16_t, N> MaxOfLanes(Vec128<uint16_t, N> v) {
5461
- const Simd<uint16_t, N, 0> d;
5462
- const RepartitionToWide<decltype(d)> d32;
5463
- #if HWY_IS_LITTLE_ENDIAN
5464
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5465
- const auto odd = ShiftRight<16>(BitCast(d32, v));
5466
- #else
5467
- const auto even = ShiftRight<16>(BitCast(d32, v));
5468
- const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF));
5469
- #endif
5470
- const auto max = MaxOfLanes(Max(even, odd));
5471
- // Also broadcast into odd lanes.
5472
- return Vec128<uint16_t, N>{vec_pack(max.raw, max.raw)};
5473
- }
5474
- template <size_t N, HWY_IF_V_SIZE_GT(int16_t, N, 2)>
5475
- HWY_API Vec128<int16_t, N> MaxOfLanes(Vec128<int16_t, N> v) {
5476
- const Simd<int16_t, N, 0> d;
5477
- const RepartitionToWide<decltype(d)> d32;
5478
- // Sign-extend
5479
- #if HWY_IS_LITTLE_ENDIAN
5480
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
5481
- const auto odd = ShiftRight<16>(BitCast(d32, v));
6694
+ // GetLane(SumsOf4(v)) is more efficient on PPC/Z14 than the default N=4
6695
+ // I8/U8 ReduceSum implementation in generic_ops-inl.h
6696
+ #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
6697
+ #undef HWY_NATIVE_REDUCE_SUM_4_UI8
5482
6698
  #else
5483
- const auto even = ShiftRight<16>(BitCast(d32, v));
5484
- const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
6699
+ #define HWY_NATIVE_REDUCE_SUM_4_UI8
5485
6700
  #endif
5486
- const auto max = MaxOfLanes(Max(even, odd));
5487
- // Also broadcast into odd lanes on little-endian and into even lanes
5488
- // on big-endian
5489
- return Vec128<int16_t, N>{vec_pack(max.raw, max.raw)};
5490
- }
5491
-
5492
- } // namespace detail
5493
6701
 
5494
- // Supported for u/i/f 32/64. Returns the same value in each lane.
5495
- template <class D>
5496
- HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
5497
- return detail::SumOfLanes(v);
5498
- }
5499
- template <class D>
5500
- HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
5501
- return GetLane(detail::SumOfLanes(v));
5502
- }
5503
- template <class D>
5504
- HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
5505
- return detail::MinOfLanes(v);
5506
- }
5507
- template <class D>
5508
- HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
5509
- return detail::MaxOfLanes(v);
6702
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
6703
+ HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
6704
+ return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
5510
6705
  }
5511
6706
 
5512
6707
  // ------------------------------ Lt128
@@ -5672,7 +6867,20 @@ HWY_API V Max128Upper(D d, const V a, const V b) {
5672
6867
 
5673
6868
  template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5674
6869
  HWY_API V LeadingZeroCount(V v) {
6870
+ #if HWY_S390X_HAVE_Z14
6871
+ const DFromV<decltype(v)> d;
6872
+ const RebindToUnsigned<decltype(d)> du;
6873
+
6874
+ #if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
6875
+ // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
6876
+ // constant
6877
+ __asm__("" : "+v"(v.raw));
6878
+ #endif
6879
+
6880
+ return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
6881
+ #else
5675
6882
  return V{vec_cntlz(v.raw)};
6883
+ #endif
5676
6884
  }
5677
6885
 
5678
6886
  template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
@@ -5682,14 +6890,27 @@ HWY_API V HighestSetBitIndex(V v) {
5682
6890
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
5683
6891
  }
5684
6892
 
5685
- #if HWY_PPC_HAVE_9
6893
+ #if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
5686
6894
  template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5687
6895
  HWY_API V TrailingZeroCount(V v) {
5688
6896
  #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
5689
6897
  return V{vec_vctz(v.raw)};
5690
6898
  #else
5691
- return V{vec_cnttz(v.raw)};
6899
+ #if HWY_S390X_HAVE_Z14
6900
+ const DFromV<decltype(v)> d;
6901
+ const RebindToUnsigned<decltype(d)> du;
6902
+
6903
+ #if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
6904
+ // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
6905
+ // constant
6906
+ __asm__("" : "+v"(v.raw));
5692
6907
  #endif
6908
+
6909
+ return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
6910
+ #else
6911
+ return V{vec_cnttz(v.raw)};
6912
+ #endif // HWY_S390X_HAVE_Z14
6913
+ #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
5693
6914
  }
5694
6915
  #else
5695
6916
  template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
@@ -5709,6 +6930,8 @@ HWY_API V TrailingZeroCount(V v) {
5709
6930
 
5710
6931
  #undef HWY_PPC_HAVE_9
5711
6932
  #undef HWY_PPC_HAVE_10
6933
+ #undef HWY_S390X_HAVE_Z14
6934
+ #undef HWY_S390X_HAVE_Z15
5712
6935
 
5713
6936
  // NOLINTNEXTLINE(google-readability-namespace-comments)
5714
6937
  } // namespace HWY_NAMESPACE