@img/sharp-libvips-dev 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230) hide show
  1. package/include/aom/aom_encoder.h +3 -3
  2. package/include/aom/aomcx.h +17 -8
  3. package/include/expat.h +21 -10
  4. package/include/expat_config.h +11 -5
  5. package/include/ffi.h +12 -25
  6. package/include/fontconfig/fontconfig.h +5 -3
  7. package/include/freetype2/freetype/config/ftoption.h +1 -1
  8. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
  9. package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
  10. package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
  11. package/include/glib-2.0/gio/gappinfo.h +0 -7
  12. package/include/glib-2.0/gio/gapplication.h +6 -0
  13. package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
  14. package/include/glib-2.0/gio/gasyncinitable.h +0 -7
  15. package/include/glib-2.0/gio/gasyncresult.h +0 -6
  16. package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
  17. package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
  18. package/include/glib-2.0/gio/gbytesicon.h +0 -5
  19. package/include/glib-2.0/gio/gcancellable.h +0 -5
  20. package/include/glib-2.0/gio/gconverter.h +0 -7
  21. package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
  22. package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
  23. package/include/glib-2.0/gio/gdatagrambased.h +0 -7
  24. package/include/glib-2.0/gio/gdatainputstream.h +0 -6
  25. package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
  26. package/include/glib-2.0/gio/gdbusinterface.h +0 -8
  27. package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
  28. package/include/glib-2.0/gio/gdbusmessage.h +2 -1
  29. package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
  30. package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
  31. package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
  32. package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
  33. package/include/glib-2.0/gio/gdbusproxy.h +0 -8
  34. package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
  35. package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
  36. package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
  37. package/include/glib-2.0/gio/gemblem.h +0 -5
  38. package/include/glib-2.0/gio/gemblemedicon.h +0 -5
  39. package/include/glib-2.0/gio/gfile.h +0 -10
  40. package/include/glib-2.0/gio/gfileenumerator.h +0 -5
  41. package/include/glib-2.0/gio/gfileicon.h +0 -5
  42. package/include/glib-2.0/gio/gfileinfo.h +0 -5
  43. package/include/glib-2.0/gio/gfileinputstream.h +0 -8
  44. package/include/glib-2.0/gio/gfileiostream.h +0 -8
  45. package/include/glib-2.0/gio/gfilemonitor.h +0 -5
  46. package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
  47. package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
  48. package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
  49. package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
  50. package/include/glib-2.0/gio/gicon.h +0 -5
  51. package/include/glib-2.0/gio/ginitable.h +0 -7
  52. package/include/glib-2.0/gio/ginputstream.h +0 -5
  53. package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
  54. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  55. package/include/glib-2.0/gio/gioenums.h +6 -1
  56. package/include/glib-2.0/gio/giomodule.h +0 -5
  57. package/include/glib-2.0/gio/giostream.h +0 -5
  58. package/include/glib-2.0/gio/giotypes.h +5 -108
  59. package/include/glib-2.0/gio/gloadableicon.h +0 -6
  60. package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
  61. package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
  62. package/include/glib-2.0/gio/gmountoperation.h +0 -6
  63. package/include/glib-2.0/gio/gnetworking.h +4 -0
  64. package/include/glib-2.0/gio/goutputstream.h +0 -9
  65. package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
  66. package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
  67. package/include/glib-2.0/gio/gproxy.h +0 -7
  68. package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
  69. package/include/glib-2.0/gio/gseekable.h +0 -5
  70. package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
  71. package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
  72. package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
  73. package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
  74. package/include/glib-2.0/gio/gsocket.h +13 -0
  75. package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
  76. package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
  77. package/include/glib-2.0/gio/gtask.h +12 -0
  78. package/include/glib-2.0/gio/gthemedicon.h +0 -5
  79. package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
  80. package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
  81. package/include/glib-2.0/gio/gvfs.h +0 -5
  82. package/include/glib-2.0/gio/gvolume.h +2 -2
  83. package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
  84. package/include/glib-2.0/girepository/gi-visibility.h +986 -0
  85. package/include/glib-2.0/girepository/giarginfo.h +100 -0
  86. package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
  87. package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
  88. package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
  89. package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
  90. package/include/glib-2.0/girepository/gienuminfo.h +82 -0
  91. package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
  92. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  93. package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
  94. package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
  95. package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
  96. package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
  97. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
  98. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  99. package/include/glib-2.0/girepository/girepository.h +247 -0
  100. package/include/glib-2.0/girepository/girffi.h +129 -0
  101. package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
  102. package/include/glib-2.0/girepository/gistructinfo.h +102 -0
  103. package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
  104. package/include/glib-2.0/girepository/gitypelib.h +61 -0
  105. package/include/glib-2.0/girepository/gitypes.h +421 -0
  106. package/include/glib-2.0/girepository/giunioninfo.h +105 -0
  107. package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
  108. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  109. package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
  110. package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
  111. package/include/glib-2.0/glib/deprecated/grel.h +0 -23
  112. package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
  113. package/include/glib-2.0/glib/gatomic.h +20 -20
  114. package/include/glib-2.0/glib/gbitlock.h +31 -0
  115. package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
  116. package/include/glib-2.0/glib/gchecksum.h +0 -10
  117. package/include/glib-2.0/glib/gdate.h +0 -9
  118. package/include/glib-2.0/glib/gdatetime.h +33 -1
  119. package/include/glib-2.0/glib/gdir.h +5 -0
  120. package/include/glib-2.0/glib/ghmac.h +0 -9
  121. package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
  122. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  123. package/include/glib-2.0/glib/gmacros.h +1 -0
  124. package/include/glib-2.0/glib/gmessages.h +11 -0
  125. package/include/glib-2.0/glib/gpathbuf.h +0 -7
  126. package/include/glib-2.0/glib/gslice.h +2 -0
  127. package/include/glib-2.0/glib/gstdio.h +1 -1
  128. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  129. package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
  130. package/include/glib-2.0/glib/gtestutils.h +5 -0
  131. package/include/glib-2.0/glib/gthread.h +216 -3
  132. package/include/glib-2.0/glib/gunicode.h +12 -2
  133. package/include/glib-2.0/glib/gvarianttype.h +1 -10
  134. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  135. package/include/glib-2.0/glib/gwin32.h +4 -4
  136. package/include/glib-2.0/glib-unix.h +214 -0
  137. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  138. package/include/glib-2.0/gobject/gbinding.h +0 -8
  139. package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
  140. package/include/glib-2.0/gobject/gclosure.h +1 -9
  141. package/include/glib-2.0/gobject/genums.h +6 -6
  142. package/include/glib-2.0/gobject/glib-types.h +44 -0
  143. package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
  144. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  145. package/include/glib-2.0/gobject/gobject.h +1 -16
  146. package/include/glib-2.0/gobject/gparam.h +3 -12
  147. package/include/glib-2.0/gobject/gsignal.h +16 -6
  148. package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
  149. package/include/glib-2.0/gobject/gtype.h +53 -20
  150. package/include/glib-2.0/gobject/gtypemodule.h +0 -7
  151. package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
  152. package/include/glib-2.0/gobject/gvaluearray.h +0 -7
  153. package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
  154. package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
  155. package/include/hwy/aligned_allocator.h +171 -6
  156. package/include/hwy/base.h +1765 -543
  157. package/include/hwy/cache_control.h +24 -6
  158. package/include/hwy/detect_compiler_arch.h +23 -2
  159. package/include/hwy/detect_targets.h +56 -13
  160. package/include/hwy/foreach_target.h +24 -0
  161. package/include/hwy/highway.h +20 -3
  162. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  163. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  164. package/include/hwy/ops/emu128-inl.h +271 -196
  165. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  166. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  167. package/include/hwy/ops/rvv-inl.h +1043 -311
  168. package/include/hwy/ops/scalar-inl.h +189 -159
  169. package/include/hwy/ops/set_macros-inl.h +66 -6
  170. package/include/hwy/ops/shared-inl.h +175 -56
  171. package/include/hwy/ops/wasm_128-inl.h +153 -136
  172. package/include/hwy/ops/x86_128-inl.h +1647 -646
  173. package/include/hwy/ops/x86_256-inl.h +1003 -370
  174. package/include/hwy/ops/x86_512-inl.h +948 -353
  175. package/include/hwy/per_target.h +4 -0
  176. package/include/hwy/profiler.h +648 -0
  177. package/include/hwy/robust_statistics.h +2 -2
  178. package/include/hwy/targets.h +18 -11
  179. package/include/hwy/timer.h +11 -0
  180. package/include/lcms2.h +46 -7
  181. package/include/lcms2_plugin.h +4 -4
  182. package/include/libheif/heif_version.h +2 -2
  183. package/include/libpng16/png.h +32 -29
  184. package/include/libpng16/pngconf.h +2 -2
  185. package/include/libpng16/pnglibconf.h +7 -2
  186. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  187. package/include/libxml2/libxml/HTMLparser.h +23 -0
  188. package/include/libxml2/libxml/SAX.h +0 -2
  189. package/include/libxml2/libxml/SAX2.h +0 -2
  190. package/include/libxml2/libxml/c14n.h +0 -2
  191. package/include/libxml2/libxml/dict.h +1 -0
  192. package/include/libxml2/libxml/encoding.h +16 -14
  193. package/include/libxml2/libxml/entities.h +4 -0
  194. package/include/libxml2/libxml/globals.h +15 -503
  195. package/include/libxml2/libxml/hash.h +57 -61
  196. package/include/libxml2/libxml/nanoftp.h +2 -2
  197. package/include/libxml2/libxml/parser.h +137 -18
  198. package/include/libxml2/libxml/parserInternals.h +1 -0
  199. package/include/libxml2/libxml/relaxng.h +2 -1
  200. package/include/libxml2/libxml/schemasInternals.h +1 -0
  201. package/include/libxml2/libxml/schematron.h +1 -0
  202. package/include/libxml2/libxml/threads.h +4 -11
  203. package/include/libxml2/libxml/tree.h +68 -20
  204. package/include/libxml2/libxml/uri.h +2 -1
  205. package/include/libxml2/libxml/valid.h +2 -0
  206. package/include/libxml2/libxml/xmlIO.h +65 -13
  207. package/include/libxml2/libxml/xmlerror.h +37 -8
  208. package/include/libxml2/libxml/xmlmemory.h +37 -40
  209. package/include/libxml2/libxml/xmlreader.h +6 -0
  210. package/include/libxml2/libxml/xmlregexp.h +2 -9
  211. package/include/libxml2/libxml/xmlsave.h +9 -0
  212. package/include/libxml2/libxml/xmlschemas.h +3 -0
  213. package/include/libxml2/libxml/xmlversion.h +28 -43
  214. package/include/libxml2/libxml/xpath.h +1 -1
  215. package/include/libxml2/libxml/xpathInternals.h +2 -1
  216. package/include/libxml2/libxml/xpointer.h +5 -4
  217. package/include/pango-1.0/pango/pango-features.h +3 -3
  218. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  219. package/include/pixman-1/pixman-version.h +3 -3
  220. package/include/pixman-1/pixman.h +9 -2
  221. package/include/png.h +32 -29
  222. package/include/pngconf.h +2 -2
  223. package/include/pnglibconf.h +7 -2
  224. package/include/vips/connection.h +9 -3
  225. package/include/vips/util.h +0 -9
  226. package/include/vips/version.h +4 -4
  227. package/include/zconf.h +3 -0
  228. package/include/zlib.h +3 -3
  229. package/package.json +1 -1
  230. package/versions.json +15 -15
@@ -17,6 +17,9 @@
17
17
 
18
18
  // Target-independent types/functions defined after target-specific ops.
19
19
 
20
+ // The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
21
+ // the generic implementation here if native ops are already defined.
22
+
20
23
  #include "hwy/base.h"
21
24
 
22
25
  // Define detail::Shuffle1230 etc, but only when viewing the current header;
@@ -194,6 +197,21 @@ HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
194
197
  #endif
195
198
  }
196
199
 
200
+ // ------------------------------ MaskFalse
201
+ #if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
202
+ #ifdef HWY_NATIVE_MASK_FALSE
203
+ #undef HWY_NATIVE_MASK_FALSE
204
+ #else
205
+ #define HWY_NATIVE_MASK_FALSE
206
+ #endif
207
+
208
+ template <class D>
209
+ HWY_API Mask<D> MaskFalse(D d) {
210
+ return MaskFromVec(Zero(d));
211
+ }
212
+
213
+ #endif // HWY_NATIVE_MASK_FALSE
214
+
197
215
  // ------------------------------ BitwiseIfThenElse
198
216
  #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
199
217
  #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
@@ -209,9 +227,634 @@ HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
209
227
 
210
228
  #endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE
211
229
 
212
- // "Include guard": skip if native instructions are available. The generic
213
- // implementation is currently shared between x86_* and wasm_*, and is too large
214
- // to duplicate.
230
+ // ------------------------------ PromoteMaskTo
231
+
232
+ #if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
233
+ #ifdef HWY_NATIVE_PROMOTE_MASK_TO
234
+ #undef HWY_NATIVE_PROMOTE_MASK_TO
235
+ #else
236
+ #define HWY_NATIVE_PROMOTE_MASK_TO
237
+ #endif
238
+
239
+ template <class DTo, class DFrom>
240
+ HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
241
+ static_assert(
242
+ sizeof(TFromD<DTo>) > sizeof(TFromD<DFrom>),
243
+ "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
244
+ static_assert(
245
+ IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
246
+ "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
247
+
248
+ const RebindToSigned<decltype(d_to)> di_to;
249
+ const RebindToSigned<decltype(d_from)> di_from;
250
+
251
+ return MaskFromVec(BitCast(
252
+ d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
253
+ }
254
+
255
+ #endif // HWY_NATIVE_PROMOTE_MASK_TO
256
+
257
+ // ------------------------------ DemoteMaskTo
258
+
259
+ #if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
260
+ #ifdef HWY_NATIVE_DEMOTE_MASK_TO
261
+ #undef HWY_NATIVE_DEMOTE_MASK_TO
262
+ #else
263
+ #define HWY_NATIVE_DEMOTE_MASK_TO
264
+ #endif
265
+
266
+ template <class DTo, class DFrom>
267
+ HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
268
+ static_assert(sizeof(TFromD<DTo>) < sizeof(TFromD<DFrom>),
269
+ "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
270
+ static_assert(
271
+ IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
272
+ "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
273
+
274
+ const RebindToSigned<decltype(d_to)> di_to;
275
+ const RebindToSigned<decltype(d_from)> di_from;
276
+
277
+ return MaskFromVec(
278
+ BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
279
+ }
280
+
281
+ #endif // HWY_NATIVE_DEMOTE_MASK_TO
282
+
283
+ // ------------------------------ CombineMasks
284
+
285
+ #if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
286
+ #ifdef HWY_NATIVE_COMBINE_MASKS
287
+ #undef HWY_NATIVE_COMBINE_MASKS
288
+ #else
289
+ #define HWY_NATIVE_COMBINE_MASKS
290
+ #endif
291
+
292
+ #if HWY_TARGET != HWY_SCALAR
293
+ template <class D>
294
+ HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
295
+ const Half<decltype(d)> dh;
296
+ return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo)));
297
+ }
298
+ #endif
299
+
300
+ #endif // HWY_NATIVE_COMBINE_MASKS
301
+
302
+ // ------------------------------ LowerHalfOfMask
303
+
304
+ #if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
305
+ #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
306
+ #undef HWY_NATIVE_LOWER_HALF_OF_MASK
307
+ #else
308
+ #define HWY_NATIVE_LOWER_HALF_OF_MASK
309
+ #endif
310
+
311
+ template <class D>
312
+ HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
313
+ const Twice<decltype(d)> dt;
314
+ return MaskFromVec(LowerHalf(d, VecFromMask(dt, m)));
315
+ }
316
+
317
+ #endif // HWY_NATIVE_LOWER_HALF_OF_MASK
318
+
319
+ // ------------------------------ UpperHalfOfMask
320
+
321
+ #if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
322
+ #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
323
+ #undef HWY_NATIVE_UPPER_HALF_OF_MASK
324
+ #else
325
+ #define HWY_NATIVE_UPPER_HALF_OF_MASK
326
+ #endif
327
+
328
+ #if HWY_TARGET != HWY_SCALAR
329
+ template <class D>
330
+ HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
331
+ const Twice<decltype(d)> dt;
332
+ return MaskFromVec(UpperHalf(d, VecFromMask(dt, m)));
333
+ }
334
+ #endif
335
+
336
+ #endif // HWY_NATIVE_UPPER_HALF_OF_MASK
337
+
338
+ // ------------------------------ OrderedDemote2MasksTo
339
+
340
+ #if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
341
+ defined(HWY_TARGET_TOGGLE))
342
+ #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
343
+ #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
344
+ #else
345
+ #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
346
+ #endif
347
+
348
+ #if HWY_TARGET != HWY_SCALAR
349
+ template <class DTo, class DFrom>
350
+ HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
351
+ Mask<DFrom> b) {
352
+ static_assert(
353
+ sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2,
354
+ "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
355
+ static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
356
+ "Mask<DTo> must be the same type as "
357
+ "Mask<Repartition<TFromD<DTo>, DFrom>>>()");
358
+
359
+ const RebindToSigned<decltype(d_from)> di_from;
360
+ const RebindToSigned<decltype(d_to)> di_to;
361
+
362
+ const auto va = BitCast(di_from, VecFromMask(d_from, a));
363
+ const auto vb = BitCast(di_from, VecFromMask(d_from, b));
364
+ return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)));
365
+ }
366
+ #endif
367
+
368
+ #endif // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
369
+
370
+ // ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
371
+ #if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
372
+ #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
373
+ #undef HWY_NATIVE_INTERLEAVE_WHOLE
374
+ #else
375
+ #define HWY_NATIVE_INTERLEAVE_WHOLE
376
+ #endif
377
+
378
+ #if HWY_TARGET != HWY_SCALAR
379
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
380
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
381
+ // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
382
+ // D().MaxBytes() <= 16 is true
383
+ return InterleaveLower(d, a, b);
384
+ }
385
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
386
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
387
+ // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if
388
+ // D().MaxBytes() <= 16 is true
389
+ return InterleaveUpper(d, a, b);
390
+ }
391
+
392
+ // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3
393
+ // is implemented in x86_256-inl.h.
394
+
395
+ // InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is
396
+ // implemented in x86_512-inl.h.
397
+
398
+ // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256
399
+ // is implemented in wasm_256-inl.h.
400
+ #endif // HWY_TARGET != HWY_SCALAR
401
+
402
+ #endif // HWY_NATIVE_INTERLEAVE_WHOLE
403
+
404
+ #if HWY_TARGET != HWY_SCALAR
405
+ // The InterleaveWholeLower without the optional D parameter is generic for all
406
+ // vector lengths.
407
+ template <class V>
408
+ HWY_API V InterleaveWholeLower(V a, V b) {
409
+ return InterleaveWholeLower(DFromV<V>(), a, b);
410
+ }
411
+ #endif // HWY_TARGET != HWY_SCALAR
412
+
413
+ // ------------------------------ AddSub
414
+
415
+ template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
416
+ HWY_API V AddSub(V a, V b) {
417
+ // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b)
418
+ return Sub(a, b);
419
+ }
420
+
421
+ // AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on
422
+ // SSSE3/SSE4/AVX2/AVX3
423
+
424
+ // AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
425
+ // AVX2/AVX3
426
+ template <class V, HWY_IF_V_SIZE_GT_V(V, ((HWY_TARGET <= HWY_SSSE3 &&
427
+ hwy::IsFloat3264<TFromV<V>>())
428
+ ? 32
429
+ : sizeof(TFromV<V>)))>
430
+ HWY_API V AddSub(V a, V b) {
431
+ using D = DFromV<decltype(a)>;
432
+ using T = TFromD<D>;
433
+ using TNegate = If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;
434
+
435
+ const D d;
436
+ const Rebind<TNegate, D> d_negate;
437
+
438
+ // Negate the even lanes of b
439
+ const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));
440
+
441
+ return Add(a, negated_even_b);
442
+ }
443
+
444
+ // ------------------------------ MaskedAddOr etc.
445
+ #if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
446
+ #ifdef HWY_NATIVE_MASKED_ARITH
447
+ #undef HWY_NATIVE_MASKED_ARITH
448
+ #else
449
+ #define HWY_NATIVE_MASKED_ARITH
450
+ #endif
451
+
452
+ template <class V, class M>
453
+ HWY_API V MaskedMinOr(V no, M m, V a, V b) {
454
+ return IfThenElse(m, Min(a, b), no);
455
+ }
456
+
457
+ template <class V, class M>
458
+ HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
459
+ return IfThenElse(m, Max(a, b), no);
460
+ }
461
+
462
+ template <class V, class M>
463
+ HWY_API V MaskedAddOr(V no, M m, V a, V b) {
464
+ return IfThenElse(m, Add(a, b), no);
465
+ }
466
+
467
+ template <class V, class M>
468
+ HWY_API V MaskedSubOr(V no, M m, V a, V b) {
469
+ return IfThenElse(m, Sub(a, b), no);
470
+ }
471
+
472
+ template <class V, class M>
473
+ HWY_API V MaskedMulOr(V no, M m, V a, V b) {
474
+ return IfThenElse(m, Mul(a, b), no);
475
+ }
476
+
477
+ template <class V, class M>
478
+ HWY_API V MaskedDivOr(V no, M m, V a, V b) {
479
+ return IfThenElse(m, Div(a, b), no);
480
+ }
481
+
482
+ template <class V, class M>
483
+ HWY_API V MaskedModOr(V no, M m, V a, V b) {
484
+ return IfThenElse(m, Mod(a, b), no);
485
+ }
486
+
487
+ template <class V, class M>
488
+ HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
489
+ return IfThenElse(m, SaturatedAdd(a, b), no);
490
+ }
491
+
492
+ template <class V, class M>
493
+ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
494
+ return IfThenElse(m, SaturatedSub(a, b), no);
495
+ }
496
+ #endif // HWY_NATIVE_MASKED_ARITH
497
+
498
+ // ------------------------------ IfNegativeThenNegOrUndefIfZero
499
+
500
+ #if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
501
+ defined(HWY_TARGET_TOGGLE))
502
+ #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
503
+ #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
504
+ #else
505
+ #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
506
+ #endif
507
+
508
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
509
+ HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
510
+ #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
511
+ // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
512
+ const auto zero = Zero(DFromV<V>());
513
+ return MaskedSubOr(v, Lt(mask, zero), zero, v);
514
+ #else
515
+ return IfNegativeThenElse(mask, Neg(v), v);
516
+ #endif
517
+ }
518
+
519
+ #endif // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
520
+
521
+ template <class V, HWY_IF_FLOAT_V(V)>
522
+ HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
523
+ return CopySign(v, Xor(mask, v));
524
+ }
525
+
526
+ // ------------------------------ SaturatedNeg
527
+
528
+ #if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
529
+ #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
530
+ #undef HWY_NATIVE_SATURATED_NEG_8_16_32
531
+ #else
532
+ #define HWY_NATIVE_SATURATED_NEG_8_16_32
533
+ #endif
534
+
535
+ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
536
+ HWY_IF_SIGNED_V(V)>
537
+ HWY_API V SaturatedNeg(V v) {
538
+ const DFromV<decltype(v)> d;
539
+ return SaturatedSub(Zero(d), v);
540
+ }
541
+
542
+ template <class V, HWY_IF_I32(TFromV<V>)>
543
+ HWY_API V SaturatedNeg(V v) {
544
+ const DFromV<decltype(v)> d;
545
+
546
+ #if HWY_TARGET == HWY_RVV || \
547
+ (HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \
548
+ (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
549
+ // RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions
550
+ return SaturatedSub(Zero(d), v);
551
+ #else
552
+ // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
553
+ // (v[i] > LimitsMin<int32_t>) ? (-v[i]) : LimitsMax<int32_t>() since
554
+ // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and
555
+ // ~LimitsMin<int32_t>() == LimitsMax<int32_t>().
556
+ return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin<int32_t>()))));
557
+ #endif
558
+ }
559
+ #endif // HWY_NATIVE_SATURATED_NEG_8_16_32
560
+
561
+ #if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
562
+ #ifdef HWY_NATIVE_SATURATED_NEG_64
563
+ #undef HWY_NATIVE_SATURATED_NEG_64
564
+ #else
565
+ #define HWY_NATIVE_SATURATED_NEG_64
566
+ #endif
567
+
568
+ template <class V, HWY_IF_I64(TFromV<V>)>
569
+ HWY_API V SaturatedNeg(V v) {
570
+ #if HWY_TARGET == HWY_RVV || \
571
+ (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
572
+ // RVV/NEON/SVE have native I64 SaturatedSub instructions
573
+ const DFromV<decltype(v)> d;
574
+ return SaturatedSub(Zero(d), v);
575
+ #else
576
+ const auto neg_v = Neg(v);
577
+ return Add(neg_v, BroadcastSignBit(And(v, neg_v)));
578
+ #endif
579
+ }
580
+ #endif // HWY_NATIVE_SATURATED_NEG_64
581
+
582
+ // ------------------------------ SaturatedAbs
583
+
584
+ #if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
585
+ #ifdef HWY_NATIVE_SATURATED_ABS
586
+ #undef HWY_NATIVE_SATURATED_ABS
587
+ #else
588
+ #define HWY_NATIVE_SATURATED_ABS
589
+ #endif
590
+
591
+ template <class V, HWY_IF_SIGNED_V(V)>
592
+ HWY_API V SaturatedAbs(V v) {
593
+ return Max(v, SaturatedNeg(v));
594
+ }
595
+
596
+ #endif
597
+
598
+ // ------------------------------ Reductions
599
+
600
+ // Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled,
601
+ // they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set.
602
+ // Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the
603
+ // SumOfLanes overloads. For the latter group, we here define the remaining
604
+ // overloads, plus ReduceSum which uses them plus GetLane.
605
+ #if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
606
+ #ifdef HWY_NATIVE_REDUCE_SCALAR
607
+ #undef HWY_NATIVE_REDUCE_SCALAR
608
+ #else
609
+ #define HWY_NATIVE_REDUCE_SCALAR
610
+ #endif
611
+
612
+ namespace detail {
613
+
614
+ // Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
615
+ struct AddFunc {
616
+ template <class V>
617
+ V operator()(V a, V b) const {
618
+ return Add(a, b);
619
+ }
620
+ };
621
+
622
+ struct MinFunc {
623
+ template <class V>
624
+ V operator()(V a, V b) const {
625
+ return Min(a, b);
626
+ }
627
+ };
628
+
629
+ struct MaxFunc {
630
+ template <class V>
631
+ V operator()(V a, V b) const {
632
+ return Max(a, b);
633
+ }
634
+ };
635
+
636
+ // No-op for vectors of at most one block.
637
+ template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
638
+ HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) {
639
+ return v;
640
+ }
641
+
642
+ // Reduces a lane with its counterpart in other block(s). Shared by AVX2 and
643
+ // WASM_EMU256. AVX3 has its own overload.
644
+ template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
645
+ HWY_INLINE VFromD<D> ReduceAcrossBlocks(D /*d*/, Func f, VFromD<D> v) {
646
+ return f(v, SwapAdjacentBlocks(v));
647
+ }
648
+
649
+ // These return the reduction result broadcasted across all lanes. They assume
650
+ // the caller has already reduced across blocks.
651
+
652
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
653
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) {
654
+ return f(v10, Reverse2(d, v10));
655
+ }
656
+
657
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
658
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) {
659
+ const VFromD<D> v0123 = Reverse4(d, v3210);
660
+ const VFromD<D> v03_12_12_03 = f(v3210, v0123);
661
+ const VFromD<D> v12_03_03_12 = Reverse2(d, v03_12_12_03);
662
+ return f(v03_12_12_03, v12_03_03_12);
663
+ }
664
+
665
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
666
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) {
667
+ // The upper half is reversed from the lower half; omit for brevity.
668
+ const VFromD<D> v34_25_16_07 = f(v76543210, Reverse8(d, v76543210));
669
+ const VFromD<D> v0347_1625_1625_0347 =
670
+ f(v34_25_16_07, Reverse4(d, v34_25_16_07));
671
+ return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
672
+ }
673
+
674
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
675
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
676
+ const RepartitionToWide<decltype(d)> dw;
677
+ using VW = VFromD<decltype(dw)>;
678
+ const VW vw = BitCast(dw, v);
679
+ // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
680
+ const VW even = And(vw, Set(dw, 0xFF));
681
+ const VW odd = ShiftRight<8>(vw);
682
+ const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
683
+ #if HWY_IS_LITTLE_ENDIAN
684
+ return DupEven(BitCast(d, reduced));
685
+ #else
686
+ return DupOdd(BitCast(d, reduced));
687
+ #endif
688
+ }
689
+
690
+ template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
691
+ HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
692
+ const RepartitionToWide<decltype(d)> dw;
693
+ using VW = VFromD<decltype(dw)>;
694
+ const VW vw = BitCast(dw, v);
695
+ // Sign-extend
696
+ // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
697
+ const VW even = ShiftRight<8>(ShiftLeft<8>(vw));
698
+ const VW odd = ShiftRight<8>(vw);
699
+ const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
700
+ #if HWY_IS_LITTLE_ENDIAN
701
+ return DupEven(BitCast(d, reduced));
702
+ #else
703
+ return DupOdd(BitCast(d, reduced));
704
+ #endif
705
+ }
706
+
707
+ } // namespace detail
708
+
709
+ template <class D, HWY_IF_SUM_OF_LANES_D(D)>
710
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
711
+ const detail::AddFunc f;
712
+ v = detail::ReduceAcrossBlocks(d, f, v);
713
+ return detail::ReduceWithinBlocks(d, f, v);
714
+ }
715
+ template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
716
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
717
+ const detail::MinFunc f;
718
+ v = detail::ReduceAcrossBlocks(d, f, v);
719
+ return detail::ReduceWithinBlocks(d, f, v);
720
+ }
721
+ template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
722
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
723
+ const detail::MaxFunc f;
724
+ v = detail::ReduceAcrossBlocks(d, f, v);
725
+ return detail::ReduceWithinBlocks(d, f, v);
726
+ }
727
+
728
+ template <class D, HWY_IF_REDUCE_D(D)>
729
+ HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
730
+ return GetLane(SumOfLanes(d, v));
731
+ }
732
+ template <class D, HWY_IF_REDUCE_D(D)>
733
+ HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
734
+ return GetLane(MinOfLanes(d, v));
735
+ }
736
+ template <class D, HWY_IF_REDUCE_D(D)>
737
+ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
738
+ return GetLane(MaxOfLanes(d, v));
739
+ }
740
+
741
+ #endif // HWY_NATIVE_REDUCE_SCALAR
742
+
743
+ // Corner cases for both generic and native implementations:
744
+ // N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm)
745
+ template <class D, HWY_IF_LANES_D(D, 1)>
746
+ HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
747
+ return GetLane(v);
748
+ }
749
+ template <class D, HWY_IF_LANES_D(D, 1)>
750
+ HWY_API TFromD<D> ReduceMin(D /*d*/, VFromD<D> v) {
751
+ return GetLane(v);
752
+ }
753
+ template <class D, HWY_IF_LANES_D(D, 1)>
754
+ HWY_API TFromD<D> ReduceMax(D /*d*/, VFromD<D> v) {
755
+ return GetLane(v);
756
+ }
757
+
758
+ template <class D, HWY_IF_LANES_D(D, 1)>
759
+ HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
760
+ return v;
761
+ }
762
+ template <class D, HWY_IF_LANES_D(D, 1)>
763
+ HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
764
+ return v;
765
+ }
766
+ template <class D, HWY_IF_LANES_D(D, 1)>
767
+ HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
768
+ return v;
769
+ }
770
+
771
+ // N=4 for 8-bit is still less than the minimum native size.
772
+
773
+ // ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8
774
+ // ReduceSum operations
775
+ #if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
776
+ #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
777
+ #undef HWY_NATIVE_REDUCE_SUM_4_UI8
778
+ #else
779
+ #define HWY_NATIVE_REDUCE_SUM_4_UI8
780
+ #endif
781
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
782
+ HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
783
+ const Twice<RepartitionToWide<decltype(d)>> dw;
784
+ return static_cast<TFromD<D>>(ReduceSum(dw, PromoteTo(dw, v)));
785
+ }
786
+ #endif // HWY_NATIVE_REDUCE_SUM_4_UI8
787
+
788
+ // RVV/SVE have target-specific implementations of the N=4 I8/U8
789
+ // ReduceMin/ReduceMax operations
790
+ #if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
791
+ #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
792
+ #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
793
+ #else
794
+ #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
795
+ #endif
796
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
797
+ HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
798
+ const Twice<RepartitionToWide<decltype(d)>> dw;
799
+ return static_cast<TFromD<D>>(ReduceMin(dw, PromoteTo(dw, v)));
800
+ }
801
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
802
+ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
803
+ const Twice<RepartitionToWide<decltype(d)>> dw;
804
+ return static_cast<TFromD<D>>(ReduceMax(dw, PromoteTo(dw, v)));
805
+ }
806
+ #endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8
807
+
808
+ // ------------------------------ IsInf, IsFinite
809
+
810
+ // AVX3 has target-specific implementations of these.
811
+ #if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
812
+ #ifdef HWY_NATIVE_ISINF
813
+ #undef HWY_NATIVE_ISINF
814
+ #else
815
+ #define HWY_NATIVE_ISINF
816
+ #endif
817
+
818
+ template <class V, class D = DFromV<V>>
819
+ HWY_API MFromD<D> IsInf(const V v) {
820
+ using T = TFromD<D>;
821
+ const D d;
822
+ const RebindToUnsigned<decltype(d)> du;
823
+ const VFromD<decltype(du)> vu = BitCast(du, v);
824
+ // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
825
+ return RebindMask(
826
+ d,
827
+ Eq(Add(vu, vu),
828
+ Set(du, static_cast<MakeUnsigned<T>>(hwy::MaxExponentTimes2<T>()))));
829
+ }
830
+
831
+ // Returns whether normal/subnormal/zero.
832
+ template <class V, class D = DFromV<V>>
833
+ HWY_API MFromD<D> IsFinite(const V v) {
834
+ using T = TFromD<D>;
835
+ const D d;
836
+ const RebindToUnsigned<decltype(d)> du;
837
+ const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
838
+ const VFromD<decltype(du)> vu = BitCast(du, v);
839
+ // 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code
840
+ // for AVX2 if we instead add vu + vu.
841
+ #if HWY_COMPILER_MSVC
842
+ const VFromD<decltype(du)> shl = ShiftLeft<1>(vu);
843
+ #else
844
+ const VFromD<decltype(du)> shl = Add(vu, vu);
845
+ #endif
846
+
847
+ // Then shift right so we can compare with the max exponent (cannot compare
848
+ // with MaxExponentTimes2 directly because it is negative and non-negative
849
+ // floats would be greater).
850
+ const VFromD<decltype(di)> exp =
851
+ BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl));
852
+ return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
853
+ }
854
+
855
+ #endif // HWY_NATIVE_ISINF
856
+
857
+ // ------------------------------ LoadInterleaved2
215
858
 
216
859
  #if HWY_IDE || \
217
860
  (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
@@ -221,8 +864,6 @@ HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
221
864
  #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
222
865
  #endif
223
866
 
224
- // ------------------------------ LoadInterleaved2
225
-
226
867
  template <class D, HWY_IF_LANES_GT_D(D, 1)>
227
868
  HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
228
869
  VFromD<D>& v0, VFromD<D>& v1) {
@@ -277,6 +918,7 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
277
918
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
278
919
  const RebindToUnsigned<decltype(d)> du;
279
920
  using V = VFromD<D>;
921
+ using VU = VFromD<decltype(du)>;
280
922
  // Compact notation so these fit on one line: 12 := v1[2].
281
923
  V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
282
924
  V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
@@ -284,33 +926,33 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
284
926
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
285
927
  // Compress all lanes belonging to v0 into consecutive lanes.
286
928
  constexpr uint8_t Z = 0x80;
287
- alignas(16) static constexpr uint8_t kIdx_v0A[16] = {
288
- 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
289
- alignas(16) static constexpr uint8_t kIdx_v0B[16] = {
290
- Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z};
291
- alignas(16) static constexpr uint8_t kIdx_v0C[16] = {
292
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13};
293
- alignas(16) static constexpr uint8_t kIdx_v1A[16] = {
294
- 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
295
- alignas(16) static constexpr uint8_t kIdx_v1B[16] = {
296
- Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z};
297
- alignas(16) static constexpr uint8_t kIdx_v1C[16] = {
298
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14};
299
- alignas(16) static constexpr uint8_t kIdx_v2A[16] = {
300
- 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
301
- alignas(16) static constexpr uint8_t kIdx_v2B[16] = {
302
- Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z};
303
- alignas(16) static constexpr uint8_t kIdx_v2C[16] = {
304
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15};
305
- const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
306
- const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
307
- const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
308
- const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
309
- const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
310
- const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
311
- const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
312
- const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
313
- const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
929
+ const VU idx_v0A =
930
+ Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
931
+ const VU idx_v0B =
932
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
933
+ const VU idx_v0C =
934
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
935
+ const VU idx_v1A =
936
+ Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
937
+ const VU idx_v1B =
938
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
939
+ const VU idx_v1C =
940
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
941
+ const VU idx_v2A =
942
+ Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
943
+ const VU idx_v2B =
944
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
945
+ const VU idx_v2C =
946
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
947
+ const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
948
+ const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
949
+ const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
950
+ const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
951
+ const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
952
+ const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
953
+ const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
954
+ const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
955
+ const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
314
956
  v0 = Xor3(v0L, v0M, v0U);
315
957
  v1 = Xor3(v1L, v1M, v1U);
316
958
  v2 = Xor3(v2L, v2M, v2U);
@@ -322,30 +964,40 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
322
964
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
323
965
  const RebindToUnsigned<decltype(d)> du;
324
966
  using V = VFromD<D>;
967
+ using VU = VFromD<decltype(du)>;
325
968
  V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
326
969
  V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
327
970
  V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
328
971
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
329
972
  // Compress all lanes belonging to v0 into consecutive lanes.
330
973
  constexpr uint8_t Z = 0x80;
331
- alignas(16) static constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
332
- alignas(16) static constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
333
- alignas(16) static constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
334
- alignas(16) static constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
335
- alignas(16) static constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
336
- alignas(16) static constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
337
- alignas(16) static constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
338
- alignas(16) static constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
339
- alignas(16) static constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
340
- const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
341
- const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
342
- const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
343
- const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
344
- const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
345
- const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
346
- const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
347
- const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
348
- const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
974
+ const VU idx_v0A =
975
+ Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
976
+ const VU idx_v0B =
977
+ Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
978
+ const VU idx_v0C =
979
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0);
980
+ const VU idx_v1A =
981
+ Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
982
+ const VU idx_v1B =
983
+ Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
984
+ const VU idx_v1C =
985
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0);
986
+ const VU idx_v2A =
987
+ Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
988
+ const VU idx_v2B =
989
+ Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
990
+ const VU idx_v2C =
991
+ Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0);
992
+ const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
993
+ const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
994
+ const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
995
+ const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
996
+ const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
997
+ const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
998
+ const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
999
+ const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
1000
+ const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
349
1001
  v0 = Xor3(v0L, v0M, v0U);
350
1002
  v1 = Xor3(v1L, v1M, v1U);
351
1003
  v2 = Xor3(v2L, v2M, v2U);
@@ -358,6 +1010,7 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
358
1010
  const RebindToUnsigned<decltype(d)> du;
359
1011
  const Repartition<uint8_t, decltype(du)> du8;
360
1012
  using V = VFromD<D>;
1013
+ using VU8 = VFromD<decltype(du8)>;
361
1014
  V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
362
1015
  V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
363
1016
  V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
@@ -365,33 +1018,33 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
365
1018
  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
366
1019
  // but each element of the array contains a byte index for a byte of a lane.
367
1020
  constexpr uint8_t Z = 0x80;
368
- alignas(16) static constexpr uint8_t kIdx_v0A[16] = {
369
- 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
370
- alignas(16) static constexpr uint8_t kIdx_v0B[16] = {
371
- Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z};
372
- alignas(16) static constexpr uint8_t kIdx_v0C[16] = {
373
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B};
374
- alignas(16) static constexpr uint8_t kIdx_v1A[16] = {
375
- 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
376
- alignas(16) static constexpr uint8_t kIdx_v1B[16] = {
377
- Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z};
378
- alignas(16) static constexpr uint8_t kIdx_v1C[16] = {
379
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D};
380
- alignas(16) static constexpr uint8_t kIdx_v2A[16] = {
381
- 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
382
- alignas(16) static constexpr uint8_t kIdx_v2B[16] = {
383
- Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z};
384
- alignas(16) static constexpr uint8_t kIdx_v2C[16] = {
385
- Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F};
386
- const V v0L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v0A)));
387
- const V v0M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v0B)));
388
- const V v0U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v0C)));
389
- const V v1L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v1A)));
390
- const V v1M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v1B)));
391
- const V v1U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v1C)));
392
- const V v2L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v2A)));
393
- const V v2M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v2B)));
394
- const V v2U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v2C)));
1021
+ const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C,
1022
+ 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1023
+ const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03,
1024
+ 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
1025
+ const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1026
+ Z, 0x04, 0x05, 0x0A, 0x0B);
1027
+ const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E,
1028
+ 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
1029
+ const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05,
1030
+ 0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
1031
+ const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1032
+ 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
1033
+ const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z,
1034
+ Z, Z, Z, Z, Z, Z, Z, Z, Z);
1035
+ const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06,
1036
+ 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
1037
+ const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
1038
+ 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
1039
+ const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A));
1040
+ const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B));
1041
+ const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C));
1042
+ const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A));
1043
+ const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B));
1044
+ const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C));
1045
+ const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A));
1046
+ const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B));
1047
+ const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C));
395
1048
  v0 = Xor3(v0L, v0M, v0U);
396
1049
  v1 = Xor3(v1L, v1M, v1U);
397
1050
  v2 = Xor3(v2L, v2M, v2U);
@@ -644,16 +1297,16 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
644
1297
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
645
1298
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
646
1299
  // to enable blending by ORing together.
647
- alignas(16) static constexpr uint8_t tbl_v0[16] = {
648
- 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
649
- 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
650
- alignas(16) static constexpr uint8_t tbl_v1[16] = {
651
- 0x80, 0, 0x80, 0x80, 1, 0x80, //
652
- 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
1300
+ const VFromD<decltype(du)> shuf_A0 =
1301
+ Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
1302
+ 0x80, 0x80, 4, 0x80, 0x80, 5);
1303
+ // Cannot reuse shuf_A0 because it contains 5.
1304
+ const VFromD<decltype(du)> shuf_A1 =
1305
+ Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
1306
+ 3, 0x80, 0x80, 4, 0x80, 0x80);
653
1307
  // The interleaved vectors will be named A, B, C; temporaries with suffix
654
1308
  // 0..2 indicate which input vector's lanes they hold.
655
- const auto shuf_A0 = LoadDup128(du, tbl_v0);
656
- const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5)
1309
+ // cannot reuse shuf_A0 (has 5)
657
1310
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
658
1311
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
659
1312
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
@@ -693,19 +1346,16 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
693
1346
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
694
1347
  // filled from other vectors are 0 for blending. Note that these are byte
695
1348
  // indices for 16-bit lanes.
696
- alignas(16) static constexpr uint8_t tbl_v1[16] = {
697
- 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
698
- 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
699
- alignas(16) static constexpr uint8_t tbl_v2[16] = {
700
- 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
701
- 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
1349
+ const VFromD<decltype(du8)> shuf_A1 =
1350
+ Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
1351
+ 0x80, 0x80, 0x80, 0x80, 4, 5);
1352
+ const VFromD<decltype(du8)> shuf_A2 =
1353
+ Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
1354
+ 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);
702
1355
 
703
1356
  // The interleaved vectors will be named A, B, C; temporaries with suffix
704
1357
  // 0..2 indicate which input vector's lanes they hold.
705
- const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0.
706
- // .2..1..0
707
1358
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
708
- const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0..
709
1359
 
710
1360
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
711
1361
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
@@ -1104,19 +1754,22 @@ HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
1104
1754
 
1105
1755
  } // namespace detail
1106
1756
 
1107
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1)>
1757
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
1758
+ HWY_IF_NOT_BF16_D(D)>
1108
1759
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1109
1760
  size_t num_lanes) {
1110
1761
  return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
1111
1762
  }
1112
1763
 
1113
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1)>
1764
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
1765
+ HWY_IF_NOT_BF16_D(D)>
1114
1766
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1115
1767
  size_t num_lanes) {
1116
1768
  return (num_lanes > 0) ? LoadU(d, p) : no;
1117
1769
  }
1118
1770
 
1119
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
1771
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
1772
+ HWY_IF_NOT_BF16_D(D)>
1120
1773
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1121
1774
  size_t num_lanes) {
1122
1775
  const FixedTag<TFromD<D>, 1> d1;
@@ -1126,7 +1779,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1126
1779
  return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
1127
1780
  }
1128
1781
 
1129
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
1782
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
1783
+ HWY_IF_NOT_BF16_D(D)>
1130
1784
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1131
1785
  size_t num_lanes) {
1132
1786
  const FixedTag<TFromD<D>, 1> d1;
@@ -1136,7 +1790,8 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1136
1790
  return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
1137
1791
  }
1138
1792
 
1139
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
1793
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
1794
+ HWY_IF_NOT_BF16_D(D)>
1140
1795
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1141
1796
  size_t num_lanes) {
1142
1797
  const FixedTag<TFromD<D>, 2> d2;
@@ -1151,7 +1806,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1151
1806
  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
1152
1807
  }
1153
1808
 
1154
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
1809
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
1810
+ HWY_IF_NOT_BF16_D(D)>
1155
1811
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1156
1812
  size_t num_lanes) {
1157
1813
  const FixedTag<TFromD<D>, 2> d2;
@@ -1166,7 +1822,8 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1166
1822
  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
1167
1823
  }
1168
1824
 
1169
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
1825
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
1826
+ HWY_IF_NOT_BF16_D(D)>
1170
1827
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1171
1828
  size_t num_lanes) {
1172
1829
  const FixedTag<TFromD<D>, 4> d4;
@@ -1201,7 +1858,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1201
1858
  }
1202
1859
  }
1203
1860
 
1204
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
1861
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
1862
+ HWY_IF_NOT_BF16_D(D)>
1205
1863
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1206
1864
  size_t num_lanes) {
1207
1865
  const FixedTag<TFromD<D>, 4> d4;
@@ -1238,7 +1896,8 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1238
1896
  }
1239
1897
  }
1240
1898
 
1241
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16)>
1899
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
1900
+ HWY_IF_NOT_BF16_D(D)>
1242
1901
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1243
1902
  size_t num_lanes) {
1244
1903
  const FixedTag<TFromD<D>, 8> d8;
@@ -1283,7 +1942,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1283
1942
  }
1284
1943
  }
1285
1944
 
1286
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16)>
1945
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
1946
+ HWY_IF_NOT_BF16_D(D)>
1287
1947
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1288
1948
  size_t num_lanes) {
1289
1949
  const FixedTag<TFromD<D>, 8> d8;
@@ -1338,7 +1998,7 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1338
1998
 
1339
1999
  #if HWY_MAX_BYTES >= 32
1340
2000
 
1341
- template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
2001
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
1342
2002
  HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1343
2003
  size_t num_lanes) {
1344
2004
  if (num_lanes >= Lanes(d)) return LoadU(d, p);
@@ -1354,7 +2014,7 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1354
2014
  }
1355
2015
  }
1356
2016
 
1357
- template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
2017
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
1358
2018
  HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1359
2019
  size_t num_lanes) {
1360
2020
  if (num_lanes >= Lanes(d)) return LoadU(d, p);
@@ -1374,7 +2034,23 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
1374
2034
  }
1375
2035
 
1376
2036
  #endif // HWY_MAX_BYTES >= 32
1377
- #else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
2037
+
2038
+ template <class D, HWY_IF_BF16_D(D)>
2039
+ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
2040
+ size_t num_lanes) {
2041
+ const RebindToUnsigned<D> du;
2042
+ return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
2043
+ }
2044
+
2045
+ template <class D, HWY_IF_BF16_D(D)>
2046
+ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
2047
+ size_t num_lanes) {
2048
+ const RebindToUnsigned<D> du;
2049
+ return BitCast(
2050
+ d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes));
2051
+ }
2052
+
2053
+ #else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
1378
2054
 
1379
2055
  // For SVE and non-sanitizer AVX-512; RVV has its own specialization.
1380
2056
  template <class D>
@@ -1549,9 +2225,7 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1549
2225
 
1550
2226
  BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p);
1551
2227
 
1552
- #if HWY_MEM_OPS_MIGHT_FAULT
1553
2228
  detail::MaybeUnpoison(p, clamped_max_lanes_to_store);
1554
- #endif
1555
2229
  }
1556
2230
  #endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
1557
2231
 
@@ -1649,6 +2323,7 @@ HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base,
1649
2323
  HWY_ALIGN T lanes[MaxLanes(d)];
1650
2324
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
1651
2325
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2326
+ HWY_DASSERT(offset_lanes[i] >= 0);
1652
2327
  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1653
2328
  }
1654
2329
  return Load(d, lanes);
@@ -1666,6 +2341,7 @@ HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
1666
2341
 
1667
2342
  HWY_ALIGN T lanes[MaxLanes(d)];
1668
2343
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2344
+ HWY_DASSERT(index_lanes[i] >= 0);
1669
2345
  lanes[i] = base[index_lanes[i]];
1670
2346
  }
1671
2347
  return Load(d, lanes);
@@ -1687,11 +2363,37 @@ HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
1687
2363
 
1688
2364
  HWY_ALIGN T lanes[MaxLanes(d)];
1689
2365
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2366
+ HWY_DASSERT(index_lanes[i] >= 0);
1690
2367
  lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0};
1691
2368
  }
1692
2369
  return Load(d, lanes);
1693
2370
  }
1694
2371
 
2372
+ template <class D, typename T = TFromD<D>>
2373
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
2374
+ const T* HWY_RESTRICT base,
2375
+ VFromD<RebindToSigned<D>> index) {
2376
+ const RebindToSigned<D> di;
2377
+ using TI = TFromD<decltype(di)>;
2378
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2379
+
2380
+ HWY_ALIGN TI index_lanes[MaxLanes(di)];
2381
+ Store(index, di, index_lanes);
2382
+
2383
+ HWY_ALIGN TI mask_lanes[MaxLanes(di)];
2384
+ Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);
2385
+
2386
+ HWY_ALIGN T no_lanes[MaxLanes(d)];
2387
+ Store(no, d, no_lanes);
2388
+
2389
+ HWY_ALIGN T lanes[MaxLanes(d)];
2390
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2391
+ HWY_DASSERT(index_lanes[i] >= 0);
2392
+ lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i];
2393
+ }
2394
+ return Load(d, lanes);
2395
+ }
2396
+
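// Editorial sketch in plain C++ (not from the package diff above): a scalar
// reference for the MaskedGatherIndexOr added here. The only difference from
// MaskedGatherIndex is that masked-off lanes receive the corresponding lane of
// `no` instead of zero; indices are assumed non-negative, as asserted above.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> MaskedGatherIndexOrScalar(const std::vector<float>& no,
                                             const std::vector<bool>& mask,
                                             const float* base,
                                             const std::vector<int32_t>& index) {
  std::vector<float> out(index.size());
  for (size_t i = 0; i < index.size(); ++i) {
    out[i] = mask[i] ? base[index[i]] : no[i];
  }
  return out;
}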
1695
2397
  #endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
1696
2398
 
1697
2399
  // ------------------------------ ScatterN/GatherN
@@ -1733,10 +2435,14 @@ HWY_API V AbsDiff(V a, V b) {
1733
2435
  #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
1734
2436
  #endif
1735
2437
 
1736
- template <class V, HWY_IF_U8_D(DFromV<V>),
2438
+ template <class V, HWY_IF_UI8_D(DFromV<V>),
1737
2439
  HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
1738
- HWY_API Vec<Repartition<uint64_t, DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
1739
- return SumsOf8(AbsDiff(a, b));
2440
+ HWY_API Vec<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
2441
+ const DFromV<decltype(a)> d;
2442
+ const RebindToUnsigned<decltype(d)> du;
2443
+ const RepartitionToWideX3<decltype(d)> dw;
2444
+
2445
+ return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b))));
1740
2446
  }
1741
2447
 
1742
2448
  #endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF
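// Editorial sketch in plain C++ (not from the package diff above): a scalar
// reference for SumsOf8AbsDiff with u8 inputs. Each u64 output lane holds the
// sum of |a[j] - b[j]| over 8 adjacent byte lanes; the change above merely
// widens the overload to also accept i8 vectors by bitcasting the absolute
// difference to u8 before SumsOf8.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t> SumsOf8AbsDiffScalar(const std::vector<uint8_t>& a,
                                           const std::vector<uint8_t>& b) {
  std::vector<uint64_t> out(a.size() / 8, 0);
  for (size_t i = 0; i < out.size(); ++i) {
    for (size_t j = 0; j < 8; ++j) {
      const int diff =
          static_cast<int>(a[8 * i + j]) - static_cast<int>(b[8 * i + j]);
      out[i] += static_cast<uint64_t>(diff < 0 ? -diff : diff);
    }
  }
  return out;
}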
@@ -1923,6 +2629,248 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
1923
2629
  #endif // HWY_TARGET != HWY_SCALAR
1924
2630
  #endif // HWY_NATIVE_PROMOTE_UPPER_TO
1925
2631
 
2632
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
2633
+
2634
+ #if HWY_TARGET != HWY_SCALAR
2635
+ namespace detail {
2636
+
2637
+ // Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as
2638
+ // there are target-specific specializations for some of the
2639
+ // detail::PromoteEvenTo and detail::PromoteOddTo cases on
2640
+ // SVE/PPC/SSE2/SSSE3/SSE4/AVX2.
2641
+
2642
+ // All targets except HWY_SCALAR use the implementations of
2643
+ // detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at
2644
+ // least some of the PromoteEvenTo and PromoteOddTo cases.
2645
+
2646
+ // Signed to signed PromoteEvenTo/PromoteOddTo
2647
+ template <size_t kToLaneSize, class D, class V>
2648
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2649
+ hwy::SignedTag /*to_type_tag*/,
2650
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2651
+ hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
2652
+ #if HWY_IS_LITTLE_ENDIAN
2653
+ // On little-endian targets, need to shift each lane of the bitcasted vector
2654
+ // left by kToLaneSize * 4 bits to get the bits of the even source lanes into
2655
+ // the upper kToLaneSize * 4 bits of even_in_hi.
2656
+ const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
2657
+ #else
2658
+ // On big-endian targets, the bits of the even source lanes are already in
2659
+ // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2660
+ const auto even_in_hi = BitCast(d_to, v);
2661
+ #endif
2662
+
2663
+ // Right-shift even_in_hi by kToLaneSize * 4 bits
2664
+ return ShiftRight<kToLaneSize * 4>(even_in_hi);
2665
+ }
2666
+
2667
+ template <size_t kToLaneSize, class D, class V>
2668
+ HWY_INLINE VFromD<D> PromoteOddTo(
2669
+ hwy::SignedTag /*to_type_tag*/,
2670
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2671
+ hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
2672
+ #if HWY_IS_LITTLE_ENDIAN
2673
+ // On little-endian targets, the bits of the odd source lanes are already in
2674
+ // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2675
+ const auto odd_in_hi = BitCast(d_to, v);
2676
+ #else
2677
+ // On big-endian targets, need to shift each lane of the bitcasted vector left
2678
+ // by kToLaneSize * 4 bits to get the bits of the odd source lanes into the
2679
+ // upper kToLaneSize * 4 bits of odd_in_hi.
2680
+ const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
2681
+ #endif
2682
+
2683
+ // Right-shift odd_in_hi by kToLaneSize * 4 bits
2684
+ return ShiftRight<kToLaneSize * 4>(odd_in_hi);
2685
+ }
2686
+
2687
+ // Unsigned to unsigned PromoteEvenTo/PromoteOddTo
2688
+ template <size_t kToLaneSize, class D, class V>
2689
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2690
+ hwy::UnsignedTag /*to_type_tag*/,
2691
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2692
+ hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2693
+ #if HWY_IS_LITTLE_ENDIAN
2694
+ // On little-endian targets, the bits of the even source lanes are already
2695
+ // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2696
+
2697
+ // Simply need to zero out the upper bits of each lane of the bitcasted
2698
+ // vector.
2699
+ return And(BitCast(d_to, v),
2700
+ Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
2701
+ #else
2702
+ // On big-endian targets, need to shift each lane of the bitcasted vector
2703
+ // right by kToLaneSize * 4 bits to get the bits of the even source lanes into
2704
+ // the lower kToLaneSize * 4 bits of the result.
2705
+
2706
+ // The right shift below will zero out the upper kToLaneSize * 4 bits of the
2707
+ // result.
2708
+ return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
2709
+ #endif
2710
+ }
2711
+
2712
+ template <size_t kToLaneSize, class D, class V>
2713
+ HWY_INLINE VFromD<D> PromoteOddTo(
2714
+ hwy::UnsignedTag /*to_type_tag*/,
2715
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2716
+ hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2717
+ #if HWY_IS_LITTLE_ENDIAN
2718
+ // On little-endian targets, need to shift each lane of the bitcasted vector
2719
+ // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
2720
+ // the lower kToLaneSize * 4 bits of the result.
2721
+
2722
+ // The right shift below will zero out the upper kToLaneSize * 4 bits of the
2723
+ // result.
2724
+ return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
2725
+ #else
2726
+ // On big-endian targets, the bits of the odd source lanes are already
2727
+ // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2728
+
2729
+ // Simply need to zero out the upper bits of each lane of the bitcasted
2730
+ // vector.
2731
+ return And(BitCast(d_to, v),
2732
+ Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
2733
+ #endif
2734
+ }
2735
+
2736
+ // Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
2737
+ // followed by BitCast to signed
2738
+ template <size_t kToLaneSize, class D, class V>
2739
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2740
+ hwy::SignedTag /*to_type_tag*/,
2741
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2742
+ hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2743
+ const RebindToUnsigned<decltype(d_to)> du_to;
2744
+ return BitCast(d_to,
2745
+ PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
2746
+ hwy::UnsignedTag(), du_to, v));
2747
+ }
2748
+
2749
+ template <size_t kToLaneSize, class D, class V>
2750
+ HWY_INLINE VFromD<D> PromoteOddTo(
2751
+ hwy::SignedTag /*to_type_tag*/,
2752
+ hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2753
+ hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2754
+ const RebindToUnsigned<decltype(d_to)> du_to;
2755
+ return BitCast(d_to,
2756
+ PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
2757
+ hwy::UnsignedTag(), du_to, v));
2758
+ }
2759
+
2760
+ // BF16->F32 PromoteEvenTo
2761
+
2762
+ // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
2763
+ // instead of hwy::FloatTag on targets that use scalable vectors.
2764
+
2765
+ // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
2766
+ // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
2767
+
2768
+ // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
2769
+ // to be a bfloat16_t vector.
2770
+ template <class FromTypeTag, class DF32, class VBF16,
2771
+ class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
2772
+ hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
2773
+ HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
2774
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
2775
+ FromTypeTag /*from_type_tag*/, DF32 d_to,
2776
+ VBF16 v) {
2777
+ const RebindToUnsigned<decltype(d_to)> du_to;
2778
+ #if HWY_IS_LITTLE_ENDIAN
2779
+ // On little-endian platforms, need to shift left each lane of the bitcasted
2780
+ // vector by 16 bits.
2781
+ return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
2782
+ #else
2783
+ // On big-endian platforms, the even lanes of the source vector are already
2784
+ // in the upper 16 bits of the lanes of the bitcasted vector.
2785
+
2786
+ // Need to simply zero out the lower 16 bits of each lane of the bitcasted
2787
+ // vector.
2788
+ return BitCast(d_to,
2789
+ And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
2790
+ #endif
2791
+ }
2792
+
2793
+ // BF16->F32 PromoteOddTo
2794
+
2795
+ // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
2796
+ // instead of hwy::FloatTag on targets that use scalable vectors.
2797
+
2798
+ // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
2799
+ // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
2800
+
2801
+ // The BF16->F32 PromoteOddTo overload is only enabled if VBF16 is considered
2802
+ // to be a bfloat16_t vector.
2803
+ template <class FromTypeTag, class DF32, class VBF16,
2804
+ class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
2805
+ hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
2806
+ HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
2807
+ hwy::SizeTag<4> /*to_lane_size_tag*/,
2808
+ FromTypeTag /*from_type_tag*/, DF32 d_to,
2809
+ VBF16 v) {
2810
+ const RebindToUnsigned<decltype(d_to)> du_to;
2811
+ #if HWY_IS_LITTLE_ENDIAN
2812
+ // On little-endian platforms, the odd lanes of the source vector are already
2813
+ // in the upper 16 bits of the lanes of the bitcasted vector.
2814
+
2815
+ // Need to simply zero out the lower 16 bits of each lane of the bitcasted
2816
+ // vector.
2817
+ return BitCast(d_to,
2818
+ And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
2819
+ #else
2820
+ // On big-endian platforms, need to shift left each lane of the bitcasted
2821
+ // vector by 16 bits.
2822
+ return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
2823
+ #endif
2824
+ }
2825
+
2826
+ // Default PromoteEvenTo/PromoteOddTo implementations
2827
+ template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
2828
+ class V, HWY_IF_LANES_D(D, 1)>
2829
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2830
+ ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2831
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
2832
+ return PromoteLowerTo(d_to, v);
2833
+ }
2834
+
2835
+ template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
2836
+ class V, HWY_IF_LANES_GT_D(D, 1)>
2837
+ HWY_INLINE VFromD<D> PromoteEvenTo(
2838
+ ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2839
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
2840
+ const DFromV<decltype(v)> d;
2841
+ return PromoteLowerTo(d_to, ConcatEven(d, v, v));
2842
+ }
2843
+
2844
+ template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
2845
+ class V>
2846
+ HWY_INLINE VFromD<D> PromoteOddTo(
2847
+ ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2848
+ FromTypeTag /*from_type_tag*/, D d_to, V v) {
2849
+ const DFromV<decltype(v)> d;
2850
+ return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
2851
+ }
2852
+
2853
+ } // namespace detail
2854
+
2855
+ template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
2856
+ class V2 = VFromD<Repartition<TFromV<V>, D>>,
2857
+ HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
2858
+ HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
2859
+ return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
2860
+ hwy::SizeTag<sizeof(TFromD<D>)>(),
2861
+ hwy::TypeTag<TFromV<V>>(), d, v);
2862
+ }
2863
+
2864
+ template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
2865
+ class V2 = VFromD<Repartition<TFromV<V>, D>>,
2866
+ HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
2867
+ HWY_API VFromD<D> PromoteOddTo(D d, V v) {
2868
+ return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
2869
+ hwy::SizeTag<sizeof(TFromD<D>)>(),
2870
+ hwy::TypeTag<TFromV<V>>(), d, v);
2871
+ }
2872
+ #endif // HWY_TARGET != HWY_SCALAR
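// Editorial sketch in plain C++ (not from the package diff above): the per-pair
// bit manipulation behind the SignedTag PromoteEvenTo/PromoteOddTo cases, for
// one 32-bit chunk holding two adjacent i16 lanes on a little-endian target.
// The even (lower-indexed) lane occupies bits [0, 16), so a left shift by 16
// followed by an arithmetic right shift sign-extends it to i32; the odd lane
// only needs the arithmetic right shift.
#include <cstdint>

int32_t PromoteEvenLaneI16(uint32_t packed_pair) {
  // Assumes the usual two's-complement wraparound on the uint->int conversion.
  return static_cast<int32_t>(packed_pair << 16) >> 16;
}

int32_t PromoteOddLaneI16(uint32_t packed_pair) {
  return static_cast<int32_t>(packed_pair) >> 16;
}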
2873
+
1926
2874
  // ------------------------------ float16_t <-> float
1927
2875
 
1928
2876
  #if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
@@ -1956,41 +2904,237 @@ HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
1956
2904
 
1957
2905
  template <class D, HWY_IF_F16_D(D)>
1958
2906
  HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
1959
- const RebindToUnsigned<decltype(df16)> du16;
1960
- const Rebind<uint32_t, decltype(df16)> du32;
1961
- const RebindToSigned<decltype(du32)> di32;
1962
- using VU32 = VFromD<decltype(du32)>;
1963
- using VI32 = VFromD<decltype(di32)>;
1964
-
1965
- const VU32 bits32 = BitCast(du32, v);
1966
- const VU32 sign = ShiftRight<31>(bits32);
1967
- const VU32 biased_exp32 = And(ShiftRight<23>(bits32), Set(du32, 0xFF));
1968
- const VU32 mantissa32 = And(bits32, Set(du32, 0x7FFFFF));
1969
-
1970
- const VI32 k15 = Set(di32, 15);
1971
- const VI32 exp = Min(Sub(BitCast(di32, biased_exp32), Set(di32, 127)), k15);
1972
- const MFromD<decltype(di32)> is_tiny = Lt(exp, Set(di32, -24));
1973
-
1974
- const MFromD<decltype(di32)> is_subnormal = Lt(exp, Set(di32, -14));
1975
- const VU32 biased_exp16 =
1976
- BitCast(du32, IfThenZeroElse(is_subnormal, Add(exp, k15)));
1977
- const VU32 sub_exp = BitCast(du32, Sub(Set(di32, -14), exp)); // [1, 11)
1978
- // Clamp shift counts to prevent warnings in emu_128 Shr.
1979
- const VU32 k31 = Set(du32, 31);
1980
- const VU32 shift_m = Min(Add(Set(du32, 13), sub_exp), k31);
1981
- const VU32 shift_1 = Min(Sub(Set(du32, 10), sub_exp), k31);
1982
- const VU32 sub_m = Add(Shl(Set(du32, 1), shift_1), Shr(mantissa32, shift_m));
1983
- const VU32 mantissa16 = IfThenElse(RebindMask(du32, is_subnormal), sub_m,
1984
- ShiftRight<13>(mantissa32)); // <1024
1985
-
1986
- const VU32 sign16 = ShiftLeft<15>(sign);
1987
- const VU32 normal16 = Or3(sign16, ShiftLeft<10>(biased_exp16), mantissa16);
1988
- const VI32 bits16 = IfThenZeroElse(is_tiny, BitCast(di32, normal16));
1989
- return BitCast(df16, DemoteTo(du16, bits16));
2907
+ const RebindToSigned<decltype(df16)> di16;
2908
+ const Rebind<int32_t, decltype(df16)> di32;
2909
+ const RebindToFloat<decltype(di32)> df32;
2910
+ const RebindToUnsigned<decltype(df32)> du32;
2911
+
2912
+ // There are 23 fractional bits (plus the implied 1 bit) in the mantissa of
2913
+ // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the
2914
+ // mantissa of a F16
2915
+
2916
+ // We want the unbiased exponent of round_incr[i] to be at least (-14) + 13 as
2917
+ // 2^(-14) is the smallest positive normal F16 value and as we want 13
2918
+ // mantissa bits (including the implicit 1 bit) to the left of the
2919
+ // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13
2920
+
2921
+ // The biased exponent of round_incr[i] needs to be at least 126 as
2922
+ // (-14) + 13 + 127 is equal to 126
2923
+
2924
+ // We also want the biased exponent of round_incr[i] to be less than or equal
2925
+ // to 255 (which is equal to MaxExponentField<float>())
2926
+
2927
+ // The biased F32 exponent of round_incr[i] is equal to
2928
+ // HWY_MAX(HWY_MIN(((BitCastScalar<uint32_t>(v[i]) >> 23) & 255) + 13, 255), 126)
2929
+
2930
+ // hi9_bits[i] is equal to the upper 9 bits of v[i]
2931
+ const auto hi9_bits = ShiftRight<23>(BitCast(du32, v));
2932
+
2933
+ const auto k13 = Set(du32, uint32_t{13u});
2934
+
2935
+ // Minimum biased F32 exponent of round_incr
2936
+ const auto k126 = Set(du32, uint32_t{126u});
2937
+
2938
+ // round_incr_hi9_bits[i] is equivalent to
2939
+ // (hi9_bits[i] & 0x100) |
2940
+ // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)
2941
+
2942
+ #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
2943
+ const auto k255 = Set(du32, uint32_t{255u});
2944
+ const auto round_incr_hi9_bits = BitwiseIfThenElse(
2945
+ k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits);
2946
+ #else
2947
+ // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits can
2948
+ // be incremented by 13 and clamped to the [13, 255] range without overflowing
2949
+ // into the sign bit of hi9_bits by using U8 SaturatedAdd as there are 8
2950
+ // exponent bits in an F32
2951
+
2952
+ // U8 Max can be used on targets other than SCALAR and EMU128 to clamp
2953
+ // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the sign
2954
+ // bit
2955
+
2956
+ const Repartition<uint8_t, decltype(du32)> du32_as_u8;
2957
+ const auto round_incr_hi9_bits = BitCast(
2958
+ du32,
2959
+ Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)),
2960
+ BitCast(du32_as_u8, k126)));
2961
+ #endif
2962
+
2963
+ // (round_incr_hi9_bits >> 8) is equal to (hi9_bits >> 8), and
2964
+ // (round_incr_hi9_bits & 0xFF) is equal to
2965
+ // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)
2966
+
2967
+ const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits));
2968
+
2969
+ // Add round_incr[i] to v[i] to round the mantissa to the nearest F16 mantissa
2970
+ // and to move the fractional bits of the resulting non-NaN mantissa down to
2971
+ // the lower 10 bits of rounded_val if (v[i] + round_incr[i]) is a non-NaN
2972
+ // value
2973
+ const auto rounded_val = Add(v, round_incr);
2974
+
2975
+ // rounded_val_bits is the bits of rounded_val as a U32
2976
+ const auto rounded_val_bits = BitCast(du32, rounded_val);
2977
+
2978
+ // rounded_val[i] is known to have the same biased exponent as round_incr[i]
2979
+ // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite
2980
+ // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]|
2981
+ // is either a power of 2 that is greater than or equal to 2^-1 or infinity.
2982
+
2983
+ // If rounded_val[i] is a finite F32 value, then
2984
+ // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the
2985
+ // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is
2986
+ // in the range [0, 2].
2987
+
2988
+ // In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and 0x0800,
2989
+ // with (rounded_val_bits[i] & 0x000003FF) being the fractional bits of the
2990
+ // resulting F16 mantissa, if rounded_val[i] is a finite F32 value.
2991
+
2992
+ // (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if
2993
+ // rounded_val[i] is a non-NaN value
2994
+
2995
+ // The biased exponent of rounded_val[i] is guaranteed to be at least 126 as
2996
+ // the biased exponent of round_incr[i] is at least 126 and as both v[i] and
2997
+ // round_incr[i] have the same sign bit
2998
+
2999
+ // The ULP of a F32 value with a biased exponent of 126 is equal to
3000
+ // 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a
3001
+ // F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to
3002
+ // -24)
3003
+
3004
+ // The biased exponent (before subtracting by 126) needs to be clamped to the
3005
+ // [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest
3006
+ // biased exponent of a F16.
3007
+
3008
+ // The biased exponent of the resulting F16 value is equal to
3009
+ // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) +
3010
+ // ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126
3011
+
3012
+ #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
3013
+ auto f16_exp_bits =
3014
+ Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)),
3015
+ And(rounded_val_bits,
3016
+ Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))),
3017
+ Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10)));
3018
+ #else
3019
+ auto f16_exp_bits = ShiftLeft<10>(BitCast(
3020
+ du32,
3021
+ Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits),
3022
+ BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
3023
+ BitCast(du32_as_u8, Set(du32, uint32_t{157})))));
3024
+ #endif
3025
+
3026
+ f16_exp_bits =
3027
+ Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10)));
3028
+
3029
+ const auto f16_unmasked_mant_bits =
3030
+ BitCast(di32, Or(rounded_val, VecFromMask(df32, IsNaN(rounded_val))));
3031
+
3032
+ const auto f16_exp_mant_bits =
3033
+ OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
3034
+ Set(di32, int32_t{0x03FF}));
3035
+
3036
+ // f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17
3037
+ // bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow
3038
+ // efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo
3039
+ // operation
3040
+ const auto f16_bits_as_i32 =
3041
+ OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)),
3042
+ Set(di32, static_cast<int32_t>(0xFFFF8000u)));
3043
+ return BitCast(df16, DemoteTo(di16, f16_bits_as_i32));
1990
3044
  }
1991
3045
 
1992
3046
  #endif // HWY_NATIVE_F16C
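// Editorial sketch in plain C++ (not from the package diff above): the core of
// the rounding-by-addition trick used in the F32->F16 DemoteTo above, for a
// positive input x whose magnitude lies in the normal F16 range. Adding a power
// of two 2^13 times larger than x's binade makes the FPU round x's mantissa to
// 10 fractional bits (round-to-nearest-even), which is exactly the F16 mantissa
// rounding; the vector code then reassembles sign, exponent and these bits.
#include <cstdint>
#include <cstring>

uint32_t F16FractionBitsOfPositiveNormal(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  const uint32_t biased_exp = (bits >> 23) & 0xFF;
  // round_incr = 2^(biased_exp + 13 - 127), i.e. 2^13 times x's binade.
  const uint32_t incr_bits = (biased_exp + 13) << 23;
  float round_incr;
  std::memcpy(&round_incr, &incr_bits, sizeof(round_incr));
  const float rounded = x + round_incr;
  uint32_t rounded_bits;
  std::memcpy(&rounded_bits, &rounded, sizeof(rounded_bits));
  // The low 10 bits now hold the fractional F16 mantissa; bits 10 and up hold
  // the integer part (implied 1 plus any rounding carry), which the vector
  // code folds into the exponent.
  return rounded_bits & 0x3FF;
}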
1993
3047
 
3048
+ // ------------------------------ F64->F16 DemoteTo
3049
+ #if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
3050
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
3051
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
3052
+ #else
3053
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
3054
+ #endif
3055
+
3056
+ #if HWY_HAVE_FLOAT64
3057
+ template <class D, HWY_IF_F16_D(D)>
3058
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
3059
+ const Rebind<double, D> df64;
3060
+ const Rebind<uint64_t, D> du64;
3061
+ const Rebind<float, D> df32;
3062
+
3063
+ // The mantissa bits of v[i] are first rounded using round-to-odd rounding to
3064
+ // the nearest F64 value that has the lower 29 bits zeroed out to ensure that
3065
+ // the result is correctly rounded to a F16.
3066
+
3067
+ const auto vf64_rounded = OrAnd(
3068
+ And(v,
3069
+ BitCast(df64, Set(du64, static_cast<uint64_t>(0xFFFFFFFFE0000000u)))),
3070
+ BitCast(df64, Add(BitCast(du64, v),
3071
+ Set(du64, static_cast<uint64_t>(0x000000001FFFFFFFu)))),
3072
+ BitCast(df64, Set(du64, static_cast<uint64_t>(0x0000000020000000ULL))));
3073
+
3074
+ return DemoteTo(df16, DemoteTo(df32, vf64_rounded));
3075
+ }
3076
+ #endif // HWY_HAVE_FLOAT64
3077
+
3078
+ #endif // HWY_NATIVE_DEMOTE_F64_TO_F16
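// Editorial sketch in plain C++ (not from the package diff above): a scalar
// version of the round-to-odd step used in the F64->F16 DemoteTo. The low 29
// mantissa bits are cleared and, if any of them were set, bit 29 is forced to
// 1. This preserves enough information that the following F64->F32->F16 double
// rounding still produces the correctly rounded F16 result.
#include <cstdint>
#include <cstring>

double RoundMantissaToOdd(double v) {
  uint64_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  const uint64_t kept = bits & 0xFFFFFFFFE0000000ULL;
  const uint64_t sticky =
      (bits & 0x000000001FFFFFFFULL) ? 0x0000000020000000ULL : 0u;
  const uint64_t rounded = kept | sticky;
  double result;
  std::memcpy(&result, &rounded, sizeof(result));
  return result;
}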
3079
+
3080
+ // ------------------------------ F16->F64 PromoteTo
3081
+ #if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
3082
+ #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
3083
+ #undef HWY_NATIVE_PROMOTE_F16_TO_F64
3084
+ #else
3085
+ #define HWY_NATIVE_PROMOTE_F16_TO_F64
3086
+ #endif
3087
+
3088
+ #if HWY_HAVE_FLOAT64
3089
+ template <class D, HWY_IF_F64_D(D)>
3090
+ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
3091
+ return PromoteTo(df64, PromoteTo(Rebind<float, D>(), v));
3092
+ }
3093
+ #endif // HWY_HAVE_FLOAT64
3094
+
3095
+ #endif // HWY_NATIVE_PROMOTE_F16_TO_F64
3096
+
3097
+ // ------------------------------ SumsOf2
3098
+
3099
+ #if HWY_TARGET != HWY_SCALAR
3100
+ namespace detail {
3101
+
3102
+ template <class TypeTag, size_t kLaneSize, class V>
3103
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
3104
+ TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
3105
+ const DFromV<decltype(v)> d;
3106
+ const RepartitionToWide<decltype(d)> dw;
3107
+ return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v));
3108
+ }
3109
+
3110
+ } // namespace detail
3111
+
3112
+ template <class V>
3113
+ HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(V v) {
3114
+ return detail::SumsOf2(hwy::TypeTag<TFromV<V>>(),
3115
+ hwy::SizeTag<sizeof(TFromV<V>)>(), v);
3116
+ }
3117
+ #endif // HWY_TARGET != HWY_SCALAR
3118
+
3119
+ // ------------------------------ SumsOf4
3120
+
3121
+ namespace detail {
3122
+
3123
+ template <class TypeTag, size_t kLaneSize, class V>
3124
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
3125
+ TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
3126
+ using hwy::HWY_NAMESPACE::SumsOf2;
3127
+ return SumsOf2(SumsOf2(v));
3128
+ }
3129
+
3130
+ } // namespace detail
3131
+
3132
+ template <class V>
3133
+ HWY_API VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(V v) {
3134
+ return detail::SumsOf4(hwy::TypeTag<TFromV<V>>(),
3135
+ hwy::SizeTag<sizeof(TFromV<V>)>(), v);
3136
+ }
3137
+
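// Editorial sketch in plain C++ (not from the package diff above): scalar
// reference for SumsOf2/SumsOf4 on u8 lanes. Each output lane sums 2 (resp. 4)
// adjacent input lanes and is twice (resp. four times) as wide, so the sums
// cannot wrap.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint16_t> SumsOf2Scalar(const std::vector<uint8_t>& in) {
  std::vector<uint16_t> out(in.size() / 2);
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = static_cast<uint16_t>(in[2 * i] + in[2 * i + 1]);
  }
  return out;
}

std::vector<uint32_t> SumsOf4Scalar(const std::vector<uint8_t>& in) {
  std::vector<uint32_t> out(in.size() / 4);
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = static_cast<uint32_t>(in[4 * i]) + in[4 * i + 1] +
             in[4 * i + 2] + in[4 * i + 3];
  }
  return out;
}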
1994
3138
  // ------------------------------ OrderedTruncate2To
1995
3139
 
1996
3140
  #if HWY_IDE || \
@@ -2206,8 +3350,7 @@ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
2206
3350
  #if HWY_TARGET == HWY_SCALAR
2207
3351
  const uint64_t u64_val = GetLane(v);
2208
3352
  const float f32_val = static_cast<float>(u64_val);
2209
- uint32_t f32_bits;
2210
- CopySameSize(&f32_val, &f32_bits);
3353
+ const uint32_t f32_bits = BitCastScalar<uint32_t>(f32_val);
2211
3354
  return Set(d, static_cast<uint64_t>(f32_bits >> 23));
2212
3355
  #else
2213
3356
  const Repartition<uint32_t, decltype(d)> du32;
@@ -2320,30 +3463,29 @@ HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
2320
3463
 
2321
3464
  // Change polynomial basis to GF(2^4)
2322
3465
  {
2323
- alignas(16) static constexpr uint8_t basisL[16] = {
2324
- 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
2325
- 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
2326
- alignas(16) static constexpr uint8_t basisU[16] = {
2327
- 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
2328
- 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
3466
+ const VFromD<decltype(du)> basisL =
3467
+ Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
3468
+ 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA);
3469
+ const VFromD<decltype(du)> basisU =
3470
+ Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
3471
+ 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD);
2329
3472
  const auto sL = And(state, mask);
2330
3473
  const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
2331
- const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
2332
- const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
3474
+ const auto gf4L = TableLookupBytes(basisL, sL);
3475
+ const auto gf4U = TableLookupBytes(basisU, sU);
2333
3476
  state = Xor(gf4L, gf4U);
2334
3477
  }
2335
3478
 
2336
3479
  // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
2337
3480
  // cause TableLookupBytesOr0 to return 0.
2338
- alignas(16) static constexpr uint8_t kZetaInv[16] = {
2339
- 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
2340
- alignas(16) static constexpr uint8_t kInv[16] = {
2341
- 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
2342
- const auto tbl = LoadDup128(du, kInv);
3481
+ const VFromD<decltype(du)> zetaInv = Dup128VecFromValues(
3482
+ du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3);
3483
+ const VFromD<decltype(du)> tbl = Dup128VecFromValues(
3484
+ du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4);
2343
3485
  const auto sL = And(state, mask); // L=low nibble, U=upper
2344
3486
  const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
2345
3487
  const auto sX = Xor(sU, sL);
2346
- const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
3488
+ const auto invL = TableLookupBytes(zetaInv, sL);
2347
3489
  const auto invU = TableLookupBytes(tbl, sU);
2348
3490
  const auto invX = TableLookupBytes(tbl, sX);
2349
3491
  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
@@ -2359,26 +3501,25 @@ HWY_INLINE V SubBytes(V state) {
2359
3501
  const DFromV<V> du;
2360
3502
  // Linear skew (cannot bake 0x63 bias into the table because out* indices
2361
3503
  // may have the infinity flag set).
2362
- alignas(16) static constexpr uint8_t kAffineL[16] = {
2363
- 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
2364
- 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
2365
- alignas(16) static constexpr uint8_t kAffineU[16] = {
2366
- 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
2367
- 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
2368
- return Xor(SubBytesMulInverseAndAffineLookup(state, LoadDup128(du, kAffineL),
2369
- LoadDup128(du, kAffineU)),
3504
+ const VFromD<decltype(du)> affineL =
3505
+ Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
3506
+ 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15);
3507
+ const VFromD<decltype(du)> affineU =
3508
+ Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
3509
+ 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E);
3510
+ return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU),
2370
3511
  Set(du, uint8_t{0x63}));
2371
3512
  }
2372
3513
 
2373
3514
  template <class V> // u8
2374
3515
  HWY_INLINE V InvSubBytes(V state) {
2375
3516
  const DFromV<V> du;
2376
- alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvL[16]{
2377
- 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
2378
- 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7};
2379
- alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvU[16]{
2380
- 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
2381
- 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA};
3517
+ const VFromD<decltype(du)> gF2P4InvToGF2P8InvL =
3518
+ Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
3519
+ 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7);
3520
+ const VFromD<decltype(du)> gF2P4InvToGF2P8InvU =
3521
+ Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
3522
+ 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA);
2382
3523
 
2383
3524
  // Apply the inverse affine transformation
2384
3525
  const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
@@ -2392,16 +3533,14 @@ HWY_INLINE V InvSubBytes(V state) {
2392
3533
  // - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
2393
3534
  // multiplicative inverse through table lookups using the
2394
3535
  // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables
2395
- return SubBytesMulInverseAndAffineLookup(
2396
- b, LoadDup128(du, kGF2P4InvToGF2P8InvL),
2397
- LoadDup128(du, kGF2P4InvToGF2P8InvU));
3536
+ return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
3537
+ gF2P4InvToGF2P8InvU);
2398
3538
  }
2399
3539
 
2400
3540
  } // namespace detail
2401
3541
 
2402
3542
  #endif // HWY_TARGET != HWY_SCALAR
2403
3543
 
2404
- // "Include guard": skip if native AES instructions are available.
2405
3544
  #if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
2406
3545
  #ifdef HWY_NATIVE_AES
2407
3546
  #undef HWY_NATIVE_AES
@@ -2417,24 +3556,18 @@ namespace detail {
2417
3556
  template <class V> // u8
2418
3557
  HWY_INLINE V ShiftRows(const V state) {
2419
3558
  const DFromV<V> du;
2420
- alignas(16) static constexpr uint8_t kShiftRow[16] = {
2421
- 0, 5, 10, 15, // transposed: state is column major
2422
- 4, 9, 14, 3, //
2423
- 8, 13, 2, 7, //
2424
- 12, 1, 6, 11};
2425
- const auto shift_row = LoadDup128(du, kShiftRow);
3559
+ // transposed: state is column major
3560
+ const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
3561
+ du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11);
2426
3562
  return TableLookupBytes(state, shift_row);
2427
3563
  }
2428
3564
 
2429
3565
  template <class V> // u8
2430
3566
  HWY_INLINE V InvShiftRows(const V state) {
2431
3567
  const DFromV<V> du;
2432
- alignas(16) static constexpr uint8_t kShiftRow[16] = {
2433
- 0, 13, 10, 7, // transposed: state is column major
2434
- 4, 1, 14, 11, //
2435
- 8, 5, 2, 15, //
2436
- 12, 9, 6, 3};
2437
- const auto shift_row = LoadDup128(du, kShiftRow);
3568
+ // transposed: state is column major
3569
+ const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
3570
+ du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3);
2438
3571
  return TableLookupBytes(state, shift_row);
2439
3572
  }
2440
3573
 
@@ -2455,15 +3588,15 @@ HWY_INLINE V MixColumns(const V state) {
2455
3588
  // 1 2 3 1 // d are on diagonal, no permutation needed.
2456
3589
  // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
2457
3590
  // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
2458
- alignas(16) static constexpr uint8_t k2301[16] = {
2459
- 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
2460
- alignas(16) static constexpr uint8_t k1230[16] = {
2461
- 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
3591
+ const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
3592
+ du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
3593
+ const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
3594
+ du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
2462
3595
  const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8).
2463
- const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
3596
+ const auto s2301 = TableLookupBytes(state, v2301);
2464
3597
  const auto d_s2301 = Xor(d, s2301);
2465
3598
  const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
2466
- const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
3599
+ const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230);
2467
3600
  return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
2468
3601
  }
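// Editorial sketch in plain C++ (not from the package diff above): the GF(2^8)
// doubling that GF2P8Mod11BMulBy2 provides for the MixColumns math above.
// state*3 is then formed as XTime(s) ^ s, and the four matrix terms are
// combined with XOR as described in the comment matrix.
#include <cstdint>

uint8_t XTime(uint8_t s) {  // multiply by 2 modulo the AES polynomial 0x11B
  return static_cast<uint8_t>((s << 1) ^ ((s & 0x80) ? 0x1B : 0x00));
}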
2469
3602
 
@@ -2475,11 +3608,10 @@ HWY_INLINE V InvMixColumns(const V state) {
2475
3608
  // 9 14 11 13
2476
3609
  // 13 9 14 11
2477
3610
  // 11 13 9 14
2478
- alignas(16) static constexpr uint8_t k2301[16] = {
2479
- 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
2480
- alignas(16) static constexpr uint8_t k1230[16] = {
2481
- 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
2482
- const auto v1230 = LoadDup128(du, k1230);
3611
+ const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
3612
+ du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
3613
+ const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
3614
+ du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
2483
3615
 
2484
3616
  const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */
2485
3617
  const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */
@@ -2491,8 +3623,7 @@ HWY_INLINE V InvMixColumns(const V state) {
2491
3623
 
2492
3624
  const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230));
2493
3625
  const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230));
2494
- const auto sx13_2301_sx9_3012 =
2495
- TableLookupBytes(sx13_0123_sx9_1230, LoadDup128(du, k2301));
3626
+ const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301);
2496
3627
  return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012);
2497
3628
  }
2498
3629
 
@@ -2543,15 +3674,15 @@ HWY_API V AESLastRoundInv(V state, const V round_key) {
2543
3674
 
2544
3675
  template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
2545
3676
  HWY_API V AESKeyGenAssist(V v) {
2546
- alignas(16) static constexpr uint8_t kRconXorMask[16] = {
2547
- 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0};
2548
- alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
2549
- 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
2550
3677
  const DFromV<decltype(v)> d;
3678
+ const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0,
3679
+ 0, 0, kRcon, 0, 0, 0);
3680
+ const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12,
3681
+ 13, 14, 15, 13, 14, 15, 12);
2551
3682
  const auto sub_word_result = detail::SubBytes(v);
2552
3683
  const auto rot_word_result =
2553
- TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle));
2554
- return Xor(rot_word_result, LoadDup128(d, kRconXorMask));
3684
+ TableLookupBytes(sub_word_result, rotWordShuffle);
3685
+ return Xor(rot_word_result, rconXorMask);
2555
3686
  }
2556
3687
 
2557
3688
  // Constant-time implementation inspired by
@@ -2602,203 +3733,751 @@ HWY_API V CLMulUpper(V a, V b) {
2602
3733
  const auto b2 = And(b, k4);
2603
3734
  const auto b3 = And(b, k8);
2604
3735
 
2605
- auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
2606
- auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
2607
- auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
2608
- auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
2609
- m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
2610
- m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
2611
- m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
2612
- m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
2613
- return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
3736
+ auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
3737
+ auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
3738
+ auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
3739
+ auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
3740
+ m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
3741
+ m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
3742
+ m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
3743
+ m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
3744
+ return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
3745
+ }
3746
+
3747
+ #endif // HWY_NATIVE_AES
3748
+ #endif // HWY_TARGET != HWY_SCALAR
3749
+
3750
+ // ------------------------------ PopulationCount
3751
+
3752
+ #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
3753
+ #ifdef HWY_NATIVE_POPCNT
3754
+ #undef HWY_NATIVE_POPCNT
3755
+ #else
3756
+ #define HWY_NATIVE_POPCNT
3757
+ #endif
3758
+
3759
+ // This overload requires vectors to be at least 16 bytes, which is the case
3760
+ // for LMUL >= 2.
3761
+ #undef HWY_IF_POPCNT
3762
+ #if HWY_TARGET == HWY_RVV
3763
+ #define HWY_IF_POPCNT(D) \
3764
+ hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
3765
+ #else
3766
+ // Other targets only have these two overloads which are mutually exclusive, so
3767
+ // no further conditions are required.
3768
+ #define HWY_IF_POPCNT(D) void* = nullptr
3769
+ #endif // HWY_TARGET == HWY_RVV
3770
+
3771
+ template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
3772
+ HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
3773
+ HWY_API V PopulationCount(V v) {
3774
+ const D d;
3775
+ const V lookup =
3776
+ Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
3777
+ const auto lo = And(v, Set(d, uint8_t{0xF}));
3778
+ const auto hi = ShiftRight<4>(v);
3779
+ return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
3780
+ }
3781
+
3782
+ // RVV has a specialization that avoids the Set().
3783
+ #if HWY_TARGET != HWY_RVV
3784
+ // Slower fallback for capped vectors.
3785
+ template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
3786
+ HWY_IF_V_SIZE_LE_D(D, 8)>
3787
+ HWY_API V PopulationCount(V v) {
3788
+ const D d;
3789
+ // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
3790
+ const V k33 = Set(d, uint8_t{0x33});
3791
+ v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
3792
+ v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
3793
+ return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
3794
+ }
3795
+ #endif // HWY_TARGET != HWY_RVV
3796
+
3797
+ template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
3798
+ HWY_API V PopulationCount(V v) {
3799
+ const D d;
3800
+ const Repartition<uint8_t, decltype(d)> d8;
3801
+ const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
3802
+ return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
3803
+ }
3804
+
3805
+ template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
3806
+ HWY_API V PopulationCount(V v) {
3807
+ const D d;
3808
+ Repartition<uint16_t, decltype(d)> d16;
3809
+ auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
3810
+ return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
3811
+ }
3812
+
3813
+ #if HWY_HAVE_INTEGER64
3814
+ template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
3815
+ HWY_API V PopulationCount(V v) {
3816
+ const D d;
3817
+ Repartition<uint32_t, decltype(d)> d32;
3818
+ auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
3819
+ return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
3820
+ }
3821
+ #endif
3822
+
3823
+ #endif // HWY_NATIVE_POPCNT
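// Editorial sketch in plain C++ (not from the package diff above): scalar
// counterparts of the two u8 strategies above. The first mirrors the
// TableLookupBytes variant (a 16-entry popcount-of-nibble table applied to each
// nibble); the second mirrors the bit-trick fallback from the cited paper.
#include <cstdint>

uint8_t PopCount8Table(uint8_t v) {
  static const uint8_t kNibblePop[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                         1, 2, 2, 3, 2, 3, 3, 4};
  return static_cast<uint8_t>(kNibblePop[v & 0xF] + kNibblePop[v >> 4]);
}

uint8_t PopCount8Swar(uint8_t v) {
  v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));
  v = static_cast<uint8_t>((v & 0x33) + ((v >> 2) & 0x33));
  return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);
}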
3824
+
3825
+ // ------------------------------ 8-bit multiplication
3826
+
3827
+ #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
3828
+ #ifdef HWY_NATIVE_MUL_8
3829
+ #undef HWY_NATIVE_MUL_8
3830
+ #else
3831
+ #define HWY_NATIVE_MUL_8
3832
+ #endif
3833
+
3834
+ // 8 bit and fits in wider reg: promote
3835
+ template <class V, HWY_IF_T_SIZE_V(V, 1),
3836
+ HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
3837
+ HWY_API V operator*(const V a, const V b) {
3838
+ const DFromV<decltype(a)> d;
3839
+ const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
3840
+ const RebindToUnsigned<decltype(d)> du; // TruncateTo result
3841
+ const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input
3842
+ const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
3843
+ // TruncateTo is cheaper than ConcatEven.
3844
+ return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
3845
+ }
3846
+
3847
+ // 8 bit full reg: promote halves
3848
+ template <class V, HWY_IF_T_SIZE_V(V, 1),
3849
+ HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
3850
+ HWY_API V operator*(const V a, const V b) {
3851
+ const DFromV<decltype(a)> d;
3852
+ const Half<decltype(d)> dh;
3853
+ const Twice<RepartitionToWide<decltype(dh)>> dw;
3854
+ const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
3855
+ const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
3856
+ const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
3857
+ const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
3858
+ const VFromD<decltype(dw)> m0 = a0 * b0;
3859
+ const VFromD<decltype(dw)> m1 = a1 * b1;
3860
+ return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
3861
+ }
3862
+
3863
+ #endif // HWY_NATIVE_MUL_8
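// Editorial sketch in plain C++ (not from the package diff above): what the
// promote-multiply-truncate sequence above computes per lane, namely the
// 8-bit product modulo 256.
#include <cstdint>

uint8_t Mul8Lane(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>(static_cast<uint16_t>(a) * b);
}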
3864
+
3865
+ // ------------------------------ 64-bit multiplication
3866
+
3867
+ #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
3868
+ #ifdef HWY_NATIVE_MUL_64
3869
+ #undef HWY_NATIVE_MUL_64
3870
+ #else
3871
+ #define HWY_NATIVE_MUL_64
3872
+ #endif
3873
+
3874
+ // Single-lane i64 or u64
3875
+ template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
3876
+ HWY_IF_NOT_FLOAT_V(V)>
3877
+ HWY_API V operator*(V x, V y) {
3878
+ const DFromV<V> d;
3879
+ using T = TFromD<decltype(d)>;
3880
+ using TU = MakeUnsigned<T>;
3881
+ const TU xu = static_cast<TU>(GetLane(x));
3882
+ const TU yu = static_cast<TU>(GetLane(y));
3883
+ return Set(d, static_cast<T>(xu * yu));
3884
+ }
3885
+
3886
+ template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
3887
+ HWY_IF_V_SIZE_GT_D(D64, 8)>
3888
+ HWY_API V operator*(V x, V y) {
3889
+ RepartitionToNarrow<D64> d32;
3890
+ auto x32 = BitCast(d32, x);
3891
+ auto y32 = BitCast(d32, y);
3892
+ auto lolo = BitCast(d32, MulEven(x32, y32));
3893
+ auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
3894
+ auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
3895
+ auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
3896
+ return BitCast(D64{}, lolo + hi);
3897
+ }
3898
+ template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
3899
+ HWY_IF_V_SIZE_GT_D(DI64, 8)>
3900
+ HWY_API V operator*(V x, V y) {
3901
+ RebindToUnsigned<DI64> du64;
3902
+ return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
3903
+ }
3904
+
3905
+ #endif // HWY_NATIVE_MUL_64
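// Editorial sketch in plain C++ (not from the package diff above): the
// schoolbook decomposition behind the u64 overload above. The low 64 bits of
// x*y are (x_lo*y_lo) plus the two 32x32 cross products shifted left by 32;
// the x_hi*y_hi term only affects bits >= 64 and is dropped.
#include <cstdint>

uint64_t Mul64Lo(uint64_t x, uint64_t y) {
  const uint64_t x_lo = x & 0xFFFFFFFFu, x_hi = x >> 32;
  const uint64_t y_lo = y & 0xFFFFFFFFu, y_hi = y >> 32;
  const uint64_t lolo = x_lo * y_lo;
  const uint64_t cross = (x_lo * y_hi + x_hi * y_lo) << 32;  // mod 2^64
  return lolo + cross;
}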
3906
+
3907
+ // ------------------------------ MulAdd / NegMulAdd
3908
+
3909
+ #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
3910
+ #ifdef HWY_NATIVE_INT_FMA
3911
+ #undef HWY_NATIVE_INT_FMA
3912
+ #else
3913
+ #define HWY_NATIVE_INT_FMA
3914
+ #endif
3915
+
3916
+ #ifdef HWY_NATIVE_INT_FMSUB
3917
+ #undef HWY_NATIVE_INT_FMSUB
3918
+ #else
3919
+ #define HWY_NATIVE_INT_FMSUB
3920
+ #endif
3921
+
3922
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3923
+ HWY_API V MulAdd(V mul, V x, V add) {
3924
+ return Add(Mul(mul, x), add);
3925
+ }
3926
+
3927
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3928
+ HWY_API V NegMulAdd(V mul, V x, V add) {
3929
+ return Sub(add, Mul(mul, x));
3930
+ }
3931
+
3932
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3933
+ HWY_API V MulSub(V mul, V x, V sub) {
3934
+ return Sub(Mul(mul, x), sub);
3935
+ }
3936
+ #endif // HWY_NATIVE_INT_FMA
3937
+
3938
+ // ------------------------------ Integer MulSub / NegMulSub
3939
+ #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
3940
+ #ifdef HWY_NATIVE_INT_FMSUB
3941
+ #undef HWY_NATIVE_INT_FMSUB
3942
+ #else
3943
+ #define HWY_NATIVE_INT_FMSUB
3944
+ #endif
3945
+
3946
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3947
+ HWY_API V MulSub(V mul, V x, V sub) {
3948
+ const DFromV<decltype(mul)> d;
3949
+ const RebindToSigned<decltype(d)> di;
3950
+ return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub))));
3951
+ }
3952
+
3953
+ #endif // HWY_NATIVE_INT_FMSUB
3954
+
3955
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
3956
+ HWY_API V NegMulSub(V mul, V x, V sub) {
3957
+ const DFromV<decltype(mul)> d;
3958
+ const RebindToSigned<decltype(d)> di;
3959
+
3960
+ return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub))));
3961
+ }
3962
+
3963
+ // ------------------------------ MulAddSub
3964
+
3965
+ // MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to
3966
+ // MulSub(mul, x, sub_or_add)
3967
+ template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
3968
+ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
3969
+ return MulSub(mul, x, sub_or_add);
3970
+ }
3971
+
3972
+ // MulAddSub for F16/F32/F64 vectors with 2 or more lanes on
3973
+ // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and
3974
+ // x86_512-inl.h
3975
+ template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
3976
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | ((HWY_TARGET <= HWY_SSSE3 &&
3977
+ hwy::IsFloat<TFromV<V>>())
3978
+ ? 0
3979
+ : ((1 << 2) | (1 << 4) |
3980
+ (1 << 8))))>
3981
+ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
3982
+ using D = DFromV<V>;
3983
+ using T = TFromD<D>;
3984
+ using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
3985
+
3986
+ const D d;
3987
+ const Rebind<TNegate, D> d_negate;
3988
+
3989
+ const auto add =
3990
+ OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
3991
+ return MulAdd(mul, x, add);
3992
+ }
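// Editorial sketch in plain C++ (not from the package diff above): the lane
// pattern produced by the OddEven/Neg construction above. Even lanes compute
// mul*x - sub_or_add, odd lanes compute mul*x + sub_or_add.
#include <cstddef>
#include <vector>

std::vector<float> MulAddSubScalar(const std::vector<float>& mul,
                                   const std::vector<float>& x,
                                   const std::vector<float>& sub_or_add) {
  std::vector<float> out(mul.size());
  for (size_t i = 0; i < out.size(); ++i) {
    const float term = (i % 2 == 0) ? -sub_or_add[i] : sub_or_add[i];
    out[i] = mul[i] * x[i] + term;
  }
  return out;
}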
3993
+
3994
+ // ------------------------------ Integer division
3995
+ #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
3996
+ #ifdef HWY_NATIVE_INT_DIV
3997
+ #undef HWY_NATIVE_INT_DIV
3998
+ #else
3999
+ #define HWY_NATIVE_INT_DIV
4000
+ #endif
4001
+
4002
+ namespace detail {
4003
+
4004
+ template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
4005
+ HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) {
4006
+ return ConvertTo(di, vf);
4007
+ }
4008
+
4009
+ template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
4010
+ HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) {
4011
+ return ConvertTo(df, vi);
4012
+ }
4013
+
4014
+ #if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
4015
+ template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
4016
+ HWY_INLINE Vec<D> IntDivConvFloatToInt(D df, V vi) {
4017
+ return PromoteTo(df, vi);
4018
+ }
4019
+
4020
+ // If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32
4021
+ // IntDivConvIntToFloat(df, vi) returns an approximation of
4022
+ // static_cast<float>(v[i]) that is within 4 ULP of static_cast<float>(v[i])
4023
+ template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
4024
+ HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) {
4025
+ const Twice<decltype(df32)> dt_f32;
4026
+
4027
+ auto vf32 =
4028
+ ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi));
4029
+
4030
+ #if HWY_IS_LITTLE_ENDIAN
4031
+ const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4032
+ auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4033
+ #else
4034
+ const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4035
+ auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4036
+ #endif
4037
+
4038
+ const RebindToSigned<decltype(df32)> di32;
4039
+
4040
+ hi_f32 =
4041
+ Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))),
4042
+ Set(df32, 1.0f)));
4043
+ return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
4044
+ }
4045
+
4046
+ template <class D, class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)>
4047
+ HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vu) {
4048
+ const Twice<decltype(df32)> dt_f32;
4049
+
4050
+ auto vf32 =
4051
+ ConvertTo(dt_f32, BitCast(RebindToUnsigned<decltype(dt_f32)>(), vu));
4052
+
4053
+ #if HWY_IS_LITTLE_ENDIAN
4054
+ const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4055
+ const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4056
+ #else
4057
+ const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
4058
+ const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
4059
+ #endif
4060
+
4061
+ return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
4062
+ }
4063
+ #endif // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
4064
+
4065
+ template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4066
+ HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)>
4067
+ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4068
+ const DFromV<decltype(a)> d;
4069
+ const RebindToFloat<decltype(d)> df;
4070
+
4071
+ // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the
4072
+ // [LimitsMin<SignedFromSize<kOrigLaneSize>>(),
4073
+ // LimitsMax<UnsignedFromSize<kOrigLaneSize>>()] range.
4074
+
4075
+ // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also
4076
+ // guaranteed to be true if MakeFloat<T> has at least kOrigLaneSize*8 + 1
4077
+ // mantissa bits (including the implied one bit), where flt_q is equal to
4078
+ // static_cast<MakeFloat<T>>(a[i]) / static_cast<MakeFloat<T>>(b[i]),
4079
+ // even in the case where the magnitude of an inexact floating point division
4080
+ // result is rounded up.
4081
+
4082
+ // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true
4083
+ // if (a[i] % b[i]) != 0 is true and MakeFloat<T> has at least
4084
+ // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in
4085
+ // the case where the magnitude of an inexact floating point division result
4086
+ // is rounded up.
4087
+
4088
+ #if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4089
+ !HWY_HAVE_FLOAT64
4090
+ // On Armv7, do division by multiplying by the ApproximateReciprocal
4091
+ // to avoid unnecessary overhead as F32 Div refines the approximate
4092
+ // reciprocal using 4 Newton-Raphson iterations
4093
+
4094
+ const RebindToSigned<decltype(d)> di;
4095
+ const RebindToUnsigned<decltype(d)> du;
4096
+
4097
+ const auto flt_b = ConvertTo(df, b);
4098
+ auto flt_recip_b = ApproximateReciprocal(flt_b);
4099
+ if (kOrigLaneSize > 1) {
4100
+ flt_recip_b =
4101
+ Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
4102
+ }
4103
+
4104
+ auto q0 = ConvertTo(d, Mul(ConvertTo(df, a), flt_recip_b));
4105
+ const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
4106
+
4107
+ auto r1 = r0;
4108
+
4109
+ // Need to negate r1[i] if a[i] < 0 is true
4110
+ if (IsSigned<TFromV<V>>()) {
4111
+ r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1);
4112
+ }
4113
+
4114
+ // r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i]
4115
+
4116
+ auto abs_b = BitCast(du, b);
4117
+ if (IsSigned<TFromV<V>>()) {
4118
+ abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
4119
+ }
4120
+
4121
+ // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1.
4122
+ // Otherwise, set q1[i] to 0.
4123
+
4124
+ // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned
4125
+ // comparison as static_cast<TU>(r1[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
4126
+ // will be true if r1[i] < 0 is true.
4127
+ auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b)));
4128
+
4129
+ // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0
4130
+
4131
+ // Need to negate q1[i] if r0[i] and b[i] do not have the same sign
4132
+ auto q1_negate_mask = r0;
4133
+ if (IsSigned<TFromV<V>>()) {
4134
+ q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b));
4135
+ }
4136
+ q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1);
4137
+
4138
+ // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ?
4139
+   //   (((r0[i] ^ b[i]) < 0) ? 1 : -1) : 0
4140
+
4141
+ // Need to subtract q1[i] from q0[i] to get the final result
4142
+ return Sub(q0, BitCast(d, q1));
4143
+ #else
4144
+   // On targets other than Armv7 NEON, use F16 or F32 division, as most of
4145
+   // them have native F32 divide instructions
4146
+ return ConvertTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
4147
+ #endif
4148
+ }
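The Armv7 branch above tolerates a quotient estimate that is off by one in either direction and repairs it from the remainder. Below is a scalar sketch of that fix-up; the function name and the exact 1.0f / b standing in for the refined NEON reciprocal are my own, and it assumes a and b come from a narrower lane type so the float conversions are exact (it is not part of this diff).

```cpp
#include <cstdint>

// q0 is allowed to be off by +/-1; the remainder test then corrects it.
static int32_t DivViaInexactF32(int32_t a, int32_t b) {
  const float recip = 1.0f / static_cast<float>(b);  // models ApproximateReciprocal + NR step
  const int32_t q0 = static_cast<int32_t>(static_cast<float>(a) * recip);
  const int32_t r0 = a - q0 * b;                     // NegMulAdd(q0, b, a)
  const int32_t r1 = (a < 0) ? -r0 : r0;             // remainder, sign-adjusted by a
  const uint32_t abs_b =
      (b < 0) ? (0u - static_cast<uint32_t>(b)) : static_cast<uint32_t>(b);
  // One unsigned compare covers both r1 < 0 and r1 >= |b|.
  int32_t q1 = (static_cast<uint32_t>(r1) >= abs_b) ? -1 : 0;
  if ((r0 ^ b) < 0) q1 = -q1;                        // direction of the correction
  return q0 - q1;
}
```

For instance, with a = 7, b = 2 and an estimate q0 = 4, r0 is -1, q1 ends up as 1, and the returned quotient is the exact 3.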
4149
+
4150
+ template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4151
+ HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize),
4152
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
4153
+ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4154
+ // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal
4155
+ // multiplication steps are needed as the mantissa of MakeFloat<T> has fewer
4156
+ // than kOrigLaneSize*8 + 1 bits
4157
+
4158
+ using T = TFromV<V>;
4159
+
4160
+ #if HWY_HAVE_FLOAT64
4161
+ using TF = MakeFloat<T>;
4162
+ #else
4163
+ using TF = float;
4164
+ #endif
4165
+
4166
+ const DFromV<decltype(a)> d;
4167
+ const RebindToSigned<decltype(d)> di;
4168
+ const RebindToUnsigned<decltype(d)> du;
4169
+ const Rebind<TF, decltype(d)> df;
4170
+
4171
+ if (!IsSigned<T>()) {
4172
+   // If T is unsigned and b[i] > LimitsMax<MakeSigned<T>>() is true, set a[i]
4173
+   // to (a[i] >= b[i] ? 1 : 0) and set b[i] to 1
4174
+
4175
+ const auto one = Set(di, MakeSigned<T>{1});
4176
+ a = BitCast(
4177
+ d, IfNegativeThenElse(BitCast(di, b),
4178
+ IfThenElseZero(RebindMask(di, Ge(a, b)), one),
4179
+ BitCast(di, a)));
4180
+ b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b)));
4181
+ }
4182
+
4183
+ // LimitsMin<T>() <= b[i] <= LimitsMax<MakeSigned<T>>() is now true
4184
+
4185
+ const auto flt_b = IntDivConvIntToFloat(df, b);
4186
+
4187
+ #if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4188
+ !HWY_HAVE_FLOAT64
4189
+ auto flt_recip_b = ApproximateReciprocal(flt_b);
4190
+ flt_recip_b =
4191
+ Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
4192
+ #else
4193
+ const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b);
4194
+ #endif
4195
+
4196
+ auto q0 =
4197
+ IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
4198
+ const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
4199
+
4200
+ auto q1 =
4201
+ IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
4202
+ const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);
4203
+
4204
+ auto r3 = r1;
4205
+
4206
+ #if !HWY_HAVE_FLOAT64
4207
+ // Need two additional reciprocal multiplication steps for I64/U64 vectors if
4208
+ // HWY_HAVE_FLOAT64 is 0
4209
+ if (sizeof(T) == 8) {
4210
+ const auto q2 = IntDivConvFloatToInt(
4211
+ di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b));
4212
+ const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1);
4213
+
4214
+ const auto q3 = IntDivConvFloatToInt(
4215
+ di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b));
4216
+ r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2);
4217
+
4218
+ q0 = Add(q0, BitCast(d, q2));
4219
+ q1 = Add(q1, q3);
4220
+ }
4221
+ #endif // !HWY_HAVE_FLOAT64
4222
+
4223
+ auto r4 = r3;
4224
+
4225
+ // Need to negate r4[i] if a[i] < 0 is true
4226
+ if (IsSigned<TFromV<V>>()) {
4227
+ r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4);
4228
+ }
4229
+
4230
+ // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i]
4231
+
4232
+ auto abs_b = BitCast(du, b);
4233
+ if (IsSigned<TFromV<V>>()) {
4234
+ abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
4235
+ }
4236
+
4237
+ // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1.
4238
+   // Otherwise, set q4[i] to 0.
4239
+
4240
+ // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned
4241
+ // comparison as static_cast<TU>(r4[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
4242
+ // will be true if r4[i] < 0 is true.
4243
+ auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b)));
4244
+
4245
+ // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0
4246
+
4247
+ // Need to negate q4[i] if r3[i] and b[i] do not have the same sign
4248
+ auto q4_negate_mask = r3;
4249
+ if (IsSigned<TFromV<V>>()) {
4250
+ q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b));
4251
+ }
4252
+ q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4);
4253
+
4254
+ // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ?
4255
+   //   (((r3[i] ^ b[i]) < 0) ? 1 : -1) : 0
4256
+
4257
+ // The final result is equal to q0[i] + q1[i] - q4[i]
4258
+ return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4));
2614
4259
  }
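The unsigned pre-step near the top of this overload deserves spelling out: when b[i] lies outside the signed range, the quotient can only be 0 or 1, so the operands are replaced by an equivalent pair whose divisor does fit. A hedged scalar sketch of that normalization (names are mine, not from the diff); only the quotient has to be preserved because IntMod reconstructs the remainder later from the original operands.

```cpp
#include <cstdint>

// Replace (a, b) by a pair with the same truncated quotient whose divisor
// fits in the signed 32-bit range.
static void NormalizeLargeUnsignedDivisor(uint32_t& a, uint32_t& b) {
  if (b > 0x7FFFFFFFu) {     // b > LimitsMax<int32_t>()
    a = (a >= b) ? 1u : 0u;  // the original quotient is exactly this value
    b = 1u;                  // 0/1 or 1/1 reproduces it
  }
}
```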
2615
4260
 
2616
- #endif // HWY_NATIVE_AES
2617
- #endif // HWY_TARGET != HWY_SCALAR
4261
+ template <size_t kOrigLaneSize, class V,
4262
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
4263
+ HWY_IF_V_SIZE_LE_V(
4264
+ V, HWY_MAX_BYTES /
4265
+ ((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))>
4266
+ HWY_INLINE V IntDiv(V a, V b) {
4267
+ using T = TFromV<V>;
2618
4268
 
2619
- // ------------------------------ PopulationCount
4269
+ // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32
4270
+ using TW = MakeWide<
4271
+ If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1), MakeWide<T>, T>>;
2620
4272
 
2621
- // "Include guard": skip if native POPCNT-related instructions are available.
2622
- #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
2623
- #ifdef HWY_NATIVE_POPCNT
2624
- #undef HWY_NATIVE_POPCNT
4273
+ const DFromV<decltype(a)> d;
4274
+ const Rebind<TW, decltype(d)> dw;
4275
+
4276
+ #if HWY_TARGET <= HWY_SSE2
4277
+ // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
4278
+ // unnecessary overhead
4279
+ const RebindToSigned<decltype(dw)> dw_i;
4280
+
4281
+ // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<T> if
4282
+ // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead
4283
+ const If<(kOrigLaneSize < sizeof(T)), RebindToSigned<decltype(d)>,
4284
+ decltype(d)>
4285
+ d_demote_to;
2625
4286
  #else
2626
- #define HWY_NATIVE_POPCNT
4287
+ // On other targets, promote to TW and demote to T
4288
+ const decltype(dw) dw_i;
4289
+ const decltype(d) d_demote_to;
2627
4290
  #endif
2628
4291
 
2629
- // This overload requires vectors to be at least 16 bytes, which is the case
2630
- // for LMUL >= 2.
2631
- #undef HWY_IF_POPCNT
2632
- #if HWY_TARGET == HWY_RVV
2633
- #define HWY_IF_POPCNT(D) \
2634
- hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
4292
+ return BitCast(
4293
+ d, DemoteTo(d_demote_to, IntDivUsingFloatDiv<kOrigLaneSize>(
4294
+ PromoteTo(dw_i, a), PromoteTo(dw_i, b))));
4295
+ }
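The overload above promotes 8- and 16-bit lanes to a wider type before dividing, so the earlier mantissa argument applies: with at least kOrigLaneSize*8 + 1 mantissa bits, truncating the (possibly inexact) float quotient still yields the exact integer quotient. A one-function scalar illustration of that guarantee (not part of the diff; b must be nonzero):

```cpp
#include <cstdint>

// float has 24 mantissa bits >= 16 + 1, so truncating the rounded F32 quotient
// recovers the exact integer quotient for any 16-bit operands.
static uint16_t Div16ViaF32(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>(static_cast<float>(a) / static_cast<float>(b));
}
```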
4296
+
4297
+ template <size_t kOrigLaneSize, class V,
4298
+ HWY_IF_T_SIZE_ONE_OF_V(V,
4299
+ (HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)),
4300
+ HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
4301
+ HWY_INLINE V IntDiv(V a, V b) {
4302
+ const DFromV<decltype(a)> d;
4303
+ const RepartitionToWide<decltype(d)> dw;
4304
+
4305
+ #if HWY_TARGET <= HWY_SSE2
4306
+ // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
4307
+ // unnecessary overhead
4308
+ const RebindToSigned<decltype(dw)> dw_i;
4309
+
4310
+ // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<TFromV<V>> if
4311
+ // kOrigLaneSize < sizeof(TFromV<V>) to avoid unnecessary overhead
4312
+ const If<(kOrigLaneSize < sizeof(TFromV<V>)), RebindToSigned<decltype(d)>,
4313
+ decltype(d)>
4314
+ d_demote_to;
2635
4315
  #else
2636
- // Other targets only have these two overloads which are mutually exclusive, so
2637
- // no further conditions are required.
2638
- #define HWY_IF_POPCNT(D) void* = nullptr
2639
- #endif // HWY_TARGET == HWY_RVV
4316
+ // On other targets, promote to MakeWide<TFromV<V>> and demote to TFromV<V>
4317
+ const decltype(dw) dw_i;
4318
+ const decltype(d) d_demote_to;
4319
+ #endif
2640
4320
 
2641
- template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
2642
- HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
2643
- HWY_API V PopulationCount(V v) {
2644
- const D d;
2645
- HWY_ALIGN constexpr uint8_t kLookup[16] = {
2646
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
2647
- };
2648
- const auto lo = And(v, Set(d, uint8_t{0xF}));
2649
- const auto hi = ShiftRight<4>(v);
2650
- const auto lookup = LoadDup128(d, kLookup);
2651
- return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
4321
+ return BitCast(d, OrderedDemote2To(
4322
+ d_demote_to,
4323
+ IntDivUsingFloatDiv<kOrigLaneSize>(
4324
+ PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)),
4325
+ IntDivUsingFloatDiv<kOrigLaneSize>(
4326
+ PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b))));
2652
4327
  }
2653
4328
 
2654
- // RVV has a specialization that avoids the Set().
2655
- #if HWY_TARGET != HWY_RVV
2656
- // Slower fallback for capped vectors.
2657
- template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
2658
- HWY_IF_V_SIZE_LE_D(D, 8)>
2659
- HWY_API V PopulationCount(V v) {
2660
- const D d;
2661
- // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
2662
- const V k33 = Set(d, uint8_t{0x33});
2663
- v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
2664
- v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
2665
- return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
2666
- }
2667
- #endif // HWY_TARGET != HWY_RVV
4329
+ #if !HWY_HAVE_FLOAT16
4330
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
4331
+ HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
4332
+ HWY_INLINE V IntDiv(V a, V b) {
4333
+ const DFromV<decltype(a)> d;
4334
+ const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
2668
4335
 
2669
- template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
2670
- HWY_API V PopulationCount(V v) {
2671
- const D d;
2672
- const Repartition<uint8_t, decltype(d)> d8;
2673
- const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
2674
- return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
4336
+ #if HWY_TARGET <= HWY_SSE2
4337
+ // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
4338
+ // overhead
4339
+ const RebindToSigned<decltype(dw)> dw_i;
4340
+ #else
4341
+ // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
4342
+ const decltype(dw) dw_i;
4343
+ #endif
4344
+
4345
+ return DemoteTo(d,
4346
+ BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b))));
2675
4347
  }
4348
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
4349
+ HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
4350
+ HWY_INLINE V IntDiv(V a, V b) {
4351
+ const DFromV<decltype(a)> d;
4352
+ const RepartitionToWide<decltype(d)> dw;
2676
4353
 
2677
- template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
2678
- HWY_API V PopulationCount(V v) {
2679
- const D d;
2680
- Repartition<uint16_t, decltype(d)> d16;
2681
- auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
2682
- return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
4354
+ #if HWY_TARGET <= HWY_SSE2
4355
+ // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
4356
+ // overhead
4357
+ const RebindToSigned<decltype(dw)> dw_i;
4358
+ #else
4359
+ // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
4360
+ const decltype(dw) dw_i;
4361
+ #endif
4362
+
4363
+ return OrderedDemote2To(
4364
+ d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))),
4365
+ BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b))));
2683
4366
  }
4367
+ #endif // !HWY_HAVE_FLOAT16
2684
4368
 
2685
- #if HWY_HAVE_INTEGER64
2686
- template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
2687
- HWY_API V PopulationCount(V v) {
2688
- const D d;
2689
- Repartition<uint32_t, decltype(d)> d32;
2690
- auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
2691
- return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
4369
+ template <size_t kOrigLaneSize, class V,
4370
+ HWY_IF_T_SIZE_ONE_OF_V(V,
4371
+ (HWY_HAVE_FLOAT64 ? 0 : (1 << 4)) | (1 << 8))>
4372
+ HWY_INLINE V IntDiv(V a, V b) {
4373
+ return IntDivUsingFloatDiv<kOrigLaneSize>(a, b);
2692
4374
  }
2693
- #endif
2694
4375
 
2695
- #endif // HWY_NATIVE_POPCNT
4376
+ #if HWY_HAVE_FLOAT64
4377
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
4378
+ HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
4379
+ HWY_INLINE V IntDiv(V a, V b) {
4380
+ const DFromV<decltype(a)> d;
4381
+ const Rebind<double, decltype(d)> df64;
2696
4382
 
2697
- // ------------------------------ 8-bit multiplication
4383
+ return DemoteTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
4384
+ }
4385
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
4386
+ HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
4387
+ HWY_INLINE V IntDiv(V a, V b) {
4388
+ const DFromV<decltype(a)> d;
4389
+ const Half<decltype(d)> dh;
4390
+ const Repartition<double, decltype(d)> df64;
2698
4391
 
2699
- // "Include guard": skip if native 8-bit mul instructions are available.
2700
- #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
2701
- #ifdef HWY_NATIVE_MUL_8
2702
- #undef HWY_NATIVE_MUL_8
2703
- #else
2704
- #define HWY_NATIVE_MUL_8
2705
- #endif
4392
+ return Combine(
4393
+ d, DemoteTo(dh, Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b))),
4394
+ DemoteTo(dh, Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b))));
4395
+ }
4396
+ #endif // HWY_HAVE_FLOAT64
2706
4397
 
2707
- // 8 bit and fits in wider reg: promote
2708
- template <class V, HWY_IF_T_SIZE_V(V, 1),
4398
+ template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4399
+ HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
4400
+ HWY_TARGET == HWY_WASM ||
4401
+ HWY_TARGET == HWY_WASM_EMU256)
4402
+ ? 0
4403
+ : (1 << 1)) |
4404
+ (1 << 2) | (1 << 4) | (1 << 8))>
4405
+ HWY_INLINE V IntMod(V a, V b) {
4406
+ return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
4407
+ }
4408
+
4409
+ #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
4410
+ HWY_TARGET == HWY_WASM_EMU256
4411
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
2709
4412
  HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
2710
- HWY_API V operator*(const V a, const V b) {
4413
+ HWY_INLINE V IntMod(V a, V b) {
2711
4414
  const DFromV<decltype(a)> d;
2712
4415
  const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
2713
- const RebindToUnsigned<decltype(d)> du; // TruncateTo result
2714
- const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input
2715
- const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
2716
- // TruncateTo is cheaper than ConcatEven.
2717
- return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
4416
+ return DemoteTo(d, IntMod<kOrigLaneSize>(PromoteTo(dw, a), PromoteTo(dw, b)));
2718
4417
  }
2719
4418
 
2720
- // 8 bit full reg: promote halves
2721
- template <class V, HWY_IF_T_SIZE_V(V, 1),
4419
+ template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
2722
4420
  HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
2723
- HWY_API V operator*(const V a, const V b) {
4421
+ HWY_INLINE V IntMod(V a, V b) {
2724
4422
  const DFromV<decltype(a)> d;
2725
- const Half<decltype(d)> dh;
2726
- const Twice<RepartitionToWide<decltype(dh)>> dw;
2727
- const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
2728
- const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
2729
- const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
2730
- const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
2731
- const VFromD<decltype(dw)> m0 = a0 * b0;
2732
- const VFromD<decltype(dw)> m1 = a1 * b1;
2733
- return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
4423
+ const RepartitionToWide<decltype(d)> dw;
4424
+ return OrderedDemote2To(
4425
+ d, IntMod<kOrigLaneSize>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)),
4426
+ IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
2734
4427
  }
4428
+ #endif // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
4429
+ // HWY_WASM_EMU256
2735
4430
 
2736
- #endif // HWY_NATIVE_MUL_8
2737
-
2738
- // ------------------------------ 64-bit multiplication
2739
-
2740
- // "Include guard": skip if native 64-bit mul instructions are available.
2741
- #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
2742
- #ifdef HWY_NATIVE_MUL_64
2743
- #undef HWY_NATIVE_MUL_64
2744
- #else
2745
- #define HWY_NATIVE_MUL_64
2746
- #endif
4431
+ } // namespace detail
2747
4432
 
2748
- // Single-lane i64 or u64
2749
- template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
2750
- HWY_IF_NOT_FLOAT_V(V)>
2751
- HWY_API V operator*(V x, V y) {
2752
- const DFromV<V> d;
2753
- using T = TFromD<decltype(d)>;
2754
- using TU = MakeUnsigned<T>;
2755
- const TU xu = static_cast<TU>(GetLane(x));
2756
- const TU yu = static_cast<TU>(GetLane(y));
2757
- return Set(d, static_cast<T>(xu * yu));
2758
- }
4433
+ #if HWY_TARGET == HWY_SCALAR
2759
4434
 
2760
- template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
2761
- HWY_IF_V_SIZE_GT_D(D64, 8)>
2762
- HWY_API V operator*(V x, V y) {
2763
- RepartitionToNarrow<D64> d32;
2764
- auto x32 = BitCast(d32, x);
2765
- auto y32 = BitCast(d32, y);
2766
- auto lolo = BitCast(d32, MulEven(x32, y32));
2767
- auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
2768
- auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
2769
- auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
2770
- return BitCast(D64{}, lolo + hi);
4435
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4436
+ HWY_API Vec1<T> operator/(Vec1<T> a, Vec1<T> b) {
4437
+ return detail::IntDiv<sizeof(T)>(a, b);
2771
4438
  }
2772
- template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
2773
- HWY_IF_V_SIZE_GT_D(DI64, 8)>
2774
- HWY_API V operator*(V x, V y) {
2775
- RebindToUnsigned<DI64> du64;
2776
- return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
4439
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4440
+ HWY_API Vec1<T> operator%(Vec1<T> a, Vec1<T> b) {
4441
+ return detail::IntMod<sizeof(T)>(a, b);
2777
4442
  }
2778
4443
 
2779
- #endif // HWY_NATIVE_MUL_64
4444
+ #else // HWY_TARGET != HWY_SCALAR
2780
4445
 
2781
- // ------------------------------ MulAdd / NegMulAdd
4446
+ template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4447
+ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
4448
+ return detail::IntDiv<sizeof(T)>(a, b);
4449
+ }
2782
4450
 
2783
- // "Include guard": skip if native int MulAdd instructions are available.
2784
- #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
2785
- #ifdef HWY_NATIVE_INT_FMA
2786
- #undef HWY_NATIVE_INT_FMA
2787
- #else
2788
- #define HWY_NATIVE_INT_FMA
2789
- #endif
4451
+ template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4452
+ HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
4453
+ return detail::IntMod<sizeof(T)>(a, b);
4454
+ }
2790
4455
 
2791
- template <class V, HWY_IF_NOT_FLOAT_V(V)>
2792
- HWY_API V MulAdd(V mul, V x, V add) {
2793
- return Add(Mul(mul, x), add);
4456
+ #if HWY_CAP_GE256
4457
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4458
+ HWY_API Vec256<T> operator/(Vec256<T> a, Vec256<T> b) {
4459
+ return detail::IntDiv<sizeof(T)>(a, b);
2794
4460
  }
4461
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4462
+ HWY_API Vec256<T> operator%(Vec256<T> a, Vec256<T> b) {
4463
+ return detail::IntMod<sizeof(T)>(a, b);
4464
+ }
4465
+ #endif
2795
4466
 
2796
- template <class V, HWY_IF_NOT_FLOAT_V(V)>
2797
- HWY_API V NegMulAdd(V mul, V x, V add) {
2798
- return Sub(add, Mul(mul, x));
4467
+ #if HWY_CAP_GE512
4468
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4469
+ HWY_API Vec512<T> operator/(Vec512<T> a, Vec512<T> b) {
4470
+ return detail::IntDiv<sizeof(T)>(a, b);
2799
4471
  }
4472
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
4473
+ HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
4474
+ return detail::IntMod<sizeof(T)>(a, b);
4475
+ }
4476
+ #endif
2800
4477
 
2801
- #endif // HWY_NATIVE_INT_FMA
4478
+ #endif // HWY_TARGET == HWY_SCALAR
4479
+
4480
+ #endif // HWY_NATIVE_INT_DIV
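With these definitions, integer vectors gain element-wise division and modulo on every target (through the Div/Mod wrappers where operators cannot be defined). Below is a hypothetical usage sketch under static dispatch; the kernel name, the assumption that n is a multiple of the vector length, and the requirement that all b[i] are nonzero are mine, and it presumes the Div/Mod wrappers shown in this diff are available on the chosen target.

```cpp
#include <cstddef>
#include <cstdint>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// q[i] = a[i] / b[i] (truncated), r[i] = a[i] % b[i] (sign of the dividend).
void VectorDivMod(const int32_t* HWY_RESTRICT a, const int32_t* HWY_RESTRICT b,
                  int32_t* HWY_RESTRICT q, int32_t* HWY_RESTRICT r, size_t n) {
  const hn::ScalableTag<int32_t> d;
  // n is assumed to be a multiple of Lanes(d) for brevity.
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto va = hn::Load(d, a + i);
    const auto vb = hn::Load(d, b + i);
    hn::Store(hn::Div(va, vb), d, q + i);  // same as va / vb where operators exist
    hn::Store(hn::Mod(va, vb), d, r + i);
  }
}
```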
2802
4481
 
2803
4482
  // ------------------------------ SatWidenMulPairwiseAdd
2804
4483
 
@@ -2819,11 +4498,11 @@ template <class DI16, class VU8, class VI8,
2819
4498
  HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
2820
4499
  const RebindToUnsigned<decltype(di16)> du16;
2821
4500
 
2822
- const auto a0 = And(BitCast(di16, a), Set(di16, int16_t{0x00FF}));
2823
- const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b)));
4501
+ const auto a0 = BitCast(di16, PromoteEvenTo(du16, a));
4502
+ const auto b0 = PromoteEvenTo(di16, b);
2824
4503
 
2825
- const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
2826
- const auto b1 = ShiftRight<8>(BitCast(di16, b));
4504
+ const auto a1 = BitCast(di16, PromoteOddTo(du16, a));
4505
+ const auto b1 = PromoteOddTo(di16, b);
2827
4506
 
2828
4507
  return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
2829
4508
  }
@@ -2848,11 +4527,11 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
2848
4527
  VFromD<DI32> sum) {
2849
4528
  const Repartition<int16_t, decltype(di32)> di16;
2850
4529
 
2851
- const auto a0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, a)));
2852
- const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b)));
4530
+ const auto a0 = PromoteEvenTo(di16, a);
4531
+ const auto b0 = PromoteEvenTo(di16, b);
2853
4532
 
2854
- const auto a1 = ShiftRight<8>(BitCast(di16, a));
2855
- const auto b1 = ShiftRight<8>(BitCast(di16, b));
4533
+ const auto a1 = PromoteOddTo(di16, a);
4534
+ const auto b1 = PromoteOddTo(di16, b);
2856
4535
 
2857
4536
  return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
2858
4537
  WidenMulPairwiseAdd(di32, a1, b1)));
@@ -2985,12 +4664,10 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(
2985
4664
  const auto u32_even_prod = MulEven(a, b);
2986
4665
  const auto u32_odd_prod = MulOdd(a, b);
2987
4666
 
2988
- const auto lo32_mask = Set(du64, uint64_t{0xFFFFFFFFu});
2989
-
2990
- const auto p0 = Add(And(BitCast(du64, u32_even_prod), lo32_mask),
2991
- And(BitCast(du64, u32_odd_prod), lo32_mask));
2992
- const auto p1 = Add(ShiftRight<32>(BitCast(du64, u32_even_prod)),
2993
- ShiftRight<32>(BitCast(du64, u32_odd_prod)));
4667
+ const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod),
4668
+ PromoteEvenTo(du64, u32_odd_prod));
4669
+ const auto p1 =
4670
+ Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod));
2994
4671
 
2995
4672
  return Add(sum, Add(p0, p1));
2996
4673
  }
@@ -3043,7 +4720,6 @@ HWY_API V ApproximateReciprocalSqrt(V v) {
3043
4720
 
3044
4721
  // ------------------------------ Compress*
3045
4722
 
3046
- // "Include guard": skip if native 8-bit compress instructions are available.
3047
4723
  #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
3048
4724
  #ifdef HWY_NATIVE_COMPRESS8
3049
4725
  #undef HWY_NATIVE_COMPRESS8
@@ -3244,7 +4920,6 @@ HWY_API V CompressNot(V v, M mask) {
3244
4920
 
3245
4921
  // ------------------------------ Expand
3246
4922
 
3247
- // "Include guard": skip if native 8/16-bit Expand/LoadExpand are available.
3248
4923
  // Note that this generic implementation assumes <= 128 bit fixed vectors;
3249
4924
  // the SVE and RVV targets provide their own native implementations.
3250
4925
  #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
@@ -3853,7 +5528,9 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
3853
5528
  BitCast(du, InterleaveLower(du8x2, indices8, indices8));
3854
5529
  // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte
3855
5530
  // indices, add 0 to even and 1 to odd byte lanes.
3856
- const Vec128<uint16_t, N> byte_indices = Add(indices16, Set(du, 0x0100));
5531
+ const Vec128<uint16_t, N> byte_indices = Add(
5532
+ indices16,
5533
+ Set(du, static_cast<uint16_t>(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001)));
3857
5534
  return BitCast(d, TableLookupBytesOr0(v, byte_indices));
3858
5535
  }
3859
5536
 
@@ -3947,9 +5624,9 @@ HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
3947
5624
  const Repartition<uint16_t, decltype(d)> du16;
3948
5625
  return BitCast(d, RotateRight<8>(BitCast(du16, v)));
3949
5626
  #else
3950
- alignas(16) static constexpr TFromD<D> kShuffle[16] = {
3951
- 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
3952
- return TableLookupBytes(v, LoadDup128(d, kShuffle));
5627
+ const VFromD<D> shuffle = Dup128VecFromValues(d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
5628
+ 11, 10, 13, 12, 15, 14);
5629
+ return TableLookupBytes(v, shuffle);
3953
5630
  #endif
3954
5631
  }
3955
5632
 
@@ -3959,10 +5636,10 @@ HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
3959
5636
  const Repartition<uint16_t, decltype(d)> du16;
3960
5637
  return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v))));
3961
5638
  #else
3962
- alignas(16) static constexpr uint8_t kShuffle[16] = {
3963
- 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
3964
5639
  const Repartition<uint8_t, decltype(d)> du8;
3965
- return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle)));
5640
+ const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
5641
+ du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
5642
+ return TableLookupBytes(v, BitCast(d, shuffle));
3966
5643
  #endif
3967
5644
  }
3968
5645
 
@@ -3972,10 +5649,10 @@ HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
3972
5649
  const Repartition<uint32_t, D> du32;
3973
5650
  return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v))));
3974
5651
  #else
3975
- alignas(16) static constexpr uint8_t kShuffle[16] = {
3976
- 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
3977
5652
  const Repartition<uint8_t, decltype(d)> du8;
3978
- return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle)));
5653
+ const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
5654
+ du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
5655
+ return TableLookupBytes(v, BitCast(d, shuffle));
3979
5656
  #endif
3980
5657
  }
3981
5658
 
@@ -4111,8 +5788,6 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
4111
5788
  const uint32_t x2,
4112
5789
  const uint32_t x1,
4113
5790
  const uint32_t x0) {
4114
- alignas(16) const uint32_t lanes[4] = {x0, x1, x2, x3};
4115
-
4116
5791
  #if HWY_TARGET == HWY_RVV
4117
5792
  constexpr int kPow2 = d.Pow2();
4118
5793
  constexpr int kLoadPow2 = HWY_MAX(kPow2, -1);
@@ -4128,8 +5803,7 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
4128
5803
  HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad);
4129
5804
  const CappedTag<uint32_t, kNumToLoad> d_load;
4130
5805
  #endif
4131
-
4132
- return ResizeBitCast(d, LoadDup128(d_load, lanes));
5806
+ return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3));
4133
5807
  }
4134
5808
 
4135
5809
  } // namespace detail
@@ -4291,10 +5965,6 @@ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
4291
5965
  const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
4292
5966
  const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
4293
5967
  const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
4294
- alignas(16)
4295
- const uint16_t indices[8] = {u16_idx0, u16_idx1, u16_idx2, u16_idx3,
4296
- u16_idx0, u16_idx1, u16_idx2, u16_idx3};
4297
-
4298
5968
  #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
4299
5969
  constexpr size_t kMinLanesToLoad = 4;
4300
5970
  #else
@@ -4302,8 +5972,9 @@ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
4302
5972
  #endif
4303
5973
  constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad);
4304
5974
  const CappedTag<uint16_t, kNumToLoad> d_load;
4305
-
4306
- return ResizeBitCast(d, LoadDup128(d_load, indices));
5975
+ return ResizeBitCast(
5976
+ d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3,
5977
+ u16_idx0, u16_idx1, u16_idx2, u16_idx3));
4307
5978
  }
4308
5979
 
4309
5980
  template <class D, HWY_IF_T_SIZE_D(D, 4)>
@@ -4672,6 +6343,202 @@ HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
4672
6343
  }
4673
6344
  #endif
4674
6345
 
6346
+ // ------------------------------ SumsOfAdjQuadAbsDiff
6347
+
6348
+ #if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
6349
+ defined(HWY_TARGET_TOGGLE))
6350
+ #ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
6351
+ #undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
6352
+ #else
6353
+ #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
6354
+ #endif
6355
+
6356
+ #if HWY_TARGET != HWY_SCALAR
6357
+ template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
6358
+ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
6359
+ static_assert(0 <= kAOffset && kAOffset <= 1,
6360
+ "kAOffset must be between 0 and 1");
6361
+ static_assert(0 <= kBOffset && kBOffset <= 3,
6362
+ "kBOffset must be between 0 and 3");
6363
+ using D8 = DFromV<V8>;
6364
+ const D8 d8;
6365
+ const RebindToUnsigned<decltype(d8)> du8;
6366
+ const RepartitionToWide<decltype(d8)> d16;
6367
+ const RepartitionToWide<decltype(du8)> du16;
6368
+
6369
+ // Ensure that a is resized to a vector that has at least
6370
+ // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
6371
+ // CombineShiftRightBytes operations below.
6372
+ #if HWY_TARGET == HWY_RVV
6373
+ // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true
6374
+ // to ensure that Lanes(d8_interleave) >= 16 is true.
6375
+
6376
+ // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
6377
+ // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
6378
+ constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
6379
+ const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
6380
+ #elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
6381
+ HWY_TARGET == HWY_SVE2_128
6382
+ // On SVE targets, Lanes(d8_interleave) >= 16 and
6383
+ // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
6384
+ // tag for a full u8/i8 vector on SVE.
6385
+ const D8 d8_interleave;
6386
+ #else
6387
+ // On targets that use non-scalable vector types, Lanes(d8_interleave) is
6388
+ // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
6389
+ constexpr size_t kInterleaveLanes =
6390
+ HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
6391
+ const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
6392
+ #endif
6393
+
6394
+ // The ResizeBitCast operation below will resize a to a vector that has
6395
+ // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
6396
+ // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
6397
+ // below.
6398
+ const auto a_to_interleave = ResizeBitCast(d8_interleave, a);
6399
+
6400
+ const auto a_interleaved_lo =
6401
+ InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
6402
+ const auto a_interleaved_hi =
6403
+ InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);
6404
+
6405
+ /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
6406
+ a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
6407
+ a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
6408
+ a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
6409
+ */
6410
+ /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
6411
+ a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
6412
+ a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
6413
+ a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10]
6414
+ } */
6415
+
6416
+ // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
6417
+ // the CombineShiftRightBytes are needed for the subsequent AbsDiff operations
6418
+ // and as a01 and a23 need to be the same vector type as b01 and b23 for the
6419
+ // AbsDiff operations below.
6420
+ const V8 a01 =
6421
+ ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
6422
+ d8_interleave, a_interleaved_hi, a_interleaved_lo));
6423
+ const V8 a23 =
6424
+ ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
6425
+ d8_interleave, a_interleaved_hi, a_interleaved_lo));
6426
+
6427
+ /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
6428
+ b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
6429
+ b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
6430
+ b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
6431
+ */
6432
+ /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
6433
+ b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
6434
+ b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
6435
+ b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
6436
+ */
6437
+ const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
6438
+ const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));
6439
+
6440
+ const VFromD<decltype(du16)> absdiff_sum_01 =
6441
+ SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
6442
+ const VFromD<decltype(du16)> absdiff_sum_23 =
6443
+ SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
6444
+ return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
6445
+ }
6446
+ #endif // HWY_TARGET != HWY_SCALAR
6447
+
6448
+ #endif // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
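Read together, the a01/a23 and b01/b23 comments say that 16-bit result lane i of each block sums four absolute differences between a sliding quad of a (starting at kAOffset*4 + i) and a fixed quad of b (starting at kBOffset*4). The following scalar reference for one 16-byte block is my illustration of that reading, not library code:

```cpp
#include <cstdint>

// out[i] = sum over j = 0..3 of |a[kAOffset*4 + i + j] - b[kBOffset*4 + j]|
template <int kAOffset, int kBOffset>
void SumsOfAdjQuadAbsDiffBlock(const uint8_t a[16], const uint8_t b[16],
                               uint16_t out[8]) {
  static_assert(0 <= kAOffset && kAOffset <= 1, "kAOffset must be 0..1");
  static_assert(0 <= kBOffset && kBOffset <= 3, "kBOffset must be 0..3");
  for (int i = 0; i < 8; ++i) {
    uint16_t sum = 0;
    for (int j = 0; j < 4; ++j) {
      const int diff = static_cast<int>(a[kAOffset * 4 + i + j]) -
                       static_cast<int>(b[kBOffset * 4 + j]);
      sum = static_cast<uint16_t>(sum + (diff < 0 ? -diff : diff));
    }
    out[i] = sum;
  }
}
```

This is the per-block behavior of the x86 mpsadbw family, which the generic path here presumably emulates.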
6449
+
6450
+ // ------------------------------ SumsOfShuffledQuadAbsDiff
6451
+
6452
+ #if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
6453
+ defined(HWY_TARGET_TOGGLE))
6454
+ #ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
6455
+ #undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
6456
+ #else
6457
+ #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
6458
+ #endif
6459
+
6460
+ #if HWY_TARGET != HWY_SCALAR
6461
+ template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
6462
+ HWY_IF_UI8_D(DFromV<V8>)>
6463
+ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
6464
+ V8 b) {
6465
+ static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
6466
+ static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
6467
+ static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
6468
+ static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
6469
+
6470
+ #if HWY_TARGET == HWY_RVV
6471
+ // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that
6472
+ // both vA and vB can be bitcasted to a u32 vector.
6473
+ const detail::AdjustSimdTagToMinVecPow2<
6474
+ RepartitionToWideX2<DFromV<decltype(a)>>>
6475
+ d32;
6476
+ const RepartitionToNarrow<decltype(d32)> d16;
6477
+ const RepartitionToNarrow<decltype(d16)> d8;
6478
+
6479
+ const auto vA = ResizeBitCast(d8, a);
6480
+ const auto vB = ResizeBitCast(d8, b);
6481
+ #else
6482
+ const DFromV<decltype(a)> d8;
6483
+ const RepartitionToWide<decltype(d8)> d16;
6484
+ const RepartitionToWide<decltype(d16)> d32;
6485
+
6486
+ const auto vA = a;
6487
+ const auto vB = b;
6488
+ #endif
6489
+
6490
+ const RebindToUnsigned<decltype(d8)> du8;
6491
+
6492
+ const auto a_shuf =
6493
+ Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));
6494
+ /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
6495
+ a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
6496
+ a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
6497
+ a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
6498
+ /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
6499
+ a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
6500
+ a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
6501
+ a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
6502
+ #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
6503
+ // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
6504
+ // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
6505
+ // lanes that are shifted into an adjacent 16-byte block as any lanes that are
6506
+ // shifted into an adjacent 16-byte block by Slide1Up/Slide1Down will be
6507
+ // replaced by the OddEven operation.
6508
+ const auto a_0123_2345 = BitCast(
6509
+ d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
6510
+ const auto a_1234_3456 =
6511
+ BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
6512
+ BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
6513
+ #else
6514
+ const auto a_0123_2345 =
6515
+ BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
6516
+ const auto a_1234_3456 = BitCast(
6517
+ d8,
6518
+ OddEven(ShiftLeftBytes<1>(d32, a_shuf), ShiftRightBytes<1>(d32, a_shuf)));
6519
+ #endif
6520
+
6521
+ auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
6522
+ auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));
6523
+
6524
+ #if HWY_IS_LITTLE_ENDIAN
6525
+ odd_sums = ShiftLeft<16>(odd_sums);
6526
+ #else
6527
+ even_sums = ShiftLeft<16>(even_sums);
6528
+ #endif
6529
+
6530
+ const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));
6531
+
6532
+ #if HWY_TARGET == HWY_RVV
6533
+ return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
6534
+ #else
6535
+ return sums;
6536
+ #endif
6537
+ }
6538
+ #endif // HWY_TARGET != HWY_SCALAR
6539
+
6540
+ #endif // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
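Likewise, the a_0123_2345/a_1234_3456 comments describe which quads of the shuffled a are compared against each quad of b: the even 16-bit result lanes use quads starting at bytes 0, 2, 8, 10 of the shuffled block and the odd lanes use 1, 3, 9, 11. A scalar reference for one 16-byte block, again derived from those comments as an illustration rather than taken from the library:

```cpp
#include <cstdint>

template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
void SumsOfShuffledQuadAbsDiffBlock(const uint8_t a[16], const uint8_t b[16],
                                    uint16_t out[8]) {
  // Per4LaneBlockShuffle of the four 32-bit lanes of a.
  const int idx[4] = {kIdx0, kIdx1, kIdx2, kIdx3};
  uint8_t s[16];
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) s[4 * i + j] = a[4 * idx[i] + j];
  }
  const int even_start[4] = {0, 2, 8, 10};  // quads feeding even result lanes
  const int odd_start[4] = {1, 3, 9, 11};   // quads feeding odd result lanes
  for (int m = 0; m < 4; ++m) {
    uint16_t even_sum = 0, odd_sum = 0;
    for (int j = 0; j < 4; ++j) {
      const int bj = static_cast<int>(b[4 * m + j]);
      const int e = static_cast<int>(s[even_start[m] + j]) - bj;
      const int o = static_cast<int>(s[odd_start[m] + j]) - bj;
      even_sum = static_cast<uint16_t>(even_sum + (e < 0 ? -e : e));
      odd_sum = static_cast<uint16_t>(odd_sum + (o < 0 ? -o : o));
    }
    out[2 * m] = even_sum;
    out[2 * m + 1] = odd_sum;
  }
}
```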
6541
+
4675
6542
  // ================================================== Operator wrapper
4676
6543
 
4677
6544
  // SVE* and RVV currently cannot define operators and have already defined
@@ -4700,6 +6567,10 @@ template <class V>
4700
6567
  HWY_API V Div(V a, V b) {
4701
6568
  return a / b;
4702
6569
  }
6570
+ template <class V>
6571
+ HWY_API V Mod(V a, V b) {
6572
+ return a % b;
6573
+ }
4703
6574
 
4704
6575
  template <class V>
4705
6576
  V Shl(V a, V b) {