@img/sharp-libvips-dev 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230)
  1. package/include/aom/aom_encoder.h +3 -3
  2. package/include/aom/aomcx.h +17 -8
  3. package/include/expat.h +21 -10
  4. package/include/expat_config.h +11 -5
  5. package/include/ffi.h +12 -25
  6. package/include/fontconfig/fontconfig.h +5 -3
  7. package/include/freetype2/freetype/config/ftoption.h +1 -1
  8. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
  9. package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
  10. package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
  11. package/include/glib-2.0/gio/gappinfo.h +0 -7
  12. package/include/glib-2.0/gio/gapplication.h +6 -0
  13. package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
  14. package/include/glib-2.0/gio/gasyncinitable.h +0 -7
  15. package/include/glib-2.0/gio/gasyncresult.h +0 -6
  16. package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
  17. package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
  18. package/include/glib-2.0/gio/gbytesicon.h +0 -5
  19. package/include/glib-2.0/gio/gcancellable.h +0 -5
  20. package/include/glib-2.0/gio/gconverter.h +0 -7
  21. package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
  22. package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
  23. package/include/glib-2.0/gio/gdatagrambased.h +0 -7
  24. package/include/glib-2.0/gio/gdatainputstream.h +0 -6
  25. package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
  26. package/include/glib-2.0/gio/gdbusinterface.h +0 -8
  27. package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
  28. package/include/glib-2.0/gio/gdbusmessage.h +2 -1
  29. package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
  30. package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
  31. package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
  32. package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
  33. package/include/glib-2.0/gio/gdbusproxy.h +0 -8
  34. package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
  35. package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
  36. package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
  37. package/include/glib-2.0/gio/gemblem.h +0 -5
  38. package/include/glib-2.0/gio/gemblemedicon.h +0 -5
  39. package/include/glib-2.0/gio/gfile.h +0 -10
  40. package/include/glib-2.0/gio/gfileenumerator.h +0 -5
  41. package/include/glib-2.0/gio/gfileicon.h +0 -5
  42. package/include/glib-2.0/gio/gfileinfo.h +0 -5
  43. package/include/glib-2.0/gio/gfileinputstream.h +0 -8
  44. package/include/glib-2.0/gio/gfileiostream.h +0 -8
  45. package/include/glib-2.0/gio/gfilemonitor.h +0 -5
  46. package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
  47. package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
  48. package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
  49. package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
  50. package/include/glib-2.0/gio/gicon.h +0 -5
  51. package/include/glib-2.0/gio/ginitable.h +0 -7
  52. package/include/glib-2.0/gio/ginputstream.h +0 -5
  53. package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
  54. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  55. package/include/glib-2.0/gio/gioenums.h +6 -1
  56. package/include/glib-2.0/gio/giomodule.h +0 -5
  57. package/include/glib-2.0/gio/giostream.h +0 -5
  58. package/include/glib-2.0/gio/giotypes.h +5 -108
  59. package/include/glib-2.0/gio/gloadableicon.h +0 -6
  60. package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
  61. package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
  62. package/include/glib-2.0/gio/gmountoperation.h +0 -6
  63. package/include/glib-2.0/gio/gnetworking.h +4 -0
  64. package/include/glib-2.0/gio/goutputstream.h +0 -9
  65. package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
  66. package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
  67. package/include/glib-2.0/gio/gproxy.h +0 -7
  68. package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
  69. package/include/glib-2.0/gio/gseekable.h +0 -5
  70. package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
  71. package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
  72. package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
  73. package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
  74. package/include/glib-2.0/gio/gsocket.h +13 -0
  75. package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
  76. package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
  77. package/include/glib-2.0/gio/gtask.h +12 -0
  78. package/include/glib-2.0/gio/gthemedicon.h +0 -5
  79. package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
  80. package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
  81. package/include/glib-2.0/gio/gvfs.h +0 -5
  82. package/include/glib-2.0/gio/gvolume.h +2 -2
  83. package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
  84. package/include/glib-2.0/girepository/gi-visibility.h +986 -0
  85. package/include/glib-2.0/girepository/giarginfo.h +100 -0
  86. package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
  87. package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
  88. package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
  89. package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
  90. package/include/glib-2.0/girepository/gienuminfo.h +82 -0
  91. package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
  92. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  93. package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
  94. package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
  95. package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
  96. package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
  97. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
  98. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  99. package/include/glib-2.0/girepository/girepository.h +247 -0
  100. package/include/glib-2.0/girepository/girffi.h +129 -0
  101. package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
  102. package/include/glib-2.0/girepository/gistructinfo.h +102 -0
  103. package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
  104. package/include/glib-2.0/girepository/gitypelib.h +61 -0
  105. package/include/glib-2.0/girepository/gitypes.h +421 -0
  106. package/include/glib-2.0/girepository/giunioninfo.h +105 -0
  107. package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
  108. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  109. package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
  110. package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
  111. package/include/glib-2.0/glib/deprecated/grel.h +0 -23
  112. package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
  113. package/include/glib-2.0/glib/gatomic.h +20 -20
  114. package/include/glib-2.0/glib/gbitlock.h +31 -0
  115. package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
  116. package/include/glib-2.0/glib/gchecksum.h +0 -10
  117. package/include/glib-2.0/glib/gdate.h +0 -9
  118. package/include/glib-2.0/glib/gdatetime.h +33 -1
  119. package/include/glib-2.0/glib/gdir.h +5 -0
  120. package/include/glib-2.0/glib/ghmac.h +0 -9
  121. package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
  122. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  123. package/include/glib-2.0/glib/gmacros.h +1 -0
  124. package/include/glib-2.0/glib/gmessages.h +11 -0
  125. package/include/glib-2.0/glib/gpathbuf.h +0 -7
  126. package/include/glib-2.0/glib/gslice.h +2 -0
  127. package/include/glib-2.0/glib/gstdio.h +1 -1
  128. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  129. package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
  130. package/include/glib-2.0/glib/gtestutils.h +5 -0
  131. package/include/glib-2.0/glib/gthread.h +216 -3
  132. package/include/glib-2.0/glib/gunicode.h +12 -2
  133. package/include/glib-2.0/glib/gvarianttype.h +1 -10
  134. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  135. package/include/glib-2.0/glib/gwin32.h +4 -4
  136. package/include/glib-2.0/glib-unix.h +214 -0
  137. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  138. package/include/glib-2.0/gobject/gbinding.h +0 -8
  139. package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
  140. package/include/glib-2.0/gobject/gclosure.h +1 -9
  141. package/include/glib-2.0/gobject/genums.h +6 -6
  142. package/include/glib-2.0/gobject/glib-types.h +44 -0
  143. package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
  144. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  145. package/include/glib-2.0/gobject/gobject.h +1 -16
  146. package/include/glib-2.0/gobject/gparam.h +3 -12
  147. package/include/glib-2.0/gobject/gsignal.h +16 -6
  148. package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
  149. package/include/glib-2.0/gobject/gtype.h +53 -20
  150. package/include/glib-2.0/gobject/gtypemodule.h +0 -7
  151. package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
  152. package/include/glib-2.0/gobject/gvaluearray.h +0 -7
  153. package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
  154. package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
  155. package/include/hwy/aligned_allocator.h +171 -6
  156. package/include/hwy/base.h +1765 -543
  157. package/include/hwy/cache_control.h +24 -6
  158. package/include/hwy/detect_compiler_arch.h +23 -2
  159. package/include/hwy/detect_targets.h +56 -13
  160. package/include/hwy/foreach_target.h +24 -0
  161. package/include/hwy/highway.h +20 -3
  162. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  163. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  164. package/include/hwy/ops/emu128-inl.h +271 -196
  165. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  166. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  167. package/include/hwy/ops/rvv-inl.h +1043 -311
  168. package/include/hwy/ops/scalar-inl.h +189 -159
  169. package/include/hwy/ops/set_macros-inl.h +66 -6
  170. package/include/hwy/ops/shared-inl.h +175 -56
  171. package/include/hwy/ops/wasm_128-inl.h +153 -136
  172. package/include/hwy/ops/x86_128-inl.h +1647 -646
  173. package/include/hwy/ops/x86_256-inl.h +1003 -370
  174. package/include/hwy/ops/x86_512-inl.h +948 -353
  175. package/include/hwy/per_target.h +4 -0
  176. package/include/hwy/profiler.h +648 -0
  177. package/include/hwy/robust_statistics.h +2 -2
  178. package/include/hwy/targets.h +18 -11
  179. package/include/hwy/timer.h +11 -0
  180. package/include/lcms2.h +46 -7
  181. package/include/lcms2_plugin.h +4 -4
  182. package/include/libheif/heif_version.h +2 -2
  183. package/include/libpng16/png.h +32 -29
  184. package/include/libpng16/pngconf.h +2 -2
  185. package/include/libpng16/pnglibconf.h +7 -2
  186. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  187. package/include/libxml2/libxml/HTMLparser.h +23 -0
  188. package/include/libxml2/libxml/SAX.h +0 -2
  189. package/include/libxml2/libxml/SAX2.h +0 -2
  190. package/include/libxml2/libxml/c14n.h +0 -2
  191. package/include/libxml2/libxml/dict.h +1 -0
  192. package/include/libxml2/libxml/encoding.h +16 -14
  193. package/include/libxml2/libxml/entities.h +4 -0
  194. package/include/libxml2/libxml/globals.h +15 -503
  195. package/include/libxml2/libxml/hash.h +57 -61
  196. package/include/libxml2/libxml/nanoftp.h +2 -2
  197. package/include/libxml2/libxml/parser.h +137 -18
  198. package/include/libxml2/libxml/parserInternals.h +1 -0
  199. package/include/libxml2/libxml/relaxng.h +2 -1
  200. package/include/libxml2/libxml/schemasInternals.h +1 -0
  201. package/include/libxml2/libxml/schematron.h +1 -0
  202. package/include/libxml2/libxml/threads.h +4 -11
  203. package/include/libxml2/libxml/tree.h +68 -20
  204. package/include/libxml2/libxml/uri.h +2 -1
  205. package/include/libxml2/libxml/valid.h +2 -0
  206. package/include/libxml2/libxml/xmlIO.h +65 -13
  207. package/include/libxml2/libxml/xmlerror.h +37 -8
  208. package/include/libxml2/libxml/xmlmemory.h +37 -40
  209. package/include/libxml2/libxml/xmlreader.h +6 -0
  210. package/include/libxml2/libxml/xmlregexp.h +2 -9
  211. package/include/libxml2/libxml/xmlsave.h +9 -0
  212. package/include/libxml2/libxml/xmlschemas.h +3 -0
  213. package/include/libxml2/libxml/xmlversion.h +28 -43
  214. package/include/libxml2/libxml/xpath.h +1 -1
  215. package/include/libxml2/libxml/xpathInternals.h +2 -1
  216. package/include/libxml2/libxml/xpointer.h +5 -4
  217. package/include/pango-1.0/pango/pango-features.h +3 -3
  218. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  219. package/include/pixman-1/pixman-version.h +3 -3
  220. package/include/pixman-1/pixman.h +9 -2
  221. package/include/png.h +32 -29
  222. package/include/pngconf.h +2 -2
  223. package/include/pnglibconf.h +7 -2
  224. package/include/vips/connection.h +9 -3
  225. package/include/vips/util.h +0 -9
  226. package/include/vips/version.h +4 -4
  227. package/include/zconf.h +3 -0
  228. package/include/zlib.h +3 -3
  229. package/package.json +1 -1
  230. package/versions.json +15 -15
@@ -101,6 +101,9 @@ class Vec256 {
   HWY_INLINE Vec256& operator-=(const Vec256 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec256& operator%=(const Vec256 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec256& operator&=(const Vec256 other) {
     return *this = (*this & other);
   }
@@ -359,6 +362,85 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
       ResizeBitCast(Full128<uint8_t>(), v).raw)});
 }

+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+  return VFromD<D>{_mm256_setr_epi8(
+      static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
+      static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
+      static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
+      static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
+      static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
+      static_cast<char>(t15), static_cast<char>(t0), static_cast<char>(t1),
+      static_cast<char>(t2), static_cast<char>(t3), static_cast<char>(t4),
+      static_cast<char>(t5), static_cast<char>(t6), static_cast<char>(t7),
+      static_cast<char>(t8), static_cast<char>(t9), static_cast<char>(t10),
+      static_cast<char>(t11), static_cast<char>(t12), static_cast<char>(t13),
+      static_cast<char>(t14), static_cast<char>(t15))};
+}
+
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return VFromD<D>{
+      _mm256_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+                        static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+                        static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+                        static_cast<int16_t>(t6), static_cast<int16_t>(t7),
+                        static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+                        static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+                        static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+                        static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
+}
+
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return VFromD<D>{_mm256_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
+                                  t3, t4, t5, t6, t7)};
+}
+#endif
+
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{
+      _mm256_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{_mm256_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3)};
+}
+
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{
+      _mm256_setr_epi64x(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                         static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
+}
+
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{_mm256_setr_pd(t0, t1, t0, t1)};
+}
+
 // ================================================== LOGICAL

 // ------------------------------ And
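Note: the new Dup128VecFromValues op fills one 128-bit block with the given lane values and repeats that block across the vector. A minimal usage sketch (not part of the diff; assumes an AVX2/AVX3 build where hn::Vec256 and hn::Full256 are defined, and the helper name and values are illustrative):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // Lanes become {1, 2, 3, 4, 1, 2, 3, 4}: the four uint32_t values fill
    // one 128-bit block, which is then duplicated into the upper half.
    hn::Vec256<uint32_t> MakePattern() {
      const hn::Full256<uint32_t> d;
      return hn::Dup128VecFromValues(d, 1u, 2u, 3u, 4u);
    }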
@@ -367,7 +449,8 @@ template <typename T>
 HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_and_si256(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_and_si256(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }

 HWY_API Vec256<float> And(Vec256<float> a, Vec256<float> b) {
@@ -384,8 +467,8 @@ template <typename T>
 HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
   const DFromV<decltype(mask)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(
-      d, VFromD<decltype(du)>{_mm256_andnot_si256(not_mask.raw, mask.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_andnot_si256(
+                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
 }
 HWY_API Vec256<float> AndNot(Vec256<float> not_mask, Vec256<float> mask) {
   return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
@@ -400,7 +483,8 @@ template <typename T>
 HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_or_si256(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_or_si256(BitCast(du, a).raw,
+                                                         BitCast(du, b).raw)});
 }

 HWY_API Vec256<float> Or(Vec256<float> a, Vec256<float> b) {
@@ -416,7 +500,8 @@ template <typename T>
 HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_xor_si256(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_xor_si256(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }

 HWY_API Vec256<float> Xor(Vec256<float> a, Vec256<float> b) {
@@ -589,7 +674,7 @@ HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,

 }  // namespace detail

-template <typename T>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
   return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
 }
@@ -634,7 +719,7 @@ HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256<T> mask,

 }  // namespace detail

-template <typename T, HWY_IF_NOT_FLOAT3264(T)>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
   return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
 }
@@ -672,7 +757,7 @@ HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,

 }  // namespace detail

-template <typename T, HWY_IF_NOT_FLOAT3264(T)>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
   return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
 }
@@ -879,6 +964,58 @@ HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
 #endif
 }

+// UnmaskedNot returns ~m.raw without zeroing out any invalid bits
+template <typename T, HWY_IF_T_SIZE(T, 1)>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask256<T>{static_cast<__mmask32>(_knot_mask32(m.raw))};
+#else
+  return Mask256<T>{static_cast<__mmask32>(~m.raw)};
+#endif
+}
+
+template <typename T, HWY_IF_T_SIZE(T, 2)>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask256<T>{static_cast<__mmask16>(_knot_mask16(m.raw))};
+#else
+  return Mask256<T>{static_cast<__mmask16>(~m.raw)};
+#endif
+}
+
+template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask256<T>{static_cast<__mmask8>(_knot_mask8(m.raw))};
+#else
+  return Mask256<T>{static_cast<__mmask8>(~m.raw)};
+#endif
+}
+
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<1> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 1: simply return ~m as all 32 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<2> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 2: simply return ~m as all 16 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<4> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 4: simply return ~m as all 8 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<8> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 8: need to zero out the upper 4 bits of ~m as only the lower
+  // 4 bits of m are valid
+
+  // Return (~m) & 0x0F
+  return AndNot(hwy::SizeTag<8>(), m, Mask256<T>::FromBits(uint64_t{0x0F}));
+}
+
 }  // namespace detail

 template <typename T>
@@ -904,8 +1041,7 @@ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
 template <typename T>
 HWY_API Mask256<T> Not(const Mask256<T> m) {
   // Flip only the valid bits.
-  constexpr size_t N = 32 / sizeof(T);
-  return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
+  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
 }

 template <typename T>
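Note: the reworked mask Not no longer builds a (1 << N) - 1 constant. For 8/16/32-bit lanes every bit of the __mmask register is valid, so a plain complement suffices; only the 64-bit-lane case (4 valid bits) still masks the result, as the comments above state. A scalar sketch of that last case (hypothetical helper name, not part of the diff):

    #include <cstdint>

    // Complement an 8-bit AVX-512 mask register of which only the low
    // 4 bits (one per 64-bit lane) are valid.
    uint8_t NotMask64Model(uint8_t m) {
      return static_cast<uint8_t>(~m & 0x0F);
    }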
@@ -913,6 +1049,31 @@ HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
   return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
 }

+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+                               MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask32 combined_mask = _mm512_kunpackw(
+      static_cast<__mmask32>(hi.raw), static_cast<__mmask32>(lo.raw));
+#else
+  const auto combined_mask =
+      ((static_cast<uint32_t>(hi.raw) << 16) | (lo.raw & 0xFFFFu));
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 16)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask32(static_cast<__mmask32>(m.raw), 16);
+#else
+  const auto shifted_mask = static_cast<uint32_t>(m.raw) >> 16;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
 #else  // AVX2

 // ------------------------------ Mask
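Note: a scalar sketch of the two new mask ops, mirroring the #else fallback paths in the hunk above (hypothetical helper names, not part of the diff):

    #include <cstdint>

    // CombineMasks: hi half-mask lands in the upper 16 bits, lo in the lower.
    uint32_t CombineMasksModel(uint16_t hi, uint16_t lo) {
      return (static_cast<uint32_t>(hi) << 16) | lo;
    }

    // UpperHalfOfMask: extract the upper 16 mask bits.
    uint16_t UpperHalfOfMaskModel(uint32_t m) {
      return static_cast<uint16_t>(m >> 16);
    }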
@@ -1072,7 +1233,11 @@ HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator==(Vec256<float16_t> a,
                                       Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
@@ -1105,7 +1270,11 @@ HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator!=(Vec256<float16_t> a,
                                       Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
@@ -1146,7 +1315,11 @@ HWY_API Mask256<uint64_t> operator>(Vec256<uint64_t> a, Vec256<uint64_t> b) {

 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator>(Vec256<float16_t> a, Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
@@ -1161,7 +1334,11 @@ HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator>=(Vec256<float16_t> a,
                                       Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16

@@ -1617,7 +1794,7 @@ HWY_INLINE VFromD<D> Iota0(D /*d*/) {

 template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
 HWY_API VFromD<D> Iota(D d, const T2 first) {
-  return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
+  return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
 }

 // ------------------------------ FirstN (Iota, Lt)
@@ -1732,6 +1909,15 @@ HWY_API Vec256<double> operator-(Vec256<double> a, Vec256<double> b) {
   return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
 }

+// ------------------------------ AddSub
+
+HWY_API Vec256<float> AddSub(Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_addsub_ps(a.raw, b.raw)};
+}
+HWY_API Vec256<double> AddSub(Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_addsub_pd(a.raw, b.raw)};
+}
+
 // ------------------------------ SumsOf8
 HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
   return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
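Note: the new AddSub op maps to _mm256_addsub_ps/pd, which subtracts in even lanes and adds in odd lanes. A scalar model (hypothetical helper, float shown for brevity; not part of the diff):

    #include <cstddef>

    // r[i] = a[i] - b[i] for even i, a[i] + b[i] for odd i.
    void AddSubModel(const float* a, const float* b, float* r, size_t n) {
      for (size_t i = 0; i < n; ++i) {
        r[i] = (i % 2 == 0) ? (a[i] - b[i]) : (a[i] + b[i]);
      }
    }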
@@ -1741,6 +1927,56 @@ HWY_API Vec256<uint64_t> SumsOf8AbsDiff(Vec256<uint8_t> a, Vec256<uint8_t> b) {
   return Vec256<uint64_t>{_mm256_sad_epu8(a.raw, b.raw)};
 }

+// ------------------------------ SumsOf4
+#if HWY_TARGET <= HWY_AVX3
+namespace detail {
+
+HWY_INLINE Vec256<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
+                                    hwy::SizeTag<1> /*lane_size_tag*/,
+                                    Vec256<uint8_t> v) {
+  const DFromV<decltype(v)> d;
+
+  // _mm256_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
+  // zeroed out and the sums of the 4 consecutive lanes are already in the
+  // even uint16_t lanes of the _mm256_maskz_dbsad_epu8 result.
+  return Vec256<uint32_t>{_mm256_maskz_dbsad_epu8(
+      static_cast<__mmask16>(0x5555), v.raw, Zero(d).raw, 0)};
+}
+
+// detail::SumsOf4 for Vec256<int8_t> on AVX3 is implemented in x86_512-inl.h
+
+}  // namespace detail
+#endif  // HWY_TARGET <= HWY_AVX3
+
+// ------------------------------ SumsOfAdjQuadAbsDiff
+
+template <int kAOffset, int kBOffset>
+static Vec256<uint16_t> SumsOfAdjQuadAbsDiff(Vec256<uint8_t> a,
+                                             Vec256<uint8_t> b) {
+  static_assert(0 <= kAOffset && kAOffset <= 1,
+                "kAOffset must be between 0 and 1");
+  static_assert(0 <= kBOffset && kBOffset <= 3,
+                "kBOffset must be between 0 and 3");
+  return Vec256<uint16_t>{_mm256_mpsadbw_epu8(
+      a.raw, b.raw,
+      (kAOffset << 5) | (kBOffset << 3) | (kAOffset << 2) | kBOffset)};
+}
+
+// ------------------------------ SumsOfShuffledQuadAbsDiff
+
+#if HWY_TARGET <= HWY_AVX3
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
+static Vec256<uint16_t> SumsOfShuffledQuadAbsDiff(Vec256<uint8_t> a,
+                                                  Vec256<uint8_t> b) {
+  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+  return Vec256<uint16_t>{
+      _mm256_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
+}
+#endif
+
 // ------------------------------ SaturatedAdd

 // Returns a + b clamped to the destination range.
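Note: per the comment in the hunk above, SumsOf4 leaves the sum of each group of four consecutive uint8_t lanes in a uint32_t lane. A scalar model (hypothetical helper, not part of the diff):

    #include <cstddef>
    #include <cstdint>

    // out[i] = in[4*i] + in[4*i+1] + in[4*i+2] + in[4*i+3]: the sum of four
    // consecutive unsigned 8-bit lanes, widened to 32 bits.
    void SumsOf4Model(const uint8_t* in, uint32_t* out, size_t num_out) {
      for (size_t i = 0; i < num_out; ++i) {
        out[i] = static_cast<uint32_t>(in[4 * i]) + in[4 * i + 1] +
                 in[4 * i + 2] + in[4 * i + 3];
      }
    }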
@@ -1860,15 +2096,12 @@ HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
 HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
   return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
 }
-// i64 is implemented after BroadcastSignBit.

-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec256<T> Abs(const Vec256<T> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  using TI = TFromD<decltype(di)>;
-  return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
+#if HWY_TARGET <= HWY_AVX3
+HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
+  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
 }
+#endif

 // ------------------------------ Integer multiplication

@@ -2086,16 +2319,6 @@ HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
 #endif
 }

-HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
-#else
-  const DFromV<decltype(v)> d;
-  const auto zero = Zero(d);
-  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
-#endif
-}
-
 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
 HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
                                           Vec256<int8_t> no) {
@@ -2136,6 +2359,23 @@ HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
 #endif
 }

+// ------------------------------ IfNegativeThenNegOrUndefIfZero
+
+HWY_API Vec256<int8_t> IfNegativeThenNegOrUndefIfZero(Vec256<int8_t> mask,
+                                                      Vec256<int8_t> v) {
+  return Vec256<int8_t>{_mm256_sign_epi8(v.raw, mask.raw)};
+}
+
+HWY_API Vec256<int16_t> IfNegativeThenNegOrUndefIfZero(Vec256<int16_t> mask,
+                                                       Vec256<int16_t> v) {
+  return Vec256<int16_t>{_mm256_sign_epi16(v.raw, mask.raw)};
+}
+
+HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
+                                                       Vec256<int32_t> v) {
+  return Vec256<int32_t>{_mm256_sign_epi32(v.raw, mask.raw)};
+}
+
 // ------------------------------ ShiftLeftSame

 HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
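Note: IfNegativeThenNegOrUndefIfZero wraps _mm256_sign_epi8/16/32, which negates each lane of v where the corresponding mask lane is negative and passes it through where positive; a zero mask lane yields zero on x86, a case the op's name marks as unspecified. A scalar model of one int32_t lane (hypothetical helper, not part of the diff):

    #include <cstdint>

    int32_t SignModel(int32_t mask, int32_t v) {
      // Unsigned negation models the instruction's wrapping of INT32_MIN.
      if (mask < 0) return static_cast<int32_t>(0u - static_cast<uint32_t>(v));
      return (mask > 0) ? v : 0;  // zero mask: 0 on x86, otherwise unspecified
    }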
@@ -2359,6 +2599,326 @@ HWY_API Vec256<double> ApproximateReciprocal(Vec256<double> v) {
 }
 #endif

+// ------------------------------ MaskedMinOr
+
+#if HWY_TARGET <= HWY_AVX3
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMaxOr
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedAddOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSubOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMulOr
+
+HWY_API Vec256<float> MaskedMulOr(Vec256<float> no, Mask256<float> m,
+                                  Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec256<double> MaskedMulOr(Vec256<double> no, Mask256<double> m,
+                                   Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MaskedMulOr(Vec256<float16_t> no,
+                                      Mask256<float16_t> m, Vec256<float16_t> a,
+                                      Vec256<float16_t> b) {
+  return Vec256<float16_t>{_mm256_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedDivOr
+
+HWY_API Vec256<float> MaskedDivOr(Vec256<float> no, Mask256<float> m,
+                                  Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec256<double> MaskedDivOr(Vec256<double> no, Mask256<double> m,
+                                   Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MaskedDivOr(Vec256<float16_t> no,
+                                      Mask256<float16_t> m, Vec256<float16_t> a,
+                                      Vec256<float16_t> b) {
+  return Vec256<float16_t>{_mm256_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSatAddOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+// ------------------------------ MaskedSatSubOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#endif  // HWY_TARGET <= HWY_AVX3
+
 // ------------------------------ Floating-point multiply-add variants

 #if HWY_HAVE_FLOAT16
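Note: each Masked*Or op above evaluates the operation in lanes where the mask is set and passes `no` through elsewhere, using one AVX-512 masked instruction instead of an op plus a blend. A portable reference formulation (illustrative sketch with a hypothetical helper name, not part of the diff):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // r[i] = m[i] ? Min(a, b)[i] : no[i], matching MaskedMinOr(no, m, a, b).
    template <class V, class M>
    V MaskedMinOrRef(V no, M m, V a, V b) {
      return hn::IfThenElse(m, hn::Min(a, b), no);
    }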
@@ -2453,6 +3013,31 @@ HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x,
 #endif
 }

+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MulAddSub(Vec256<float16_t> mul, Vec256<float16_t> x,
+                                    Vec256<float16_t> sub_or_add) {
+  return Vec256<float16_t>{_mm256_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+HWY_API Vec256<float> MulAddSub(Vec256<float> mul, Vec256<float> x,
+                                Vec256<float> sub_or_add) {
+#ifdef HWY_DISABLE_BMI2_FMA
+  return AddSub(mul * x, sub_or_add);
+#else
+  return Vec256<float>{_mm256_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
+#endif
+}
+
+HWY_API Vec256<double> MulAddSub(Vec256<double> mul, Vec256<double> x,
+                                 Vec256<double> sub_or_add) {
+#ifdef HWY_DISABLE_BMI2_FMA
+  return AddSub(mul * x, sub_or_add);
+#else
+  return Vec256<double>{_mm256_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
+#endif
+}
+
 // ------------------------------ Floating-point square root

 // Full precision square root
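Note: MulAddSub fuses the multiply with the AddSub pattern: mul * x - sub_or_add in even lanes and mul * x + sub_or_add in odd lanes, matching _mm256_fmaddsub_ps/pd/ph and the AddSub fallback used when FMA is disabled. A scalar model (hypothetical helper, not part of the diff):

    #include <cstddef>

    // r[i] = mul[i]*x[i] - s[i] for even i, mul[i]*x[i] + s[i] for odd i.
    void MulAddSubModel(const float* mul, const float* x, const float* s,
                        float* r, size_t n) {
      for (size_t i = 0; i < n; ++i) {
        const float prod = mul[i] * x[i];
        r[i] = (i % 2 == 0) ? (prod - s[i]) : (prod + s[i]);
      }
    }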
@@ -2621,35 +3206,6 @@ HWY_API Mask256<double> IsFinite(Vec256<double> v) {
                            HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
 }

-#else
-
-template <typename T>
-HWY_API Mask256<T> IsInf(const Vec256<T> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T>
-HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // Shift left to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater). MSVC seems to generate
-  // incorrect code if we instead add vu + vu.
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
 #endif  // HWY_TARGET <= HWY_AVX3

 // ================================================== MEMORY
@@ -2662,16 +3218,13 @@ HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
       _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
 }
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API Vec256<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API Vec256<float16_t> Load(D /* tag */,
+                               const float16_t* HWY_RESTRICT aligned) {
   return Vec256<float16_t>{_mm256_load_ph(aligned)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
   return Vec256<float>{_mm256_load_ps(aligned)};
@@ -2686,16 +3239,12 @@ HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
   return VFromD<D>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
 }
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API Vec256<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API Vec256<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
   return Vec256<float16_t>{_mm256_loadu_ph(p)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
   return Vec256<float>{_mm256_loadu_ps(p)};
@@ -2756,8 +3305,8 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                                const TFromD<D>* HWY_RESTRICT p) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  return BitCast(
-      d, VFromD<decltype(du)>{_mm256_mask_loadu_epi16(v.raw, m.raw, p)});
+  return BitCast(d, VFromD<decltype(du)>{
+                        _mm256_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
@@ -2831,22 +3380,24 @@ HWY_API Vec256<double> MaskedLoad(Mask256<double> m, D d,
 // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> LoadDup128(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
+HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
   const Full128<TFromD<D>> d128;
+  const RebindToUnsigned<decltype(d128)> du128;
+  const __m128i v128 = BitCast(du128, LoadU(d128, p)).raw;
 #if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
   // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
   // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
   // upper half undefined) is fine because we're overwriting that anyway.
   // This workaround seems in turn to generate incorrect code in MSVC 2022
   // (19.31), so use broadcastsi128 there.
-  const __m128i v128 = LoadU(d128, p).raw;
-  return VFromD<D>{
-      _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
+  return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+                        _mm256_castsi128_si256(v128), v128, 1)});
 #else
   // The preferred path. This is perhaps surprising, because vbroadcasti128
   // with xmm input has 7 cycle latency on Intel, but Clang >= 7 is able to
   // pattern-match this to vbroadcastf128 with a memory operand as desired.
-  return VFromD<D>{_mm256_broadcastsi128_si256(LoadU(d128, p).raw)};
+  return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastsi128_si256(v128)});
 #endif
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
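Note: as the comment in the hunk above says, LoadDup128 loads 16 bytes and replicates them into both 128-bit halves. A scalar model (hypothetical helper, not part of the diff):

    #include <cstdint>

    // out32[0..15] and out32[16..31] both receive the 16 bytes at p.
    void LoadDup128Model(const uint8_t* p, uint8_t* out32) {
      for (int i = 0; i < 32; ++i) out32[i] = p[i % 16];
    }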
@@ -2879,16 +3430,13 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
   _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
 }
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API void Store(Vec256<float16_t> v, D d, float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API void Store(Vec256<float16_t> v, D /* tag */,
+                   float16_t* HWY_RESTRICT aligned) {
   _mm256_store_ph(aligned, v.raw);
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API void Store(Vec256<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
   _mm256_store_ps(aligned, v.raw);
@@ -2903,16 +3451,13 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
   _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
 }
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API void StoreU(Vec256<float16_t> v, D d, float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API void StoreU(Vec256<float16_t> v, D /* tag */,
+                    float16_t* HWY_RESTRICT p) {
   _mm256_storeu_ph(p, v.raw);
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API void StoreU(Vec256<float> v, D /* tag */, float* HWY_RESTRICT p) {
   _mm256_storeu_ps(p, v.raw);
@@ -3140,118 +3685,133 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,

  // ------------------------------ Gather

- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
- HWY_INLINE VFromD<D> GatherOffset(D /* tag */,
-                                   const TFromD<D>* HWY_RESTRICT base,
-                                   Vec256<int32_t> offset) {
-   return VFromD<D>{_mm256_i32gather_epi32(
-       reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
- HWY_INLINE VFromD<D> GatherIndex(D /* tag */,
-                                  const TFromD<D>* HWY_RESTRICT base,
-                                  Vec256<int32_t> index) {
-   return VFromD<D>{_mm256_i32gather_epi32(
-       reinterpret_cast<const int32_t*>(base), index.raw, 4)};
- }
+ namespace detail {

- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
- HWY_INLINE VFromD<D> GatherOffset(D /* tag */,
-                                   const TFromD<D>* HWY_RESTRICT base,
-                                   Vec256<int64_t> offset) {
-   return VFromD<D>{_mm256_i64gather_epi64(
-       reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
+ template <int kScale, typename T, HWY_IF_UI32(T)>
+ HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
+                                      Vec256<int32_t> indices) {
+   return Vec256<T>{_mm256_i32gather_epi32(
+       reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
  }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
- HWY_INLINE VFromD<D> GatherIndex(D /* tag */,
-                                  const TFromD<D>* HWY_RESTRICT base,
-                                  Vec256<int64_t> index) {
-   return VFromD<D>{_mm256_i64gather_epi64(
-       reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
+
+ template <int kScale, typename T, HWY_IF_UI64(T)>
+ HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
+                                      Vec256<int64_t> indices) {
+   return Vec256<T>{_mm256_i64gather_epi64(
+       reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
  }

- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
- HWY_API Vec256<float> GatherOffset(D /* tag */, const float* HWY_RESTRICT base,
-                                    Vec256<int32_t> offset) {
-   return Vec256<float>{_mm256_i32gather_ps(base, offset.raw, 1)};
+ template <int kScale>
+ HWY_API Vec256<float> NativeGather256(const float* HWY_RESTRICT base,
+                                       Vec256<int32_t> indices) {
+   return Vec256<float>{_mm256_i32gather_ps(base, indices.raw, kScale)};
  }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
- HWY_API Vec256<float> GatherIndex(D /* tag */, const float* HWY_RESTRICT base,
-                                   Vec256<int32_t> index) {
-   return Vec256<float>{_mm256_i32gather_ps(base, index.raw, 4)};
+
+ template <int kScale>
+ HWY_API Vec256<double> NativeGather256(const double* HWY_RESTRICT base,
+                                        Vec256<int64_t> indices) {
+   return Vec256<double>{_mm256_i64gather_pd(base, indices.raw, kScale)};
  }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
- HWY_API Vec256<double> GatherOffset(D /* tag */,
-                                     const double* HWY_RESTRICT base,
-                                     Vec256<int64_t> offset) {
-   return Vec256<double>{_mm256_i64gather_pd(base, offset.raw, 1)};
+
+ }  // namespace detail
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+                                VFromD<RebindToSigned<D>> offsets) {
+   const RebindToSigned<decltype(d)> di;
+   (void)di;  // for HWY_DASSERT
+   HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
+   return detail::NativeGather256<1>(base, offsets);
  }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
- HWY_API Vec256<double> GatherIndex(D /* tag */, const double* HWY_RESTRICT base,
-                                    Vec256<int64_t> index) {
-   return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+                               VFromD<RebindToSigned<D>> indices) {
+   const RebindToSigned<decltype(d)> di;
+   (void)di;  // for HWY_DASSERT
+   HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+   return detail::NativeGather256<sizeof(TFromD<D>)>(base, indices);
  }

- // ------------------------------ MaskedGatherIndex
+ // ------------------------------ MaskedGatherIndexOr

- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
- HWY_INLINE VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
-                                        const TFromD<D>* HWY_RESTRICT base,
-                                        Vec256<int32_t> index) {
+ namespace detail {
+
+ template <int kScale, typename T, HWY_IF_UI32(T)>
+ HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
+                                              const T* HWY_RESTRICT base,
+                                              Vec256<int32_t> indices) {
  #if HWY_TARGET <= HWY_AVX3
-   return VFromD<D>{
-       _mm256_mmask_i32gather_epi32(Zero(d).raw, m.raw, index.raw,
-                                    reinterpret_cast<const int32_t*>(base), 4)};
+   return Vec256<T>{_mm256_mmask_i32gather_epi32(
+       no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
+       kScale)};
  #else
-   return VFromD<D>{_mm256_mask_i32gather_epi32(
-       Zero(d).raw, reinterpret_cast<const int32_t*>(base), index.raw, m.raw,
-       4)};
+   return Vec256<T>{_mm256_mask_i32gather_epi32(
+       no.raw, reinterpret_cast<const int32_t*>(base), indices.raw, m.raw,
+       kScale)};
  #endif
  }

- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
- HWY_INLINE VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
-                                        const TFromD<D>* HWY_RESTRICT base,
-                                        Vec256<int64_t> index) {
+ template <int kScale, typename T, HWY_IF_UI64(T)>
+ HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
+                                              const T* HWY_RESTRICT base,
+                                              Vec256<int64_t> indices) {
  #if HWY_TARGET <= HWY_AVX3
-   return VFromD<D>{_mm256_mmask_i64gather_epi64(
-       Zero(d).raw, m.raw, index.raw,
-       reinterpret_cast<const GatherIndex64*>(base), 8)};
+   return Vec256<T>{_mm256_mmask_i64gather_epi64(
+       no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
+       kScale)};
  #else
    // For reasons unknown, _mm256_mask_i64gather_epi64 returns all-zeros.
-   const RebindToFloat<D> df;
-   return BitCast(d, Vec256<double>{_mm256_mask_i64gather_pd(
-                         Zero(df).raw, reinterpret_cast<const double*>(base),
-                         index.raw, RebindMask(df, m).raw, 8)});
+   const Full256<T> d;
+   const Full256<double> dd;
+   return BitCast(d,
+                  Vec256<double>{_mm256_mask_i64gather_pd(
+                      BitCast(dd, no).raw, reinterpret_cast<const double*>(base),
+                      indices.raw, RebindMask(dd, m).raw, kScale)});
  #endif
  }

- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
- HWY_API Vec256<float> MaskedGatherIndex(MFromD<D> m, D d,
-                                         const float* HWY_RESTRICT base,
-                                         Vec256<int32_t> index) {
+ template <int kScale>
+ HWY_API Vec256<float> NativeMaskedGatherOr256(Vec256<float> no,
+                                               Mask256<float> m,
+                                               const float* HWY_RESTRICT base,
+                                               Vec256<int32_t> indices) {
  #if HWY_TARGET <= HWY_AVX3
    return Vec256<float>{
-       _mm256_mmask_i32gather_ps(Zero(d).raw, m.raw, index.raw, base, 4)};
+       _mm256_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
  #else
    return Vec256<float>{
-       _mm256_mask_i32gather_ps(Zero(d).raw, base, index.raw, m.raw, 4)};
+       _mm256_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
  #endif
  }

- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
- HWY_API Vec256<double> MaskedGatherIndex(MFromD<D> m, D d,
-                                          const double* HWY_RESTRICT base,
-                                          Vec256<int64_t> index) {
+ template <int kScale>
+ HWY_API Vec256<double> NativeMaskedGatherOr256(Vec256<double> no,
+                                                Mask256<double> m,
+                                                const double* HWY_RESTRICT base,
+                                                Vec256<int64_t> indices) {
  #if HWY_TARGET <= HWY_AVX3
    return Vec256<double>{
-       _mm256_mmask_i64gather_pd(Zero(d).raw, m.raw, index.raw, base, 8)};
+       _mm256_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
  #else
    return Vec256<double>{
-       _mm256_mask_i64gather_pd(Zero(d).raw, base, index.raw, m.raw, 8)};
+       _mm256_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
  #endif
  }

+ }  // namespace detail
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
+                                       const TFromD<D>* HWY_RESTRICT base,
+                                       VFromD<RebindToSigned<D>> indices) {
+   const RebindToSigned<decltype(d)> di;
+   (void)di;  // for HWY_DASSERT
+   HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+   return detail::NativeMaskedGatherOr256<sizeof(TFromD<D>)>(no, m, base,
+                                                             indices);
+ }
+
  HWY_DIAGNOSTICS(pop)
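
A note on the kScale parameter introduced above: the hardware gather instructions compute base + index * scale in bytes, so GatherOffset passes 1 (the index vector holds byte offsets) and GatherIndex passes sizeof(TFromD<D>) (it holds element indices). A standalone sketch with plain AVX2 intrinsics; the helper names are hypothetical.

    #include <immintrin.h>
    #include <cstdint>

    // idx lanes are byte offsets: each lane loads *(base + offset).
    static inline __m256i GatherByteOffsets(const int32_t* base, __m256i idx) {
      return _mm256_i32gather_epi32(base, idx, 1);
    }

    // idx lanes are element indices: each lane loads base[index]
    // (scale = sizeof(int32_t) = 4).
    static inline __m256i GatherElementIndices(const int32_t* base, __m256i idx) {
      return _mm256_i32gather_epi32(base, idx, 4);
    }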

  // ================================================== SWIZZLE
@@ -3294,7 +3854,7 @@ HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
  HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
    const RebindToUnsigned<decltype(d)> du;  // for float16_t
-   const Twice<decltype(d)> dut;
+   const Twice<decltype(du)> dut;
    return BitCast(d, VFromD<decltype(du)>{
                          _mm256_extracti128_si256(BitCast(dut, v).raw, 1)});
  }
@@ -3375,22 +3935,16 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  HWY_API VFromD<D> ZeroExtendVector(D /* tag */, VFromD<Half<D>> lo) {
  #if HWY_HAVE_ZEXT
    return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
+ #elif HWY_COMPILER_MSVC
+   // Workaround: _mm256_inserti128_si256 does not actually zero the hi part.
+   return VFromD<D>{_mm256_set_m128i(_mm_setzero_si128(), lo.raw)};
  #else
    return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
  #endif
  }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
- HWY_API Vec256<bfloat16_t> ZeroExtendVector(D d, Vec128<bfloat16_t> lo) {
-   (void)d;
- #if HWY_HAVE_ZEXT
-   return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
- #else
-   return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
- #endif  // HWY_HAVE_ZEXT
- }
+ #if HWY_HAVE_FLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
  HWY_API Vec256<float16_t> ZeroExtendVector(D d, Vec128<float16_t> lo) {
- #if HWY_HAVE_FLOAT16
  #if HWY_HAVE_ZEXT
    (void)d;
    return Vec256<float16_t>{_mm256_zextph128_ph256(lo.raw)};
@@ -3398,15 +3952,8 @@ HWY_API Vec256<float16_t> ZeroExtendVector(D d, Vec128<float16_t> lo) {
    const RebindToUnsigned<D> du;
    return BitCast(d, ZeroExtendVector(du, BitCast(du, lo)));
  #endif  // HWY_HAVE_ZEXT
- #else
-   (void)d;
- #if HWY_HAVE_ZEXT
-   return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
- #else
-   return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
- #endif  // HWY_HAVE_ZEXT
- #endif  // HWY_HAVE_FLOAT16
  }
+ #endif  // HWY_HAVE_FLOAT16
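
The #elif branch added further up depends on a guarantee that the plain cast lacks: _mm256_castsi128_si256 leaves the upper 128 bits undefined, while constructing the vector with an explicit zero upper half pins them to zero. A minimal standalone sketch of that construction:

    #include <immintrin.h>

    // Zero-extend a 128-bit vector to 256 bits with a guaranteed-zero
    // upper half, usable even where _mm256_zextsi128_si256 is missing.
    static inline __m256i ZeroExtend128To256(__m128i lo) {
      return _mm256_set_m128i(_mm_setzero_si128(), lo);  // hi = 0, lo = lo
    }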
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  HWY_API Vec256<float> ZeroExtendVector(D /* tag */, Vec128<float> lo) {
  #if HWY_HAVE_ZEXT
@@ -3443,8 +3990,11 @@ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
  HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
-   const auto lo256 = ZeroExtendVector(d, lo);
-   return VFromD<D>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
+   const RebindToUnsigned<decltype(d)> du;  // for float16_t
+   const Half<decltype(du)> dh_u;
+   const auto lo256 = ZeroExtendVector(du, BitCast(dh_u, lo));
+   return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+                         lo256.raw, BitCast(dh_u, hi).raw, 1)});
  }
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  HWY_API Vec256<float> Combine(D d, Vec128<float> hi, Vec128<float> lo) {
@@ -3547,8 +4097,12 @@ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
  template <class T, HWY_IF_T_SIZE(T, 2)>
  HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                     Vec256<T> v) {
-   const Half<DFromV<decltype(v)>> dh;
-   return Vec256<T>{_mm256_broadcastw_epi16(LowerHalf(dh, v).raw)};
+   const DFromV<decltype(v)> d;
+   const RebindToUnsigned<decltype(d)> du;  // for float16_t
+   const Half<decltype(d)> dh;
+   const RebindToUnsigned<decltype(dh)> dh_u;
+   return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastw_epi16(
+                         BitCast(dh_u, LowerHalf(dh, v)).raw)});
  }

  template <class T, HWY_IF_UI32(T)>
@@ -3983,7 +4537,10 @@ HWY_API Vec256<double> TwoTablesLookupLanes(Vec256<double> a, Vec256<double> b,

  template <typename T>
  HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
-   return Vec256<T>{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
+   const DFromV<decltype(v)> d;
+   const RebindToUnsigned<decltype(d)> du;  // for float16_t
+   return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
+                         BitCast(du, v).raw, _MM_SHUFFLE(1, 0, 3, 2))});
  }

  HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
@@ -4022,9 +4579,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
                          _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
  #else
    const RebindToSigned<decltype(d)> di;
-   alignas(16) static constexpr int16_t kShuffle[8] = {
-       0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100};
-   const auto rev128 = TableLookupBytes(v, LoadDup128(di, kShuffle));
+   const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
+       di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+   const auto rev128 = TableLookupBytes(v, shuffle);
    return VFromD<D>{
        _mm256_permute4x64_epi64(rev128.raw, _MM_SHUFFLE(1, 0, 3, 2))};
  #endif
@@ -4053,9 +4610,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
  HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
    const RebindToSigned<decltype(d)> di;
-   alignas(16) static constexpr int16_t kShuffle[8] = {
-       0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908};
-   return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle)));
+   const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
+       di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
+   return BitCast(d, TableLookupBytes(v, shuffle));
  }

  // 32 bit Reverse4 defined in x86_128.
@@ -4071,9 +4628,9 @@ HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
  HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
    const RebindToSigned<decltype(d)> di;
-   alignas(16) static constexpr int16_t kShuffle[8] = {
-       0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100};
-   return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle)));
+   const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
+       di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+   return BitCast(d, TableLookupBytes(v, shuffle));
  }
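
The 16-bit constants fed to Dup128VecFromValues in these Reverse variants are byte-shuffle indices in disguise: stored little-endian, 0x0F0E becomes the byte pair {14, 15}, so output bytes 0-1 are taken from input bytes 14-15, i.e. lane 7 lands in lane 0. A scalar model of one 128-bit block, as a sketch only:

    #include <cstddef>
    #include <cstdint>

    // Reverses eight 16-bit lanes within one 16-byte block; kIdx[i] is the
    // source byte position for output byte i (the decoded shuffle table).
    void Reverse8x16InBlock(const uint8_t in[16], uint8_t out[16]) {
      static const uint8_t kIdx[16] = {14, 15, 12, 13, 10, 11, 8, 9,
                                       6,  7,  4,  5,  2,  3,  0, 1};
      for (size_t i = 0; i < 16; ++i) out[i] = in[kIdx[i]];
    }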

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
@@ -4162,8 +4719,12 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
  HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
+   const RebindToUnsigned<decltype(d)> du;  // for float16_t
    const Half<decltype(d)> d2;
-   return VFromD<D>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
+   const RebindToUnsigned<decltype(d2)> du2;  // for float16_t
+   return BitCast(
+       d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+              BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)});
  }
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi,
@@ -4180,8 +4741,10 @@ HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi,

  // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
- HWY_API VFromD<D> ConcatLowerUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
-   return VFromD<D>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
+ HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+   const RebindToUnsigned<decltype(d)> du;
+   return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+                         BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)});
  }
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi,
@@ -4196,8 +4759,10 @@ HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi,

  // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
- HWY_API VFromD<D> ConcatUpperLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
-   return VFromD<D>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
+ HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
+   const RebindToUnsigned<decltype(d)> du;  // for float16_t
+   return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
+                         BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)});
  }
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi,
@@ -4212,8 +4777,10 @@ HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi,

  // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
- HWY_API VFromD<D> ConcatUpperUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
-   return VFromD<D>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
+ HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+   const RebindToUnsigned<decltype(d)> du;  // for float16_t
+   return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+                         BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)});
  }
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi,
@@ -4274,7 +4841,8 @@ HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
    const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
    const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
    const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
-   return VFromD<D>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
+   return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
+                         u16, _MM_SHUFFLE(3, 1, 2, 0))});
  #endif
  }

@@ -4380,7 +4948,8 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
    const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
    const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
    const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
-   return VFromD<D>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
+   return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
+                         u16, _MM_SHUFFLE(3, 1, 2, 0))});
  #endif
  }

@@ -4450,6 +5019,126 @@ HWY_API Vec256<double> ConcatEven(D d, Vec256<double> hi, Vec256<double> lo) {
  #endif
  }

+ // ------------------------------ InterleaveWholeLower
+
+ #if HWY_TARGET <= HWY_AVX3
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+ #if HWY_TARGET <= HWY_AVX3_DL
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint8_t kIdx[32] = {
+       0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+       8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
+   return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+ #else
+   return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+ #endif
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint16_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
+                                                     4, 20, 5, 21, 6, 22, 7, 23};
+   return BitCast(
+       d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
+              BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+   return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+   return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
+   return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
+   return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+ #else  // AVX2
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+   return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+ }
+ #endif
+
+ // ------------------------------ InterleaveWholeUpper
+
+ #if HWY_TARGET <= HWY_AVX3
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+ #if HWY_TARGET <= HWY_AVX3_DL
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint8_t kIdx[32] = {
+       16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+       24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
+   return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+ #else
+   return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+ #endif
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint16_t kIdx[16] = {
+       8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+   return BitCast(
+       d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
+              BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+   return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+   return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
+   return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+   const RebindToUnsigned<decltype(d)> du;
+   alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
+   return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+ #else  // AVX2
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+   return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+ }
+ #endif
+
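
A scalar model of the semantics all the overloads above implement (and which the AVX2 fallback assembles from per-block InterleaveLower/InterleaveUpper plus a Concat): InterleaveWholeLower zips the lower halves of the whole vectors, InterleaveWholeUpper the upper halves. A sketch only, with uint32_t lanes assumed:

    #include <cstddef>
    #include <cstdint>

    // out = a0 b0 a1 b1 ... over the lower n/2 lanes of a and b.
    void InterleaveWholeLowerModel(const uint32_t* a, const uint32_t* b,
                                   uint32_t* out, size_t n) {
      for (size_t i = 0; i < n / 2; ++i) {
        out[2 * i + 0] = a[i];
        out[2 * i + 1] = b[i];
      }
    }

    // Same, but over the upper n/2 lanes.
    void InterleaveWholeUpperModel(const uint32_t* a, const uint32_t* b,
                                   uint32_t* out, size_t n) {
      for (size_t i = 0; i < n / 2; ++i) {
        out[2 * i + 0] = a[n / 2 + i];
        out[2 * i + 1] = b[n / 2 + i];
      }
    }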
  // ------------------------------ DupEven (InterleaveLower)

  template <typename T, HWY_IF_UI32(T)>
@@ -4490,9 +5179,10 @@ template <typename T, HWY_IF_T_SIZE(T, 1)>
  HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
    const DFromV<decltype(a)> d;
    const Full256<uint8_t> d8;
-   alignas(32) static constexpr uint8_t mask[16] = {
-       0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
-   return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a);
+   const VFromD<decltype(d8)> mask =
+       Dup128VecFromValues(d8, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF,
+                           0, 0xFF, 0, 0xFF, 0);
+   return IfThenElse(MaskFromVec(BitCast(d, mask)), b, a);
  }

  template <typename T, HWY_IF_UI16(T)>
@@ -4505,7 +5195,8 @@ HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {

  #if HWY_HAVE_FLOAT16
  HWY_INLINE Vec256<float16_t> OddEven(Vec256<float16_t> a, Vec256<float16_t> b) {
-   return Vec256<float16_t>{_mm256_mask_blend_ph(a.raw, b.raw, 0x55)};
+   return Vec256<float16_t>{
+       _mm256_mask_blend_ph(static_cast<__mmask16>(0x5555), a.raw, b.raw)};
  }
  #endif  // HWY_HAVE_FLOAT16
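
This change is a behavioral fix, not just a cleanup: _mm256_mask_blend_ph takes the mask as its first argument, and a 256-bit f16 vector has 16 lanes, so selecting every even lane requires the 16-bit pattern 0x5555 rather than the 8-bit 0x55. A standalone sketch of the corrected call (requires AVX512-FP16):

    #include <immintrin.h>

    // Mask bit i selects b (1) or a (0) in lane i; 0x5555 picks b in the
    // even lanes and a in the odd lanes, matching OddEven's contract.
    static inline __m256h OddEvenF16(__m256h a, __m256h b) {
      return _mm256_mask_blend_ph(static_cast<__mmask16>(0x5555), a, b);
    }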
 
@@ -4531,7 +5222,10 @@ HWY_API Vec256<double> OddEven(Vec256<double> a, Vec256<double> b) {

  template <typename T, HWY_IF_NOT_FLOAT3264(T)>
  Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
-   return Vec256<T>{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)};
+   const DFromV<decltype(odd)> d;
+   const RebindToUnsigned<decltype(d)> du;
+   return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
+                         BitCast(du, odd).raw, BitCast(du, even).raw, 0xFu)});
  }

  HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) {
@@ -4554,7 +5248,10 @@ HWY_API VFromD<D> ReverseBlocks(D /*d*/, VFromD<D> v) {

  // Both full
  template <typename T, typename TI>
  HWY_API Vec256<TI> TableLookupBytes(Vec256<T> bytes, Vec256<TI> from) {
-   return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
+   const DFromV<decltype(from)> d;
+   return BitCast(d, Vec256<uint8_t>{_mm256_shuffle_epi8(
+                         BitCast(Full256<uint8_t>(), bytes).raw,
+                         BitCast(Full256<uint8_t>(), from).raw)});
  }

  // Partial index vector
@@ -5114,14 +5811,15 @@ HWY_API Vec256<uint8_t> Shl(hwy::UnsignedTag tag, Vec256<uint8_t> v,
    const DFromV<decltype(v)> d;
  #if HWY_TARGET <= HWY_AVX3_DL
    (void)tag;
-   // kMask[i] = 0xFF >> i
-   alignas(16) static constexpr uint8_t kMasks[16] = {
-       0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00};
+   // masks[i] = 0xFF >> i
+   const VFromD<decltype(d)> masks =
+       Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
+                           0, 0, 0, 0, 0, 0, 0);
    // kShl[i] = 1 << i
-   alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10,
-                                                    0x20, 0x40, 0x80, 0x00};
-   v = And(v, TableLookupBytes(LoadDup128(d, kMasks), bits));
-   const VFromD<decltype(d)> mul = TableLookupBytes(LoadDup128(d, kShl), bits);
+   const VFromD<decltype(d)> shl = Dup128VecFromValues(
+       d, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
+   v = And(v, TableLookupBytes(masks, bits));
+   const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
    return VFromD<decltype(d)>{_mm256_gf2p8mul_epi8(v.raw, mul.raw)};
  #else
    const Repartition<uint16_t, decltype(d)> dw;
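
Why the GF2P8 path above works: after And with masks[n] = 0xFF >> n, each byte is a polynomial of degree below 8 - n, so its carry-less GF(2^8) product with shl[n] = 1 << n never reaches degree 8 and is never reduced by the field polynomial; multiplying by x^n is then exactly a left shift by n. A scalar model of one byte lane, as a sketch:

    #include <cstdint>

    // Per-byte left shift via carry-less multiply, as on the AVX3_DL path.
    uint8_t ShlViaGf2p8Mul(uint8_t v, unsigned n) {  // n in [0, 7]
      const uint8_t masked = static_cast<uint8_t>(v & (0xFFu >> n));
      uint16_t product = 0;  // carry-less (XOR) multiply by (1 << n)
      for (unsigned bit = 0; bit < 8; ++bit) {
        if ((masked >> bit) & 1u) {
          product ^= static_cast<uint16_t>(1u << (bit + n));
        }
      }
      return static_cast<uint8_t>(product);  // degree < 8: no reduction needed
    }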
@@ -5472,11 +6170,36 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
  }
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
  HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
-   return VFromD<D>{
-       _mm256_maskz_cvttps_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+   return VFromD<D>{_mm256_maskz_cvttps_epu64(
+       detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
  }
  #endif  // HWY_TARGET <= HWY_AVX3

+ // ------------------------------ PromoteEvenTo/PromoteOddTo
+ #if HWY_TARGET > HWY_AVX3
+ namespace detail {
+
+ // I32->I64 PromoteEvenTo/PromoteOddTo
+
+ template <class D, HWY_IF_LANES_D(D, 4)>
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                    hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                    hwy::SignedTag /*from_type_tag*/, D d_to,
+                                    Vec256<int32_t> v) {
+   return BitCast(d_to, OddEven(DupEven(BroadcastSignBit(v)), v));
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 4)>
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D d_to,
+                                   Vec256<int32_t> v) {
+   return BitCast(d_to, OddEven(BroadcastSignBit(v), DupOdd(v)));
+ }
+
+ }  // namespace detail
+ #endif
+
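
A scalar model of the two new helpers: each even (or odd) int32 lane is widened to int64 by pairing it with a lane of copied sign bits; BroadcastSignBit supplies the high half and OddEven assembles the pairs. A sketch only:

    #include <cstddef>
    #include <cstdint>

    // PromoteEvenTo: out[i] = sign-extended in[2 * i].
    void PromoteEvenToModel(const int32_t* in, int64_t* out, size_t out_n) {
      for (size_t i = 0; i < out_n; ++i) {
        out[i] = static_cast<int64_t>(in[2 * i]);
      }
    }

    // PromoteOddTo: out[i] = sign-extended in[2 * i + 1].
    void PromoteOddToModel(const int32_t* in, int64_t* out, size_t out_n) {
      for (size_t i = 0; i < out_n; ++i) {
        out[i] = static_cast<int64_t>(in[2 * i + 1]);
      }
    }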
  // ------------------------------ Demotions (full -> part w/ narrow lanes)

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
@@ -5565,32 +6288,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
-   const auto neg_mask = MaskFromVec(v);
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
-   const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
- #else
-   const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
- #endif
+   const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
    return VFromD<D>{_mm256_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
  }
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
-   const auto neg_mask = MaskFromVec(v);
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
-   const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
- #else
-   const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
- #endif
+   const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
    return VFromD<D>{_mm256_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
  }
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
-   const auto neg_mask = MaskFromVec(v);
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
-   const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
- #else
-   const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
- #endif
+   const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
    return VFromD<D>{_mm256_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
  }
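
All three demotions share one idea: the unsigned-saturating converts would misread negative inputs, so the complement of the lanes' sign mask (now factored into the detail::UnmaskedNot helper) is used for zero-masking, clamping negatives to 0 in the same instruction. A scalar model of the i64 -> u32 case, as a sketch:

    #include <cstdint>

    // Zero-masked, unsigned-saturating demotion of one lane.
    uint32_t DemoteI64ToU32Model(int64_t x) {
      if (x < 0) return 0;  // lane excluded by the non-negative mask
      if (x > INT64_C(0xFFFFFFFF)) return 0xFFFFFFFFu;  // unsigned saturation
      return static_cast<uint32_t>(x);
    }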
 
@@ -5617,14 +6325,22 @@ HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
  HWY_API VFromD<D> DemoteTo(D df16, Vec256<float> v) {
-   (void)df16;
-   return VFromD<D>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
+   const RebindToUnsigned<decltype(df16)> du16;
+   return BitCast(
+       df16, VFromD<decltype(du16)>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
  }

  HWY_DIAGNOSTICS(pop)

  #endif  // HWY_DISABLE_F16C

+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+ HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec256<double> v) {
+   return VFromD<D>{_mm256_cvtpd_ph(v.raw)};
+ }
+ #endif  // HWY_HAVE_FLOAT16
+
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
  HWY_API VFromD<D> DemoteTo(D dbf16, Vec256<float> v) {
    // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
@@ -5777,8 +6493,8 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
  HWY_API VFromD<D> DemoteTo(D du32, Vec256<double> v) {
  #if HWY_TARGET <= HWY_AVX3
    (void)du32;
-   return VFromD<D>{
-       _mm256_maskz_cvttpd_epu32(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+   return VFromD<D>{_mm256_maskz_cvttpd_epu32(
+       detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
  #else  // AVX2
    const Rebind<double, decltype(du32)> df64;
    const RebindToUnsigned<decltype(df64)> du64;
@@ -5967,6 +6683,11 @@ HWY_API VFromD<D> ConvertTo(D d, Vec256<float16_t> v) {
    return detail::FixConversionOverflow(d, v,
                                         VFromD<D>{_mm256_cvttph_epi16(v.raw)});
  }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
+ HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
+   return VFromD<D>{_mm256_maskz_cvttph_epu16(
+       detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
+ }
  #endif  // HWY_HAVE_FLOAT16

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
@@ -5983,13 +6704,13 @@ HWY_API VFromD<D> ConvertTo(D di, Vec256<double> v) {
  }
  template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
  HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-   return VFromD<DU>{
-       _mm256_maskz_cvttps_epu32(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+   return VFromD<DU>{_mm256_maskz_cvttps_epu32(
+       detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
  }
  template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
  HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-   return VFromD<DU>{
-       _mm256_maskz_cvttpd_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+   return VFromD<DU>{_mm256_maskz_cvttpd_epu64(
+       detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
  }
  #else  // AVX2
  template <class DU32, HWY_IF_V_SIZE_D(DU32, 32), HWY_IF_U32_D(DU32)>
@@ -6035,6 +6756,15 @@ HWY_API VFromD<D> PromoteTo(D df32, Vec128<float16_t> v) {

  #endif  // HWY_DISABLE_F16C

+ #if HWY_HAVE_FLOAT16
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec64<float16_t> v) {
+   return VFromD<D>{_mm256_cvtph_pd(v.raw)};
+ }
+
+ #endif  // HWY_HAVE_FLOAT16
+
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> PromoteTo(D df32, Vec128<bfloat16_t> v) {
    const Rebind<uint16_t, decltype(df32)> du16;
@@ -6120,14 +6850,14 @@ template <uint8_t kRcon>
  HWY_API Vec256<uint8_t> AESKeyGenAssist(Vec256<uint8_t> v) {
    const Full256<uint8_t> d;
  #if HWY_TARGET <= HWY_AVX3_DL
-   alignas(16) static constexpr uint8_t kRconXorMask[16] = {
-       0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
-   alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
-       0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
+   const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
+       d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
+   const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
+       d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
    const Repartition<uint32_t, decltype(d)> du32;
    const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
-   const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask));
-   return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle));
+   const auto sub_word_result = AESLastRound(w13, rconXorMask);
+   return TableLookupBytes(sub_word_result, rotWordShuffle);
  #else
    const Half<decltype(d)> d2;
    return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
@@ -6387,9 +7117,9 @@ HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
        0x0303030303030303ull};
    const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));

-   alignas(32) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
-                                                    1, 2, 4, 8, 16, 32, 64, 128};
-   return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
+   const VFromD<decltype(du)> bit = Dup128VecFromValues(
+       du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+   return RebindMask(d, TestBit(rep8, bit));
  }

  template <typename T, HWY_IF_T_SIZE(T, 2)>
@@ -6923,6 +7653,16 @@ HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,

  #endif  // HWY_TARGET <= HWY_AVX3

+ // ------------------------------ Dup128MaskFromMaskBits
+
+ // Generic for all vector lengths >= 32 bytes
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+   const Half<decltype(d)> dh;
+   const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
+   return CombineMasks(d, mh, mh);
+ }
+
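
A scalar model of the recursion above: the same per-block bit pattern is materialized for the half-width vector (bottoming out in the 16-byte case defined in x86_128), then CombineMasks duplicates it into both halves, so bit i of mask_bits controls lane i of every 128-bit block. A sketch only:

    #include <cstddef>

    // mask_out[i] = bit (i mod lanes_per_block) of mask_bits.
    void Dup128MaskModel(unsigned mask_bits, size_t lanes_per_block,
                         size_t total_lanes, bool* mask_out) {
      for (size_t i = 0; i < total_lanes; ++i) {
        mask_out[i] = ((mask_bits >> (i % lanes_per_block)) & 1u) != 0;
      }
    }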
  // ------------------------------ Expand

  // Always define Expand/LoadExpand because generic_ops only does so for Vec128.
@@ -7396,116 +8136,9 @@ HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
  }
  #endif  // HWY_TARGET <= HWY_AVX3

- // ------------------------------ Reductions
-
- namespace detail {
-
- // These functions start with each lane per 128-bit block being reduced with the
- // corresponding lane in the other block, so we use the same logic as x86_128
- // but running on both blocks at the same time. There are two (64-bit) to eight
- // (16-bit) lanes per block.
- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v10) {
-   const DFromV<decltype(v10)> d;
-   return Add(v10, Reverse2(d, v10));
- }
- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v10) {
-   const DFromV<decltype(v10)> d;
-   return Min(v10, Reverse2(d, v10));
- }
- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v10) {
-   const DFromV<decltype(v10)> d;
-   return Max(v10, Reverse2(d, v10));
- }
-
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v3210) {
-   using V = decltype(v3210);
-   const DFromV<V> d;
-   const V v0123 = Reverse4(d, v3210);
-   const V v03_12_12_03 = Add(v3210, v0123);
-   const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-   return Add(v03_12_12_03, v12_03_03_12);
- }
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v3210) {
-   using V = decltype(v3210);
-   const DFromV<V> d;
-   const V v0123 = Reverse4(d, v3210);
-   const V v03_12_12_03 = Min(v3210, v0123);
-   const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-   return Min(v03_12_12_03, v12_03_03_12);
- }
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v3210) {
-   using V = decltype(v3210);
-   const DFromV<V> d;
-   const V v0123 = Reverse4(d, v3210);
-   const V v03_12_12_03 = Max(v3210, v0123);
-   const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-   return Max(v03_12_12_03, v12_03_03_12);
- }
-
- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v76543210) {
-   using V = decltype(v76543210);
-   const DFromV<V> d;
-   // The upper half is reversed from the lower half; omit for brevity.
-   const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210));
-   const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07));
-   return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
- }
- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v76543210) {
-   using V = decltype(v76543210);
-   const DFromV<V> d;
-   // The upper half is reversed from the lower half; omit for brevity.
-   const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210));
-   const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07));
-   return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
- }
- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v76543210) {
-   using V = decltype(v76543210);
-   const DFromV<V> d;
-   // The upper half is reversed from the lower half; omit for brevity.
-   const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210));
-   const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07));
-   return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
- }
-
- }  // namespace detail
-
- // Supported for >8-bit types. Returns the broadcasted result.
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API VFromD<D> SumOfLanes(D /*d*/, VFromD<D> vHL) {
-   const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
-   return detail::SumOfLanes(Add(vLH, vHL));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
-   return GetLane(SumOfLanes(d, v));
- }
- #if HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API float16_t ReduceSum(D, VFromD<D> v) {
-   return _mm256_reduce_add_ph(v.raw);
- }
- #endif  // HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API VFromD<D> MinOfLanes(D /*d*/, VFromD<D> vHL) {
-   const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
-   return detail::MinOfLanes(Min(vLH, vHL));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API VFromD<D> MaxOfLanes(D /*d*/, VFromD<D> vHL) {
-   const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
-   return detail::MaxOfLanes(Max(vLH, vHL));
- }
+ // ------------------------------ Reductions in generic_ops

- // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
+ // ------------------------------ LeadingZeroCount

  #if HWY_TARGET <= HWY_AVX3
  template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>