@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
@@ -101,6 +101,9 @@ class Vec256 {
   HWY_INLINE Vec256& operator-=(const Vec256 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec256& operator%=(const Vec256 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec256& operator&=(const Vec256 other) {
     return *this = (*this & other);
   }
@@ -191,6 +194,25 @@ HWY_INLINE __m256i BitCastToInteger(__m256d v) {
   return _mm256_castpd_si256(v);
 }
 
+#if HWY_AVX3_HAVE_F32_TO_BF16C
+HWY_INLINE __m256i BitCastToInteger(__m256bh v) {
+  // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
+  // bit cast a __m256bh to a __m256i as there is currently no intrinsic
+  // available (as of GCC 13 and Clang 17) that can bit cast a __m256bh vector
+  // to a __m256i vector
+
+#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+  // On GCC or Clang, use reinterpret_cast to bit cast a __m256bh to a __m256i
+  return reinterpret_cast<__m256i>(v);
+#else
+  // On MSVC, use BitCastScalar to bit cast a __m256bh to a __m256i as MSVC does
+  // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
+  // bit cast from one AVX vector type to a different AVX vector type
+  return BitCastScalar<__m256i>(v);
+#endif  // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+}
+#endif  // HWY_AVX3_HAVE_F32_TO_BF16C
+
 template <typename T>
 HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
   return Vec256<uint8_t>{BitCastToInteger(v.raw)};
@@ -359,6 +381,85 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
       ResizeBitCast(Full128<uint8_t>(), v).raw)});
 }
 
+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+  return VFromD<D>{_mm256_setr_epi8(
+      static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
+      static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
+      static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
+      static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
+      static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
+      static_cast<char>(t15), static_cast<char>(t0), static_cast<char>(t1),
+      static_cast<char>(t2), static_cast<char>(t3), static_cast<char>(t4),
+      static_cast<char>(t5), static_cast<char>(t6), static_cast<char>(t7),
+      static_cast<char>(t8), static_cast<char>(t9), static_cast<char>(t10),
+      static_cast<char>(t11), static_cast<char>(t12), static_cast<char>(t13),
+      static_cast<char>(t14), static_cast<char>(t15))};
+}
+
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return VFromD<D>{
+      _mm256_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+                        static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+                        static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+                        static_cast<int16_t>(t6), static_cast<int16_t>(t7),
+                        static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+                        static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+                        static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+                        static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
+}
+
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return VFromD<D>{_mm256_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
+                                  t3, t4, t5, t6, t7)};
+}
+#endif
+
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{
+      _mm256_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{_mm256_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3)};
+}
+
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{
+      _mm256_setr_epi64x(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                         static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
+}
+
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{_mm256_setr_pd(t0, t1, t0, t1)};
+}
+
 // ================================================== LOGICAL
 
 // ------------------------------ And
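Editor's note: the Dup128VecFromValues overloads added in the hunk above broadcast a caller-supplied 128-bit lane pattern across every 128-bit block of the 256-bit vector. A minimal usage sketch (not part of the diff; assumes the usual hwy/highway.h setup, with the fragment placed inside a function compiled for the current target):

// namespace hn = hwy::HWY_NAMESPACE;
// const hn::ScalableTag<uint32_t> d;
// // For a 256-bit target this yields {1, 2, 3, 4, 1, 2, 3, 4}:
// const auto v = hn::Dup128VecFromValues(d, 1, 2, 3, 4);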
@@ -367,7 +468,8 @@ template <typename T>
 HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_and_si256(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_and_si256(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec256<float> And(Vec256<float> a, Vec256<float> b) {
@@ -384,8 +486,8 @@ template <typename T>
 HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
   const DFromV<decltype(mask)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(
-      d, VFromD<decltype(du)>{_mm256_andnot_si256(not_mask.raw, mask.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_andnot_si256(
+                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
 }
 HWY_API Vec256<float> AndNot(Vec256<float> not_mask, Vec256<float> mask) {
   return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
@@ -400,7 +502,8 @@ template <typename T>
 HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_or_si256(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_or_si256(BitCast(du, a).raw,
+                                                         BitCast(du, b).raw)});
 }
 
 HWY_API Vec256<float> Or(Vec256<float> a, Vec256<float> b) {
@@ -416,7 +519,8 @@ template <typename T>
 HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_xor_si256(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_xor_si256(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec256<float> Xor(Vec256<float> a, Vec256<float> b) {
@@ -431,7 +535,7 @@ template <typename T>
 HWY_API Vec256<T> Not(const Vec256<T> v) {
   const DFromV<decltype(v)> d;
   using TU = MakeUnsigned<T>;
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const __m256i vu = BitCast(RebindToUnsigned<decltype(d)>(), v).raw;
   return BitCast(d, Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
 #else
@@ -442,7 +546,7 @@ HWY_API Vec256<T> Not(const Vec256<T> v) {
 // ------------------------------ Xor3
 template <typename T>
 HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const DFromV<decltype(x1)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
@@ -457,7 +561,7 @@ HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
 // ------------------------------ Or3
 template <typename T>
 HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const DFromV<decltype(o1)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
@@ -472,7 +576,7 @@ HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
 // ------------------------------ OrAnd
 template <typename T>
 HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const DFromV<decltype(o)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
@@ -487,7 +591,7 @@ HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
 // ------------------------------ IfVecThenElse
 template <typename T>
 HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const DFromV<decltype(yes)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
@@ -589,7 +693,7 @@ HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
 
 }  // namespace detail
 
-template <typename T>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
   return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
 }
@@ -634,7 +738,7 @@ HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
 
 }  // namespace detail
 
-template <typename T, HWY_IF_NOT_FLOAT3264(T)>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
   return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
 }
@@ -672,7 +776,7 @@ HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
 
 }  // namespace detail
 
-template <typename T, HWY_IF_NOT_FLOAT3264(T)>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
   return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
 }
@@ -683,13 +787,6 @@ HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
   return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
 }
 
-template <typename T>
-HWY_API Vec256<T> ZeroIfNegative(const Vec256<T> v) {
-  static_assert(IsSigned<T>(), "Only for float");
-  // AVX3 MaskFromVec only looks at the MSB
-  return IfThenZeroElse(MaskFromVec(v), v);
-}
-
 // ------------------------------ Mask logical
 
 namespace detail {
@@ -879,6 +976,58 @@ HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
 #endif
 }
 
+// UnmaskedNot returns ~m.raw without zeroing out any invalid bits
+template <typename T, HWY_IF_T_SIZE(T, 1)>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask256<T>{static_cast<__mmask32>(_knot_mask32(m.raw))};
+#else
+  return Mask256<T>{static_cast<__mmask32>(~m.raw)};
+#endif
+}
+
+template <typename T, HWY_IF_T_SIZE(T, 2)>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask256<T>{static_cast<__mmask16>(_knot_mask16(m.raw))};
+#else
+  return Mask256<T>{static_cast<__mmask16>(~m.raw)};
+#endif
+}
+
+template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask256<T>{static_cast<__mmask8>(_knot_mask8(m.raw))};
+#else
+  return Mask256<T>{static_cast<__mmask8>(~m.raw)};
+#endif
+}
+
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<1> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 1: simply return ~m as all 32 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<2> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 2: simply return ~m as all 16 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<4> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 4: simply return ~m as all 8 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<8> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 8: need to zero out the upper 4 bits of ~m as only the lower
+  // 4 bits of m are valid
+
+  // Return (~m) & 0x0F
+  return AndNot(hwy::SizeTag<8>(), m, Mask256<T>::FromBits(uint64_t{0x0F}));
+}
+
 }  // namespace detail
 
 template <typename T>
@@ -904,8 +1053,7 @@ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
 template <typename T>
 HWY_API Mask256<T> Not(const Mask256<T> m) {
   // Flip only the valid bits.
-  constexpr size_t N = 32 / sizeof(T);
-  return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
+  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
 }
 
 template <typename T>
@@ -913,6 +1061,53 @@ HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
   return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
 }
 
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+                               MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask32 combined_mask = _mm512_kunpackw(
+      static_cast<__mmask32>(hi.raw), static_cast<__mmask32>(lo.raw));
+#else
+  const auto combined_mask =
+      ((static_cast<uint32_t>(hi.raw) << 16) | (lo.raw & 0xFFFFu));
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 16)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask32(static_cast<__mmask32>(m.raw), 16);
+#else
+  const auto shifted_mask = static_cast<uint32_t>(m.raw) >> 16;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftli_mask32(static_cast<__mmask32>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) << 1)};
+#endif
+}
+
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftri_mask32(static_cast<__mmask32>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) >> 1)};
+#endif
+}
+
 #else  // AVX2
 
 // ------------------------------ Mask
@@ -1072,7 +1267,11 @@ HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator==(Vec256<float16_t> a,
                                       Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
@@ -1105,7 +1304,11 @@ HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator!=(Vec256<float16_t> a,
                                       Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
@@ -1146,7 +1349,11 @@ HWY_API Mask256<uint64_t> operator>(Vec256<uint64_t> a, Vec256<uint64_t> b) {
 
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator>(Vec256<float16_t> a, Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
@@ -1161,7 +1368,11 @@ HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator>=(Vec256<float16_t> a,
                                       Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1617,7 +1828,7 @@ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
 HWY_API VFromD<D> Iota(D d, const T2 first) {
-  return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
+  return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
 }
 
 // ------------------------------ FirstN (Iota, Lt)
@@ -1732,6 +1943,15 @@ HWY_API Vec256<double> operator-(Vec256<double> a, Vec256<double> b) {
   return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
 }
 
+// ------------------------------ AddSub
+
+HWY_API Vec256<float> AddSub(Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_addsub_ps(a.raw, b.raw)};
+}
+HWY_API Vec256<double> AddSub(Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_addsub_pd(a.raw, b.raw)};
+}
+
 // ------------------------------ SumsOf8
 HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
   return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
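Editor's note: the AddSub ops added in the hunk above map to _mm256_addsub_ps/_mm256_addsub_pd, which subtract in even-indexed lanes and add in odd-indexed lanes. A minimal scalar model of that per-lane behaviour (illustrative only, not part of the header):

// r[i] = a[i] - b[i] for even i, a[i] + b[i] for odd i.
void AddSubScalarModel(const float* a, const float* b, float* r, int n) {
  for (int i = 0; i < n; ++i) {
    r[i] = (i % 2 == 0) ? (a[i] - b[i]) : (a[i] + b[i]);
  }
}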
@@ -1741,6 +1961,56 @@ HWY_API Vec256<uint64_t> SumsOf8AbsDiff(Vec256<uint8_t> a, Vec256<uint8_t> b) {
   return Vec256<uint64_t>{_mm256_sad_epu8(a.raw, b.raw)};
 }
 
+// ------------------------------ SumsOf4
+#if HWY_TARGET <= HWY_AVX3
+namespace detail {
+
+HWY_INLINE Vec256<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
+                                    hwy::SizeTag<1> /*lane_size_tag*/,
+                                    Vec256<uint8_t> v) {
+  const DFromV<decltype(v)> d;
+
+  // _mm256_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
+  // zeroed out and the sums of the 4 consecutive lanes are already in the
+  // even uint16_t lanes of the _mm256_maskz_dbsad_epu8 result.
+  return Vec256<uint32_t>{_mm256_maskz_dbsad_epu8(
+      static_cast<__mmask16>(0x5555), v.raw, Zero(d).raw, 0)};
+}
+
+// detail::SumsOf4 for Vec256<int8_t> on AVX3 is implemented in x86_512-inl.h
+
+}  // namespace detail
+#endif  // HWY_TARGET <= HWY_AVX3
+
+// ------------------------------ SumsOfAdjQuadAbsDiff
+
+template <int kAOffset, int kBOffset>
+static Vec256<uint16_t> SumsOfAdjQuadAbsDiff(Vec256<uint8_t> a,
+                                             Vec256<uint8_t> b) {
+  static_assert(0 <= kAOffset && kAOffset <= 1,
+                "kAOffset must be between 0 and 1");
+  static_assert(0 <= kBOffset && kBOffset <= 3,
+                "kBOffset must be between 0 and 3");
+  return Vec256<uint16_t>{_mm256_mpsadbw_epu8(
+      a.raw, b.raw,
+      (kAOffset << 5) | (kBOffset << 3) | (kAOffset << 2) | kBOffset)};
+}
+
+// ------------------------------ SumsOfShuffledQuadAbsDiff
+
+#if HWY_TARGET <= HWY_AVX3
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
+static Vec256<uint16_t> SumsOfShuffledQuadAbsDiff(Vec256<uint8_t> a,
+                                                  Vec256<uint8_t> b) {
+  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+  return Vec256<uint16_t>{
+      _mm256_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
+}
+#endif
+
 // ------------------------------ SaturatedAdd
 
 // Returns a + b clamped to the destination range.
@@ -1761,7 +2031,7 @@ HWY_API Vec256<int16_t> SaturatedAdd(Vec256<int16_t> a, Vec256<int16_t> b) {
   return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
 }
 
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 HWY_API Vec256<int32_t> SaturatedAdd(Vec256<int32_t> a, Vec256<int32_t> b) {
   const DFromV<decltype(a)> d;
   const auto sum = a + b;
@@ -1783,7 +2053,7 @@ HWY_API Vec256<int64_t> SaturatedAdd(Vec256<int64_t> a, Vec256<int64_t> b) {
       i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
   return IfThenElse(overflow_mask, overflow_result, sum);
 }
-#endif  // HWY_TARGET <= HWY_AVX3
+#endif  // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 
 // ------------------------------ SaturatedSub
 
@@ -1805,7 +2075,7 @@ HWY_API Vec256<int16_t> SaturatedSub(Vec256<int16_t> a, Vec256<int16_t> b) {
   return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
 }
 
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 HWY_API Vec256<int32_t> SaturatedSub(Vec256<int32_t> a, Vec256<int32_t> b) {
   const DFromV<decltype(a)> d;
   const auto diff = a - b;
@@ -1827,7 +2097,7 @@ HWY_API Vec256<int64_t> SaturatedSub(Vec256<int64_t> a, Vec256<int64_t> b) {
       i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
   return IfThenElse(overflow_mask, overflow_result, diff);
 }
-#endif  // HWY_TARGET <= HWY_AVX3
+#endif  // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 
 // ------------------------------ Average
 
@@ -1860,15 +2130,12 @@ HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
 HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
   return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
 }
-// i64 is implemented after BroadcastSignBit.
 
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec256<T> Abs(const Vec256<T> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  using TI = TFromD<decltype(di)>;
-  return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
+#if HWY_TARGET <= HWY_AVX3
+HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
+  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
 }
+#endif
 
 // ------------------------------ Integer multiplication
 
@@ -2016,14 +2283,29 @@ HWY_API Vec256<int8_t> ShiftRight(Vec256<int8_t> v) {
 
 // ------------------------------ RotateRight
 
-template <int kBits, typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
-HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
-  constexpr size_t kSizeInBits = sizeof(T) * 8;
-  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+// U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8
+// RotateRight uses detail::GaloisAffine on AVX3_DL
+
+#if HWY_TARGET > HWY_AVX3_DL
+template <int kBits>
+HWY_API Vec256<uint8_t> RotateRight(const Vec256<uint8_t> v) {
+  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+  if (kBits == 0) return v;
+  // AVX3 does not support 8-bit.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
+}
+#endif
+
+template <int kBits>
+HWY_API Vec256<uint16_t> RotateRight(const Vec256<uint16_t> v) {
+  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
   if (kBits == 0) return v;
-  // AVX3 does not support 8/16-bit.
-  return Or(ShiftRight<kBits>(v),
-            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
+#if HWY_TARGET <= HWY_AVX3_DL
+  return Vec256<uint16_t>{_mm256_shrdi_epi16(v.raw, v.raw, kBits)};
+#else
+  // AVX3 does not support 16-bit.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
+#endif
 }
 
 template <int kBits>
@@ -2048,6 +2330,38 @@ HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) {
 #endif
 }
 
+// ------------------------------ Rol/Ror
+#if HWY_TARGET <= HWY_AVX3_DL
+template <class T, HWY_IF_UI16(T)>
+HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_shrdv_epi16(a.raw, a.raw, b.raw)};
+}
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
+#if HWY_TARGET <= HWY_AVX3
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_rolv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_rorv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_rolv_epi64(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_rorv_epi64(a.raw, b.raw)};
+}
+
+#endif
+
 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
 
 HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
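Editor's note: a quick worked check of the per-lane rotate semantics used by the RotateRight overloads above, with an illustrative value chosen here (not taken from the diff):

// Rotating a uint16_t lane holding 0x1234 right by 4 bits wraps the low
// nibble around to the top: 0x1234 -> 0x4123.
static_assert(((0x1234 >> 4) | ((0x1234 << 12) & 0xFFFF)) == 0x4123,
              "rotate-right by 4 of 0x1234 should be 0x4123");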
@@ -2086,16 +2400,6 @@ HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
 #endif
 }
 
-HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
-#else
-  const DFromV<decltype(v)> d;
-  const auto zero = Zero(d);
-  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
-#endif
-}
-
 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
 HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
                                           Vec256<int8_t> no) {
@@ -2136,6 +2440,23 @@ HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
 #endif
 }
 
+// ------------------------------ IfNegativeThenNegOrUndefIfZero
+
+HWY_API Vec256<int8_t> IfNegativeThenNegOrUndefIfZero(Vec256<int8_t> mask,
+                                                      Vec256<int8_t> v) {
+  return Vec256<int8_t>{_mm256_sign_epi8(v.raw, mask.raw)};
+}
+
+HWY_API Vec256<int16_t> IfNegativeThenNegOrUndefIfZero(Vec256<int16_t> mask,
+                                                       Vec256<int16_t> v) {
+  return Vec256<int16_t>{_mm256_sign_epi16(v.raw, mask.raw)};
+}
+
+HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
+                                                       Vec256<int32_t> v) {
+  return Vec256<int32_t>{_mm256_sign_epi32(v.raw, mask.raw)};
+}
+
 // ------------------------------ ShiftLeftSame
 
 HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
@@ -2359,103 +2680,448 @@ HWY_API Vec256<double> ApproximateReciprocal(Vec256<double> v) {
2359
2680
  }
2360
2681
  #endif
2361
2682
 
2362
- // ------------------------------ Floating-point multiply-add variants
2683
+ // ------------------------------ MaskedMinOr
2363
2684
 
2364
- #if HWY_HAVE_FLOAT16
2685
+ #if HWY_TARGET <= HWY_AVX3
2365
2686
 
2366
- HWY_API Vec256<float16_t> MulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
2367
- Vec256<float16_t> add) {
2368
- return Vec256<float16_t>{_mm256_fmadd_ph(mul.raw, x.raw, add.raw)};
2687
+ template <typename T, HWY_IF_U8(T)>
2688
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2689
+ Vec256<T> b) {
2690
+ return Vec256<T>{_mm256_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
2691
+ }
2692
+ template <typename T, HWY_IF_I8(T)>
2693
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2694
+ Vec256<T> b) {
2695
+ return Vec256<T>{_mm256_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
2369
2696
  }
2370
2697
 
2371
- HWY_API Vec256<float16_t> NegMulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
2372
- Vec256<float16_t> add) {
2373
- return Vec256<float16_t>{_mm256_fnmadd_ph(mul.raw, x.raw, add.raw)};
2698
+ template <typename T, HWY_IF_U16(T)>
2699
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2700
+ Vec256<T> b) {
2701
+ return Vec256<T>{_mm256_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
2702
+ }
2703
+ template <typename T, HWY_IF_I16(T)>
2704
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2705
+ Vec256<T> b) {
2706
+ return Vec256<T>{_mm256_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
2374
2707
  }
2375
2708
 
2376
- HWY_API Vec256<float16_t> MulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
2377
- Vec256<float16_t> sub) {
2378
- return Vec256<float16_t>{_mm256_fmsub_ph(mul.raw, x.raw, sub.raw)};
2709
+ template <typename T, HWY_IF_U32(T)>
2710
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2711
+ Vec256<T> b) {
2712
+ return Vec256<T>{_mm256_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
2713
+ }
2714
+ template <typename T, HWY_IF_I32(T)>
2715
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2716
+ Vec256<T> b) {
2717
+ return Vec256<T>{_mm256_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
2379
2718
  }
2380
2719
 
2381
- HWY_API Vec256<float16_t> NegMulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
2382
- Vec256<float16_t> sub) {
2383
- return Vec256<float16_t>{_mm256_fnmsub_ph(mul.raw, x.raw, sub.raw)};
2720
+ template <typename T, HWY_IF_U64(T)>
2721
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2722
+ Vec256<T> b) {
2723
+ return Vec256<T>{_mm256_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
2724
+ }
2725
+ template <typename T, HWY_IF_I64(T)>
2726
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2727
+ Vec256<T> b) {
2728
+ return Vec256<T>{_mm256_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
2729
+ }
2730
+
2731
+ template <typename T, HWY_IF_F32(T)>
2732
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2733
+ Vec256<T> b) {
2734
+ return Vec256<T>{_mm256_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
2735
+ }
2736
+
2737
+ template <typename T, HWY_IF_F64(T)>
2738
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2739
+ Vec256<T> b) {
2740
+ return Vec256<T>{_mm256_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
2384
2741
  }
2385
2742
 
2743
+ #if HWY_HAVE_FLOAT16
2744
+ template <typename T, HWY_IF_F16(T)>
2745
+ HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2746
+ Vec256<T> b) {
2747
+ return Vec256<T>{_mm256_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
2748
+ }
2386
2749
  #endif // HWY_HAVE_FLOAT16
2387
2750
 
2388
- HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x,
2389
- Vec256<float> add) {
2390
- #ifdef HWY_DISABLE_BMI2_FMA
2391
- return mul * x + add;
2392
- #else
2393
- return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
2394
- #endif
2751
+ // ------------------------------ MaskedMaxOr
2752
+
2753
+ template <typename T, HWY_IF_U8(T)>
2754
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2755
+ Vec256<T> b) {
2756
+ return Vec256<T>{_mm256_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
2395
2757
  }
2396
- HWY_API Vec256<double> MulAdd(Vec256<double> mul, Vec256<double> x,
2397
- Vec256<double> add) {
2398
- #ifdef HWY_DISABLE_BMI2_FMA
2399
- return mul * x + add;
2400
- #else
2401
- return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
2402
- #endif
2758
+ template <typename T, HWY_IF_I8(T)>
2759
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2760
+ Vec256<T> b) {
2761
+ return Vec256<T>{_mm256_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
2403
2762
  }
2404
2763
 
2405
- HWY_API Vec256<float> NegMulAdd(Vec256<float> mul, Vec256<float> x,
2406
- Vec256<float> add) {
2407
- #ifdef HWY_DISABLE_BMI2_FMA
2408
- return add - mul * x;
2409
- #else
2410
- return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
2411
- #endif
2764
+ template <typename T, HWY_IF_U16(T)>
2765
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2766
+ Vec256<T> b) {
2767
+ return Vec256<T>{_mm256_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
2412
2768
  }
2413
- HWY_API Vec256<double> NegMulAdd(Vec256<double> mul, Vec256<double> x,
2414
- Vec256<double> add) {
2415
- #ifdef HWY_DISABLE_BMI2_FMA
2416
- return add - mul * x;
2417
- #else
2418
- return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
2419
- #endif
2769
+ template <typename T, HWY_IF_I16(T)>
2770
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2771
+ Vec256<T> b) {
2772
+ return Vec256<T>{_mm256_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
2420
2773
  }
2421
2774
 
2422
- HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x,
2423
- Vec256<float> sub) {
2424
- #ifdef HWY_DISABLE_BMI2_FMA
2425
- return mul * x - sub;
2426
- #else
2427
- return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
2428
- #endif
2775
+ template <typename T, HWY_IF_U32(T)>
2776
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2777
+ Vec256<T> b) {
2778
+ return Vec256<T>{_mm256_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
2429
2779
  }
2430
- HWY_API Vec256<double> MulSub(Vec256<double> mul, Vec256<double> x,
2431
- Vec256<double> sub) {
2432
- #ifdef HWY_DISABLE_BMI2_FMA
2433
- return mul * x - sub;
2434
- #else
2435
- return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
2436
- #endif
2780
+ template <typename T, HWY_IF_I32(T)>
2781
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2782
+ Vec256<T> b) {
2783
+ return Vec256<T>{_mm256_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
2437
2784
  }
2438
2785
 
2439
- HWY_API Vec256<float> NegMulSub(Vec256<float> mul, Vec256<float> x,
2440
- Vec256<float> sub) {
2441
- #ifdef HWY_DISABLE_BMI2_FMA
2442
- return Neg(mul * x) - sub;
2443
- #else
2444
- return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2445
- #endif
2786
+ template <typename T, HWY_IF_U64(T)>
2787
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2788
+ Vec256<T> b) {
2789
+ return Vec256<T>{_mm256_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
2446
2790
  }
2447
- HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x,
2448
- Vec256<double> sub) {
2449
- #ifdef HWY_DISABLE_BMI2_FMA
2450
- return Neg(mul * x) - sub;
2451
- #else
2452
- return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
2453
- #endif
2791
+ template <typename T, HWY_IF_I64(T)>
2792
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2793
+ Vec256<T> b) {
2794
+ return Vec256<T>{_mm256_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
2454
2795
  }
2455
2796
 
2456
- // ------------------------------ Floating-point square root
2797
+ template <typename T, HWY_IF_F32(T)>
2798
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2799
+ Vec256<T> b) {
2800
+ return Vec256<T>{_mm256_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
2801
+ }
2802
+
2803
+ template <typename T, HWY_IF_F64(T)>
2804
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2805
+ Vec256<T> b) {
2806
+ return Vec256<T>{_mm256_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
2807
+ }
2457
2808
 
2458
- // Full precision square root
2809
+ #if HWY_HAVE_FLOAT16
2810
+ template <typename T, HWY_IF_F16(T)>
2811
+ HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2812
+ Vec256<T> b) {
2813
+ return Vec256<T>{_mm256_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
2814
+ }
2815
+ #endif // HWY_HAVE_FLOAT16
2816
+
2817
+ // ------------------------------ MaskedAddOr
2818
+
2819
+ template <typename T, HWY_IF_UI8(T)>
2820
+ HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2821
+ Vec256<T> b) {
2822
+ return Vec256<T>{_mm256_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
2823
+ }
2824
+
2825
+ template <typename T, HWY_IF_UI16(T)>
2826
+ HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2827
+ Vec256<T> b) {
2828
+ return Vec256<T>{_mm256_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
2829
+ }
2830
+
2831
+ template <typename T, HWY_IF_UI32(T)>
2832
+ HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2833
+ Vec256<T> b) {
2834
+ return Vec256<T>{_mm256_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
2835
+ }
2836
+
2837
+ template <typename T, HWY_IF_UI64(T)>
2838
+ HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2839
+ Vec256<T> b) {
2840
+ return Vec256<T>{_mm256_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
2841
+ }
2842
+
2843
+ template <typename T, HWY_IF_F32(T)>
2844
+ HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2845
+ Vec256<T> b) {
2846
+ return Vec256<T>{_mm256_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
2847
+ }
2848
+
2849
+ template <typename T, HWY_IF_F64(T)>
2850
+ HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2851
+ Vec256<T> b) {
2852
+ return Vec256<T>{_mm256_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
2853
+ }
2854
+
2855
+ #if HWY_HAVE_FLOAT16
2856
+ template <typename T, HWY_IF_F16(T)>
2857
+ HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2858
+ Vec256<T> b) {
2859
+ return Vec256<T>{_mm256_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
2860
+ }
2861
+ #endif // HWY_HAVE_FLOAT16
2862
+
2863
+ // ------------------------------ MaskedSubOr
2864
+
2865
+ template <typename T, HWY_IF_UI8(T)>
2866
+ HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2867
+ Vec256<T> b) {
2868
+ return Vec256<T>{_mm256_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
2869
+ }
2870
+
2871
+ template <typename T, HWY_IF_UI16(T)>
2872
+ HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2873
+ Vec256<T> b) {
2874
+ return Vec256<T>{_mm256_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
2875
+ }
2876
+
2877
+ template <typename T, HWY_IF_UI32(T)>
2878
+ HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2879
+ Vec256<T> b) {
2880
+ return Vec256<T>{_mm256_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
2881
+ }
2882
+
2883
+ template <typename T, HWY_IF_UI64(T)>
2884
+ HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2885
+ Vec256<T> b) {
2886
+ return Vec256<T>{_mm256_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
2887
+ }
2888
+
2889
+ template <typename T, HWY_IF_F32(T)>
2890
+ HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2891
+ Vec256<T> b) {
2892
+ return Vec256<T>{_mm256_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
2893
+ }
2894
+
2895
+ template <typename T, HWY_IF_F64(T)>
2896
+ HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2897
+ Vec256<T> b) {
2898
+ return Vec256<T>{_mm256_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
2899
+ }
2900
+
2901
+ #if HWY_HAVE_FLOAT16
2902
+ template <typename T, HWY_IF_F16(T)>
2903
+ HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2904
+ Vec256<T> b) {
2905
+ return Vec256<T>{_mm256_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
2906
+ }
2907
+ #endif // HWY_HAVE_FLOAT16
2908
+
2909
+ // ------------------------------ MaskedMulOr
2910
+
2911
+ HWY_API Vec256<float> MaskedMulOr(Vec256<float> no, Mask256<float> m,
2912
+ Vec256<float> a, Vec256<float> b) {
2913
+ return Vec256<float>{_mm256_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
2914
+ }
2915
+
2916
+ HWY_API Vec256<double> MaskedMulOr(Vec256<double> no, Mask256<double> m,
2917
+ Vec256<double> a, Vec256<double> b) {
2918
+ return Vec256<double>{_mm256_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
2919
+ }
2920
+
2921
+ #if HWY_HAVE_FLOAT16
2922
+ HWY_API Vec256<float16_t> MaskedMulOr(Vec256<float16_t> no,
2923
+ Mask256<float16_t> m, Vec256<float16_t> a,
2924
+ Vec256<float16_t> b) {
2925
+ return Vec256<float16_t>{_mm256_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
2926
+ }
2927
+ #endif // HWY_HAVE_FLOAT16
2928
+
2929
+ // ------------------------------ MaskedDivOr
2930
+
2931
+ HWY_API Vec256<float> MaskedDivOr(Vec256<float> no, Mask256<float> m,
2932
+ Vec256<float> a, Vec256<float> b) {
2933
+ return Vec256<float>{_mm256_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
2934
+ }
2935
+
2936
+ HWY_API Vec256<double> MaskedDivOr(Vec256<double> no, Mask256<double> m,
2937
+ Vec256<double> a, Vec256<double> b) {
2938
+ return Vec256<double>{_mm256_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
2939
+ }
2940
+
2941
+ #if HWY_HAVE_FLOAT16
2942
+ HWY_API Vec256<float16_t> MaskedDivOr(Vec256<float16_t> no,
2943
+ Mask256<float16_t> m, Vec256<float16_t> a,
2944
+ Vec256<float16_t> b) {
2945
+ return Vec256<float16_t>{_mm256_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
2946
+ }
2947
+ #endif // HWY_HAVE_FLOAT16
2948
+
2949
+ // ------------------------------ MaskedSatAddOr
2950
+
2951
+ template <typename T, HWY_IF_I8(T)>
2952
+ HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2953
+ Vec256<T> b) {
2954
+ return Vec256<T>{_mm256_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
2955
+ }
2956
+
2957
+ template <typename T, HWY_IF_U8(T)>
2958
+ HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2959
+ Vec256<T> b) {
2960
+ return Vec256<T>{_mm256_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
2961
+ }
2962
+
2963
+ template <typename T, HWY_IF_I16(T)>
2964
+ HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2965
+ Vec256<T> b) {
2966
+ return Vec256<T>{_mm256_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
2967
+ }
2968
+
2969
+ template <typename T, HWY_IF_U16(T)>
2970
+ HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2971
+ Vec256<T> b) {
2972
+ return Vec256<T>{_mm256_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
2973
+ }
2974
+
2975
+ // ------------------------------ MaskedSatSubOr
2976
+
2977
+ template <typename T, HWY_IF_I8(T)>
2978
+ HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2979
+ Vec256<T> b) {
2980
+ return Vec256<T>{_mm256_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
2981
+ }
2982
+
2983
+ template <typename T, HWY_IF_U8(T)>
2984
+ HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2985
+ Vec256<T> b) {
2986
+ return Vec256<T>{_mm256_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
2987
+ }
2988
+
2989
+ template <typename T, HWY_IF_I16(T)>
2990
+ HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2991
+ Vec256<T> b) {
2992
+ return Vec256<T>{_mm256_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
2993
+ }
2994
+
2995
+ template <typename T, HWY_IF_U16(T)>
2996
+ HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
2997
+ Vec256<T> b) {
2998
+ return Vec256<T>{_mm256_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
2999
+ }
3000
+
3001
+ #endif // HWY_TARGET <= HWY_AVX3
3002
+
3003
+ // ------------------------------ Floating-point multiply-add variants
3004
+
3005
+ #if HWY_HAVE_FLOAT16
3006
+
3007
+ HWY_API Vec256<float16_t> MulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
3008
+ Vec256<float16_t> add) {
3009
+ return Vec256<float16_t>{_mm256_fmadd_ph(mul.raw, x.raw, add.raw)};
3010
+ }
3011
+
3012
+ HWY_API Vec256<float16_t> NegMulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
3013
+ Vec256<float16_t> add) {
3014
+ return Vec256<float16_t>{_mm256_fnmadd_ph(mul.raw, x.raw, add.raw)};
3015
+ }
3016
+
3017
+ HWY_API Vec256<float16_t> MulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
3018
+ Vec256<float16_t> sub) {
3019
+ return Vec256<float16_t>{_mm256_fmsub_ph(mul.raw, x.raw, sub.raw)};
3020
+ }
3021
+
3022
+ HWY_API Vec256<float16_t> NegMulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
3023
+ Vec256<float16_t> sub) {
3024
+ return Vec256<float16_t>{_mm256_fnmsub_ph(mul.raw, x.raw, sub.raw)};
3025
+ }
3026
+
3027
+ #endif // HWY_HAVE_FLOAT16
3028
+
3029
+ HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x,
3030
+ Vec256<float> add) {
3031
+ #ifdef HWY_DISABLE_BMI2_FMA
3032
+ return mul * x + add;
3033
+ #else
3034
+ return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
3035
+ #endif
3036
+ }
3037
+ HWY_API Vec256<double> MulAdd(Vec256<double> mul, Vec256<double> x,
3038
+ Vec256<double> add) {
3039
+ #ifdef HWY_DISABLE_BMI2_FMA
3040
+ return mul * x + add;
3041
+ #else
3042
+ return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
3043
+ #endif
3044
+ }
3045
+
3046
+ HWY_API Vec256<float> NegMulAdd(Vec256<float> mul, Vec256<float> x,
3047
+ Vec256<float> add) {
3048
+ #ifdef HWY_DISABLE_BMI2_FMA
3049
+ return add - mul * x;
3050
+ #else
3051
+ return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
3052
+ #endif
3053
+ }
3054
+ HWY_API Vec256<double> NegMulAdd(Vec256<double> mul, Vec256<double> x,
3055
+ Vec256<double> add) {
3056
+ #ifdef HWY_DISABLE_BMI2_FMA
3057
+ return add - mul * x;
3058
+ #else
3059
+ return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
3060
+ #endif
3061
+ }
3062
+
3063
+ HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x,
3064
+ Vec256<float> sub) {
3065
+ #ifdef HWY_DISABLE_BMI2_FMA
3066
+ return mul * x - sub;
3067
+ #else
3068
+ return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
3069
+ #endif
3070
+ }
3071
+ HWY_API Vec256<double> MulSub(Vec256<double> mul, Vec256<double> x,
3072
+ Vec256<double> sub) {
3073
+ #ifdef HWY_DISABLE_BMI2_FMA
3074
+ return mul * x - sub;
3075
+ #else
3076
+ return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
3077
+ #endif
3078
+ }
3079
+
3080
+ HWY_API Vec256<float> NegMulSub(Vec256<float> mul, Vec256<float> x,
3081
+ Vec256<float> sub) {
3082
+ #ifdef HWY_DISABLE_BMI2_FMA
3083
+ return Neg(mul * x) - sub;
3084
+ #else
3085
+ return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
3086
+ #endif
3087
+ }
3088
+ HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x,
3089
+ Vec256<double> sub) {
3090
+ #ifdef HWY_DISABLE_BMI2_FMA
3091
+ return Neg(mul * x) - sub;
3092
+ #else
3093
+ return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
3094
+ #endif
3095
+ }
3096
+
3097
+ #if HWY_HAVE_FLOAT16
3098
+ HWY_API Vec256<float16_t> MulAddSub(Vec256<float16_t> mul, Vec256<float16_t> x,
3099
+ Vec256<float16_t> sub_or_add) {
3100
+ return Vec256<float16_t>{_mm256_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
3101
+ }
3102
+ #endif // HWY_HAVE_FLOAT16
3103
+
3104
+ HWY_API Vec256<float> MulAddSub(Vec256<float> mul, Vec256<float> x,
3105
+ Vec256<float> sub_or_add) {
3106
+ #ifdef HWY_DISABLE_BMI2_FMA
3107
+ return AddSub(mul * x, sub_or_add);
3108
+ #else
3109
+ return Vec256<float>{_mm256_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
3110
+ #endif
3111
+ }
3112
+
3113
+ HWY_API Vec256<double> MulAddSub(Vec256<double> mul, Vec256<double> x,
3114
+ Vec256<double> sub_or_add) {
3115
+ #ifdef HWY_DISABLE_BMI2_FMA
3116
+ return AddSub(mul * x, sub_or_add);
3117
+ #else
3118
+ return Vec256<double>{_mm256_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
3119
+ #endif
3120
+ }
3121
+
3122
+ // ------------------------------ Floating-point square root
3123
+
3124
+ // Full precision square root
2459
3125
  #if HWY_HAVE_FLOAT16
2460
3126
  HWY_API Vec256<float16_t> Sqrt(Vec256<float16_t> v) {
2461
3127
  return Vec256<float16_t>{_mm256_sqrt_ph(v.raw)};
@@ -2565,6 +3231,15 @@ HWY_API Mask256<float16_t> IsNaN(Vec256<float16_t> v) {
2565
3231
  v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
2566
3232
  }
2567
3233
 
3234
+ HWY_API Mask256<float16_t> IsEitherNaN(Vec256<float16_t> a,
3235
+ Vec256<float16_t> b) {
3236
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3237
+ HWY_DIAGNOSTICS(push)
3238
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3239
+ return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3240
+ HWY_DIAGNOSTICS(pop)
3241
+ }
3242
+
2568
3243
  HWY_API Mask256<float16_t> IsInf(Vec256<float16_t> v) {
2569
3244
  return Mask256<float16_t>{_mm256_fpclass_ph_mask(
2570
3245
  v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
@@ -2597,6 +3272,22 @@ HWY_API Mask256<double> IsNaN(Vec256<double> v) {
2597
3272
  #endif
2598
3273
  }
2599
3274
 
3275
+ HWY_API Mask256<float> IsEitherNaN(Vec256<float> a, Vec256<float> b) {
3276
+ #if HWY_TARGET <= HWY_AVX3
3277
+ return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3278
+ #else
3279
+ return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_UNORD_Q)};
3280
+ #endif
3281
+ }
3282
+
3283
+ HWY_API Mask256<double> IsEitherNaN(Vec256<double> a, Vec256<double> b) {
3284
+ #if HWY_TARGET <= HWY_AVX3
3285
+ return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
3286
+ #else
3287
+ return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_UNORD_Q)};
3288
+ #endif
3289
+ }
3290
+
2600
3291
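IsEitherNaN(a, b) is true in lanes where either input is NaN (a single unordered compare rather than Or(IsNaN(a), IsNaN(b))). A hedged sketch of a typical use; the helper name is purely illustrative:

  // Zero out lanes of `a` that cannot be ordered against `b`.
  template <class V>
  V ZeroIfUnordered(V a, V b) {
    return hn::IfThenZeroElse(hn::IsEitherNaN(a, b), a);
  }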
  #if HWY_TARGET <= HWY_AVX3
2601
3292
 
2602
3293
  HWY_API Mask256<float> IsInf(Vec256<float> v) {
@@ -2621,35 +3312,6 @@ HWY_API Mask256<double> IsFinite(Vec256<double> v) {
2621
3312
  HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
2622
3313
  }
2623
3314
 
2624
- #else
2625
-
2626
- template <typename T>
2627
- HWY_API Mask256<T> IsInf(const Vec256<T> v) {
2628
- static_assert(IsFloat<T>(), "Only for float");
2629
- const DFromV<decltype(v)> d;
2630
- const RebindToSigned<decltype(d)> di;
2631
- const VFromD<decltype(di)> vi = BitCast(di, v);
2632
- // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
2633
- return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
2634
- }
2635
-
2636
- // Returns whether normal/subnormal/zero.
2637
- template <typename T>
2638
- HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
2639
- static_assert(IsFloat<T>(), "Only for float");
2640
- const DFromV<decltype(v)> d;
2641
- const RebindToUnsigned<decltype(d)> du;
2642
- const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
2643
- const VFromD<decltype(du)> vu = BitCast(du, v);
2644
- // Shift left to clear the sign bit, then right so we can compare with the
2645
- // max exponent (cannot compare with MaxExponentTimes2 directly because it is
2646
- // negative and non-negative floats would be greater). MSVC seems to generate
2647
- // incorrect code if we instead add vu + vu.
2648
- const VFromD<decltype(di)> exp =
2649
- BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
2650
- return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
2651
- }
2652
-
2653
3315
  #endif // HWY_TARGET <= HWY_AVX3
2654
3316
 
2655
3317
  // ================================================== MEMORY
@@ -2662,16 +3324,13 @@ HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
2662
3324
  _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
2663
3325
  }
2664
3326
  // bfloat16_t is handled by x86_128-inl.h.
2665
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
2666
- HWY_API Vec256<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
2667
3327
  #if HWY_HAVE_FLOAT16
2668
- (void)d;
3328
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3329
+ HWY_API Vec256<float16_t> Load(D /* tag */,
3330
+ const float16_t* HWY_RESTRICT aligned) {
2669
3331
  return Vec256<float16_t>{_mm256_load_ph(aligned)};
2670
- #else
2671
- const RebindToUnsigned<decltype(d)> du;
2672
- return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
2673
- #endif // HWY_HAVE_FLOAT16
2674
3332
  }
3333
+ #endif
2675
3334
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
2676
3335
  HWY_API Vec256<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
2677
3336
  return Vec256<float>{_mm256_load_ps(aligned)};
@@ -2686,16 +3345,12 @@ HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
2686
3345
  return VFromD<D>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
2687
3346
  }
2688
3347
  // bfloat16_t is handled by x86_128-inl.h.
2689
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
2690
- HWY_API Vec256<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
2691
3348
  #if HWY_HAVE_FLOAT16
2692
- (void)d;
3349
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3350
+ HWY_API Vec256<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
2693
3351
  return Vec256<float16_t>{_mm256_loadu_ph(p)};
2694
- #else
2695
- const RebindToUnsigned<decltype(d)> du;
2696
- return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
2697
- #endif // HWY_HAVE_FLOAT16
2698
3352
  }
3353
+ #endif
2699
3354
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
2700
3355
  HWY_API Vec256<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
2701
3356
  return Vec256<float>{_mm256_loadu_ps(p)};
@@ -2756,8 +3411,8 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
2756
3411
  HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
2757
3412
  const TFromD<D>* HWY_RESTRICT p) {
2758
3413
  const RebindToUnsigned<decltype(d)> du; // for float16_t
2759
- return BitCast(
2760
- d, VFromD<decltype(du)>{_mm256_mask_loadu_epi16(v.raw, m.raw, p)});
3414
+ return BitCast(d, VFromD<decltype(du)>{
3415
+ _mm256_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
2761
3416
  }
2762
3417
 
2763
3418
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
@@ -2831,22 +3486,24 @@ HWY_API Vec256<double> MaskedLoad(Mask256<double> m, D d,
2831
3486
  // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
2832
3487
  // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
2833
3488
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
2834
- HWY_API VFromD<D> LoadDup128(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
3489
+ HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
3490
+ const RebindToUnsigned<decltype(d)> du;
2835
3491
  const Full128<TFromD<D>> d128;
3492
+ const RebindToUnsigned<decltype(d128)> du128;
3493
+ const __m128i v128 = BitCast(du128, LoadU(d128, p)).raw;
2836
3494
  #if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
2837
3495
  // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
2838
3496
  // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
2839
3497
  // upper half undefined) is fine because we're overwriting that anyway.
2840
3498
  // This workaround seems in turn to generate incorrect code in MSVC 2022
2841
3499
  // (19.31), so use broadcastsi128 there.
2842
- const __m128i v128 = LoadU(d128, p).raw;
2843
- return VFromD<D>{
2844
- _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
3500
+ return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
3501
+ _mm256_castsi128_si256(v128), v128, 1)});
2845
3502
  #else
2846
3503
  // The preferred path. This is perhaps surprising, because vbroadcasti128
2847
3504
  // with xmm input has 7 cycle latency on Intel, but Clang >= 7 is able to
2848
3505
  // pattern-match this to vbroadcastf128 with a memory operand as desired.
2849
- return VFromD<D>{_mm256_broadcastsi128_si256(LoadU(d128, p).raw)};
3506
+ return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastsi128_si256(v128)});
2850
3507
  #endif
2851
3508
  }
2852
3509
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
@@ -2879,16 +3536,13 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
2879
3536
  HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
2880
3537
  _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
2881
3538
  }
2882
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
2883
- HWY_API void Store(Vec256<float16_t> v, D d, float16_t* HWY_RESTRICT aligned) {
2884
3539
  #if HWY_HAVE_FLOAT16
2885
- (void)d;
3540
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3541
+ HWY_API void Store(Vec256<float16_t> v, D /* tag */,
3542
+ float16_t* HWY_RESTRICT aligned) {
2886
3543
  _mm256_store_ph(aligned, v.raw);
2887
- #else
2888
- const RebindToUnsigned<decltype(d)> du;
2889
- Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
2890
- #endif // HWY_HAVE_FLOAT16
2891
3544
  }
3545
+ #endif // HWY_HAVE_FLOAT16
2892
3546
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
2893
3547
  HWY_API void Store(Vec256<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
2894
3548
  _mm256_store_ps(aligned, v.raw);
@@ -2903,16 +3557,13 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
2903
3557
  HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
2904
3558
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
2905
3559
  }
2906
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
2907
- HWY_API void StoreU(Vec256<float16_t> v, D d, float16_t* HWY_RESTRICT p) {
2908
3560
  #if HWY_HAVE_FLOAT16
2909
- (void)d;
3561
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3562
+ HWY_API void StoreU(Vec256<float16_t> v, D /* tag */,
3563
+ float16_t* HWY_RESTRICT p) {
2910
3564
  _mm256_storeu_ph(p, v.raw);
2911
- #else
2912
- const RebindToUnsigned<decltype(d)> du;
2913
- StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
2914
- #endif // HWY_HAVE_FLOAT16
2915
3565
  }
3566
+ #endif
2916
3567
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
2917
3568
  HWY_API void StoreU(Vec256<float> v, D /* tag */, float* HWY_RESTRICT p) {
2918
3569
  _mm256_storeu_ps(p, v.raw);
@@ -3140,118 +3791,124 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
3140
3791
 
3141
3792
  // ------------------------------ Gather
3142
3793
 
3143
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3144
- HWY_INLINE VFromD<D> GatherOffset(D /* tag */,
3145
- const TFromD<D>* HWY_RESTRICT base,
3146
- Vec256<int32_t> offset) {
3147
- return VFromD<D>{_mm256_i32gather_epi32(
3148
- reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
3149
- }
3150
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3151
- HWY_INLINE VFromD<D> GatherIndex(D /* tag */,
3152
- const TFromD<D>* HWY_RESTRICT base,
3153
- Vec256<int32_t> index) {
3154
- return VFromD<D>{_mm256_i32gather_epi32(
3155
- reinterpret_cast<const int32_t*>(base), index.raw, 4)};
3156
- }
3794
+ namespace detail {
3157
3795
 
3158
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3159
- HWY_INLINE VFromD<D> GatherOffset(D /* tag */,
3160
- const TFromD<D>* HWY_RESTRICT base,
3161
- Vec256<int64_t> offset) {
3162
- return VFromD<D>{_mm256_i64gather_epi64(
3163
- reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
3796
+ template <int kScale, typename T, HWY_IF_UI32(T)>
3797
+ HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
3798
+ Vec256<int32_t> indices) {
3799
+ return Vec256<T>{_mm256_i32gather_epi32(
3800
+ reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
3164
3801
  }
3165
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3166
- HWY_INLINE VFromD<D> GatherIndex(D /* tag */,
3167
- const TFromD<D>* HWY_RESTRICT base,
3168
- Vec256<int64_t> index) {
3169
- return VFromD<D>{_mm256_i64gather_epi64(
3170
- reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
3802
+
3803
+ template <int kScale, typename T, HWY_IF_UI64(T)>
3804
+ HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
3805
+ Vec256<int64_t> indices) {
3806
+ return Vec256<T>{_mm256_i64gather_epi64(
3807
+ reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
3171
3808
  }
3172
3809
 
3173
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3174
- HWY_API Vec256<float> GatherOffset(D /* tag */, const float* HWY_RESTRICT base,
3175
- Vec256<int32_t> offset) {
3176
- return Vec256<float>{_mm256_i32gather_ps(base, offset.raw, 1)};
3810
+ template <int kScale>
3811
+ HWY_API Vec256<float> NativeGather256(const float* HWY_RESTRICT base,
3812
+ Vec256<int32_t> indices) {
3813
+ return Vec256<float>{_mm256_i32gather_ps(base, indices.raw, kScale)};
3177
3814
  }
3178
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3179
- HWY_API Vec256<float> GatherIndex(D /* tag */, const float* HWY_RESTRICT base,
3180
- Vec256<int32_t> index) {
3181
- return Vec256<float>{_mm256_i32gather_ps(base, index.raw, 4)};
3815
+
3816
+ template <int kScale>
3817
+ HWY_API Vec256<double> NativeGather256(const double* HWY_RESTRICT base,
3818
+ Vec256<int64_t> indices) {
3819
+ return Vec256<double>{_mm256_i64gather_pd(base, indices.raw, kScale)};
3182
3820
  }
3183
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3184
- HWY_API Vec256<double> GatherOffset(D /* tag */,
3185
- const double* HWY_RESTRICT base,
3186
- Vec256<int64_t> offset) {
3187
- return Vec256<double>{_mm256_i64gather_pd(base, offset.raw, 1)};
3821
+
3822
+ } // namespace detail
3823
+
3824
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3825
+ HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
3826
+ VFromD<RebindToSigned<D>> offsets) {
3827
+ return detail::NativeGather256<1>(base, offsets);
3188
3828
  }
3189
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3190
- HWY_API Vec256<double> GatherIndex(D /* tag */, const double* HWY_RESTRICT base,
3191
- Vec256<int64_t> index) {
3192
- return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
3829
+
3830
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3831
+ HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
3832
+ VFromD<RebindToSigned<D>> indices) {
3833
+ return detail::NativeGather256<sizeof(TFromD<D>)>(base, indices);
3193
3834
  }
3194
3835
 
3195
- // ------------------------------ MaskedGatherIndex
3836
+ // ------------------------------ MaskedGatherIndexOr
3196
3837
 
3197
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
3198
- HWY_INLINE VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
3199
- const TFromD<D>* HWY_RESTRICT base,
3200
- Vec256<int32_t> index) {
3838
+ namespace detail {
3839
+
3840
+ template <int kScale, typename T, HWY_IF_UI32(T)>
3841
+ HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
3842
+ const T* HWY_RESTRICT base,
3843
+ Vec256<int32_t> indices) {
3201
3844
  #if HWY_TARGET <= HWY_AVX3
3202
- return VFromD<D>{
3203
- _mm256_mmask_i32gather_epi32(Zero(d).raw, m.raw, index.raw,
3204
- reinterpret_cast<const int32_t*>(base), 4)};
3845
+ return Vec256<T>{_mm256_mmask_i32gather_epi32(
3846
+ no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
3847
+ kScale)};
3205
3848
  #else
3206
- return VFromD<D>{_mm256_mask_i32gather_epi32(
3207
- Zero(d).raw, reinterpret_cast<const int32_t*>(base), index.raw, m.raw,
3208
- 4)};
3849
+ return Vec256<T>{_mm256_mask_i32gather_epi32(
3850
+ no.raw, reinterpret_cast<const int32_t*>(base), indices.raw, m.raw,
3851
+ kScale)};
3209
3852
  #endif
3210
3853
  }
3211
3854
 
3212
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
3213
- HWY_INLINE VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
3214
- const TFromD<D>* HWY_RESTRICT base,
3215
- Vec256<int64_t> index) {
3855
+ template <int kScale, typename T, HWY_IF_UI64(T)>
3856
+ HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
3857
+ const T* HWY_RESTRICT base,
3858
+ Vec256<int64_t> indices) {
3216
3859
  #if HWY_TARGET <= HWY_AVX3
3217
- return VFromD<D>{_mm256_mmask_i64gather_epi64(
3218
- Zero(d).raw, m.raw, index.raw,
3219
- reinterpret_cast<const GatherIndex64*>(base), 8)};
3860
+ return Vec256<T>{_mm256_mmask_i64gather_epi64(
3861
+ no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
3862
+ kScale)};
3220
3863
  #else
3221
3864
  // For reasons unknown, _mm256_mask_i64gather_epi64 returns all-zeros.
3222
- const RebindToFloat<D> df;
3223
- return BitCast(d, Vec256<double>{_mm256_mask_i64gather_pd(
3224
- Zero(df).raw, reinterpret_cast<const double*>(base),
3225
- index.raw, RebindMask(df, m).raw, 8)});
3865
+ const Full256<T> d;
3866
+ const Full256<double> dd;
3867
+ return BitCast(d,
3868
+ Vec256<double>{_mm256_mask_i64gather_pd(
3869
+ BitCast(dd, no).raw, reinterpret_cast<const double*>(base),
3870
+ indices.raw, RebindMask(dd, m).raw, kScale)});
3226
3871
  #endif
3227
3872
  }
3228
3873
 
3229
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3230
- HWY_API Vec256<float> MaskedGatherIndex(MFromD<D> m, D d,
3231
- const float* HWY_RESTRICT base,
3232
- Vec256<int32_t> index) {
3874
+ template <int kScale>
3875
+ HWY_API Vec256<float> NativeMaskedGatherOr256(Vec256<float> no,
3876
+ Mask256<float> m,
3877
+ const float* HWY_RESTRICT base,
3878
+ Vec256<int32_t> indices) {
3233
3879
  #if HWY_TARGET <= HWY_AVX3
3234
3880
  return Vec256<float>{
3235
- _mm256_mmask_i32gather_ps(Zero(d).raw, m.raw, index.raw, base, 4)};
3881
+ _mm256_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
3236
3882
  #else
3237
3883
  return Vec256<float>{
3238
- _mm256_mask_i32gather_ps(Zero(d).raw, base, index.raw, m.raw, 4)};
3884
+ _mm256_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
3239
3885
  #endif
3240
3886
  }
3241
3887
 
3242
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
3243
- HWY_API Vec256<double> MaskedGatherIndex(MFromD<D> m, D d,
3244
- const double* HWY_RESTRICT base,
3245
- Vec256<int64_t> index) {
3888
+ template <int kScale>
3889
+ HWY_API Vec256<double> NativeMaskedGatherOr256(Vec256<double> no,
3890
+ Mask256<double> m,
3891
+ const double* HWY_RESTRICT base,
3892
+ Vec256<int64_t> indices) {
3246
3893
  #if HWY_TARGET <= HWY_AVX3
3247
3894
  return Vec256<double>{
3248
- _mm256_mmask_i64gather_pd(Zero(d).raw, m.raw, index.raw, base, 8)};
3895
+ _mm256_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
3249
3896
  #else
3250
3897
  return Vec256<double>{
3251
- _mm256_mask_i64gather_pd(Zero(d).raw, base, index.raw, m.raw, 8)};
3898
+ _mm256_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
3252
3899
  #endif
3253
3900
  }
3254
3901
 
3902
+ } // namespace detail
3903
+
3904
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
3905
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
3906
+ const TFromD<D>* HWY_RESTRICT base,
3907
+ VFromD<RebindToSigned<D>> indices) {
3908
+ return detail::NativeMaskedGatherOr256<sizeof(TFromD<D>)>(no, m, base,
3909
+ indices);
3910
+ }
3911
+
3255
3912
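GatherIndex/GatherOffset now forward to detail::NativeGather256 with the scale as a template parameter, and MaskedGatherIndex is superseded by MaskedGatherIndexOr, which takes the value returned in unselected lanes. A minimal sketch, assuming int32 indices; `base` and `index_ptr` are illustrative names:

  const hn::Full256<float> d;
  const hn::RebindToSigned<decltype(d)> di;
  const auto indices = hn::LoadU(di, index_ptr);  // int32 indices into `base`
  const auto m = hn::FirstN(d, 5);                // gather only the first 5 lanes
  // Selected lanes load base[indices[i]]; unselected lanes receive -1.0f.
  const auto v = hn::MaskedGatherIndexOr(hn::Set(d, -1.0f), m, d, base, indices);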
  HWY_DIAGNOSTICS(pop)
3256
3913
 
3257
3914
  // ================================================== SWIZZLE
@@ -3294,7 +3951,7 @@ HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
3294
3951
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
3295
3952
  HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
3296
3953
  const RebindToUnsigned<decltype(d)> du; // for float16_t
3297
- const Twice<decltype(d)> dut;
3954
+ const Twice<decltype(du)> dut;
3298
3955
  return BitCast(d, VFromD<decltype(du)>{
3299
3956
  _mm256_extracti128_si256(BitCast(dut, v).raw, 1)});
3300
3957
  }
@@ -3375,22 +4032,16 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
3375
4032
  HWY_API VFromD<D> ZeroExtendVector(D /* tag */, VFromD<Half<D>> lo) {
3376
4033
  #if HWY_HAVE_ZEXT
3377
4034
  return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
4035
+ #elif HWY_COMPILER_MSVC
4036
+ // Workaround: _mm256_inserti128_si256 does not actually zero the hi part.
4037
+ return VFromD<D>{_mm256_set_m128i(_mm_setzero_si128(), lo.raw)};
3378
4038
  #else
3379
4039
  return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
3380
4040
  #endif
3381
4041
  }
3382
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
3383
- HWY_API Vec256<bfloat16_t> ZeroExtendVector(D d, Vec128<bfloat16_t> lo) {
3384
- (void)d;
3385
- #if HWY_HAVE_ZEXT
3386
- return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
3387
- #else
3388
- return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
3389
- #endif // HWY_HAVE_ZEXT
3390
- }
4042
+ #if HWY_HAVE_FLOAT16
3391
4043
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
3392
4044
  HWY_API Vec256<float16_t> ZeroExtendVector(D d, Vec128<float16_t> lo) {
3393
- #if HWY_HAVE_FLOAT16
3394
4045
  #if HWY_HAVE_ZEXT
3395
4046
  (void)d;
3396
4047
  return Vec256<float16_t>{_mm256_zextph128_ph256(lo.raw)};
@@ -3398,15 +4049,8 @@ HWY_API Vec256<float16_t> ZeroExtendVector(D d, Vec128<float16_t> lo) {
3398
4049
  const RebindToUnsigned<D> du;
3399
4050
  return BitCast(d, ZeroExtendVector(du, BitCast(du, lo)));
3400
4051
  #endif // HWY_HAVE_ZEXT
3401
- #else
3402
- (void)d;
3403
- #if HWY_HAVE_ZEXT
3404
- return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
3405
- #else
3406
- return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
3407
- #endif // HWY_HAVE_ZEXT
3408
- #endif // HWY_HAVE_FLOAT16
3409
4052
  }
4053
+ #endif // HWY_HAVE_FLOAT16
3410
4054
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3411
4055
  HWY_API Vec256<float> ZeroExtendVector(D /* tag */, Vec128<float> lo) {
3412
4056
  #if HWY_HAVE_ZEXT
@@ -3443,8 +4087,11 @@ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
3443
4087
 
3444
4088
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
3445
4089
  HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
3446
- const auto lo256 = ZeroExtendVector(d, lo);
3447
- return VFromD<D>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
4090
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4091
+ const Half<decltype(du)> dh_u;
4092
+ const auto lo256 = ZeroExtendVector(du, BitCast(dh_u, lo));
4093
+ return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
4094
+ lo256.raw, BitCast(dh_u, hi).raw, 1)});
3448
4095
  }
3449
4096
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
3450
4097
  HWY_API Vec256<float> Combine(D d, Vec128<float> hi, Vec128<float> lo) {
@@ -3547,8 +4194,12 @@ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3547
4194
  template <class T, HWY_IF_T_SIZE(T, 2)>
3548
4195
  HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3549
4196
  Vec256<T> v) {
3550
- const Half<DFromV<decltype(v)>> dh;
3551
- return Vec256<T>{_mm256_broadcastw_epi16(LowerHalf(dh, v).raw)};
4197
+ const DFromV<decltype(v)> d;
4198
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4199
+ const Half<decltype(d)> dh;
4200
+ const RebindToUnsigned<decltype(dh)> dh_u;
4201
+ return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastw_epi16(
4202
+ BitCast(dh_u, LowerHalf(dh, v)).raw)});
3552
4203
  }
3553
4204
 
3554
4205
  template <class T, HWY_IF_UI32(T)>
@@ -3983,7 +4634,10 @@ HWY_API Vec256<double> TwoTablesLookupLanes(Vec256<double> a, Vec256<double> b,
3983
4634
 
3984
4635
  template <typename T>
3985
4636
  HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
3986
- return Vec256<T>{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
4637
+ const DFromV<decltype(v)> d;
4638
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4639
+ return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
4640
+ BitCast(du, v).raw, _MM_SHUFFLE(1, 0, 3, 2))});
3987
4641
  }
3988
4642
 
3989
4643
  HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
@@ -4022,9 +4676,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4022
4676
  _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4023
4677
  #else
4024
4678
  const RebindToSigned<decltype(d)> di;
4025
- alignas(16) static constexpr int16_t kShuffle[8] = {
4026
- 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100};
4027
- const auto rev128 = TableLookupBytes(v, LoadDup128(di, kShuffle));
4679
+ const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
4680
+ di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
4681
+ const auto rev128 = TableLookupBytes(v, shuffle);
4028
4682
  return VFromD<D>{
4029
4683
  _mm256_permute4x64_epi64(rev128.raw, _MM_SHUFFLE(1, 0, 3, 2))};
4030
4684
  #endif
@@ -4053,9 +4707,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
4053
4707
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4054
4708
  HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
4055
4709
  const RebindToSigned<decltype(d)> di;
4056
- alignas(16) static constexpr int16_t kShuffle[8] = {
4057
- 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908};
4058
- return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle)));
4710
+ const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
4711
+ di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
4712
+ return BitCast(d, TableLookupBytes(v, shuffle));
4059
4713
  }
4060
4714
 
4061
4715
  // 32 bit Reverse4 defined in x86_128.
@@ -4071,9 +4725,9 @@ HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
4071
4725
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4072
4726
  HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
4073
4727
  const RebindToSigned<decltype(d)> di;
4074
- alignas(16) static constexpr int16_t kShuffle[8] = {
4075
- 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100};
4076
- return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle)));
4728
+ const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
4729
+ di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
4730
+ return BitCast(d, TableLookupBytes(v, shuffle));
4077
4731
  }
4078
4732
 
4079
4733
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
@@ -4162,8 +4816,12 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
4162
4816
  // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
4163
4817
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4164
4818
  HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
4819
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4165
4820
  const Half<decltype(d)> d2;
4166
- return VFromD<D>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
4821
+ const RebindToUnsigned<decltype(d2)> du2; // for float16_t
4822
+ return BitCast(
4823
+ d, VFromD<decltype(du)>{_mm256_inserti128_si256(
4824
+ BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)});
4167
4825
  }
4168
4826
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4169
4827
  HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi,
@@ -4180,8 +4838,10 @@ HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi,
4180
4838
 
4181
4839
  // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
4182
4840
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4183
- HWY_API VFromD<D> ConcatLowerUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
4184
- return VFromD<D>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
4841
+ HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4842
+ const RebindToUnsigned<decltype(d)> du;
4843
+ return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
4844
+ BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)});
4185
4845
  }
4186
4846
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4187
4847
  HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi,
@@ -4196,8 +4856,10 @@ HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi,
4196
4856
 
4197
4857
  // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
4198
4858
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4199
- HWY_API VFromD<D> ConcatUpperLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
4200
- return VFromD<D>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
4859
+ HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
4860
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4861
+ return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
4862
+ BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)});
4201
4863
  }
4202
4864
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4203
4865
  HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi,
@@ -4212,8 +4874,10 @@ HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi,
4212
4874
 
4213
4875
  // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
4214
4876
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
4215
- HWY_API VFromD<D> ConcatUpperUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
4216
- return VFromD<D>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
4877
+ HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4878
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4879
+ return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
4880
+ BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)});
4217
4881
  }
4218
4882
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4219
4883
  HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi,
@@ -4274,7 +4938,8 @@ HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
4274
4938
  const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
4275
4939
  const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
4276
4940
  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
4277
- return VFromD<D>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
4941
+ return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
4942
+ u16, _MM_SHUFFLE(3, 1, 2, 0))});
4278
4943
  #endif
4279
4944
  }
4280
4945
 
@@ -4365,90 +5030,211 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4365
5030
  }
4366
5031
 
4367
5032
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
4368
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5033
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5034
+ const RebindToUnsigned<decltype(d)> du;
5035
+ #if HWY_TARGET <= HWY_AVX3
5036
+ alignas(64) static constexpr uint16_t kIdx[16] = {
5037
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
5038
+ return BitCast(
5039
+ d, Vec256<uint32_t>{_mm256_permutex2var_epi16(
5040
+ BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
5041
+ #else
5042
+ const RepartitionToWide<decltype(du)> dw;
5043
+ // Isolate lower 16 bits per u32 so we can pack.
5044
+ const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
5045
+ const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
5046
+ const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
5047
+ const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
5048
+ return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
5049
+ u16, _MM_SHUFFLE(3, 1, 2, 0))});
5050
+ #endif
5051
+ }
5052
+
5053
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
5054
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5055
+ const RebindToUnsigned<decltype(d)> du;
5056
+ #if HWY_TARGET <= HWY_AVX3
5057
+ alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
5058
+ return BitCast(
5059
+ d, Vec256<uint32_t>{_mm256_permutex2var_epi32(
5060
+ BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
5061
+ #else
5062
+ const RebindToFloat<decltype(d)> df;
5063
+ const Vec256<float> v2020{_mm256_shuffle_ps(
5064
+ BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
5065
+ return VFromD<D>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
5066
+ _MM_SHUFFLE(3, 1, 2, 0))};
5067
+
5068
+ #endif
5069
+ }
5070
+
5071
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5072
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5073
+ const RebindToUnsigned<decltype(d)> du;
5074
+ #if HWY_TARGET <= HWY_AVX3
5075
+ alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
5076
+ return VFromD<D>{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
5077
+ #else
5078
+ const VFromD<D> v2020{
5079
+ _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
5080
+ return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
5081
+ BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
5082
+
5083
+ #endif
5084
+ }
5085
+
5086
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
5087
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5088
+ const RebindToUnsigned<decltype(d)> du;
5089
+ #if HWY_TARGET <= HWY_AVX3
5090
+ alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
5091
+ return BitCast(
5092
+ d, Vec256<uint64_t>{_mm256_permutex2var_epi64(
5093
+ BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
5094
+ #else
5095
+ const RebindToFloat<decltype(d)> df;
5096
+ const Vec256<double> v20{
5097
+ _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
5098
+ return VFromD<D>{
5099
+ _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
5100
+
5101
+ #endif
5102
+ }
5103
+
5104
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
5105
+ HWY_API Vec256<double> ConcatEven(D d, Vec256<double> hi, Vec256<double> lo) {
5106
+ #if HWY_TARGET <= HWY_AVX3
5107
+ const RebindToUnsigned<decltype(d)> du;
5108
+ alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
5109
+ return Vec256<double>{
5110
+ _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
5111
+ #else
5112
+ (void)d;
5113
+ const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
5114
+ return Vec256<double>{
5115
+ _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
5116
+ #endif
5117
+ }
5118
+
5119
+ // ------------------------------ InterleaveWholeLower
5120
+
5121
+ #if HWY_TARGET <= HWY_AVX3
5122
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5123
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5124
+ #if HWY_TARGET <= HWY_AVX3_DL
5125
+ const RebindToUnsigned<decltype(d)> du;
5126
+ alignas(32) static constexpr uint8_t kIdx[32] = {
5127
+ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
5128
+ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
5129
+ return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
5130
+ #else
5131
+ return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
5132
+ #endif
5133
+ }
5134
+
5135
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5136
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5137
+ const RebindToUnsigned<decltype(d)> du;
5138
+ alignas(32) static constexpr uint16_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
5139
+ 4, 20, 5, 21, 6, 22, 7, 23};
5140
+ return BitCast(
5141
+ d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
5142
+ BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
5143
+ }
5144
+
5145
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
5146
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5147
+ const RebindToUnsigned<decltype(d)> du;
5148
+ alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5149
+ return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
5150
+ }
5151
+
5152
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
5153
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5154
+ const RebindToUnsigned<decltype(d)> du;
5155
+ alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5156
+ return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
5157
+ }
5158
+
5159
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
5160
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5161
+ const RebindToUnsigned<decltype(d)> du;
5162
+ alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
5163
+ return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
5164
+ }
5165
+
5166
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
5167
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5168
+ const RebindToUnsigned<decltype(d)> du;
5169
+ alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
5170
+ return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
5171
+ }
5172
+ #else // AVX2
5173
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
5174
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
5175
+ return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
5176
+ }
5177
+ #endif
5178
+
5179
+ // ------------------------------ InterleaveWholeUpper
5180
+
5181
+ #if HWY_TARGET <= HWY_AVX3
5182
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
5183
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
5184
+ #if HWY_TARGET <= HWY_AVX3_DL
5185
+ const RebindToUnsigned<decltype(d)> du;
5186
+ alignas(32) static constexpr uint8_t kIdx[32] = {
5187
+ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
5188
+ 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
5189
+ return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
5190
+ #else
5191
+ return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
5192
+ #endif
5193
+ }
5194
+
5195
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
5196
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4369
5197
  const RebindToUnsigned<decltype(d)> du;
4370
- #if HWY_TARGET <= HWY_AVX3
4371
- alignas(64) static constexpr uint16_t kIdx[16] = {
4372
- 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
5198
+ alignas(32) static constexpr uint16_t kIdx[16] = {
5199
+ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
4373
5200
  return BitCast(
4374
- d, Vec256<uint32_t>{_mm256_permutex2var_epi16(
4375
- BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4376
- #else
4377
- const RepartitionToWide<decltype(du)> dw;
4378
- // Isolate lower 16 bits per u32 so we can pack.
4379
- const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
4380
- const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
4381
- const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
4382
- const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
4383
- return VFromD<D>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
4384
- #endif
5201
+ d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
5202
+ BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
4385
5203
  }
4386
5204
 
4387
5205
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
4388
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5206
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4389
5207
  const RebindToUnsigned<decltype(d)> du;
4390
- #if HWY_TARGET <= HWY_AVX3
4391
- alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
4392
- return BitCast(
4393
- d, Vec256<uint32_t>{_mm256_permutex2var_epi32(
4394
- BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4395
- #else
4396
- const RebindToFloat<decltype(d)> df;
4397
- const Vec256<float> v2020{_mm256_shuffle_ps(
4398
- BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
4399
- return VFromD<D>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
4400
- _MM_SHUFFLE(3, 1, 2, 0))};
4401
-
4402
- #endif
5208
+ alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
5209
+ return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
4403
5210
  }
4404
5211
 
4405
5212
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
4406
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5213
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4407
5214
  const RebindToUnsigned<decltype(d)> du;
4408
- #if HWY_TARGET <= HWY_AVX3
4409
- alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
4410
- return VFromD<D>{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
4411
- #else
4412
- const VFromD<D> v2020{
4413
- _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
4414
- return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
4415
- BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
4416
-
4417
- #endif
5215
+ alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
5216
+ return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
4418
5217
  }
4419
5218
 
4420
5219
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
4421
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
5220
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4422
5221
  const RebindToUnsigned<decltype(d)> du;
4423
- #if HWY_TARGET <= HWY_AVX3
4424
- alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
4425
- return BitCast(
4426
- d, Vec256<uint64_t>{_mm256_permutex2var_epi64(
4427
- BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4428
- #else
4429
- const RebindToFloat<decltype(d)> df;
4430
- const Vec256<double> v20{
4431
- _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
4432
- return VFromD<D>{
4433
- _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
4434
-
4435
- #endif
5222
+ alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
5223
+ return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
4436
5224
  }
4437
5225
 
4438
5226
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
4439
- HWY_API Vec256<double> ConcatEven(D d, Vec256<double> hi, Vec256<double> lo) {
4440
- #if HWY_TARGET <= HWY_AVX3
5227
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4441
5228
  const RebindToUnsigned<decltype(d)> du;
4442
- alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
4443
- return Vec256<double>{
4444
- _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
4445
- #else
4446
- (void)d;
4447
- const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
4448
- return Vec256<double>{
4449
- _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
4450
- #endif
5229
+ alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
5230
+ return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
5231
+ }
5232
+ #else // AVX2
5233
+ template <class D, HWY_IF_V_SIZE_D(D, 32)>
5234
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
5235
+ return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
4451
5236
  }
5237
+ #endif
4452
5238
 
4453
5239
  // ------------------------------ DupEven (InterleaveLower)
4454
5240
 
@@ -4490,9 +5276,10 @@ template <typename T, HWY_IF_T_SIZE(T, 1)>
4490
5276
  HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
4491
5277
  const DFromV<decltype(a)> d;
4492
5278
  const Full256<uint8_t> d8;
4493
- alignas(32) static constexpr uint8_t mask[16] = {
4494
- 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
4495
- return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a);
5279
+ const VFromD<decltype(d8)> mask =
5280
+ Dup128VecFromValues(d8, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF,
5281
+ 0, 0xFF, 0, 0xFF, 0);
5282
+ return IfThenElse(MaskFromVec(BitCast(d, mask)), b, a);
4496
5283
  }
4497
5284
 
4498
5285
  template <typename T, HWY_IF_UI16(T)>
@@ -4505,7 +5292,8 @@ HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
4505
5292
 
4506
5293
  #if HWY_HAVE_FLOAT16
4507
5294
  HWY_INLINE Vec256<float16_t> OddEven(Vec256<float16_t> a, Vec256<float16_t> b) {
4508
- return Vec256<float16_t>{_mm256_mask_blend_ph(a.raw, b.raw, 0x55)};
5295
+ return Vec256<float16_t>{
5296
+ _mm256_mask_blend_ph(static_cast<__mmask16>(0x5555), a.raw, b.raw)};
4509
5297
  }
4510
5298
  #endif // HWY_HAVE_FLOAT16
4511
5299
 
@@ -4527,11 +5315,80 @@ HWY_API Vec256<double> OddEven(Vec256<double> a, Vec256<double> b) {
4527
5315
  return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
4528
5316
  }
4529
5317
 
5318
+ // -------------------------- InterleaveEven
5319
+
5320
+ #if HWY_TARGET <= HWY_AVX3
5321
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
5322
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
5323
+ return VFromD<D>{_mm256_mask_shuffle_epi32(
5324
+ a.raw, static_cast<__mmask8>(0xAA), b.raw,
5325
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
5326
+ }
5327
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
5328
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
5329
+ return VFromD<D>{_mm256_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0xAA),
5330
+ b.raw, b.raw,
5331
+ _MM_SHUFFLE(2, 2, 0, 0))};
5332
+ }
5333
+ #else
5334
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
5335
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
5336
+ const RebindToFloat<decltype(d)> df;
5337
+ const VFromD<decltype(df)> b2_b0_a2_a0{_mm256_shuffle_ps(
5338
+ BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(2, 0, 2, 0))};
5339
+ return BitCast(
5340
+ d, VFromD<decltype(df)>{_mm256_shuffle_ps(
5341
+ b2_b0_a2_a0.raw, b2_b0_a2_a0.raw, _MM_SHUFFLE(3, 1, 2, 0))});
5342
+ }
5343
+ #endif
5344
+
5345
+ // I64/U64/F64 InterleaveEven is generic for vector lengths >= 32 bytes
5346
+ template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
5347
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
5348
+ return InterleaveLower(a, b);
5349
+ }
5350
+
5351
+ // -------------------------- InterleaveOdd
5352
+
5353
+ #if HWY_TARGET <= HWY_AVX3
5354
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
5355
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
5356
+ return VFromD<D>{_mm256_mask_shuffle_epi32(
5357
+ b.raw, static_cast<__mmask8>(0x55), a.raw,
5358
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
5359
+ }
5360
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
5361
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
5362
+ return VFromD<D>{_mm256_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x55),
5363
+ a.raw, a.raw,
5364
+ _MM_SHUFFLE(3, 3, 1, 1))};
5365
+ }
5366
+ #else
5367
+ template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
5368
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
5369
+ const RebindToFloat<decltype(d)> df;
5370
+ const VFromD<decltype(df)> b3_b1_a3_a3{_mm256_shuffle_ps(
5371
+ BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(3, 1, 3, 1))};
5372
+ return BitCast(
5373
+ d, VFromD<decltype(df)>{_mm256_shuffle_ps(
5374
+ b3_b1_a3_a3.raw, b3_b1_a3_a3.raw, _MM_SHUFFLE(3, 1, 2, 0))});
5375
+ }
5376
+ #endif
5377
+
5378
+ // I64/U64/F64 InterleaveOdd is generic for vector lengths >= 32 bytes
5379
+ template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
5380
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
5381
+ return InterleaveUpper(d, a, b);
5382
+ }
5383
+
4530
5384
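InterleaveEven/InterleaveOdd added above interleave the even (resp. odd) lanes of the two inputs; with 8 x 32-bit lanes, a = {a0..a7} and b = {b0..b7}:

  InterleaveEven(d, a, b) == {a0, b0, a2, b2, a4, b4, a6, b6}
  InterleaveOdd(d, a, b)  == {a1, b1, a3, b3, a5, b5, a7, b7}

For 64-bit lanes these reduce to InterleaveLower/InterleaveUpper, as the generic overloads show.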
  // ------------------------------ OddEvenBlocks
4531
5385
 
4532
5386
  template <typename T, HWY_IF_NOT_FLOAT3264(T)>
4533
5387
  Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
4534
- return Vec256<T>{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)};
5388
+ const DFromV<decltype(odd)> d;
5389
+ const RebindToUnsigned<decltype(d)> du;
5390
+ return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
5391
+ BitCast(du, odd).raw, BitCast(du, even).raw, 0xFu)});
4535
5392
  }
4536
5393
 
4537
5394
  HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) {
@@ -4554,7 +5411,10 @@ HWY_API VFromD<D> ReverseBlocks(D /*d*/, VFromD<D> v) {
4554
5411
  // Both full
4555
5412
  template <typename T, typename TI>
4556
5413
  HWY_API Vec256<TI> TableLookupBytes(Vec256<T> bytes, Vec256<TI> from) {
4557
- return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
5414
+ const DFromV<decltype(from)> d;
5415
+ return BitCast(d, Vec256<uint8_t>{_mm256_shuffle_epi8(
5416
+ BitCast(Full256<uint8_t>(), bytes).raw,
5417
+ BitCast(Full256<uint8_t>(), from).raw)});
4558
5418
  }
4559
5419
 
4560
5420
  // Partial index vector
@@ -5114,14 +5974,15 @@ HWY_API Vec256<uint8_t> Shl(hwy::UnsignedTag tag, Vec256<uint8_t> v,
5114
5974
  const DFromV<decltype(v)> d;
5115
5975
  #if HWY_TARGET <= HWY_AVX3_DL
5116
5976
  (void)tag;
5117
- // kMask[i] = 0xFF >> i
5118
- alignas(16) static constexpr uint8_t kMasks[16] = {
5119
- 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00};
5977
+ // masks[i] = 0xFF >> i
5978
+ const VFromD<decltype(d)> masks =
5979
+ Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
5980
+ 0, 0, 0, 0, 0, 0, 0);
5120
5981
  // kShl[i] = 1 << i
5121
- alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10,
5122
- 0x20, 0x40, 0x80, 0x00};
5123
- v = And(v, TableLookupBytes(LoadDup128(d, kMasks), bits));
5124
- const VFromD<decltype(d)> mul = TableLookupBytes(LoadDup128(d, kShl), bits);
5982
+ const VFromD<decltype(d)> shl = Dup128VecFromValues(
5983
+ d, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
5984
+ v = And(v, TableLookupBytes(masks, bits));
5985
+ const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
5125
5986
  return VFromD<decltype(d)>{_mm256_gf2p8mul_epi8(v.raw, mul.raw)};
5126
5987
  #else
5127
5988
  const Repartition<uint16_t, decltype(d)> dw;
@@ -5271,62 +6132,6 @@ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
5271
6132
  #endif
5272
6133
  }
5273
6134
 
5274
- HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
5275
- const Vec256<uint64_t> b) {
5276
- const Full256<uint64_t> du64;
5277
- const RepartitionToNarrow<decltype(du64)> du32;
5278
- const auto maskL = Set(du64, 0xFFFFFFFFULL);
5279
- const auto a32 = BitCast(du32, a);
5280
- const auto b32 = BitCast(du32, b);
5281
- // Inputs for MulEven: we only need the lower 32 bits
5282
- const auto aH = Shuffle2301(a32);
5283
- const auto bH = Shuffle2301(b32);
5284
-
5285
- // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
5286
- // the even (lower 64 bits of every 128-bit block) results. See
5287
- // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
5288
- const auto aLbL = MulEven(a32, b32);
5289
- const auto w3 = aLbL & maskL;
5290
-
5291
- const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
5292
- const auto w2 = t2 & maskL;
5293
- const auto w1 = ShiftRight<32>(t2);
5294
-
5295
- const auto t = MulEven(a32, bH) + w2;
5296
- const auto k = ShiftRight<32>(t);
5297
-
5298
- const auto mulH = MulEven(aH, bH) + w1 + k;
5299
- const auto mulL = ShiftLeft<32>(t) + w3;
5300
- return InterleaveLower(mulL, mulH);
5301
- }
5302
-
5303
- HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
5304
- const Vec256<uint64_t> b) {
5305
- const Full256<uint64_t> du64;
5306
- const RepartitionToNarrow<decltype(du64)> du32;
5307
- const auto maskL = Set(du64, 0xFFFFFFFFULL);
5308
- const auto a32 = BitCast(du32, a);
5309
- const auto b32 = BitCast(du32, b);
5310
- // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
5311
- const auto aH = Shuffle2301(a32);
5312
- const auto bH = Shuffle2301(b32);
5313
-
5314
- // Same as above, but we're using the odd results (upper 64 bits per block).
5315
- const auto aLbL = MulEven(a32, b32);
5316
- const auto w3 = aLbL & maskL;
5317
-
5318
- const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
5319
- const auto w2 = t2 & maskL;
5320
- const auto w1 = ShiftRight<32>(t2);
5321
-
5322
- const auto t = MulEven(a32, bH) + w2;
5323
- const auto k = ShiftRight<32>(t);
5324
-
5325
- const auto mulH = MulEven(aH, bH) + w1 + k;
5326
- const auto mulL = ShiftLeft<32>(t) + w3;
5327
- return InterleaveUpper(du64, mulL, mulH);
5328
- }
5329
-
5330
6135
  // ------------------------------ WidenMulPairwiseAdd
5331
6136
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
5332
6137
  HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
@@ -5343,7 +6148,31 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
5343
6148
  return VFromD<DI16>{_mm256_maddubs_epi16(a.raw, b.raw)};
5344
6149
  }
5345
6150
 
6151
+ // ------------------------------ SatWidenMulPairwiseAccumulate
6152
+
6153
+ #if HWY_TARGET <= HWY_AVX3_DL
6154
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 32)>
6155
+ HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
6156
+ DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
6157
+ VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
6158
+ return VFromD<DI32>{_mm256_dpwssds_epi32(sum.raw, a.raw, b.raw)};
6159
+ }
6160
+ #endif // HWY_TARGET <= HWY_AVX3_DL
6161
+
  // ------------------------------ ReorderWidenMulAccumulate
+
+ #if HWY_NATIVE_DOT_BF16
+ template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 32),
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+ HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
+ const VFromD<DF> sum0,
+ VFromD<DF>& /*sum1*/) {
+ return VFromD<DF>{_mm256_dpbf16_ps(sum0.raw,
+ reinterpret_cast<__m256bh>(a.raw),
+ reinterpret_cast<__m256bh>(b.raw))};
+ }
+ #endif // HWY_NATIVE_DOT_BF16
+
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
  HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec256<int16_t> a,
  Vec256<int16_t> b,
@@ -5461,22 +6290,91 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<int8_t> v) {

  #if HWY_TARGET <= HWY_AVX3
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
- HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
- const Rebind<float, decltype(di64)> df32;
- const RebindToFloat<decltype(di64)> df64;
- const RebindToSigned<decltype(df32)> di32;
+ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
+ // within the range of an int64_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+ typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+ return VFromD<D>{_mm256_setr_epi64x(
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]))};
+ }
+ #endif

- return detail::FixConversionOverflow(
- di64, BitCast(df64, PromoteTo(di64, BitCast(di32, v))),
- VFromD<D>{_mm256_cvttps_epi64(v.raw)});
+ __m256i raw_result;
+ __asm__("vcvttps2qq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else // !HWY_COMPILER_GCC_ACTUAL
+ return VFromD<D>{_mm256_cvttps_epi64(v.raw)};
+ #endif // HWY_COMPILER_GCC_ACTUAL
  }
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
- return VFromD<D>{
- _mm256_maskz_cvttps_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+ HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
+ // within the range of an uint64_t
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
+ typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+ return VFromD<D>{_mm256_setr_epi64x(
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvttps2uqq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else // !HWY_COMPILER_GCC_ACTUAL
+ return VFromD<D>{_mm256_cvttps_epu64(v.raw)};
+ #endif // HWY_COMPILER_GCC_ACTUAL
  }
  #endif // HWY_TARGET <= HWY_AVX3
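Illustrative note, not part of the package diff: the new PromoteInRangeTo overloads assume every lane already fits the destination type; out-of-range lanes are unspecified, which is why the GCC path above routes around the intrinsic's undefined behavior. A minimal sketch, assuming the conventional namespace alias hn = hwy::HWY_NAMESPACE:

    namespace hn = hwy::HWY_NAMESPACE;
    const hn::ScalableTag<int64_t> di64;
    const hn::Rebind<float, decltype(di64)> df32;
    const auto v = hn::Set(df32, 1.5e9f);                 // representable as int64_t
    const auto promoted = hn::PromoteInRangeTo(di64, v);  // every lane == 1500000000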

+ // ------------------------------ PromoteEvenTo/PromoteOddTo
+ #if HWY_TARGET > HWY_AVX3
+ namespace detail {
+
+ // I32->I64 PromoteEvenTo/PromoteOddTo
+
+ template <class D, HWY_IF_LANES_D(D, 4)>
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::SignedTag /*from_type_tag*/, D d_to,
+ Vec256<int32_t> v) {
+ return BitCast(d_to, OddEven(DupEven(BroadcastSignBit(v)), v));
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 4)>
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
+ hwy::SignedTag /*from_type_tag*/, D d_to,
+ Vec256<int32_t> v) {
+ return BitCast(d_to, OddEven(BroadcastSignBit(v), DupOdd(v)));
+ }
+
+ } // namespace detail
+ #endif
+
  // ------------------------------ Demotions (full -> part w/ narrow lanes)

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
@@ -5565,32 +6463,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
- const auto neg_mask = MaskFromVec(v);
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
- #else
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
- #endif
+ const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
  return VFromD<D>{_mm256_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
  }
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
- const auto neg_mask = MaskFromVec(v);
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
- #else
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
- #endif
+ const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
  return VFromD<D>{_mm256_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
  }
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
- const auto neg_mask = MaskFromVec(v);
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
- #else
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
- #endif
+ const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
  return VFromD<D>{_mm256_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
  }

@@ -5617,32 +6500,54 @@ HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
  HWY_API VFromD<D> DemoteTo(D df16, Vec256<float> v) {
- (void)df16;
- return VFromD<D>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
+ const RebindToUnsigned<decltype(df16)> du16;
+ return BitCast(
+ df16, VFromD<decltype(du16)>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
  }

  HWY_DIAGNOSTICS(pop)

  #endif // HWY_DISABLE_F16C

+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+ HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec256<double> v) {
+ return VFromD<D>{_mm256_cvtpd_ph(v.raw)};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ #if HWY_AVX3_HAVE_F32_TO_BF16C
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
- HWY_API VFromD<D> DemoteTo(D dbf16, Vec256<float> v) {
- // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
- const Rebind<int32_t, decltype(dbf16)> di32;
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
- const Rebind<uint16_t, decltype(dbf16)> du16;
- const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
- return BitCast(dbf16, DemoteTo(du16, bits_in_32));
+ HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec256<float> v) {
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
+ // Inline assembly workaround for LLVM codegen bug
+ __m128i raw_result;
+ __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
+ return VFromD<D>{raw_result};
+ #else
+ // The _mm256_cvtneps_pbh intrinsic returns a __m128bh vector that needs to be
+ // bit casted to a __m128i vector
+ return VFromD<D>{detail::BitCastToInteger(_mm256_cvtneps_pbh(v.raw))};
+ #endif
  }

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec256<float> a, Vec256<float> b) {
- // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16.
- const RebindToUnsigned<decltype(dbf16)> du16;
- const Repartition<uint32_t, decltype(dbf16)> du32;
- const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
+ HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec256<float> a,
+ Vec256<float> b) {
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
+ // Inline assembly workaround for LLVM codegen bug
+ __m256i raw_result;
+ __asm__("vcvtne2ps2bf16 %2, %1, %0"
+ : "=v"(raw_result)
+ : "v"(b.raw), "v"(a.raw));
+ return VFromD<D>{raw_result};
+ #else
+ // The _mm256_cvtne2ps_pbh intrinsic returns a __m256bh vector that needs to
+ // be bit casted to a __m256i vector
+ return VFromD<D>{detail::BitCastToInteger(_mm256_cvtne2ps_pbh(b.raw, a.raw))};
+ #endif
  }
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
  HWY_API VFromD<D> ReorderDemote2To(D /*d16*/, Vec256<int32_t> a,
@@ -5733,9 +6638,9 @@ HWY_API Vec256<uint32_t> ReorderDemote2To(D dn, Vec256<int64_t> a,
  _MM_SHUFFLE(2, 0, 2, 0))});
  }

- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
- HWY_API Vec256<uint32_t> ReorderDemote2To(D dn, Vec256<uint64_t> a,
- Vec256<uint64_t> b) {
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> ReorderDemote2To(D dn, Vec256<uint64_t> a,
+ Vec256<uint64_t> b) {
  const Half<decltype(dn)> dnh;
  const Repartition<float, decltype(dn)> dn_f;

@@ -5767,37 +6672,64 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
  }

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
- HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
- const Full256<double> d64;
- const auto clamped = detail::ClampF64ToI32Max(d64, v);
- return VFromD<D>{_mm256_cvttpd_epi32(clamped.raw)};
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvttpd_epi32 with GCC if any
+ // values of v[i] are not within the range of an int32_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+ return Dup128VecFromValues(
+ D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]));
+ }
+ #endif
+
+ __m128i raw_result;
+ __asm__("vcvttpd2dq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else
+ return VFromD<D>{_mm256_cvttpd_epi32(v.raw)};
+ #endif
  }

- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
- HWY_API VFromD<D> DemoteTo(D du32, Vec256<double> v) {
  #if HWY_TARGET <= HWY_AVX3
- (void)du32;
- return VFromD<D>{
- _mm256_maskz_cvttpd_epu32(_knot_mask8(MaskFromVec(v).raw), v.raw)};
- #else // AVX2
- const Rebind<double, decltype(du32)> df64;
- const RebindToUnsigned<decltype(df64)> du64;
-
- // Clamp v[i] to a value between 0 and 4294967295
- const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0));
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvttpd_epu32 with GCC if any
+ // values of v[i] are not within the range of an uint32_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+ return Dup128VecFromValues(
+ D(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]),
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2]),
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3]));
+ }
+ #endif

- const auto k2_31 = Set(df64, 2147483648.0);
- const auto clamped_is_ge_k2_31 = (clamped >= k2_31);
- const auto clamped_lo31_f64 =
- clamped - IfThenElseZero(clamped_is_ge_k2_31, k2_31);
- const VFromD<D> clamped_lo31_u32{_mm256_cvttpd_epi32(clamped_lo31_f64.raw)};
- const auto clamped_u32_msb = ShiftLeft<31>(
- TruncateTo(du32, BitCast(du64, VecFromMask(df64, clamped_is_ge_k2_31))));
- return Or(clamped_lo31_u32, clamped_u32_msb);
+ __m128i raw_result;
+ __asm__("vcvttpd2udq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else
+ return VFromD<D>{_mm256_cvttpd_epu32(v.raw)};
  #endif
  }

- #if HWY_TARGET <= HWY_AVX3
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
  return VFromD<D>{_mm256_cvtepi64_ps(v.raw)};
@@ -5963,61 +6895,274 @@ HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<uint64_t> v) {

  #if HWY_HAVE_FLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
- HWY_API VFromD<D> ConvertTo(D d, Vec256<float16_t> v) {
- return detail::FixConversionOverflow(d, v,
- VFromD<D>{_mm256_cvttph_epi16(v.raw)});
+ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float16_t> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvttph_epi16 with GCC if any
+ // values of v[i] are not within the range of an int16_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+ HWY_HAVE_SCALAR_F16_TYPE
+ if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
+ typedef hwy::float16_t::Native GccF16RawVectType
+ __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+ return VFromD<D>{_mm256_setr_epi16(
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[8]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[9]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[10]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[11]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[12]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[13]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[14]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[15]))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvttph2w {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else // HWY_COMPILER_GCC_ACTUAL < 1200
+ return VFromD<D>{_mm256_cvttph_epi16(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
+ HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvttph_epu16 with GCC if any
+ // values of v[i] are not within the range of an uint16_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+ HWY_HAVE_SCALAR_F16_TYPE
+ if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
+ typedef hwy::float16_t::Native GccF16RawVectType
+ __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+ return VFromD<D>{_mm256_setr_epi16(
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[8])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[9])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[10])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[11])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[12])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[13])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[14])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[15])))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvttph2uw {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else // HWY_COMPILER_GCC_ACTUAL < 1200
+ return VFromD<D>{_mm256_cvttph_epu16(v.raw)};
+ #endif
  }
  #endif // HWY_HAVE_FLOAT16

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
- HWY_API VFromD<D> ConvertTo(D d, Vec256<float> v) {
- return detail::FixConversionOverflow(d, v,
- VFromD<D>{_mm256_cvttps_epi32(v.raw)});
+ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvttps_epi32 with GCC if any
+ // values of v[i] are not within the range of an int32_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+ return VFromD<D>{_mm256_setr_epi32(
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvttps2dq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else
+ return VFromD<D>{_mm256_cvttps_epi32(v.raw)};
+ #endif
  }

  #if HWY_TARGET <= HWY_AVX3
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
- HWY_API VFromD<D> ConvertTo(D di, Vec256<double> v) {
- return detail::FixConversionOverflow(di, v,
- VFromD<D>{_mm256_cvttpd_epi64(v.raw)});
+ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec256<double> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvttpd_epi64 with GCC if any
+ // values of v[i] are not within the range of an int64_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+ return VFromD<D>{_mm256_setr_epi64x(
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvttpd2qq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else // !HWY_COMPILER_GCC_ACTUAL
+ return VFromD<D>{_mm256_cvttpd_epi64(v.raw)};
+ #endif // HWY_COMPILER_GCC_ACTUAL
  }
  template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
- return VFromD<DU>{
- _mm256_maskz_cvttps_epu32(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvttps_epu32 with GCC if any
+ // values of v[i] are not within the range of an uint32_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+ return VFromD<DU>{_mm256_setr_epi32(
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvttps2udq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<DU>{raw_result};
+ #else // !HWY_COMPILER_GCC_ACTUAL
+ return VFromD<DU>{_mm256_cvttps_epu32(v.raw)};
+ #endif // HWY_COMPILER_GCC_ACTUAL
  }
  template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
- return VFromD<DU>{
- _mm256_maskz_cvttpd_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
- }
- #else // AVX2
- template <class DU32, HWY_IF_V_SIZE_D(DU32, 32), HWY_IF_U32_D(DU32)>
- HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) {
- const RebindToSigned<decltype(du32)> di32;
- const RebindToFloat<decltype(du32)> df32;
-
- const auto non_neg_v = ZeroIfNegative(v);
- const auto exp_diff = Set(di32, int32_t{158}) -
- BitCast(di32, ShiftRight<23>(BitCast(du32, non_neg_v)));
- const auto scale_down_f32_val_mask =
- BitCast(du32, VecFromMask(di32, Eq(exp_diff, Zero(di32))));
-
- const auto v_scaled = BitCast(
- df32, BitCast(du32, non_neg_v) + ShiftLeft<23>(scale_down_f32_val_mask));
- const VFromD<decltype(du32)> f32_to_u32_result{
- _mm256_cvttps_epi32(v_scaled.raw)};
-
- return Or(
- BitCast(du32, BroadcastSignBit(exp_diff)),
- f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask));
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvttpd_epu64 with GCC if any
+ // values of v[i] are not within the range of an uint64_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+ return VFromD<DU>{_mm256_setr_epi64x(
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvttpd2uqq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<DU>{raw_result};
+ #else // !HWY_COMPILER_GCC_ACTUAL
+ return VFromD<DU>{_mm256_cvttpd_epu64(v.raw)};
+ #endif // HWY_COMPILER_GCC_ACTUAL
  }
  #endif // HWY_TARGET <= HWY_AVX3

- HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
- const Full256<int32_t> di;
- return detail::FixConversionOverflow(
- di, v, Vec256<int32_t>{_mm256_cvtps_epi32(v.raw)});
+ template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I32_D(DI)>
+ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvtps_epi32 if any values of
+ // v[i] are not within the range of an int32_t
+
+ #if HWY_COMPILER_GCC >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+ return VFromD<DI>{
+ _mm256_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[7]))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvtps2dq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<DI>{raw_result};
+ #else // !HWY_COMPILER_GCC_ACTUAL
+ return VFromD<DI>{_mm256_cvtps_epi32(v.raw)};
+ #endif // HWY_COMPILER_GCC_ACTUAL
  }

  #ifndef HWY_DISABLE_F16C
@@ -6035,6 +7180,15 @@ HWY_API VFromD<D> PromoteTo(D df32, Vec128<float16_t> v) {

  #endif // HWY_DISABLE_F16C

+ #if HWY_HAVE_FLOAT16
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec64<float16_t> v) {
+ return VFromD<D>{_mm256_cvtph_pd(v.raw)};
+ }
+
+ #endif // HWY_HAVE_FLOAT16
+
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> PromoteTo(D df32, Vec128<bfloat16_t> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
@@ -6120,14 +7274,14 @@ template <uint8_t kRcon>
  HWY_API Vec256<uint8_t> AESKeyGenAssist(Vec256<uint8_t> v) {
  const Full256<uint8_t> d;
  #if HWY_TARGET <= HWY_AVX3_DL
- alignas(16) static constexpr uint8_t kRconXorMask[16] = {
- 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
- alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
- 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
+ const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
+ d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
+ const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
+ d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
  const Repartition<uint32_t, decltype(d)> du32;
  const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
- const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask));
- return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle));
+ const auto sub_word_result = AESLastRound(w13, rconXorMask);
+ return TableLookupBytes(sub_word_result, rotWordShuffle);
  #else
  const Half<decltype(d)> d2;
  return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
@@ -6387,9 +7541,9 @@ HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
  0x0303030303030303ull};
  const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));

- alignas(32) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
- 1, 2, 4, 8, 16, 32, 64, 128};
- return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
+ const VFromD<decltype(du)> bit = Dup128VecFromValues(
+ du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+ return RebindMask(d, TestBit(rep8, bit));
  }

  template <typename T, HWY_IF_T_SIZE(T, 2)>
@@ -6923,6 +8077,16 @@ HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,

  #endif // HWY_TARGET <= HWY_AVX3

+ // ------------------------------ Dup128MaskFromMaskBits
+
+ // Generic for all vector lengths >= 32 bytes
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+ const Half<decltype(d)> dh;
+ const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
+ return CombineMasks(d, mh, mh);
+ }
+
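Illustrative usage sketch, not part of the package diff; it assumes the conventional namespace alias hn = hwy::HWY_NAMESPACE. Dup128MaskFromMaskBits replicates the low mask bits into every 128-bit block:

    namespace hn = hwy::HWY_NAMESPACE;
    const hn::ScalableTag<uint32_t> du32;             // 4 lanes per 128-bit block
    // Bits 0..3 select lanes within each block; 0b0101 sets lanes 0 and 2.
    const auto m = hn::Dup128MaskFromMaskBits(du32, 0b0101u);
    const auto kept = hn::IfThenElseZero(m, hn::Iota(du32, 0));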
  // ------------------------------ Expand

  // Always define Expand/LoadExpand because generic_ops only does so for Vec128.
@@ -7396,116 +8560,26 @@ HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
  }
  #endif // HWY_TARGET <= HWY_AVX3

- // ------------------------------ Reductions
-
- namespace detail {
-
- // These functions start with each lane per 128-bit block being reduced with the
- // corresponding lane in the other block, so we use the same logic as x86_128
- // but running on both blocks at the same time. There are two (64-bit) to eight
- // (16-bit) lanes per block.
- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v10) {
- const DFromV<decltype(v10)> d;
- return Add(v10, Reverse2(d, v10));
- }
- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v10) {
- const DFromV<decltype(v10)> d;
- return Min(v10, Reverse2(d, v10));
- }
- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v10) {
- const DFromV<decltype(v10)> d;
- return Max(v10, Reverse2(d, v10));
- }
-
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v3210) {
- using V = decltype(v3210);
- const DFromV<V> d;
- const V v0123 = Reverse4(d, v3210);
- const V v03_12_12_03 = Add(v3210, v0123);
- const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
- return Add(v03_12_12_03, v12_03_03_12);
- }
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v3210) {
- using V = decltype(v3210);
- const DFromV<V> d;
- const V v0123 = Reverse4(d, v3210);
- const V v03_12_12_03 = Min(v3210, v0123);
- const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
- return Min(v03_12_12_03, v12_03_03_12);
- }
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v3210) {
- using V = decltype(v3210);
- const DFromV<V> d;
- const V v0123 = Reverse4(d, v3210);
- const V v03_12_12_03 = Max(v3210, v0123);
- const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
- return Max(v03_12_12_03, v12_03_03_12);
- }
+ // ------------------------------ Reductions in generic_ops

- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v76543210) {
- using V = decltype(v76543210);
- const DFromV<V> d;
- // The upper half is reversed from the lower half; omit for brevity.
- const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210));
- const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07));
- return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
- }
- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v76543210) {
- using V = decltype(v76543210);
- const DFromV<V> d;
- // The upper half is reversed from the lower half; omit for brevity.
- const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210));
- const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07));
- return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
- }
- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v76543210) {
- using V = decltype(v76543210);
- const DFromV<V> d;
- // The upper half is reversed from the lower half; omit for brevity.
- const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210));
- const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07));
- return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
- }
+ // ------------------------------ BitShuffle
+ #if HWY_TARGET <= HWY_AVX3_DL
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+ HWY_IF_V_SIZE_V(V, 32), HWY_IF_V_SIZE_V(VI, 32)>
+ HWY_API V BitShuffle(V v, VI idx) {
+ const DFromV<decltype(v)> d64;
+ const RebindToUnsigned<decltype(d64)> du64;
+ const Rebind<uint8_t, decltype(d64)> du8;

- } // namespace detail
+ int32_t i32_bit_shuf_result =
+ static_cast<int32_t>(_mm256_bitshuffle_epi64_mask(v.raw, idx.raw));

- // Supported for >8-bit types. Returns the broadcasted result.
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API VFromD<D> SumOfLanes(D /*d*/, VFromD<D> vHL) {
- const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
- return detail::SumOfLanes(Add(vLH, vHL));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
- return GetLane(SumOfLanes(d, v));
- }
- #if HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API float16_t ReduceSum(D, VFromD<D> v) {
- return _mm256_reduce_add_ph(v.raw);
- }
- #endif // HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API VFromD<D> MinOfLanes(D /*d*/, VFromD<D> vHL) {
- const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
- return detail::MinOfLanes(Min(vLH, vHL));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32)>
- HWY_API VFromD<D> MaxOfLanes(D /*d*/, VFromD<D> vHL) {
- const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
- return detail::MaxOfLanes(Max(vLH, vHL));
+ return BitCast(d64, PromoteTo(du64, VFromD<decltype(du8)>{_mm_cvtsi32_si128(
+ i32_bit_shuf_result)}));
  }
+ #endif // HWY_TARGET <= HWY_AVX3_DL

- // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
+ // ------------------------------ LeadingZeroCount

  #if HWY_TARGET <= HWY_AVX3
  template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>