@img/sharp-libvips-dev 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/include/expat.h +21 -10
  2. package/include/expat_config.h +11 -5
  3. package/include/ffi.h +12 -25
  4. package/include/freetype2/freetype/config/ftoption.h +1 -1
  5. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  6. package/include/glib-2.0/gio/gapplication.h +6 -0
  7. package/include/glib-2.0/gio/giotypes.h +0 -1
  8. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  9. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  10. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  11. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  12. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  13. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  14. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  15. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  16. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  17. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  18. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  19. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  20. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  21. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  22. package/include/glib-2.0/girepository/girepository.h +53 -62
  23. package/include/glib-2.0/girepository/girffi.h +8 -7
  24. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  25. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  26. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  27. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  28. package/include/glib-2.0/girepository/gitypes.h +52 -104
  29. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  30. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  31. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  32. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  33. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  34. package/include/glib-2.0/glib/gbitlock.h +31 -0
  35. package/include/glib-2.0/glib/gmessages.h +8 -0
  36. package/include/glib-2.0/glib/gslice.h +2 -0
  37. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  38. package/include/glib-2.0/glib/gthread.h +191 -3
  39. package/include/glib-2.0/glib-unix.h +7 -1
  40. package/include/glib-2.0/gobject/genums.h +6 -6
  41. package/include/glib-2.0/gobject/glib-types.h +11 -0
  42. package/include/glib-2.0/gobject/gsignal.h +16 -6
  43. package/include/hwy/aligned_allocator.h +171 -6
  44. package/include/hwy/base.h +1765 -543
  45. package/include/hwy/cache_control.h +24 -6
  46. package/include/hwy/detect_compiler_arch.h +23 -2
  47. package/include/hwy/detect_targets.h +56 -13
  48. package/include/hwy/foreach_target.h +24 -0
  49. package/include/hwy/highway.h +20 -3
  50. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  51. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  52. package/include/hwy/ops/emu128-inl.h +271 -196
  53. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  54. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  55. package/include/hwy/ops/rvv-inl.h +1043 -311
  56. package/include/hwy/ops/scalar-inl.h +189 -159
  57. package/include/hwy/ops/set_macros-inl.h +66 -6
  58. package/include/hwy/ops/shared-inl.h +175 -56
  59. package/include/hwy/ops/wasm_128-inl.h +153 -136
  60. package/include/hwy/ops/x86_128-inl.h +1647 -646
  61. package/include/hwy/ops/x86_256-inl.h +1003 -370
  62. package/include/hwy/ops/x86_512-inl.h +948 -353
  63. package/include/hwy/per_target.h +4 -0
  64. package/include/hwy/profiler.h +648 -0
  65. package/include/hwy/robust_statistics.h +2 -2
  66. package/include/hwy/targets.h +18 -11
  67. package/include/hwy/timer.h +11 -0
  68. package/include/libpng16/png.h +32 -29
  69. package/include/libpng16/pngconf.h +2 -2
  70. package/include/libpng16/pnglibconf.h +7 -2
  71. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  72. package/include/libxml2/libxml/parser.h +16 -7
  73. package/include/libxml2/libxml/xmlIO.h +0 -1
  74. package/include/libxml2/libxml/xmlversion.h +4 -4
  75. package/include/pango-1.0/pango/pango-features.h +3 -3
  76. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  77. package/include/pixman-1/pixman-version.h +2 -2
  78. package/include/png.h +32 -29
  79. package/include/pngconf.h +2 -2
  80. package/include/pnglibconf.h +7 -2
  81. package/include/vips/connection.h +9 -3
  82. package/include/vips/util.h +0 -9
  83. package/include/vips/version.h +4 -4
  84. package/package.json +1 -1
  85. package/versions.json +11 -11
package/include/hwy/ops/x86_512-inl.h
@@ -152,6 +152,9 @@ class Vec512 {
   HWY_INLINE Vec512& operator-=(const Vec512 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec512& operator%=(const Vec512 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec512& operator&=(const Vec512 other) {
     return *this = (*this & other);
   }
@@ -373,6 +376,132 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
                         BitCast(Full256<uint8_t>(), v).raw)});
 }
 
+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
+  // Missing set_epi8/16.
+  return BroadcastBlock<0>(ResizeBitCast(
+      d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2, t3, t4, t5, t6,
+                             t7, t8, t9, t10, t11, t12, t13, t14, t15)));
+#else
+  (void)d;
+  // Need to use _mm512_set_epi8 as there is no _mm512_setr_epi8 intrinsic
+  // available
+  return VFromD<D>{_mm512_set_epi8(
+      static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
+      static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
+      static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
+      static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
+      static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
+      static_cast<char>(t0), static_cast<char>(t15), static_cast<char>(t14),
+      static_cast<char>(t13), static_cast<char>(t12), static_cast<char>(t11),
+      static_cast<char>(t10), static_cast<char>(t9), static_cast<char>(t8),
+      static_cast<char>(t7), static_cast<char>(t6), static_cast<char>(t5),
+      static_cast<char>(t4), static_cast<char>(t3), static_cast<char>(t2),
+      static_cast<char>(t1), static_cast<char>(t0), static_cast<char>(t15),
+      static_cast<char>(t14), static_cast<char>(t13), static_cast<char>(t12),
+      static_cast<char>(t11), static_cast<char>(t10), static_cast<char>(t9),
+      static_cast<char>(t8), static_cast<char>(t7), static_cast<char>(t6),
+      static_cast<char>(t5), static_cast<char>(t4), static_cast<char>(t3),
+      static_cast<char>(t2), static_cast<char>(t1), static_cast<char>(t0),
+      static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
+      static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
+      static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
+      static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
+      static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
+      static_cast<char>(t0))};
+#endif
+}
+
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
+  // Missing set_epi8/16.
+  return BroadcastBlock<0>(
+      ResizeBitCast(d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2,
+                                           t3, t4, t5, t6, t7)));
+#else
+  (void)d;
+  // Need to use _mm512_set_epi16 as there is no _mm512_setr_epi16 intrinsic
+  // available
+  return VFromD<D>{
+      _mm512_set_epi16(static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+                       static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+                       static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+                       static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0))};
+#endif
+}
+
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return VFromD<D>{_mm512_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
+                                  t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5,
+                                  t6, t7, t0, t1, t2, t3, t4, t5, t6, t7)};
+}
+#endif
+
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{
+      _mm512_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{_mm512_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2,
+                                  t3, t0, t1, t2, t3)};
+}
+
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{
+      _mm512_setr_epi64(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                        static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                        static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                        static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
+}
+
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{_mm512_setr_pd(t0, t1, t0, t1, t0, t1, t0, t1)};
+}
+
 // ----------------------------- Iota
 
 namespace detail {
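
Note: the new Dup128VecFromValues overloads build one 128-bit block from the given lanes and repeat it across all four blocks of the 512-bit vector; the long _mm512_set_epi8/epi16 argument lists are that block written out four times, highest lane first, because only the "set" (not "setr") intrinsics exist at these lane widths. A minimal scalar model of the semantics (the helper name is hypothetical, not part of Highway):

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model: lane i of the 64-byte result equals block[i % 16], i.e. the
// 128-bit block is duplicated into all four 128-bit blocks.
std::array<uint8_t, 64> Dup128FromValuesModel(
    const std::array<uint8_t, 16>& block) {
  std::array<uint8_t, 64> out{};
  for (size_t i = 0; i < 64; ++i) out[i] = block[i % 16];
  return out;
}
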
@@ -480,7 +609,7 @@ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
 
 template <class D, typename T2, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API VFromD<D> Iota(D d, const T2 first) {
-  return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
+  return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
 }
 
 // ================================================== LOGICAL
@@ -502,7 +631,8 @@ template <typename T>
 HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm512_and_si512(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm512_and_si512(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
@@ -519,8 +649,8 @@ template <typename T>
 HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
   const DFromV<decltype(mask)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(
-      d, VFromD<decltype(du)>{_mm512_andnot_si512(not_mask.raw, mask.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm512_andnot_si512(
+                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
 }
 HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
                              const Vec512<float> mask) {
@@ -537,7 +667,8 @@ template <typename T>
 HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm512_or_si512(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm512_or_si512(BitCast(du, a).raw,
+                                                         BitCast(du, b).raw)});
 }
 
 HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
@@ -553,7 +684,8 @@ template <typename T>
 HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm512_xor_si512(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm512_xor_si512(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
@@ -752,7 +884,7 @@ HWY_API MFromD<D> FirstN(D d, size_t n) {
   m.raw = static_cast<decltype(m.raw)>(_bzhi_u64(all, n));
   return m;
 #else
-  return detail::FirstN<T>(n);
+  return detail::FirstN<TFromD<D>>(n);
 #endif  // HWY_ARCH_X86_64
 }
 
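
Note: on x86-64 the FirstN path above builds the lane mask with BZHI, which clears all bits at positions >= n and leaves the value unchanged once n reaches the operand width. A scalar sketch of that behavior (hypothetical helper name; BZHI reads only the low 8 bits of the index):

#include <cstdint>

// Scalar model of _bzhi_u64(all, n): zero bits [63:n]; if n >= 64, keep all.
uint64_t BzhiModel(uint64_t all, uint64_t n) {
  const uint64_t index = n & 0xFF;  // BZHI uses only the low 8 bits
  return index >= 64 ? all : (all & ((uint64_t{1} << index) - 1));
}
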
@@ -790,7 +922,7 @@ HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T, HWY_IF_NOT_FLOAT(T)>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec512<T> IfThenElse(const Mask512<T> mask, const Vec512<T> yes,
                              const Vec512<T> no) {
   return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
@@ -840,7 +972,7 @@ HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T, HWY_IF_NOT_FLOAT3264(T)>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec512<T> IfThenElseZero(const Mask512<T> mask, const Vec512<T> yes) {
   return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
 }
@@ -878,7 +1010,7 @@ HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T, HWY_IF_NOT_FLOAT3264(T)>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec512<T> IfThenZeroElse(const Mask512<T> mask, const Vec512<T> no) {
   return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
 }
@@ -896,6 +1028,14 @@ HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
   return IfThenElse(MaskFromVec(v), yes, no);
 }
 
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API Vec512<T> IfNegativeThenNegOrUndefIfZero(Vec512<T> mask, Vec512<T> v) {
+  // AVX3 MaskFromVec only looks at the MSB
+  const DFromV<decltype(v)> d;
+  return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
+}
+
 template <typename T, HWY_IF_FLOAT(T)>
 HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
   // AVX3 MaskFromVec only looks at the MSB
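
Note: IfNegativeThenNegOrUndefIfZero negates v exactly in the lanes whose mask has the sign bit set, via a masked subtract from zero, and passes v through elsewhere. A per-lane scalar model (hypothetical name):

#include <cstdint>

// Lane result: mask negative -> 0 - v (two's complement negate), else v.
int32_t IfNegativeThenNegModel(int32_t mask, int32_t v) {
  // Unsigned subtraction avoids UB when v == INT32_MIN (the result wraps).
  return (mask < 0) ? static_cast<int32_t>(0u - static_cast<uint32_t>(v)) : v;
}
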
@@ -1000,6 +1140,59 @@ HWY_API Vec512<uint64_t> SumsOf8AbsDiff(Vec512<uint8_t> a, Vec512<uint8_t> b) {
   return Vec512<uint64_t>{_mm512_sad_epu8(a.raw, b.raw)};
 }
 
+// ------------------------------ SumsOf4
+namespace detail {
+
+HWY_INLINE Vec512<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
+                                    hwy::SizeTag<1> /*lane_size_tag*/,
+                                    Vec512<uint8_t> v) {
+  const DFromV<decltype(v)> d;
+
+  // _mm512_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
+  // zeroed out and the sums of the 4 consecutive lanes are already in the
+  // even uint16_t lanes of the _mm512_maskz_dbsad_epu8 result.
+  return Vec512<uint32_t>{_mm512_maskz_dbsad_epu8(
+      static_cast<__mmask32>(0x55555555), v.raw, Zero(d).raw, 0)};
+}
+
+// I8->I32 SumsOf4
+// Generic for all vector lengths
+template <class V>
+HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
+    hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const RepartitionToWideX2<decltype(d)> di32;
+
+  // Adjust the values of v to be in the 0..255 range by adding 128 to each
+  // lane of v (which is the same as a bitwise XOR of each i8 lane by 128) and
+  // then bitcasting the Xor result to an u8 vector.
+  const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
+
+  // Need to add -512 to each i32 lane of the result of the
+  // SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj) operation to account
+  // for the adjustment made above.
+  return BitCast(di32, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj)) +
+         Set(di32, int32_t{-512});
+}
+
+}  // namespace detail
+
+// ------------------------------ SumsOfShuffledQuadAbsDiff
+
+#if HWY_TARGET <= HWY_AVX3
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
+static Vec512<uint16_t> SumsOfShuffledQuadAbsDiff(Vec512<uint8_t> a,
+                                                  Vec512<uint8_t> b) {
+  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+  return Vec512<uint16_t>{
+      _mm512_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
+}
+#endif
+
 // ------------------------------ SaturatedAdd
 
 // Returns a + b clamped to the destination range.
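
Note: the signed SumsOf4 path works because XOR with 0x80 maps each int8_t lane x to the uint8_t value x + 128, so the unsigned sum of each group of 4 lanes overshoots the signed sum by exactly 4 * 128 = 512. A scalar check of that identity (hypothetical helper):

#include <cstdint>

// Signed sum of 4 lanes = unsigned sum of the biased lanes - 512.
int32_t SumsOf4SignedModel(const int8_t v[4]) {
  uint32_t biased_sum = 0;
  for (int i = 0; i < 4; ++i) {
    biased_sum += static_cast<uint8_t>(v[i]) ^ 0x80u;  // == v[i] + 128
  }
  return static_cast<int32_t>(biased_sum) - 512;
}

For example, four lanes of -128 give a biased sum of 0 and a result of -512.
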
@@ -1075,27 +1268,6 @@ HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) {
   return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
 }
 
-// These aren't native instructions, they also involve AND with constant.
-#if HWY_HAVE_FLOAT16
-HWY_API Vec512<float16_t> Abs(const Vec512<float16_t> v) {
-  return Vec512<float16_t>{_mm512_abs_ph(v.raw)};
-}
-#endif  // HWY_HAVE_FLOAT16
-
-HWY_API Vec512<float> Abs(const Vec512<float> v) {
-  return Vec512<float>{_mm512_abs_ps(v.raw)};
-}
-HWY_API Vec512<double> Abs(const Vec512<double> v) {
-// Workaround: _mm512_abs_pd expects __m512, so implement it ourselves.
-#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 803
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return And(v, BitCast(d, Set(du, 0x7FFFFFFFFFFFFFFFULL)));
-#else
-  return Vec512<double>{_mm512_abs_pd(v.raw)};
-#endif
-}
-
 // ------------------------------ ShiftLeft
 
 #if HWY_TARGET <= HWY_AVX3_DL
@@ -1643,6 +1815,322 @@ HWY_API Vec512<double> ApproximateReciprocal(Vec512<double> v) {
   return Vec512<double>{_mm512_rcp14_pd(v.raw)};
 }
 
+// ------------------------------ MaskedMinOr
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMaxOr
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedAddOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSubOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMulOr
+
+HWY_API Vec512<float> MaskedMulOr(Vec512<float> no, Mask512<float> m,
+                                  Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec512<double> MaskedMulOr(Vec512<double> no, Mask512<double> m,
+                                   Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MaskedMulOr(Vec512<float16_t> no,
+                                      Mask512<float16_t> m, Vec512<float16_t> a,
+                                      Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedDivOr
+
+HWY_API Vec512<float> MaskedDivOr(Vec512<float> no, Mask512<float> m,
+                                  Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec512<double> MaskedDivOr(Vec512<double> no, Mask512<double> m,
+                                   Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MaskedDivOr(Vec512<float16_t> no,
+                                      Mask512<float16_t> m, Vec512<float16_t> a,
+                                      Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSatAddOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+// ------------------------------ MaskedSatSubOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
 // ------------------------------ Floating-point multiply-add variants
 
 #if HWY_HAVE_FLOAT16
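
Note: every overload added above follows the same AVX-512 masked-merge pattern: lanes selected by m receive op(a, b), and all other lanes are taken from no. A generic scalar model (names are illustrative only):

#include <algorithm>

// Per-lane semantics shared by MaskedMinOr, MaskedAddOr, MaskedSatSubOr, etc.
template <typename T, class Op>
T MaskedOpOrModel(T no, bool m, T a, T b, Op op) {
  return m ? op(a, b) : no;
}

// Usage sketch for one lane of MaskedMinOr:
// int lane = MaskedOpOrModel(no, m, a, b,
//                            [](int x, int y) { return std::min(x, y); });
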
@@ -1709,6 +2197,23 @@ HWY_API Vec512<double> NegMulSub(Vec512<double> mul, Vec512<double> x,
   return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
 }
 
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MulAddSub(Vec512<float16_t> mul, Vec512<float16_t> x,
+                                    Vec512<float16_t> sub_or_add) {
+  return Vec512<float16_t>{_mm512_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+HWY_API Vec512<float> MulAddSub(Vec512<float> mul, Vec512<float> x,
+                                Vec512<float> sub_or_add) {
+  return Vec512<float>{_mm512_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
+}
+
+HWY_API Vec512<double> MulAddSub(Vec512<double> mul, Vec512<double> x,
+                                 Vec512<double> sub_or_add) {
+  return Vec512<double>{_mm512_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
+}
+
 // ------------------------------ Floating-point square root
 
 // Full precision square root
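
Note: MulAddSub maps to _mm512_fmaddsub_*, which subtracts the third operand in even lanes and adds it in odd lanes. A scalar model (hypothetical name):

#include <cstddef>

// out[i] = mul[i] * x[i] - sub_or_add[i] for even i, + for odd i.
void MulAddSubModel(const float* mul, const float* x, const float* sub_or_add,
                    float* out, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    const float prod = mul[i] * x[i];
    out[i] = (i % 2 == 0) ? prod - sub_or_add[i] : prod + sub_or_add[i];
  }
}
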
@@ -1873,7 +2378,11 @@ HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator==(Vec512<float16_t> a,
                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1907,7 +2416,11 @@ HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator!=(Vec512<float16_t> a,
                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1949,7 +2462,11 @@ HWY_API Mask512<int64_t> operator>(Vec512<int64_t> a, Vec512<int64_t> b) {
 
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator>(Vec512<float16_t> a, Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1965,7 +2482,11 @@ HWY_API Mask512<double> operator>(Vec512<double> a, Vec512<double> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator>=(Vec512<float16_t> a,
                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -2328,11 +2849,41 @@ HWY_API Mask512<T> ExclusiveNeither(Mask512<T> a, Mask512<T> b) {
   return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
 }
 
+template <class D, HWY_IF_LANES_D(D, 64)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+                               MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask64 combined_mask = _mm512_kunpackd(
+      static_cast<__mmask64>(hi.raw), static_cast<__mmask64>(lo.raw));
+#else
+  const __mmask64 combined_mask = static_cast<__mmask64>(
+      ((static_cast<uint64_t>(hi.raw) << 32) | (lo.raw & 0xFFFFFFFFULL)));
+#endif
+
+  return MFromD<D>{combined_mask};
+}
+
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask64(static_cast<__mmask64>(m.raw), 32);
+#else
+  const auto shifted_mask = static_cast<uint64_t>(m.raw) >> 32;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
 
 HWY_API Vec512<int8_t> BroadcastSignBit(Vec512<int8_t> v) {
+#if HWY_TARGET <= HWY_AVX3_DL
+  const Repartition<uint64_t, DFromV<decltype(v)>> du64;
+  return detail::GaloisAffine(v, Set(du64, 0x8080808080808080ull));
+#else
   const DFromV<decltype(v)> d;
   return VecFromMask(v < Zero(d));
+#endif
 }
 
 HWY_API Vec512<int16_t> BroadcastSignBit(Vec512<int16_t> v) {
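
Note: the fallback branches in CombineMasks and UpperHalfOfMask treat a 64-lane mask as a plain 64-bit integer, so combining two 32-lane masks and extracting the upper half reduce to shifts and ORs. Scalar equivalents (hypothetical names):

#include <cstdint>

uint64_t CombineMasksModel(uint32_t hi, uint32_t lo) {
  return (static_cast<uint64_t>(hi) << 32) | lo;  // hi selects lanes 32..63
}

uint32_t UpperHalfOfMaskModel(uint64_t m) {
  return static_cast<uint32_t>(m >> 32);  // lanes 32..63 of the full mask
}
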
@@ -2344,7 +2895,7 @@ HWY_API Vec512<int32_t> BroadcastSignBit(Vec512<int32_t> v) {
 }
 
 HWY_API Vec512<int64_t> BroadcastSignBit(Vec512<int64_t> v) {
-  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, 63)};
+  return ShiftRight<63>(v);
 }
 
 // ------------------------------ Floating-point classification (Not)
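
Note: the int64_t overload now delegates to ShiftRight<63>, an arithmetic shift that replicates the sign bit into every bit position. Scalar model (assumes the usual arithmetic right shift for signed types, which mainstream compilers provide):

#include <cstdint>

// 0 for non-negative lanes, -1 (all bits set) for negative lanes.
int64_t BroadcastSignBitModel(int64_t v) { return v >> 63; }
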
@@ -2410,16 +2961,13 @@ HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
   return VFromD<D>{_mm512_load_si512(aligned)};
 }
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
-HWY_API Vec512<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
+HWY_API Vec512<float16_t> Load(D /* tag */,
+                               const float16_t* HWY_RESTRICT aligned) {
   return Vec512<float16_t>{_mm512_load_ph(aligned)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API Vec512<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
   return Vec512<float>{_mm512_load_ps(aligned)};
@@ -2435,16 +2983,12 @@ HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
 }
 
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API Vec512<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API Vec512<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
   return Vec512<float16_t>{_mm512_loadu_ph(p)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API Vec512<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
   return Vec512<float>{_mm512_loadu_ps(p)};
@@ -2506,8 +3050,9 @@ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT p) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  return VFromD<D>{_mm512_mask_loadu_epi16(
-      BitCast(du, v).raw, m.raw, reinterpret_cast<const uint16_t*>(p))};
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_mask_loadu_epi16(
+             BitCast(du, v).raw, m.raw, reinterpret_cast<const uint16_t*>(p))});
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
@@ -2539,10 +3084,12 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, Mask512<double> m, D /* tag */,
 // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> LoadDup128(D /* tag */,
-                             const TFromD<D>* const HWY_RESTRICT p) {
+HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
   const Full128<TFromD<D>> d128;
-  return VFromD<D>{_mm512_broadcast_i32x4(LoadU(d128, p).raw)};
+  const RebindToUnsigned<decltype(d128)> du128;
+  return BitCast(d, VFromD<decltype(du)>{_mm512_broadcast_i32x4(
+                        BitCast(du128, LoadU(d128, p)).raw)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> LoadDup128(D /* tag */, const float* HWY_RESTRICT p) {
@@ -2563,15 +3110,13 @@ HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
   _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
 }
 // bfloat16_t is handled by x86_128-inl.h.
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
 HWY_API void Store(Vec512<float16_t> v, D /* tag */,
                    float16_t* HWY_RESTRICT aligned) {
-#if HWY_HAVE_FLOAT16
   _mm512_store_ph(aligned, v.raw);
-#else
-  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
-#endif
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API void Store(Vec512<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
   _mm512_store_ps(aligned, v.raw);
@@ -2586,15 +3131,13 @@ HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
   _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
 }
 // bfloat16_t is handled by x86_128-inl.h.
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
 HWY_API void StoreU(Vec512<float16_t> v, D /* tag */,
                     float16_t* HWY_RESTRICT p) {
-#if HWY_HAVE_FLOAT16
   _mm512_storeu_ph(p, v.raw);
-#else
-  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API void StoreU(Vec512<float> v, D /* tag */, float* HWY_RESTRICT p) {
@@ -2756,84 +3299,90 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
 namespace detail {
 
 template <int kScale, typename T, HWY_IF_UI32(T)>
-HWY_INLINE Vec512<T> NativeGather(const T* HWY_RESTRICT base,
-                                  Vec512<int32_t> index) {
-  return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, kScale)};
+HWY_INLINE Vec512<T> NativeGather512(const T* HWY_RESTRICT base,
+                                     Vec512<int32_t> indices) {
+  return Vec512<T>{_mm512_i32gather_epi32(indices.raw, base, kScale)};
 }
 
 template <int kScale, typename T, HWY_IF_UI64(T)>
-HWY_INLINE Vec512<T> NativeGather(const T* HWY_RESTRICT base,
-                                  Vec512<int64_t> index) {
-  return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, kScale)};
+HWY_INLINE Vec512<T> NativeGather512(const T* HWY_RESTRICT base,
+                                     Vec512<int64_t> indices) {
+  return Vec512<T>{_mm512_i64gather_epi64(indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<float> NativeGather(const float* HWY_RESTRICT base,
-                                      Vec512<int32_t> index) {
-  return Vec512<float>{_mm512_i32gather_ps(index.raw, base, kScale)};
+HWY_INLINE Vec512<float> NativeGather512(const float* HWY_RESTRICT base,
+                                         Vec512<int32_t> indices) {
+  return Vec512<float>{_mm512_i32gather_ps(indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<double> NativeGather(const double* HWY_RESTRICT base,
-                                       Vec512<int64_t> index) {
-  return Vec512<double>{_mm512_i64gather_pd(index.raw, base, kScale)};
+HWY_INLINE Vec512<double> NativeGather512(const double* HWY_RESTRICT base,
+                                          Vec512<int64_t> indices) {
+  return Vec512<double>{_mm512_i64gather_pd(indices.raw, base, kScale)};
 }
 
 template <int kScale, typename T, HWY_IF_UI32(T)>
-HWY_INLINE Vec512<T> NativeMaskedGather(Mask512<T> m,
-                                        const T* HWY_RESTRICT base,
-                                        Vec512<int32_t> index) {
-  const Full512<T> d;
+HWY_INLINE Vec512<T> NativeMaskedGatherOr512(Vec512<T> no, Mask512<T> m,
+                                             const T* HWY_RESTRICT base,
+                                             Vec512<int32_t> indices) {
   return Vec512<T>{
-      _mm512_mask_i32gather_epi32(Zero(d).raw, m.raw, index.raw, base, kScale)};
+      _mm512_mask_i32gather_epi32(no.raw, m.raw, indices.raw, base, kScale)};
 }
 
 template <int kScale, typename T, HWY_IF_UI64(T)>
-HWY_INLINE Vec512<T> NativeMaskedGather(Mask512<T> m,
-                                        const T* HWY_RESTRICT base,
-                                        Vec512<int64_t> index) {
-  const Full512<T> d;
+HWY_INLINE Vec512<T> NativeMaskedGatherOr512(Vec512<T> no, Mask512<T> m,
+                                             const T* HWY_RESTRICT base,
+                                             Vec512<int64_t> indices) {
   return Vec512<T>{
-      _mm512_mask_i64gather_epi64(Zero(d).raw, m.raw, index.raw, base, kScale)};
+      _mm512_mask_i64gather_epi64(no.raw, m.raw, indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<float> NativeMaskedGather(Mask512<float> m,
-                                            const float* HWY_RESTRICT base,
-                                            Vec512<int32_t> index) {
-  const Full512<float> d;
+HWY_INLINE Vec512<float> NativeMaskedGatherOr512(Vec512<float> no,
+                                                 Mask512<float> m,
+                                                 const float* HWY_RESTRICT base,
+                                                 Vec512<int32_t> indices) {
  return Vec512<float>{
-      _mm512_mask_i32gather_ps(Zero(d).raw, m.raw, index.raw, base, kScale)};
+      _mm512_mask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<double> NativeMaskedGather(Mask512<double> m,
-                                             const double* HWY_RESTRICT base,
-                                             Vec512<int64_t> index) {
-  const Full512<double> d;
+HWY_INLINE Vec512<double> NativeMaskedGatherOr512(
+    Vec512<double> no, Mask512<double> m, const double* HWY_RESTRICT base,
+    Vec512<int64_t> indices) {
   return Vec512<double>{
-      _mm512_mask_i64gather_pd(Zero(d).raw, m.raw, index.raw, base, kScale)};
+      _mm512_mask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
 }
 }  // namespace detail
 
-template <class D, HWY_IF_V_SIZE_D(D, 64), typename TI>
-HWY_API VFromD<D> GatherOffset(D /* tag */, const TFromD<D>* HWY_RESTRICT base,
-                               Vec512<TI> offset) {
-  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Must match for portability");
-  return detail::NativeGather<1>(base, offset);
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+                               VFromD<RebindToSigned<D>> offsets) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
+  return detail::NativeGather512<1>(base, offsets);
 }
-template <class D, HWY_IF_V_SIZE_D(D, 64), typename TI>
-HWY_API VFromD<D> GatherIndex(D /* tag */, const TFromD<D>* HWY_RESTRICT base,
-                              Vec512<TI> index) {
-  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Must match for portability");
-  return detail::NativeGather<sizeof(TFromD<D>)>(base, index);
+
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+                              VFromD<RebindToSigned<D>> indices) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+  return detail::NativeGather512<sizeof(TFromD<D>)>(base, indices);
 }
-template <class D, HWY_IF_V_SIZE_D(D, 64), typename TI>
-HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D /* tag */,
-                                    const TFromD<D>* HWY_RESTRICT base,
-                                    Vec512<TI> index) {
-  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Must match for portability");
-  return detail::NativeMaskedGather<sizeof(TFromD<D>)>(m, base, index);
+
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
+                                      const TFromD<D>* HWY_RESTRICT base,
+                                      VFromD<RebindToSigned<D>> indices) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+  return detail::NativeMaskedGatherOr512<sizeof(TFromD<D>)>(no, m, base,
+                                                            indices);
 }
 
 HWY_DIAGNOSTICS(pop)
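
Note: the renamed gather helpers keep the distinction that GatherIndex scales each index by sizeof(T) while GatherOffset treats its operands as raw byte offsets (kScale == 1); the new HWY_DASSERT rejects negative values in debug builds. Scalar addressing model (hypothetical names):

#include <cstdint>
#include <cstring>

template <typename T>
T GatherIndexModel(const T* base, int64_t index) {
  return base[index];  // address: base + index * sizeof(T)
}

template <typename T>
T GatherOffsetModel(const T* base, int64_t offset_bytes) {
  T out;  // address: base + offset_bytes, regardless of sizeof(T)
  std::memcpy(&out, reinterpret_cast<const char*>(base) + offset_bytes,
              sizeof(T));
  return out;
}
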
@@ -2878,7 +3427,7 @@ HWY_API Vec256<T> LowerHalf(Vec512<T> v) {
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  const Twice<decltype(d)> dut;
+  const Twice<decltype(du)> dut;
   return BitCast(d, VFromD<decltype(du)>{
                         _mm512_extracti32x8_epi32(BitCast(dut, v).raw, 1)});
 }
@@ -2920,7 +3469,11 @@ HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
 template <int kBlockIdx, class T, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
 HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
   static_assert(kBlockIdx <= 3, "Invalid block index");
-  return Vec128<T>{_mm512_extracti32x4_epi32(v.raw, kBlockIdx)};
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(Full128<T>(),
+                 Vec128<MakeUnsigned<T>>{
+                     _mm512_extracti32x4_epi32(BitCast(du, v).raw, kBlockIdx)});
 }
 
 template <int kBlockIdx, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
@@ -2955,8 +3508,13 @@ HWY_INLINE Vec512<T> InsertBlock(hwy::SizeTag<0> /* blk_idx_tag */, Vec512<T> v,
 template <size_t kBlockIdx, typename T>
 HWY_INLINE Vec512<T> InsertBlock(hwy::SizeTag<kBlockIdx> /* blk_idx_tag */,
                                  Vec512<T> v, Vec128<T> blk_to_insert) {
-  return Vec512<T>{_mm512_inserti32x4(v.raw, blk_to_insert.raw,
-                                      static_cast<int>(kBlockIdx & 3))};
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  const Full128<MakeUnsigned<T>> du_blk_to_insert;
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_inserti32x4(
+             BitCast(du, v).raw, BitCast(du_blk_to_insert, blk_to_insert).raw,
+             static_cast<int>(kBlockIdx & 3))});
 }
 
 template <size_t kBlockIdx, hwy::EnableIf<kBlockIdx != 0>* = nullptr>
@@ -2992,7 +3550,7 @@ HWY_API T GetLane(const Vec512<T> v) {
 
 // ------------------------------ ZeroExtendVector
 
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_D(D)>
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
 #if HWY_HAVE_ZEXT  // See definition/comment in x86_256-inl.h.
   (void)d;
@@ -3042,11 +3600,13 @@ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
     DTo d_to, DFrom d_from, VFromD<DFrom> v) {
   const Repartition<uint8_t, decltype(d_from)> du8_from;
   const auto vu8 = BitCast(du8_from, v);
+  const RebindToUnsigned<decltype(d_to)> du_to;
 #if HWY_HAVE_ZEXT
-  (void)d_to;
-  return VFromD<DTo>{_mm512_zextsi128_si512(vu8.raw)};
+  return BitCast(d_to,
+                 VFromD<decltype(du_to)>{_mm512_zextsi128_si512(vu8.raw)});
 #else
-  return VFromD<DTo>{_mm512_inserti32x4(Zero(d_to).raw, vu8.raw, 0)};
+  return BitCast(d_to, VFromD<decltype(du_to)>{
+                           _mm512_inserti32x4(Zero(du_to).raw, vu8.raw, 0)});
 #endif
 }
 
@@ -3096,7 +3656,8 @@ HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
   const Half<decltype(du)> duh;
   const __m512i lo512 = ZeroExtendVector(du, BitCast(duh, lo)).raw;
-  return VFromD<D>{_mm512_inserti32x8(lo512, BitCast(duh, hi).raw, 1)};
+  return BitCast(d, VFromD<decltype(du)>{
+                        _mm512_inserti32x8(lo512, BitCast(duh, hi).raw, 1)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
@@ -3181,7 +3742,11 @@ HWY_API Vec512<double> Broadcast(const Vec512<double> v) {
 template <int kBlockIdx, class T>
 HWY_API Vec512<T> BroadcastBlock(Vec512<T> v) {
   static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index");
-  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, 0x55 * kBlockIdx)};
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+             BitCast(du, v).raw, BitCast(du, v).raw, 0x55 * kBlockIdx)});
 }
 
 template <int kBlockIdx>
@@ -3209,7 +3774,10 @@ HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
 template <class T, HWY_IF_T_SIZE(T, 2)>
 HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                    Vec512<T> v) {
-  return Vec512<T>{_mm512_broadcastw_epi16(ResizeBitCast(Full128<T>(), v).raw)};
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm512_broadcastw_epi16(
+                        ResizeBitCast(Full128<uint16_t>(), v).raw)});
 }
 
 template <class T, HWY_IF_UI32(T)>
@@ -3671,8 +4239,11 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
 
 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatLowerLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
-  return VFromD<D>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
+HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BABA)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ConcatLowerLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3686,8 +4257,11 @@ HWY_API Vec512<double> ConcatLowerLower(D /* tag */, Vec512<double> hi,
3686
4257
 
3687
4258
  // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
3688
4259
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3689
- HWY_API VFromD<D> ConcatUpperUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
3690
- return VFromD<D>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
4260
+ HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4261
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4262
+ return BitCast(d,
4263
+ VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4264
+ BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_DCDC)});
3691
4265
  }
3692
4266
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3693
4267
  HWY_API VFromD<D> ConcatUpperUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3701,8 +4275,11 @@ HWY_API Vec512<double> ConcatUpperUpper(D /* tag */, Vec512<double> hi,
3701
4275
 
3702
4276
  // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
3703
4277
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3704
- HWY_API VFromD<D> ConcatLowerUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
3705
- return VFromD<D>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
4278
+ HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4279
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4280
+ return BitCast(d,
4281
+ VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4282
+ BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BADC)});
3706
4283
  }
3707
4284
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3708
4285
  HWY_API VFromD<D> ConcatLowerUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3716,11 +4293,13 @@ HWY_API Vec512<double> ConcatLowerUpper(D /* tag */, Vec512<double> hi,
3716
4293
 
3717
4294
  // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
3718
4295
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3719
- HWY_API VFromD<D> ConcatUpperLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
4296
+ HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
3720
4297
  // There are no imm8 blend in AVX512. Use blend16 because 32-bit masks
3721
4298
  // are efficiently loaded from 32-bit regs.
3722
4299
  const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
3723
- return VFromD<D>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)};
4300
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4301
+ return BitCast(d, VFromD<decltype(du)>{_mm512_mask_blend_epi16(
4302
+ mask, BitCast(du, hi).raw, BitCast(du, lo).raw)});
3724
4303
  }
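
To unpack the blend (an illustrative model, not part of the diff): _mm512_mask_blend_epi16(k, a, b) returns b in lanes whose mask bit is set and a elsewhere, so with 32 u16 lanes, 0x0000FFFF selects lo for lanes 0-15 (the lower 256 bits) and hi for lanes 16-31. BlendLaneModel is a hypothetical helper:

// Per-lane model; i in [0, 32).
inline uint16_t BlendLaneModel(uint32_t k, int i, uint16_t hi_lane,
                               uint16_t lo_lane) {
  return ((k >> i) & 1u) ? lo_lane : hi_lane;  // bit set -> second operand
}
// k = 0x0000FFFFu yields { lo[0..15], hi[16..31] }, i.e. the outer halves.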
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> ConcatUpperLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3881,6 +4460,130 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
  return VFromD<D>{_mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
  }

+ // ------------------------------ InterleaveWholeLower
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+ #if HWY_TARGET <= HWY_AVX3_DL
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint8_t kIdx[64] = {
+ 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71,
+ 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79,
+ 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87,
+ 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95};
+ return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+ #else
+ alignas(64) static constexpr uint64_t kIdx2[8] = {0, 1, 8, 9, 2, 3, 10, 11};
+ const Repartition<uint64_t, decltype(d)> du64;
+ return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
+ Load(du64, kIdx2).raw,
+ InterleaveUpper(d, a, b).raw)};
+ #endif
+ }
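
A gloss on the fallback path (illustration only): InterleaveLower/InterleaveUpper interleave within each 128-bit block, and the u64 indices {0,1, 8,9, 2,3, 10,11} (0-7 address the first operand, 8-15 the second) then stitch block 0 of each partial result followed by block 1, which is exactly the whole-vector interleave of the two lower 256-bit halves. Lane-wise, the op computes the following, sketched here as a scalar model for 8-bit lanes:

#include <cstddef>
#include <cstdint>
// r[2*i] = a[i], r[2*i+1] = b[i] for i in [0, num_lanes/2).
inline void InterleaveWholeLowerModel(const uint8_t* a, const uint8_t* b,
                                      uint8_t* r, size_t num_lanes) {
  for (size_t i = 0; i < num_lanes / 2; ++i) {
    r[2 * i + 0] = a[i];  // even output lanes come from a
    r[2 * i + 1] = b[i];  // odd output lanes come from b
  }
}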
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint16_t kIdx[32] = {
+ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
+ return BitCast(
+ d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
+ BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
+ 4, 20, 5, 21, 6, 22, 7, 23};
+ return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
+ 4, 20, 5, 21, 6, 22, 7, 23};
+ return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+ return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+ return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ // ------------------------------ InterleaveWholeUpper
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+ #if HWY_TARGET <= HWY_AVX3_DL
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint8_t kIdx[64] = {
+ 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103,
+ 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111,
+ 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119,
+ 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127};
+ return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+ #else
+ alignas(64) static constexpr uint64_t kIdx2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
+ const Repartition<uint64_t, decltype(d)> du64;
+ return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
+ Load(du64, kIdx2).raw,
+ InterleaveUpper(d, a, b).raw)};
+ #endif
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint16_t kIdx[32] = {
+ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+ 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
+ return BitCast(
+ d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
+ BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint32_t kIdx[16] = {
+ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+ return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint32_t kIdx[16] = {
+ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+ return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+ return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToUnsigned<decltype(d)> du;
+ alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+ return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+ }
+
  // ------------------------------ DupEven (InterleaveLower)

  template <typename T, HWY_IF_T_SIZE(T, 4)>
@@ -3926,7 +4629,11 @@ HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {

  template <typename T>
  HWY_API Vec512<T> OddEvenBlocks(Vec512<T> odd, Vec512<T> even) {
- return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
+ const DFromV<decltype(odd)> d;
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
+ return BitCast(
+ d, VFromD<decltype(du)>{_mm512_mask_blend_epi64(
+ __mmask8{0x33u}, BitCast(du, odd).raw, BitCast(du, even).raw)});
  }

  HWY_API Vec512<float> OddEvenBlocks(Vec512<float> odd, Vec512<float> even) {
@@ -3943,7 +4650,11 @@ HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) {

  template <typename T>
  HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
- return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
+ const DFromV<decltype(v)> d;
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
+ return BitCast(d,
+ VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+ BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_CDAB)});
  }

  HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) {
@@ -3957,8 +4668,11 @@ HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
  // ------------------------------ ReverseBlocks

  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
- HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
- return VFromD<D>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
+ HWY_API VFromD<D> ReverseBlocks(D d, VFromD<D> v) {
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
+ return BitCast(d,
+ VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+ BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_ABCD)});
  }
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
@@ -3974,7 +4688,10 @@ HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
  // Both full
  template <typename T, typename TI>
  HWY_API Vec512<TI> TableLookupBytes(Vec512<T> bytes, Vec512<TI> indices) {
- return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, indices.raw)};
+ const DFromV<decltype(indices)> d;
+ return BitCast(d, Vec512<uint8_t>{_mm512_shuffle_epi8(
+ BitCast(Full512<uint8_t>(), bytes).raw,
+ BitCast(Full512<uint8_t>(), indices).raw)});
  }
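
A reminder of the per-block semantics the BitCasts preserve (sketch, not from the diff; Pshufb1 is a hypothetical helper): vpshufb shuffles bytes independently within each 128-bit block, so each block of indices addresses only the corresponding block of bytes:

// One byte of one 16-byte block, x86 pshufb semantics:
inline uint8_t Pshufb1(const uint8_t block[16], uint8_t idx) {
  return (idx & 0x80) ? 0 : block[idx & 15];  // MSB set -> zero
}
// Highway's TableLookupBytes expects in-range indices; the zeroing behavior
// for MSB-set indices is exposed separately as TableLookupBytesOr0.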

  // Partial index vector
@@ -4632,6 +5349,15 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<float16_t> v) {
  #endif // HWY_HAVE_FLOAT16
  }

+ #if HWY_HAVE_FLOAT16
+
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+ HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec128<float16_t> v) {
+ return VFromD<D>{_mm512_cvtph_pd(v.raw)};
+ }
+
+ #endif // HWY_HAVE_FLOAT16
+
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> PromoteTo(D df32, Vec256<bfloat16_t> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
@@ -4666,8 +5392,7 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
  }
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
  HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
- return VFromD<D>{
- _mm512_maskz_cvttps_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+ return VFromD<D>{_mm512_maskz_cvttps_epu64(Not(MaskFromVec(v)).raw, v.raw)};
  }
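
The maskz idiom here (and in the other float-to-unsigned conversions below) deserves a note, offered as a gloss rather than fact from the diff: the raw truncating conversion produces the unsigned "integer indefinite" all-ones pattern for negative inputs, whereas Highway's float-to-unsigned conversions saturate negatives to zero. MaskFromVec on floats is true where the sign bit is set, so Not(MaskFromVec(v)) keeps exactly the non-negative lanes and the maskz form zeroes the rest; the rewrite merely swaps the raw _knot_mask8 intrinsic for the portable Not.

// Illustration with assumed values:
//   v                   = { -1.5f, 2.9f, 0.0f, ... }
//   MaskFromVec(v)      = { true, false, false, ... }   (sign bits)
//   Not(MaskFromVec(v)) = { false, true, true, ... }
//   PromoteTo(du64, v)  = { 0, 2, 0, ... }  (negatives clamp to zero,
//                          truncation toward zero otherwise)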

  // ------------------------------ Demotions (full -> part w/ narrow lanes)
@@ -4709,8 +5434,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
  const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};

- alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
- const auto idx32 = LoadDup128(du32, kLanes);
+ const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
  const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
  return LowerHalf(LowerHalf(fixed));
  }
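
Why the permute fix-up (a gloss, not in the diff): the packs/packus intrinsics narrow within each 128-bit block, so the four meaningful 4-byte groups land in dword 0 of blocks 0-3, i.e. dword indices 0, 4, 8 and 12; Dup128VecFromValues simply builds that index vector in registers instead of loading a static constant.

// Block layout after the two packs (B = 128-bit block, q = valid quad):
//   B0: [q0 ....]  B1: [q1 ....]  B2: [q2 ....]  B3: [q3 ....]
// _mm512_permutexvar_epi32 with {0, 4, 8, 12, ...} compacts this to
//   low 128 bits = [q0 q1 q2 q3], which the two LowerHalf calls return.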
@@ -4745,9 +5469,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
  const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};

- alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
- 0, 4, 8, 12, 0, 4, 8, 12};
- const auto idx32 = LoadDup128(du32, kLanes);
+ const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
  const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
  return LowerHalf(LowerHalf(fixed));
  }
@@ -4779,32 +5501,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
- const auto neg_mask = MaskFromVec(v);
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
- #else
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
- #endif
+ const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
  return VFromD<D>{_mm512_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
  }
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
- const auto neg_mask = MaskFromVec(v);
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
- #else
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
- #endif
+ const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
  return VFromD<D>{_mm512_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
  }
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
- const auto neg_mask = MaskFromVec(v);
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
- #else
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
- #endif
+ const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
  return VFromD<D>{_mm512_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
  }

@@ -4822,14 +5529,23 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<uint64_t> v) {
  }

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
- HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<float> v) {
+ HWY_API VFromD<D> DemoteTo(D df16, Vec512<float> v) {
  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
- return VFromD<D>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
+ const RebindToUnsigned<decltype(df16)> du16;
+ return BitCast(
+ df16, VFromD<decltype(du16)>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
  HWY_DIAGNOSTICS(pop)
  }

+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+ HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec512<double> v) {
+ return VFromD<D>{_mm512_cvtpd_ph(v.raw)};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
  HWY_API VFromD<D> DemoteTo(D dbf16, Vec512<float> v) {
  // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
@@ -4943,8 +5659,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
- return VFromD<D>{
- _mm512_maskz_cvttpd_epu32(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+ return VFromD<D>{_mm512_maskz_cvttpd_epu32(Not(MaskFromVec(v)).raw, v.raw)};
  }

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
@@ -4962,13 +5677,12 @@ HWY_API Vec128<uint8_t> U8FromU32(const Vec512<uint32_t> v) {
  const DFromV<decltype(v)> d32;
  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
  // lowest 4 bytes.
- alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
- ~0u};
- const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
+ const VFromD<decltype(d32)> v8From32 =
+ Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
+ const auto quads = TableLookupBytes(v, v8From32);
  // Gather the lowest 4 bytes of 4 128-bit blocks.
- alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
- const Vec512<uint8_t> bytes{
- _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
+ const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
+ const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
  return LowerHalf(LowerHalf(bytes));
  }
5688
 
@@ -4979,10 +5693,9 @@ HWY_API VFromD<D> TruncateTo(D d, const Vec512<uint64_t> v) {
4979
5693
  #if HWY_TARGET <= HWY_AVX3_DL
4980
5694
  (void)d;
4981
5695
  const Full512<uint8_t> d8;
4982
- alignas(16) static constexpr uint8_t k8From64[16] = {
4983
- 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56};
4984
- const Vec512<uint8_t> bytes{
4985
- _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)};
5696
+ const VFromD<decltype(d8)> v8From64 = Dup128VecFromValues(
5697
+ d8, 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56);
5698
+ const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From64.raw, v.raw)};
4986
5699
  return LowerHalf(LowerHalf(LowerHalf(bytes)));
4987
5700
  #else
4988
5701
  const Full512<uint32_t> d32;
@@ -5018,21 +5731,19 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
5018
5731
  HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint32_t> v) {
5019
5732
  #if HWY_TARGET <= HWY_AVX3_DL
5020
5733
  const Full512<uint8_t> d8;
5021
- alignas(16) static constexpr uint8_t k8From32[16] = {
5022
- 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
5023
- const Vec512<uint8_t> bytes{
5024
- _mm512_permutexvar_epi8(LoadDup128(d8, k8From32).raw, v.raw)};
5734
+ const VFromD<decltype(d8)> v8From32 = Dup128VecFromValues(
5735
+ d8, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
5736
+ const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From32.raw, v.raw)};
5025
5737
  #else
5026
5738
  const Full512<uint32_t> d32;
5027
5739
  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
5028
5740
  // lowest 4 bytes.
5029
- alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
5030
- ~0u};
5031
- const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
5741
+ const VFromD<decltype(d32)> v8From32 =
5742
+ Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
5743
+ const auto quads = TableLookupBytes(v, v8From32);
5032
5744
  // Gather the lowest 4 bytes of 4 128-bit blocks.
5033
- alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
5034
- const Vec512<uint8_t> bytes{
5035
- _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
5745
+ const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
5746
+ const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
5036
5747
  #endif
5037
5748
  return LowerHalf(LowerHalf(bytes));
5038
5749
  }
@@ -5061,9 +5772,9 @@ HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint16_t> v) {
5061
5772
  _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
5062
5773
  #else
5063
5774
  const Full512<uint32_t> d32;
5064
- alignas(16) static constexpr uint32_t k16From32[4] = {
5065
- 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u};
5066
- const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32));
5775
+ const VFromD<decltype(d32)> v16From32 = Dup128VecFromValues(
5776
+ d32, 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u);
5777
+ const auto quads = TableLookupBytes(v, v16From32);
5067
5778
  alignas(64) static constexpr uint32_t kIndex32[16] = {
5068
5779
  0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
5069
5780
  const Vec512<uint8_t> bytes{
@@ -5112,6 +5823,10 @@ HWY_API VFromD<D> ConvertTo(D d, Vec512<float16_t> v) {
5112
5823
  return detail::FixConversionOverflow(d, v,
5113
5824
  VFromD<D>{_mm512_cvttph_epi16(v.raw)});
5114
5825
  }
5826
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
5827
+ HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
5828
+ return VFromD<D>{_mm512_maskz_cvttph_epu16(Not(MaskFromVec(v)).raw, v.raw)};
5829
+ }
5115
5830
  #endif // HWY_HAVE_FLOAT16
5116
5831
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
5117
5832
  HWY_API VFromD<D> ConvertTo(D d, Vec512<float> v) {
@@ -5125,13 +5840,11 @@ HWY_API VFromD<D> ConvertTo(D di, Vec512<double> v) {
5125
5840
  }
5126
5841
  template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
5127
5842
  HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
5128
- return VFromD<DU>{
5129
- _mm512_maskz_cvttps_epu32(_knot_mask16(MaskFromVec(v).raw), v.raw)};
5843
+ return VFromD<DU>{_mm512_maskz_cvttps_epu32(Not(MaskFromVec(v)).raw, v.raw)};
5130
5844
  }
5131
5845
  template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
5132
5846
  HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
5133
- return VFromD<DU>{
5134
- _mm512_maskz_cvttpd_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
5847
+ return VFromD<DU>{_mm512_maskz_cvttpd_epu64(Not(MaskFromVec(v)).raw, v.raw)};
5135
5848
  }
5136
5849
 
5137
5850
  HWY_API Vec512<int32_t> NearestInt(const Vec512<float> v) {
@@ -5198,14 +5911,14 @@ template <uint8_t kRcon>
5198
5911
  HWY_API Vec512<uint8_t> AESKeyGenAssist(Vec512<uint8_t> v) {
5199
5912
  const Full512<uint8_t> d;
5200
5913
  #if HWY_TARGET <= HWY_AVX3_DL
5201
- alignas(16) static constexpr uint8_t kRconXorMask[16] = {
5202
- 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
5203
- alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
5204
- 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
5914
+ const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
5915
+ d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
5916
+ const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
5917
+ d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
5205
5918
  const Repartition<uint32_t, decltype(d)> du32;
5206
5919
  const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
5207
- const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask));
5208
- return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle));
5920
+ const auto sub_word_result = AESLastRound(w13, rconXorMask);
5921
+ return TableLookupBytes(sub_word_result, rotWordShuffle);
5209
5922
  #else
5210
5923
  const Half<decltype(d)> d2;
5211
5924
  return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
@@ -5253,6 +5966,28 @@ HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
5253
5966
 
5254
5967
  // ================================================== MISC
5255
5968
 
5969
+ // ------------------------------ SumsOfAdjQuadAbsDiff (Broadcast,
5970
+ // SumsOfAdjShufQuadAbsDiff)
5971
+
5972
+ template <int kAOffset, int kBOffset>
5973
+ static Vec512<uint16_t> SumsOfAdjQuadAbsDiff(Vec512<uint8_t> a,
5974
+ Vec512<uint8_t> b) {
5975
+ static_assert(0 <= kAOffset && kAOffset <= 1,
5976
+ "kAOffset must be between 0 and 1");
5977
+ static_assert(0 <= kBOffset && kBOffset <= 3,
5978
+ "kBOffset must be between 0 and 3");
5979
+
5980
+ const DFromV<decltype(a)> d;
5981
+ const RepartitionToWideX2<decltype(d)> du32;
5982
+
5983
+ // While AVX3 does not have a _mm512_mpsadbw_epu8 intrinsic, the
5984
+ // SumsOfAdjQuadAbsDiff operation is implementable for 512-bit vectors on
5985
+ // AVX3 using SumsOfShuffledQuadAbsDiff and U32 Broadcast.
5986
+ return SumsOfShuffledQuadAbsDiff<kAOffset + 2, kAOffset + 1, kAOffset + 1,
5987
+ kAOffset>(
5988
+ a, BitCast(d, Broadcast<kBOffset>(BitCast(du32, b))));
5989
+ }
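
For reference, the mpsadbw-style per-128-bit-block semantics being emulated can be written as a scalar model; the u32 Broadcast replicates b's selected 4-byte group to every 32-bit lane so the shuffled-quad op always finds its window. SumsOfAdjQuadAbsDiffModel is a hypothetical illustration, with r holding one block's eight u16 results:

#include <cstdint>
template <int kAOffset, int kBOffset>  // kAOffset in [0,1], kBOffset in [0,3]
inline void SumsOfAdjQuadAbsDiffModel(const uint8_t a[16], const uint8_t b[16],
                                      uint16_t r[8]) {
  for (int i = 0; i < 8; ++i) {
    uint16_t sum = 0;
    for (int j = 0; j < 4; ++j) {
      const int diff = int{a[4 * kAOffset + i + j]} - int{b[4 * kBOffset + j]};
      sum += static_cast<uint16_t>(diff < 0 ? -diff : diff);
    }
    r[i] = sum;  // sum of abs diffs of one sliding 4-byte window
  }
}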
+
  // ------------------------------ I32/I64 SaturatedAdd (MaskFromVec)

  HWY_API Vec512<int32_t> SaturatedAdd(Vec512<int32_t> a, Vec512<int32_t> b) {
@@ -6165,7 +6900,10 @@ namespace detail {
  // Type-safe wrapper.
  template <_MM_PERM_ENUM kPerm, typename T>
  Vec512<T> Shuffle128(const Vec512<T> lo, const Vec512<T> hi) {
- return Vec512<T>{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)};
+ const DFromV<decltype(lo)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, VFromD<decltype(du)>{_mm512_shuffle_i64x2(
+ BitCast(du, lo).raw, BitCast(du, hi).raw, kPerm)});
  }
  template <_MM_PERM_ENUM kPerm>
  Vec512<float> Shuffle128(const Vec512<float> lo, const Vec512<float> hi) {
@@ -6345,7 +7083,7 @@ HWY_API Mask512<T> SetOnlyFirst(Mask512<T> mask) {
  static_cast<typename Mask512<T>::Raw>(detail::AVX3Blsi(mask.raw))};
  }

- // ------------------------------ Shl (LoadDup128)
+ // ------------------------------ Shl (Dup128VecFromValues)

  HWY_API Vec512<uint16_t> operator<<(Vec512<uint16_t> v, Vec512<uint16_t> bits) {
  return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
@@ -6356,13 +7094,15 @@ HWY_API Vec512<uint8_t> operator<<(Vec512<uint8_t> v, Vec512<uint8_t> bits) {
  const DFromV<decltype(v)> d;
  #if HWY_TARGET <= HWY_AVX3_DL
  // kMask[i] = 0xFF >> i
- alignas(16) static constexpr uint8_t kMasks[16] = {
- 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00};
+ const VFromD<decltype(d)> masks =
+ Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
+ 0, 0, 0, 0, 0, 0, 0);
  // kShl[i] = 1 << i
- alignas(16) static constexpr uint8_t kShl[16] = {0x01, 0x02, 0x04, 0x08,
- 0x10, 0x20, 0x40, 0x80};
- v = And(v, TableLookupBytes(LoadDup128(d, kMasks), bits));
- const VFromD<decltype(d)> mul = TableLookupBytes(LoadDup128(d, kShl), bits);
+ const VFromD<decltype(d)> shl =
+ Dup128VecFromValues(d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0,
+ 0, 0, 0, 0, 0, 0, 0);
+ v = And(v, TableLookupBytes(masks, bits));
+ const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
  return VFromD<decltype(d)>{_mm512_gf2p8mul_epi8(v.raw, mul.raw)};
  #else
  const Repartition<uint16_t, decltype(d)> dw;
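
The AVX3_DL branch of that hunk is a trick worth spelling out (gloss only, not from the diff): _mm512_gf2p8mul_epi8 is a carry-less multiply reduced modulo the AES polynomial, but when the product fits in 8 bits no reduction occurs, and carry-less multiplication by a power of two is then an ordinary left shift. Masking v with 0xFF >> n first guarantees exactly that; the zero-padded tables additionally yield a zero mask and multiplier for counts 8-15, matching the all-bits-shifted-out case.

#include <cstdint>
// Per-byte model for shift counts n in [0, 7] (illustration only):
inline uint8_t ShlByteModel(uint8_t v, unsigned n) {
  return static_cast<uint8_t>((v & (0xFFu >> n)) << n);  // == (v << n) mod 256
}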
@@ -6570,161 +7310,16 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(

  // ------------------------------ Reductions

- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_epi32(v.raw);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_epi64(v.raw);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw));
- }
- #if HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_ph(v.raw);
- }
- #endif // HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_ps(v.raw);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_pd(v.raw);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
- HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto sum = ReduceSum(d32, even + odd);
- return static_cast<uint16_t>(sum);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
- HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- // Sign-extend
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto sum = ReduceSum(d32, even + odd);
- return static_cast<int16_t>(sum);
- }
-
- // Returns the sum in each lane.
- template <class D, HWY_IF_V_SIZE_D(D, 64)>
- HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
- return Set(d, ReduceSum(d, v));
- }
+ namespace detail {

- // Returns the minimum in each lane.
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_epi32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_epi64(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_epu32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_epu64(v.raw));
- }
- #if HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_ph(v.raw));
- }
- #endif // HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_ps(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_pd(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MinOfLanes(d32, Min(even, odd));
- // Also broadcast into odd lanes.
- return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- // Sign-extend
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MinOfLanes(d32, Min(even, odd));
- // Also broadcast into odd lanes.
- return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+ // Used by generic_ops-inl
+ template <class D, class Func, HWY_IF_V_SIZE_D(D, 64)>
+ HWY_INLINE VFromD<D> ReduceAcrossBlocks(D d, Func f, VFromD<D> v) {
+ v = f(v, SwapAdjacentBlocks(v));
+ return f(v, ReverseBlocks(d, v));
  }
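
A short trace of why two steps suffice (with blocks B0..B3 of v, and f the lane-wise, commutative combiner used by reductions; illustration only):

// v                        = { B0, B1, B2, B3 }
// SwapAdjacentBlocks(v)    = { B1, B0, B3, B2 }
// after the first f        = { f(B0,B1), f(B0,B1), f(B2,B3), f(B2,B3) }
// ReverseBlocks pairs the two halves, so after the second f every block
// equals f(B0,B1,B2,B3); generic_ops-inl then reduces within one block.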

- // Returns the maximum in each lane.
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_epi32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_epi64(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_epu32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_epu64(v.raw));
- }
- #if HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_ph(v.raw));
- }
- #endif // HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_ps(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_pd(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MaxOfLanes(d32, Max(even, odd));
- // Also broadcast into odd lanes.
- return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- // Sign-extend
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MaxOfLanes(d32, Max(even, odd));
- // Also broadcast into odd lanes.
- return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
- }
+ } // namespace detail

  // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex