@img/sharp-libvips-dev 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/parser.h +16 -7
- package/include/libxml2/libxml/xmlIO.h +0 -1
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/package.json +1 -1
- package/versions.json +11 -11

package/include/hwy/ops/x86_512-inl.h

@@ -152,6 +152,9 @@ class Vec512 {
   HWY_INLINE Vec512& operator-=(const Vec512 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec512& operator%=(const Vec512 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec512& operator&=(const Vec512 other) {
     return *this = (*this & other);
   }
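Note: the new `operator%=` simply forwards to the existing vector `operator%`, mirroring the other compound assignments in `class Vec512`. A minimal usage sketch (hypothetical values; assumes compilation for an AVX-512 target so `Full512`/`Vec512` exist, and an integer lane type for which `operator%` is defined):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    hn::Vec512<int32_t> ModInPlace(hn::Vec512<int32_t> v) {
      const hn::Full512<int32_t> d;
      v %= hn::Set(d, 4);  // each lane becomes lane % 4 via the new operator%=
      return v;
    }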
@@ -373,6 +376,132 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
                               BitCast(Full256<uint8_t>(), v).raw)});
 }
 
+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
+  // Missing set_epi8/16.
+  return BroadcastBlock<0>(ResizeBitCast(
+      d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2, t3, t4, t5, t6,
+                             t7, t8, t9, t10, t11, t12, t13, t14, t15)));
+#else
+  (void)d;
+  // Need to use _mm512_set_epi8 as there is no _mm512_setr_epi8 intrinsic
+  // available
+  return VFromD<D>{_mm512_set_epi8(
+      static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
+      static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
+      static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
+      static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
+      static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
+      static_cast<char>(t0), static_cast<char>(t15), static_cast<char>(t14),
+      static_cast<char>(t13), static_cast<char>(t12), static_cast<char>(t11),
+      static_cast<char>(t10), static_cast<char>(t9), static_cast<char>(t8),
+      static_cast<char>(t7), static_cast<char>(t6), static_cast<char>(t5),
+      static_cast<char>(t4), static_cast<char>(t3), static_cast<char>(t2),
+      static_cast<char>(t1), static_cast<char>(t0), static_cast<char>(t15),
+      static_cast<char>(t14), static_cast<char>(t13), static_cast<char>(t12),
+      static_cast<char>(t11), static_cast<char>(t10), static_cast<char>(t9),
+      static_cast<char>(t8), static_cast<char>(t7), static_cast<char>(t6),
+      static_cast<char>(t5), static_cast<char>(t4), static_cast<char>(t3),
+      static_cast<char>(t2), static_cast<char>(t1), static_cast<char>(t0),
+      static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
+      static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
+      static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
+      static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
+      static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
+      static_cast<char>(t0))};
+#endif
+}
+
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
+  // Missing set_epi8/16.
+  return BroadcastBlock<0>(
+      ResizeBitCast(d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2, t3,
+                                           t4, t5, t6, t7)));
+#else
+  (void)d;
+  // Need to use _mm512_set_epi16 as there is no _mm512_setr_epi16 intrinsic
+  // available
+  return VFromD<D>{
+      _mm512_set_epi16(static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+                       static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+                       static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+                       static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0))};
+#endif
+}
+
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return VFromD<D>{_mm512_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
+                                  t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5,
+                                  t6, t7, t0, t1, t2, t3, t4, t5, t6, t7)};
+}
+#endif
+
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{
+      _mm512_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{_mm512_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2,
+                                  t3, t0, t1, t2, t3)};
+}
+
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{
+      _mm512_setr_epi64(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                        static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                        static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                        static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
+}
+
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{_mm512_setr_pd(t0, t1, t0, t1, t0, t1, t0, t1)};
+}
+
 // ----------------------------- Iota
 
 namespace detail {
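Note: `Dup128VecFromValues` fills one 128-bit block with the given lanes and repeats that block across all four blocks of the 512-bit vector. Because there is no `_mm512_setr_epi8`/`_mm512_setr_epi16`, the 8- and 16-bit overloads pass their arguments to the `set` intrinsics in reverse order (highest lane first), repeating the group four times; GCC before 9 lacks those intrinsics entirely, hence the 128-bit build plus `BroadcastBlock<0>` fallback. A usage sketch (hypothetical values; assumes an AVX-512 target):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    hn::Vec512<uint8_t> RepeatedBytePattern() {
      const hn::Full512<uint8_t> d;
      // Every 128-bit block holds the bytes 0..15.
      return Dup128VecFromValues(d, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                 13, 14, 15);
    }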
@@ -480,7 +609,7 @@ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
 
 template <class D, typename T2, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API VFromD<D> Iota(D d, const T2 first) {
-  return detail::Iota0(d) + Set(d,
+  return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
 }
 
 // ================================================== LOGICAL
@@ -502,7 +631,8 @@ template <typename T>
 HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm512_and_si512(a.raw,
+  return BitCast(d, VFromD<decltype(du)>{_mm512_and_si512(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
@@ -519,8 +649,8 @@ template <typename T>
 HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
   const DFromV<decltype(mask)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(
-
+  return BitCast(d, VFromD<decltype(du)>{_mm512_andnot_si512(
+                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
 }
 HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
                              const Vec512<float> mask) {
@@ -537,7 +667,8 @@ template <typename T>
 HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm512_or_si512(a.raw,
+  return BitCast(d, VFromD<decltype(du)>{_mm512_or_si512(BitCast(du, a).raw,
+                                                         BitCast(du, b).raw)});
 }
 
 HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
@@ -553,7 +684,8 @@ template <typename T>
 HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm512_xor_si512(a.raw,
+  return BitCast(d, VFromD<decltype(du)>{_mm512_xor_si512(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
@@ -752,7 +884,7 @@ HWY_API MFromD<D> FirstN(D d, size_t n) {
   m.raw = static_cast<decltype(m.raw)>(_bzhi_u64(all, n));
   return m;
 #else
-  return detail::FirstN<
+  return detail::FirstN<TFromD<D>>(n);
 #endif  // HWY_ARCH_X86_64
 }
 
@@ -790,7 +922,7 @@ HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T,
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec512<T> IfThenElse(const Mask512<T> mask, const Vec512<T> yes,
                              const Vec512<T> no) {
   return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
@@ -840,7 +972,7 @@ HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T,
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec512<T> IfThenElseZero(const Mask512<T> mask, const Vec512<T> yes) {
   return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
 }
@@ -878,7 +1010,7 @@ HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T,
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec512<T> IfThenZeroElse(const Mask512<T> mask, const Vec512<T> no) {
   return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
 }
@@ -896,6 +1028,14 @@ HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
   return IfThenElse(MaskFromVec(v), yes, no);
 }
 
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API Vec512<T> IfNegativeThenNegOrUndefIfZero(Vec512<T> mask, Vec512<T> v) {
+  // AVX3 MaskFromVec only looks at the MSB
+  const DFromV<decltype(v)> d;
+  return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
+}
+
 template <typename T, HWY_IF_FLOAT(T)>
 HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
   // AVX3 MaskFromVec only looks at the MSB
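Note: `IfNegativeThenNegOrUndefIfZero` leans on the `MaskedSubOr` added later in this diff: lanes whose `mask` lane is negative become `0 - v`, and all other lanes keep `v`. A scalar sketch of the per-lane semantics (illustrative only, not library code):

    #include <cstdint>

    // Per-lane reference: negate v where mask is negative, else pass through.
    int32_t RefIfNegativeThenNeg(int32_t mask, int32_t v) {
      return (mask < 0) ? (0 - v) : v;
    }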
@@ -1000,6 +1140,59 @@ HWY_API Vec512<uint64_t> SumsOf8AbsDiff(Vec512<uint8_t> a, Vec512<uint8_t> b) {
   return Vec512<uint64_t>{_mm512_sad_epu8(a.raw, b.raw)};
 }
 
+// ------------------------------ SumsOf4
+namespace detail {
+
+HWY_INLINE Vec512<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
+                                    hwy::SizeTag<1> /*lane_size_tag*/,
+                                    Vec512<uint8_t> v) {
+  const DFromV<decltype(v)> d;
+
+  // _mm512_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
+  // zeroed out and the sums of the 4 consecutive lanes are already in the
+  // even uint16_t lanes of the _mm512_maskz_dbsad_epu8 result.
+  return Vec512<uint32_t>{_mm512_maskz_dbsad_epu8(
+      static_cast<__mmask32>(0x55555555), v.raw, Zero(d).raw, 0)};
+}
+
+// I8->I32 SumsOf4
+// Generic for all vector lengths
+template <class V>
+HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
+    hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const RepartitionToWideX2<decltype(d)> di32;
+
+  // Adjust the values of v to be in the 0..255 range by adding 128 to each lane
+  // of v (which is the same as an bitwise XOR of each i8 lane by 128) and then
+  // bitcasting the Xor result to an u8 vector.
+  const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
+
+  // Need to add -512 to each i32 lane of the result of the
+  // SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj) operation to account
+  // for the adjustment made above.
+  return BitCast(di32, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj)) +
+         Set(di32, int32_t{-512});
+}
+
+}  // namespace detail
+
+// ------------------------------ SumsOfShuffledQuadAbsDiff
+
+#if HWY_TARGET <= HWY_AVX3
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
+static Vec512<uint16_t> SumsOfShuffledQuadAbsDiff(Vec512<uint8_t> a,
+                                                  Vec512<uint8_t> b) {
+  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+  return Vec512<uint16_t>{
+      _mm512_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
+}
+#endif
+
 // ------------------------------ SaturatedAdd
 
 // Returns a + b clamped to the destination range.
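Note: the signed `SumsOf4` reuses the unsigned path by biasing. XOR-ing each i8 lane with the sign bit equals adding 128, so each group of four biased lanes sums to the signed sum plus 4 * 128 = 512, which the final `Set(di32, int32_t{-512})` removes. A scalar sketch of the same arithmetic (illustrative only):

    #include <cstdint>

    // Scalar check of the bias trick: (x ^ 0x80) viewed as u8 equals x + 128.
    int32_t RefSignedSumsOf4(const int8_t x[4]) {
      uint32_t biased = 0;
      for (int i = 0; i < 4; ++i) {
        biased += static_cast<uint8_t>(x[i] ^ 0x80);  // now in 0..255
      }
      return static_cast<int32_t>(biased) - 512;  // undo the 4 * 128 bias
    }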
@@ -1075,27 +1268,6 @@ HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) {
   return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
 }
 
-// These aren't native instructions, they also involve AND with constant.
-#if HWY_HAVE_FLOAT16
-HWY_API Vec512<float16_t> Abs(const Vec512<float16_t> v) {
-  return Vec512<float16_t>{_mm512_abs_ph(v.raw)};
-}
-#endif  // HWY_HAVE_FLOAT16
-
-HWY_API Vec512<float> Abs(const Vec512<float> v) {
-  return Vec512<float>{_mm512_abs_ps(v.raw)};
-}
-HWY_API Vec512<double> Abs(const Vec512<double> v) {
-// Workaround: _mm512_abs_pd expects __m512, so implement it ourselves.
-#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 803
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return And(v, BitCast(d, Set(du, 0x7FFFFFFFFFFFFFFFULL)));
-#else
-  return Vec512<double>{_mm512_abs_pd(v.raw)};
-#endif
-}
-
 // ------------------------------ ShiftLeft
 
 #if HWY_TARGET <= HWY_AVX3_DL
@@ -1643,6 +1815,322 @@ HWY_API Vec512<double> ApproximateReciprocal(Vec512<double> v) {
   return Vec512<double>{_mm512_rcp14_pd(v.raw)};
 }
 
+// ------------------------------ MaskedMinOr
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMaxOr
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedAddOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSubOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMulOr
+
+HWY_API Vec512<float> MaskedMulOr(Vec512<float> no, Mask512<float> m,
+                                  Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec512<double> MaskedMulOr(Vec512<double> no, Mask512<double> m,
+                                   Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MaskedMulOr(Vec512<float16_t> no,
+                                      Mask512<float16_t> m, Vec512<float16_t> a,
+                                      Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedDivOr
+
+HWY_API Vec512<float> MaskedDivOr(Vec512<float> no, Mask512<float> m,
+                                  Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec512<double> MaskedDivOr(Vec512<double> no, Mask512<double> m,
+                                   Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MaskedDivOr(Vec512<float16_t> no,
+                                      Mask512<float16_t> m, Vec512<float16_t> a,
+                                      Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSatAddOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+// ------------------------------ MaskedSatSubOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
 // ------------------------------ Floating-point multiply-add variants
 
 #if HWY_HAVE_FLOAT16
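Note: all of the `Masked*Or` functions above map to AVX-512 merge-masking: each lane where `m` is set receives the result of the operation, and every other lane is copied from `no`. A scalar sketch of the shared per-lane contract (illustrative only):

    #include <algorithm>

    // Per-lane reference for MaskedMinOr; the other Masked*Or variants differ
    // only in the operation applied where m is true.
    float RefMaskedMinOr(float no, bool m, float a, float b) {
      return m ? std::min(a, b) : no;
    }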
@@ -1709,6 +2197,23 @@ HWY_API Vec512<double> NegMulSub(Vec512<double> mul, Vec512<double> x,
   return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
 }
 
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MulAddSub(Vec512<float16_t> mul, Vec512<float16_t> x,
+                                    Vec512<float16_t> sub_or_add) {
+  return Vec512<float16_t>{_mm512_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+HWY_API Vec512<float> MulAddSub(Vec512<float> mul, Vec512<float> x,
+                                Vec512<float> sub_or_add) {
+  return Vec512<float>{_mm512_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
+}
+
+HWY_API Vec512<double> MulAddSub(Vec512<double> mul, Vec512<double> x,
+                                 Vec512<double> sub_or_add) {
+  return Vec512<double>{_mm512_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
+}
+
 // ------------------------------ Floating-point square root
 
 // Full precision square root
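Note: `MulAddSub` exposes the `fmaddsub` instruction family: even-indexed lanes compute `mul * x - sub_or_add` and odd-indexed lanes compute `mul * x + sub_or_add`. A scalar sketch (illustrative only):

    #include <cstddef>

    // Per-lane reference for MulAddSub: subtract on even lanes, add on odd.
    void RefMulAddSub(const float* mul, const float* x, const float* sub_or_add,
                      float* out, size_t n) {
      for (size_t i = 0; i < n; ++i) {
        const float prod = mul[i] * x[i];
        out[i] = (i % 2 == 0) ? prod - sub_or_add[i] : prod + sub_or_add[i];
      }
    }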
@@ -1873,7 +2378,11 @@ HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator==(Vec512<float16_t> a,
                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1907,7 +2416,11 @@ HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator!=(Vec512<float16_t> a,
                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1949,7 +2462,11 @@ HWY_API Mask512<int64_t> operator>(Vec512<int64_t> a, Vec512<int64_t> b) {
 
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator>(Vec512<float16_t> a, Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1965,7 +2482,11 @@ HWY_API Mask512<double> operator>(Vec512<double> a, Vec512<double> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator>=(Vec512<float16_t> a,
                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -2328,11 +2849,41 @@ HWY_API Mask512<T> ExclusiveNeither(Mask512<T> a, Mask512<T> b) {
   return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
 }
 
+template <class D, HWY_IF_LANES_D(D, 64)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+                               MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask64 combined_mask = _mm512_kunpackd(
+      static_cast<__mmask64>(hi.raw), static_cast<__mmask64>(lo.raw));
+#else
+  const __mmask64 combined_mask = static_cast<__mmask64>(
+      ((static_cast<uint64_t>(hi.raw) << 32) | (lo.raw & 0xFFFFFFFFULL)));
+#endif
+
+  return MFromD<D>{combined_mask};
+}
+
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask64(static_cast<__mmask64>(m.raw), 32);
+#else
+  const auto shifted_mask = static_cast<uint64_t>(m.raw) >> 32;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
 
 HWY_API Vec512<int8_t> BroadcastSignBit(Vec512<int8_t> v) {
+#if HWY_TARGET <= HWY_AVX3_DL
+  const Repartition<uint64_t, DFromV<decltype(v)>> du64;
+  return detail::GaloisAffine(v, Set(du64, 0x8080808080808080ull));
+#else
   const DFromV<decltype(v)> d;
   return VecFromMask(v < Zero(d));
+#endif
 }
 
 HWY_API Vec512<int16_t> BroadcastSignBit(Vec512<int16_t> v) {
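Note: when mask intrinsics are unavailable, `CombineMasks` and `UpperHalfOfMask` fall back to plain integer bit packing, with `hi` occupying the upper 32 bits of the 64-bit mask word. A scalar sketch of that layout (illustrative only):

    #include <cstdint>

    // Fallback layout: bit i of lo = lane i, bit i of hi = lane 32 + i.
    uint64_t RefCombineMasks(uint32_t hi, uint32_t lo) {
      return (static_cast<uint64_t>(hi) << 32) | lo;
    }
    uint32_t RefUpperHalfOfMask(uint64_t combined) {
      return static_cast<uint32_t>(combined >> 32);
    }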
@@ -2344,7 +2895,7 @@ HWY_API Vec512<int32_t> BroadcastSignBit(Vec512<int32_t> v) {
 }
 
 HWY_API Vec512<int64_t> BroadcastSignBit(Vec512<int64_t> v) {
-  return
+  return ShiftRight<63>(v);
 }
 
 // ------------------------------ Floating-point classification (Not)
@@ -2410,16 +2961,13 @@ HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
   return VFromD<D>{_mm512_load_si512(aligned)};
 }
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
-HWY_API Vec512<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
+HWY_API Vec512<float16_t> Load(D /* tag */,
+                               const float16_t* HWY_RESTRICT aligned) {
   return Vec512<float16_t>{_mm512_load_ph(aligned)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API Vec512<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
   return Vec512<float>{_mm512_load_ps(aligned)};
@@ -2435,16 +2983,12 @@ HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
 }
 
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API Vec512<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API Vec512<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
   return Vec512<float16_t>{_mm512_loadu_ph(p)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API Vec512<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
   return Vec512<float>{_mm512_loadu_ps(p)};
@@ -2506,8 +3050,9 @@ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                                const TFromD<D>* HWY_RESTRICT p) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  return
-
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_mask_loadu_epi16(
+             BitCast(du, v).raw, m.raw, reinterpret_cast<const uint16_t*>(p))});
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
@@ -2539,10 +3084,12 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, Mask512<double> m, D /* tag */,
 // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> LoadDup128(D
-
+HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
   const Full128<TFromD<D>> d128;
-
+  const RebindToUnsigned<decltype(d128)> du128;
+  return BitCast(d, VFromD<decltype(du)>{_mm512_broadcast_i32x4(
+                        BitCast(du128, LoadU(d128, p)).raw)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> LoadDup128(D /* tag */, const float* HWY_RESTRICT p) {
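Note: per the comment in the hunk, `LoadDup128` reads one 128-bit block and broadcasts it to all four blocks via `_mm512_broadcast_i32x4`, avoiding cross-half moves. A usage sketch (hypothetical table; assumes an AVX-512 target):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    hn::Vec512<uint8_t> BroadcastTable() {
      // One 16-byte table, e.g. as input to a per-block byte shuffle.
      alignas(16) static const uint8_t kTbl[16] = {0, 1, 2,  3,  4,  5,  6,  7,
                                                   8, 9, 10, 11, 12, 13, 14, 15};
      const hn::Full512<uint8_t> d;
      return LoadDup128(d, kTbl);
    }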
@@ -2563,15 +3110,13 @@ HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
   _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
 }
 // bfloat16_t is handled by x86_128-inl.h.
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
 HWY_API void Store(Vec512<float16_t> v, D /* tag */,
                    float16_t* HWY_RESTRICT aligned) {
-#if HWY_HAVE_FLOAT16
   _mm512_store_ph(aligned, v.raw);
-#else
-  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
-#endif
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API void Store(Vec512<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
   _mm512_store_ps(aligned, v.raw);
@@ -2586,15 +3131,13 @@ HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
   _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
 }
 // bfloat16_t is handled by x86_128-inl.h.
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
 HWY_API void StoreU(Vec512<float16_t> v, D /* tag */,
                     float16_t* HWY_RESTRICT p) {
-#if HWY_HAVE_FLOAT16
   _mm512_storeu_ph(p, v.raw);
-#else
-  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API void StoreU(Vec512<float> v, D /* tag */, float* HWY_RESTRICT p) {
@@ -2756,84 +3299,90 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
 namespace detail {
 
 template <int kScale, typename T, HWY_IF_UI32(T)>
-HWY_INLINE Vec512<T>
-
-  return Vec512<T>{_mm512_i32gather_epi32(
+HWY_INLINE Vec512<T> NativeGather512(const T* HWY_RESTRICT base,
+                                     Vec512<int32_t> indices) {
+  return Vec512<T>{_mm512_i32gather_epi32(indices.raw, base, kScale)};
 }
 
 template <int kScale, typename T, HWY_IF_UI64(T)>
-HWY_INLINE Vec512<T>
-
-  return Vec512<T>{_mm512_i64gather_epi64(
+HWY_INLINE Vec512<T> NativeGather512(const T* HWY_RESTRICT base,
+                                     Vec512<int64_t> indices) {
+  return Vec512<T>{_mm512_i64gather_epi64(indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<float>
-
-  return Vec512<float>{_mm512_i32gather_ps(
+HWY_INLINE Vec512<float> NativeGather512(const float* HWY_RESTRICT base,
+                                         Vec512<int32_t> indices) {
+  return Vec512<float>{_mm512_i32gather_ps(indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<double>
-
-  return Vec512<double>{_mm512_i64gather_pd(
+HWY_INLINE Vec512<double> NativeGather512(const double* HWY_RESTRICT base,
+                                          Vec512<int64_t> indices) {
+  return Vec512<double>{_mm512_i64gather_pd(indices.raw, base, kScale)};
 }
 
 template <int kScale, typename T, HWY_IF_UI32(T)>
-HWY_INLINE Vec512<T>
-
-
-  const Full512<T> d;
+HWY_INLINE Vec512<T> NativeMaskedGatherOr512(Vec512<T> no, Mask512<T> m,
+                                             const T* HWY_RESTRICT base,
+                                             Vec512<int32_t> indices) {
   return Vec512<T>{
-      _mm512_mask_i32gather_epi32(
+      _mm512_mask_i32gather_epi32(no.raw, m.raw, indices.raw, base, kScale)};
 }
 
 template <int kScale, typename T, HWY_IF_UI64(T)>
-HWY_INLINE Vec512<T>
-
-
-  const Full512<T> d;
+HWY_INLINE Vec512<T> NativeMaskedGatherOr512(Vec512<T> no, Mask512<T> m,
+                                             const T* HWY_RESTRICT base,
+                                             Vec512<int64_t> indices) {
  return Vec512<T>{
-      _mm512_mask_i64gather_epi64(
+      _mm512_mask_i64gather_epi64(no.raw, m.raw, indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<float>
-
-
-
+HWY_INLINE Vec512<float> NativeMaskedGatherOr512(Vec512<float> no,
+                                                 Mask512<float> m,
+                                                 const float* HWY_RESTRICT base,
+                                                 Vec512<int32_t> indices) {
   return Vec512<float>{
-      _mm512_mask_i32gather_ps(
+      _mm512_mask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<double>
-
-
-  const Full512<double> d;
+HWY_INLINE Vec512<double> NativeMaskedGatherOr512(
+    Vec512<double> no, Mask512<double> m, const double* HWY_RESTRICT base,
+    Vec512<int64_t> indices) {
   return Vec512<double>{
-      _mm512_mask_i64gather_pd(
+      _mm512_mask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
 }
 }  // namespace detail
 
-template <class D, HWY_IF_V_SIZE_D(D, 64)
-HWY_API VFromD<D> GatherOffset(D
-
-
-
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+                               VFromD<RebindToSigned<D>> offsets) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
+  return detail::NativeGather512<1>(base, offsets);
 }
-
-
-
-
-
+
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+                              VFromD<RebindToSigned<D>> indices) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+  return detail::NativeGather512<sizeof(TFromD<D>)>(base, indices);
 }
-
-
-
-
-
+
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
+                                      const TFromD<D>* HWY_RESTRICT base,
+                                      VFromD<RebindToSigned<D>> indices) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+  return detail::NativeMaskedGatherOr512<sizeof(TFromD<D>)>(no, m, base,
+                                                            indices);
 }
 
 HWY_DIAGNOSTICS(pop)
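Note: the rewritten gather helpers make the scaling explicit: `GatherOffset` instantiates `NativeGather512<1>`, so its integer vector holds byte offsets, while `GatherIndex` uses `sizeof(TFromD<D>)`, so it holds element indices; both now assert that the values are non-negative. A scalar sketch of the distinction (illustrative only):

    #include <cstddef>
    #include <cstdint>

    // GatherIndex: out[i] = base[index[i]] (hardware scales by sizeof(T)).
    // GatherOffset instead addresses at byte granularity, i.e. an offset of
    // index[i] * sizeof(int32_t) would load the same element.
    void RefGatherIndex(const int32_t* base, const int32_t* index, int32_t* out,
                        size_t n) {
      for (size_t i = 0; i < n; ++i) out[i] = base[index[i]];
    }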
@@ -2878,7 +3427,7 @@ HWY_API Vec256<T> LowerHalf(Vec512<T> v) {
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  const Twice<decltype(
+  const Twice<decltype(du)> dut;
   return BitCast(d, VFromD<decltype(du)>{
                         _mm512_extracti32x8_epi32(BitCast(dut, v).raw, 1)});
 }
@@ -2920,7 +3469,11 @@ HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
 template <int kBlockIdx, class T, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
 HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
   static_assert(kBlockIdx <= 3, "Invalid block index");
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(Full128<T>(),
+                 Vec128<MakeUnsigned<T>>{
+                     _mm512_extracti32x4_epi32(BitCast(du, v).raw, kBlockIdx)});
 }
 
 template <int kBlockIdx, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
@@ -2955,8 +3508,13 @@ HWY_INLINE Vec512<T> InsertBlock(hwy::SizeTag<0> /* blk_idx_tag */, Vec512<T> v,
 template <size_t kBlockIdx, typename T>
 HWY_INLINE Vec512<T> InsertBlock(hwy::SizeTag<kBlockIdx> /* blk_idx_tag */,
                                  Vec512<T> v, Vec128<T> blk_to_insert) {
-
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  const Full128<MakeUnsigned<T>> du_blk_to_insert;
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_inserti32x4(
+             BitCast(du, v).raw, BitCast(du_blk_to_insert, blk_to_insert).raw,
+             static_cast<int>(kBlockIdx & 3))});
 }
 
 template <size_t kBlockIdx, hwy::EnableIf<kBlockIdx != 0>* = nullptr>
@@ -2992,7 +3550,7 @@ HWY_API T GetLane(const Vec512<T> v) {
 
 // ------------------------------ ZeroExtendVector
 
-template <class D, HWY_IF_V_SIZE_D(D, 64),
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
 #if HWY_HAVE_ZEXT  // See definition/comment in x86_256-inl.h.
   (void)d;
@@ -3042,11 +3600,13 @@ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
     DTo d_to, DFrom d_from, VFromD<DFrom> v) {
   const Repartition<uint8_t, decltype(d_from)> du8_from;
   const auto vu8 = BitCast(du8_from, v);
+  const RebindToUnsigned<decltype(d_to)> du_to;
 #if HWY_HAVE_ZEXT
-  (
-
+  return BitCast(d_to,
+                 VFromD<decltype(du_to)>{_mm512_zextsi128_si512(vu8.raw)});
 #else
-  return
+  return BitCast(d_to, VFromD<decltype(du_to)>{
+                           _mm512_inserti32x4(Zero(du_to).raw, vu8.raw, 0)});
 #endif
 }

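
Both branches here establish the same postcondition: the low 128 bits of the result are the input and every remaining lane is zero; HWY_HAVE_ZEXT merely selects the dedicated zext intrinsic when the compiler provides it. A scalar sketch of that shared postcondition (illustrative only, not from the package):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Model a 512-bit register as eight u64 lanes.
      uint64_t out[8] = {0, 0, 0, 0, 0, 0, 0, 0};  // Zero(du_to), fallback path
      const uint64_t lo[2] = {0x1111u, 0x2222u};   // the 128-bit input
      out[0] = lo[0];  // "insert" the low 128 bits
      out[1] = lo[1];
      // Postcondition of both the zext and the insert-into-zero paths:
      assert(out[0] == 0x1111u && out[1] == 0x2222u);
      for (int i = 2; i < 8; ++i) assert(out[i] == 0);
      return 0;
    }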
@@ -3096,7 +3656,8 @@ HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
   const Half<decltype(du)> duh;
   const __m512i lo512 = ZeroExtendVector(du, BitCast(duh, lo)).raw;
-  return
+  return BitCast(d, VFromD<decltype(du)>{
+                        _mm512_inserti32x8(lo512, BitCast(duh, hi).raw, 1)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
@@ -3181,7 +3742,11 @@ HWY_API Vec512<double> Broadcast(const Vec512<double> v) {
 template <int kBlockIdx, class T>
 HWY_API Vec512<T> BroadcastBlock(Vec512<T> v) {
   static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index");
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+             BitCast(du, v).raw, BitCast(du, v).raw, 0x55 * kBlockIdx)});
 }

 template <int kBlockIdx>
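
The `0x55 * kBlockIdx` immediate is worth unpacking: `_mm512_shuffle_i32x4` consumes a 2-bit source selector per 128-bit destination block, and multiplying by 0x55 (binary 01010101) replicates the block index into all four selector fields. A compile-time check (illustrative, not part of the package):

    #include <cstdint>

    // 0x55 * k writes k into each of the four 2-bit fields of the immediate.
    constexpr uint8_t BroadcastBlockImm(int k) {
      return static_cast<uint8_t>(0x55 * k);
    }

    static_assert(BroadcastBlockImm(0) == 0x00, "every block selects block 0");
    static_assert(BroadcastBlockImm(1) == 0x55, "every block selects block 1");
    static_assert(BroadcastBlockImm(2) == 0xAA, "every block selects block 2");
    static_assert(BroadcastBlockImm(3) == 0xFF, "every block selects block 3");

    int main() { return 0; }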
@@ -3209,7 +3774,10 @@ HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
 template <class T, HWY_IF_T_SIZE(T, 2)>
 HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                    Vec512<T> v) {
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm512_broadcastw_epi16(
+                        ResizeBitCast(Full128<uint16_t>(), v).raw)});
 }

 template <class T, HWY_IF_UI32(T)>
@@ -3671,8 +4239,11 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {

 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatLowerLower(D
-
+HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BABA)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ConcatLowerLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3686,8 +4257,11 @@ HWY_API Vec512<double> ConcatLowerLower(D /* tag */, Vec512<double> hi,

 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperUpper(D
-
+HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_DCDC)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ConcatUpperUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3701,8 +4275,11 @@ HWY_API Vec512<double> ConcatUpperUpper(D /* tag */, Vec512<double> hi,

 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatLowerUpper(D
-
+HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BADC)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ConcatLowerUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3716,11 +4293,13 @@ HWY_API Vec512<double> ConcatLowerUpper(D /* tag */, Vec512<double> hi,

 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperLower(D
+HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
   // There are no imm8 blend in AVX512. Use blend16 because 32-bit masks
   // are efficiently loaded from 32-bit regs.
   const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
-
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm512_mask_blend_epi16(
+                        mask, BitCast(du, hi).raw, BitCast(du, lo).raw)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ConcatUpperLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
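
About that `__mmask32` value: in `_mm512_mask_blend_epi16(mask, a, b)`, lane i comes from `b` when mask bit i is set. With mask 0x0000FFFF, a = hi and b = lo, the low 16 u16 lanes (the lower 256 bits) come from `lo` and the rest from `hi`, which is exactly the "outer halves" result. A scalar model (illustrative only, not part of the package):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t mask = 0x0000FFFFu;
      uint16_t hi[32], lo[32], out[32];
      for (int i = 0; i < 32; ++i) {
        hi[i] = static_cast<uint16_t>(1000 + i);
        lo[i] = static_cast<uint16_t>(i);
      }
      // Lane i <- (mask bit i set) ? lo[i] : hi[i], as in mask_blend_epi16.
      for (int i = 0; i < 32; ++i) {
        out[i] = ((mask >> i) & 1u) ? lo[i] : hi[i];
      }
      assert(out[0] == 0 && out[15] == 15);        // lower 256 bits from lo
      assert(out[16] == 1016 && out[31] == 1031);  // upper 256 bits from hi
      return 0;
    }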
@@ -3881,6 +4460,130 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   return VFromD<D>{_mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
 }

+// ------------------------------ InterleaveWholeLower
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_TARGET <= HWY_AVX3_DL
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint8_t kIdx[64] = {
+      0,  64, 1,  65, 2,  66, 3,  67, 4,  68, 5,  69, 6,  70, 7,  71,
+      8,  72, 9,  73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79,
+      16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87,
+      24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95};
+  return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+#else
+  alignas(64) static constexpr uint64_t kIdx2[8] = {0, 1, 8, 9, 2, 3, 10, 11};
+  const Repartition<uint64_t, decltype(d)> du64;
+  return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
+                                             Load(du64, kIdx2).raw,
+                                             InterleaveUpper(d, a, b).raw)};
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint16_t kIdx[32] = {
+      0, 32, 1, 33, 2,  34, 3,  35, 4,  36, 5,  37, 6,  38, 7,  39,
+      8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
+             BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
+                                                    4, 20, 5, 21, 6, 22, 7, 23};
+  return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
+                                                    4, 20, 5, 21, 6, 22, 7, 23};
+  return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+  return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+  return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+// ------------------------------ InterleaveWholeUpper
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_TARGET <= HWY_AVX3_DL
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint8_t kIdx[64] = {
+      32, 96,  33, 97,  34, 98,  35, 99,  36, 100, 37, 101, 38, 102, 39, 103,
+      40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111,
+      48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119,
+      56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127};
+  return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+#else
+  alignas(64) static constexpr uint64_t kIdx2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
+  const Repartition<uint64_t, decltype(d)> du64;
+  return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
+                                             Load(du64, kIdx2).raw,
+                                             InterleaveUpper(d, a, b).raw)};
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint16_t kIdx[32] = {
+      16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+      24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
+             BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint32_t kIdx[16] = {
+      8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+  return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint32_t kIdx[16] = {
+      8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+  return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+  return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+  return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
 // ------------------------------ DupEven (InterleaveLower)

 template <typename T, HWY_IF_T_SIZE(T, 4)>
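
The new InterleaveWholeLower differs from InterleaveLower in that it interleaves across the whole vector rather than within each 128-bit block: result lane 2i is a's lane i and lane 2i+1 is b's lane i, taken from the lower half of the inputs (hence the kIdx tables of the form 0, N, 1, N+1, ...). A scalar model of the semantics (illustrative, not part of the package):

    #include <cassert>

    int main() {
      // Model 8-lane vectors; InterleaveWholeLower consumes the low 4 lanes
      // of each input and alternates them across the full output.
      const int a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
      const int b[8] = {10, 11, 12, 13, 14, 15, 16, 17};
      int out[8];
      for (int i = 0; i < 8; ++i) {
        out[i] = (i % 2 == 0) ? a[i / 2] : b[i / 2];
      }
      assert(out[0] == 0 && out[1] == 10);  // a0, b0
      assert(out[6] == 3 && out[7] == 13);  // a3, b3
      return 0;
    }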
@@ -3926,7 +4629,11 @@ HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {

 template <typename T>
 HWY_API Vec512<T> OddEvenBlocks(Vec512<T> odd, Vec512<T> even) {
-
+  const DFromV<decltype(odd)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_mask_blend_epi64(
+             __mmask8{0x33u}, BitCast(du, odd).raw, BitCast(du, even).raw)});
 }

 HWY_API Vec512<float> OddEvenBlocks(Vec512<float> odd, Vec512<float> even) {
@@ -3943,7 +4650,11 @@ HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) {

 template <typename T>
 HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_CDAB)});
 }

 HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) {
@@ -3957,8 +4668,11 @@ HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
 // ------------------------------ ReverseBlocks

 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ReverseBlocks(D
-
+HWY_API VFromD<D> ReverseBlocks(D d, VFromD<D> v) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_ABCD)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
@@ -3974,7 +4688,10 @@ HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
 // Both full
 template <typename T, typename TI>
 HWY_API Vec512<TI> TableLookupBytes(Vec512<T> bytes, Vec512<TI> indices) {
-
+  const DFromV<decltype(indices)> d;
+  return BitCast(d, Vec512<uint8_t>{_mm512_shuffle_epi8(
+                        BitCast(Full512<uint8_t>(), bytes).raw,
+                        BitCast(Full512<uint8_t>(), indices).raw)});
 }

 // Partial index vector
@@ -4632,6 +5349,15 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<float16_t> v) {
 #endif  // HWY_HAVE_FLOAT16
 }

+#if HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec128<float16_t> v) {
+  return VFromD<D>{_mm512_cvtph_pd(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> PromoteTo(D df32, Vec256<bfloat16_t> v) {
   const Rebind<uint16_t, decltype(df32)> du16;
@@ -4666,8 +5392,7 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
-  return VFromD<D>{
-      _mm512_maskz_cvttps_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+  return VFromD<D>{_mm512_maskz_cvttps_epu64(Not(MaskFromVec(v)).raw, v.raw)};
 }

 // ------------------------------ Demotions (full -> part w/ narrow lanes)
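
The `_knot_mask8` calls are replaced by the portable `Not(MaskFromVec(v)).raw` throughout, but the intent is unchanged: MaskFromVec of a float vector is built from the sign bits, so its complement selects exactly the non-negative lanes, and the zero-masked conversion zeroes the others instead of letting the CPU produce its out-of-range sentinel. A one-lane scalar model (illustrative only, not part of the package):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // One lane of the maskz pattern: convert only if the sign bit is clear,
    // otherwise produce 0 (what the zero-masked conversion does).
    static uint64_t FloatToU64NonNeg(float f) {
      return std::signbit(f) ? 0u : static_cast<uint64_t>(f);
    }

    int main() {
      assert(FloatToU64NonNeg(3.9f) == 3u);   // truncating conversion
      assert(FloatToU64NonNeg(-1.0f) == 0u);  // negative lanes become 0
      return 0;
    }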
@@ -4709,8 +5434,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
   const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
   const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};

-
-  const auto idx32 = LoadDup128(du32, kLanes);
+  const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
   const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
   return LowerHalf(LowerHalf(fixed));
 }
@@ -4745,9 +5469,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
   const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
   const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};

-
-      0, 4, 8, 12, 0, 4, 8, 12};
-  const auto idx32 = LoadDup128(du32, kLanes);
+  const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
   const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
   return LowerHalf(LowerHalf(fixed));
 }
@@ -4779,32 +5501,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
   return VFromD<D>{_mm512_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
   return VFromD<D>{_mm512_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
   return VFromD<D>{_mm512_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
 }

@@ -4822,14 +5529,23 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<uint64_t> v) {
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API VFromD<D> DemoteTo(D
+HWY_API VFromD<D> DemoteTo(D df16, Vec512<float> v) {
   // Work around warnings in the intrinsic definitions (passing -1 as a mask).
   HWY_DIAGNOSTICS(push)
   HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
-
+  const RebindToUnsigned<decltype(df16)> du16;
+  return BitCast(
+      df16, VFromD<decltype(du16)>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
   HWY_DIAGNOSTICS(pop)
 }

+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec512<double> v) {
+  return VFromD<D>{_mm512_cvtpd_ph(v.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
 HWY_API VFromD<D> DemoteTo(D dbf16, Vec512<float> v) {
   // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
@@ -4943,8 +5659,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
-  return VFromD<D>{
-      _mm512_maskz_cvttpd_epu32(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+  return VFromD<D>{_mm512_maskz_cvttpd_epu32(Not(MaskFromVec(v)).raw, v.raw)};
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
@@ -4962,13 +5677,12 @@ HWY_API Vec128<uint8_t> U8FromU32(const Vec512<uint32_t> v) {
   const DFromV<decltype(v)> d32;
   // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
   // lowest 4 bytes.
-
-
-  const auto quads = TableLookupBytes(v,
+  const VFromD<decltype(d32)> v8From32 =
+      Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
+  const auto quads = TableLookupBytes(v, v8From32);
   // Gather the lowest 4 bytes of 4 128-bit blocks.
-
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
+  const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
+  const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
   return LowerHalf(LowerHalf(bytes));
 }

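
The magic constant 0x0C080400 is simply the byte-shuffle indices {0, 4, 8, 12} packed into one little-endian u32, so TableLookupBytes picks the low byte of each of the four u32 lanes in a 128-bit block; the ~0u lanes have the high bit set in every byte, which the byte shuffle turns into zero bytes that are later discarded. A quick check (illustrative, not part of the package; assumes a little-endian host, as on x86):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      const uint32_t packed = 0x0C080400u;
      uint8_t idx[4];
      std::memcpy(idx, &packed, sizeof(idx));  // little-endian byte order
      assert(idx[0] == 0x00 && idx[1] == 0x04);
      assert(idx[2] == 0x08 && idx[3] == 0x0C);
      return 0;
    }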
@@ -4979,10 +5693,9 @@ HWY_API VFromD<D> TruncateTo(D d, const Vec512<uint64_t> v) {
 #if HWY_TARGET <= HWY_AVX3_DL
   (void)d;
   const Full512<uint8_t> d8;
-
-      0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)};
+  const VFromD<decltype(d8)> v8From64 = Dup128VecFromValues(
+      d8, 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56);
+  const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From64.raw, v.raw)};
   return LowerHalf(LowerHalf(LowerHalf(bytes)));
 #else
   const Full512<uint32_t> d32;
@@ -5018,21 +5731,19 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
 HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint32_t> v) {
 #if HWY_TARGET <= HWY_AVX3_DL
   const Full512<uint8_t> d8;
-
-      0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi8(LoadDup128(d8, k8From32).raw, v.raw)};
+  const VFromD<decltype(d8)> v8From32 = Dup128VecFromValues(
+      d8, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+  const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From32.raw, v.raw)};
 #else
   const Full512<uint32_t> d32;
   // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
   // lowest 4 bytes.
-
-
-  const auto quads = TableLookupBytes(v,
+  const VFromD<decltype(d32)> v8From32 =
+      Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
+  const auto quads = TableLookupBytes(v, v8From32);
   // Gather the lowest 4 bytes of 4 128-bit blocks.
-
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
+  const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
+  const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
 #endif
   return LowerHalf(LowerHalf(bytes));
 }
@@ -5061,9 +5772,9 @@ HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint16_t> v) {
       _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
 #else
   const Full512<uint32_t> d32;
-
-      0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u
-  const auto quads = TableLookupBytes(v,
+  const VFromD<decltype(d32)> v16From32 = Dup128VecFromValues(
+      d32, 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u);
+  const auto quads = TableLookupBytes(v, v16From32);
   alignas(64) static constexpr uint32_t kIndex32[16] = {
       0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
   const Vec512<uint8_t> bytes{
@@ -5112,6 +5823,10 @@ HWY_API VFromD<D> ConvertTo(D d, Vec512<float16_t> v) {
   return detail::FixConversionOverflow(d, v,
                                        VFromD<D>{_mm512_cvttph_epi16(v.raw)});
 }
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
+HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
+  return VFromD<D>{_mm512_maskz_cvttph_epu16(Not(MaskFromVec(v)).raw, v.raw)};
+}
 #endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> ConvertTo(D d, Vec512<float> v) {
@@ -5125,13 +5840,11 @@ HWY_API VFromD<D> ConvertTo(D di, Vec512<double> v) {
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
 HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-  return VFromD<DU>{
-      _mm512_maskz_cvttps_epu32(_knot_mask16(MaskFromVec(v).raw), v.raw)};
+  return VFromD<DU>{_mm512_maskz_cvttps_epu32(Not(MaskFromVec(v)).raw, v.raw)};
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
 HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-  return VFromD<DU>{
-      _mm512_maskz_cvttpd_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+  return VFromD<DU>{_mm512_maskz_cvttpd_epu64(Not(MaskFromVec(v)).raw, v.raw)};
 }

 HWY_API Vec512<int32_t> NearestInt(const Vec512<float> v) {
@@ -5198,14 +5911,14 @@ template <uint8_t kRcon>
 HWY_API Vec512<uint8_t> AESKeyGenAssist(Vec512<uint8_t> v) {
   const Full512<uint8_t> d;
 #if HWY_TARGET <= HWY_AVX3_DL
-
-      0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0
-
-      0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12
+  const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
+      d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
+  const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
+      d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
   const Repartition<uint32_t, decltype(d)> du32;
   const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
-  const auto sub_word_result = AESLastRound(w13,
-  return TableLookupBytes(sub_word_result,
+  const auto sub_word_result = AESLastRound(w13, rconXorMask);
+  return TableLookupBytes(sub_word_result, rotWordShuffle);
 #else
   const Half<decltype(d)> d2;
   return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
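
Per the identifiers in this hunk, the DL path appears to use AESLastRound against the rconXorMask vector to realize the SubWord and Rcon-XOR steps of AES key generation, with the final TableLookupBytes applying the byte rotation (RotWord). A scalar model of RotWord itself (illustrative only, not from the package):

    #include <cassert>
    #include <cstdint>

    // AES key expansion's RotWord: rotate the four bytes of a 32-bit word by
    // one position, which rotWordShuffle realizes as a byte shuffle.
    static uint32_t RotWord(uint32_t w) { return (w >> 8) | (w << 24); }

    int main() {
      assert(RotWord(0x0A0B0C0Du) == 0x0D0A0B0Cu);
      return 0;
    }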
@@ -5253,6 +5966,28 @@ HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) {

 // ================================================== MISC

+// ------------------------------ SumsOfAdjQuadAbsDiff (Broadcast,
+// SumsOfAdjShufQuadAbsDiff)
+
+template <int kAOffset, int kBOffset>
+static Vec512<uint16_t> SumsOfAdjQuadAbsDiff(Vec512<uint8_t> a,
+                                             Vec512<uint8_t> b) {
+  static_assert(0 <= kAOffset && kAOffset <= 1,
+                "kAOffset must be between 0 and 1");
+  static_assert(0 <= kBOffset && kBOffset <= 3,
+                "kBOffset must be between 0 and 3");
+
+  const DFromV<decltype(a)> d;
+  const RepartitionToWideX2<decltype(d)> du32;
+
+  // While AVX3 does not have a _mm512_mpsadbw_epu8 intrinsic, the
+  // SumsOfAdjQuadAbsDiff operation is implementable for 512-bit vectors on
+  // AVX3 using SumsOfShuffledQuadAbsDiff and U32 Broadcast.
+  return SumsOfShuffledQuadAbsDiff<kAOffset + 2, kAOffset + 1, kAOffset + 1,
+                                   kAOffset>(
+      a, BitCast(d, Broadcast<kBOffset>(BitCast(du32, b))));
+}
+
 // ------------------------------ I32/I64 SaturatedAdd (MaskFromVec)

 HWY_API Vec512<int32_t> SaturatedAdd(Vec512<int32_t> a, Vec512<int32_t> b) {
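
The comment in the hunk explains why the emulation exists; for readers unfamiliar with mpsadbw-style operations, each u16 output sums the absolute differences between a 4-byte window of `a` and a fixed 4-byte group of `b`. A scalar model of one output lane (illustrative, not part of the package):

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    // One output lane: sum |a[j] - b4[j]| over a 4-byte window.
    static uint16_t QuadAbsDiff(const uint8_t* a, const uint8_t* b4) {
      uint16_t sum = 0;
      for (int j = 0; j < 4; ++j) {
        sum = static_cast<uint16_t>(
            sum + std::abs(static_cast<int>(a[j]) - static_cast<int>(b4[j])));
      }
      return sum;
    }

    int main() {
      const uint8_t a[4] = {10, 20, 30, 40};
      const uint8_t b[4] = {12, 18, 30, 45};
      assert(QuadAbsDiff(a, b) == 2 + 2 + 0 + 5);
      return 0;
    }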
@@ -6165,7 +6900,10 @@ namespace detail {
 // Type-safe wrapper.
 template <_MM_PERM_ENUM kPerm, typename T>
 Vec512<T> Shuffle128(const Vec512<T> lo, const Vec512<T> hi) {
-
+  const DFromV<decltype(lo)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, VFromD<decltype(du)>{_mm512_shuffle_i64x2(
+                        BitCast(du, lo).raw, BitCast(du, hi).raw, kPerm)});
 }
 template <_MM_PERM_ENUM kPerm>
 Vec512<float> Shuffle128(const Vec512<float> lo, const Vec512<float> hi) {
@@ -6345,7 +7083,7 @@ HWY_API Mask512<T> SetOnlyFirst(Mask512<T> mask) {
       static_cast<typename Mask512<T>::Raw>(detail::AVX3Blsi(mask.raw))};
 }

-// ------------------------------ Shl (
+// ------------------------------ Shl (Dup128VecFromValues)

 HWY_API Vec512<uint16_t> operator<<(Vec512<uint16_t> v, Vec512<uint16_t> bits) {
   return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
@@ -6356,13 +7094,15 @@ HWY_API Vec512<uint8_t> operator<<(Vec512<uint8_t> v, Vec512<uint8_t> bits) {
   const DFromV<decltype(v)> d;
 #if HWY_TARGET <= HWY_AVX3_DL
   // kMask[i] = 0xFF >> i
-
-      0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01,
+  const VFromD<decltype(d)> masks =
+      Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
+                          0, 0, 0, 0, 0, 0, 0);
   // kShl[i] = 1 << i
-
-
-
-
+  const VFromD<decltype(d)> shl =
+      Dup128VecFromValues(d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0,
+                          0, 0, 0, 0, 0, 0, 0);
+  v = And(v, TableLookupBytes(masks, bits));
+  const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
   return VFromD<decltype(d)>{_mm512_gf2p8mul_epi8(v.raw, mul.raw)};
 #else
   const Repartition<uint16_t, decltype(d)> dw;
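
The AVX3_DL branch computes per-lane `1 << shift` via TableLookupBytes into `shl`, then multiplies in GF(2^8) with `_mm512_gf2p8mul_epi8`. Masking `v` with `0xFF >> shift` first is what makes this exact: the carry-less product then never exceeds 8 bits, so no polynomial reduction occurs and it equals the ordinary byte shift. A scalar verification of that identity (illustrative, not part of the package):

    #include <cassert>
    #include <cstdint>

    // Carry-less multiply in GF(2^8) modulo the AES polynomial 0x11B,
    // mirroring _mm512_gf2p8mul_epi8 on a single byte.
    static uint8_t Gf8Mul(uint8_t a, uint8_t b) {
      uint16_t acc = 0;
      for (int i = 0; i < 8; ++i) {
        if ((b >> i) & 1) acc = static_cast<uint16_t>(acc ^ (uint16_t(a) << i));
      }
      for (int i = 15; i >= 8; --i) {  // reduce degree-8..15 terms
        if ((acc >> i) & 1) {
          acc = static_cast<uint16_t>(acc ^ (0x11Bu << (i - 8)));
        }
      }
      return static_cast<uint8_t>(acc);
    }

    int main() {
      for (int x = 0; x < 256; ++x) {
        for (int s = 0; s < 8; ++s) {
          const uint8_t masked = static_cast<uint8_t>(x & (0xFF >> s));
          // Masked input times 2^s in GF(2^8) == plain left shift of the byte.
          assert(Gf8Mul(masked, static_cast<uint8_t>(1 << s)) ==
                 static_cast<uint8_t>(x << s));
        }
      }
      return 0;
    }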
@@ -6570,161 +7310,16 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(

 // ------------------------------ Reductions

-
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_epi32(v.raw);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_epi64(v.raw);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw));
-}
-#if HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_ph(v.raw);
-}
-#endif  // HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_ps(v.raw);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_pd(v.raw);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
-HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto sum = ReduceSum(d32, even + odd);
-  return static_cast<uint16_t>(sum);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
-HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto sum = ReduceSum(d32, even + odd);
-  return static_cast<int16_t>(sum);
-}
-
-// Returns the sum in each lane.
-template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
-  return Set(d, ReduceSum(d, v));
-}
+namespace detail {

-//
-template <class D, HWY_IF_V_SIZE_D(D, 64)
-
-
-
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_epi64(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_epu32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_epu64(v.raw));
-}
-#if HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_ph(v.raw));
-}
-#endif  // HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_ps(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_pd(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(d32, Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(d32, Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+// Used by generic_ops-inl
+template <class D, class Func, HWY_IF_V_SIZE_D(D, 64)>
+HWY_INLINE VFromD<D> ReduceAcrossBlocks(D d, Func f, VFromD<D> v) {
+  v = f(v, SwapAdjacentBlocks(v));
+  return f(v, ReverseBlocks(d, v));
 }

-//
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_epi32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_epi64(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_epu32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_epu64(v.raw));
-}
-#if HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_ph(v.raw));
-}
-#endif  // HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_ps(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_pd(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(d32, Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(d32, Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
+}  // namespace detail

 // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

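
The replacement of the per-type ReduceSum/MinOfLanes/MaxOfLanes bodies by detail::ReduceAcrossBlocks above works because two folds suffice for four 128-bit blocks: after f(v, SwapAdjacentBlocks(v)) each block holds the combine of a pair, and after f(v, ReverseBlocks(d, v)) every block holds the combine of all four. A scalar model with f = + (illustrative, not part of the package):

    #include <cassert>

    int main() {
      // One value per 128-bit block.
      const int v[4] = {1, 2, 3, 4};
      int s[4], r[4];
      // f(v, SwapAdjacentBlocks(v)): pairwise sums {3, 3, 7, 7}.
      for (int i = 0; i < 4; ++i) s[i] = v[i] + v[i ^ 1];
      // f(s, ReverseBlocks(s)): every block now holds the total.
      for (int i = 0; i < 4; ++i) r[i] = s[i] + s[3 - i];
      for (int i = 0; i < 4; ++i) assert(r[i] == 1 + 2 + 3 + 4);
      return 0;
    }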