@img/sharp-libvips-dev 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
|
@@ -193,6 +193,25 @@ HWY_INLINE __m512i BitCastToInteger(__m512d v) {
|
|
|
193
193
|
return _mm512_castpd_si512(v);
|
|
194
194
|
}
|
|
195
195
|
|
|
196
|
+
#if HWY_AVX3_HAVE_F32_TO_BF16C
|
|
197
|
+
HWY_INLINE __m512i BitCastToInteger(__m512bh v) {
|
|
198
|
+
// Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
|
|
199
|
+
// bit cast a __m512bh to a __m512i as there is currently no intrinsic
|
|
200
|
+
// available (as of GCC 13 and Clang 17) that can bit cast a __m512bh vector
|
|
201
|
+
// to a __m512i vector
|
|
202
|
+
|
|
203
|
+
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
|
|
204
|
+
// On GCC or Clang, use reinterpret_cast to bit cast a __m512bh to a __m512i
|
|
205
|
+
return reinterpret_cast<__m512i>(v);
|
|
206
|
+
#else
|
|
207
|
+
// On MSVC, use BitCastScalar to bit cast a __m512bh to a __m512i as MSVC does
|
|
208
|
+
// not allow reinterpret_cast, static_cast, or a C-style cast to be used to
|
|
209
|
+
// bit cast from one AVX vector type to a different AVX vector type
|
|
210
|
+
return BitCastScalar<__m512i>(v);
|
|
211
|
+
#endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
|
|
212
|
+
}
|
|
213
|
+
#endif // HWY_AVX3_HAVE_F32_TO_BF16C
|
|
214
|
+
|
|
196
215
|
template <typename T>
|
|
197
216
|
HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
|
|
198
217
|
return Vec512<uint8_t>{BitCastToInteger(v.raw)};
|
|
@@ -698,45 +717,61 @@ HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
|
|
|
698
717
|
// ------------------------------ Xor3
|
|
699
718
|
template <typename T>
|
|
700
719
|
HWY_API Vec512<T> Xor3(Vec512<T> x1, Vec512<T> x2, Vec512<T> x3) {
|
|
720
|
+
#if !HWY_IS_MSAN
|
|
701
721
|
const DFromV<decltype(x1)> d;
|
|
702
722
|
const RebindToUnsigned<decltype(d)> du;
|
|
703
723
|
using VU = VFromD<decltype(du)>;
|
|
704
724
|
const __m512i ret = _mm512_ternarylogic_epi64(
|
|
705
725
|
BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
|
|
706
726
|
return BitCast(d, VU{ret});
|
|
727
|
+
#else
|
|
728
|
+
return Xor(x1, Xor(x2, x3));
|
|
729
|
+
#endif
|
|
707
730
|
}
|
|
708
731
|
|
|
709
732
|
// ------------------------------ Or3
|
|
710
733
|
template <typename T>
|
|
711
734
|
HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) {
|
|
735
|
+
#if !HWY_IS_MSAN
|
|
712
736
|
const DFromV<decltype(o1)> d;
|
|
713
737
|
const RebindToUnsigned<decltype(d)> du;
|
|
714
738
|
using VU = VFromD<decltype(du)>;
|
|
715
739
|
const __m512i ret = _mm512_ternarylogic_epi64(
|
|
716
740
|
BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
|
|
717
741
|
return BitCast(d, VU{ret});
|
|
742
|
+
#else
|
|
743
|
+
return Or(o1, Or(o2, o3));
|
|
744
|
+
#endif
|
|
718
745
|
}
|
|
719
746
|
|
|
720
747
|
// ------------------------------ OrAnd
|
|
721
748
|
template <typename T>
|
|
722
749
|
HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
|
|
750
|
+
#if !HWY_IS_MSAN
|
|
723
751
|
const DFromV<decltype(o)> d;
|
|
724
752
|
const RebindToUnsigned<decltype(d)> du;
|
|
725
753
|
using VU = VFromD<decltype(du)>;
|
|
726
754
|
const __m512i ret = _mm512_ternarylogic_epi64(
|
|
727
755
|
BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
|
|
728
756
|
return BitCast(d, VU{ret});
|
|
757
|
+
#else
|
|
758
|
+
return Or(o, And(a1, a2));
|
|
759
|
+
#endif
|
|
729
760
|
}
|
|
730
761
|
|
|
731
762
|
// ------------------------------ IfVecThenElse
|
|
732
763
|
template <typename T>
|
|
733
764
|
HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
|
|
765
|
+
#if !HWY_IS_MSAN
|
|
734
766
|
const DFromV<decltype(yes)> d;
|
|
735
767
|
const RebindToUnsigned<decltype(d)> du;
|
|
736
768
|
using VU = VFromD<decltype(du)>;
|
|
737
769
|
return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
|
|
738
770
|
BitCast(du, yes).raw,
|
|
739
771
|
BitCast(du, no).raw, 0xCA)});
|
|
772
|
+
#else
|
|
773
|
+
return IfThenElse(MaskFromVec(mask), yes, no);
|
|
774
|
+
#endif
|
|
740
775
|
}
|
|
741
776
|
|
|
742
777
|
// ------------------------------ Operator overloads (internal-only if float)
|
|
@@ -1036,12 +1071,6 @@ HWY_API Vec512<T> IfNegativeThenNegOrUndefIfZero(Vec512<T> mask, Vec512<T> v) {
|
|
|
1036
1071
|
return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
|
|
1037
1072
|
}
|
|
1038
1073
|
|
|
1039
|
-
template <typename T, HWY_IF_FLOAT(T)>
|
|
1040
|
-
HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
|
|
1041
|
-
// AVX3 MaskFromVec only looks at the MSB
|
|
1042
|
-
return IfThenZeroElse(MaskFromVec(v), v);
|
|
1043
|
-
}
|
|
1044
|
-
|
|
1045
1074
|
// ================================================== ARITHMETIC
|
|
1046
1075
|
|
|
1047
1076
|
// ------------------------------ Addition
|
|
@@ -1417,14 +1446,45 @@ HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
|
|
|
1417
1446
|
|
|
1418
1447
|
// ------------------------------ RotateRight
|
|
1419
1448
|
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1449
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
1450
|
+
// U8 RotateRight is generic for all vector lengths on AVX3_DL
|
|
1451
|
+
template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
|
|
1452
|
+
HWY_API V RotateRight(V v) {
|
|
1453
|
+
static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
|
|
1454
|
+
|
|
1455
|
+
const Repartition<uint64_t, DFromV<V>> du64;
|
|
1424
1456
|
if (kBits == 0) return v;
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1457
|
+
|
|
1458
|
+
constexpr uint64_t kShrMatrix =
|
|
1459
|
+
(0x0102040810204080ULL << kBits) &
|
|
1460
|
+
(0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
|
|
1461
|
+
constexpr int kShlBits = (-kBits) & 7;
|
|
1462
|
+
constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
|
|
1463
|
+
(0x0101010101010101ULL * (0xFF >> kShlBits));
|
|
1464
|
+
constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
|
|
1465
|
+
|
|
1466
|
+
return detail::GaloisAffine(v, Set(du64, kMatrix));
|
|
1467
|
+
}
|
|
1468
|
+
#else // HWY_TARGET > HWY_AVX3_DL
|
|
1469
|
+
template <int kBits>
|
|
1470
|
+
HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
|
|
1471
|
+
static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
|
|
1472
|
+
if (kBits == 0) return v;
|
|
1473
|
+
// AVX3 does not support 8-bit.
|
|
1474
|
+
return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
|
|
1475
|
+
}
|
|
1476
|
+
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
1477
|
+
|
|
1478
|
+
template <int kBits>
|
|
1479
|
+
HWY_API Vec512<uint16_t> RotateRight(const Vec512<uint16_t> v) {
|
|
1480
|
+
static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
|
|
1481
|
+
if (kBits == 0) return v;
|
|
1482
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
1483
|
+
return Vec512<uint16_t>{_mm512_shrdi_epi16(v.raw, v.raw, kBits)};
|
|
1484
|
+
#else
|
|
1485
|
+
// AVX3 does not support 16-bit.
|
|
1486
|
+
return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
|
|
1487
|
+
#endif
|
|
1428
1488
|
}
|
|
1429
1489
|
|
|
1430
1490
|
template <int kBits>
|
|
@@ -1441,6 +1501,34 @@ HWY_API Vec512<uint64_t> RotateRight(const Vec512<uint64_t> v) {
|
|
|
1441
1501
|
return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
|
|
1442
1502
|
}
|
|
1443
1503
|
|
|
1504
|
+
// ------------------------------ Rol/Ror
|
|
1505
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
1506
|
+
template <class T, HWY_IF_UI16(T)>
|
|
1507
|
+
HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
|
|
1508
|
+
return Vec512<T>{_mm512_shrdv_epi16(a.raw, a.raw, b.raw)};
|
|
1509
|
+
}
|
|
1510
|
+
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
1511
|
+
|
|
1512
|
+
template <class T, HWY_IF_UI32(T)>
|
|
1513
|
+
HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
|
|
1514
|
+
return Vec512<T>{_mm512_rolv_epi32(a.raw, b.raw)};
|
|
1515
|
+
}
|
|
1516
|
+
|
|
1517
|
+
template <class T, HWY_IF_UI32(T)>
|
|
1518
|
+
HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
|
|
1519
|
+
return Vec512<T>{_mm512_rorv_epi32(a.raw, b.raw)};
|
|
1520
|
+
}
|
|
1521
|
+
|
|
1522
|
+
template <class T, HWY_IF_UI64(T)>
|
|
1523
|
+
HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
|
|
1524
|
+
return Vec512<T>{_mm512_rolv_epi64(a.raw, b.raw)};
|
|
1525
|
+
}
|
|
1526
|
+
|
|
1527
|
+
template <class T, HWY_IF_UI64(T)>
|
|
1528
|
+
HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
|
|
1529
|
+
return Vec512<T>{_mm512_rorv_epi64(a.raw, b.raw)};
|
|
1530
|
+
}
|
|
1531
|
+
|
|
1444
1532
|
// ------------------------------ ShiftLeftSame
|
|
1445
1533
|
|
|
1446
1534
|
// GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512
|
|
@@ -2874,6 +2962,28 @@ HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
|
|
|
2874
2962
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
|
|
2875
2963
|
}
|
|
2876
2964
|
|
|
2965
|
+
template <class D, HWY_IF_LANES_D(D, 64)>
|
|
2966
|
+
HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
|
|
2967
|
+
using RawM = decltype(MFromD<D>().raw);
|
|
2968
|
+
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
2969
|
+
return MFromD<D>{
|
|
2970
|
+
static_cast<RawM>(_kshiftli_mask64(static_cast<__mmask64>(m.raw), 1))};
|
|
2971
|
+
#else
|
|
2972
|
+
return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) << 1)};
|
|
2973
|
+
#endif
|
|
2974
|
+
}
|
|
2975
|
+
|
|
2976
|
+
template <class D, HWY_IF_LANES_D(D, 64)>
|
|
2977
|
+
HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
|
|
2978
|
+
using RawM = decltype(MFromD<D>().raw);
|
|
2979
|
+
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
2980
|
+
return MFromD<D>{
|
|
2981
|
+
static_cast<RawM>(_kshiftri_mask64(static_cast<__mmask64>(m.raw), 1))};
|
|
2982
|
+
#else
|
|
2983
|
+
return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) >> 1)};
|
|
2984
|
+
#endif
|
|
2985
|
+
}
|
|
2986
|
+
|
|
2877
2987
|
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
|
|
2878
2988
|
|
|
2879
2989
|
HWY_API Vec512<int8_t> BroadcastSignBit(Vec512<int8_t> v) {
|
|
@@ -2907,6 +3017,15 @@ HWY_API Mask512<float16_t> IsNaN(Vec512<float16_t> v) {
|
|
|
2907
3017
|
v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
|
|
2908
3018
|
}
|
|
2909
3019
|
|
|
3020
|
+
HWY_API Mask512<float16_t> IsEitherNaN(Vec512<float16_t> a,
|
|
3021
|
+
Vec512<float16_t> b) {
|
|
3022
|
+
// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
3023
|
+
HWY_DIAGNOSTICS(push)
|
|
3024
|
+
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
3025
|
+
return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3026
|
+
HWY_DIAGNOSTICS(pop)
|
|
3027
|
+
}
|
|
3028
|
+
|
|
2910
3029
|
HWY_API Mask512<float16_t> IsInf(Vec512<float16_t> v) {
|
|
2911
3030
|
return Mask512<float16_t>{_mm512_fpclass_ph_mask(v.raw, 0x18)};
|
|
2912
3031
|
}
|
|
@@ -2930,6 +3049,14 @@ HWY_API Mask512<double> IsNaN(Vec512<double> v) {
|
|
|
2930
3049
|
v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
|
|
2931
3050
|
}
|
|
2932
3051
|
|
|
3052
|
+
HWY_API Mask512<float> IsEitherNaN(Vec512<float> a, Vec512<float> b) {
|
|
3053
|
+
return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3054
|
+
}
|
|
3055
|
+
|
|
3056
|
+
HWY_API Mask512<double> IsEitherNaN(Vec512<double> a, Vec512<double> b) {
|
|
3057
|
+
return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3058
|
+
}
|
|
3059
|
+
|
|
2933
3060
|
HWY_API Mask512<float> IsInf(Vec512<float> v) {
|
|
2934
3061
|
return Mask512<float>{_mm512_fpclass_ps_mask(
|
|
2935
3062
|
v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
|
|
@@ -3357,30 +3484,21 @@ HWY_INLINE Vec512<double> NativeMaskedGatherOr512(
|
|
|
3357
3484
|
} // namespace detail
|
|
3358
3485
|
|
|
3359
3486
|
template <class D, HWY_IF_V_SIZE_D(D, 64)>
|
|
3360
|
-
HWY_API VFromD<D> GatherOffset(D d
|
|
3487
|
+
HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
|
|
3361
3488
|
VFromD<RebindToSigned<D>> offsets) {
|
|
3362
|
-
const RebindToSigned<decltype(d)> di;
|
|
3363
|
-
(void)di; // for HWY_DASSERT
|
|
3364
|
-
HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
|
|
3365
3489
|
return detail::NativeGather512<1>(base, offsets);
|
|
3366
3490
|
}
|
|
3367
3491
|
|
|
3368
3492
|
template <class D, HWY_IF_V_SIZE_D(D, 64)>
|
|
3369
|
-
HWY_API VFromD<D> GatherIndex(D d
|
|
3493
|
+
HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
|
|
3370
3494
|
VFromD<RebindToSigned<D>> indices) {
|
|
3371
|
-
const RebindToSigned<decltype(d)> di;
|
|
3372
|
-
(void)di; // for HWY_DASSERT
|
|
3373
|
-
HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
|
|
3374
3495
|
return detail::NativeGather512<sizeof(TFromD<D>)>(base, indices);
|
|
3375
3496
|
}
|
|
3376
3497
|
|
|
3377
3498
|
template <class D, HWY_IF_V_SIZE_D(D, 64)>
|
|
3378
|
-
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d
|
|
3499
|
+
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
|
|
3379
3500
|
const TFromD<D>* HWY_RESTRICT base,
|
|
3380
3501
|
VFromD<RebindToSigned<D>> indices) {
|
|
3381
|
-
const RebindToSigned<decltype(d)> di;
|
|
3382
|
-
(void)di; // for HWY_DASSERT
|
|
3383
|
-
HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
|
|
3384
3502
|
return detail::NativeMaskedGatherOr512<sizeof(TFromD<D>)>(no, m, base,
|
|
3385
3503
|
indices);
|
|
3386
3504
|
}
|
|
@@ -4625,6 +4743,35 @@ HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {
|
|
|
4625
4743
|
return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
|
|
4626
4744
|
}
|
|
4627
4745
|
|
|
4746
|
+
// -------------------------- InterleaveEven
|
|
4747
|
+
|
|
4748
|
+
template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
|
|
4749
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
4750
|
+
return VFromD<D>{_mm512_mask_shuffle_epi32(
|
|
4751
|
+
a.raw, static_cast<__mmask16>(0xAAAA), b.raw,
|
|
4752
|
+
static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
|
|
4753
|
+
}
|
|
4754
|
+
template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
|
|
4755
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
4756
|
+
return VFromD<D>{_mm512_mask_shuffle_ps(a.raw, static_cast<__mmask16>(0xAAAA),
|
|
4757
|
+
b.raw, b.raw,
|
|
4758
|
+
_MM_SHUFFLE(2, 2, 0, 0))};
|
|
4759
|
+
}
|
|
4760
|
+
// -------------------------- InterleaveOdd
|
|
4761
|
+
|
|
4762
|
+
template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
|
|
4763
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
4764
|
+
return VFromD<D>{_mm512_mask_shuffle_epi32(
|
|
4765
|
+
b.raw, static_cast<__mmask16>(0x5555), a.raw,
|
|
4766
|
+
static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
|
|
4767
|
+
}
|
|
4768
|
+
template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
|
|
4769
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
4770
|
+
return VFromD<D>{_mm512_mask_shuffle_ps(b.raw, static_cast<__mmask16>(0x5555),
|
|
4771
|
+
a.raw, a.raw,
|
|
4772
|
+
_MM_SHUFFLE(3, 3, 1, 1))};
|
|
4773
|
+
}
|
|
4774
|
+
|
|
4628
4775
|
// ------------------------------ OddEvenBlocks
|
|
4629
4776
|
|
|
4630
4777
|
template <typename T>
|
|
@@ -5381,18 +5528,76 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint32_t> v) {
|
|
|
5381
5528
|
}
|
|
5382
5529
|
|
|
5383
5530
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
|
|
5384
|
-
HWY_API VFromD<D>
|
|
5385
|
-
|
|
5386
|
-
|
|
5387
|
-
|
|
5531
|
+
HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
|
|
5532
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
5533
|
+
// Workaround for undefined behavior with GCC if any values of v[i] are not
|
|
5534
|
+
// within the range of an int64_t
|
|
5535
|
+
|
|
5536
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
5537
|
+
if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
|
|
5538
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
|
|
5539
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
5540
|
+
return VFromD<D>{_mm512_setr_epi64(
|
|
5541
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
|
|
5542
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
|
|
5543
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
|
|
5544
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]),
|
|
5545
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[4]),
|
|
5546
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[5]),
|
|
5547
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[6]),
|
|
5548
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[7]))};
|
|
5549
|
+
}
|
|
5550
|
+
#endif
|
|
5388
5551
|
|
|
5389
|
-
|
|
5390
|
-
|
|
5391
|
-
|
|
5552
|
+
__m512i raw_result;
|
|
5553
|
+
__asm__("vcvttps2qq {%1, %0|%0, %1}"
|
|
5554
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
5555
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
5556
|
+
:);
|
|
5557
|
+
return VFromD<D>{raw_result};
|
|
5558
|
+
#else
|
|
5559
|
+
return VFromD<D>{_mm512_cvttps_epi64(v.raw)};
|
|
5560
|
+
#endif
|
|
5392
5561
|
}
|
|
5393
5562
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
|
|
5394
|
-
HWY_API VFromD<D>
|
|
5395
|
-
|
|
5563
|
+
HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
5564
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
5565
|
+
// Workaround for undefined behavior with GCC if any values of v[i] are not
|
|
5566
|
+
// within the range of an uint64_t
|
|
5567
|
+
|
|
5568
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
5569
|
+
if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
|
|
5570
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
|
|
5571
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
5572
|
+
return VFromD<D>{_mm512_setr_epi64(
|
|
5573
|
+
static_cast<int64_t>(
|
|
5574
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
|
|
5575
|
+
static_cast<int64_t>(
|
|
5576
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
|
|
5577
|
+
static_cast<int64_t>(
|
|
5578
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
|
|
5579
|
+
static_cast<int64_t>(
|
|
5580
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])),
|
|
5581
|
+
static_cast<int64_t>(
|
|
5582
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[4])),
|
|
5583
|
+
static_cast<int64_t>(
|
|
5584
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[5])),
|
|
5585
|
+
static_cast<int64_t>(
|
|
5586
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[6])),
|
|
5587
|
+
static_cast<int64_t>(
|
|
5588
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[7])))};
|
|
5589
|
+
}
|
|
5590
|
+
#endif
|
|
5591
|
+
|
|
5592
|
+
__m512i raw_result;
|
|
5593
|
+
__asm__("vcvttps2uqq {%1, %0|%0, %1}"
|
|
5594
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
5595
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
5596
|
+
:);
|
|
5597
|
+
return VFromD<D>{raw_result};
|
|
5598
|
+
#else
|
|
5599
|
+
return VFromD<D>{_mm512_cvttps_epu64(v.raw)};
|
|
5600
|
+
#endif
|
|
5396
5601
|
}
|
|
5397
5602
|
|
|
5398
5603
|
// ------------------------------ Demotions (full -> part w/ narrow lanes)
|
|
@@ -5546,24 +5751,38 @@ HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec512<double> v) {
|
|
|
5546
5751
|
}
|
|
5547
5752
|
#endif // HWY_HAVE_FLOAT16
|
|
5548
5753
|
|
|
5754
|
+
#if HWY_AVX3_HAVE_F32_TO_BF16C
|
|
5549
5755
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
|
|
5550
|
-
HWY_API VFromD<D> DemoteTo(D dbf16
|
|
5551
|
-
|
|
5552
|
-
|
|
5553
|
-
|
|
5554
|
-
|
|
5555
|
-
|
|
5556
|
-
|
|
5756
|
+
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec512<float> v) {
|
|
5757
|
+
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
|
|
5758
|
+
// Inline assembly workaround for LLVM codegen bug
|
|
5759
|
+
__m256i raw_result;
|
|
5760
|
+
__asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
|
|
5761
|
+
return VFromD<D>{raw_result};
|
|
5762
|
+
#else
|
|
5763
|
+
// The _mm512_cvtneps_pbh intrinsic returns a __m256bh vector that needs to be
|
|
5764
|
+
// bit casted to a __m256i vector
|
|
5765
|
+
return VFromD<D>{detail::BitCastToInteger(_mm512_cvtneps_pbh(v.raw))};
|
|
5766
|
+
#endif
|
|
5557
5767
|
}
|
|
5558
5768
|
|
|
5559
5769
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
|
|
5560
|
-
HWY_API VFromD<D> ReorderDemote2To(D dbf16
|
|
5561
|
-
|
|
5562
|
-
|
|
5563
|
-
|
|
5564
|
-
|
|
5565
|
-
|
|
5770
|
+
HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec512<float> a,
|
|
5771
|
+
Vec512<float> b) {
|
|
5772
|
+
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
|
|
5773
|
+
// Inline assembly workaround for LLVM codegen bug
|
|
5774
|
+
__m512i raw_result;
|
|
5775
|
+
__asm__("vcvtne2ps2bf16 %2, %1, %0"
|
|
5776
|
+
: "=v"(raw_result)
|
|
5777
|
+
: "v"(b.raw), "v"(a.raw));
|
|
5778
|
+
return VFromD<D>{raw_result};
|
|
5779
|
+
#else
|
|
5780
|
+
// The _mm512_cvtne2ps_pbh intrinsic returns a __m512bh vector that needs to
|
|
5781
|
+
// be bit casted to a __m512i vector
|
|
5782
|
+
return VFromD<D>{detail::BitCastToInteger(_mm512_cvtne2ps_pbh(b.raw, a.raw))};
|
|
5783
|
+
#endif
|
|
5566
5784
|
}
|
|
5785
|
+
#endif // HWY_AVX3_HAVE_F32_TO_BF16C
|
|
5567
5786
|
|
|
5568
5787
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
|
|
5569
5788
|
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec512<int32_t> a,
|
|
@@ -5651,15 +5870,77 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
|
|
|
5651
5870
|
}
|
|
5652
5871
|
|
|
5653
5872
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
|
|
5654
|
-
HWY_API VFromD<D>
|
|
5655
|
-
|
|
5656
|
-
|
|
5657
|
-
|
|
5873
|
+
HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
|
|
5874
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
5875
|
+
// Workaround for undefined behavior in _mm512_cvttpd_epi32 with GCC if any
|
|
5876
|
+
// values of v[i] are not within the range of an int32_t
|
|
5877
|
+
|
|
5878
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
5879
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
5880
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
|
|
5881
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
5882
|
+
return VFromD<D>{_mm256_setr_epi32(
|
|
5883
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
|
|
5884
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
|
|
5885
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
|
|
5886
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
|
|
5887
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
|
|
5888
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
|
|
5889
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
|
|
5890
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
|
|
5891
|
+
}
|
|
5892
|
+
#endif
|
|
5893
|
+
|
|
5894
|
+
__m256i raw_result;
|
|
5895
|
+
__asm__("vcvttpd2dq {%1, %0|%0, %1}"
|
|
5896
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
5897
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
5898
|
+
:);
|
|
5899
|
+
return VFromD<D>{raw_result};
|
|
5900
|
+
#else
|
|
5901
|
+
return VFromD<D>{_mm512_cvttpd_epi32(v.raw)};
|
|
5902
|
+
#endif
|
|
5658
5903
|
}
|
|
5659
5904
|
|
|
5660
5905
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
|
|
5661
|
-
HWY_API VFromD<D>
|
|
5662
|
-
|
|
5906
|
+
HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
|
|
5907
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
5908
|
+
// Workaround for undefined behavior in _mm512_cvttpd_epu32 with GCC if any
|
|
5909
|
+
// values of v[i] are not within the range of an uint32_t
|
|
5910
|
+
|
|
5911
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
5912
|
+
if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
|
|
5913
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
|
|
5914
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
5915
|
+
return VFromD<D>{_mm256_setr_epi32(
|
|
5916
|
+
static_cast<int32_t>(
|
|
5917
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
|
|
5918
|
+
static_cast<int32_t>(
|
|
5919
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
|
|
5920
|
+
static_cast<int32_t>(
|
|
5921
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
|
|
5922
|
+
static_cast<int32_t>(
|
|
5923
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
|
|
5924
|
+
static_cast<int32_t>(
|
|
5925
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
|
|
5926
|
+
static_cast<int32_t>(
|
|
5927
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
|
|
5928
|
+
static_cast<int32_t>(
|
|
5929
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
|
|
5930
|
+
static_cast<int32_t>(
|
|
5931
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])))};
|
|
5932
|
+
}
|
|
5933
|
+
#endif
|
|
5934
|
+
|
|
5935
|
+
__m256i raw_result;
|
|
5936
|
+
__asm__("vcvttpd2udq {%1, %0|%0, %1}"
|
|
5937
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
5938
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
5939
|
+
:);
|
|
5940
|
+
return VFromD<D>{raw_result};
|
|
5941
|
+
#else
|
|
5942
|
+
return VFromD<D>{_mm512_cvttpd_epu32(v.raw)};
|
|
5943
|
+
#endif
|
|
5663
5944
|
}
|
|
5664
5945
|
|
|
5665
5946
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
|
|
@@ -5819,38 +6100,362 @@ HWY_API VFromD<D> ConvertTo(D /* tag*/, Vec512<uint64_t> v) {
|
|
|
5819
6100
|
// Truncates (rounds toward zero).
|
|
5820
6101
|
#if HWY_HAVE_FLOAT16
|
|
5821
6102
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
|
|
5822
|
-
HWY_API VFromD<D>
|
|
5823
|
-
|
|
5824
|
-
|
|
6103
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float16_t> v) {
|
|
6104
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6105
|
+
// Workaround for undefined behavior in _mm512_cvttph_epi16 with GCC if any
|
|
6106
|
+
// values of v[i] are not within the range of an int16_t
|
|
6107
|
+
|
|
6108
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
|
|
6109
|
+
HWY_HAVE_SCALAR_F16_TYPE
|
|
6110
|
+
if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
|
|
6111
|
+
typedef hwy::float16_t::Native GccF16RawVectType
|
|
6112
|
+
__attribute__((__vector_size__(64)));
|
|
6113
|
+
const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
|
|
6114
|
+
return VFromD<D>{
|
|
6115
|
+
_mm512_set_epi16(detail::X86ConvertScalarFromFloat<int16_t>(raw_v[31]),
|
|
6116
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[30]),
|
|
6117
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[29]),
|
|
6118
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[28]),
|
|
6119
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[27]),
|
|
6120
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[26]),
|
|
6121
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[25]),
|
|
6122
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[24]),
|
|
6123
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[23]),
|
|
6124
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[22]),
|
|
6125
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[21]),
|
|
6126
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[20]),
|
|
6127
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[19]),
|
|
6128
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[18]),
|
|
6129
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[17]),
|
|
6130
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[16]),
|
|
6131
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[15]),
|
|
6132
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[14]),
|
|
6133
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[13]),
|
|
6134
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[12]),
|
|
6135
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[11]),
|
|
6136
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[10]),
|
|
6137
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[9]),
|
|
6138
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[8]),
|
|
6139
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]),
|
|
6140
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
|
|
6141
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
|
|
6142
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
|
|
6143
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
|
|
6144
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
|
|
6145
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
|
|
6146
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]))};
|
|
6147
|
+
}
|
|
6148
|
+
#endif
|
|
6149
|
+
|
|
6150
|
+
__m512i raw_result;
|
|
6151
|
+
__asm__("vcvttph2w {%1, %0|%0, %1}"
|
|
6152
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6153
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6154
|
+
:);
|
|
6155
|
+
return VFromD<D>{raw_result};
|
|
6156
|
+
#else
|
|
6157
|
+
return VFromD<D>{_mm512_cvttph_epi16(v.raw)};
|
|
6158
|
+
#endif
|
|
5825
6159
|
}
|
|
5826
6160
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
|
|
5827
|
-
HWY_API VFromD<D>
|
|
5828
|
-
|
|
6161
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
|
|
6162
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6163
|
+
// Workaround for undefined behavior in _mm512_cvttph_epu16 with GCC if any
|
|
6164
|
+
// values of v[i] are not within the range of an uint16_t
|
|
6165
|
+
|
|
6166
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
|
|
6167
|
+
HWY_HAVE_SCALAR_F16_TYPE
|
|
6168
|
+
if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
|
|
6169
|
+
typedef hwy::float16_t::Native GccF16RawVectType
|
|
6170
|
+
__attribute__((__vector_size__(64)));
|
|
6171
|
+
const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
|
|
6172
|
+
return VFromD<D>{_mm512_set_epi16(
|
|
6173
|
+
static_cast<int16_t>(
|
|
6174
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[31])),
|
|
6175
|
+
static_cast<int16_t>(
|
|
6176
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[30])),
|
|
6177
|
+
static_cast<int16_t>(
|
|
6178
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[29])),
|
|
6179
|
+
static_cast<int16_t>(
|
|
6180
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[28])),
|
|
6181
|
+
static_cast<int16_t>(
|
|
6182
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[27])),
|
|
6183
|
+
static_cast<int16_t>(
|
|
6184
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[26])),
|
|
6185
|
+
static_cast<int16_t>(
|
|
6186
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[25])),
|
|
6187
|
+
static_cast<int16_t>(
|
|
6188
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[24])),
|
|
6189
|
+
static_cast<int16_t>(
|
|
6190
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[23])),
|
|
6191
|
+
static_cast<int16_t>(
|
|
6192
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[22])),
|
|
6193
|
+
static_cast<int16_t>(
|
|
6194
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[21])),
|
|
6195
|
+
static_cast<int16_t>(
|
|
6196
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[20])),
|
|
6197
|
+
static_cast<int16_t>(
|
|
6198
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[19])),
|
|
6199
|
+
static_cast<int16_t>(
|
|
6200
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[18])),
|
|
6201
|
+
static_cast<int16_t>(
|
|
6202
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[17])),
|
|
6203
|
+
static_cast<int16_t>(
|
|
6204
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[16])),
|
|
6205
|
+
static_cast<int16_t>(
|
|
6206
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[15])),
|
|
6207
|
+
static_cast<int16_t>(
|
|
6208
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[14])),
|
|
6209
|
+
static_cast<int16_t>(
|
|
6210
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[13])),
|
|
6211
|
+
static_cast<int16_t>(
|
|
6212
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[12])),
|
|
6213
|
+
static_cast<int16_t>(
|
|
6214
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[11])),
|
|
6215
|
+
static_cast<int16_t>(
|
|
6216
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[10])),
|
|
6217
|
+
static_cast<int16_t>(
|
|
6218
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[9])),
|
|
6219
|
+
static_cast<int16_t>(
|
|
6220
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[8])),
|
|
6221
|
+
static_cast<int16_t>(
|
|
6222
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])),
|
|
6223
|
+
static_cast<int16_t>(
|
|
6224
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6])),
|
|
6225
|
+
static_cast<int16_t>(
|
|
6226
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5])),
|
|
6227
|
+
static_cast<int16_t>(
|
|
6228
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4])),
|
|
6229
|
+
static_cast<int16_t>(
|
|
6230
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3])),
|
|
6231
|
+
static_cast<int16_t>(
|
|
6232
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2])),
|
|
6233
|
+
static_cast<int16_t>(
|
|
6234
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1])),
|
|
6235
|
+
static_cast<int16_t>(
|
|
6236
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0])))};
|
|
6237
|
+
}
|
|
6238
|
+
#endif
|
|
6239
|
+
|
|
6240
|
+
__m512i raw_result;
|
|
6241
|
+
__asm__("vcvttph2uw {%1, %0|%0, %1}"
|
|
6242
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6243
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6244
|
+
:);
|
|
6245
|
+
return VFromD<D>{raw_result};
|
|
6246
|
+
#else
|
|
6247
|
+
return VFromD<D>{_mm512_cvttph_epu16(v.raw)};
|
|
6248
|
+
#endif
|
|
5829
6249
|
}
|
|
5830
6250
|
#endif // HWY_HAVE_FLOAT16
|
|
5831
6251
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
|
|
5832
|
-
HWY_API VFromD<D>
|
|
5833
|
-
|
|
5834
|
-
|
|
6252
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float> v) {
|
|
6253
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6254
|
+
// Workaround for undefined behavior in _mm512_cvttps_epi32 with GCC if any
|
|
6255
|
+
// values of v[i] are not within the range of an int32_t
|
|
6256
|
+
|
|
6257
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6258
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
6259
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
|
|
6260
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
6261
|
+
return VFromD<D>{_mm512_setr_epi32(
|
|
6262
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
|
|
6263
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
|
|
6264
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
|
|
6265
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
|
|
6266
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
|
|
6267
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
|
|
6268
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
|
|
6269
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]),
|
|
6270
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[8]),
|
|
6271
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[9]),
|
|
6272
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[10]),
|
|
6273
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[11]),
|
|
6274
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[12]),
|
|
6275
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[13]),
|
|
6276
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[14]),
|
|
6277
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[15]))};
|
|
6278
|
+
}
|
|
6279
|
+
#endif
|
|
6280
|
+
|
|
6281
|
+
__m512i raw_result;
|
|
6282
|
+
__asm__("vcvttps2dq {%1, %0|%0, %1}"
|
|
6283
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6284
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6285
|
+
:);
|
|
6286
|
+
return VFromD<D>{raw_result};
|
|
6287
|
+
#else
|
|
6288
|
+
return VFromD<D>{_mm512_cvttps_epi32(v.raw)};
|
|
6289
|
+
#endif
|
|
5835
6290
|
}
|
|
5836
6291
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
|
|
5837
|
-
HWY_API VFromD<D>
|
|
5838
|
-
|
|
5839
|
-
|
|
6292
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec512<double> v) {
|
|
6293
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6294
|
+
// Workaround for undefined behavior in _mm512_cvttpd_epi64 with GCC if any
|
|
6295
|
+
// values of v[i] are not within the range of an int64_t
|
|
6296
|
+
|
|
6297
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6298
|
+
if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
|
|
6299
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
|
|
6300
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
6301
|
+
return VFromD<D>{_mm512_setr_epi64(
|
|
6302
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
|
|
6303
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
|
|
6304
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
|
|
6305
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]),
|
|
6306
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[4]),
|
|
6307
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[5]),
|
|
6308
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[6]),
|
|
6309
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[7]))};
|
|
6310
|
+
}
|
|
6311
|
+
#endif
|
|
6312
|
+
|
|
6313
|
+
__m512i raw_result;
|
|
6314
|
+
__asm__("vcvttpd2qq {%1, %0|%0, %1}"
|
|
6315
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6316
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6317
|
+
:);
|
|
6318
|
+
return VFromD<D>{raw_result};
|
|
6319
|
+
#else
|
|
6320
|
+
return VFromD<D>{_mm512_cvttpd_epi64(v.raw)};
|
|
6321
|
+
#endif
|
|
5840
6322
|
}
|
|
5841
6323
|
template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
|
|
5842
|
-
HWY_API VFromD<DU>
|
|
5843
|
-
|
|
6324
|
+
HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
|
|
6325
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6326
|
+
// Workaround for undefined behavior in _mm512_cvttps_epu32 with GCC if any
|
|
6327
|
+
// values of v[i] are not within the range of an uint32_t
|
|
6328
|
+
|
|
6329
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6330
|
+
if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
|
|
6331
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
|
|
6332
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
6333
|
+
return VFromD<DU>{_mm512_setr_epi32(
|
|
6334
|
+
static_cast<int32_t>(
|
|
6335
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
|
|
6336
|
+
static_cast<int32_t>(
|
|
6337
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
|
|
6338
|
+
static_cast<int32_t>(
|
|
6339
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
|
|
6340
|
+
static_cast<int32_t>(
|
|
6341
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
|
|
6342
|
+
static_cast<int32_t>(
|
|
6343
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
|
|
6344
|
+
static_cast<int32_t>(
|
|
6345
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
|
|
6346
|
+
static_cast<int32_t>(
|
|
6347
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
|
|
6348
|
+
static_cast<int32_t>(
|
|
6349
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])),
|
|
6350
|
+
static_cast<int32_t>(
|
|
6351
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[8])),
|
|
6352
|
+
static_cast<int32_t>(
|
|
6353
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[9])),
|
|
6354
|
+
static_cast<int32_t>(
|
|
6355
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[10])),
|
|
6356
|
+
static_cast<int32_t>(
|
|
6357
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[11])),
|
|
6358
|
+
static_cast<int32_t>(
|
|
6359
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[12])),
|
|
6360
|
+
static_cast<int32_t>(
|
|
6361
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[13])),
|
|
6362
|
+
static_cast<int32_t>(
|
|
6363
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[14])),
|
|
6364
|
+
static_cast<int32_t>(
|
|
6365
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[15])))};
|
|
6366
|
+
}
|
|
6367
|
+
#endif
|
|
6368
|
+
|
|
6369
|
+
__m512i raw_result;
|
|
6370
|
+
__asm__("vcvttps2udq {%1, %0|%0, %1}"
|
|
6371
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6372
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6373
|
+
:);
|
|
6374
|
+
return VFromD<DU>{raw_result};
|
|
6375
|
+
#else
|
|
6376
|
+
return VFromD<DU>{_mm512_cvttps_epu32(v.raw)};
|
|
6377
|
+
#endif
|
|
5844
6378
|
}
|
|
5845
6379
|
template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
|
|
5846
|
-
HWY_API VFromD<DU>
|
|
5847
|
-
|
|
6380
|
+
HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
|
|
6381
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6382
|
+
// Workaround for undefined behavior in _mm512_cvttpd_epu64 with GCC if any
|
|
6383
|
+
// values of v[i] are not within the range of an uint64_t
|
|
6384
|
+
|
|
6385
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6386
|
+
if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
|
|
6387
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
|
|
6388
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
6389
|
+
return VFromD<DU>{_mm512_setr_epi64(
|
|
6390
|
+
static_cast<int64_t>(
|
|
6391
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
|
|
6392
|
+
static_cast<int64_t>(
|
|
6393
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
|
|
6394
|
+
static_cast<int64_t>(
|
|
6395
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
|
|
6396
|
+
static_cast<int64_t>(
|
|
6397
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])),
|
|
6398
|
+
static_cast<int64_t>(
|
|
6399
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[4])),
|
|
6400
|
+
static_cast<int64_t>(
|
|
6401
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[5])),
|
|
6402
|
+
static_cast<int64_t>(
|
|
6403
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[6])),
|
|
6404
|
+
static_cast<int64_t>(
|
|
6405
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[7])))};
|
|
6406
|
+
}
|
|
6407
|
+
#endif
|
|
6408
|
+
|
|
6409
|
+
__m512i raw_result;
|
|
6410
|
+
__asm__("vcvttpd2uqq {%1, %0|%0, %1}"
|
|
6411
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6412
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6413
|
+
:);
|
|
6414
|
+
return VFromD<DU>{raw_result};
|
|
6415
|
+
#else
|
|
6416
|
+
return VFromD<DU>{_mm512_cvttpd_epu64(v.raw)};
|
|
6417
|
+
#endif
|
|
5848
6418
|
}
|
|
5849
6419
|
|
|
5850
|
-
|
|
5851
|
-
|
|
5852
|
-
|
|
5853
|
-
|
|
6420
|
+
template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I32_D(DI)>
|
|
6421
|
+
HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
|
|
6422
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6423
|
+
// Workaround for undefined behavior in _mm512_cvtps_epi32 with GCC if any
|
|
6424
|
+
// values of v[i] are not within the range of an int32_t
|
|
6425
|
+
|
|
6426
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6427
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
6428
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
|
|
6429
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
6430
|
+
return VFromD<DI>{
|
|
6431
|
+
_mm512_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
|
|
6432
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
|
|
6433
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
|
|
6434
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
|
|
6435
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
|
|
6436
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
|
|
6437
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
|
|
6438
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[7]),
|
|
6439
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[8]),
|
|
6440
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[9]),
|
|
6441
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[10]),
|
|
6442
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[11]),
|
|
6443
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[12]),
|
|
6444
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[13]),
|
|
6445
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[14]),
|
|
6446
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[15]))};
|
|
6447
|
+
}
|
|
6448
|
+
#endif
|
|
6449
|
+
|
|
6450
|
+
__m512i raw_result;
|
|
6451
|
+
__asm__("vcvtps2dq {%1, %0|%0, %1}"
|
|
6452
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6453
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6454
|
+
:);
|
|
6455
|
+
return VFromD<DI>{raw_result};
|
|
6456
|
+
#else
|
|
6457
|
+
return VFromD<DI>{_mm512_cvtps_epi32(v.raw)};
|
|
6458
|
+
#endif
|
|
5854
6459
|
}
|
|
5855
6460
|
|
|
5856
6461
|
// ================================================== CRYPTO
|
|
@@ -5988,6 +6593,7 @@ static Vec512<uint16_t> SumsOfAdjQuadAbsDiff(Vec512<uint8_t> a,
|
|
|
5988
6593
|
a, BitCast(d, Broadcast<kBOffset>(BitCast(du32, b))));
|
|
5989
6594
|
}
|
|
5990
6595
|
|
|
6596
|
+
#if !HWY_IS_MSAN
|
|
5991
6597
|
// ------------------------------ I32/I64 SaturatedAdd (MaskFromVec)
|
|
5992
6598
|
|
|
5993
6599
|
HWY_API Vec512<int32_t> SaturatedAdd(Vec512<int32_t> a, Vec512<int32_t> b) {
|
|
@@ -6035,6 +6641,7 @@ HWY_API Vec512<int64_t> SaturatedSub(Vec512<int64_t> a, Vec512<int64_t> b) {
|
|
|
6035
6641
|
i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
|
|
6036
6642
|
return IfThenElse(overflow_mask, overflow_result, diff);
|
|
6037
6643
|
}
|
|
6644
|
+
#endif // !HWY_IS_MSAN
|
|
6038
6645
|
|
|
6039
6646
|
// ------------------------------ Mask testing
|
|
6040
6647
|
|
|
@@ -7197,64 +7804,6 @@ HWY_API Vec512<int64_t> operator>>(const Vec512<int64_t> v,
|
|
|
7197
7804
|
return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
|
|
7198
7805
|
}
|
|
7199
7806
|
|
|
7200
|
-
// ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
|
|
7201
|
-
|
|
7202
|
-
HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
|
|
7203
|
-
const Vec512<uint64_t> b) {
|
|
7204
|
-
const DFromV<decltype(a)> du64;
|
|
7205
|
-
const RepartitionToNarrow<decltype(du64)> du32;
|
|
7206
|
-
const auto maskL = Set(du64, 0xFFFFFFFFULL);
|
|
7207
|
-
const auto a32 = BitCast(du32, a);
|
|
7208
|
-
const auto b32 = BitCast(du32, b);
|
|
7209
|
-
// Inputs for MulEven: we only need the lower 32 bits
|
|
7210
|
-
const auto aH = Shuffle2301(a32);
|
|
7211
|
-
const auto bH = Shuffle2301(b32);
|
|
7212
|
-
|
|
7213
|
-
// Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
|
|
7214
|
-
// the even (lower 64 bits of every 128-bit block) results. See
|
|
7215
|
-
// https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
|
|
7216
|
-
const auto aLbL = MulEven(a32, b32);
|
|
7217
|
-
const auto w3 = aLbL & maskL;
|
|
7218
|
-
|
|
7219
|
-
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
|
|
7220
|
-
const auto w2 = t2 & maskL;
|
|
7221
|
-
const auto w1 = ShiftRight<32>(t2);
|
|
7222
|
-
|
|
7223
|
-
const auto t = MulEven(a32, bH) + w2;
|
|
7224
|
-
const auto k = ShiftRight<32>(t);
|
|
7225
|
-
|
|
7226
|
-
const auto mulH = MulEven(aH, bH) + w1 + k;
|
|
7227
|
-
const auto mulL = ShiftLeft<32>(t) + w3;
|
|
7228
|
-
return InterleaveLower(mulL, mulH);
|
|
7229
|
-
}
|
|
7230
|
-
|
|
7231
|
-
HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
|
|
7232
|
-
const Vec512<uint64_t> b) {
|
|
7233
|
-
const DFromV<decltype(a)> du64;
|
|
7234
|
-
const RepartitionToNarrow<decltype(du64)> du32;
|
|
7235
|
-
const auto maskL = Set(du64, 0xFFFFFFFFULL);
|
|
7236
|
-
const auto a32 = BitCast(du32, a);
|
|
7237
|
-
const auto b32 = BitCast(du32, b);
|
|
7238
|
-
// Inputs for MulEven: we only need bits [95:64] (= upper half of input)
|
|
7239
|
-
const auto aH = Shuffle2301(a32);
|
|
7240
|
-
const auto bH = Shuffle2301(b32);
|
|
7241
|
-
|
|
7242
|
-
// Same as above, but we're using the odd results (upper 64 bits per block).
|
|
7243
|
-
const auto aLbL = MulEven(a32, b32);
|
|
7244
|
-
const auto w3 = aLbL & maskL;
|
|
7245
|
-
|
|
7246
|
-
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
|
|
7247
|
-
const auto w2 = t2 & maskL;
|
|
7248
|
-
const auto w1 = ShiftRight<32>(t2);
|
|
7249
|
-
|
|
7250
|
-
const auto t = MulEven(a32, bH) + w2;
|
|
7251
|
-
const auto k = ShiftRight<32>(t);
|
|
7252
|
-
|
|
7253
|
-
const auto mulH = MulEven(aH, bH) + w1 + k;
|
|
7254
|
-
const auto mulL = ShiftLeft<32>(t) + w3;
|
|
7255
|
-
return InterleaveUpper(du64, mulL, mulH);
|
|
7256
|
-
}
|
|
7257
|
-
|
|
7258
7807
|
// ------------------------------ WidenMulPairwiseAdd
|
|
7259
7808
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
|
|
7260
7809
|
HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
|
|
@@ -7263,7 +7812,6 @@ HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
|
|
|
7263
7812
|
}
|
|
7264
7813
|
|
|
7265
7814
|
// ------------------------------ SatWidenMulPairwiseAdd
|
|
7266
|
-
|
|
7267
7815
|
template <class DI16, HWY_IF_V_SIZE_D(DI16, 64), HWY_IF_I16_D(DI16)>
|
|
7268
7816
|
HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
|
|
7269
7817
|
DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
|
|
@@ -7271,7 +7819,30 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
|
|
|
7271
7819
|
return VFromD<DI16>{_mm512_maddubs_epi16(a.raw, b.raw)};
|
|
7272
7820
|
}
|
|
7273
7821
|
|
|
7822
|
+
// ------------------------------ SatWidenMulPairwiseAccumulate
|
|
7823
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
7824
|
+
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 64)>
|
|
7825
|
+
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
|
|
7826
|
+
DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
|
|
7827
|
+
VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
|
|
7828
|
+
return VFromD<DI32>{_mm512_dpwssds_epi32(sum.raw, a.raw, b.raw)};
|
|
7829
|
+
}
|
|
7830
|
+
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
7831
|
+
|
|
7274
7832
|
// ------------------------------ ReorderWidenMulAccumulate
|
|
7833
|
+
|
|
7834
|
+
#if HWY_NATIVE_DOT_BF16
|
|
7835
|
+
template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 64),
|
|
7836
|
+
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
|
|
7837
|
+
HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
|
|
7838
|
+
const VFromD<DF> sum0,
|
|
7839
|
+
VFromD<DF>& /*sum1*/) {
|
|
7840
|
+
return VFromD<DF>{_mm512_dpbf16_ps(sum0.raw,
|
|
7841
|
+
reinterpret_cast<__m512bh>(a.raw),
|
|
7842
|
+
reinterpret_cast<__m512bh>(b.raw))};
|
|
7843
|
+
}
|
|
7844
|
+
#endif // HWY_NATIVE_DOT_BF16
|
|
7845
|
+
|
|
7275
7846
|
template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
|
|
7276
7847
|
HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec512<int16_t> a,
|
|
7277
7848
|
Vec512<int16_t> b,
|
|
@@ -7321,6 +7892,37 @@ HWY_INLINE VFromD<D> ReduceAcrossBlocks(D d, Func f, VFromD<D> v) {
|
|
|
7321
7892
|
|
|
7322
7893
|
} // namespace detail
|
|
7323
7894
|
|
|
7895
|
+
// ------------------------------ BitShuffle
|
|
7896
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
7897
|
+
// 512-bit BitShuffle for 64-bit lanes `v` and 8-bit indices `idx`. Runs
// _mm512_bitshuffle_epi64_mask, moves the resulting 64-bit mask into a vector
// of uint8 lanes and promotes those lanes to the 64-bit lane type of V.
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
          HWY_IF_V_SIZE_V(V, 64), HWY_IF_V_SIZE_V(VI, 64)>
HWY_API V BitShuffle(V v, VI idx) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> d_unsigned;
  const Rebind<uint8_t, decltype(d)> d_bytes;

  const __mmask64 shuffled_bits = _mm512_bitshuffle_epi64_mask(v.raw, idx.raw);

#if HWY_ARCH_X86_64
  // 64-bit targets: the whole mask is moved into a vector register at once.
  const VFromD<decltype(d_bytes)> packed_bytes{
      _mm_cvtsi64_si128(static_cast<int64_t>(shuffled_bits))};
#else
  // Otherwise the mask is moved as two 32-bit halves, which are then
  // interleaved (low half first).
  const int32_t lo_half = static_cast<int32_t>(shuffled_bits);
  const int32_t hi_half =
      static_cast<int32_t>(_kshiftri_mask64(shuffled_bits, 32));
  const Vec128<uint32_t> lo_vec{_mm_cvtsi32_si128(lo_half)};
  const Vec128<uint32_t> hi_vec{_mm_cvtsi32_si128(hi_half)};

  const VFromD<decltype(d_bytes)> packed_bytes =
      ResizeBitCast(d_bytes, InterleaveLower(lo_vec, hi_vec));
#endif

  return BitCast(d, PromoteTo(d_unsigned, packed_bytes));
}
|
|
7924
|
+
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
7925
|
+
|
|
7324
7926
|
// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
|
|
7325
7927
|
|
|
7326
7928
|
template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 64)>
|