@img/sharp-libvips-dev 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
|
@@ -194,6 +194,25 @@ HWY_INLINE __m256i BitCastToInteger(__m256d v) {
|
|
|
194
194
|
return _mm256_castpd_si256(v);
|
|
195
195
|
}
|
|
196
196
|
|
|
197
|
+
#if HWY_AVX3_HAVE_F32_TO_BF16C
|
|
198
|
+
HWY_INLINE __m256i BitCastToInteger(__m256bh v) {
|
|
199
|
+
// Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
|
|
200
|
+
// bit cast a __m256bh to a __m256i as there is currently no intrinsic
|
|
201
|
+
// available (as of GCC 13 and Clang 17) that can bit cast a __m256bh vector
|
|
202
|
+
// to a __m256i vector
|
|
203
|
+
|
|
204
|
+
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
|
|
205
|
+
// On GCC or Clang, use reinterpret_cast to bit cast a __m256bh to a __m256i
|
|
206
|
+
return reinterpret_cast<__m256i>(v);
|
|
207
|
+
#else
|
|
208
|
+
// On MSVC, use BitCastScalar to bit cast a __m256bh to a __m256i as MSVC does
|
|
209
|
+
// not allow reinterpret_cast, static_cast, or a C-style cast to be used to
|
|
210
|
+
// bit cast from one AVX vector type to a different AVX vector type
|
|
211
|
+
return BitCastScalar<__m256i>(v);
|
|
212
|
+
#endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
|
|
213
|
+
}
|
|
214
|
+
#endif // HWY_AVX3_HAVE_F32_TO_BF16C
|
|
215
|
+
|
|
197
216
|
template <typename T>
|
|
198
217
|
HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
|
|
199
218
|
return Vec256<uint8_t>{BitCastToInteger(v.raw)};
|
|
@@ -516,7 +535,7 @@ template <typename T>
|
|
|
516
535
|
HWY_API Vec256<T> Not(const Vec256<T> v) {
|
|
517
536
|
const DFromV<decltype(v)> d;
|
|
518
537
|
using TU = MakeUnsigned<T>;
|
|
519
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
538
|
+
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
|
|
520
539
|
const __m256i vu = BitCast(RebindToUnsigned<decltype(d)>(), v).raw;
|
|
521
540
|
return BitCast(d, Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
|
|
522
541
|
#else
|
|
@@ -527,7 +546,7 @@ HWY_API Vec256<T> Not(const Vec256<T> v) {
|
|
|
527
546
|
// ------------------------------ Xor3
|
|
528
547
|
template <typename T>
|
|
529
548
|
HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
|
|
530
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
549
|
+
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
|
|
531
550
|
const DFromV<decltype(x1)> d;
|
|
532
551
|
const RebindToUnsigned<decltype(d)> du;
|
|
533
552
|
using VU = VFromD<decltype(du)>;
|
|
@@ -542,7 +561,7 @@ HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
|
|
|
542
561
|
// ------------------------------ Or3
|
|
543
562
|
template <typename T>
|
|
544
563
|
HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
|
|
545
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
564
|
+
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
|
|
546
565
|
const DFromV<decltype(o1)> d;
|
|
547
566
|
const RebindToUnsigned<decltype(d)> du;
|
|
548
567
|
using VU = VFromD<decltype(du)>;
|
|
@@ -557,7 +576,7 @@ HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
|
|
|
557
576
|
// ------------------------------ OrAnd
|
|
558
577
|
template <typename T>
|
|
559
578
|
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
|
|
560
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
579
|
+
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
|
|
561
580
|
const DFromV<decltype(o)> d;
|
|
562
581
|
const RebindToUnsigned<decltype(d)> du;
|
|
563
582
|
using VU = VFromD<decltype(du)>;
|
|
@@ -572,7 +591,7 @@ HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
|
|
|
572
591
|
// ------------------------------ IfVecThenElse
|
|
573
592
|
template <typename T>
|
|
574
593
|
HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
|
|
575
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
594
|
+
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
|
|
576
595
|
const DFromV<decltype(yes)> d;
|
|
577
596
|
const RebindToUnsigned<decltype(d)> du;
|
|
578
597
|
using VU = VFromD<decltype(du)>;
|
|
@@ -768,13 +787,6 @@ HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
|
|
|
768
787
|
return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
|
|
769
788
|
}
|
|
770
789
|
|
|
771
|
-
template <typename T>
|
|
772
|
-
HWY_API Vec256<T> ZeroIfNegative(const Vec256<T> v) {
|
|
773
|
-
static_assert(IsSigned<T>(), "Only for float");
|
|
774
|
-
// AVX3 MaskFromVec only looks at the MSB
|
|
775
|
-
return IfThenZeroElse(MaskFromVec(v), v);
|
|
776
|
-
}
|
|
777
|
-
|
|
778
790
|
// ------------------------------ Mask logical
|
|
779
791
|
|
|
780
792
|
namespace detail {
|
|
@@ -1074,6 +1086,28 @@ HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
|
|
|
1074
1086
|
return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
|
|
1075
1087
|
}
|
|
1076
1088
|
|
|
1089
|
+
template <class D, HWY_IF_LANES_D(D, 32)>
|
|
1090
|
+
HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
|
|
1091
|
+
using RawM = decltype(MFromD<D>().raw);
|
|
1092
|
+
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
1093
|
+
return MFromD<D>{
|
|
1094
|
+
static_cast<RawM>(_kshiftli_mask32(static_cast<__mmask32>(m.raw), 1))};
|
|
1095
|
+
#else
|
|
1096
|
+
return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) << 1)};
|
|
1097
|
+
#endif
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
template <class D, HWY_IF_LANES_D(D, 32)>
|
|
1101
|
+
HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
|
|
1102
|
+
using RawM = decltype(MFromD<D>().raw);
|
|
1103
|
+
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
|
1104
|
+
return MFromD<D>{
|
|
1105
|
+
static_cast<RawM>(_kshiftri_mask32(static_cast<__mmask32>(m.raw), 1))};
|
|
1106
|
+
#else
|
|
1107
|
+
return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) >> 1)};
|
|
1108
|
+
#endif
|
|
1109
|
+
}
|
|
1110
|
+
|
|
1077
1111
|
#else // AVX2
|
|
1078
1112
|
|
|
1079
1113
|
// ------------------------------ Mask
|
|
@@ -1997,7 +2031,7 @@ HWY_API Vec256<int16_t> SaturatedAdd(Vec256<int16_t> a, Vec256<int16_t> b) {
|
|
|
1997
2031
|
return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
|
|
1998
2032
|
}
|
|
1999
2033
|
|
|
2000
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
2034
|
+
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
|
|
2001
2035
|
HWY_API Vec256<int32_t> SaturatedAdd(Vec256<int32_t> a, Vec256<int32_t> b) {
|
|
2002
2036
|
const DFromV<decltype(a)> d;
|
|
2003
2037
|
const auto sum = a + b;
|
|
@@ -2019,7 +2053,7 @@ HWY_API Vec256<int64_t> SaturatedAdd(Vec256<int64_t> a, Vec256<int64_t> b) {
|
|
|
2019
2053
|
i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
|
|
2020
2054
|
return IfThenElse(overflow_mask, overflow_result, sum);
|
|
2021
2055
|
}
|
|
2022
|
-
#endif // HWY_TARGET <= HWY_AVX3
|
|
2056
|
+
#endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
|
|
2023
2057
|
|
|
2024
2058
|
// ------------------------------ SaturatedSub
|
|
2025
2059
|
|
|
@@ -2041,7 +2075,7 @@ HWY_API Vec256<int16_t> SaturatedSub(Vec256<int16_t> a, Vec256<int16_t> b) {
|
|
|
2041
2075
|
return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
|
|
2042
2076
|
}
|
|
2043
2077
|
|
|
2044
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
2078
|
+
#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
|
|
2045
2079
|
HWY_API Vec256<int32_t> SaturatedSub(Vec256<int32_t> a, Vec256<int32_t> b) {
|
|
2046
2080
|
const DFromV<decltype(a)> d;
|
|
2047
2081
|
const auto diff = a - b;
|
|
@@ -2063,7 +2097,7 @@ HWY_API Vec256<int64_t> SaturatedSub(Vec256<int64_t> a, Vec256<int64_t> b) {
|
|
|
2063
2097
|
i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
|
|
2064
2098
|
return IfThenElse(overflow_mask, overflow_result, diff);
|
|
2065
2099
|
}
|
|
2066
|
-
#endif // HWY_TARGET <= HWY_AVX3
|
|
2100
|
+
#endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
|
|
2067
2101
|
|
|
2068
2102
|
// ------------------------------ Average
|
|
2069
2103
|
|
|
@@ -2249,14 +2283,29 @@ HWY_API Vec256<int8_t> ShiftRight(Vec256<int8_t> v) {
|
|
|
2249
2283
|
|
|
2250
2284
|
// ------------------------------ RotateRight
|
|
2251
2285
|
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2286
|
+
// U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8
|
|
2287
|
+
// RotateRight uses detail::GaloisAffine on AVX3_DL
|
|
2288
|
+
|
|
2289
|
+
#if HWY_TARGET > HWY_AVX3_DL
|
|
2290
|
+
template <int kBits>
|
|
2291
|
+
HWY_API Vec256<uint8_t> RotateRight(const Vec256<uint8_t> v) {
|
|
2292
|
+
static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
|
|
2293
|
+
if (kBits == 0) return v;
|
|
2294
|
+
// AVX3 does not support 8-bit.
|
|
2295
|
+
return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
|
|
2296
|
+
}
|
|
2297
|
+
#endif
|
|
2298
|
+
|
|
2299
|
+
template <int kBits>
|
|
2300
|
+
HWY_API Vec256<uint16_t> RotateRight(const Vec256<uint16_t> v) {
|
|
2301
|
+
static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
|
|
2256
2302
|
if (kBits == 0) return v;
|
|
2257
|
-
|
|
2258
|
-
return
|
|
2259
|
-
|
|
2303
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
2304
|
+
return Vec256<uint16_t>{_mm256_shrdi_epi16(v.raw, v.raw, kBits)};
|
|
2305
|
+
#else
|
|
2306
|
+
// AVX3 does not support 16-bit.
|
|
2307
|
+
return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
|
|
2308
|
+
#endif
|
|
2260
2309
|
}
|
|
2261
2310
|
|
|
2262
2311
|
template <int kBits>
|
|
@@ -2281,6 +2330,38 @@ HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) {
|
|
|
2281
2330
|
#endif
|
|
2282
2331
|
}
|
|
2283
2332
|
|
|
2333
|
+
// ------------------------------ Rol/Ror
|
|
2334
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
2335
|
+
template <class T, HWY_IF_UI16(T)>
|
|
2336
|
+
HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
|
|
2337
|
+
return Vec256<T>{_mm256_shrdv_epi16(a.raw, a.raw, b.raw)};
|
|
2338
|
+
}
|
|
2339
|
+
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
2340
|
+
|
|
2341
|
+
#if HWY_TARGET <= HWY_AVX3
|
|
2342
|
+
|
|
2343
|
+
template <class T, HWY_IF_UI32(T)>
|
|
2344
|
+
HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
|
|
2345
|
+
return Vec256<T>{_mm256_rolv_epi32(a.raw, b.raw)};
|
|
2346
|
+
}
|
|
2347
|
+
|
|
2348
|
+
template <class T, HWY_IF_UI32(T)>
|
|
2349
|
+
HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
|
|
2350
|
+
return Vec256<T>{_mm256_rorv_epi32(a.raw, b.raw)};
|
|
2351
|
+
}
|
|
2352
|
+
|
|
2353
|
+
template <class T, HWY_IF_UI64(T)>
|
|
2354
|
+
HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
|
|
2355
|
+
return Vec256<T>{_mm256_rolv_epi64(a.raw, b.raw)};
|
|
2356
|
+
}
|
|
2357
|
+
|
|
2358
|
+
template <class T, HWY_IF_UI64(T)>
|
|
2359
|
+
HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
|
|
2360
|
+
return Vec256<T>{_mm256_rorv_epi64(a.raw, b.raw)};
|
|
2361
|
+
}
|
|
2362
|
+
|
|
2363
|
+
#endif
|
|
2364
|
+
|
|
2284
2365
|
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
|
|
2285
2366
|
|
|
2286
2367
|
HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
|
|
@@ -3150,6 +3231,15 @@ HWY_API Mask256<float16_t> IsNaN(Vec256<float16_t> v) {
|
|
|
3150
3231
|
v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
|
|
3151
3232
|
}
|
|
3152
3233
|
|
|
3234
|
+
HWY_API Mask256<float16_t> IsEitherNaN(Vec256<float16_t> a,
|
|
3235
|
+
Vec256<float16_t> b) {
|
|
3236
|
+
// Work around warnings in the intrinsic definitions (passing -1 as a mask).
|
|
3237
|
+
HWY_DIAGNOSTICS(push)
|
|
3238
|
+
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
|
|
3239
|
+
return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3240
|
+
HWY_DIAGNOSTICS(pop)
|
|
3241
|
+
}
|
|
3242
|
+
|
|
3153
3243
|
HWY_API Mask256<float16_t> IsInf(Vec256<float16_t> v) {
|
|
3154
3244
|
return Mask256<float16_t>{_mm256_fpclass_ph_mask(
|
|
3155
3245
|
v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
|
|
@@ -3182,6 +3272,22 @@ HWY_API Mask256<double> IsNaN(Vec256<double> v) {
|
|
|
3182
3272
|
#endif
|
|
3183
3273
|
}
|
|
3184
3274
|
|
|
3275
|
+
HWY_API Mask256<float> IsEitherNaN(Vec256<float> a, Vec256<float> b) {
|
|
3276
|
+
#if HWY_TARGET <= HWY_AVX3
|
|
3277
|
+
return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3278
|
+
#else
|
|
3279
|
+
return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3280
|
+
#endif
|
|
3281
|
+
}
|
|
3282
|
+
|
|
3283
|
+
HWY_API Mask256<double> IsEitherNaN(Vec256<double> a, Vec256<double> b) {
|
|
3284
|
+
#if HWY_TARGET <= HWY_AVX3
|
|
3285
|
+
return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3286
|
+
#else
|
|
3287
|
+
return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3288
|
+
#endif
|
|
3289
|
+
}
|
|
3290
|
+
|
|
3185
3291
|
#if HWY_TARGET <= HWY_AVX3
|
|
3186
3292
|
|
|
3187
3293
|
HWY_API Mask256<float> IsInf(Vec256<float> v) {
|
|
@@ -3716,20 +3822,14 @@ HWY_API Vec256<double> NativeGather256(const double* HWY_RESTRICT base,
|
|
|
3716
3822
|
} // namespace detail
|
|
3717
3823
|
|
|
3718
3824
|
template <class D, HWY_IF_V_SIZE_D(D, 32)>
|
|
3719
|
-
HWY_API VFromD<D> GatherOffset(D d
|
|
3825
|
+
HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
|
|
3720
3826
|
VFromD<RebindToSigned<D>> offsets) {
|
|
3721
|
-
const RebindToSigned<decltype(d)> di;
|
|
3722
|
-
(void)di; // for HWY_DASSERT
|
|
3723
|
-
HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
|
|
3724
3827
|
return detail::NativeGather256<1>(base, offsets);
|
|
3725
3828
|
}
|
|
3726
3829
|
|
|
3727
3830
|
template <class D, HWY_IF_V_SIZE_D(D, 32)>
|
|
3728
|
-
HWY_API VFromD<D> GatherIndex(D d
|
|
3831
|
+
HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
|
|
3729
3832
|
VFromD<RebindToSigned<D>> indices) {
|
|
3730
|
-
const RebindToSigned<decltype(d)> di;
|
|
3731
|
-
(void)di; // for HWY_DASSERT
|
|
3732
|
-
HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
|
|
3733
3833
|
return detail::NativeGather256<sizeof(TFromD<D>)>(base, indices);
|
|
3734
3834
|
}
|
|
3735
3835
|
|
|
@@ -3802,12 +3902,9 @@ HWY_API Vec256<double> NativeMaskedGatherOr256(Vec256<double> no,
|
|
|
3802
3902
|
} // namespace detail
|
|
3803
3903
|
|
|
3804
3904
|
template <class D, HWY_IF_V_SIZE_D(D, 32)>
|
|
3805
|
-
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d
|
|
3905
|
+
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
|
|
3806
3906
|
const TFromD<D>* HWY_RESTRICT base,
|
|
3807
3907
|
VFromD<RebindToSigned<D>> indices) {
|
|
3808
|
-
const RebindToSigned<decltype(d)> di;
|
|
3809
|
-
(void)di; // for HWY_DASSERT
|
|
3810
|
-
HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
|
|
3811
3908
|
return detail::NativeMaskedGatherOr256<sizeof(TFromD<D>)>(no, m, base,
|
|
3812
3909
|
indices);
|
|
3813
3910
|
}
|
|
@@ -5218,6 +5315,72 @@ HWY_API Vec256<double> OddEven(Vec256<double> a, Vec256<double> b) {
|
|
|
5218
5315
|
return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
|
|
5219
5316
|
}
|
|
5220
5317
|
|
|
5318
|
+
// -------------------------- InterleaveEven
|
|
5319
|
+
|
|
5320
|
+
#if HWY_TARGET <= HWY_AVX3
|
|
5321
|
+
template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
|
|
5322
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
5323
|
+
return VFromD<D>{_mm256_mask_shuffle_epi32(
|
|
5324
|
+
a.raw, static_cast<__mmask8>(0xAA), b.raw,
|
|
5325
|
+
static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
|
|
5326
|
+
}
|
|
5327
|
+
template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
|
|
5328
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
5329
|
+
return VFromD<D>{_mm256_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0xAA),
|
|
5330
|
+
b.raw, b.raw,
|
|
5331
|
+
_MM_SHUFFLE(2, 2, 0, 0))};
|
|
5332
|
+
}
|
|
5333
|
+
#else
|
|
5334
|
+
template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
|
|
5335
|
+
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
|
|
5336
|
+
const RebindToFloat<decltype(d)> df;
|
|
5337
|
+
const VFromD<decltype(df)> b2_b0_a2_a0{_mm256_shuffle_ps(
|
|
5338
|
+
BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(2, 0, 2, 0))};
|
|
5339
|
+
return BitCast(
|
|
5340
|
+
d, VFromD<decltype(df)>{_mm256_shuffle_ps(
|
|
5341
|
+
b2_b0_a2_a0.raw, b2_b0_a2_a0.raw, _MM_SHUFFLE(3, 1, 2, 0))});
|
|
5342
|
+
}
|
|
5343
|
+
#endif
|
|
5344
|
+
|
|
5345
|
+
// I64/U64/F64 InterleaveEven is generic for vector lengths >= 32 bytes
|
|
5346
|
+
template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
|
|
5347
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
5348
|
+
return InterleaveLower(a, b);
|
|
5349
|
+
}
|
|
5350
|
+
|
|
5351
|
+
// -------------------------- InterleaveOdd
|
|
5352
|
+
|
|
5353
|
+
#if HWY_TARGET <= HWY_AVX3
|
|
5354
|
+
template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
|
|
5355
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
5356
|
+
return VFromD<D>{_mm256_mask_shuffle_epi32(
|
|
5357
|
+
b.raw, static_cast<__mmask8>(0x55), a.raw,
|
|
5358
|
+
static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
|
|
5359
|
+
}
|
|
5360
|
+
template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
|
|
5361
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
5362
|
+
return VFromD<D>{_mm256_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x55),
|
|
5363
|
+
a.raw, a.raw,
|
|
5364
|
+
_MM_SHUFFLE(3, 3, 1, 1))};
|
|
5365
|
+
}
|
|
5366
|
+
#else
|
|
5367
|
+
template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
|
|
5368
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
5369
|
+
const RebindToFloat<decltype(d)> df;
|
|
5370
|
+
const VFromD<decltype(df)> b3_b1_a3_a3{_mm256_shuffle_ps(
|
|
5371
|
+
BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(3, 1, 3, 1))};
|
|
5372
|
+
return BitCast(
|
|
5373
|
+
d, VFromD<decltype(df)>{_mm256_shuffle_ps(
|
|
5374
|
+
b3_b1_a3_a3.raw, b3_b1_a3_a3.raw, _MM_SHUFFLE(3, 1, 2, 0))});
|
|
5375
|
+
}
|
|
5376
|
+
#endif
|
|
5377
|
+
|
|
5378
|
+
// I64/U64/F64 InterleaveOdd is generic for vector lengths >= 32 bytes
|
|
5379
|
+
template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
|
|
5380
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
5381
|
+
return InterleaveUpper(d, a, b);
|
|
5382
|
+
}
|
|
5383
|
+
|
|
5221
5384
|
// ------------------------------ OddEvenBlocks
|
|
5222
5385
|
|
|
5223
5386
|
template <typename T, HWY_IF_NOT_FLOAT3264(T)>
|
|
@@ -5969,62 +6132,6 @@ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
|
|
|
5969
6132
|
#endif
|
|
5970
6133
|
}
|
|
5971
6134
|
|
|
5972
|
-
HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
|
|
5973
|
-
const Vec256<uint64_t> b) {
|
|
5974
|
-
const Full256<uint64_t> du64;
|
|
5975
|
-
const RepartitionToNarrow<decltype(du64)> du32;
|
|
5976
|
-
const auto maskL = Set(du64, 0xFFFFFFFFULL);
|
|
5977
|
-
const auto a32 = BitCast(du32, a);
|
|
5978
|
-
const auto b32 = BitCast(du32, b);
|
|
5979
|
-
// Inputs for MulEven: we only need the lower 32 bits
|
|
5980
|
-
const auto aH = Shuffle2301(a32);
|
|
5981
|
-
const auto bH = Shuffle2301(b32);
|
|
5982
|
-
|
|
5983
|
-
// Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
|
|
5984
|
-
// the even (lower 64 bits of every 128-bit block) results. See
|
|
5985
|
-
// https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
|
|
5986
|
-
const auto aLbL = MulEven(a32, b32);
|
|
5987
|
-
const auto w3 = aLbL & maskL;
|
|
5988
|
-
|
|
5989
|
-
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
|
|
5990
|
-
const auto w2 = t2 & maskL;
|
|
5991
|
-
const auto w1 = ShiftRight<32>(t2);
|
|
5992
|
-
|
|
5993
|
-
const auto t = MulEven(a32, bH) + w2;
|
|
5994
|
-
const auto k = ShiftRight<32>(t);
|
|
5995
|
-
|
|
5996
|
-
const auto mulH = MulEven(aH, bH) + w1 + k;
|
|
5997
|
-
const auto mulL = ShiftLeft<32>(t) + w3;
|
|
5998
|
-
return InterleaveLower(mulL, mulH);
|
|
5999
|
-
}
|
|
6000
|
-
|
|
6001
|
-
HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
|
|
6002
|
-
const Vec256<uint64_t> b) {
|
|
6003
|
-
const Full256<uint64_t> du64;
|
|
6004
|
-
const RepartitionToNarrow<decltype(du64)> du32;
|
|
6005
|
-
const auto maskL = Set(du64, 0xFFFFFFFFULL);
|
|
6006
|
-
const auto a32 = BitCast(du32, a);
|
|
6007
|
-
const auto b32 = BitCast(du32, b);
|
|
6008
|
-
// Inputs for MulEven: we only need bits [95:64] (= upper half of input)
|
|
6009
|
-
const auto aH = Shuffle2301(a32);
|
|
6010
|
-
const auto bH = Shuffle2301(b32);
|
|
6011
|
-
|
|
6012
|
-
// Same as above, but we're using the odd results (upper 64 bits per block).
|
|
6013
|
-
const auto aLbL = MulEven(a32, b32);
|
|
6014
|
-
const auto w3 = aLbL & maskL;
|
|
6015
|
-
|
|
6016
|
-
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
|
|
6017
|
-
const auto w2 = t2 & maskL;
|
|
6018
|
-
const auto w1 = ShiftRight<32>(t2);
|
|
6019
|
-
|
|
6020
|
-
const auto t = MulEven(a32, bH) + w2;
|
|
6021
|
-
const auto k = ShiftRight<32>(t);
|
|
6022
|
-
|
|
6023
|
-
const auto mulH = MulEven(aH, bH) + w1 + k;
|
|
6024
|
-
const auto mulL = ShiftLeft<32>(t) + w3;
|
|
6025
|
-
return InterleaveUpper(du64, mulL, mulH);
|
|
6026
|
-
}
|
|
6027
|
-
|
|
6028
6135
|
// ------------------------------ WidenMulPairwiseAdd
|
|
6029
6136
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
|
|
6030
6137
|
HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
|
|
@@ -6041,7 +6148,31 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
|
|
|
6041
6148
|
return VFromD<DI16>{_mm256_maddubs_epi16(a.raw, b.raw)};
|
|
6042
6149
|
}
|
|
6043
6150
|
|
|
6151
|
+
// ------------------------------ SatWidenMulPairwiseAccumulate
|
|
6152
|
+
|
|
6153
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
6154
|
+
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 32)>
|
|
6155
|
+
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
|
|
6156
|
+
DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
|
|
6157
|
+
VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
|
|
6158
|
+
return VFromD<DI32>{_mm256_dpwssds_epi32(sum.raw, a.raw, b.raw)};
|
|
6159
|
+
}
|
|
6160
|
+
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
6161
|
+
|
|
6044
6162
|
// ------------------------------ ReorderWidenMulAccumulate
|
|
6163
|
+
|
|
6164
|
+
#if HWY_NATIVE_DOT_BF16
|
|
6165
|
+
template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 32),
|
|
6166
|
+
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
|
|
6167
|
+
HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
|
|
6168
|
+
const VFromD<DF> sum0,
|
|
6169
|
+
VFromD<DF>& /*sum1*/) {
|
|
6170
|
+
return VFromD<DF>{_mm256_dpbf16_ps(sum0.raw,
|
|
6171
|
+
reinterpret_cast<__m256bh>(a.raw),
|
|
6172
|
+
reinterpret_cast<__m256bh>(b.raw))};
|
|
6173
|
+
}
|
|
6174
|
+
#endif // HWY_NATIVE_DOT_BF16
|
|
6175
|
+
|
|
6045
6176
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
|
|
6046
6177
|
HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec256<int16_t> a,
|
|
6047
6178
|
Vec256<int16_t> b,
|
|
@@ -6159,19 +6290,63 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<int8_t> v) {
|
|
|
6159
6290
|
|
|
6160
6291
|
#if HWY_TARGET <= HWY_AVX3
|
|
6161
6292
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
|
|
6162
|
-
HWY_API VFromD<D>
|
|
6163
|
-
|
|
6164
|
-
|
|
6165
|
-
|
|
6293
|
+
HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
|
|
6294
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6295
|
+
// Workaround for undefined behavior with GCC if any values of v[i] are not
|
|
6296
|
+
// within the range of an int64_t
|
|
6297
|
+
|
|
6298
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6299
|
+
if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
|
|
6300
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
|
|
6301
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
6302
|
+
return VFromD<D>{_mm256_setr_epi64x(
|
|
6303
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
|
|
6304
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
|
|
6305
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
|
|
6306
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]))};
|
|
6307
|
+
}
|
|
6308
|
+
#endif
|
|
6166
6309
|
|
|
6167
|
-
|
|
6168
|
-
|
|
6169
|
-
|
|
6310
|
+
__m256i raw_result;
|
|
6311
|
+
__asm__("vcvttps2qq {%1, %0|%0, %1}"
|
|
6312
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6313
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6314
|
+
:);
|
|
6315
|
+
return VFromD<D>{raw_result};
|
|
6316
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
6317
|
+
return VFromD<D>{_mm256_cvttps_epi64(v.raw)};
|
|
6318
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
6170
6319
|
}
|
|
6171
6320
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
|
|
6172
|
-
HWY_API VFromD<D>
|
|
6173
|
-
|
|
6174
|
-
|
|
6321
|
+
HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
6322
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6323
|
+
// Workaround for undefined behavior with GCC if any values of v[i] are not
|
|
6324
|
+
// within the range of an uint64_t
|
|
6325
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6326
|
+
if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
|
|
6327
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
|
|
6328
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
6329
|
+
return VFromD<D>{_mm256_setr_epi64x(
|
|
6330
|
+
static_cast<int64_t>(
|
|
6331
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
|
|
6332
|
+
static_cast<int64_t>(
|
|
6333
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
|
|
6334
|
+
static_cast<int64_t>(
|
|
6335
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
|
|
6336
|
+
static_cast<int64_t>(
|
|
6337
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])))};
|
|
6338
|
+
}
|
|
6339
|
+
#endif
|
|
6340
|
+
|
|
6341
|
+
__m256i raw_result;
|
|
6342
|
+
__asm__("vcvttps2uqq {%1, %0|%0, %1}"
|
|
6343
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6344
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6345
|
+
:);
|
|
6346
|
+
return VFromD<D>{raw_result};
|
|
6347
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
6348
|
+
return VFromD<D>{_mm256_cvttps_epu64(v.raw)};
|
|
6349
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
6175
6350
|
}
|
|
6176
6351
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
6177
6352
|
|
|
@@ -6341,24 +6516,38 @@ HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec256<double> v) {
|
|
|
6341
6516
|
}
|
|
6342
6517
|
#endif // HWY_HAVE_FLOAT16
|
|
6343
6518
|
|
|
6519
|
+
#if HWY_AVX3_HAVE_F32_TO_BF16C
|
|
6344
6520
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
|
|
6345
|
-
HWY_API VFromD<D> DemoteTo(D dbf16
|
|
6346
|
-
|
|
6347
|
-
|
|
6348
|
-
|
|
6349
|
-
|
|
6350
|
-
|
|
6351
|
-
|
|
6521
|
+
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec256<float> v) {
|
|
6522
|
+
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
|
|
6523
|
+
// Inline assembly workaround for LLVM codegen bug
|
|
6524
|
+
__m128i raw_result;
|
|
6525
|
+
__asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
|
|
6526
|
+
return VFromD<D>{raw_result};
|
|
6527
|
+
#else
|
|
6528
|
+
// The _mm256_cvtneps_pbh intrinsic returns a __m128bh vector that needs to be
|
|
6529
|
+
// bit casted to a __m128i vector
|
|
6530
|
+
return VFromD<D>{detail::BitCastToInteger(_mm256_cvtneps_pbh(v.raw))};
|
|
6531
|
+
#endif
|
|
6352
6532
|
}
|
|
6353
6533
|
|
|
6354
6534
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
|
|
6355
|
-
HWY_API VFromD<D> ReorderDemote2To(D dbf16
|
|
6356
|
-
|
|
6357
|
-
|
|
6358
|
-
|
|
6359
|
-
|
|
6360
|
-
|
|
6535
|
+
HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec256<float> a,
|
|
6536
|
+
Vec256<float> b) {
|
|
6537
|
+
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
|
|
6538
|
+
// Inline assembly workaround for LLVM codegen bug
|
|
6539
|
+
__m256i raw_result;
|
|
6540
|
+
__asm__("vcvtne2ps2bf16 %2, %1, %0"
|
|
6541
|
+
: "=v"(raw_result)
|
|
6542
|
+
: "v"(b.raw), "v"(a.raw));
|
|
6543
|
+
return VFromD<D>{raw_result};
|
|
6544
|
+
#else
|
|
6545
|
+
// The _mm256_cvtne2ps_pbh intrinsic returns a __m256bh vector that needs to
|
|
6546
|
+
// be bit casted to a __m256i vector
|
|
6547
|
+
return VFromD<D>{detail::BitCastToInteger(_mm256_cvtne2ps_pbh(b.raw, a.raw))};
|
|
6548
|
+
#endif
|
|
6361
6549
|
}
|
|
6550
|
+
#endif // HWY_AVX3_HAVE_F32_TO_BF16C
|
|
6362
6551
|
|
|
6363
6552
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
|
|
6364
6553
|
HWY_API VFromD<D> ReorderDemote2To(D /*d16*/, Vec256<int32_t> a,
|
|
@@ -6449,9 +6638,9 @@ HWY_API Vec256<uint32_t> ReorderDemote2To(D dn, Vec256<int64_t> a,
|
|
|
6449
6638
|
_MM_SHUFFLE(2, 0, 2, 0))});
|
|
6450
6639
|
}
|
|
6451
6640
|
|
|
6452
|
-
template <class D, HWY_IF_V_SIZE_D(D, 32),
|
|
6453
|
-
HWY_API
|
|
6454
|
-
|
|
6641
|
+
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
|
|
6642
|
+
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec256<uint64_t> a,
|
|
6643
|
+
Vec256<uint64_t> b) {
|
|
6455
6644
|
const Half<decltype(dn)> dnh;
|
|
6456
6645
|
const Repartition<float, decltype(dn)> dn_f;
|
|
6457
6646
|
|
|
@@ -6483,37 +6672,64 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
|
|
|
6483
6672
|
}
|
|
6484
6673
|
|
|
6485
6674
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
|
|
6486
|
-
HWY_API VFromD<D>
|
|
6487
|
-
|
|
6488
|
-
|
|
6489
|
-
|
|
6675
|
+
HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
|
|
6676
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6677
|
+
// Workaround for undefined behavior in _mm256_cvttpd_epi32 with GCC if any
|
|
6678
|
+
// values of v[i] are not within the range of an int32_t
|
|
6679
|
+
|
|
6680
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6681
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
6682
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
|
|
6683
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
6684
|
+
return Dup128VecFromValues(
|
|
6685
|
+
D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
|
|
6686
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
|
|
6687
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
|
|
6688
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]));
|
|
6689
|
+
}
|
|
6690
|
+
#endif
|
|
6691
|
+
|
|
6692
|
+
__m128i raw_result;
|
|
6693
|
+
__asm__("vcvttpd2dq {%1, %0|%0, %1}"
|
|
6694
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6695
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6696
|
+
:);
|
|
6697
|
+
return VFromD<D>{raw_result};
|
|
6698
|
+
#else
|
|
6699
|
+
return VFromD<D>{_mm256_cvttpd_epi32(v.raw)};
|
|
6700
|
+
#endif
|
|
6490
6701
|
}
|
|
6491
6702
|
|
|
6492
|
-
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
|
|
6493
|
-
HWY_API VFromD<D> DemoteTo(D du32, Vec256<double> v) {
|
|
6494
6703
|
#if HWY_TARGET <= HWY_AVX3
|
|
6495
|
-
|
|
6496
|
-
|
|
6497
|
-
|
|
6498
|
-
|
|
6499
|
-
|
|
6500
|
-
|
|
6501
|
-
|
|
6502
|
-
|
|
6503
|
-
|
|
6704
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
|
|
6705
|
+
HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
|
|
6706
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6707
|
+
// Workaround for undefined behavior in _mm256_cvttpd_epu32 with GCC if any
|
|
6708
|
+
// values of v[i] are not within the range of an uint32_t
|
|
6709
|
+
|
|
6710
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6711
|
+
if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
|
|
6712
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
|
|
6713
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
6714
|
+
return Dup128VecFromValues(
|
|
6715
|
+
D(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
|
|
6716
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]),
|
|
6717
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2]),
|
|
6718
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3]));
|
|
6719
|
+
}
|
|
6720
|
+
#endif
|
|
6504
6721
|
|
|
6505
|
-
|
|
6506
|
-
|
|
6507
|
-
|
|
6508
|
-
|
|
6509
|
-
|
|
6510
|
-
|
|
6511
|
-
|
|
6512
|
-
return
|
|
6722
|
+
__m128i raw_result;
|
|
6723
|
+
__asm__("vcvttpd2udq {%1, %0|%0, %1}"
|
|
6724
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6725
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6726
|
+
:);
|
|
6727
|
+
return VFromD<D>{raw_result};
|
|
6728
|
+
#else
|
|
6729
|
+
return VFromD<D>{_mm256_cvttpd_epu32(v.raw)};
|
|
6513
6730
|
#endif
|
|
6514
6731
|
}
|
|
6515
6732
|
|
|
6516
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
6517
6733
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
6518
6734
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
6519
6735
|
return VFromD<D>{_mm256_cvtepi64_ps(v.raw)};
|
|
@@ -6679,66 +6895,274 @@ HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<uint64_t> v) {
|
|
|
6679
6895
|
|
|
6680
6896
|
#if HWY_HAVE_FLOAT16
|
|
6681
6897
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
|
|
6682
|
-
HWY_API VFromD<D>
|
|
6683
|
-
|
|
6684
|
-
|
|
6898
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float16_t> v) {
|
|
6899
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6900
|
+
// Workaround for undefined behavior in _mm256_cvttph_epi16 with GCC if any
|
|
6901
|
+
// values of v[i] are not within the range of an int16_t
|
|
6902
|
+
|
|
6903
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
|
|
6904
|
+
HWY_HAVE_SCALAR_F16_TYPE
|
|
6905
|
+
if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
|
|
6906
|
+
typedef hwy::float16_t::Native GccF16RawVectType
|
|
6907
|
+
__attribute__((__vector_size__(32)));
|
|
6908
|
+
const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
|
|
6909
|
+
return VFromD<D>{_mm256_setr_epi16(
|
|
6910
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]),
|
|
6911
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
|
|
6912
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
|
|
6913
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
|
|
6914
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
|
|
6915
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
|
|
6916
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
|
|
6917
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]),
|
|
6918
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[8]),
|
|
6919
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[9]),
|
|
6920
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[10]),
|
|
6921
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[11]),
|
|
6922
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[12]),
|
|
6923
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[13]),
|
|
6924
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[14]),
|
|
6925
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[15]))};
|
|
6926
|
+
}
|
|
6927
|
+
#endif
|
|
6928
|
+
|
|
6929
|
+
__m256i raw_result;
|
|
6930
|
+
__asm__("vcvttph2w {%1, %0|%0, %1}"
|
|
6931
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6932
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6933
|
+
:);
|
|
6934
|
+
return VFromD<D>{raw_result};
|
|
6935
|
+
#else // HWY_COMPILER_GCC_ACTUAL < 1200
|
|
6936
|
+
return VFromD<D>{_mm256_cvttph_epi16(v.raw)};
|
|
6937
|
+
#endif
|
|
6685
6938
|
}
|
|
6686
6939
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
|
|
6687
|
-
HWY_API VFromD<D>
|
|
6688
|
-
|
|
6689
|
-
|
|
6940
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
|
|
6941
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6942
|
+
// Workaround for undefined behavior in _mm256_cvttph_epu16 with GCC if any
|
|
6943
|
+
// values of v[i] are not within the range of an uint16_t
|
|
6944
|
+
|
|
6945
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
|
|
6946
|
+
HWY_HAVE_SCALAR_F16_TYPE
|
|
6947
|
+
if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
|
|
6948
|
+
typedef hwy::float16_t::Native GccF16RawVectType
|
|
6949
|
+
__attribute__((__vector_size__(32)));
|
|
6950
|
+
const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
|
|
6951
|
+
return VFromD<D>{_mm256_setr_epi16(
|
|
6952
|
+
static_cast<int16_t>(
|
|
6953
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0])),
|
|
6954
|
+
static_cast<int16_t>(
|
|
6955
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1])),
|
|
6956
|
+
static_cast<int16_t>(
|
|
6957
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2])),
|
|
6958
|
+
static_cast<int16_t>(
|
|
6959
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3])),
|
|
6960
|
+
static_cast<int16_t>(
|
|
6961
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4])),
|
|
6962
|
+
static_cast<int16_t>(
|
|
6963
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5])),
|
|
6964
|
+
static_cast<int16_t>(
|
|
6965
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6])),
|
|
6966
|
+
static_cast<int16_t>(
|
|
6967
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])),
|
|
6968
|
+
static_cast<int16_t>(
|
|
6969
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[8])),
|
|
6970
|
+
static_cast<int16_t>(
|
|
6971
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[9])),
|
|
6972
|
+
static_cast<int16_t>(
|
|
6973
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[10])),
|
|
6974
|
+
static_cast<int16_t>(
|
|
6975
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[11])),
|
|
6976
|
+
static_cast<int16_t>(
|
|
6977
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[12])),
|
|
6978
|
+
static_cast<int16_t>(
|
|
6979
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[13])),
|
|
6980
|
+
static_cast<int16_t>(
|
|
6981
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[14])),
|
|
6982
|
+
static_cast<int16_t>(
|
|
6983
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[15])))};
|
|
6984
|
+
}
|
|
6985
|
+
#endif
|
|
6986
|
+
|
|
6987
|
+
__m256i raw_result;
|
|
6988
|
+
__asm__("vcvttph2uw {%1, %0|%0, %1}"
|
|
6989
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6990
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6991
|
+
:);
|
|
6992
|
+
return VFromD<D>{raw_result};
|
|
6993
|
+
#else // HWY_COMPILER_GCC_ACTUAL < 1200
|
|
6994
|
+
return VFromD<D>{_mm256_cvttph_epu16(v.raw)};
|
|
6995
|
+
#endif
|
|
6690
6996
|
}
|
|
6691
6997
|
#endif // HWY_HAVE_FLOAT16
|
|
6692
6998
|
|
|
6693
6999
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
|
|
6694
|
-
HWY_API VFromD<D>
|
|
6695
|
-
|
|
6696
|
-
|
|
7000
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float> v) {
|
|
7001
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7002
|
+
// Workaround for undefined behavior in _mm256_cvttps_epi32 with GCC if any
|
|
7003
|
+
// values of v[i] are not within the range of an int32_t
|
|
7004
|
+
|
|
7005
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7006
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
7007
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
|
|
7008
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
7009
|
+
return VFromD<D>{_mm256_setr_epi32(
|
|
7010
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
|
|
7011
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
|
|
7012
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
|
|
7013
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
|
|
7014
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
|
|
7015
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
|
|
7016
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
|
|
7017
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
|
|
7018
|
+
}
|
|
7019
|
+
#endif
|
|
7020
|
+
|
|
7021
|
+
__m256i raw_result;
|
|
7022
|
+
__asm__("vcvttps2dq {%1, %0|%0, %1}"
|
|
7023
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7024
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7025
|
+
:);
|
|
7026
|
+
return VFromD<D>{raw_result};
|
|
7027
|
+
#else
|
|
7028
|
+
return VFromD<D>{_mm256_cvttps_epi32(v.raw)};
|
|
7029
|
+
#endif
|
|
6697
7030
|
}
|
|
6698
7031
|
|
|
6699
7032
|
#if HWY_TARGET <= HWY_AVX3
|
|
6700
7033
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
|
|
6701
|
-
HWY_API VFromD<D>
|
|
6702
|
-
|
|
6703
|
-
|
|
7034
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec256<double> v) {
|
|
7035
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7036
|
+
// Workaround for undefined behavior in _mm256_cvttpd_epi64 with GCC if any
|
|
7037
|
+
// values of v[i] are not within the range of an int64_t
|
|
7038
|
+
|
|
7039
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7040
|
+
if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
|
|
7041
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
|
|
7042
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
7043
|
+
return VFromD<D>{_mm256_setr_epi64x(
|
|
7044
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
|
|
7045
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
|
|
7046
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
|
|
7047
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]))};
|
|
7048
|
+
}
|
|
7049
|
+
#endif
|
|
7050
|
+
|
|
7051
|
+
__m256i raw_result;
|
|
7052
|
+
__asm__("vcvttpd2qq {%1, %0|%0, %1}"
|
|
7053
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7054
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7055
|
+
:);
|
|
7056
|
+
return VFromD<D>{raw_result};
|
|
7057
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
7058
|
+
return VFromD<D>{_mm256_cvttpd_epi64(v.raw)};
|
|
7059
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
6704
7060
|
}
|
|
6705
7061
|
template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
|
|
6706
|
-
HWY_API VFromD<DU>
|
|
6707
|
-
|
|
6708
|
-
|
|
7062
|
+
HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
|
|
7063
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7064
|
+
// Workaround for undefined behavior in _mm256_cvttps_epu32 with GCC if any
|
|
7065
|
+
// values of v[i] are not within the range of an uint32_t
|
|
7066
|
+
|
|
7067
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7068
|
+
if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
|
|
7069
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
|
|
7070
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
7071
|
+
return VFromD<DU>{_mm256_setr_epi32(
|
|
7072
|
+
static_cast<int32_t>(
|
|
7073
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
|
|
7074
|
+
static_cast<int32_t>(
|
|
7075
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
|
|
7076
|
+
static_cast<int32_t>(
|
|
7077
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
|
|
7078
|
+
static_cast<int32_t>(
|
|
7079
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
|
|
7080
|
+
static_cast<int32_t>(
|
|
7081
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
|
|
7082
|
+
static_cast<int32_t>(
|
|
7083
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
|
|
7084
|
+
static_cast<int32_t>(
|
|
7085
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
|
|
7086
|
+
static_cast<int32_t>(
|
|
7087
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])))};
|
|
7088
|
+
}
|
|
7089
|
+
#endif
|
|
7090
|
+
|
|
7091
|
+
__m256i raw_result;
|
|
7092
|
+
__asm__("vcvttps2udq {%1, %0|%0, %1}"
|
|
7093
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7094
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7095
|
+
:);
|
|
7096
|
+
return VFromD<DU>{raw_result};
|
|
7097
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
7098
|
+
return VFromD<DU>{_mm256_cvttps_epu32(v.raw)};
|
|
7099
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
6709
7100
|
}
|
|
6710
7101
|
template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
|
|
6711
|
-
HWY_API VFromD<DU>
|
|
6712
|
-
|
|
6713
|
-
|
|
6714
|
-
|
|
6715
|
-
|
|
6716
|
-
|
|
6717
|
-
|
|
6718
|
-
|
|
6719
|
-
|
|
6720
|
-
|
|
6721
|
-
|
|
6722
|
-
|
|
6723
|
-
|
|
6724
|
-
|
|
6725
|
-
|
|
6726
|
-
|
|
6727
|
-
|
|
6728
|
-
|
|
6729
|
-
|
|
6730
|
-
|
|
6731
|
-
|
|
6732
|
-
|
|
6733
|
-
|
|
6734
|
-
|
|
7102
|
+
HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
|
|
7103
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7104
|
+
// Workaround for undefined behavior in _mm256_cvttpd_epu64 with GCC if any
|
|
7105
|
+
// values of v[i] are not within the range of an uint64_t
|
|
7106
|
+
|
|
7107
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7108
|
+
if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
|
|
7109
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
|
|
7110
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
7111
|
+
return VFromD<DU>{_mm256_setr_epi64x(
|
|
7112
|
+
static_cast<int64_t>(
|
|
7113
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
|
|
7114
|
+
static_cast<int64_t>(
|
|
7115
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
|
|
7116
|
+
static_cast<int64_t>(
|
|
7117
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
|
|
7118
|
+
static_cast<int64_t>(
|
|
7119
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])))};
|
|
7120
|
+
}
|
|
7121
|
+
#endif
|
|
7122
|
+
|
|
7123
|
+
__m256i raw_result;
|
|
7124
|
+
__asm__("vcvttpd2uqq {%1, %0|%0, %1}"
|
|
7125
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7126
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7127
|
+
:);
|
|
7128
|
+
return VFromD<DU>{raw_result};
|
|
7129
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
7130
|
+
return VFromD<DU>{_mm256_cvttpd_epu64(v.raw)};
|
|
7131
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
6735
7132
|
}
|
|
6736
7133
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
6737
7134
|
|
|
6738
|
-
|
|
6739
|
-
|
|
6740
|
-
|
|
6741
|
-
|
|
7135
|
+
template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I32_D(DI)>
|
|
7136
|
+
HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
|
|
7137
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7138
|
+
// Workaround for undefined behavior in _mm256_cvtps_epi32 if any values of
|
|
7139
|
+
// v[i] are not within the range of an int32_t
|
|
7140
|
+
|
|
7141
|
+
#if HWY_COMPILER_GCC >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7142
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
7143
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
|
|
7144
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
7145
|
+
return VFromD<DI>{
|
|
7146
|
+
_mm256_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
|
|
7147
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
|
|
7148
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
|
|
7149
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
|
|
7150
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
|
|
7151
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
|
|
7152
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
|
|
7153
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[7]))};
|
|
7154
|
+
}
|
|
7155
|
+
#endif
|
|
7156
|
+
|
|
7157
|
+
__m256i raw_result;
|
|
7158
|
+
__asm__("vcvtps2dq {%1, %0|%0, %1}"
|
|
7159
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7160
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7161
|
+
:);
|
|
7162
|
+
return VFromD<DI>{raw_result};
|
|
7163
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
7164
|
+
return VFromD<DI>{_mm256_cvtps_epi32(v.raw)};
|
|
7165
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
6742
7166
|
}
|
|
6743
7167
|
|
|
6744
7168
|
#ifndef HWY_DISABLE_F16C
|
|
@@ -8138,6 +8562,23 @@ HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
|
|
|
8138
8562
|
|
|
8139
8563
|
// ------------------------------ Reductions in generic_ops
|
|
8140
8564
|
|
|
8565
|
+
// ------------------------------ BitShuffle
|
|
8566
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
8567
|
+
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
|
|
8568
|
+
HWY_IF_V_SIZE_V(V, 32), HWY_IF_V_SIZE_V(VI, 32)>
|
|
8569
|
+
HWY_API V BitShuffle(V v, VI idx) {
|
|
8570
|
+
const DFromV<decltype(v)> d64;
|
|
8571
|
+
const RebindToUnsigned<decltype(d64)> du64;
|
|
8572
|
+
const Rebind<uint8_t, decltype(d64)> du8;
|
|
8573
|
+
|
|
8574
|
+
int32_t i32_bit_shuf_result =
|
|
8575
|
+
static_cast<int32_t>(_mm256_bitshuffle_epi64_mask(v.raw, idx.raw));
|
|
8576
|
+
|
|
8577
|
+
return BitCast(d64, PromoteTo(du64, VFromD<decltype(du8)>{_mm_cvtsi32_si128(
|
|
8578
|
+
i32_bit_shuf_result)}));
|
|
8579
|
+
}
|
|
8580
|
+
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
8581
|
+
|
|
8141
8582
|
// ------------------------------ LeadingZeroCount
|
|
8142
8583
|
|
|
8143
8584
|
#if HWY_TARGET <= HWY_AVX3
|