@img/sharp-libvips-dev 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
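For local verification, a comparison like this can be reproduced with npm's built-in `diff` command (available in npm 7 and later); the invocation below is a sketch for checking the registry contents yourself, not part of the rendered diff:

```sh
# Fetch both published versions from the registry and print their unified diff.
npm diff --diff=@img/sharp-libvips-dev@1.0.2 --diff=@img/sharp-libvips-dev@1.0.3
```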
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
```diff
--- a/package/include/hwy/ops/x86_128-inl.h
+++ b/package/include/hwy/ops/x86_128-inl.h
@@ -54,6 +54,22 @@ namespace detail {
 #define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
 #endif
 
+#undef HWY_AVX3_HAVE_F32_TO_BF16C
+#if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL && \
+    (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \
+    !defined(HWY_AVX3_DISABLE_AVX512BF16)
+#define HWY_AVX3_HAVE_F32_TO_BF16C 1
+#else
+#define HWY_AVX3_HAVE_F32_TO_BF16C 0
+#endif
+
+#undef HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT
+#if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
+#define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "v"
+#else
+#define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "x"
+#endif
+
 template <typename T>
 struct Raw128 {
   using type = __m128i;
@@ -228,9 +244,6 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
 template <class D>
 using VFromD = decltype(Zero(D()));
 
-// ------------------------------ Tuple (VFromD)
-#include "hwy/ops/tuple-inl.h"
-
 // ------------------------------ BitCast
 
 namespace detail {
@@ -242,6 +255,25 @@ HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); }
 HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
 HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
 
+#if HWY_AVX3_HAVE_F32_TO_BF16C
+HWY_INLINE __m128i BitCastToInteger(__m128bh v) {
+  // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
+  // bit cast a __m128bh to a __m128i as there is currently no intrinsic
+  // available (as of GCC 13 and Clang 17) that can bit cast a __m128bh vector
+  // to a __m128i vector
+
+#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+  // On GCC or Clang, use reinterpret_cast to bit cast a __m128bh to a __m128i
+  return reinterpret_cast<__m128i>(v);
+#else
+  // On MSVC, use BitCastScalar to bit cast a __m128bh to a __m128i as MSVC does
+  // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
+  // bit cast from one SSE/AVX vector type to a different SSE/AVX vector type
+  return BitCastScalar<__m128i>(v);
+#endif  // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+}
+#endif  // HWY_AVX3_HAVE_F32_TO_BF16C
+
 template <typename T, size_t N>
 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
   return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
@@ -502,6 +534,112 @@ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
   return VFromD<D>{_mm_setr_pd(t0, t1)};
 }
 
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+namespace detail {
+
+template <class RawV>
+static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+    hwy::SizeTag<1> /* num_of_lanes_tag*/, RawV v) {
+  return __builtin_constant_p(v[0]);
+}
+
+template <class RawV>
+static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+    hwy::SizeTag<2> /* num_of_lanes_tag*/, RawV v) {
+  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
+}
+
+template <class RawV>
+static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+    hwy::SizeTag<4> /* num_of_lanes_tag*/, RawV v) {
+  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
+}
+
+template <class RawV>
+static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+    hwy::SizeTag<8> /* num_of_lanes_tag*/, RawV v) {
+  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
+         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
+         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
+}
+
+template <class RawV>
+static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+    hwy::SizeTag<16> /* num_of_lanes_tag*/, RawV v) {
+  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
+         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
+         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
+         __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
+         __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
+         __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
+         __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
+}
+
+#if HWY_TARGET <= HWY_AVX2
+template <class RawV>
+static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+    hwy::SizeTag<32> /* num_of_lanes_tag*/, RawV v) {
+  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
+         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
+         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
+         __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
+         __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
+         __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
+         __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]) &&
+         __builtin_constant_p(v[16]) && __builtin_constant_p(v[17]) &&
+         __builtin_constant_p(v[18]) && __builtin_constant_p(v[19]) &&
+         __builtin_constant_p(v[20]) && __builtin_constant_p(v[21]) &&
+         __builtin_constant_p(v[22]) && __builtin_constant_p(v[23]) &&
+         __builtin_constant_p(v[24]) && __builtin_constant_p(v[25]) &&
+         __builtin_constant_p(v[26]) && __builtin_constant_p(v[27]) &&
+         __builtin_constant_p(v[28]) && __builtin_constant_p(v[29]) &&
+         __builtin_constant_p(v[30]) && __builtin_constant_p(v[31]);
+}
+#endif
+
+template <size_t kNumOfLanes, class V>
+static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86Vec(
+    hwy::SizeTag<kNumOfLanes> num_of_lanes_tag, V v) {
+  using T = TFromV<V>;
+#if HWY_HAVE_FLOAT16 && HWY_HAVE_SCALAR_F16_TYPE
+  using F16VecLaneT = hwy::float16_t::Native;
+#else
+  using F16VecLaneT = uint16_t;
+#endif
+  using RawVecLaneT = If<hwy::IsSame<T, hwy::float16_t>(), F16VecLaneT,
+                         If<hwy::IsSame<T, hwy::bfloat16_t>(), uint16_t, T>>;
+
+  // Suppress the -Wignored-attributes warning that is emitted by
+  // RemoveCvRef<decltype(v.raw)> with GCC
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+  typedef RawVecLaneT GccRawVec
+      __attribute__((__vector_size__(sizeof(RemoveCvRef<decltype(v.raw)>))));
+  HWY_DIAGNOSTICS(pop)
+
+  return IsConstantRawX86Vec(num_of_lanes_tag,
+                             reinterpret_cast<GccRawVec>(v.raw));
+}
+
+template <class TTo, class V>
+static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86VecForF2IConv(V v) {
+  constexpr size_t kNumOfLanesInRawSrcVec =
+      HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TFromV<V>));
+  constexpr size_t kNumOfLanesInRawResultVec =
+      HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TTo));
+  constexpr size_t kNumOfLanesToCheck =
+      HWY_MIN(kNumOfLanesInRawSrcVec, kNumOfLanesInRawResultVec);
+
+  return IsConstantX86Vec(hwy::SizeTag<kNumOfLanesToCheck>(), v);
+}
+
+}  // namespace detail
+#endif  // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+
 // ================================================== LOGICAL
 
 // ------------------------------ And
@@ -587,7 +725,7 @@ HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
   const DFromV<decltype(v)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const __m128i vu = BitCast(du, v).raw;
   return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
 #else
@@ -598,7 +736,7 @@ HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
 // ------------------------------ Xor3
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const DFromV<decltype(x1)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
@@ -613,7 +751,7 @@ HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
 // ------------------------------ Or3
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const DFromV<decltype(o1)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
@@ -628,7 +766,7 @@ HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
 // ------------------------------ OrAnd
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const DFromV<decltype(o)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
@@ -644,7 +782,7 @@ HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                    Vec128<T, N> no) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   const DFromV<decltype(no)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
@@ -657,7 +795,7 @@ HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
 }
 
 // ------------------------------ BitwiseIfThenElse
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 
 #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
 #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
@@ -870,6 +1008,19 @@ HWY_API MFromD<D> MaskFalse(D /*d*/) {
   return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)};
 }
 
+// ------------------------------ IsNegative (MFromD)
+#ifdef HWY_NATIVE_IS_NEGATIVE
+#undef HWY_NATIVE_IS_NEGATIVE
+#else
+#define HWY_NATIVE_IS_NEGATIVE
+#endif
+
+// Generic for all vector lengths
+template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+HWY_API MFromD<DFromV<V>> IsNegative(V v) {
+  return MaskFromVec(v);
+}
+
 // ------------------------------ PromoteMaskTo (MFromD)
 
 #ifdef HWY_NATIVE_PROMOTE_MASK_TO
@@ -1072,6 +1223,101 @@ HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
                      MH{static_cast<RawMH>(a.raw)});
 }
 
+// ------------------------------ Slide mask up/down
+#ifdef HWY_NATIVE_SLIDE_MASK
+#undef HWY_NATIVE_SLIDE_MASK
+#else
+#define HWY_NATIVE_SLIDE_MASK
+#endif
+
+template <class D, HWY_IF_LANES_LE_D(D, 8)>
+HWY_API MFromD<D> SlideMask1Up(D d, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+  constexpr size_t kN = MaxLanes(d);
+  constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
+
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  MFromD<D> result_mask{
+      static_cast<RawM>(_kshiftli_mask8(static_cast<__mmask8>(m.raw), 1))};
+
+  if (kN < 8) {
+    result_mask =
+        And(result_mask, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
+  }
+#else
+  MFromD<D> result_mask{
+      static_cast<RawM>((static_cast<unsigned>(m.raw) << 1) & kValidLanesMask)};
+#endif
+
+  return result_mask;
+}
+
+template <class D, HWY_IF_LANES_D(D, 16)>
+HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftli_mask16(static_cast<__mmask16>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<unsigned>(m.raw) << 1)};
+#endif
+}
+
+template <class D, HWY_IF_LANES_LE_D(D, 8)>
+HWY_API MFromD<D> SlideMask1Down(D d, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+  constexpr size_t kN = MaxLanes(d);
+  constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
+
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  if (kN < 8) {
+    m = And(m, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
+  }
+
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftri_mask8(static_cast<__mmask8>(m.raw), 1))};
+#else
+  return MFromD<D>{
+      static_cast<RawM>((static_cast<unsigned>(m.raw) & kValidLanesMask) >> 1)};
+#endif
+}
+
+template <class D, HWY_IF_LANES_D(D, 16)>
+HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftri_mask16(static_cast<__mmask16>(m.raw), 1))};
+#else
+  return MFromD<D>{
+      static_cast<RawM>((static_cast<unsigned>(m.raw) & 0xFFFFu) >> 1)};
+#endif
+}
+
+// Generic for all vector lengths
+template <class D>
+HWY_API MFromD<D> SlideMaskUpLanes(D d, MFromD<D> m, size_t amt) {
+  using RawM = decltype(MFromD<D>().raw);
+  constexpr size_t kN = MaxLanes(d);
+  constexpr uint64_t kValidLanesMask =
+      static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
+
+  return MFromD<D>{static_cast<RawM>(
+      (static_cast<uint64_t>(m.raw) << (amt & 63)) & kValidLanesMask)};
+}
+
+// Generic for all vector lengths
+template <class D>
+HWY_API MFromD<D> SlideMaskDownLanes(D d, MFromD<D> m, size_t amt) {
+  using RawM = decltype(MFromD<D>().raw);
+  constexpr size_t kN = MaxLanes(d);
+  constexpr uint64_t kValidLanesMask =
+      static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
+
+  return MFromD<D>{static_cast<RawM>(
+      (static_cast<uint64_t>(m.raw) & kValidLanesMask) >> (amt & 63))};
+}
+
 // ------------------------------ VecFromMask
 
 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
@@ -3660,6 +3906,12 @@ HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
 // ------------------------------ AddSub
 
 #if HWY_TARGET <= HWY_SSSE3
+
+#undef HWY_IF_ADDSUB_V
+#define HWY_IF_ADDSUB_V(V) \
+  HWY_IF_V_SIZE_GT_V(      \
+      V, ((hwy::IsFloat3264<TFromV<V>>()) ? 32 : sizeof(TFromV<V>)))
+
 template <size_t N, HWY_IF_LANES_GT(N, 1)>
 HWY_API Vec128<float, N> AddSub(Vec128<float, N> a, Vec128<float, N> b) {
   return Vec128<float, N>{_mm_addsub_ps(a.raw, b.raw)};
@@ -3862,7 +4114,7 @@ HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
   return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
 }
 
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
 #undef HWY_NATIVE_I32_SATURATED_ADDSUB
 #else
@@ -3900,7 +4152,7 @@ HWY_API Vec128<int64_t, N> SaturatedAdd(Vec128<int64_t, N> a,
       i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
   return IfThenElse(overflow_mask, overflow_result, sum);
 }
-#endif  // HWY_TARGET <= HWY_AVX3
+#endif  // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 
 // ------------------------------ SaturatedSub
 
@@ -3930,7 +4182,7 @@ HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
   return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
 }
 
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 template <size_t N>
 HWY_API Vec128<int32_t, N> SaturatedSub(Vec128<int32_t, N> a,
                                         Vec128<int32_t, N> b) {
@@ -3956,7 +4208,7 @@ HWY_API Vec128<int64_t, N> SaturatedSub(Vec128<int64_t, N> a,
       i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
   return IfThenElse(overflow_mask, overflow_result, diff);
 }
-#endif  // HWY_TARGET <= HWY_AVX3
+#endif  // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 
 // ------------------------------ AverageRound
 
@@ -3987,7 +4239,7 @@ HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
   return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
 }
 
-// Returns the upper
+// Returns the upper sizeof(T)*8 bits of a * b in each lane.
 template <size_t N>
 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                     const Vec128<uint16_t, N> b) {
@@ -3999,6 +4251,26 @@ HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
   return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
 }
 
+template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)),
+          HWY_IF_LANES_D(DFromV<V>, 1)>
+HWY_API V MulHigh(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const Full128<TFromD<decltype(d)>> d_full;
+  return ResizeBitCast(
+      d, Slide1Down(d_full, ResizeBitCast(d_full, MulEven(a, b))));
+}
+
+// I8/U8/I32/U32 MulHigh is generic for all vector lengths >= 2 lanes
+template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)),
+          HWY_IF_LANES_GT_D(DFromV<V>, 1)>
+HWY_API V MulHigh(V a, V b) {
+  const DFromV<decltype(a)> d;
+
+  const auto p_even = BitCast(d, MulEven(a, b));
+  const auto p_odd = BitCast(d, MulOdd(a, b));
+  return InterleaveOdd(d, p_even, p_odd);
+}
+
 // Multiplies even lanes (0, 2 ..) and places the double-wide result into
 // even and the upper half into its odd neighbor lane.
 template <class V, HWY_IF_U8_D(DFromV<V>)>
@@ -4126,15 +4398,29 @@ HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
 
 // ------------------------------ RotateRight (ShiftRight, Or)
 
-
-
-
-
-
+// U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8
+// RotateRight uses detail::GaloisAffine on AVX3_DL
+
+#if HWY_TARGET > HWY_AVX3_DL
+template <int kBits, size_t N>
+HWY_API Vec128<uint8_t, N> RotateRight(const Vec128<uint8_t, N> v) {
+  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+  if (kBits == 0) return v;
+  // AVX3 does not support 8-bit.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
+}
+#endif
+
+template <int kBits, size_t N>
+HWY_API Vec128<uint16_t, N> RotateRight(const Vec128<uint16_t, N> v) {
+  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
   if (kBits == 0) return v;
-
-  return
-
+#if HWY_TARGET <= HWY_AVX3_DL
+  return Vec128<uint16_t, N>{_mm_shrdi_epi16(v.raw, v.raw, kBits)};
+#else
+  // AVX3 does not support 16-bit.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
+#endif
 }
 
 template <int kBits, size_t N>
@@ -4159,6 +4445,116 @@ HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
 #endif
 }
 
+// I8/I16/I32/I64 RotateRight is generic for all vector lengths
+template <int kBits, class V, HWY_IF_SIGNED_V(V)>
+HWY_API V RotateRight(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, RotateRight<kBits>(BitCast(du, v)));
+}
+
+// ------------------------------ Rol/Ror
+#if HWY_TARGET <= HWY_AVX3_DL
+#ifdef HWY_NATIVE_ROL_ROR_16
+#undef HWY_NATIVE_ROL_ROR_16
+#else
+#define HWY_NATIVE_ROL_ROR_16
+#endif
+
+template <class T, size_t N, HWY_IF_UI16(T)>
+HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_shrdv_epi16(a.raw, a.raw, b.raw)};
+}
+
+// U16/I16 Rol is generic for all vector lengths on AVX3_DL
+template <class V, HWY_IF_UI16(TFromV<V>)>
+HWY_API V Rol(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  return Ror(a, BitCast(d, Neg(BitCast(di, b))));
+}
+
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
+#if HWY_TARGET <= HWY_AVX3
+
+#ifdef HWY_NATIVE_ROL_ROR_32_64
+#undef HWY_NATIVE_ROL_ROR_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_32_64
+#endif
+
+template <class T, size_t N, HWY_IF_UI32(T)>
+HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_rolv_epi32(a.raw, b.raw)};
+}
+
+template <class T, size_t N, HWY_IF_UI32(T)>
+HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_rorv_epi32(a.raw, b.raw)};
+}
+
+template <class T, size_t N, HWY_IF_UI64(T)>
+HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_rolv_epi64(a.raw, b.raw)};
+}
+
+template <class T, size_t N, HWY_IF_UI64(T)>
+HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_rorv_epi64(a.raw, b.raw)};
+}
+
+#endif
+
+// ------------------------------ RotateLeftSame/RotateRightSame
+
+#if HWY_TARGET <= HWY_AVX3_DL
+
+#ifdef HWY_NATIVE_ROL_ROR_SAME_16
+#undef HWY_NATIVE_ROL_ROR_SAME_16
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_16
+#endif
+
+// Generic for all vector lengths
+template <class V, HWY_IF_UI16(TFromV<V>)>
+HWY_API V RotateLeftSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  return Ror(v,
+             Set(d, static_cast<TFromV<V>>(0u - static_cast<unsigned>(bits))));
+}
+
+template <class V, HWY_IF_UI16(TFromV<V>)>
+HWY_API V RotateRightSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  return Ror(v, Set(d, static_cast<TFromV<V>>(bits)));
+}
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
+#if HWY_TARGET <= HWY_AVX3
+
+#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
+#undef HWY_NATIVE_ROL_ROR_SAME_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_32_64
+#endif
+
+// Generic for all vector lengths
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
+HWY_API V RotateLeftSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  return Rol(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
+HWY_API V RotateRightSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  return Ror(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
+}
+#endif  // HWY_TARGET <= HWY_AVX3
+
 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
 
 template <size_t N>
@@ -4312,20 +4708,6 @@ HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
 #endif
 }
 
-// ------------------------------ ZeroIfNegative (BroadcastSignBit)
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only works for float");
-  const DFromV<decltype(v)> d;
-#if HWY_TARGET >= HWY_SSSE3
-  const RebindToSigned<decltype(d)> di;
-  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
-#else
-  const auto mask = MaskFromVec(v);  // MSB is sufficient for BLENDVPS
-#endif
-  return IfThenElse(mask, Zero(d), v);
-}
-
 // ------------------------------ IfNegativeThenElse
 template <size_t N>
 HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v,
@@ -4389,6 +4771,48 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
 #endif
 }
 
+#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
+
+#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+#else
+#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+#endif
+
+#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+#else
+#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+#endif
+
+// SSE4/AVX2 IfNegativeThenElseZero/IfNegativeThenZeroElse is generic for all
+// vector lengths
+template <class V, HWY_IF_NOT_UNSIGNED_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))>
+HWY_API V IfNegativeThenElseZero(V v, V yes) {
+  const DFromV<decltype(v)> d;
+  return IfNegativeThenElse(v, yes, Zero(d));
+}
+
+template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
+HWY_API V IfNegativeThenElseZero(V v, V yes) {
+  return IfThenElseZero(IsNegative(v), yes);
+}
+
+template <class V, HWY_IF_NOT_UNSIGNED_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))>
+HWY_API V IfNegativeThenZeroElse(V v, V no) {
+  const DFromV<decltype(v)> d;
+  return IfNegativeThenElse(v, Zero(d), no);
+}
+
+template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
+HWY_API V IfNegativeThenZeroElse(V v, V no) {
+  return IfThenZeroElse(IsNegative(v), no);
+}
+
+#endif  // HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
+
 // ------------------------------ IfNegativeThenNegOrUndefIfZero
 
 #if HWY_TARGET <= HWY_SSSE3
@@ -5157,6 +5581,14 @@ HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x,
 
 #if HWY_TARGET <= HWY_SSSE3
 
+#undef HWY_IF_MULADDSUB_V
+#define HWY_IF_MULADDSUB_V(V)                        \
+  HWY_IF_LANES_GT_D(DFromV<V>, 1),                   \
+      HWY_IF_T_SIZE_ONE_OF_V(                        \
+          V, (1 << 1) | ((hwy::IsFloat<TFromV<V>>()) \
+                             ? 0                     \
+                             : ((1 << 2) | (1 << 4) | (1 << 8))))
+
 #if HWY_HAVE_FLOAT16
 template <size_t N, HWY_IF_LANES_GT(N, 1)>
 HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul,
@@ -5671,20 +6103,14 @@ HWY_INLINE Vec128<double, N> NativeMaskedGatherOr128(
 }  // namespace detail
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API VFromD<D> GatherOffset(D d
+HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
                                VFromD<RebindToSigned<D>> offsets) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
   return detail::NativeGather128<1>(base, offsets);
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
-HWY_API VFromD<D> GatherIndex(D d
+HWY_API VFromD<D> GatherIndex(D /*d*/, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> indices) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
   return detail::NativeGather128<sizeof(T)>(base, indices);
 }
 
@@ -5695,9 +6121,6 @@ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
   // For partial vectors, ensure upper mask lanes are zero to prevent faults.
   if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
 
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
   return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices);
 }
 
@@ -6816,37 +7239,258 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
   return BitCast(dw, InterleaveUpper(D(), a, b));
 }
 
-//
-namespace detail {
+// ================================================== CONVERT (1)
 
-
-
+// ------------------------------ PromoteTo unsigned (TableLookupBytesOr0)
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  const __m128i zero = _mm_setzero_si128();
+  return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)};
 #else
-
+  return VFromD<D>{_mm_cvtepu8_epi16(v.raw)};
 #endif
-
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
-                                                const uint32_t x2,
-                                                const uint32_t x1,
-                                                const uint32_t x0) {
-  return ResizeBitCast(
-      d, Vec128<uint32_t>{_mm_set_epi32(
-             static_cast<int32_t>(x3), static_cast<int32_t>(x2),
-             static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
 }
-
-
-
-
-
-
-
-  return BitCast(d,
-                 VFromD<decltype(du)>{_mm_shufflelo_epi16(
-                     BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))});
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
+#else
+  return VFromD<D>{_mm_cvtepu16_epi32(v.raw)};
+#endif
 }
-
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
+#else
+  return VFromD<D>{_mm_cvtepu32_epi64(v.raw)};
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
+  return VFromD<D>{_mm_unpacklo_epi16(u16, zero)};
+#else
+  return VFromD<D>{_mm_cvtepu8_epi32(v.raw)};
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
+#if HWY_TARGET > HWY_SSSE3
+  const Rebind<uint32_t, decltype(d)> du32;
+  return PromoteTo(d, PromoteTo(du32, v));
+#elif HWY_TARGET == HWY_SSSE3
+  alignas(16) static constexpr int8_t kShuffle[16] = {
+      0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1};
+  const Repartition<int8_t, decltype(d)> di8;
+  return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
+#else
+  (void)d;
+  return VFromD<D>{_mm_cvtepu8_epi64(v.raw)};
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
+#if HWY_TARGET > HWY_SSSE3
+  const Rebind<uint32_t, decltype(d)> du32;
+  return PromoteTo(d, PromoteTo(du32, v));
+#elif HWY_TARGET == HWY_SSSE3
+  alignas(16) static constexpr int8_t kShuffle[16] = {
+      0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1};
+  const Repartition<int8_t, decltype(d)> di8;
+  return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
+#else
+  (void)d;
+  return VFromD<D>{_mm_cvtepu16_epi64(v.raw)};
+#endif
+}
+
+// Unsigned to signed: same plus cast.
+template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
+          HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)),
+          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))>
+HWY_API VFromD<D> PromoteTo(D di, V v) {
+  const RebindToUnsigned<decltype(di)> du;
+  return BitCast(di, PromoteTo(du, v));
+}
+
+// ------------------------------ PromoteTo signed (ShiftRight, ZipLower)
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)});
+#else
+  return VFromD<D>{_mm_cvtepi8_epi16(v.raw)};
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)});
+#else
+  return VFromD<D>{_mm_cvtepi16_epi32(v.raw)};
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)});
+#else
+  return VFromD<D>{_mm_cvtepi32_epi64(v.raw)};
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
+  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
+  return ShiftRight<24>(VFromD<D>{x4});
+#else
+  return VFromD<D>{_mm_cvtepi8_epi32(v.raw)};
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  const Repartition<int32_t, decltype(d)> di32;
+  const Half<decltype(di32)> dh_i32;
+  const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw};
+  const VFromD<decltype(di32)> s4{
+      _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))};
+  return ZipLower(d, x4, s4);
+#else
+  (void)d;
+  return VFromD<D>{_mm_cvtepi8_epi64(v.raw)};
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
+HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) {
+#if HWY_TARGET >= HWY_SSSE3
+  const Repartition<int32_t, decltype(d)> di32;
+  const Half<decltype(di32)> dh_i32;
+  const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw};
+  const VFromD<decltype(di32)> s2{
+      _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))};
+  return ZipLower(d, x2, s2);
+#else
+  (void)d;
+  return VFromD<D>{_mm_cvtepi16_epi64(v.raw)};
+#endif
+}
+
+// -------------------- PromoteTo float (ShiftLeft, IfNegativeThenElse)
+#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
+
+// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
+#ifdef HWY_NATIVE_F16C
+#undef HWY_NATIVE_F16C
+#else
+#define HWY_NATIVE_F16C
+#endif
+
+// Workaround for origin tracking bug in Clang msan prior to 11.0
+// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
+#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
+#define HWY_INLINE_F16 HWY_NOINLINE
+#else
+#define HWY_INLINE_F16 HWY_INLINE
+#endif
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
+HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
+#if HWY_HAVE_FLOAT16
+  const RebindToUnsigned<DFromV<decltype(v)>> du16;
+  return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)};
+#else
+  return VFromD<D>{_mm_cvtph_ps(v.raw)};
+#endif
+}
+
+#endif  // HWY_NATIVE_F16C
+
+#if HWY_HAVE_FLOAT16
+
+#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
+#undef HWY_NATIVE_PROMOTE_F16_TO_F64
+#else
+#define HWY_NATIVE_PROMOTE_F16_TO_F64
+#endif
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
+  return VFromD<D>{_mm_cvtph_pd(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
+  const Rebind<uint16_t, decltype(df32)> du16;
+  const RebindToSigned<decltype(df32)> di32;
+  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
+  return VFromD<D>{_mm_cvtps_pd(v.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
+  return VFromD<D>{_mm_cvtepi32_pd(v.raw)};
+}
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> PromoteTo(D /*df64*/, VFromD<Rebind<uint32_t, D>> v) {
+  return VFromD<D>{_mm_cvtepu32_pd(v.raw)};
+}
+#else
+// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
+template <class D, HWY_IF_F64_D(D)>
+HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
+  const Rebind<int32_t, decltype(df64)> di32;
+  const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v));
+  return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result,
+                                                Set(df64, 4294967296.0),
+                                                Zero(df64));
+}
+#endif  // HWY_TARGET <= HWY_AVX3
+
+// ------------------------------ Per4LaneBlockShuffle
+namespace detail {
+
+#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
+#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
+#else
+#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
+#endif
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
+                                                const uint32_t x2,
+                                                const uint32_t x1,
+                                                const uint32_t x0) {
+  return ResizeBitCast(
+      d, Vec128<uint32_t>{_mm_set_epi32(
+             static_cast<int32_t>(x3), static_cast<int32_t>(x2),
+             static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
+}
+
+template <size_t kIdx3210, class V>
+HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
+                                  hwy::SizeTag<2> /*lane_size_tag*/,
+                                  hwy::SizeTag<8> /*vect_size_tag*/, V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm_shufflelo_epi16(
+                     BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))});
+}
+
 #if HWY_TARGET == HWY_SSE2
 template <size_t kIdx3210, class V>
 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
@@ -8122,27 +8766,116 @@ HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
 #endif
 }
 
-//
-
-
-
+// -------------------------- InterleaveEven
+
+template <class D, HWY_IF_LANES_LE_D(D, 2)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  return ConcatEven(d, b, a);
 }
 
-//
+// I8/U8 InterleaveEven is generic for all vector lengths that are >= 4 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const Repartition<uint16_t, decltype(d)> du16;
+  return OddEven(BitCast(d, ShiftLeft<8>(BitCast(du16, b))), a);
+}
 
-
-
-
+// I16/U16 InterleaveEven is generic for all vector lengths that are >= 8 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const Repartition<uint32_t, decltype(d)> du32;
+  return OddEven(BitCast(d, ShiftLeft<16>(BitCast(du32, b))), a);
 }
 
-
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_mask_shuffle_epi32(
+      a.raw, static_cast<__mmask8>(0x0A), b.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
+}
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0x0A),
+                                       b.raw, b.raw, _MM_SHUFFLE(2, 2, 0, 0))};
+}
+#else
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToFloat<decltype(d)> df;
+  const auto b2_b0_a2_a0 = ConcatEven(df, BitCast(df, b), BitCast(df, a));
+  return BitCast(
+      d, VFromD<decltype(df)>{_mm_shuffle_ps(b2_b0_a2_a0.raw, b2_b0_a2_a0.raw,
+                                             _MM_SHUFFLE(3, 1, 2, 0))});
+}
+#endif
 
-//
-// two from loading float exponents, which is considerably faster (according
-// to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
+// -------------------------- InterleaveOdd
 
-
-
+template <class D, HWY_IF_LANES_LE_D(D, 2)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  return ConcatOdd(d, b, a);
+}
+
+// I8/U8 InterleaveOdd is generic for all vector lengths that are >= 4 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const Repartition<uint16_t, decltype(d)> du16;
+  return OddEven(b, BitCast(d, ShiftRight<8>(BitCast(du16, a))));
+}
+
+// I16/U16 InterleaveOdd is generic for all vector lengths that are >= 8 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const Repartition<uint32_t, decltype(d)> du32;
+  return OddEven(b, BitCast(d, ShiftRight<16>(BitCast(du32, a))));
+}
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_mask_shuffle_epi32(
+      b.raw, static_cast<__mmask8>(0x05), a.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
+}
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x05),
+                                       a.raw, a.raw, _MM_SHUFFLE(3, 3, 1, 1))};
+}
+#else
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToFloat<decltype(d)> df;
+  const auto b3_b1_a3_a1 = ConcatOdd(df, BitCast(df, b), BitCast(df, a));
+  return BitCast(
+      d, VFromD<decltype(df)>{_mm_shuffle_ps(b3_b1_a3_a1.raw, b3_b1_a3_a1.raw,
+                                             _MM_SHUFFLE(3, 1, 2, 0))});
+}
+#endif
+
+// ------------------------------ OddEvenBlocks
+template <typename T, size_t N>
+HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
+  return even;
+}
+
+// ------------------------------ SwapAdjacentBlocks
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
+  return v;
+}
+
+// ------------------------------ Shl (ZipLower, Mul)
+
+// Use AVX2/3 variable shifts where available, otherwise multiply by powers of
+// two from loading float exponents, which is considerably faster (according
+// to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
+
+namespace detail {
+
+#if HWY_TARGET == HWY_AVX2  // Unused for AVX3 - we use sllv directly
 template <class V>
 HWY_API V AVX2ShlU16Vec128(V v, V bits) {
   const DFromV<decltype(v)> d;
@@ -8150,6 +8883,22 @@ HWY_API V AVX2ShlU16Vec128(V v, V bits) {
|
|
|
8150
8883
|
return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits));
|
|
8151
8884
|
}
|
|
8152
8885
|
#elif HWY_TARGET > HWY_AVX2
|
|
8886
|
+
|
|
8887
|
+
template <class D32>
|
|
8888
|
+
static HWY_INLINE VFromD<D32> Pow2ConvF32ToI32(
|
|
8889
|
+
D32 d32, VFromD<RebindToFloat<D32>> vf32) {
|
|
8890
|
+
const RebindToSigned<decltype(d32)> di32;
|
|
8891
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
8892
|
+
// ConvertInRangeTo is safe with GCC due the inline assembly workaround used
|
|
8893
|
+
// for F32->I32 ConvertInRangeTo with GCC
|
|
8894
|
+
return BitCast(d32, ConvertInRangeTo(di32, vf32));
|
|
8895
|
+
#else
|
|
8896
|
+
// Otherwise, use NearestIntInRange because we rely on the native 0x80..00
|
|
8897
|
+
// overflow behavior
|
|
8898
|
+
return BitCast(d32, NearestIntInRange(di32, vf32));
|
|
8899
|
+
#endif
|
|
8900
|
+
}
|
|
8901
|
+
|
|
8153
8902
|
// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
|
|
8154
8903
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
8155
8904
|
HWY_INLINE Vec128<MakeUnsigned<T>> Pow2(const Vec128<T> v) {
|
|
@@ -8165,8 +8914,8 @@ HWY_INLINE Vec128<MakeUnsigned<T>> Pow2(const Vec128<T> v) {
|
|
|
8165
8914
|
const auto f0 = ZipLower(dw, zero, upper);
|
|
8166
8915
|
const auto f1 = ZipUpper(dw, zero, upper);
|
|
8167
8916
|
// See cvtps comment below.
|
|
8168
|
-
const VFromD<decltype(dw)> bits0
|
|
8169
|
-
const VFromD<decltype(dw)> bits1
|
|
8917
|
+
const VFromD<decltype(dw)> bits0 = Pow2ConvF32ToI32(dw, BitCast(df, f0));
|
|
8918
|
+
const VFromD<decltype(dw)> bits1 = Pow2ConvF32ToI32(dw, BitCast(df, f1));
|
|
8170
8919
|
#if HWY_TARGET <= HWY_SSE4
|
|
8171
8920
|
return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits1.raw)};
|
|
8172
8921
|
#else
|
|
@@ -8187,7 +8936,8 @@ HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
|
|
|
8187
8936
|
// Insert 0 into lower halves for reinterpreting as binary32.
|
|
8188
8937
|
const auto f0 = ZipLower(dt_w, Zero(dt_u), ResizeBitCast(dt_u, upper));
|
|
8189
8938
|
// See cvtps comment below.
|
|
8190
|
-
const VFromD<decltype(dt_w)> bits0
|
|
8939
|
+
const VFromD<decltype(dt_w)> bits0 =
|
|
8940
|
+
Pow2ConvF32ToI32(dt_w, BitCast(dt_f, f0));
|
|
8191
8941
|
#if HWY_TARGET <= HWY_SSE4
|
|
8192
8942
|
return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits0.raw)};
|
|
8193
8943
|
#elif HWY_TARGET == HWY_SSSE3
|
|
@@ -8205,11 +8955,12 @@ HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
|
|
|
8205
8955
|
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
|
|
8206
8956
|
HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
|
|
8207
8957
|
const DFromV<decltype(v)> d;
|
|
8958
|
+
const RebindToFloat<decltype(d)> df;
|
|
8208
8959
|
const auto exp = ShiftLeft<23>(v);
|
|
8209
8960
|
const auto f = exp + Set(d, 0x3F800000); // 1.0f
|
|
8210
8961
|
// Do not use ConvertTo because we rely on the native 0x80..00 overflow
|
|
8211
8962
|
// behavior.
|
|
8212
|
-
return
|
|
8963
|
+
return Pow2ConvF32ToI32(d, BitCast(df, f));
|
|
8213
8964
|
}
|
|
8214
8965
|
|
|
8215
8966
|
#endif // HWY_TARGET > HWY_AVX2
|
|
@@ -8689,42 +9440,161 @@ HWY_API Vec128<int64_t, N> operator>>(Vec128<int64_t, N> v,
|
|
|
8689
9440
|
|
|
8690
9441
|
// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
|
|
8691
9442
|
|
|
8692
|
-
|
|
9443
|
+
namespace detail {
|
|
9444
|
+
|
|
9445
|
+
template <class V, HWY_IF_U64(TFromV<V>)>
|
|
9446
|
+
static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) {
|
|
9447
|
+
const DFromV<decltype(a)> du64;
|
|
9448
|
+
const RepartitionToNarrow<decltype(du64)> du32;
|
|
9449
|
+
const auto maskL = Set(du64, 0xFFFFFFFFULL);
|
|
9450
|
+
const auto a32 = BitCast(du32, a);
|
|
9451
|
+
const auto b32 = BitCast(du32, b);
|
|
9452
|
+
// Inputs for MulEven: we only need the lower 32 bits
|
|
9453
|
+
const auto aH = Shuffle2301(a32);
|
|
9454
|
+
const auto bH = Shuffle2301(b32);
|
|
9455
|
+
|
|
9456
|
+
// Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
|
|
9457
|
+
// the even (lower 64 bits of every 128-bit block) results. See
|
|
9458
|
+
// https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
|
|
9459
|
+
const auto aLbL = MulEven(a32, b32);
|
|
9460
|
+
const auto w3 = aLbL & maskL;
|
|
9461
|
+
|
|
9462
|
+
const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
|
|
9463
|
+
const auto w2 = t2 & maskL;
|
|
9464
|
+
const auto w1 = ShiftRight<32>(t2);
|
|
9465
|
+
|
|
9466
|
+
const auto t = MulEven(a32, bH) + w2;
|
|
9467
|
+
const auto k = ShiftRight<32>(t);
|
|
9468
|
+
|
|
9469
|
+
mulH = MulEven(aH, bH) + w1 + k;
|
|
9470
|
+
return ShiftLeft<32>(t) + w3;
|
|
9471
|
+
}
|
|
9472
|
+
|
|
9473
|
+
template <class V, HWY_IF_I64(TFromV<V>)>
|
|
9474
|
+
static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) {
|
|
9475
|
+
const DFromV<decltype(a)> di64;
|
|
9476
|
+
const RebindToUnsigned<decltype(di64)> du64;
|
|
9477
|
+
using VU64 = VFromD<decltype(du64)>;
|
|
9478
|
+
|
|
9479
|
+
VU64 unsigned_mulH;
|
|
9480
|
+
const auto mulL = BitCast(
|
|
9481
|
+
di64, SSE2Mul128(BitCast(du64, a), BitCast(du64, b), unsigned_mulH));
|
|
9482
|
+
mulH = BitCast(di64, unsigned_mulH) - And(BroadcastSignBit(a), b) -
|
|
9483
|
+
And(a, BroadcastSignBit(b));
|
|
9484
|
+
return mulL;
|
|
9485
|
+
}
|
|
9486
|
+
|
|
9487
|
+
} // namespace detail
|
|
9488
|
+
|
|
9489
|
+
#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
|
|
9490
|
+
|
|
9491
|
+
template <class V, HWY_IF_UI64(TFromV<V>),
|
|
9492
|
+
HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
|
|
9493
|
+
HWY_API V MulEven(V a, V b) {
|
|
9494
|
+
V mulH;
|
|
9495
|
+
const V mulL = detail::SSE2Mul128(a, b, mulH);
|
|
9496
|
+
return InterleaveLower(mulL, mulH);
|
|
9497
|
+
}
|
|
9498
|
+
|
|
9499
|
+
template <class V, HWY_IF_UI64(TFromV<V>),
|
|
9500
|
+
HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
|
|
9501
|
+
HWY_API V MulOdd(V a, V b) {
|
|
9502
|
+
const DFromV<decltype(a)> du64;
|
|
9503
|
+
V mulH;
|
|
9504
|
+
const V mulL = detail::SSE2Mul128(a, b, mulH);
|
|
9505
|
+
return InterleaveUpper(du64, mulL, mulH);
|
|
9506
|
+
}
|
|
9507
|
+
|
|
9508
|
+
#endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
|
|
9509
|
+
|
|
9510
|
+
template <class V, HWY_IF_UI64(TFromV<V>),
|
|
9511
|
+
HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 8 : 0))>
|
|
9512
|
+
HWY_API V MulHigh(V a, V b) {
|
|
9513
|
+
V mulH;
|
|
9514
|
+
detail::SSE2Mul128(a, b, mulH);
|
|
9515
|
+
return mulH;
|
|
9516
|
+
}
|
|
9517
|
+
|
|
9518
|
+
#if HWY_ARCH_X86_64
|
|
9519
|
+
|
|
9520
|
+
template <class T, HWY_IF_UI64(T)>
|
|
9521
|
+
HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
|
|
8693
9522
|
const DFromV<decltype(a)> d;
|
|
8694
|
-
alignas(16)
|
|
9523
|
+
alignas(16) T mul[2];
|
|
8695
9524
|
mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
|
|
8696
9525
|
return Load(d, mul);
|
|
8697
9526
|
}
|
|
8698
9527
|
|
|
8699
|
-
|
|
9528
|
+
template <class T, HWY_IF_UI64(T)>
|
|
9529
|
+
HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
|
|
8700
9530
|
const DFromV<decltype(a)> d;
|
|
8701
9531
|
const Half<decltype(d)> d2;
|
|
8702
|
-
alignas(16)
|
|
8703
|
-
const
|
|
8704
|
-
const
|
|
9532
|
+
alignas(16) T mul[2];
|
|
9533
|
+
const T a1 = GetLane(UpperHalf(d2, a));
|
|
9534
|
+
const T b1 = GetLane(UpperHalf(d2, b));
|
|
8705
9535
|
mul[0] = Mul128(a1, b1, &mul[1]);
|
|
8706
9536
|
return Load(d, mul);
|
|
8707
9537
|
}
|
|
8708
9538
|
|
|
8709
|
-
|
|
9539
|
+
template <class T, HWY_IF_UI64(T)>
|
|
9540
|
+
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
|
|
9541
|
+
T hi;
|
|
9542
|
+
Mul128(GetLane(a), GetLane(b), &hi);
|
|
9543
|
+
return Vec64<T>{_mm_cvtsi64_si128(static_cast<int64_t>(hi))};
|
|
9544
|
+
}
|
|
9545
|
+
|
|
9546
|
+
#endif // HWY_ARCH_X86_64
|
|
9547
|
+
|
|
9548
|
+
// ================================================== CONVERT (2)
|
|
9549
|
+
|
|
9550
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
9551
|
+
|
|
9552
|
+
#if HWY_TARGET > HWY_AVX3
|
|
9553
|
+
namespace detail {
|
|
9554
|
+
|
|
9555
|
+
// I32->I64 PromoteEvenTo/PromoteOddTo
|
|
9556
|
+
|
|
9557
|
+
template <class D, HWY_IF_LANES_D(D, 1)>
|
|
9558
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
9559
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
9560
|
+
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
9561
|
+
Vec64<int32_t> v) {
|
|
9562
|
+
return PromoteLowerTo(d_to, v);
|
|
9563
|
+
}
|
|
9564
|
+
|
|
9565
|
+
template <class D, HWY_IF_LANES_D(D, 2)>
|
|
9566
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
9567
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
9568
|
+
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
9569
|
+
Vec128<int32_t> v) {
|
|
9570
|
+
const Repartition<int32_t, D> d_from;
|
|
9571
|
+
return PromoteLowerTo(d_to, ConcatEven(d_from, v, v));
|
|
9572
|
+
}
|
|
9573
|
+
|
|
9574
|
+
template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
|
|
9575
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
|
|
9576
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
9577
|
+
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
9578
|
+
V v) {
|
|
9579
|
+
const Repartition<int32_t, D> d_from;
|
|
9580
|
+
return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v));
|
|
9581
|
+
}
|
|
9582
|
+
|
|
9583
|
+
} // namespace detail
|
|
9584
|
+
#endif
|
|
9585
|
+
|
|
9586
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
9587
|
+
#include "hwy/ops/inside-inl.h"
|
|
9588
|
+
|
|
9589
|
+
// ------------------------------ WidenMulPairwiseAdd (PromoteEvenTo)
|
|
8710
9590
|
|
|
8711
9591
|
// Generic for all vector lengths.
|
|
8712
|
-
template <class
|
|
8713
|
-
class
|
|
8714
|
-
HWY_API VFromD<
|
|
9592
|
+
template <class DF, HWY_IF_F32_D(DF),
|
|
9593
|
+
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
|
|
9594
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
|
|
8715
9595
|
// TODO(janwas): _mm_dpbf16_ps when available
|
|
8716
|
-
|
|
8717
|
-
|
|
8718
|
-
// longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
|
|
8719
|
-
// leads to the odd/even order that RearrangeToOddPlusEven prefers.
|
|
8720
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
8721
|
-
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
8722
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
8723
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
8724
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
8725
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
8726
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be),
|
|
8727
|
-
Mul(BitCast(df32, ao), BitCast(df32, bo)));
|
|
9596
|
+
return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
|
|
9597
|
+
Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
|
|
8728
9598
|
}
|
|
8729
9599
|
|
|
8730
9600
|
// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
|
|
@@ -8768,29 +9638,48 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
|
|
|
8768
9638
|
|
|
8769
9639
|
#endif
|
|
8770
9640
|
|
|
8771
|
-
// ------------------------------
|
|
9641
|
+
// ------------------------------ SatWidenMulPairwiseAccumulate
|
|
8772
9642
|
|
|
8773
|
-
|
|
8774
|
-
|
|
8775
|
-
|
|
8776
|
-
|
|
8777
|
-
|
|
8778
|
-
|
|
8779
|
-
|
|
8780
|
-
|
|
8781
|
-
|
|
8782
|
-
|
|
8783
|
-
|
|
8784
|
-
|
|
8785
|
-
|
|
8786
|
-
|
|
8787
|
-
|
|
8788
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
8789
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
8790
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
8791
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
9643
|
+
#if HWY_TARGET <= HWY_AVX3_DL
|
|
9644
|
+
|
|
9645
|
+
#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
|
|
9646
|
+
#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
|
|
9647
|
+
#else
|
|
9648
|
+
#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
|
|
9649
|
+
#endif
|
|
9650
|
+
|
|
9651
|
+
// Even if N=1, the I16 vectors have at least 2 lanes, hence _mm_dpwssds_epi32
|
|
9652
|
+
// is safe.
|
|
9653
|
+
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
|
|
9654
|
+
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
|
|
9655
|
+
DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
|
|
9656
|
+
VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
|
|
9657
|
+
return VFromD<DI32>{_mm_dpwssds_epi32(sum.raw, a.raw, b.raw)};
|
|
8792
9658
|
}
|
|
8793
9659
|
|
|
9660
|
+
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
9661
|
+
|
|
9662
|
+
// ------------------------------ ReorderWidenMulAccumulate (PromoteEvenTo)
|
|
9663
|
+
|
|
9664
|
+
#if HWY_NATIVE_DOT_BF16
|
|
9665
|
+
|
|
9666
|
+
#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
|
|
9667
|
+
#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
|
|
9668
|
+
#else
|
|
9669
|
+
#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
|
|
9670
|
+
#endif
|
|
9671
|
+
|
|
9672
|
+
template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16),
|
|
9673
|
+
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
|
|
9674
|
+
HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
|
|
9675
|
+
const VFromD<DF> sum0,
|
|
9676
|
+
VFromD<DF>& /*sum1*/) {
|
|
9677
|
+
return VFromD<DF>{_mm_dpbf16_ps(sum0.raw, reinterpret_cast<__m128bh>(a.raw),
|
|
9678
|
+
reinterpret_cast<__m128bh>(b.raw))};
|
|
9679
|
+
}
|
|
9680
|
+
|
|
9681
|
+
#endif // HWY_NATIVE_DOT_BF16
|
|
9682
|
+
|
|
8794
9683
|
// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
|
|
8795
9684
|
template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
|
|
8796
9685
|
class V16 = VFromD<RepartitionToNarrow<D32>>>
|
|
@@ -8893,263 +9782,6 @@ HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
|
|
|
8893
9782
|
|
|
8894
9783
|
#endif // HWY_TARGET <= HWY_AVX3_DL
|
|
8895
9784
|
|
|
8896
|
-
// ================================================== CONVERT
|
|
8897
|
-
|
|
8898
|
-
// ------------------------------ Promotions (part w/ narrow lanes -> full)
|
|
8899
|
-
|
|
8900
|
-
// Unsigned: zero-extend.
|
|
8901
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
|
|
8902
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
|
|
8903
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
8904
|
-
const __m128i zero = _mm_setzero_si128();
|
|
8905
|
-
return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)};
|
|
8906
|
-
#else
|
|
8907
|
-
return VFromD<D>{_mm_cvtepu8_epi16(v.raw)};
|
|
8908
|
-
#endif
|
|
8909
|
-
}
|
|
8910
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
|
|
8911
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
|
|
8912
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
8913
|
-
return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
|
|
8914
|
-
#else
|
|
8915
|
-
return VFromD<D>{_mm_cvtepu16_epi32(v.raw)};
|
|
8916
|
-
#endif
|
|
8917
|
-
}
|
|
8918
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
|
|
8919
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
|
|
8920
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
8921
|
-
return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
|
|
8922
|
-
#else
|
|
8923
|
-
return VFromD<D>{_mm_cvtepu32_epi64(v.raw)};
|
|
8924
|
-
#endif
|
|
8925
|
-
}
|
|
8926
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
|
|
8927
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
|
|
8928
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
8929
|
-
const __m128i zero = _mm_setzero_si128();
|
|
8930
|
-
const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
|
|
8931
|
-
return VFromD<D>{_mm_unpacklo_epi16(u16, zero)};
|
|
8932
|
-
#else
|
|
8933
|
-
return VFromD<D>{_mm_cvtepu8_epi32(v.raw)};
|
|
8934
|
-
#endif
|
|
8935
|
-
}
|
|
8936
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
|
|
8937
|
-
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
|
|
8938
|
-
#if HWY_TARGET > HWY_SSSE3
|
|
8939
|
-
const Rebind<uint32_t, decltype(d)> du32;
|
|
8940
|
-
return PromoteTo(d, PromoteTo(du32, v));
|
|
8941
|
-
#elif HWY_TARGET == HWY_SSSE3
|
|
8942
|
-
alignas(16) static constexpr int8_t kShuffle[16] = {
|
|
8943
|
-
0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1};
|
|
8944
|
-
const Repartition<int8_t, decltype(d)> di8;
|
|
8945
|
-
return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
|
|
8946
|
-
#else
|
|
8947
|
-
(void)d;
|
|
8948
|
-
return VFromD<D>{_mm_cvtepu8_epi64(v.raw)};
|
|
8949
|
-
#endif
|
|
8950
|
-
}
|
|
8951
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
|
|
8952
|
-
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
|
|
8953
|
-
#if HWY_TARGET > HWY_SSSE3
|
|
8954
|
-
const Rebind<uint32_t, decltype(d)> du32;
|
|
8955
|
-
return PromoteTo(d, PromoteTo(du32, v));
|
|
8956
|
-
#elif HWY_TARGET == HWY_SSSE3
|
|
8957
|
-
alignas(16) static constexpr int8_t kShuffle[16] = {
|
|
8958
|
-
0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1};
|
|
8959
|
-
const Repartition<int8_t, decltype(d)> di8;
|
|
8960
|
-
return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
|
|
8961
|
-
#else
|
|
8962
|
-
(void)d;
|
|
8963
|
-
return VFromD<D>{_mm_cvtepu16_epi64(v.raw)};
|
|
8964
|
-
#endif
|
|
8965
|
-
}
|
|
8966
|
-
|
|
8967
|
-
// Unsigned to signed: same plus cast.
|
|
8968
|
-
template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
|
|
8969
|
-
HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)),
|
|
8970
|
-
HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))>
|
|
8971
|
-
HWY_API VFromD<D> PromoteTo(D di, V v) {
|
|
8972
|
-
const RebindToUnsigned<decltype(di)> du;
|
|
8973
|
-
return BitCast(di, PromoteTo(du, v));
|
|
8974
|
-
}
|
|
8975
|
-
|
|
8976
|
-
// Signed: replicate sign bit.
|
|
8977
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
|
|
8978
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
|
|
8979
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
8980
|
-
return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)});
|
|
8981
|
-
#else
|
|
8982
|
-
return VFromD<D>{_mm_cvtepi8_epi16(v.raw)};
|
|
8983
|
-
#endif
|
|
8984
|
-
}
|
|
8985
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
|
|
8986
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
|
|
8987
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
8988
|
-
return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)});
|
|
8989
|
-
#else
|
|
8990
|
-
return VFromD<D>{_mm_cvtepi16_epi32(v.raw)};
|
|
8991
|
-
#endif
|
|
8992
|
-
}
|
|
8993
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
|
|
8994
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
|
|
8995
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
8996
|
-
return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)});
|
|
8997
|
-
#else
|
|
8998
|
-
return VFromD<D>{_mm_cvtepi32_epi64(v.raw)};
|
|
8999
|
-
#endif
|
|
9000
|
-
}
|
|
9001
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
|
|
9002
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
|
|
9003
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
9004
|
-
const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
|
|
9005
|
-
const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
|
|
9006
|
-
return ShiftRight<24>(VFromD<D>{x4});
|
|
9007
|
-
#else
|
|
9008
|
-
return VFromD<D>{_mm_cvtepi8_epi32(v.raw)};
|
|
9009
|
-
#endif
|
|
9010
|
-
}
|
|
9011
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
|
|
9012
|
-
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) {
|
|
9013
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
9014
|
-
const Repartition<int32_t, decltype(d)> di32;
|
|
9015
|
-
const Half<decltype(di32)> dh_i32;
|
|
9016
|
-
const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw};
|
|
9017
|
-
const VFromD<decltype(di32)> s4{
|
|
9018
|
-
_mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))};
|
|
9019
|
-
return ZipLower(d, x4, s4);
|
|
9020
|
-
#else
|
|
9021
|
-
(void)d;
|
|
9022
|
-
return VFromD<D>{_mm_cvtepi8_epi64(v.raw)};
|
|
9023
|
-
#endif
|
|
9024
|
-
}
|
|
9025
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
|
|
9026
|
-
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) {
|
|
9027
|
-
#if HWY_TARGET >= HWY_SSSE3
|
|
9028
|
-
const Repartition<int32_t, decltype(d)> di32;
|
|
9029
|
-
const Half<decltype(di32)> dh_i32;
|
|
9030
|
-
const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw};
|
|
9031
|
-
const VFromD<decltype(di32)> s2{
|
|
9032
|
-
_mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))};
|
|
9033
|
-
return ZipLower(d, x2, s2);
|
|
9034
|
-
#else
|
|
9035
|
-
(void)d;
|
|
9036
|
-
return VFromD<D>{_mm_cvtepi16_epi64(v.raw)};
|
|
9037
|
-
#endif
|
|
9038
|
-
}
|
|
9039
|
-
|
|
9040
|
-
#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
|
|
9041
|
-
|
|
9042
|
-
// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
|
|
9043
|
-
#ifdef HWY_NATIVE_F16C
|
|
9044
|
-
#undef HWY_NATIVE_F16C
|
|
9045
|
-
#else
|
|
9046
|
-
#define HWY_NATIVE_F16C
|
|
9047
|
-
#endif
|
|
9048
|
-
|
|
9049
|
-
// Workaround for origin tracking bug in Clang msan prior to 11.0
|
|
9050
|
-
// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
|
|
9051
|
-
#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
|
|
9052
|
-
#define HWY_INLINE_F16 HWY_NOINLINE
|
|
9053
|
-
#else
|
|
9054
|
-
#define HWY_INLINE_F16 HWY_INLINE
|
|
9055
|
-
#endif
|
|
9056
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
9057
|
-
HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
|
|
9058
|
-
#if HWY_HAVE_FLOAT16
|
|
9059
|
-
const RebindToUnsigned<DFromV<decltype(v)>> du16;
|
|
9060
|
-
return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)};
|
|
9061
|
-
#else
|
|
9062
|
-
return VFromD<D>{_mm_cvtph_ps(v.raw)};
|
|
9063
|
-
#endif
|
|
9064
|
-
}
|
|
9065
|
-
|
|
9066
|
-
#endif // HWY_NATIVE_F16C
|
|
9067
|
-
|
|
9068
|
-
#if HWY_HAVE_FLOAT16
|
|
9069
|
-
|
|
9070
|
-
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
9071
|
-
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
9072
|
-
#else
|
|
9073
|
-
#define HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
9074
|
-
#endif
|
|
9075
|
-
|
|
9076
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
9077
|
-
HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
|
|
9078
|
-
return VFromD<D>{_mm_cvtph_pd(v.raw)};
|
|
9079
|
-
}
|
|
9080
|
-
|
|
9081
|
-
#endif // HWY_HAVE_FLOAT16
|
|
9082
|
-
|
|
9083
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
9084
|
-
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
|
|
9085
|
-
const Rebind<uint16_t, decltype(df32)> du16;
|
|
9086
|
-
const RebindToSigned<decltype(df32)> di32;
|
|
9087
|
-
return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
|
|
9088
|
-
}
|
|
9089
|
-
|
|
9090
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
9091
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
9092
|
-
return VFromD<D>{_mm_cvtps_pd(v.raw)};
|
|
9093
|
-
}
|
|
9094
|
-
|
|
9095
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
9096
|
-
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
|
|
9097
|
-
return VFromD<D>{_mm_cvtepi32_pd(v.raw)};
|
|
9098
|
-
}
|
|
9099
|
-
|
|
9100
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
9101
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
9102
|
-
HWY_API VFromD<D> PromoteTo(D /*df64*/, VFromD<Rebind<uint32_t, D>> v) {
|
|
9103
|
-
return VFromD<D>{_mm_cvtepu32_pd(v.raw)};
|
|
9104
|
-
}
|
|
9105
|
-
#else
|
|
9106
|
-
// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
9107
|
-
template <class D, HWY_IF_F64_D(D)>
|
|
9108
|
-
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
|
|
9109
|
-
const Rebind<int32_t, decltype(df64)> di32;
|
|
9110
|
-
const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v));
|
|
9111
|
-
return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result,
|
|
9112
|
-
Set(df64, 4294967296.0),
|
|
9113
|
-
Zero(df64));
|
|
9114
|
-
}
|
|
9115
|
-
#endif
|
|
9116
|
-
|
|
9117
|
-
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
9118
|
-
|
|
9119
|
-
#if HWY_TARGET > HWY_AVX3
|
|
9120
|
-
namespace detail {
|
|
9121
|
-
|
|
9122
|
-
// I32->I64 PromoteEvenTo/PromoteOddTo
|
|
9123
|
-
|
|
9124
|
-
template <class D, HWY_IF_LANES_D(D, 1)>
|
|
9125
|
-
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
9126
|
-
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
9127
|
-
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
9128
|
-
Vec64<int32_t> v) {
|
|
9129
|
-
return PromoteLowerTo(d_to, v);
|
|
9130
|
-
}
|
|
9131
|
-
|
|
9132
|
-
template <class D, HWY_IF_LANES_D(D, 2)>
|
|
9133
|
-
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
9134
|
-
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
9135
|
-
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
9136
|
-
Vec128<int32_t> v) {
|
|
9137
|
-
const Repartition<int32_t, D> d_from;
|
|
9138
|
-
return PromoteLowerTo(d_to, ConcatEven(d_from, v, v));
|
|
9139
|
-
}
|
|
9140
|
-
|
|
9141
|
-
template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
|
|
9142
|
-
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
|
|
9143
|
-
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
9144
|
-
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
9145
|
-
V v) {
|
|
9146
|
-
const Repartition<int32_t, D> d_from;
|
|
9147
|
-
return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v));
|
|
9148
|
-
}
|
|
9149
|
-
|
|
9150
|
-
} // namespace detail
|
|
9151
|
-
#endif
|
|
9152
|
-
|
|
9153
9785
|
// ------------------------------ Demotions (full -> part w/ narrow lanes)
|
|
9154
9786
|
|
|
9155
9787
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
|
|
@@ -9338,26 +9970,69 @@ HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) {
|
|
|
9338
9970
|
|
|
9339
9971
|
#endif // HWY_HAVE_FLOAT16
|
|
9340
9972
|
|
|
9973
|
+
// The _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics require GCC 9 or later
|
|
9974
|
+
// or Clang 10 or later
|
|
9975
|
+
|
|
9976
|
+
// Also need GCC or Clang to bit cast the __m128bh, __m256bh, or __m512bh vector
|
|
9977
|
+
// returned by the _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics to a
|
|
9978
|
+
// __m128i, __m256i, or __m512i as there are currently no intrinsics available
|
|
9979
|
+
// (as of GCC 13 and Clang 17) to bit cast a __m128bh, __m256bh, or __m512bh
|
|
9980
|
+
// vector to a __m128i, __m256i, or __m512i vector
|
|
9981
|
+
|
|
9982
|
+
#if HWY_AVX3_HAVE_F32_TO_BF16C
|
|
9983
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
9984
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
9985
|
+
#else
|
|
9986
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
9987
|
+
#endif
|
|
9988
|
+
|
|
9341
9989
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
9342
|
-
HWY_API VFromD<D> DemoteTo(D dbf16
|
|
9343
|
-
|
|
9344
|
-
|
|
9345
|
-
|
|
9346
|
-
|
|
9347
|
-
|
|
9348
|
-
|
|
9990
|
+
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
|
|
9991
|
+
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
|
|
9992
|
+
// Inline assembly workaround for LLVM codegen bug
|
|
9993
|
+
__m128i raw_result;
|
|
9994
|
+
__asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
|
|
9995
|
+
return VFromD<D>{raw_result};
|
|
9996
|
+
#else
|
|
9997
|
+
// The _mm_cvtneps_pbh intrinsic returns a __m128bh vector that needs to be
|
|
9998
|
+
// bit casted to a __m128i vector
|
|
9999
|
+
return VFromD<D>{detail::BitCastToInteger(_mm_cvtneps_pbh(v.raw))};
|
|
10000
|
+
#endif
|
|
9349
10001
|
}
|
|
9350
10002
|
|
|
9351
|
-
template <class D,
|
|
9352
|
-
|
|
9353
|
-
|
|
9354
|
-
|
|
9355
|
-
|
|
9356
|
-
|
|
9357
|
-
|
|
9358
|
-
|
|
10003
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
|
|
10004
|
+
HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec128<float> a,
|
|
10005
|
+
Vec128<float> b) {
|
|
10006
|
+
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
|
|
10007
|
+
// Inline assembly workaround for LLVM codegen bug
|
|
10008
|
+
__m128i raw_result;
|
|
10009
|
+
__asm__("vcvtne2ps2bf16 %2, %1, %0"
|
|
10010
|
+
: "=v"(raw_result)
|
|
10011
|
+
: "v"(b.raw), "v"(a.raw));
|
|
10012
|
+
return VFromD<D>{raw_result};
|
|
10013
|
+
#else
|
|
10014
|
+
// The _mm_cvtne2ps_pbh intrinsic returns a __m128bh vector that needs to be
|
|
10015
|
+
// bit casted to a __m128i vector
|
|
10016
|
+
return VFromD<D>{detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw))};
|
|
10017
|
+
#endif
|
|
9359
10018
|
}
|
|
9360
10019
|
|
|
10020
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
10021
|
+
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<float> a,
|
|
10022
|
+
Vec64<float> b) {
|
|
10023
|
+
return VFromD<D>{_mm_shuffle_epi32(
|
|
10024
|
+
detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw)),
|
|
10025
|
+
_MM_SHUFFLE(2, 0, 2, 0))};
|
|
10026
|
+
}
|
|
10027
|
+
|
|
10028
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
|
|
10029
|
+
HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec32<float> a, Vec32<float> b) {
|
|
10030
|
+
const DFromV<decltype(a)> d;
|
|
10031
|
+
const Twice<decltype(d)> dt;
|
|
10032
|
+
return DemoteTo(dbf16, Combine(dt, b, a));
|
|
10033
|
+
}
|
|
10034
|
+
#endif // HWY_AVX3_HAVE_F32_TO_BF16C
|
|
10035
|
+
|
|
9361
10036
|
// Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
|
|
9362
10037
|
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
|
|
9363
10038
|
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) {
|
|
@@ -9515,11 +10190,15 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
|
9515
10190
|
return ReorderDemote2To(d, a, b);
|
|
9516
10191
|
}
|
|
9517
10192
|
|
|
9518
|
-
|
|
9519
|
-
|
|
9520
|
-
|
|
9521
|
-
|
|
10193
|
+
#if HWY_AVX3_HAVE_F32_TO_BF16C
|
|
10194
|
+
// F32 to BF16 OrderedDemote2To is generic for all vector lengths on targets
|
|
10195
|
+
// that support AVX512BF16
|
|
10196
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
10197
|
+
HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
|
|
10198
|
+
VFromD<Repartition<float, D>> b) {
|
|
10199
|
+
return ReorderDemote2To(dbf16, a, b);
|
|
9522
10200
|
}
|
|
10201
|
+
#endif // HWY_AVX3_HAVE_F32_TO_BF16C
|
|
9523
10202
|
|
|
9524
10203
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
|
|
9525
10204
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
|
|
@@ -9536,65 +10215,176 @@ HWY_INLINE VFromD<D> ClampF64ToI32Max(D d, VFromD<D> v) {
|
|
|
9536
10215
|
return Min(v, Set(d, 2147483647.0));
|
|
9537
10216
|
}
|
|
9538
10217
|
|
|
9539
|
-
|
|
9540
|
-
|
|
9541
|
-
|
|
9542
|
-
|
|
9543
|
-
|
|
9544
|
-
|
|
9545
|
-
|
|
9546
|
-
|
|
9547
|
-
|
|
9548
|
-
|
|
9549
|
-
|
|
9550
|
-
|
|
9551
|
-
|
|
9552
|
-
|
|
9553
|
-
|
|
9554
|
-
|
|
9555
|
-
|
|
9556
|
-
|
|
9557
|
-
|
|
9558
|
-
const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask)));
|
|
9559
|
-
return IfVecThenElse(mask, max, converted);
|
|
10218
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
10219
|
+
template <class TTo, class TF>
|
|
10220
|
+
static constexpr HWY_INLINE TTo
|
|
10221
|
+
X86ConvertScalarFromFloat(hwy::FloatTag /* to_type_tag */, TF from_val) {
|
|
10222
|
+
return ConvertScalarTo<TTo>(from_val);
|
|
10223
|
+
}
|
|
10224
|
+
|
|
10225
|
+
template <class TTo, class TF>
|
|
10226
|
+
static HWY_BITCASTSCALAR_CONSTEXPR HWY_INLINE TTo
|
|
10227
|
+
X86ConvertScalarFromFloat(hwy::SpecialTag /* to_type_tag */, TF from_val) {
|
|
10228
|
+
return ConvertScalarTo<TTo>(from_val);
|
|
10229
|
+
}
|
|
10230
|
+
|
|
10231
|
+
template <class TTo, class TF>
|
|
10232
|
+
static HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_INLINE TTo
|
|
10233
|
+
X86ConvertScalarFromFloat(hwy::SignedTag /* to_type_tag */, TF from_val) {
|
|
10234
|
+
#if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS
|
|
10235
|
+
using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float,
|
|
10236
|
+
RemoveCvRef<TF>>;
|
|
9560
10237
|
#else
|
|
9561
|
-
|
|
10238
|
+
using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
|
|
9562
10239
|
#endif
|
|
10240
|
+
|
|
10241
|
+
const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val);
|
|
10242
|
+
constexpr TTo kMinResultVal = LimitsMin<TTo>();
|
|
10243
|
+
HWY_BITCASTSCALAR_CONSTEXPR const TFArith kMinOutOfRangePosVal =
|
|
10244
|
+
ScalarAbs(ConvertScalarTo<TFArith>(kMinResultVal));
|
|
10245
|
+
|
|
10246
|
+
return (ScalarAbs(from_val_in_arith_type) < kMinOutOfRangePosVal)
|
|
10247
|
+
? ConvertScalarTo<TTo>(from_val_in_arith_type)
|
|
10248
|
+
: kMinResultVal;
|
|
9563
10249
|
}
|
|
9564
10250
|
|
|
10251
|
+
template <class TTo, class TF>
|
|
10252
|
+
static HWY_CXX14_CONSTEXPR HWY_INLINE TTo
|
|
10253
|
+
X86ConvertScalarFromFloat(hwy::UnsignedTag /* to_type_tag */, TF from_val) {
|
|
10254
|
+
#if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS
|
|
10255
|
+
using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float,
|
|
10256
|
+
RemoveCvRef<TF>>;
|
|
10257
|
+
#else
|
|
10258
|
+
using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
|
|
10259
|
+
#endif
|
|
10260
|
+
|
|
10261
|
+
const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val);
|
|
10262
|
+
constexpr TTo kTToMsb = static_cast<TTo>(TTo{1} << (sizeof(TTo) * 8 - 1));
|
|
10263
|
+
constexpr const TFArith kNegOne = ConvertScalarTo<TFArith>(-1.0);
|
|
10264
|
+
constexpr const TFArith kMinOutOfRangePosVal =
|
|
10265
|
+
ConvertScalarTo<TFArith>(static_cast<double>(kTToMsb) * 2.0);
|
|
10266
|
+
|
|
10267
|
+
return (from_val_in_arith_type > kNegOne &&
|
|
10268
|
+
from_val_in_arith_type < kMinOutOfRangePosVal)
|
|
10269
|
+
? ConvertScalarTo<TTo>(from_val_in_arith_type)
|
|
10270
|
+
: LimitsMax<TTo>();
|
|
10271
|
+
}
|
|
10272
|
+
|
|
10273
|
+
template <class TTo, class TF>
|
|
10274
|
+
static constexpr HWY_INLINE HWY_MAYBE_UNUSED TTo
|
|
10275
|
+
X86ConvertScalarFromFloat(TF from_val) {
|
|
10276
|
+
return X86ConvertScalarFromFloat<TTo>(hwy::TypeTag<RemoveCvRef<TTo>>(),
|
|
10277
|
+
from_val);
|
|
10278
|
+
}
|
|
10279
|
+
#endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
10280
|
+
|
|
9565
10281
|
} // namespace detail
|
|
9566
10282
|
|
|
9567
|
-
|
|
9568
|
-
|
|
9569
|
-
|
|
9570
|
-
|
|
9571
|
-
|
|
10283
|
+
#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
10284
|
+
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
10285
|
+
#else
|
|
10286
|
+
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
10287
|
+
#endif
|
|
10288
|
+
|
|
10289
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
|
|
10290
|
+
HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
|
|
10291
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
10292
|
+
// Workaround for undefined behavior in _mm_cvttpd_epi32 with GCC if any
|
|
10293
|
+
// values of v[i] are not within the range of an int32_t
|
|
10294
|
+
|
|
10295
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
10296
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
10297
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
|
|
10298
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
10299
|
+
return Dup128VecFromValues(
|
|
10300
|
+
D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
|
|
10301
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]), int32_t{0},
|
|
10302
|
+
int32_t{0});
|
|
10303
|
+
}
|
|
10304
|
+
#endif
|
|
10305
|
+
|
|
10306
|
+
__m128i raw_result;
|
|
10307
|
+
__asm__("%vcvttpd2dq {%1, %0|%0, %1}"
|
|
10308
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
10309
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
10310
|
+
:);
|
|
10311
|
+
return VFromD<D>{raw_result};
|
|
10312
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
10313
|
+
return VFromD<D>{_mm_cvttpd_epi32(v.raw)};
|
|
10314
|
+
#endif
|
|
10315
|
+
}
|
|
10316
|
+
|
|
10317
|
+
// F64 to I32 DemoteTo is generic for all vector lengths
|
|
10318
|
+
template <class D, HWY_IF_I32_D(D)>
|
|
10319
|
+
HWY_API VFromD<D> DemoteTo(D di32, VFromD<Rebind<double, D>> v) {
|
|
10320
|
+
const Rebind<double, decltype(di32)> df64;
|
|
10321
|
+
const VFromD<decltype(df64)> clamped = detail::ClampF64ToI32Max(df64, v);
|
|
10322
|
+
return DemoteInRangeTo(di32, clamped);
|
|
9572
10323
|
}
|
|
9573
10324
|
|
|
9574
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
|
|
9575
|
-
HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
|
|
9576
10325
|
#if HWY_TARGET <= HWY_AVX3
|
|
9577
|
-
|
|
9578
|
-
|
|
9579
|
-
|
|
9580
|
-
|
|
10326
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
|
|
10327
|
+
HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
|
|
10328
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
10329
|
+
// Workaround for undefined behavior in _mm_cvttpd_epu32 with GCC if any
|
|
10330
|
+
// values of v[i] are not within the range of an uint32_t
|
|
10331
|
+
|
|
10332
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
10333
|
+
if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
|
|
10334
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
|
|
10335
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
10336
|
+
return Dup128VecFromValues(
|
|
10337
|
+
D(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
|
|
10338
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]), uint32_t{0},
|
|
10339
|
+
uint32_t{0});
|
|
10340
|
+
}
|
|
10341
|
+
#endif
|
|
10342
|
+
|
|
10343
|
+
__m128i raw_result;
|
|
10344
|
+
__asm__("vcvttpd2udq {%1, %0|%0, %1}"
|
|
10345
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
10346
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
10347
|
+
:);
|
|
10348
|
+
return VFromD<D>{raw_result};
|
|
10349
|
+
#else
|
|
10350
|
+
return VFromD<D>{_mm_cvttpd_epu32(v.raw)};
|
|
10351
|
+
#endif
|
|
10352
|
+
}
|
|
10353
|
+
|
|
10354
|
+
// F64->U32 DemoteTo is generic for all vector lengths
|
|
10355
|
+
template <class D, HWY_IF_U32_D(D)>
|
|
10356
|
+
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
|
|
10357
|
+
return DemoteInRangeTo(D(), ZeroIfNegative(v));
|
|
10358
|
+
}
|
|
10359
|
+
#else // HWY_TARGET > HWY_AVX3
|
|
10360
|
+
|
|
10361
|
+
// F64 to U32 DemoteInRangeTo is generic for all vector lengths on
|
|
10362
|
+
// SSE2/SSSE3/SSE4/AVX2
|
|
10363
|
+
template <class D, HWY_IF_U32_D(D)>
|
|
10364
|
+
HWY_API VFromD<D> DemoteInRangeTo(D du32, VFromD<Rebind<double, D>> v) {
|
|
10365
|
+
const RebindToSigned<decltype(du32)> di32;
|
|
9581
10366
|
const Rebind<double, decltype(du32)> df64;
|
|
9582
10367
|
const RebindToUnsigned<decltype(df64)> du64;
|
|
9583
10368
|
|
|
9584
|
-
// Clamp v[i] to a value between 0 and 4294967295
|
|
9585
|
-
const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0));
|
|
9586
|
-
|
|
9587
10369
|
const auto k2_31 = Set(df64, 2147483648.0);
|
|
9588
|
-
const auto
|
|
9589
|
-
const auto clamped_lo31_f64 =
|
|
9590
|
-
|
|
9591
|
-
|
|
10370
|
+
const auto v_is_ge_k2_31 = (v >= k2_31);
|
|
10371
|
+
const auto clamped_lo31_f64 = v - IfThenElseZero(v_is_ge_k2_31, k2_31);
|
|
10372
|
+
const auto clamped_lo31_u32 =
|
|
10373
|
+
BitCast(du32, DemoteInRangeTo(di32, clamped_lo31_f64));
|
|
9592
10374
|
const auto clamped_u32_msb = ShiftLeft<31>(
|
|
9593
|
-
TruncateTo(du32, BitCast(du64, VecFromMask(df64,
|
|
10375
|
+
TruncateTo(du32, BitCast(du64, VecFromMask(df64, v_is_ge_k2_31))));
|
|
9594
10376
|
return Or(clamped_lo31_u32, clamped_u32_msb);
|
|
9595
|
-
#endif
|
|
9596
10377
|
}
|
|
9597
10378
|
|
|
10379
|
+
// F64 to U32 DemoteTo is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
10380
|
+
template <class D, HWY_IF_U32_D(D)>
|
|
10381
|
+
HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
|
|
10382
|
+
const Rebind<double, decltype(du32)> df64;
|
|
10383
|
+
const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0));
|
|
10384
|
+
return DemoteInRangeTo(du32, clamped);
|
|
10385
|
+
}
|
|
10386
|
+
#endif // HWY_TARGET <= HWY_AVX3
|
|
10387
|
+
|
|
9598
10388
|
#if HWY_TARGET <= HWY_AVX3
|
|
9599
10389
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
|
|
9600
10390
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
@@ -9683,23 +10473,85 @@ HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
|
|
|
9683
10473
|
}
|
|
9684
10474
|
|
|
9685
10475
|
// ------------------------------ F32->UI64 PromoteTo
|
|
10476
|
+
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
10477
|
+
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
10478
|
+
#else
|
|
10479
|
+
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
10480
|
+
#endif
|
|
10481
|
+
|
|
9686
10482
|
#if HWY_TARGET <= HWY_AVX3
|
|
9687
10483
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
|
|
10484
|
+
HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
|
|
10485
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
10486
|
+
// Workaround for undefined behavior with GCC if any values of v[i] are not
|
|
10487
|
+
// within the range of an int64_t
|
|
10488
|
+
|
|
10489
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
10490
|
+
if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
|
|
10491
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
|
|
10492
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
10493
|
+
return Dup128VecFromValues(
|
|
10494
|
+
D(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
|
|
10495
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]));
|
|
10496
|
+
}
|
|
10497
|
+
#endif
|
|
10498
|
+
|
|
10499
|
+
__m128i raw_result;
|
|
10500
|
+
__asm__("vcvttps2qq {%1, %0|%0, %1}"
|
|
10501
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
10502
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
10503
|
+
:);
|
|
10504
|
+
return VFromD<D>{raw_result};
|
|
10505
|
+
#else
|
|
10506
|
+
return VFromD<D>{_mm_cvttps_epi64(v.raw)};
|
|
10507
|
+
#endif
|
|
10508
|
+
}
|
|
10509
|
+
|
|
10510
|
+
// Generic for all vector lengths.
|
|
10511
|
+
template <class D, HWY_IF_I64_D(D)>
|
|
9688
10512
|
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
|
|
9689
10513
|
const Rebind<float, decltype(di64)> df32;
|
|
9690
10514
|
const RebindToFloat<decltype(di64)> df64;
|
|
9691
|
-
|
|
9692
|
-
|
|
9693
|
-
|
|
9694
|
-
|
|
9695
|
-
|
|
9696
|
-
|
|
9697
|
-
|
|
10515
|
+
// We now avoid GCC UB in PromoteInRangeTo via assembly, see #2189 and
|
|
10516
|
+
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115115. Previously we fixed up
|
|
10517
|
+
// the result afterwards using three instructions. Now we instead check if
|
|
10518
|
+
// v >= 2^63, and if so replace the output with 2^63-1, which is likely more
|
|
10519
|
+
// efficient. Note that the previous representable f32 is less than 2^63 and
|
|
10520
|
+
// thus fits in i64.
|
|
10521
|
+
const MFromD<D> overflow = RebindMask(
|
|
10522
|
+
di64, PromoteMaskTo(df64, df32, Ge(v, Set(df32, 9.223372e18f))));
|
|
10523
|
+
return IfThenElse(overflow, Set(di64, LimitsMax<int64_t>()),
|
|
10524
|
+
PromoteInRangeTo(di64, v));
|
|
9698
10525
|
}
|
|
9699
|
-
template <class D,
|
|
10526
|
+
template <class D, HWY_IF_U64_D(D)>
|
|
9700
10527
|
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
9701
|
-
return
|
|
9702
|
-
|
|
10528
|
+
return PromoteInRangeTo(D(), ZeroIfNegative(v));
|
|
10529
|
+
}
|
|
10530
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
|
|
10531
|
+
HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
10532
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
10533
|
+
// Workaround for undefined behavior with GCC if any values of v[i] are not
|
|
10534
|
+
// within the range of an uint64_t
|
|
10535
|
+
|
|
10536
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
10537
|
+
if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
|
|
10538
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
|
|
10539
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
10540
|
+
return Dup128VecFromValues(
|
|
10541
|
+
D(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]),
|
|
10542
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1]));
|
|
10543
|
+
}
|
|
10544
|
+
#endif
|
|
10545
|
+
|
|
10546
|
+
__m128i raw_result;
|
|
10547
|
+
__asm__("vcvttps2uqq {%1, %0|%0, %1}"
|
|
10548
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
10549
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
10550
|
+
:);
|
|
10551
|
+
return VFromD<D>{raw_result};
|
|
10552
|
+
#else
|
|
10553
|
+
return VFromD<D>{_mm_cvttps_epu64(v.raw)};
|
|
10554
|
+
#endif
|
|
9703
10555
|
}
|
|
9704
10556
|
#else // AVX2 or below
|
|
9705
10557
|
|
|
@@ -9730,6 +10582,27 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
|
|
|
9730
10582
|
lo64_or_mask);
|
|
9731
10583
|
}
|
|
9732
10584
|
|
|
10585
|
+
// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
|
|
10586
|
+
template <class D, HWY_IF_UI64_D(D)>
|
|
10587
|
+
HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
|
|
10588
|
+
const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
|
|
10589
|
+
const RebindToSigned<decltype(d32)> di32;
|
|
10590
|
+
const RebindToFloat<decltype(d32)> df32;
|
|
10591
|
+
const RebindToUnsigned<decltype(d32)> du32;
|
|
10592
|
+
const Repartition<uint8_t, decltype(d32)> du32_as_du8;
|
|
10593
|
+
|
|
10594
|
+
const auto exponent_adj = BitCast(
|
|
10595
|
+
du32,
|
|
10596
|
+
SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
|
|
10597
|
+
BitCast(du32_as_du8, Set(du32, uint32_t{0xFFFFFF9Du}))));
|
|
10598
|
+
const auto adj_v =
|
|
10599
|
+
BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
|
|
10600
|
+
|
|
10601
|
+
const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v);
|
|
10602
|
+
return PromoteTo(d64, BitCast(d32, f32_to_i32_result))
|
|
10603
|
+
<< PromoteTo(d64, exponent_adj);
|
|
10604
|
+
}
|
|
10605
|
+
|
|
9733
10606
|
namespace detail {
|
|
9734
10607
|
|
|
9735
10608
|
template <class DU64, HWY_IF_V_SIZE_LE_D(DU64, 16)>
|
|
@@ -9770,7 +10643,7 @@ HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
|
|
|
9770
10643
|
|
|
9771
10644
|
const auto adj_v =
|
|
9772
10645
|
BitCast(df32, BitCast(du32, non_neg_v) - ShiftLeft<23>(exponent_adj));
|
|
9773
|
-
const
|
|
10646
|
+
const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v);
|
|
9774
10647
|
|
|
9775
10648
|
const auto i32_overflow_mask = BroadcastSignBit(f32_to_i32_result);
|
|
9776
10649
|
const auto overflow_result =
|
|
@@ -9960,7 +10833,20 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
|
|
|
9960
10833
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
|
|
9961
10834
|
return VFromD<D>{_mm_cvtusepi64_epi8(v.raw)};
|
|
9962
10835
|
}
|
|
9963
|
-
#else
|
|
10836
|
+
#else // AVX2 or below
|
|
10837
|
+
|
|
10838
|
+
// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
|
|
10839
|
+
// implementations in generic_ops-inl.h for U64->I8/I16/I32 demotions on
|
|
10840
|
+
// SSE2/SSSE3/SSE4/AVX2 as U64->I8/I16/I32 DemoteTo/ReorderDemote2To for
|
|
10841
|
+
// SSE2/SSSE3/SSE4/AVX2 is implemented in x86_128-inl.h
|
|
10842
|
+
|
|
10843
|
+
// The default unsigned to signed DemoteTo/ReorderDemote2To
|
|
10844
|
+
// implementations in generic_ops-inl.h are still used for U32->I8/I16 and
|
|
10845
|
+
// U16->I8 demotions on SSE2/SSSE3/SSE4/AVX2
|
|
10846
|
+
|
|
10847
|
+
#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
|
|
10848
|
+
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) HWY_IF_NOT_T_SIZE_V(V, 8)
|
|
10849
|
+
|
|
9964
10850
|
namespace detail {
|
|
9965
10851
|
template <class D, HWY_IF_UNSIGNED_D(D)>
|
|
9966
10852
|
HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
|
|
@@ -10023,6 +10909,25 @@ HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
|
|
|
10023
10909
|
return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals));
|
|
10024
10910
|
}
|
|
10025
10911
|
|
|
10912
|
+
template <class D,
|
|
10913
|
+
HWY_IF_T_SIZE_ONE_OF_D(
|
|
10914
|
+
D, ((HWY_TARGET != HWY_SSE2) ? ((1 << 1) | (1 << 2)) : 0) |
|
|
10915
|
+
(1 << 4)),
|
|
10916
|
+
HWY_IF_SIGNED_D(D)>
|
|
10917
|
+
HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
|
|
10918
|
+
const RebindToUnsigned<decltype(dn)> dn_u;
|
|
10919
|
+
return BitCast(dn, TruncateTo(dn_u, detail::DemoteFromU64Saturate(dn, v)));
|
|
10920
|
+
}
|
|
10921
|
+
|
|
10922
|
+
#if HWY_TARGET == HWY_SSE2
|
|
10923
|
+
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
|
|
10924
|
+
HWY_IF_SIGNED_D(D)>
|
|
10925
|
+
HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
|
|
10926
|
+
const Rebind<int32_t, decltype(dn)> di32;
|
|
10927
|
+
return DemoteTo(dn, DemoteTo(di32, v));
|
|
10928
|
+
}
|
|
10929
|
+
#endif // HWY_TARGET == HWY_SSE2
|
|
10930
|
+
|
|
10026
10931
|
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
|
|
10027
10932
|
HWY_IF_UNSIGNED_D(D)>
|
|
10028
10933
|
HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
|
|
@@ -10047,6 +10952,16 @@ HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
|
|
|
10047
10952
|
return DemoteTo(dn, Combine(dt, b, a));
|
|
10048
10953
|
}
|
|
10049
10954
|
|
|
10955
|
+
#if HWY_TARGET > HWY_AVX3
|
|
10956
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_I32_D(D)>
|
|
10957
|
+
HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
|
|
10958
|
+
VFromD<Repartition<uint64_t, D>> b) {
|
|
10959
|
+
const DFromV<decltype(a)> d;
|
|
10960
|
+
const Twice<decltype(d)> dt;
|
|
10961
|
+
return DemoteTo(dn, Combine(dt, b, a));
|
|
10962
|
+
}
|
|
10963
|
+
#endif
|
|
10964
|
+
|
|
10050
10965
|
#if HWY_TARGET > HWY_AVX2
|
|
10051
10966
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
|
|
10052
10967
|
HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
|
|
@@ -10084,9 +10999,9 @@ HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
|
|
|
10084
10999
|
return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
|
|
10085
11000
|
}
|
|
10086
11001
|
|
|
10087
|
-
template <class D, HWY_IF_V_SIZE_D(D, 16),
|
|
10088
|
-
HWY_API
|
|
10089
|
-
|
|
11002
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
11003
|
+
HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint64_t> a,
|
|
11004
|
+
Vec128<uint64_t> b) {
|
|
10090
11005
|
const Half<decltype(dn)> dnh;
|
|
10091
11006
|
|
|
10092
11007
|
const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a);
|
|
@@ -10196,103 +11111,313 @@ HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) {
|
|
|
10196
11111
|
|
|
10197
11112
|
// Truncates (rounds toward zero).
|
|
10198
11113
|
|
|
11114
|
+
#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
11115
|
+
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
11116
|
+
#else
|
|
11117
|
+
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
11118
|
+
#endif
|
|
11119
|
+
|
|
10199
11120
|
#if HWY_HAVE_FLOAT16
|
|
10200
11121
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
|
|
11122
|
+
+HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvttph_epi16 if any values of v[i]
+  // are not within the range of an int16_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+    HWY_HAVE_SCALAR_F16_TYPE
+  if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
+    typedef hwy::float16_t::Native GccF16RawVectType
+        __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+    return Dup128VecFromValues(
+        D(), detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
+        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
+        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
+        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
+        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
+        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
+        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]));
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("vcvttph2w {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<D>{_mm_cvttph_epi16(v.raw)};
+#endif
+}
+
+// F16 to I16 ConvertTo is generic for all vector lengths
+template <class D, HWY_IF_I16_D(D)>
 HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
-
-
+  const RebindToFloat<decltype(di)> df;
+  // See comment at the first occurrence of "IfThenElse(overflow,".
+  const MFromD<D> overflow =
+      RebindMask(di, Ge(v, Set(df, ConvertScalarTo<hwy::float16_t>(32768.0f))));
+  return IfThenElse(overflow, Set(di, LimitsMax<int16_t>()),
+                    ConvertInRangeTo(di, v));
 }
+
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
+HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvttph_epu16 if any values of v[i]
+  // are not within the range of an uint16_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+    HWY_HAVE_SCALAR_F16_TYPE
+  if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
+    typedef hwy::float16_t::Native GccF16RawVectType
+        __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+    return Dup128VecFromValues(
+        D(), detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1]),
+        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2]),
+        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3]),
+        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4]),
+        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5]),
+        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6]),
+        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7]));
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("vcvttph2uw {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<D>{_mm_cvttph_epu16(v.raw)};
+#endif
+}
+
+// F16->U16 ConvertTo is generic for all vector lengths
+template <class D, HWY_IF_U16_D(D)>
 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
-  return VFromD<D>{
-      _mm_maskz_cvttph_epu16(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
+  return ConvertInRangeTo(D(), ZeroIfNegative(v));
 }
 #endif  // HWY_HAVE_FLOAT16
 
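Note: this hunk introduces the ConvertInRangeTo / ConvertTo split used throughout the rest of the diff. ConvertInRangeTo wraps the raw truncating intrinsic (whose result for out-of-range lanes is compiler-specific, hence the GCC inline-asm workaround), while ConvertTo adds an explicit overflow mask so out-of-range lanes saturate. A minimal caller-side sketch, assuming HWY_HAVE_FLOAT16 and the conventional hn alias (both ours, not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec128<int16_t> SaturatingDemo(hn::Vec128<hwy::float16_t> v) {
  const hn::Full128<int16_t> di;
  // Lanes >= 32768.0 come back as LimitsMax<int16_t>() (32767) rather than
  // hitting the undefined behavior of a raw _mm_cvttph_epi16.
  return hn::ConvertTo(di, v);
}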
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
+HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvttps_epi32 with GCC if any
+  // values of v[i] are not within the range of an int32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+    typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+    return Dup128VecFromValues(
+        D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]));
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("%vcvttps2dq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<D>{_mm_cvttps_epi32(v.raw)};
+#endif
+}
+
+// F32 to I32 ConvertTo is generic for all vector lengths
+template <class D, HWY_IF_I32_D(D)>
 HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
-
-
+  const RebindToFloat<decltype(di)> df;
+  // See comment at the first occurrence of "IfThenElse(overflow,".
+  const MFromD<D> overflow = RebindMask(di, Ge(v, Set(df, 2147483648.0f)));
+  return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()),
+                    ConvertInRangeTo(di, v));
 }
 
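The 2147483648.0f threshold is exactly 2^31, the first float that no longer fits in int32_t. A scalar model of the saturating conversion (ours, for illustration; the vector code only tests the upper bound because x86's truncating conversion already returns 0x80000000, i.e. LimitsMin, for too-negative or NaN inputs):

#include <cstdint>
#include <limits>

int32_t SaturatingF32ToI32(float f) {
  if (f >= 2147483648.0f) return std::numeric_limits<int32_t>::max();
  if (!(f >= -2147483648.0f)) {  // lower overflow and NaN
    return std::numeric_limits<int32_t>::min();  // matches vcvttps2dq
  }
  return static_cast<int32_t>(f);  // in range: truncate toward zero
}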
 #if HWY_TARGET <= HWY_AVX3
 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
+HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvttpd_epi64 with GCC if any
+  // values of v[i] are not within the range of an int64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return Dup128VecFromValues(
+        DI(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]));
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("vcvttpd2qq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<DI>{_mm_cvttpd_epi64(v.raw)};
+#endif
+}
+
+// F64 to I64 ConvertTo is generic for all vector lengths on AVX3
+template <class DI, HWY_IF_I64_D(DI)>
 HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) {
-
-
+  const RebindToFloat<decltype(di)> df;
+  // See comment at the first occurrence of "IfThenElse(overflow,".
+  const MFromD<DI> overflow =
+      RebindMask(di, Ge(v, Set(df, 9.223372036854776e18)));
+  return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()),
+                    ConvertInRangeTo(di, v));
 }
 
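The 9.223372036854776e18 literal deserves a note: it parses to exactly 2^63 (9223372036854775808), the smallest double that overflows int64_t, so Ge(v, Set(df, ...)) flags precisely the lanes that must saturate. A quick check (ours, illustrative):

#include <cstdio>

int main() {
  const double threshold = 9.223372036854776e18;
  const double two_pow_63 = 9223372036854775808.0;  // exact: a power of two
  std::printf("%d\n", threshold == two_pow_63);     // prints 1
}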
 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
+HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvttps_epu32 with GCC if any
+  // values of v[i] are not within the range of an uint32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
+    typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+    return Dup128VecFromValues(
+        DU(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]),
+        detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2]),
+        detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3]));
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("vcvttps2udq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DU>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<DU>{_mm_cvttps_epu32(v.raw)};
+#endif
+}
+
+// F32->U32 ConvertTo is generic for all vector lengths
+template <class DU, HWY_IF_U32_D(DU)>
 HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-  return VFromD<DU>{
-      _mm_maskz_cvttps_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
+  return ConvertInRangeTo(DU(), ZeroIfNegative(v));
 }
 
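For unsigned targets, the rewritten ConvertTo replaces the AVX-512 maskz intrinsic with ZeroIfNegative followed by the in-range conversion. A scalar model (ours): negative lanes are flushed to zero before truncation, so they map to 0 instead of wrapping:

#include <cstdint>

uint32_t F32ToU32ClampNegative(float f) {
  const float non_neg = (f < 0.0f) ? 0.0f : f;  // ZeroIfNegative
  return static_cast<uint32_t>(non_neg);        // now in range for truncation
}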
 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
+HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvttpd_epu64 with GCC if any
+  // values of v[i] are not within the range of an uint64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return Dup128VecFromValues(
+        DU(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1]));
+  }
+#endif
+
+  __m128i raw_result;
+  __asm__("vcvttpd2uqq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DU>{raw_result};
+#else  // !HWY_COMPILER_GCC_ACTUAL
+  return VFromD<DU>{_mm_cvttpd_epu64(v.raw)};
+#endif
+}
+
+// F64->U64 ConvertTo is generic for all vector lengths
+template <class DU, HWY_IF_U64_D(DU)>
 HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-  return VFromD<DU>{
-      _mm_maskz_cvttpd_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
+  return ConvertInRangeTo(DU(), ZeroIfNegative(v));
 }
 
 #else  // AVX2 or below
 
-
-
+namespace detail {
+
+template <class DU32, HWY_IF_U32_D(DU32)>
+static HWY_INLINE VFromD<DU32> ConvInRangeF32ToU32(
+    DU32 du32, VFromD<RebindToFloat<DU32>> v, VFromD<DU32>& exp_diff) {
   const RebindToSigned<decltype(du32)> di32;
   const RebindToFloat<decltype(du32)> df32;
 
-
-  const auto exp_diff = Set(di32, int32_t{158}) -
-      BitCast(di32, ShiftRight<23>(BitCast(du32, non_neg_v)));
+  exp_diff = Set(du32, uint32_t{158}) - ShiftRight<23>(BitCast(du32, v));
   const auto scale_down_f32_val_mask =
-
+      VecFromMask(du32, Eq(exp_diff, Zero(du32)));
 
-  const auto v_scaled =
-      df32, BitCast(du32,
-  const
-
+  const auto v_scaled =
+      BitCast(df32, BitCast(du32, v) + ShiftLeft<23>(scale_down_f32_val_mask));
+  const auto f32_to_u32_result =
+      BitCast(du32, ConvertInRangeTo(di32, v_scaled));
 
-  return Or(
-      BitCast(du32, BroadcastSignBit(exp_diff)),
-      f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask));
+  return f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask);
 }
 
-
-
-
-
-
+}  // namespace detail
+
+// F32 to U32 ConvertInRangeTo is generic for all vector lengths on
+// SSE2/SSSE3/SSE4/AVX2
+template <class DU32, HWY_IF_U32_D(DU32)>
+HWY_API VFromD<DU32> ConvertInRangeTo(DU32 du32,
+                                      VFromD<RebindToFloat<DU32>> v) {
+  VFromD<DU32> exp_diff;
+  const auto f32_to_u32_result = detail::ConvInRangeF32ToU32(du32, v, exp_diff);
+  return f32_to_u32_result;
 }
-
-
-
-
-
-
-
+
+// F32 to U32 ConvertTo is generic for all vector lengths on
+// SSE2/SSSE3/SSE4/AVX2
+template <class DU32, HWY_IF_U32_D(DU32)>
+HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) {
+  const RebindToSigned<decltype(du32)> di32;
+
+  const auto non_neg_v = ZeroIfNegative(v);
+  VFromD<DU32> exp_diff;
+  const auto f32_to_u32_result =
+      detail::ConvInRangeF32ToU32(du32, non_neg_v, exp_diff);
+
+  return Or(f32_to_u32_result,
+            BitCast(du32, BroadcastSignBit(BitCast(di32, exp_diff))));
 }
-#endif  // HWY_ARCH_X86_64
 
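Without AVX-512 there is no unsigned vcvttps2udq, so the new ConvInRangeF32ToU32 helper routes through the signed conversion. The key is the biased exponent: 158 = 127 + 31, so exp_diff is zero exactly for values in [2^31, 2^32); those lanes are halved (exponent decremented via the shifted mask), converted, then doubled. A scalar sketch of the same trick (ours, assuming 0 <= f < 2^32):

#include <cstdint>
#include <cstring>

uint32_t F32ToU32InRange(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  const uint32_t exp_diff = 158u - (bits >> 23);
  if (exp_diff == 0) {   // f in [2^31, 2^32): signed conversion would overflow
    bits -= 1u << 23;    // halve f exactly by decrementing the exponent
    float halved;
    std::memcpy(&halved, &bits, sizeof(halved));
    const uint32_t half = static_cast<uint32_t>(static_cast<int32_t>(halved));
    return half + half;  // undo the halving, as the And+Add in the helper does
  }
  return static_cast<uint32_t>(static_cast<int32_t>(f));
}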
-
-
-
-HWY_API VFromD<
-
-
-
-  const
-
+namespace detail {
+
+template <class D64, HWY_IF_UI64_D(D64)>
+HWY_API VFromD<D64> ConvAbsInRangeF64ToUI64(D64 d64,
+                                            VFromD<Rebind<double, D64>> v,
+                                            VFromD<D64>& biased_exp) {
+  const RebindToSigned<decltype(d64)> di64;
+  const RebindToUnsigned<decltype(d64)> du64;
+  using VU64 = VFromD<decltype(du64)>;
+  const Repartition<uint16_t, decltype(di64)> du16;
+  const VU64 k1075 = Set(du64, 1075); /* biased exponent of 2^52 */
 
   // Exponent indicates whether the number can be represented as int64_t.
-
-
-
-
-  const Repartition<int32_t, decltype(di)> di32;
-  const auto in_range = MaskFromVec(BitCast(
-      di,
-      VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086))));
-#endif
+  biased_exp = BitCast(d64, ShiftRight<52>(BitCast(du64, v)));
+  HWY_IF_CONSTEXPR(IsSigned<TFromD<D64>>()) {
+    biased_exp = And(biased_exp, Set(d64, TFromD<D64>{0x7FF}));
+  }
 
   // If we were to cap the exponent at 51 and add 2^52, the number would be in
   // [2^52, 2^53) and mantissa bits could be read out directly. We need to
@@ -10312,45 +11437,141 @@ HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
   // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
   // zero as the upper 48 bits of both k1075 and biased_exp are zero.
 
-  const
-
-  const
-
-  const
+  const VU64 shift_mnt = BitCast(
+      du64, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
+  const VU64 shift_int = BitCast(
+      du64, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
+  const VU64 mantissa = BitCast(du64, v) & Set(du64, (1ULL << 52) - 1);
   // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86
   // returning zero in that case.
-  const
+  const VU64 int53 = (mantissa | Set(du64, 1ULL << 52)) >> shift_mnt;
 
   // For inputs larger than 2^53 - 1, insert zeros at the bottom.
 
-  // For inputs less than 2^
-  // shifted out of the left shift result below as shift_int[i] <=
-  // for any inputs that are less than 2^
+  // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be
+  // shifted out of the left shift result below as shift_int[i] <= 11 is true
+  // for any inputs that are less than 2^64.
+
+  return BitCast(d64, int53 << shift_int);
+}
+
+}  // namespace detail
+
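ConvAbsInRangeF64ToUI64 replaces the missing 64-bit conversion instruction on AVX2 and below with direct bit manipulation: read the biased exponent, then move the 53-bit significand (implicit 1 included) into integer position. A scalar model (ours, assuming 0 <= d < 2^64):

#include <cstdint>
#include <cstring>

uint64_t AbsF64ToU64InRange(double d) {
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof(bits));
  const uint64_t biased_exp = bits >> 52;          // sign bit assumed clear
  const uint64_t int53 = (bits & ((1ULL << 52) - 1)) | (1ULL << 52);
  if (biased_exp < 1075) {                         // 1075 = bias(1023) + 52
    const uint64_t shift = 1075 - biased_exp;
    return shift > 63 ? 0 : (int53 >> shift);      // x86 vector shifts yield 0
  }
  return int53 << (biased_exp - 1075);             // at most 11 for d < 2^64
}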
+#if HWY_ARCH_X86_64
+
+namespace detail {
+
+template <size_t N>
+static HWY_INLINE int64_t SSE2ConvFirstF64LaneToI64(Vec128<double, N> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm_cvttsd_si64 with GCC if v[0] is
+  // not within the range of an int64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return X86ConvertScalarFromFloat<int64_t>(raw_v[0]);
+  }
+#endif
+
+  int64_t result;
+  __asm__("%vcvttsd2si {%1, %0|%0, %1}"
+          : "=r"(result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return result;
+#else
+  return _mm_cvttsd_si64(v.raw);
+#endif
+}
+
+}  // namespace detail
+
+template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
+HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec64<double> v) {
+  return VFromD<DI>{_mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v))};
+}
+template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
+HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec128<double> v) {
+  const __m128i i0 = _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v));
+  const Full64<double> dd2;
+  const __m128i i1 =
+      _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(UpperHalf(dd2, v)));
+  return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)};
+}
+
+template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
+HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
+  const RebindToFloat<decltype(di)> df;
+  // See comment at the first occurrence of "IfThenElse(overflow,".
+  const MFromD<DI> overflow =
+      RebindMask(di, Ge(v, Set(df, 9.223372036854776e18)));
+  return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()),
+                    ConvertInRangeTo(di, v));
+}
+#endif  // HWY_ARCH_X86_64
+
+#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
+template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
+          HWY_IF_I64_D(DI)>
+HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<Rebind<double, DI>> v) {
+  using VI = VFromD<DI>;
+
+  VI biased_exp;
+  const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
+  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
+
+  // If the input was negative, negate the integer (two's complement).
+  return (shifted ^ sign_mask) - sign_mask;
+}
+
+template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
+          HWY_IF_I64_D(DI)>
+HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
+  using VI = VFromD<DI>;
+
+  VI biased_exp;
+  const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
 
-
+#if HWY_TARGET <= HWY_SSE4
+  const auto in_range = biased_exp < Set(di, 1086);
+#else
+  const Repartition<int32_t, decltype(di)> di32;
+  const auto in_range = MaskFromVec(BitCast(
+      di,
+      VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086))));
+#endif
 
   // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
   const VI sign_mask = BroadcastSignBit(BitCast(di, v));
   const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
-  const VI magnitude = IfThenElse(in_range,
+  const VI magnitude = IfThenElse(in_range, shifted, limit);
 
   // If the input was negative, negate the integer (two's complement).
   return (magnitude ^ sign_mask) - sign_mask;
 }
 #endif  // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
 
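The signed wrappers finish with a branch-free conditional negate: XORing with an all-ones mask and then subtracting the mask is two's-complement negation, and the saturation limit is pre-adjusted so LimitsMin survives the negate. Scalar equivalent (ours):

#include <cstdint>

int64_t ConditionalNegate(int64_t magnitude, bool was_negative) {
  const int64_t sign_mask = was_negative ? -1 : 0;  // BroadcastSignBit
  return (magnitude ^ sign_mask) - sign_mask;       // ~x + 1 == -x when set
}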
+// Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
+template <class DU, HWY_IF_U64_D(DU)>
+HWY_API VFromD<DU> ConvertInRangeTo(DU du, VFromD<Rebind<double, DU>> v) {
+  VFromD<DU> biased_exp;
+  const auto shifted = detail::ConvAbsInRangeF64ToUI64(du, v, biased_exp);
+  return shifted;
+}
+
 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
 template <class DU, HWY_IF_U64_D(DU)>
 HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
-  const RebindToSigned<
-  using VU = VFromD<
-  const Repartition<uint16_t, decltype(di)> du16;
-  const VU k1075 = Set(du, 1075); /* biased exponent of 2^52 */
+  const RebindToSigned<DU> di;
+  using VU = VFromD<DU>;
 
-
+  VU biased_exp;
+  const VU shifted =
+      detail::ConvAbsInRangeF64ToUI64(du, ZeroIfNegative(v), biased_exp);
 
-  // Exponent indicates whether the number can be represented as
-  const VU biased_exp = ShiftRight<52>(BitCast(du, non_neg_v));
+  // Exponent indicates whether the number can be represented as uint64_t.
 #if HWY_TARGET <= HWY_SSE4
   const VU out_of_range =
       BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086)));
@@ -10361,49 +11582,83 @@ HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
       VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) > Set(di32, 1086)));
 #endif
 
-
-
-  //
-  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
-  // manually shift the mantissa into place (we already have many of the
-  // inputs anyway).
+  return (shifted | out_of_range);
+}
+#endif  // HWY_TARGET <= HWY_AVX3
 
-
-
-  // or equal to 2047.
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+namespace detail {
 
-
-
-
+template <class TTo, class TF, HWY_IF_SIGNED(TTo)>
+static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CXX14_CONSTEXPR TTo
+X86ScalarNearestInt(TF flt_val) {
+#if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS
+  using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float,
+                     RemoveCvRef<TF>>;
+#else
+  using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
+#endif
 
-
-
+  const TTo trunc_int_val = X86ConvertScalarFromFloat<TTo>(flt_val);
+  const TFArith abs_val_diff = ScalarAbs(
+      ConvertScalarTo<TFArith>(ConvertScalarTo<TFArith>(flt_val) -
+                               ConvertScalarTo<TFArith>(trunc_int_val)));
+  constexpr TFArith kHalf = ConvertScalarTo<TFArith>(0.5);
 
-  const
-
-
-
-
-
-
-
+  const bool round_result_up =
+      ((trunc_int_val ^ ScalarShr(trunc_int_val, sizeof(TTo) * 8 - 1)) !=
+       LimitsMax<TTo>()) &&
+      (abs_val_diff > kHalf ||
+       (abs_val_diff == kHalf && (trunc_int_val & 1) != 0));
+  return static_cast<TTo>(
+      trunc_int_val +
+      (round_result_up ? (ScalarSignBit(flt_val) ? (-1) : 1) : 0));
+}
 
-  //
+}  // namespace detail
+#endif  // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
 
|
-
|
|
10394
|
-
|
|
10395
|
-
|
|
11621
|
+
// If these are in namespace detail, the x86_256/512 templates are not found.
|
|
11622
|
+
template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I32_D(DI)>
|
|
11623
|
+
HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
|
|
11624
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
11625
|
+
// Workaround for undefined behavior in _mm_cvtps_epi32 with GCC if any values
|
|
11626
|
+
// of v[i] are not within the range of an int32_t
|
|
11627
|
+
|
|
11628
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
11629
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
11630
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
|
|
11631
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
11632
|
+
return Dup128VecFromValues(DI(),
|
|
11633
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
|
|
11634
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
|
|
11635
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
|
|
11636
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[3]));
|
|
11637
|
+
}
|
|
11638
|
+
#endif
|
|
10396
11639
|
|
|
10397
|
-
|
|
10398
|
-
|
|
11640
|
+
__m128i raw_result;
|
|
11641
|
+
__asm__("%vcvtps2dq {%1, %0|%0, %1}"
|
|
11642
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
11643
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
11644
|
+
:);
|
|
11645
|
+
return VFromD<DI>{raw_result};
|
|
11646
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
11647
|
+
return VFromD<DI>{_mm_cvtps_epi32(v.raw)};
|
|
11648
|
+
#endif
|
|
10399
11649
|
}
|
|
10400
|
-
#endif // HWY_TARGET <= HWY_AVX3
|
|
10401
11650
|
|
|
10402
|
-
|
|
10403
|
-
|
|
10404
|
-
|
|
10405
|
-
|
|
10406
|
-
|
|
11651
|
+
// Generic for all vector lengths.
|
|
11652
|
+
template <class VF, class DF = DFromV<VF>, class DI = RebindToSigned<DF>,
|
|
11653
|
+
HWY_IF_F32_D(DF)>
|
|
11654
|
+
HWY_API VFromD<DI> NearestInt(const VF v) {
|
|
11655
|
+
const DI di;
|
|
11656
|
+
// See comment at the first occurrence of "IfThenElse(overflow,".
|
|
11657
|
+
// Here we are rounding, whereas previous occurrences truncate, but there is
|
|
11658
|
+
// no difference because the previous float value is well below the max i32.
|
|
11659
|
+
const auto overflow = RebindMask(di, Ge(v, Set(DF(), 2147483648.0f)));
|
|
11660
|
+
return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()),
|
|
11661
|
+
NearestIntInRange(di, v));
|
|
10407
11662
|
}
|
|
10408
11663
|
|
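Usage sketch for the reworked NearestInt (ours; assumes the conventional hn alias): out-of-range lanes now saturate to LimitsMax<int32_t>() instead of yielding 0x80000000.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec128<int32_t> RoundDemo() {
  const hn::Full128<float> df;
  return hn::NearestInt(hn::Set(df, 2.5f));  // every lane becomes 2 (ties to even)
}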
 // ------------------------------ Floating-point rounding (ConvertTo)
 
@@ -10447,7 +11702,7 @@ HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
   const DFromV<decltype(v)> df;
   const RebindToSigned<decltype(df)> di;
 
-  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto integer = ConvertInRangeTo(di, v);  // round toward 0
   const auto int_f = ConvertTo(df, integer);
 
   return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
@@ -10460,7 +11715,7 @@ HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
   const DFromV<decltype(v)> df;
   const RebindToSigned<decltype(df)> di;
 
-  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto integer = ConvertInRangeTo(di, v);  // round toward 0
   const auto int_f = ConvertTo(df, integer);
 
   // Truncating a positive non-integer ends up smaller; if so, add 1.
@@ -10476,7 +11731,7 @@ HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
   const DFromV<decltype(v)> df;
   const RebindToSigned<decltype(df)> di;
 
-  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto integer = ConvertInRangeTo(di, v);  // round toward 0
   const auto int_f = ConvertTo(df, integer);
 
   // Truncating a negative non-integer ends up larger; if so, subtract 1.
@@ -10584,6 +11839,16 @@ HWY_API Mask128<float16_t, N> IsNaN(const Vec128<float16_t, N> v) {
       _mm_fpclass_ph_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
 }
 
+template <size_t N>
+HWY_API Mask128<float16_t, N> IsEitherNaN(Vec128<float16_t, N> a,
+                                          Vec128<float16_t, N> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+  HWY_DIAGNOSTICS(pop)
+}
+
 template <size_t N>
 HWY_API Mask128<float16_t, N> IsInf(const Vec128<float16_t, N> v) {
   return Mask128<float16_t, N>{_mm_fpclass_ph_mask(
@@ -10620,6 +11885,31 @@ HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
 #endif
 }
 
+#ifdef HWY_NATIVE_IS_EITHER_NAN
+#undef HWY_NATIVE_IS_EITHER_NAN
+#else
+#define HWY_NATIVE_IS_EITHER_NAN
+#endif
+
+template <size_t N>
+HWY_API Mask128<float, N> IsEitherNaN(Vec128<float, N> a, Vec128<float, N> b) {
+#if HWY_TARGET <= HWY_AVX3
+  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+#else
+  return Mask128<float, N>{_mm_cmpunord_ps(a.raw, b.raw)};
+#endif
+}
+
+template <size_t N>
+HWY_API Mask128<double, N> IsEitherNaN(Vec128<double, N> a,
+                                       Vec128<double, N> b) {
+#if HWY_TARGET <= HWY_AVX3
+  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+#else
+  return Mask128<double, N>{_mm_cmpunord_pd(a.raw, b.raw)};
+#endif
+}
+
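IsEitherNaN is a single unordered comparison, cheaper than the Or(IsNaN(a), IsNaN(b)) a generic fallback would need. A usage sketch (ours; assumes the hn alias):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

hn::Vec128<float> SafeDiv(hn::Vec128<float> a, hn::Vec128<float> b) {
  // Substitute 0 wherever either operand is NaN.
  return hn::IfThenZeroElse(hn::IsEitherNaN(a, b), hn::Div(a, b));
}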
 #if HWY_TARGET <= HWY_AVX3
 
 // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
@@ -12016,6 +13306,31 @@ HWY_API Vec128<uint8_t> MaxOfLanes(D d, Vec128<uint8_t> v) {
 
 #endif  // HWY_TARGET <= HWY_SSE4
 
+// ------------------------------ BitShuffle
+#if HWY_TARGET <= HWY_AVX3_DL
+
+#ifdef HWY_NATIVE_BITSHUFFLE
+#undef HWY_NATIVE_BITSHUFFLE
+#else
+#define HWY_NATIVE_BITSHUFFLE
+#endif
+
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          HWY_IF_V_SIZE_LE_V(V, 16),
+          HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
+HWY_API V BitShuffle(V v, VI idx) {
+  const DFromV<decltype(v)> d64;
+  const RebindToUnsigned<decltype(d64)> du64;
+  const Rebind<uint8_t, decltype(d64)> du8;
+
+  int32_t i32_bit_shuf_result = static_cast<int32_t>(
+      static_cast<uint16_t>(_mm_bitshuffle_epi64_mask(v.raw, idx.raw)));
+
+  return BitCast(d64, PromoteTo(du64, VFromD<decltype(du8)>{_mm_cvtsi32_si128(
+                          i32_bit_shuf_result)}));
+}
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
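Scalar model of what the vpshufbitqmb-backed BitShuffle computes (ours, for illustration): bit i of each 64-bit result lane is bit idx[i] of the corresponding source lane, where idx supplies eight byte-sized bit indices (0-63) per lane and the upper 56 result bits are zero:

#include <cstdint>

uint64_t BitShuffleScalar(uint64_t lane, const uint8_t idx[8]) {
  uint64_t result = 0;
  for (int i = 0; i < 8; ++i) {
    result |= ((lane >> (idx[i] & 63)) & 1u) << i;  // gather selected bits
  }
  return result;
}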
 // ------------------------------ Lt128
 
 namespace detail {