@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.2
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +2 -0
- package/include/aom/aomcx.h +106 -25
- package/include/ffi.h +3 -3
- package/include/freetype2/freetype/config/ftconfig.h +1 -1
- package/include/freetype2/freetype/config/ftheader.h +1 -1
- package/include/freetype2/freetype/config/ftoption.h +37 -12
- package/include/freetype2/freetype/config/ftstdlib.h +1 -1
- package/include/freetype2/freetype/config/integer-types.h +29 -2
- package/include/freetype2/freetype/config/mac-support.h +1 -1
- package/include/freetype2/freetype/config/public-macros.h +3 -3
- package/include/freetype2/freetype/freetype.h +51 -47
- package/include/freetype2/freetype/ftadvanc.h +1 -1
- package/include/freetype2/freetype/ftbbox.h +1 -1
- package/include/freetype2/freetype/ftbdf.h +1 -1
- package/include/freetype2/freetype/ftbitmap.h +1 -1
- package/include/freetype2/freetype/ftbzip2.h +1 -1
- package/include/freetype2/freetype/ftcache.h +1 -1
- package/include/freetype2/freetype/ftcid.h +1 -1
- package/include/freetype2/freetype/ftcolor.h +13 -4
- package/include/freetype2/freetype/ftdriver.h +3 -3
- package/include/freetype2/freetype/fterrdef.h +1 -1
- package/include/freetype2/freetype/fterrors.h +1 -1
- package/include/freetype2/freetype/ftfntfmt.h +1 -1
- package/include/freetype2/freetype/ftgasp.h +1 -1
- package/include/freetype2/freetype/ftglyph.h +1 -1
- package/include/freetype2/freetype/ftgxval.h +1 -1
- package/include/freetype2/freetype/ftgzip.h +1 -1
- package/include/freetype2/freetype/ftimage.h +6 -2
- package/include/freetype2/freetype/ftincrem.h +1 -1
- package/include/freetype2/freetype/ftlcdfil.h +1 -1
- package/include/freetype2/freetype/ftlist.h +1 -1
- package/include/freetype2/freetype/ftlogging.h +184 -0
- package/include/freetype2/freetype/ftlzw.h +1 -1
- package/include/freetype2/freetype/ftmac.h +1 -1
- package/include/freetype2/freetype/ftmm.h +159 -103
- package/include/freetype2/freetype/ftmodapi.h +1 -1
- package/include/freetype2/freetype/ftmoderr.h +1 -1
- package/include/freetype2/freetype/ftotval.h +1 -1
- package/include/freetype2/freetype/ftoutln.h +1 -1
- package/include/freetype2/freetype/ftparams.h +1 -1
- package/include/freetype2/freetype/ftpfr.h +1 -1
- package/include/freetype2/freetype/ftrender.h +1 -1
- package/include/freetype2/freetype/ftsizes.h +1 -1
- package/include/freetype2/freetype/ftsnames.h +1 -1
- package/include/freetype2/freetype/ftstroke.h +1 -1
- package/include/freetype2/freetype/ftsynth.h +1 -1
- package/include/freetype2/freetype/ftsystem.h +1 -1
- package/include/freetype2/freetype/fttrigon.h +1 -1
- package/include/freetype2/freetype/fttypes.h +1 -1
- package/include/freetype2/freetype/ftwinfnt.h +2 -3
- package/include/freetype2/freetype/otsvg.h +1 -1
- package/include/freetype2/freetype/t1tables.h +1 -1
- package/include/freetype2/freetype/ttnameid.h +129 -129
- package/include/freetype2/freetype/tttables.h +8 -5
- package/include/freetype2/freetype/tttags.h +1 -1
- package/include/freetype2/ft2build.h +1 -1
- package/include/glib-2.0/gio/gdbuserror.h +9 -8
- package/include/glib-2.0/gio/ginetaddress.h +12 -0
- package/include/glib-2.0/gio/gioenums.h +9 -2
- package/include/glib-2.0/glib/gstring.h +2 -2
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/gobject/glib-types.h +1 -1
- package/include/glib-2.0/gobject/gparam.h +1 -1
- package/include/glib-2.0/gobject/gvalue.h +78 -35
- package/include/harfbuzz/hb-script-list.h +12 -0
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/libxml2/libxml/valid.h +0 -3
- package/include/libxml2/libxml/xmlerror.h +1 -1
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/vips/connection.h +4 -4
- package/include/vips/version.h +4 -4
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +13 -13
package/include/hwy/ops/arm_neon-inl.h:

@@ -21,6 +21,7 @@
 // Arm NEON intrinsics are documented at:
 // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
 
+#include "hwy/base.h"
 #include "hwy/ops/shared-inl.h"
 
 HWY_DIAGNOSTICS(push)
@@ -141,29 +142,6 @@ namespace detail { // for code folding and Raw128
   HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
   HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
 
-// Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
-#undef HWY_NEON_HAVE_BFLOAT16
-#if HWY_HAVE_SCALAR_BF16_TYPE && \
-    ((HWY_TARGET == HWY_NEON_BF16 && \
-      (!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
-     defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
-#define HWY_NEON_HAVE_BFLOAT16 1
-#else
-#define HWY_NEON_HAVE_BFLOAT16 0
-#endif
-
-// HWY_NEON_HAVE_F32_TO_BF16C is defined if NEON vcvt_bf16_f32 and
-// vbfdot_f32 are available, even if the __bf16 type is disabled due to
-// GCC/Clang bugs.
-#undef HWY_NEON_HAVE_F32_TO_BF16C
-#if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
-    (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
-     (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
-#define HWY_NEON_HAVE_F32_TO_BF16C 1
-#else
-#define HWY_NEON_HAVE_F32_TO_BF16C 0
-#endif
-
 // bfloat16_t
 #if HWY_NEON_HAVE_BFLOAT16
 #define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
@@ -194,10 +172,16 @@ namespace detail { // for code folding and Raw128
 // Enable generic functions for whichever of (f16, bf16) are not supported.
 #if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
 #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+#define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_SPECIAL_FLOAT_D(D)
 #elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
 #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
+#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_F16_D(D)
+#define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_F16_D(D)
 #elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
 #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+#define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
 #elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
 // NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
 // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
@@ -205,6 +189,9 @@ namespace detail { // for code folding and Raw128
 // SFINAE to occur instead of a hard error due to a dependency on the D template
 // argument
 #define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
+#define HWY_GENERIC_IF_EMULATED_D(D) \
+  hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
+#define HWY_NEON_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
 #else
 #error "Logic error, handled all four cases"
 #endif
@@ -870,10 +857,10 @@ using Vec16 = Vec128<T, 2 / sizeof(T)>;
 // FF..FF or 0.
 template <typename T, size_t N = 16 / sizeof(T)>
 class Mask128 {
+ public:
   // Arm C Language Extensions return and expect unsigned type.
   using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;
 
- public:
   using PrivateT = T;                     // only for DFromM
   static constexpr size_t kPrivateN = N;  // only for DFromM
 
@@ -897,6 +884,249 @@ using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
 template <class V>
 using TFromV = typename V::PrivateT;
 
+// TODO(janwas): ForDemoteVectors, in convert_test and demote_test, appear to
+// instantiate this with D = double x 4. The cause is unknown. Previously,
+// defining this in terms of Set rejected that via SFINAE because only
+// V_SIZE = 16 and V_SIZE <= 8 overloads were defined. As a workaround,
+// truncate the lane count to 128 bits.
+template <class D>
+using VFromD =
+    Vec128<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), MaxLanes(D()))>;
+
+// ------------------------------ BitCast
+
+namespace detail {
+
+// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
+// vreinterpret*_u8_*() set of functions.
+#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
+#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
+  Vec128<uint8_t, size * sizeof(type##_t)>
+#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
+#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
+
+// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
+template <size_t N>
+HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
+  return v;
+}
+
+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
+                                 HWY_CAST_TO_U8)
+HWY_NEON_DEF_FUNCTION_BFLOAT_16(BitCastToByte, vreinterpret, _u8_,
+                                HWY_CAST_TO_U8)
+
+HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
+HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
+HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
+HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
+
+#if !HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_F16C
+HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
+                                             HWY_CAST_TO_U8)
+#else
+template <size_t N>
+HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
+  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
+}
+#endif  // HWY_NEON_HAVE_F16C
+#endif  // !HWY_HAVE_FLOAT16
+
+#if !HWY_NEON_HAVE_BFLOAT16
+template <size_t N>
+HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
+  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
+}
+#endif  // !HWY_NEON_HAVE_BFLOAT16
+
+#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
+#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
+#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
+#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
+
+template <class D, HWY_IF_U8_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) {
+  return v;
+}
+
+// 64-bit or less:
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
+                                     VFromD<RebindToUnsigned<D>> v) {
+  return VFromD<D>(vreinterpret_s8_u8(v.raw));
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
+                                     VFromD<Repartition<uint8_t, D>> v) {
+  return VFromD<D>(vreinterpret_u16_u8(v.raw));
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
+                                     VFromD<Repartition<uint8_t, D>> v) {
+  return VFromD<D>(vreinterpret_s16_u8(v.raw));
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
+                                     VFromD<Repartition<uint8_t, D>> v) {
+  return VFromD<D>(vreinterpret_u32_u8(v.raw));
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
+                                     VFromD<Repartition<uint8_t, D>> v) {
+  return VFromD<D>(vreinterpret_s32_u8(v.raw));
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
+HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
+  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
+HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
+  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
+}
+
+// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
+#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
+  return VFromD<D>(vreinterpret_f16_u8(v.raw));
+#else
+  const RebindToUnsigned<D> du;
+  return VFromD<D>(BitCastFromByte(du, v).raw);
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
+#if HWY_NEON_HAVE_BFLOAT16
+  return VFromD<D>(vreinterpret_bf16_u8(v.raw));
+#else
+  const RebindToUnsigned<D> du;
+  return VFromD<D>(BitCastFromByte(du, v).raw);
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
+                                     VFromD<Repartition<uint8_t, D>> v) {
+  return VFromD<D>(vreinterpret_f32_u8(v.raw));
+}
+
+#if HWY_HAVE_FLOAT64
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
+HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
+  return Vec64<double>(vreinterpret_f64_u8(v.raw));
+}
+#endif  // HWY_HAVE_FLOAT64
+
+// 128-bit full:
+
+template <class D, HWY_IF_I8_D(D)>
+HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
+  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
+}
+template <class D, HWY_IF_U16_D(D)>
+HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
+  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
+}
+template <class D, HWY_IF_I16_D(D)>
+HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
+  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
+}
+template <class D, HWY_IF_U32_D(D)>
+HWY_INLINE Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
+  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
+}
+template <class D, HWY_IF_I32_D(D)>
+HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
+  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
+}
+template <class D, HWY_IF_U64_D(D)>
+HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
+  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
+}
+template <class D, HWY_IF_I64_D(D)>
+HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
+  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
+}
+
+template <class D, HWY_IF_F32_D(D)>
+HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
+  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
+}
+
+#if HWY_HAVE_FLOAT64
+template <class D, HWY_IF_F64_D(D)>
+HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
+  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
+}
+#endif  // HWY_HAVE_FLOAT64
+
+// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
+template <class D, HWY_IF_F16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
+#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
+  return VFromD<D>(vreinterpretq_f16_u8(v.raw));
+#else
+  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
+#endif
+}
+
+template <class D, HWY_IF_BF16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
+#if HWY_NEON_HAVE_BFLOAT16
+  return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
+#else
+  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
+#endif
+}
+
+}  // namespace detail
+
+template <class D, class FromT>
+HWY_API VFromD<D> BitCast(D d,
+                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
+  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
+}
+
+// ------------------------------ ResizeBitCast
+
+// <= 8 byte vector to <= 8 byte vector
+template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
+          HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
+  const Repartition<uint8_t, decltype(d)> du8;
+  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw});
+}
+
+// 16-byte vector to 16-byte vector: same as BitCast
+template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
+          HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
+  return BitCast(d, v);
+}
+
+// 16-byte vector to <= 8-byte vector
+template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
+          HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
+  const DFromV<decltype(v)> d_from;
+  const Half<decltype(d_from)> dh_from;
+  return ResizeBitCast(d, LowerHalf(dh_from, v));
+}
+
+// <= 8-bit vector to 16-byte vector
+template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
+          HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
+  const Full64<TFromV<FromV>> d_full64_from;
+  const Full128<TFromV<FromV>> d_full128_from;
+  return BitCast(d, Combine(d_full128_from, Zero(d_full64_from),
+                            ResizeBitCast(d_full64_from, v)));
+}
+
 // ------------------------------ Set
 
 namespace detail {
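For orientation, the `vreinterpret*` wrappers in the hunk above implement Highway's `BitCast`, which reinterprets lane bytes without any value conversion. Below is a minimal standalone sketch of the same idea in scalar C++; `BitCastLane` is an illustrative name, not part of the library.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical scalar model of a lane BitCast: reinterpret the bytes of one
// type as another type of equal size, without numeric conversion.
template <typename To, typename From>
To BitCastLane(From from) {
  static_assert(sizeof(To) == sizeof(From), "lane sizes must match");
  To to;
  std::memcpy(&to, &from, sizeof(to));  // defined behavior, unlike a union pun
  return to;
}

int main() {
  const float f = 1.0f;
  const uint32_t bits = BitCastLane<uint32_t>(f);
  std::printf("%08x\n", static_cast<unsigned>(bits));  // 3f800000 (IEEE-754)
  return 0;
}
```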
@@ -913,16 +1143,26 @@ namespace detail {
 #define HWY_NEON_BUILD_ARG_HWY_SET t
 
 HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)
-#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C
+#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C && HWY_HAVE_SCALAR_F16_TYPE
 HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET)
 #endif
 HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)
 
-
-
+#if !HWY_NEON_HAVE_F16C || !HWY_HAVE_SCALAR_F16_TYPE
+template <class D, HWY_IF_F16_D(D)>
+HWY_API VFromD<D> NativeSet(D d, TFromD<D> t) {
+  const uint16_t tu = BitCastScalar<uint16_t>(t);
+  return BitCast(d, Set(RebindToUnsigned<D>(), tu));
+}
+#endif
+
+#if !HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> NativeSet(D d, TFromD<D> t) {
   const uint16_t tu = BitCastScalar<uint16_t>(t);
-  return
+  return BitCast(d, Set(RebindToUnsigned<D>(), tu));
 }
+#endif
 
 #undef HWY_NEON_BUILD_TPL_HWY_SET
 #undef HWY_NEON_BUILD_RET_HWY_SET
@@ -931,25 +1171,21 @@ HWY_API Vec128<TFromD<D>, MaxLanes(D())> NativeSet(D d, TFromD<D> t) {
 
 }  // namespace detail
 
-// Full vector.
+// Full vector.
 // Do not use a typename T = TFromD<D> argument because T will be deduced from
 // the actual argument type, which can differ from TFromD<D>.
 template <class D, HWY_IF_V_SIZE_D(D, 16), typename T>
-HWY_INLINE
+HWY_INLINE VFromD<D> Set(D /* tag */, T t) {
   return detail::NativeSet(Full128<TFromD<D>>(), static_cast<TFromD<D>>(t));
 }
 
 // Partial vector: create 64-bit and return wrapper.
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T>
-HWY_API
+HWY_API VFromD<D> Set(D /* tag */, T t) {
   const Full64<TFromD<D>> dfull;
-  return
-  detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw);
+  return VFromD<D>(detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw);
 }
 
-template <class D>
-using VFromD = decltype(Set(D(), TFromD<D>()));
-
 template <class D>
 HWY_API VFromD<D> Zero(D d) {
   // Default ctor also works for bfloat16_t and float16_t.
@@ -1201,7 +1437,8 @@ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                         BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
 }
 
-#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
+#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C && \
+    HWY_HAVE_SCALAR_F16_TYPE
 template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                       TFromD<D> t2, TFromD<D> t3,
@@ -1393,240 +1630,6 @@ HWY_API Vec128<double> Combine(D /* tag */, Vec64<double> hi,
 }
 #endif  // HWY_HAVE_FLOAT64
 
-// ------------------------------ BitCast
-
-namespace detail {
-
-// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
-// vreinterpret*_u8_*() set of functions.
-#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
-#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
-  Vec128<uint8_t, size * sizeof(type##_t)>
-#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
-#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
-
-// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
-template <size_t N>
-HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
-  return v;
-}
-
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
-                                 HWY_CAST_TO_U8)
-HWY_NEON_DEF_FUNCTION_BFLOAT_16(BitCastToByte, vreinterpret, _u8_,
-                                HWY_CAST_TO_U8)
-
-HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
-HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
-HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
-HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
-
-#if !HWY_HAVE_FLOAT16
-#if HWY_NEON_HAVE_F16C
-HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
-                                             HWY_CAST_TO_U8)
-#else
-template <size_t N>
-HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
-  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
-}
-#endif  // HWY_NEON_HAVE_F16C
-#endif  // !HWY_HAVE_FLOAT16
-
-#if !HWY_NEON_HAVE_BFLOAT16
-template <size_t N>
-HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
-  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
-}
-#endif  // !HWY_NEON_HAVE_BFLOAT16
-
-#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
-#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
-#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
-#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
-
-template <class D, HWY_IF_U8_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) {
-  return v;
-}
-
-// 64-bit or less:
-
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
-                                     VFromD<RebindToUnsigned<D>> v) {
-  return VFromD<D>(vreinterpret_s8_u8(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
-                                     VFromD<Repartition<uint8_t, D>> v) {
-  return VFromD<D>(vreinterpret_u16_u8(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
-                                     VFromD<Repartition<uint8_t, D>> v) {
-  return VFromD<D>(vreinterpret_s16_u8(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
-                                     VFromD<Repartition<uint8_t, D>> v) {
-  return VFromD<D>(vreinterpret_u32_u8(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
-                                     VFromD<Repartition<uint8_t, D>> v) {
-  return VFromD<D>(vreinterpret_s32_u8(v.raw));
-}
-
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
-HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
-  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
-HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
-  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
-}
-
-// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
-#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
-  return VFromD<D>(vreinterpret_f16_u8(v.raw));
-#else
-  const RebindToUnsigned<D> du;
-  return VFromD<D>(BitCastFromByte(du, v).raw);
-#endif
-}
-
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
-#if HWY_NEON_HAVE_BFLOAT16
-  return VFromD<D>(vreinterpret_bf16_u8(v.raw));
-#else
-  const RebindToUnsigned<D> du;
-  return VFromD<D>(BitCastFromByte(du, v).raw);
-#endif
-}
-
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
-                                     VFromD<Repartition<uint8_t, D>> v) {
-  return VFromD<D>(vreinterpret_f32_u8(v.raw));
-}
-
-#if HWY_HAVE_FLOAT64
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
-HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
-  return Vec64<double>(vreinterpret_f64_u8(v.raw));
-}
-#endif  // HWY_HAVE_FLOAT64
-
-// 128-bit full:
-
-template <class D, HWY_IF_I8_D(D)>
-HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
-}
-template <class D, HWY_IF_U16_D(D)>
-HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
-}
-template <class D, HWY_IF_I16_D(D)>
-HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
-}
-template <class D, HWY_IF_U32_D(D)>
-HWY_INLINE Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
-}
-template <class D, HWY_IF_I32_D(D)>
-HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
-}
-template <class D, HWY_IF_U64_D(D)>
-HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
-}
-template <class D, HWY_IF_I64_D(D)>
-HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
-}
-
-template <class D, HWY_IF_F32_D(D)>
-HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
-}
-
-#if HWY_HAVE_FLOAT64
-template <class D, HWY_IF_F64_D(D)>
-HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
-}
-#endif  // HWY_HAVE_FLOAT64
-
-// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
-template <class D, HWY_IF_F16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
-#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
-  return VFromD<D>(vreinterpretq_f16_u8(v.raw));
-#else
-  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
-#endif
-}
-
-template <class D, HWY_IF_BF16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
-#if HWY_NEON_HAVE_BFLOAT16
-  return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
-#else
-  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
-#endif
-}
-
-}  // namespace detail
-
-template <class D, class FromT>
-HWY_API VFromD<D> BitCast(D d,
-                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
-  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
-}
-
-// ------------------------------ ResizeBitCast
-
-// <= 8 byte vector to <= 8 byte vector
-template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
-          HWY_IF_V_SIZE_LE_D(D, 8)>
-HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
-  const Repartition<uint8_t, decltype(d)> du8;
-  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw});
-}
-
-// 16-byte vector to 16-byte vector: same as BitCast
-template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
-          HWY_IF_V_SIZE_D(D, 16)>
-HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
-  return BitCast(d, v);
-}
-
-// 16-byte vector to <= 8-byte vector
-template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
-          HWY_IF_V_SIZE_LE_D(D, 8)>
-HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
-  const DFromV<decltype(v)> d_from;
-  const Half<decltype(d_from)> dh_from;
-  return ResizeBitCast(d, LowerHalf(dh_from, v));
-}
-
-// <= 8-bit vector to 16-byte vector
-template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
-          HWY_IF_V_SIZE_D(D, 16)>
-HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
-  const Full64<TFromV<FromV>> d_full64_from;
-  const Full128<TFromV<FromV>> d_full128_from;
-  return BitCast(d, Combine(d_full128_from, Zero(d_full64_from),
-                            ResizeBitCast(d_full64_from, v)));
-}
-
 // ------------------------------ GetLane
 
 namespace detail {
@@ -1940,10 +1943,74 @@ HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
 // ================================================== ARITHMETIC
 
 // ------------------------------ Addition
-
+HWY_NEON_DEF_FUNCTION_UINTS(operator+, vadd, _, 2)
+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator+, vadd, _, 2)
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> operator+(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) + BitCast(du, b));
+}
+
+template <size_t N>
+HWY_API Vec128<int16_t, N> operator+(Vec128<int16_t, N> a,
+                                     Vec128<int16_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) + BitCast(du, b));
+}
+
+template <size_t N>
+HWY_API Vec128<int32_t, N> operator+(Vec128<int32_t, N> a,
+                                     Vec128<int32_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) + BitCast(du, b));
+}
+
+template <size_t N>
+HWY_API Vec128<int64_t, N> operator+(Vec128<int64_t, N> a,
+                                     Vec128<int64_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) + BitCast(du, b));
+}
 
 // ------------------------------ Subtraction
-
+HWY_NEON_DEF_FUNCTION_UINTS(operator-, vsub, _, 2)
+HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator-, vsub, _, 2)
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> operator-(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) - BitCast(du, b));
+}
+
+template <size_t N>
+HWY_API Vec128<int16_t, N> operator-(Vec128<int16_t, N> a,
+                                     Vec128<int16_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) - BitCast(du, b));
+}
+
+template <size_t N>
+HWY_API Vec128<int32_t, N> operator-(Vec128<int32_t, N> a,
+                                     Vec128<int32_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) - BitCast(du, b));
+}
+
+template <size_t N>
+HWY_API Vec128<int64_t, N> operator-(Vec128<int64_t, N> a,
+                                     Vec128<int64_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) - BitCast(du, b));
+}
 
 // ------------------------------ SumsOf8
 
@@ -2074,8 +2141,14 @@ HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedSub, vqsub, _, 2)
 // ------------------------------ Average
 
 // Returns (a + b + 1) / 2
-
-
+
+#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
+#undef HWY_NATIVE_AVERAGE_ROUND_UI32
+#else
+#define HWY_NATIVE_AVERAGE_ROUND_UI32
+#endif
+
+HWY_NEON_DEF_FUNCTION_UI_8_16_32(AverageRound, vrhadd, _, 2)
 
 // ------------------------------ Neg
 
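The `// Returns (a + b + 1) / 2` comment above documents NEON's `vrhadd` (rounding halving add). Here is a hedged scalar model of that rounding average, written to avoid widening or overflow; `AverageRoundScalar` is an illustrative name, not a library function.

```cpp
#include <cstdint>
#include <cstdio>

// Scalar model of vrhadd: (a + b + 1) / 2 without widening.
// a/2 + b/2 recovers the high bits of the sum; adding 1 whenever at least
// one operand is odd implements the rounding-up of the discarded half-bit.
uint8_t AverageRoundScalar(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((a >> 1) + (b >> 1) + ((a | b) & 1));
}

int main() {
  // 255 and 254 would overflow a naive (a + b + 1) in 8 bits; this does not.
  std::printf("%u\n", static_cast<unsigned>(AverageRoundScalar(255, 254)));  // 255
  std::printf("%u\n", static_cast<unsigned>(AverageRoundScalar(1, 2)));      // 2
  return 0;
}
```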
@@ -2143,6 +2216,12 @@ HWY_API Vec128<int64_t> SaturatedNeg(const Vec128<int64_t> v) {
 
 // ------------------------------ ShiftLeft
 
+#ifdef HWY_NATIVE_ROUNDING_SHR
+#undef HWY_NATIVE_ROUNDING_SHR
+#else
+#define HWY_NATIVE_ROUNDING_SHR
+#endif
+
 // Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
 #pragma push_macro("HWY_NEON_DEF_FUNCTION")
 #undef HWY_NEON_DEF_FUNCTION
@@ -2158,6 +2237,8 @@ HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)
 
 HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
 HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
+HWY_NEON_DEF_FUNCTION_UINTS(RoundingShiftRight, vrshr, _n_, ignored)
+HWY_NEON_DEF_FUNCTION_INTS(RoundingShiftRight, vrshr, _n_, ignored)
 
 #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
 
@@ -2334,6 +2415,95 @@ HWY_API Vec64<int64_t> operator>>(Vec64<int64_t> v, Vec64<int64_t> bits) {
   return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
 }
 
+// ------------------------------ RoundingShr (Neg)
+
+HWY_API Vec128<uint8_t> RoundingShr(Vec128<uint8_t> v, Vec128<uint8_t> bits) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw;
+  return Vec128<uint8_t>(vrshlq_u8(v.raw, neg_bits));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
+HWY_API Vec128<uint8_t, N> RoundingShr(Vec128<uint8_t, N> v,
+                                       Vec128<uint8_t, N> bits) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw;
+  return Vec128<uint8_t, N>(vrshl_u8(v.raw, neg_bits));
+}
+
+HWY_API Vec128<uint16_t> RoundingShr(Vec128<uint16_t> v,
+                                     Vec128<uint16_t> bits) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw;
+  return Vec128<uint16_t>(vrshlq_u16(v.raw, neg_bits));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
+HWY_API Vec128<uint16_t, N> RoundingShr(Vec128<uint16_t, N> v,
+                                        Vec128<uint16_t, N> bits) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw;
+  return Vec128<uint16_t, N>(vrshl_u16(v.raw, neg_bits));
+}
+
+HWY_API Vec128<uint32_t> RoundingShr(Vec128<uint32_t> v,
+                                     Vec128<uint32_t> bits) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw;
+  return Vec128<uint32_t>(vrshlq_u32(v.raw, neg_bits));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
+HWY_API Vec128<uint32_t, N> RoundingShr(Vec128<uint32_t, N> v,
+                                        Vec128<uint32_t, N> bits) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw;
+  return Vec128<uint32_t, N>(vrshl_u32(v.raw, neg_bits));
+}
+
+HWY_API Vec128<uint64_t> RoundingShr(Vec128<uint64_t> v,
+                                     Vec128<uint64_t> bits) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw;
+  return Vec128<uint64_t>(vrshlq_u64(v.raw, neg_bits));
+}
+HWY_API Vec64<uint64_t> RoundingShr(Vec64<uint64_t> v, Vec64<uint64_t> bits) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw;
+  return Vec64<uint64_t>(vrshl_u64(v.raw, neg_bits));
+}
+
+HWY_API Vec128<int8_t> RoundingShr(Vec128<int8_t> v, Vec128<int8_t> bits) {
+  return Vec128<int8_t>(vrshlq_s8(v.raw, Neg(bits).raw));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
+HWY_API Vec128<int8_t, N> RoundingShr(Vec128<int8_t, N> v,
+                                      Vec128<int8_t, N> bits) {
+  return Vec128<int8_t, N>(vrshl_s8(v.raw, Neg(bits).raw));
+}
+
+HWY_API Vec128<int16_t> RoundingShr(Vec128<int16_t> v, Vec128<int16_t> bits) {
+  return Vec128<int16_t>(vrshlq_s16(v.raw, Neg(bits).raw));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
+HWY_API Vec128<int16_t, N> RoundingShr(Vec128<int16_t, N> v,
+                                       Vec128<int16_t, N> bits) {
+  return Vec128<int16_t, N>(vrshl_s16(v.raw, Neg(bits).raw));
+}
+
+HWY_API Vec128<int32_t> RoundingShr(Vec128<int32_t> v, Vec128<int32_t> bits) {
+  return Vec128<int32_t>(vrshlq_s32(v.raw, Neg(bits).raw));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
+HWY_API Vec128<int32_t, N> RoundingShr(Vec128<int32_t, N> v,
+                                       Vec128<int32_t, N> bits) {
+  return Vec128<int32_t, N>(vrshl_s32(v.raw, Neg(bits).raw));
+}
+
+HWY_API Vec128<int64_t> RoundingShr(Vec128<int64_t> v, Vec128<int64_t> bits) {
+  return Vec128<int64_t>(vrshlq_s64(v.raw, Neg(bits).raw));
+}
+HWY_API Vec64<int64_t> RoundingShr(Vec64<int64_t> v, Vec64<int64_t> bits) {
+  return Vec64<int64_t>(vrshl_s64(v.raw, Neg(bits).raw));
+}
+
 // ------------------------------ ShiftLeftSame (Shl)
 
 template <typename T, size_t N>
@@ -2345,6 +2515,13 @@ HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
   return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits));
 }
 
+// ------------------------------ RoundingShiftRightSame (RoundingShr)
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> RoundingShiftRightSame(const Vec128<T, N> v, int bits) {
+  return RoundingShr(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
+}
+
 // ------------------------------ Int/float multiplication
 
 // Per-target flag to prevent generic_ops-inl.h from defining 8-bit operator*.
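The `vrshr`/`vrshl`-based `RoundingShr` and `RoundingShiftRightSame` added above round the shifted-out bits to nearest instead of truncating. A minimal scalar sketch of the semantics, assuming an unsigned input and 0 < k < 32; the function name is illustrative only.

```cpp
#include <cstdint>
#include <cstdio>

// Scalar model of a rounding right shift: add half of the discarded
// power-of-two before shifting, i.e. (x + (1 << (k - 1))) >> k.
uint32_t RoundingShrScalar(uint32_t x, int k) {
  const uint64_t half = uint64_t{1} << (k - 1);  // widen to avoid overflow
  return static_cast<uint32_t>((x + half) >> k);
}

int main() {
  std::printf("%u\n", static_cast<unsigned>(RoundingShrScalar(5, 1)));  // 3
  std::printf("%u\n", static_cast<unsigned>(RoundingShrScalar(4, 1)));  // 2
  return 0;
}
```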
@@ -2356,9 +2533,31 @@ HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
 
 // All except ui64
 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator*, vmul, _, 2)
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator*, vmul, _, 2)
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
 
+template <size_t N>
+HWY_API Vec128<int8_t, N> operator*(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) * BitCast(du, b));
+}
+
+template <size_t N>
+HWY_API Vec128<int16_t, N> operator*(Vec128<int16_t, N> a,
+                                     Vec128<int16_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) * BitCast(du, b));
+}
+
+template <size_t N>
+HWY_API Vec128<int32_t, N> operator*(Vec128<int32_t, N> a,
+                                     Vec128<int32_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, BitCast(du, a) * BitCast(du, b));
+}
+
 // ------------------------------ Integer multiplication
 
 // Returns the upper sizeof(T)*8 bits of a * b in each lane.
@@ -2490,7 +2689,7 @@ HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
 // ------------------------------ Floating-point division
 
 // Emulate missing intrinsic
-#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
+#if HWY_HAVE_FLOAT64 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
 HWY_INLINE float64x1_t vrecpe_f64(float64x1_t raw) {
   const CappedTag<double, 1> d;
   const Twice<decltype(d)> dt;
@@ -2788,26 +2987,6 @@ HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
   return Or(o, And(a1, a2));
 }
 
-// ------------------------------ IfVecThenElse
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
-                                   Vec128<T, N> no) {
-  return IfThenElse(MaskFromVec(mask), yes, no);
-}
-
-// ------------------------------ BitwiseIfThenElse
-
-#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
-#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
-#else
-#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
-#endif
-
-template <class V>
-HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
-  return IfVecThenElse(mask, yes, no);
-}
-
 // ------------------------------ Operator overloads (internal-only if float)
 
 template <typename T, size_t N>
@@ -2927,14 +3106,6 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1)
 
 HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1)
 
-// ------------------------------ CopySign
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  const DFromV<decltype(magn)> d;
-  return BitwiseIfThenElse(SignBit(d), sign, magn);
-}
-
 // ------------------------------ CopySignToAbs
 template <typename T, size_t N>
 HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
@@ -2981,6 +3152,21 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
 
 // ------------------------------ IfThenElse
 
+// Workaround for incorrect codegen.
+#if HWY_ARCH_ARM_V7
+
+template <class V, class D = DFromV<V>>
+HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
+  const RebindToUnsigned<D> du;
+  using VU = VFromD<decltype(du)>;
+  const VU no_u = BitCast(du, no);
+  const VU diff_u = BitCast(du, yes) ^ no_u;
+  const VU mask_u = BitCast(du, VecFromMask(D(), mask));
+  return BitCast(D(), no_u ^ (diff_u & mask_u));
+}
+
+#else  // normal VBSL instruction
+
 #define HWY_NEON_BUILD_TPL_HWY_IF
 #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
 #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
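The Armv7 `IfThenElse` workaround above avoids `vbsl` by blending with pure bitwise operations. Its core identity is `no ^ ((yes ^ no) & mask)`; a small scalar demonstration with illustrative names follows.

```cpp
#include <cstdint>
#include <cstdio>

// Bit-select: for each bit, result = mask-bit ? yes-bit : no-bit.
// no ^ ((yes ^ no) & mask) flips exactly those bits where yes and no differ
// AND the mask is set, which turns no-bits into yes-bits there.
uint32_t BitSelect(uint32_t mask, uint32_t yes, uint32_t no) {
  return no ^ ((yes ^ no) & mask);
}

int main() {
  // An all-ones mask selects yes; an all-zeros mask selects no.
  std::printf("%08x\n", static_cast<unsigned>(
                            BitSelect(0xFFFFFFFFu, 0xAAAAAAAAu, 0x55555555u)));
  std::printf("%08x\n", static_cast<unsigned>(
                            BitSelect(0x00000000u, 0xAAAAAAAAu, 0x55555555u)));
  return 0;
}
```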
@@ -2990,6 +3176,8 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
 
 HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
 
+#endif  // HWY_ARCH_ARM_V7
+
 #if HWY_HAVE_FLOAT16
 #define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>)
 #else
@@ -3045,6 +3233,33 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
   return IfThenElse(m, yes, no);
 }
 
+template <typename T, size_t N>
+HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
+                                   Vec128<T, N> no) {
+  return IfThenElse(MaskFromVec(mask), yes, no);
+}
+
+// ------------------------------ BitwiseIfThenElse
+
+#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
+#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
+#else
+#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
+#endif
+
+template <class V>
+HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
+  return IfVecThenElse(mask, yes, no);
+}
+
+// ------------------------------ CopySign (BitwiseIfThenElse)
+template <typename T, size_t N>
+HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
+  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
+  const DFromV<decltype(magn)> d;
+  return BitwiseIfThenElse(SignBit(d), sign, magn);
+}
+
 // ------------------------------ Mask logical
 
 template <typename T, size_t N>
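The relocated `CopySign` above blends only the sign bit of `sign` onto `magn` via `BitwiseIfThenElse(SignBit(d), ...)`. A hedged scalar equivalent for `float` using the same sign-bit mask; the name is illustrative.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar model of CopySign for float: take the sign bit from `sign` and
// everything else (exponent and mantissa) from `magn`.
float CopySignScalar(float magn, float sign) {
  uint32_t m, s;
  std::memcpy(&m, &magn, sizeof(m));
  std::memcpy(&s, &sign, sizeof(s));
  const uint32_t kSignBit = 0x80000000u;
  const uint32_t r = (m & ~kSignBit) | (s & kSignBit);
  float out;
  std::memcpy(&out, &r, sizeof(out));
  return out;
}

int main() {
  std::printf("%g\n", CopySignScalar(3.0f, -1.0f));  // -3
  return 0;
}
```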
@@ -3275,21 +3490,19 @@ HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
 #undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
 #undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
 
-// ------------------------------ Abs i64 (
+// ------------------------------ Abs i64 (IfNegativeThenElse, Neg)
 HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
 #if HWY_ARCH_ARM_A64
   return Vec128<int64_t>(vabsq_s64(v.raw));
 #else
-
-  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
+  return IfNegativeThenElse(v, Neg(v), v);
 #endif
 }
 HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
 #if HWY_ARCH_ARM_A64
   return Vec64<int64_t>(vabs_s64(v.raw));
 #else
-
-  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
+  return IfNegativeThenElse(v, Neg(v), v);
 #endif
 }
 
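Where `vabsq_s64` is unavailable (Armv7), the hunk above computes `Abs` by selecting `Neg(v)` for negative lanes. Below is a branch-free scalar analogue of the same idea, with an illustrative name; note it inherits the usual `INT64_MIN` caveat of two's-complement negation.

```cpp
#include <cstdint>
#include <cstdio>

// Branch-free abs for int64: `sign` is 0 for non-negative inputs and
// all-ones for negative ones, so (v ^ sign) - sign yields v or -v.
int64_t AbsScalar(int64_t v) {
  const int64_t sign = v >> 63;  // arithmetic shift broadcasts the sign bit
  return (v ^ sign) - sign;
}

int main() {
  std::printf("%lld\n", static_cast<long long>(AbsScalar(-5)));  // 5
  std::printf("%lld\n", static_cast<long long>(AbsScalar(7)));   // 7
  return 0;
}
```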
@@ -3298,7 +3511,7 @@ HWY_API Vec128<int64_t> SaturatedAbs(const Vec128<int64_t> v) {
   return Vec128<int64_t>(vqabsq_s64(v.raw));
 #else
   const auto zero = Zero(DFromV<decltype(v)>());
-  return
+  return IfNegativeThenElse(v, SaturatedSub(zero, v), v);
 #endif
 }
 HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
@@ -3306,7 +3519,7 @@ HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
   return Vec64<int64_t>(vqabs_s64(v.raw));
 #else
   const auto zero = Zero(DFromV<decltype(v)>());
-  return
+  return IfNegativeThenElse(v, SaturatedSub(zero, v), v);
 #endif
 }
 
@@ -3442,6 +3655,28 @@ HWY_API Vec128<double> Max(Vec128<double> a, Vec128<double> b) {
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
 #endif  // HWY_ARCH_ARM_A64
 
+// ------------------------------ MinNumber and MaxNumber
+
+#if !HWY_ARCH_ARM_A64
+
+#ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
+#undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
+#else
+#define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
+#endif
+
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_API V MinNumber(V a, V b) {
+  return Min(IfThenElse(IsNaN(a), b, a), IfThenElse(IsNaN(b), a, b));
+}
+
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_API V MaxNumber(V a, V b) {
+  return Max(IfThenElse(IsNaN(a), b, a), IfThenElse(IsNaN(b), a, b));
+}
+
+#endif
+
 // ================================================== MEMORY
 
 // ------------------------------ Load 128
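The MinNumber/MaxNumber fallback above matches IEEE-754 minimumNumber/maximumNumber semantics: a NaN operand is replaced by the other operand before comparing, so NaN propagates only when both inputs are NaN. A scalar sketch under that assumption (plain C++):

#include <cmath>

// If exactly one operand is NaN, both selects yield the non-NaN operand;
// if both are NaN, the result stays NaN, as in the vector code above.
static inline float MinNumberSketch(float a, float b) {
  const float x = std::isnan(a) ? b : a;
  const float y = std::isnan(b) ? a : b;
  return y < x ? y : x;
}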
@@ -5077,8 +5312,101 @@ HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
 
 #endif
 
+// ------------------------------ CeilInt/FloorInt
+#if HWY_ARCH_ARM_A64
+
+#ifdef HWY_NATIVE_CEIL_FLOOR_INT
+#undef HWY_NATIVE_CEIL_FLOOR_INT
+#else
+#define HWY_NATIVE_CEIL_FLOOR_INT
+#endif
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec128<int16_t> CeilInt(const Vec128<float16_t> v) {
+  return Vec128<int16_t>(vcvtpq_s16_f16(v.raw));
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
+HWY_API Vec128<int16_t, N> CeilInt(const Vec128<float16_t, N> v) {
+  return Vec128<int16_t, N>(vcvtp_s16_f16(v.raw));
+}
+
+HWY_API Vec128<int16_t> FloorInt(const Vec128<float16_t> v) {
+  return Vec128<int16_t>(vcvtmq_s16_f16(v.raw));
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
+HWY_API Vec128<int16_t, N> FloorInt(const Vec128<float16_t, N> v) {
+  return Vec128<int16_t, N>(vcvtm_s16_f16(v.raw));
+}
+#endif  // HWY_HAVE_FLOAT16
+
+HWY_API Vec128<int32_t> CeilInt(const Vec128<float> v) {
+  return Vec128<int32_t>(vcvtpq_s32_f32(v.raw));
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
+HWY_API Vec128<int32_t, N> CeilInt(const Vec128<float, N> v) {
+  return Vec128<int32_t, N>(vcvtp_s32_f32(v.raw));
+}
+
+HWY_API Vec128<int64_t> CeilInt(const Vec128<double> v) {
+  return Vec128<int64_t>(vcvtpq_s64_f64(v.raw));
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
+HWY_API Vec128<int64_t, N> CeilInt(const Vec128<double, N> v) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
+  // Workaround for missing vcvtp_s64_f64 intrinsic
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  const Twice<decltype(d)> dt;
+  return LowerHalf(di, CeilInt(Combine(dt, v, v)));
+#else
+  return Vec128<int64_t, N>(vcvtp_s64_f64(v.raw));
+#endif
+}
+
+HWY_API Vec128<int32_t> FloorInt(const Vec128<float> v) {
+  return Vec128<int32_t>(vcvtmq_s32_f32(v.raw));
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
+HWY_API Vec128<int32_t, N> FloorInt(const Vec128<float, N> v) {
+  return Vec128<int32_t, N>(vcvtm_s32_f32(v.raw));
+}
+
+HWY_API Vec128<int64_t> FloorInt(const Vec128<double> v) {
+  return Vec128<int64_t>(vcvtmq_s64_f64(v.raw));
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
+HWY_API Vec128<int64_t, N> FloorInt(const Vec128<double, N> v) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
+  // Workaround for missing vcvtm_s64_f64 intrinsic
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  const Twice<decltype(d)> dt;
+  return LowerHalf(di, FloorInt(Combine(dt, v, v)));
+#else
+  return Vec128<int64_t, N>(vcvtm_s64_f64(v.raw));
+#endif
+}
+
+#endif  // HWY_ARCH_ARM_A64
+
 // ------------------------------ NearestInt (Round)
 
+#if HWY_HAVE_FLOAT16
+HWY_API Vec128<int16_t> NearestInt(const Vec128<float16_t> v) {
+  return Vec128<int16_t>(vcvtnq_s16_f16(v.raw));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
+HWY_API Vec128<int16_t, N> NearestInt(const Vec128<float16_t, N> v) {
+  return Vec128<int16_t, N>(vcvtn_s16_f16(v.raw));
+}
+#endif
+
 #if HWY_ARCH_ARM_A64
 
 HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
@@ -5089,6 +5417,29 @@ HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
   return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
 }
 
+HWY_API Vec128<int64_t> NearestInt(const Vec128<double> v) {
+  return Vec128<int64_t>(vcvtnq_s64_f64(v.raw));
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
+HWY_API Vec128<int64_t, N> NearestInt(const Vec128<double, N> v) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
+  // Workaround for missing vcvtn_s64_f64 intrinsic
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  const Twice<decltype(d)> dt;
+  return LowerHalf(di, NearestInt(Combine(dt, v, v)));
+#else
+  return Vec128<int64_t, N>(vcvtn_s64_f64(v.raw));
+#endif
+}
+
+template <class DI32, HWY_IF_I32_D(DI32)>
+HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
+                                        VFromD<Rebind<double, DI32>> v) {
+  return DemoteTo(di32, NearestInt(v));
+}
+
 #else
 
 template <size_t N>
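The vcvtp/vcvtm/vcvtn intrinsic families used above convert with an explicit rounding mode: toward +infinity (Ceil), toward -infinity (Floor), and to nearest with ties-to-even (Nearest). Scalar equivalents for in-range inputs (plain C++; assumes the default FE_TONEAREST rounding mode for nearbyint):

#include <cmath>
#include <cstdint>

static inline int64_t CeilIntSketch(double v) {   // vcvtp*: round toward +inf
  return static_cast<int64_t>(std::ceil(v));
}
static inline int64_t FloorIntSketch(double v) {  // vcvtm*: round toward -inf
  return static_cast<int64_t>(std::floor(v));
}
static inline int64_t NearestIntSketch(double v) {  // vcvtn*: ties to even
  return static_cast<int64_t>(std::nearbyint(v));
}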
@@ -5100,10 +5451,62 @@ HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
 #endif
 
 // ------------------------------ Floating-point classification
+
+#if !HWY_COMPILER_CLANG || HWY_COMPILER_CLANG > 1801 || HWY_ARCH_ARM_V7
 template <typename T, size_t N>
 HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
   return v != v;
 }
+#else
+// Clang up to 18.1 generates less efficient code than the expected FCMEQ, see
+// https://github.com/numpy/numpy/issues/27313 and
+// https://github.com/numpy/numpy/pull/22954/files and
+// https://github.com/llvm/llvm-project/issues/59855
+
+#if HWY_HAVE_FLOAT16
+template <typename T, size_t N, HWY_IF_F16(T), HWY_IF_V_SIZE(T, N, 16)>
+HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
+  typename Mask128<T, N>::Raw ret;
+  __asm__ volatile("fcmeq %0.8h, %1.8h, %1.8h" : "=w"(ret) : "w"(v.raw));
+  return Not(Mask128<T, N>(ret));
+}
+template <typename T, size_t N, HWY_IF_F16(T), HWY_IF_V_SIZE_LE(T, N, 8)>
+HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
+  typename Mask128<T, N>::Raw ret;
+  __asm__ volatile("fcmeq %0.4h, %1.4h, %1.4h" : "=w"(ret) : "w"(v.raw));
+  return Not(Mask128<T, N>(ret));
+}
+#endif  // HWY_HAVE_FLOAT16
+
+template <typename T, size_t N, HWY_IF_F32(T), HWY_IF_V_SIZE(T, N, 16)>
+HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
+  typename Mask128<T, N>::Raw ret;
+  __asm__ volatile("fcmeq %0.4s, %1.4s, %1.4s" : "=w"(ret) : "w"(v.raw));
+  return Not(Mask128<T, N>(ret));
+}
+template <typename T, size_t N, HWY_IF_F32(T), HWY_IF_V_SIZE_LE(T, N, 8)>
+HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
+  typename Mask128<T, N>::Raw ret;
+  __asm__ volatile("fcmeq %0.2s, %1.2s, %1.2s" : "=w"(ret) : "w"(v.raw));
+  return Not(Mask128<T, N>(ret));
+}
+
+#if HWY_HAVE_FLOAT64
+template <typename T, size_t N, HWY_IF_F64(T), HWY_IF_V_SIZE(T, N, 16)>
+HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
+  typename Mask128<T, N>::Raw ret;
+  __asm__ volatile("fcmeq %0.2d, %1.2d, %1.2d" : "=w"(ret) : "w"(v.raw));
+  return Not(Mask128<T, N>(ret));
+}
+template <typename T, size_t N, HWY_IF_F64(T), HWY_IF_V_SIZE_LE(T, N, 8)>
+HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
+  typename Mask128<T, N>::Raw ret;
+  __asm__ volatile("fcmeq %d0, %d1, %d1" : "=w"(ret) : "w"(v.raw));
+  return Not(Mask128<T, N>(ret));
+}
+#endif  // HWY_HAVE_FLOAT64
+
+#endif  // HWY_COMPILER_CLANG
 
 // ================================================== SWIZZLE
 
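Both IsNaN paths above rely on the same IEEE-754 property: NaN is the only value that compares unequal to itself, so one FCMEQ of v against itself followed by a mask inversion yields the NaN mask; the inline asm merely stops affected Clang versions from pessimizing that pattern. The scalar form of the test:

// x == x is false exactly when x is NaN.
static inline bool IsNaNSketch(float x) { return !(x == x); }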
@@ -7115,6 +7518,31 @@ HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
 
 // ------------------------------ RearrangeToOddPlusEven (Combine)
 
+namespace detail {
+// Armv7 only provides 64-bit (half-vector) pairwise operations.
+#define HWY_NEON_DEF_PAIRWISE_OP(T, name, prefix, suffix)      \
+  HWY_INLINE Vec64<T> Pairwise##name(Vec64<T> a, Vec64<T> b) { \
+    return Vec64<T>(prefix##_##suffix(a.raw, b.raw));          \
+  }
+
+// Note that Armv7 also lacks [u]int64 instructions, which are handled by
+// generic_ops-inl.h SumOfLanes etc., hence no 64-bit overloads here.
+#define HWY_NEON_DEF_PAIRWISE_OPS(name, prefix)          \
+  HWY_NEON_DEF_PAIRWISE_OP(uint32_t, name, prefix, u32)  \
+  HWY_NEON_DEF_PAIRWISE_OP(uint16_t, name, prefix, u16)  \
+  HWY_NEON_DEF_PAIRWISE_OP(uint8_t, name, prefix, u8)    \
+  HWY_NEON_DEF_PAIRWISE_OP(int32_t, name, prefix, s32)   \
+  HWY_NEON_DEF_PAIRWISE_OP(int16_t, name, prefix, s16)   \
+  HWY_NEON_DEF_PAIRWISE_OP(int8_t, name, prefix, s8)     \
+  HWY_NEON_DEF_PAIRWISE_OP(float32_t, name, prefix, f32)
+
+HWY_NEON_DEF_PAIRWISE_OPS(Sum, vpadd)
+HWY_NEON_DEF_PAIRWISE_OPS(Min, vpmin)
+HWY_NEON_DEF_PAIRWISE_OPS(Max, vpmax)
+#undef HWY_NEON_DEF_PAIRWISE_OPS
+#undef HWY_NEON_DEF_PAIRWISE_OP
+}  // namespace detail
+
 template <size_t N>
 HWY_API Vec128<float, N> RearrangeToOddPlusEven(Vec128<float, N> sum0,
                                                 Vec128<float, N> sum1) {
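The vpadd/vpmin/vpmax instructions wrapped above operate on adjacent pairs: output lane i combines lanes 2i and 2i+1, with the pairs of `a` filling the low half of the result and the pairs of `b` the high half. A scalar sketch for two 2-lane inputs (plain C++):

#include <cstdint>

// Model of vpadd_s32: out = {a0+a1, b0+b1}. vpmin/vpmax replace + with
// min/max but keep the same adjacent-pair structure.
static void PairwiseSumSketch(const int32_t a[2], const int32_t b[2],
                              int32_t out[2]) {
  out[0] = a[0] + a[1];
  out[1] = b[0] + b[1];
}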
@@ -7134,18 +7562,18 @@ HWY_API Vec128<int32_t> RearrangeToOddPlusEven(Vec128<int32_t> sum0,
 #else
   const Full128<int32_t> d;
   const Half<decltype(d)> d64;
-  const Vec64<int32_t> hi
-
+  const Vec64<int32_t> hi =
+      detail::PairwiseSum(LowerHalf(d64, sum1), UpperHalf(d64, sum1));
   const Vec64<int32_t> lo(
-
-  return Combine(
+      detail::PairwiseSum(LowerHalf(d64, sum0), UpperHalf(d64, sum0)));
+  return Combine(d, hi, lo);
 #endif
 }
 
 HWY_API Vec64<int32_t> RearrangeToOddPlusEven(Vec64<int32_t> sum0,
                                               Vec64<int32_t> sum1) {
   // vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
-  return
+  return detail::PairwiseSum(sum0, sum1);
 }
 
 HWY_API Vec32<int32_t> RearrangeToOddPlusEven(Vec32<int32_t> sum0,
@@ -7162,18 +7590,18 @@ HWY_API Vec128<uint32_t> RearrangeToOddPlusEven(Vec128<uint32_t> sum0,
 #else
   const Full128<uint32_t> d;
   const Half<decltype(d)> d64;
-  const Vec64<uint32_t> hi
-
-  const Vec64<uint32_t> lo
-
-  return Combine(
+  const Vec64<uint32_t> hi =
+      detail::PairwiseSum(LowerHalf(d64, sum1), UpperHalf(d64, sum1));
+  const Vec64<uint32_t> lo =
+      detail::PairwiseSum(LowerHalf(d64, sum0), UpperHalf(d64, sum0));
+  return Combine(d, hi, lo);
 #endif
 }
 
 HWY_API Vec64<uint32_t> RearrangeToOddPlusEven(Vec64<uint32_t> sum0,
                                                Vec64<uint32_t> sum1) {
   // vmlal_u16 multiplied the lower half into sum0 and upper into sum1.
-  return
+  return detail::PairwiseSum(sum0, sum1);
 }
 
 HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0,
@@ -7182,6 +7610,78 @@ HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0,
   return sum0 + sum1;
 }
 
+// ------------------------------ SumOfMulQuadAccumulate
+
+#if HWY_TARGET == HWY_NEON_BF16
+
+#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
+#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
+#else
+#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
+#endif
+
+template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
+HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/,
+                                            VFromD<Repartition<int8_t, DI32>> a,
+                                            VFromD<Repartition<int8_t, DI32>> b,
+                                            VFromD<DI32> sum) {
+  return VFromD<DI32>(vdot_s32(sum.raw, a.raw, b.raw));
+}
+
+template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
+HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/,
+                                            VFromD<Repartition<int8_t, DI32>> a,
+                                            VFromD<Repartition<int8_t, DI32>> b,
+                                            VFromD<DI32> sum) {
+  return VFromD<DI32>(vdotq_s32(sum.raw, a.raw, b.raw));
+}
+
+#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
+#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
+#else
+#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
+#endif
+
+template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 8)>
+HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
+    DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
+    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
+  return VFromD<DU32>(vdot_u32(sum.raw, a.raw, b.raw));
+}
+
+template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_D(DU32, 16)>
+HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
+    DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
+    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
+  return VFromD<DU32>(vdotq_u32(sum.raw, a.raw, b.raw));
+}
+
+#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
+#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
+#else
+#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
+#endif
+
+template <class DI32, HWY_IF_I32_D(DI32)>
+HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
+    DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u,
+    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
+  // TODO: use vusdot[q]_s32 on NEON targets that require support for NEON I8MM
+
+  const RebindToUnsigned<decltype(di32)> du32;
+  const Repartition<uint8_t, decltype(di32)> du8;
+
+  const auto b_u = BitCast(du8, b_i);
+  const auto result_sum0 =
+      SumOfMulQuadAccumulate(du32, a_u, b_u, BitCast(du32, sum));
+  const auto result_sum1 = ShiftLeft<8>(
+      SumOfMulQuadAccumulate(du32, a_u, ShiftRight<7>(b_u), Zero(du32)));
+
+  return BitCast(di32, Sub(result_sum0, result_sum1));
+}
+
+#endif  // HWY_TARGET == HWY_NEON_BF16
+
 // ------------------------------ WidenMulPairwiseAdd
 
 #if HWY_NEON_HAVE_F32_TO_BF16C
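The mixed u8 x i8 overload above has no single instruction on this target, so it is assembled from two unsigned dot products: reading the i8 bits as u8 adds 256 whenever the value is negative, ShiftRight<7> extracts exactly that sign bit, and subtracting the second sum shifted left by 8 undoes the bias. A scalar sketch of the correction (plain C++; the function name is illustrative):

#include <cstdint>

// Sum of four u8*i8 products using unsigned math only:
// b_i == b_u - 256 * (b_u >> 7), hence
// sum(a*b_i) == sum(a*b_u) - 256 * sum(a*(b_u >> 7)).
static int32_t DotU8I8Sketch(const uint8_t a[4], const int8_t b[4],
                             int32_t sum) {
  uint32_t sum0 = 0;  // sum of a * b_u
  uint32_t sum1 = 0;  // sum of a * (sign bit of b)
  for (int i = 0; i < 4; ++i) {
    const uint8_t b_u = static_cast<uint8_t>(b[i]);
    sum0 += static_cast<uint32_t>(a[i]) * b_u;
    sum1 += static_cast<uint32_t>(a[i]) * (b_u >> 7);
  }
  return sum + static_cast<int32_t>(sum0 - (sum1 << 8));  // <<8 is the *256
}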
@@ -7588,6 +8088,17 @@ HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
   return v;
 }
 
+// ------------------------------ InterleaveEvenBlocks
+template <class D, class V = VFromD<D>>
+HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
+  return a;
+}
+// ------------------------------ InterleaveOddBlocks
+template <class D, class V = VFromD<D>>
+HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
+  return a;
+}
+
 // ------------------------------ ReverseBlocks
 // Single block: no change
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
@@ -8374,71 +8885,47 @@ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
 // On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane.
 #else  // !HWY_ARCH_ARM_A64
 
-// Armv7 lacks N=2 and 8-bit x4, so enable
+// Armv7 lacks N=2 (except 32-bit) and 8-bit x4, so enable them in generic_ops.
 #undef HWY_IF_SUM_OF_LANES_D
 #define HWY_IF_SUM_OF_LANES_D(D)                                         \
-  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) ||
+  hwy::EnableIf<(sizeof(TFromD<D>) != 4 && HWY_MAX_LANES_D(D) == 2) ||   \
                 (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* =  \
      nullptr
 #undef HWY_IF_MINMAX_OF_LANES_D
 #define HWY_IF_MINMAX_OF_LANES_D(D)                                      \
-  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) ||
+  hwy::EnableIf<(sizeof(TFromD<D>) != 4 && HWY_MAX_LANES_D(D) == 2) ||   \
                 (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* =  \
      nullptr
 
 // For arm7, we implement reductions using a series of pairwise operations. This
 // produces the full vector result, so we express Reduce* in terms of *OfLanes.
-#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
-#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix)    \
-  template <class D, HWY_IF_LANES_D(D, size)>                                \
-  HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */,                    \
-                                               Vec128<type##_t, size> v) {   \
-    HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
-    if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp);                   \
-    if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp);                   \
-    return Vec128<type##_t, size>(tmp);                                      \
-  }
 
-
-
-
-
-
-
-
-
-    tmp
-
-
-
-
-
+#define HWY_NEON_DEF_PAIRWISE_REDUCTION(name)                                \
+  /* generic_ops-inl.h handles 64-bit types. */                              \
+  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_T_SIZE_D(D, 8)>       \
+  HWY_API VFromD<D> name##OfLanes(D d, VFromD<D> v) {                        \
+    HWY_LANES_CONSTEXPR size_t N = Lanes(d);                                 \
+    VFromD<D> tmp = detail::Pairwise##name(v, v);                            \
+    if ((N / 2) > 1) tmp = detail::Pairwise##name(tmp, tmp);                 \
+    if ((N / 4) > 1) tmp = detail::Pairwise##name(tmp, tmp);                 \
+    return tmp;                                                              \
+  }                                                                          \
+  /* Armv7 lacks q (full-vector) instructions, so first reduce 128-bit v */  \
+  /* into a half-vector, then reduce that. */                                \
+  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 8)>      \
+  HWY_API VFromD<D> name##OfLanes(D d, VFromD<D> v) {                        \
+    const Half<D> dh;                                                        \
+    VFromD<decltype(dh)> upper = UpperHalf(dh, v);                           \
+    VFromD<decltype(dh)> lower = LowerHalf(dh, v);                           \
+    VFromD<decltype(dh)> half = detail::Pairwise##name(upper, lower);        \
+    half = name##OfLanes(dh, half);                                          \
+    return Combine(d, half, half);                                           \
   }
 
-
-
-
-  HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8)            \
-  HWY_NEON_DEF_PAIRWISE_REDUCTION(int32, 2, name, prefix, s32)           \
-  HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16)           \
-  HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8)             \
-  HWY_NEON_DEF_PAIRWISE_REDUCTION(float32, 2, name, prefix, f32)         \
-  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint32, 4, 2, name, prefix, u32)  \
-  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16)  \
-  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8)   \
-  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int32, 4, 2, name, prefix, s32)   \
-  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16)   \
-  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8)    \
-  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(float32, 4, 2, name, prefix, f32)
-
-HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Sum, vpadd)
-HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Min, vpmin)
-HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax)
-
-#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
-#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
+HWY_NEON_DEF_PAIRWISE_REDUCTION(Sum)
+HWY_NEON_DEF_PAIRWISE_REDUCTION(Min)
+HWY_NEON_DEF_PAIRWISE_REDUCTION(Max)
 #undef HWY_NEON_DEF_PAIRWISE_REDUCTION
-#undef HWY_NEON_BUILD_TYPE_T
 
 // GetLane(SumsOf4(v)) is more efficient on ArmV7 NEON than the default
 // N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h
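The rewritten reduction above repeats PairwiseSum/Min/Max until every lane holds the combined value: each step halves the number of distinct partial results, so log2(N) steps suffice, and full 128-bit vectors are first folded into a half vector because Armv7 lacks full-width pairwise instructions. A scalar model for a 4-lane SumOfLanes (plain C++):

#include <cstdint>

// Step 1: Pairwise(v, v) -> {v0+v1, v2+v3, v0+v1, v2+v3}.
// Step 2: one more pairwise step leaves the total in every lane, which is
// exactly what *OfLanes promises.
static void SumOfLanes4Sketch(const int32_t v[4], int32_t out[4]) {
  const int32_t p0 = v[0] + v[1];
  const int32_t p1 = v[2] + v[3];
  const int32_t total = p0 + p1;
  for (int i = 0; i < 4; ++i) out[i] = total;
}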
@@ -8562,14 +9049,22 @@ HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
   return nib & ((1ull << (d.MaxBytes() * 4)) - 1);
 }
 
-
-
+// Returns the lowest N for the BitsFromMask result.
+template <class D>
+constexpr uint64_t OnlyActive(D d, uint64_t bits) {
+  return (d.MaxBytes() >= 8) ? bits : (bits & ((1ull << d.MaxLanes()) - 1));
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   alignas(16) static constexpr uint8_t kSliceLanes[16] = {
       1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
   };
-  const
+  const RebindToUnsigned<D> du;
   const Vec128<uint8_t> values =
-      BitCast(du, VecFromMask(
+      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
 
 #if HWY_ARCH_ARM_A64
   // Can't vaddv - we need two separate bytes (16 bits).
@@ -8586,126 +9081,114 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
 #endif
 }
 
-template <
-
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   // Upper lanes of partial loads are undefined. OnlyActive will fix this if
   // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
   alignas(8) static constexpr uint8_t kSliceLanes[8] = {1,    2,    4,    8,
                                                         0x10, 0x20, 0x40, 0x80};
-  const DFromM<decltype(mask)> d;
   const RebindToUnsigned<decltype(d)> du;
-
-  const
+  using VU = VFromD<decltype(du)>;
+  const VU slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
+  const VU values = BitCast(du, VecFromMask(d, mask)) & slice;
 
 #if HWY_ARCH_ARM_A64
-  return vaddv_u8(values.raw);
+  return detail::OnlyActive(d, vaddv_u8(values.raw));
 #else
   const uint16x4_t x2 = vpaddl_u8(values.raw);
   const uint32x2_t x4 = vpaddl_u16(x2);
   const uint64x1_t x8 = vpaddl_u32(x4);
-  return vget_lane_u64(x8, 0);
+  return detail::OnlyActive(d, vget_lane_u64(x8, 0));
 #endif
 }
 
-template <
-
+template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   alignas(16) static constexpr uint16_t kSliceLanes[8] = {
       1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80};
-  const
-  const Full128<uint16_t> du;
+  const RebindToUnsigned<D> du;
   const Vec128<uint16_t> values =
       BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
 #if HWY_ARCH_ARM_A64
-  return vaddvq_u16(values.raw);
+  return detail::OnlyActive(d, vaddvq_u16(values.raw));
 #else
   const uint32x4_t x2 = vpaddlq_u16(values.raw);
   const uint64x2_t x4 = vpaddlq_u32(x2);
-  return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
+  return detail::OnlyActive(d, vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1));
 #endif
 }
 
-template <
-
+template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   // Upper lanes of partial loads are undefined. OnlyActive will fix this if
   // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
   alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
-  const DFromM<decltype(mask)> d;
   const RebindToUnsigned<decltype(d)> du;
-
-  const
+  using VU = VFromD<decltype(du)>;
+  const VU slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
+  const VU values = BitCast(du, VecFromMask(d, mask)) & slice;
 #if HWY_ARCH_ARM_A64
-  return vaddv_u16(values.raw);
+  return detail::OnlyActive(d, vaddv_u16(values.raw));
 #else
   const uint32x2_t x2 = vpaddl_u16(values.raw);
   const uint64x1_t x4 = vpaddl_u32(x2);
-  return vget_lane_u64(x4, 0);
+  return detail::OnlyActive(d, vget_lane_u64(x4, 0));
 #endif
 }
 
-template <
-
+template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
-  const
-  const Full128<uint32_t> du;
+  const RebindToUnsigned<D> du;
   const Vec128<uint32_t> values =
       BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
 #if HWY_ARCH_ARM_A64
-  return vaddvq_u32(values.raw);
+  return detail::OnlyActive(d, vaddvq_u32(values.raw));
 #else
   const uint64x2_t x2 = vpaddlq_u32(values.raw);
-  return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
+  return detail::OnlyActive(d, vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1));
 #endif
 }
 
-template <
-
+template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   // Upper lanes of partial loads are undefined. OnlyActive will fix this if
   // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
   alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2};
-  const DFromM<decltype(mask)> d;
   const RebindToUnsigned<decltype(d)> du;
-
-  const
+  using VU = VFromD<decltype(du)>;
+  const VU slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
+  const VU values = BitCast(du, VecFromMask(d, mask)) & slice;
 #if HWY_ARCH_ARM_A64
-  return vaddv_u32(values.raw);
+  return detail::OnlyActive(d, vaddv_u32(values.raw));
 #else
   const uint64x1_t x2 = vpaddl_u32(values.raw);
-  return vget_lane_u64(x2, 0);
+  return detail::OnlyActive(d, vget_lane_u64(x2, 0));
 #endif
 }
 
-template <
-
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2};
-  const
-  const Full128<uint64_t> du;
+  const RebindToUnsigned<decltype(d)> du;
   const Vec128<uint64_t> values =
-      BitCast(du, VecFromMask(d,
+      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
 #if HWY_ARCH_ARM_A64
-  return vaddvq_u64(values.raw);
+  return detail::OnlyActive(d, vaddvq_u64(values.raw));
 #else
-  return
+  return detail::OnlyActive(
+      d, vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1));
 #endif
 }
 
-template <
-
-  const
-  const
-  const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
+  const RebindToUnsigned<decltype(d)> du;
+  const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, mask)) & Set(du, 1);
   return vget_lane_u64(values.raw, 0);
 }
 
-
-template <typename T, size_t N>
-constexpr uint64_t OnlyActive(uint64_t bits) {
-  return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) {
-  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
-}
+namespace detail {
 
 // Returns number of lanes whose mask is set.
 //
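BitsFromMask above ANDs the all-ones/all-zeros mask lanes with per-lane powers of two (kSliceLanes) and then sums across lanes, so lane i contributes bit i; OnlyActive clears bits beyond the active lane count because upper lanes of partial vectors are undefined. A scalar model (plain C++):

#include <cstdint>

// Lane i contributes (1 << i) when its mask is set; the final AND is the
// OnlyActive step for vectors with fewer than 64 lanes.
static uint64_t BitsFromMaskSketch(const bool lanes[], unsigned n) {
  uint64_t bits = 0;
  for (unsigned i = 0; i < n; ++i) {
    if (lanes[i]) bits |= 1ull << i;
  }
  return n >= 64 ? bits : (bits & ((1ull << n) - 1));
}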
@@ -8825,7 +9308,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
 // `p` points to at least 8 writable bytes.
 template <class D>
 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);
   const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
   CopyBytes<kNumBytes>(&mask_bits, bits);
   return kNumBytes;
@@ -9313,7 +9796,8 @@ HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
 // General case, 2 or 4 byte lanes
 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
-
+  const DFromV<decltype(v)> d;
+  return detail::Compress(v, BitsFromMask(d, mask));
 }
 
 // Single lane: no-op
@@ -9337,12 +9821,13 @@ HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
 // General case, 2 or 4 byte lanes
 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+  const DFromV<decltype(v)> d;
   // For partial vectors, we cannot pull the Not() into the table because
   // BitsFromMask clears the upper bits.
   if (N < 16 / sizeof(T)) {
-    return detail::Compress(v,
+    return detail::Compress(v, BitsFromMask(d, Not(mask)));
   }
-  return detail::CompressNot(v,
+  return detail::CompressNot(v, BitsFromMask(d, mask));
 }
 
 // ------------------------------ CompressBlocksNot
@@ -9370,7 +9855,7 @@ HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v,
 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                              TFromD<D>* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);
   StoreU(detail::Compress(v, mask_bits), d, unaligned);
   return PopCount(mask_bits);
 }
@@ -9380,7 +9865,7 @@ template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
   const RebindToUnsigned<decltype(d)> du;  // so we can support fp16/bf16
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, m);
   const size_t count = PopCount(mask_bits);
   const MFromD<D> store_mask = RebindMask(d, FirstN(du, count));
   const VFromD<decltype(du)> compressed =
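These Compress*Store changes all follow the same contract: the mask bits select lanes, Compress packs the selected lanes to the front, and PopCount of the mask bits is the number of elements written. A scalar sketch of that contract (plain C++; names are illustrative):

#include <cstddef>
#include <cstdint>

// Packs lanes whose mask bit is set to the front of `out`; the return value
// equals PopCount(mask_bits), as used by CompressStore above.
static size_t CompressStoreSketch(const float* v, uint64_t mask_bits, size_t n,
                                  float* out) {
  size_t count = 0;
  for (size_t i = 0; i < n; ++i) {
    if ((mask_bits >> i) & 1) out[count++] = v[i];
  }
  return count;
}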
@@ -9420,17 +9905,22 @@ namespace detail {
 #define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
 
 #if HWY_ARCH_ARM_A64
-#define HWY_IF_LOAD_INT(D)
-
+#define HWY_IF_LOAD_INT(D) \
+  HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D)
+#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
+  HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)      \
+  HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
 #else
-// Exclude 64x2 and f64x1, which are only supported on aarch64
+// Exclude 64x2 and f64x1, which are only supported on aarch64; also exclude any
+// emulated types.
 #define HWY_IF_LOAD_INT(D)                                              \
-  HWY_IF_V_SIZE_GT_D(D, 4),
+  HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),              \
   hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* =  \
       nullptr
 #define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
   HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)    \
   HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)   \
+  HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)      \
   HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)    \
   HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
   HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
@@ -9480,7 +9970,8 @@ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
 }
 
 // <= 32 bits: avoid loading more than N bytes by copying to buffer
-template <class D, HWY_IF_V_SIZE_LE_D(D, 4),
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
+          typename T = TFromD<D>>
 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1) {
   // The smallest vector registers are 64-bits and we want space for two.
@@ -9494,7 +9985,8 @@ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
 
 #if HWY_ARCH_ARM_V7
 // 64x2: split into two 64x1
-template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)
+template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
+          HWY_NEON_IF_NOT_EMULATED_D(D)>
 HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
                               Vec128<T>& v1) {
   const Half<decltype(d)> dh;
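LoadInterleaved2 deinterleaves array-of-structures data into one vector per field (what vld2 does in hardware); the new HWY_NEON_IF_NOT_EMULATED_D guard simply keeps emulated lane types out of these NEON-specific paths. A scalar model of the memory layout (plain C++):

#include <cstddef>

// Reads n {x, y} pairs and splits them into two planes, mirroring the
// lane-by-lane behavior of the vld2-based overloads above.
static void LoadInterleaved2Sketch(const float* interleaved, size_t n,
                                   float* v0, float* v1) {
  for (size_t i = 0; i < n; ++i) {
    v0[i] = interleaved[2 * i + 0];
    v1[i] = interleaved[2 * i + 1];
  }
}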
@@ -9519,7 +10011,8 @@ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
 }
 
 // <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <class D, HWY_IF_V_SIZE_LE_D(D, 4),
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
+          typename T = TFromD<D>>
 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
   // The smallest vector registers are 64-bits and we want space for three.
@@ -9534,7 +10027,8 @@ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
 
 #if HWY_ARCH_ARM_V7
 // 64x2: split into two 64x1
-template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)
+template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
+          HWY_NEON_IF_NOT_EMULATED_D(D)>
 HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                               Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
   const Half<decltype(d)> dh;
@@ -9562,7 +10056,8 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
 }
 
 // <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <class D, HWY_IF_V_SIZE_LE_D(D, 4),
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
+          typename T = TFromD<D>>
 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                               VFromD<D>& v3) {
@@ -9578,7 +10073,8 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
 
 #if HWY_ARCH_ARM_V7
 // 64x2: split into two 64x1
-template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)
+template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
+          HWY_NEON_IF_NOT_EMULATED_D(D)>
 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                               Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
                               Vec128<T>& v3) {
@@ -9605,17 +10101,22 @@ namespace detail {
 #define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
 
 #if HWY_ARCH_ARM_A64
-#define HWY_IF_STORE_INT(D)
-
+#define HWY_IF_STORE_INT(D) \
+  HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D)
+#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
+  HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)       \
+  HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
 #else
-// Exclude 64x2 and f64x1, which are only supported on aarch64
+// Exclude 64x2 and f64x1, which are only supported on aarch64; also exclude any
+// emulated types.
 #define HWY_IF_STORE_INT(D)                                             \
-  HWY_IF_V_SIZE_GT_D(D, 4),
+  HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),              \
   hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* =  \
       nullptr
 #define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
   HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)     \
   HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)    \
+  HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)       \
   HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)     \
   HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)  \
   HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
@@ -9650,7 +10151,8 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
 }
 
 // <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <class D, HWY_IF_V_SIZE_LE_D(D, 4),
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
+          typename T = TFromD<D>>
 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   alignas(16) T buf[2 * 8 / sizeof(T)];
@@ -9661,7 +10163,8 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
 
 #if HWY_ARCH_ARM_V7
 // 64x2: split into two 64x1
-template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)
+template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
+          HWY_NEON_IF_NOT_EMULATED_D(D)>
 HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   const Half<decltype(d)> dh;
@@ -9682,7 +10185,8 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
 }
 
 // <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <class D, HWY_IF_V_SIZE_LE_D(D, 4),
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
+          typename T = TFromD<D>>
 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                                T* HWY_RESTRICT unaligned) {
   alignas(16) T buf[3 * 8 / sizeof(T)];
@@ -9693,7 +10197,8 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
 
 #if HWY_ARCH_ARM_V7
 // 64x2: split into two 64x1
-template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)
+template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
+          HWY_NEON_IF_NOT_EMULATED_D(D)>
 HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
                                T* HWY_RESTRICT unaligned) {
   const Half<decltype(d)> dh;
@@ -9714,7 +10219,8 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
 }
 
 // <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <class D, HWY_IF_V_SIZE_LE_D(D, 4),
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
+          typename T = TFromD<D>>
 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                                VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
   alignas(16) T buf[4 * 8 / sizeof(T)];
@@ -9725,7 +10231,8 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
 
 #if HWY_ARCH_ARM_V7
 // 64x2: split into two 64x1
-template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)
+template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
+          HWY_NEON_IF_NOT_EMULATED_D(D)>
 HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
                                Vec128<T> v3, D d, T* HWY_RESTRICT unaligned) {
   const Half<decltype(d)> dh;
@@ -9740,6 +10247,9 @@ HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
 
 #undef HWY_IF_STORE_INT
 
+// Fall back on generic Load/StoreInterleaved[234] for any emulated types.
+// Requires HWY_GENERIC_IF_EMULATED_D mirrors HWY_NEON_IF_EMULATED_D.
+
 // ------------------------------ Additional mask logical operations
 template <class T>
 HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
@@ -10066,6 +10576,7 @@ namespace detail {  // for code folding
 #undef HWY_NEON_DEF_FUNCTION_UINTS
 #undef HWY_NEON_EVAL
 #undef HWY_NEON_IF_EMULATED_D
+#undef HWY_NEON_IF_NOT_EMULATED_D
 }  // namespace detail
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)