@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Files changed (55)
  1. package/include/ffi.h +3 -3
  2. package/include/harfbuzz/hb-deprecated.h +4 -4
  3. package/include/harfbuzz/hb-font.h +120 -9
  4. package/include/harfbuzz/hb-version.h +3 -3
  5. package/include/hwy/abort.h +2 -19
  6. package/include/hwy/aligned_allocator.h +11 -7
  7. package/include/hwy/auto_tune.h +504 -0
  8. package/include/hwy/base.h +425 -104
  9. package/include/hwy/cache_control.h +16 -0
  10. package/include/hwy/detect_compiler_arch.h +32 -1
  11. package/include/hwy/detect_targets.h +251 -67
  12. package/include/hwy/foreach_target.h +35 -0
  13. package/include/hwy/highway.h +185 -76
  14. package/include/hwy/nanobenchmark.h +1 -19
  15. package/include/hwy/ops/arm_neon-inl.h +969 -458
  16. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  17. package/include/hwy/ops/emu128-inl.h +97 -11
  18. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  19. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  20. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  21. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  22. package/include/hwy/ops/rvv-inl.h +546 -51
  23. package/include/hwy/ops/scalar-inl.h +77 -22
  24. package/include/hwy/ops/set_macros-inl.h +138 -17
  25. package/include/hwy/ops/shared-inl.h +50 -10
  26. package/include/hwy/ops/wasm_128-inl.h +137 -92
  27. package/include/hwy/ops/x86_128-inl.h +773 -214
  28. package/include/hwy/ops/x86_256-inl.h +712 -255
  29. package/include/hwy/ops/x86_512-inl.h +429 -753
  30. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  31. package/include/hwy/per_target.h +2 -1
  32. package/include/hwy/profiler.h +622 -486
  33. package/include/hwy/targets.h +62 -20
  34. package/include/hwy/timer-inl.h +8 -160
  35. package/include/hwy/timer.h +170 -3
  36. package/include/hwy/x86_cpuid.h +81 -0
  37. package/include/libheif/heif_cxx.h +25 -5
  38. package/include/libheif/heif_regions.h +5 -5
  39. package/include/libheif/heif_version.h +2 -2
  40. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  41. package/include/libxml2/libxml/xmlversion.h +4 -4
  42. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  43. package/include/pango-1.0/pango/pango-features.h +3 -3
  44. package/include/pango-1.0/pango/pango-font.h +30 -0
  45. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  46. package/include/pixman-1/pixman-version.h +2 -2
  47. package/include/webp/decode.h +11 -2
  48. package/include/webp/demux.h +2 -0
  49. package/include/webp/encode.h +2 -0
  50. package/include/webp/mux_types.h +1 -0
  51. package/include/webp/sharpyuv/sharpyuv.h +1 -1
  52. package/include/webp/types.h +2 -2
  53. package/include/zlib.h +3 -3
  54. package/package.json +1 -1
  55. package/versions.json +11 -11
@@ -21,6 +21,7 @@
  // Arm NEON intrinsics are documented at:
  // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
 
+ #include "hwy/base.h"
  #include "hwy/ops/shared-inl.h"
 
  HWY_DIAGNOSTICS(push)
@@ -141,29 +142,6 @@ namespace detail { // for code folding and Raw128
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
 
- // Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
- #undef HWY_NEON_HAVE_BFLOAT16
- #if HWY_HAVE_SCALAR_BF16_TYPE && \
- ((HWY_TARGET == HWY_NEON_BF16 && \
- (!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
- defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
- #define HWY_NEON_HAVE_BFLOAT16 1
- #else
- #define HWY_NEON_HAVE_BFLOAT16 0
- #endif
-
- // HWY_NEON_HAVE_F32_TO_BF16C is defined if NEON vcvt_bf16_f32 and
- // vbfdot_f32 are available, even if the __bf16 type is disabled due to
- // GCC/Clang bugs.
- #undef HWY_NEON_HAVE_F32_TO_BF16C
- #if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
- (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
- (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
- #define HWY_NEON_HAVE_F32_TO_BF16C 1
- #else
- #define HWY_NEON_HAVE_F32_TO_BF16C 0
- #endif
-
  // bfloat16_t
  #if HWY_NEON_HAVE_BFLOAT16
  #define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
@@ -194,10 +172,16 @@ namespace detail { // for code folding and Raw128
  // Enable generic functions for whichever of (f16, bf16) are not supported.
  #if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
  #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+ #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+ #define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_SPECIAL_FLOAT_D(D)
  #elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
  #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
+ #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_F16_D(D)
+ #define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_F16_D(D)
  #elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
  #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+ #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+ #define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
  #elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
  // NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
  // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
@@ -205,6 +189,9 @@ namespace detail { // for code folding and Raw128
  // SFINAE to occur instead of a hard error due to a dependency on the D template
  // argument
  #define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
+ #define HWY_GENERIC_IF_EMULATED_D(D) \
+ hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
+ #define HWY_NEON_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
  #else
  #error "Logic error, handled all four cases"
  #endif
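
The NOTE above relies on a standard SFINAE idiom: a constraint that can never hold is only legal if it depends on the template parameter. A minimal standalone sketch of the same idea, using std:: equivalents rather than Highway's own helpers:

    #include <type_traits>

    // A non-dependent false is rejected outright: std::enable_if_t<false> has
    // no ::type and does not depend on D, so it is diagnosed at declaration
    // time even if the overload is never called.
    // template <class D, std::enable_if_t<false>* = nullptr>
    // void OnlyEmulated(D) {}

    // A condition that depends on D is only evaluated during substitution, so
    // this overload silently drops out of the candidate set instead (SFINAE).
    template <class D, std::enable_if_t<!std::is_same_v<D, D>>* = nullptr>
    void OnlyEmulated(D) {}

    template <class D>
    void OnlyEmulated(D) {}  // the overload that is actually selected

    int main() {
      OnlyEmulated(0);  // resolves to the unconstrained overload
      return 0;
    }
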
@@ -870,10 +857,10 @@ using Vec16 = Vec128<T, 2 / sizeof(T)>;
  // FF..FF or 0.
  template <typename T, size_t N = 16 / sizeof(T)>
  class Mask128 {
+ public:
  // Arm C Language Extensions return and expect unsigned type.
  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;

- public:
  using PrivateT = T; // only for DFromM
  static constexpr size_t kPrivateN = N; // only for DFromM

@@ -897,6 +884,249 @@ using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
897
884
  template <class V>
898
885
  using TFromV = typename V::PrivateT;
899
886
 
887
+ // TODO(janwas): ForDemoteVectors, in convert_test and demote_test, appear to
888
+ // instantiate this with D = double x 4. The cause is unknown. Previously,
889
+ // defining this in terms of Set rejected that via SFINAE because only
890
+ // V_SIZE = 16 and V_SIZE <= 8 overloads were defined. As a workaround,
891
+ // truncate the lane count to 128 bits.
892
+ template <class D>
893
+ using VFromD =
894
+ Vec128<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), MaxLanes(D()))>;
895
+
896
+ // ------------------------------ BitCast
897
+
898
+ namespace detail {
899
+
900
+ // Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
901
+ // vreinterpret*_u8_*() set of functions.
902
+ #define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
903
+ #define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
904
+ Vec128<uint8_t, size * sizeof(type##_t)>
905
+ #define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
906
+ #define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
907
+
908
+ // Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
909
+ template <size_t N>
910
+ HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
911
+ return v;
912
+ }
913
+
914
+ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
915
+ HWY_CAST_TO_U8)
916
+ HWY_NEON_DEF_FUNCTION_BFLOAT_16(BitCastToByte, vreinterpret, _u8_,
917
+ HWY_CAST_TO_U8)
918
+
919
+ HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
920
+ HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
921
+ HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
922
+ HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
923
+
924
+ #if !HWY_HAVE_FLOAT16
925
+ #if HWY_NEON_HAVE_F16C
926
+ HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
927
+ HWY_CAST_TO_U8)
928
+ #else
929
+ template <size_t N>
930
+ HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
931
+ return BitCastToByte(Vec128<uint16_t, N>(v.raw));
932
+ }
933
+ #endif // HWY_NEON_HAVE_F16C
934
+ #endif // !HWY_HAVE_FLOAT16
935
+
936
+ #if !HWY_NEON_HAVE_BFLOAT16
937
+ template <size_t N>
938
+ HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
939
+ return BitCastToByte(Vec128<uint16_t, N>(v.raw));
940
+ }
941
+ #endif // !HWY_NEON_HAVE_BFLOAT16
942
+
943
+ #undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
944
+ #undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
945
+ #undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
946
+ #undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
947
+
948
+ template <class D, HWY_IF_U8_D(D)>
949
+ HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) {
950
+ return v;
951
+ }
952
+
953
+ // 64-bit or less:
954
+
955
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
956
+ HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
957
+ VFromD<RebindToUnsigned<D>> v) {
958
+ return VFromD<D>(vreinterpret_s8_u8(v.raw));
959
+ }
960
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
961
+ HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
962
+ VFromD<Repartition<uint8_t, D>> v) {
963
+ return VFromD<D>(vreinterpret_u16_u8(v.raw));
964
+ }
965
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
966
+ HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
967
+ VFromD<Repartition<uint8_t, D>> v) {
968
+ return VFromD<D>(vreinterpret_s16_u8(v.raw));
969
+ }
970
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
971
+ HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
972
+ VFromD<Repartition<uint8_t, D>> v) {
973
+ return VFromD<D>(vreinterpret_u32_u8(v.raw));
974
+ }
975
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
976
+ HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
977
+ VFromD<Repartition<uint8_t, D>> v) {
978
+ return VFromD<D>(vreinterpret_s32_u8(v.raw));
979
+ }
980
+
981
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
982
+ HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
983
+ return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
984
+ }
985
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
986
+ HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
987
+ return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
988
+ }
989
+
990
+ // Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
991
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
992
+ HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
993
+ #if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
994
+ return VFromD<D>(vreinterpret_f16_u8(v.raw));
995
+ #else
996
+ const RebindToUnsigned<D> du;
997
+ return VFromD<D>(BitCastFromByte(du, v).raw);
998
+ #endif
999
+ }
1000
+
1001
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
1002
+ HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
1003
+ #if HWY_NEON_HAVE_BFLOAT16
1004
+ return VFromD<D>(vreinterpret_bf16_u8(v.raw));
1005
+ #else
1006
+ const RebindToUnsigned<D> du;
1007
+ return VFromD<D>(BitCastFromByte(du, v).raw);
1008
+ #endif
1009
+ }
1010
+
1011
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
1012
+ HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1013
+ VFromD<Repartition<uint8_t, D>> v) {
1014
+ return VFromD<D>(vreinterpret_f32_u8(v.raw));
1015
+ }
1016
+
1017
+ #if HWY_HAVE_FLOAT64
1018
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
1019
+ HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
1020
+ return Vec64<double>(vreinterpret_f64_u8(v.raw));
1021
+ }
1022
+ #endif // HWY_HAVE_FLOAT64
1023
+
1024
+ // 128-bit full:
1025
+
1026
+ template <class D, HWY_IF_I8_D(D)>
1027
+ HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1028
+ return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
1029
+ }
1030
+ template <class D, HWY_IF_U16_D(D)>
1031
+ HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1032
+ return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
1033
+ }
1034
+ template <class D, HWY_IF_I16_D(D)>
1035
+ HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1036
+ return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
1037
+ }
1038
+ template <class D, HWY_IF_U32_D(D)>
1039
+ HWY_INLINE Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1040
+ return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
1041
+ }
1042
+ template <class D, HWY_IF_I32_D(D)>
1043
+ HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1044
+ return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
1045
+ }
1046
+ template <class D, HWY_IF_U64_D(D)>
1047
+ HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1048
+ return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
1049
+ }
1050
+ template <class D, HWY_IF_I64_D(D)>
1051
+ HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1052
+ return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
1053
+ }
1054
+
1055
+ template <class D, HWY_IF_F32_D(D)>
1056
+ HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1057
+ return Vec128<float>(vreinterpretq_f32_u8(v.raw));
1058
+ }
1059
+
1060
+ #if HWY_HAVE_FLOAT64
1061
+ template <class D, HWY_IF_F64_D(D)>
1062
+ HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1063
+ return Vec128<double>(vreinterpretq_f64_u8(v.raw));
1064
+ }
1065
+ #endif // HWY_HAVE_FLOAT64
1066
+
1067
+ // Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
1068
+ template <class D, HWY_IF_F16_D(D)>
1069
+ HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
1070
+ #if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
1071
+ return VFromD<D>(vreinterpretq_f16_u8(v.raw));
1072
+ #else
1073
+ return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
1074
+ #endif
1075
+ }
1076
+
1077
+ template <class D, HWY_IF_BF16_D(D)>
1078
+ HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
1079
+ #if HWY_NEON_HAVE_BFLOAT16
1080
+ return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
1081
+ #else
1082
+ return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
1083
+ #endif
1084
+ }
1085
+
1086
+ } // namespace detail
1087
+
1088
+ template <class D, class FromT>
1089
+ HWY_API VFromD<D> BitCast(D d,
1090
+ Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
1091
+ return detail::BitCastFromByte(d, detail::BitCastToByte(v));
1092
+ }
1093
+
1094
+ // ------------------------------ ResizeBitCast
1095
+
1096
+ // <= 8 byte vector to <= 8 byte vector
1097
+ template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
1098
+ HWY_IF_V_SIZE_LE_D(D, 8)>
1099
+ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1100
+ const Repartition<uint8_t, decltype(d)> du8;
1101
+ return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw});
1102
+ }
1103
+
1104
+ // 16-byte vector to 16-byte vector: same as BitCast
1105
+ template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
1106
+ HWY_IF_V_SIZE_D(D, 16)>
1107
+ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1108
+ return BitCast(d, v);
1109
+ }
1110
+
1111
+ // 16-byte vector to <= 8-byte vector
1112
+ template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
1113
+ HWY_IF_V_SIZE_LE_D(D, 8)>
1114
+ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1115
+ const DFromV<decltype(v)> d_from;
1116
+ const Half<decltype(d_from)> dh_from;
1117
+ return ResizeBitCast(d, LowerHalf(dh_from, v));
1118
+ }
1119
+
1120
+ // <= 8-bit vector to 16-byte vector
1121
+ template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
1122
+ HWY_IF_V_SIZE_D(D, 16)>
1123
+ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1124
+ const Full64<TFromV<FromV>> d_full64_from;
1125
+ const Full128<TFromV<FromV>> d_full128_from;
1126
+ return BitCast(d, Combine(d_full128_from, Zero(d_full64_from),
1127
+ ResizeBitCast(d_full64_from, v)));
1128
+ }
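
A minimal sketch of using the BitCast op defined above, assuming static dispatch with the bundled Highway headers on the include path: the lane bits are reinterpreted unchanged, only the element type of the vector changes.

    #include <cstdint>
    #include <cstdio>

    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    int main() {
      const hn::CappedTag<float, 4> df;
      const hn::RebindToUnsigned<decltype(df)> du;  // uint32_t, same lane count
      const auto v = hn::Set(df, 1.0f);
      // 1.0f reinterpreted as its IEEE-754 bit pattern.
      printf("%08X\n", hn::GetLane(hn::BitCast(du, v)));  // 3F800000
      return 0;
    }
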
1129
+
900
1130
  // ------------------------------ Set
901
1131
 
902
1132
  namespace detail {
@@ -913,16 +1143,26 @@ namespace detail {
913
1143
  #define HWY_NEON_BUILD_ARG_HWY_SET t
914
1144
 
915
1145
  HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)
916
- #if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C
1146
+ #if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C && HWY_HAVE_SCALAR_F16_TYPE
917
1147
  HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET)
918
1148
  #endif
919
1149
  HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)
920
1150
 
921
- template <class D, HWY_NEON_IF_EMULATED_D(D)>
922
- HWY_API Vec128<TFromD<D>, MaxLanes(D())> NativeSet(D d, TFromD<D> t) {
1151
+ #if !HWY_NEON_HAVE_F16C || !HWY_HAVE_SCALAR_F16_TYPE
1152
+ template <class D, HWY_IF_F16_D(D)>
1153
+ HWY_API VFromD<D> NativeSet(D d, TFromD<D> t) {
1154
+ const uint16_t tu = BitCastScalar<uint16_t>(t);
1155
+ return BitCast(d, Set(RebindToUnsigned<D>(), tu));
1156
+ }
1157
+ #endif
1158
+
1159
+ #if !HWY_NEON_HAVE_BFLOAT16
1160
+ template <class D, HWY_IF_BF16_D(D)>
1161
+ HWY_API VFromD<D> NativeSet(D d, TFromD<D> t) {
923
1162
  const uint16_t tu = BitCastScalar<uint16_t>(t);
924
- return Vec128<TFromD<D>, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
1163
+ return BitCast(d, Set(RebindToUnsigned<D>(), tu));
925
1164
  }
1165
+ #endif
926
1166
 
927
1167
  #undef HWY_NEON_BUILD_TPL_HWY_SET
928
1168
  #undef HWY_NEON_BUILD_RET_HWY_SET
@@ -931,25 +1171,21 @@ HWY_API Vec128<TFromD<D>, MaxLanes(D())> NativeSet(D d, TFromD<D> t) {
931
1171
 
932
1172
  } // namespace detail
933
1173
 
934
- // Full vector. Cannot yet use VFromD because that is defined in terms of Set.
1174
+ // Full vector.
935
1175
  // Do not use a typename T = TFromD<D> argument because T will be deduced from
936
1176
  // the actual argument type, which can differ from TFromD<D>.
937
1177
  template <class D, HWY_IF_V_SIZE_D(D, 16), typename T>
938
- HWY_INLINE Vec128<TFromD<D>> Set(D /* tag */, T t) {
1178
+ HWY_INLINE VFromD<D> Set(D /* tag */, T t) {
939
1179
  return detail::NativeSet(Full128<TFromD<D>>(), static_cast<TFromD<D>>(t));
940
1180
  }
941
1181
 
942
1182
  // Partial vector: create 64-bit and return wrapper.
943
1183
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T>
944
- HWY_API Vec128<TFromD<D>, MaxLanes(D())> Set(D /* tag */, T t) {
1184
+ HWY_API VFromD<D> Set(D /* tag */, T t) {
945
1185
  const Full64<TFromD<D>> dfull;
946
- return Vec128<TFromD<D>, MaxLanes(D())>(
947
- detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw);
1186
+ return VFromD<D>(detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw);
948
1187
  }
949
1188
 
950
- template <class D>
951
- using VFromD = decltype(Set(D(), TFromD<D>()));
952
-
953
1189
  template <class D>
954
1190
  HWY_API VFromD<D> Zero(D d) {
955
1191
  // Default ctor also works for bfloat16_t and float16_t.
@@ -1201,7 +1437,8 @@ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1201
1437
  BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
1202
1438
  }
1203
1439
 
1204
- #if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
1440
+ #if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C && \
1441
+ HWY_HAVE_SCALAR_F16_TYPE
1205
1442
  template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
1206
1443
  HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
1207
1444
  TFromD<D> t2, TFromD<D> t3,
@@ -1393,240 +1630,6 @@ HWY_API Vec128<double> Combine(D /* tag */, Vec64<double> hi,
1393
1630
  }
1394
1631
  #endif // HWY_HAVE_FLOAT64
1395
1632
 
1396
- // ------------------------------ BitCast
1397
-
1398
- namespace detail {
1399
-
1400
- // Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
1401
- // vreinterpret*_u8_*() set of functions.
1402
- #define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
1403
- #define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
1404
- Vec128<uint8_t, size * sizeof(type##_t)>
1405
- #define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
1406
- #define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
1407
-
1408
- // Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
1409
- template <size_t N>
1410
- HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
1411
- return v;
1412
- }
1413
-
1414
- HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
1415
- HWY_CAST_TO_U8)
1416
- HWY_NEON_DEF_FUNCTION_BFLOAT_16(BitCastToByte, vreinterpret, _u8_,
1417
- HWY_CAST_TO_U8)
1418
-
1419
- HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
1420
- HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
1421
- HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
1422
- HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
1423
-
1424
- #if !HWY_HAVE_FLOAT16
1425
- #if HWY_NEON_HAVE_F16C
1426
- HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
1427
- HWY_CAST_TO_U8)
1428
- #else
1429
- template <size_t N>
1430
- HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
1431
- return BitCastToByte(Vec128<uint16_t, N>(v.raw));
1432
- }
1433
- #endif // HWY_NEON_HAVE_F16C
1434
- #endif // !HWY_HAVE_FLOAT16
1435
-
1436
- #if !HWY_NEON_HAVE_BFLOAT16
1437
- template <size_t N>
1438
- HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
1439
- return BitCastToByte(Vec128<uint16_t, N>(v.raw));
1440
- }
1441
- #endif // !HWY_NEON_HAVE_BFLOAT16
1442
-
1443
- #undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
1444
- #undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
1445
- #undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
1446
- #undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
1447
-
1448
- template <class D, HWY_IF_U8_D(D)>
1449
- HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) {
1450
- return v;
1451
- }
1452
-
1453
- // 64-bit or less:
1454
-
1455
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
1456
- HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1457
- VFromD<RebindToUnsigned<D>> v) {
1458
- return VFromD<D>(vreinterpret_s8_u8(v.raw));
1459
- }
1460
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
1461
- HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1462
- VFromD<Repartition<uint8_t, D>> v) {
1463
- return VFromD<D>(vreinterpret_u16_u8(v.raw));
1464
- }
1465
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
1466
- HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1467
- VFromD<Repartition<uint8_t, D>> v) {
1468
- return VFromD<D>(vreinterpret_s16_u8(v.raw));
1469
- }
1470
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
1471
- HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1472
- VFromD<Repartition<uint8_t, D>> v) {
1473
- return VFromD<D>(vreinterpret_u32_u8(v.raw));
1474
- }
1475
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
1476
- HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1477
- VFromD<Repartition<uint8_t, D>> v) {
1478
- return VFromD<D>(vreinterpret_s32_u8(v.raw));
1479
- }
1480
-
1481
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
1482
- HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
1483
- return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
1484
- }
1485
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
1486
- HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
1487
- return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
1488
- }
1489
-
1490
- // Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
1491
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
1492
- HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
1493
- #if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
1494
- return VFromD<D>(vreinterpret_f16_u8(v.raw));
1495
- #else
1496
- const RebindToUnsigned<D> du;
1497
- return VFromD<D>(BitCastFromByte(du, v).raw);
1498
- #endif
1499
- }
1500
-
1501
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
1502
- HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
1503
- #if HWY_NEON_HAVE_BFLOAT16
1504
- return VFromD<D>(vreinterpret_bf16_u8(v.raw));
1505
- #else
1506
- const RebindToUnsigned<D> du;
1507
- return VFromD<D>(BitCastFromByte(du, v).raw);
1508
- #endif
1509
- }
1510
-
1511
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
1512
- HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
1513
- VFromD<Repartition<uint8_t, D>> v) {
1514
- return VFromD<D>(vreinterpret_f32_u8(v.raw));
1515
- }
1516
-
1517
- #if HWY_HAVE_FLOAT64
1518
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
1519
- HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
1520
- return Vec64<double>(vreinterpret_f64_u8(v.raw));
1521
- }
1522
- #endif // HWY_HAVE_FLOAT64
1523
-
1524
- // 128-bit full:
1525
-
1526
- template <class D, HWY_IF_I8_D(D)>
1527
- HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1528
- return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
1529
- }
1530
- template <class D, HWY_IF_U16_D(D)>
1531
- HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1532
- return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
1533
- }
1534
- template <class D, HWY_IF_I16_D(D)>
1535
- HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1536
- return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
1537
- }
1538
- template <class D, HWY_IF_U32_D(D)>
1539
- HWY_INLINE Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1540
- return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
1541
- }
1542
- template <class D, HWY_IF_I32_D(D)>
1543
- HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1544
- return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
1545
- }
1546
- template <class D, HWY_IF_U64_D(D)>
1547
- HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1548
- return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
1549
- }
1550
- template <class D, HWY_IF_I64_D(D)>
1551
- HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1552
- return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
1553
- }
1554
-
1555
- template <class D, HWY_IF_F32_D(D)>
1556
- HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1557
- return Vec128<float>(vreinterpretq_f32_u8(v.raw));
1558
- }
1559
-
1560
- #if HWY_HAVE_FLOAT64
1561
- template <class D, HWY_IF_F64_D(D)>
1562
- HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
1563
- return Vec128<double>(vreinterpretq_f64_u8(v.raw));
1564
- }
1565
- #endif // HWY_HAVE_FLOAT64
1566
-
1567
- // Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
1568
- template <class D, HWY_IF_F16_D(D)>
1569
- HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
1570
- #if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
1571
- return VFromD<D>(vreinterpretq_f16_u8(v.raw));
1572
- #else
1573
- return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
1574
- #endif
1575
- }
1576
-
1577
- template <class D, HWY_IF_BF16_D(D)>
1578
- HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
1579
- #if HWY_NEON_HAVE_BFLOAT16
1580
- return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
1581
- #else
1582
- return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
1583
- #endif
1584
- }
1585
-
1586
- } // namespace detail
1587
-
1588
- template <class D, class FromT>
1589
- HWY_API VFromD<D> BitCast(D d,
1590
- Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
1591
- return detail::BitCastFromByte(d, detail::BitCastToByte(v));
1592
- }
1593
-
1594
- // ------------------------------ ResizeBitCast
1595
-
1596
- // <= 8 byte vector to <= 8 byte vector
1597
- template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
1598
- HWY_IF_V_SIZE_LE_D(D, 8)>
1599
- HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1600
- const Repartition<uint8_t, decltype(d)> du8;
1601
- return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw});
1602
- }
1603
-
1604
- // 16-byte vector to 16-byte vector: same as BitCast
1605
- template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
1606
- HWY_IF_V_SIZE_D(D, 16)>
1607
- HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1608
- return BitCast(d, v);
1609
- }
1610
-
1611
- // 16-byte vector to <= 8-byte vector
1612
- template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
1613
- HWY_IF_V_SIZE_LE_D(D, 8)>
1614
- HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1615
- const DFromV<decltype(v)> d_from;
1616
- const Half<decltype(d_from)> dh_from;
1617
- return ResizeBitCast(d, LowerHalf(dh_from, v));
1618
- }
1619
-
1620
- // <= 8-bit vector to 16-byte vector
1621
- template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
1622
- HWY_IF_V_SIZE_D(D, 16)>
1623
- HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
1624
- const Full64<TFromV<FromV>> d_full64_from;
1625
- const Full128<TFromV<FromV>> d_full128_from;
1626
- return BitCast(d, Combine(d_full128_from, Zero(d_full64_from),
1627
- ResizeBitCast(d_full64_from, v)));
1628
- }
1629
-
1630
1633
  // ------------------------------ GetLane
1631
1634
 
1632
1635
  namespace detail {
@@ -1940,10 +1943,74 @@ HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
1940
1943
  // ================================================== ARITHMETIC
1941
1944
 
1942
1945
  // ------------------------------ Addition
1943
- HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)
1946
+ HWY_NEON_DEF_FUNCTION_UINTS(operator+, vadd, _, 2)
1947
+ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator+, vadd, _, 2)
1948
+
1949
+ template <size_t N>
1950
+ HWY_API Vec128<int8_t, N> operator+(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1951
+ const DFromV<decltype(a)> d;
1952
+ const RebindToUnsigned<decltype(d)> du;
1953
+ return BitCast(d, BitCast(du, a) + BitCast(du, b));
1954
+ }
1955
+
1956
+ template <size_t N>
1957
+ HWY_API Vec128<int16_t, N> operator+(Vec128<int16_t, N> a,
1958
+ Vec128<int16_t, N> b) {
1959
+ const DFromV<decltype(a)> d;
1960
+ const RebindToUnsigned<decltype(d)> du;
1961
+ return BitCast(d, BitCast(du, a) + BitCast(du, b));
1962
+ }
1963
+
1964
+ template <size_t N>
1965
+ HWY_API Vec128<int32_t, N> operator+(Vec128<int32_t, N> a,
1966
+ Vec128<int32_t, N> b) {
1967
+ const DFromV<decltype(a)> d;
1968
+ const RebindToUnsigned<decltype(d)> du;
1969
+ return BitCast(d, BitCast(du, a) + BitCast(du, b));
1970
+ }
1971
+
1972
+ template <size_t N>
1973
+ HWY_API Vec128<int64_t, N> operator+(Vec128<int64_t, N> a,
1974
+ Vec128<int64_t, N> b) {
1975
+ const DFromV<decltype(a)> d;
1976
+ const RebindToUnsigned<decltype(d)> du;
1977
+ return BitCast(d, BitCast(du, a) + BitCast(du, b));
1978
+ }
1944
1979
 
1945
1980
  // ------------------------------ Subtraction
1946
- HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)
1981
+ HWY_NEON_DEF_FUNCTION_UINTS(operator-, vsub, _, 2)
1982
+ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator-, vsub, _, 2)
1983
+
1984
+ template <size_t N>
1985
+ HWY_API Vec128<int8_t, N> operator-(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1986
+ const DFromV<decltype(a)> d;
1987
+ const RebindToUnsigned<decltype(d)> du;
1988
+ return BitCast(d, BitCast(du, a) - BitCast(du, b));
1989
+ }
1990
+
1991
+ template <size_t N>
1992
+ HWY_API Vec128<int16_t, N> operator-(Vec128<int16_t, N> a,
1993
+ Vec128<int16_t, N> b) {
1994
+ const DFromV<decltype(a)> d;
1995
+ const RebindToUnsigned<decltype(d)> du;
1996
+ return BitCast(d, BitCast(du, a) - BitCast(du, b));
1997
+ }
1998
+
1999
+ template <size_t N>
2000
+ HWY_API Vec128<int32_t, N> operator-(Vec128<int32_t, N> a,
2001
+ Vec128<int32_t, N> b) {
2002
+ const DFromV<decltype(a)> d;
2003
+ const RebindToUnsigned<decltype(d)> du;
2004
+ return BitCast(d, BitCast(du, a) - BitCast(du, b));
2005
+ }
2006
+
2007
+ template <size_t N>
2008
+ HWY_API Vec128<int64_t, N> operator-(Vec128<int64_t, N> a,
2009
+ Vec128<int64_t, N> b) {
2010
+ const DFromV<decltype(a)> d;
2011
+ const RebindToUnsigned<decltype(d)> du;
2012
+ return BitCast(d, BitCast(du, a) - BitCast(du, b));
2013
+ }
1947
2014
 
1948
2015
  // ------------------------------ SumsOf8
1949
2016
 
@@ -2074,8 +2141,14 @@ HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedSub, vqsub, _, 2)
2074
2141
  // ------------------------------ Average
2075
2142
 
2076
2143
  // Returns (a + b + 1) / 2
2077
- HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
2078
- HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)
2144
+
2145
+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
2146
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI32
2147
+ #else
2148
+ #define HWY_NATIVE_AVERAGE_ROUND_UI32
2149
+ #endif
2150
+
2151
+ HWY_NEON_DEF_FUNCTION_UI_8_16_32(AverageRound, vrhadd, _, 2)
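
For reference, the rounded average (a + b + 1) / 2 rounds halves up, e.g. AverageRound(5, 6) = 6 rather than 5. A scalar sketch (a hypothetical helper, not part of this header) that mirrors how vrhadd avoids forming the full sum in the lane type:

    #include <cstdint>
    #include <cstdio>

    // (a | b) - ((a ^ b) >> 1) equals (a + b + 1) / 2 but never widens past
    // the lane type, which is the property vrhadd provides per lane.
    static uint8_t AverageRoundScalar(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((a | b) - ((a ^ b) >> 1));
    }

    int main() {
      printf("%u\n", static_cast<unsigned>(AverageRoundScalar(5, 6)));      // 6
      printf("%u\n", static_cast<unsigned>(AverageRoundScalar(255, 254)));  // 255, no 8-bit overflow
      return 0;
    }
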
2079
2152
 
2080
2153
  // ------------------------------ Neg
2081
2154
 
@@ -2143,6 +2216,12 @@ HWY_API Vec128<int64_t> SaturatedNeg(const Vec128<int64_t> v) {
2143
2216
 
2144
2217
  // ------------------------------ ShiftLeft
2145
2218
 
2219
+ #ifdef HWY_NATIVE_ROUNDING_SHR
2220
+ #undef HWY_NATIVE_ROUNDING_SHR
2221
+ #else
2222
+ #define HWY_NATIVE_ROUNDING_SHR
2223
+ #endif
2224
+
2146
2225
  // Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
2147
2226
  #pragma push_macro("HWY_NEON_DEF_FUNCTION")
2148
2227
  #undef HWY_NEON_DEF_FUNCTION
@@ -2158,6 +2237,8 @@ HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)
2158
2237
 
2159
2238
  HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
2160
2239
  HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
2240
+ HWY_NEON_DEF_FUNCTION_UINTS(RoundingShiftRight, vrshr, _n_, ignored)
2241
+ HWY_NEON_DEF_FUNCTION_INTS(RoundingShiftRight, vrshr, _n_, ignored)
2161
2242
 
2162
2243
  #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2163
2244
 
@@ -2334,6 +2415,95 @@ HWY_API Vec64<int64_t> operator>>(Vec64<int64_t> v, Vec64<int64_t> bits) {
2334
2415
  return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
2335
2416
  }
2336
2417
 
2418
+ // ------------------------------ RoundingShr (Neg)
2419
+
2420
+ HWY_API Vec128<uint8_t> RoundingShr(Vec128<uint8_t> v, Vec128<uint8_t> bits) {
2421
+ const RebindToSigned<DFromV<decltype(v)>> di;
2422
+ const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw;
2423
+ return Vec128<uint8_t>(vrshlq_u8(v.raw, neg_bits));
2424
+ }
2425
+ template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
2426
+ HWY_API Vec128<uint8_t, N> RoundingShr(Vec128<uint8_t, N> v,
2427
+ Vec128<uint8_t, N> bits) {
2428
+ const RebindToSigned<DFromV<decltype(v)>> di;
2429
+ const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw;
2430
+ return Vec128<uint8_t, N>(vrshl_u8(v.raw, neg_bits));
2431
+ }
2432
+
2433
+ HWY_API Vec128<uint16_t> RoundingShr(Vec128<uint16_t> v,
2434
+ Vec128<uint16_t> bits) {
2435
+ const RebindToSigned<DFromV<decltype(v)>> di;
2436
+ const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw;
2437
+ return Vec128<uint16_t>(vrshlq_u16(v.raw, neg_bits));
2438
+ }
2439
+ template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
2440
+ HWY_API Vec128<uint16_t, N> RoundingShr(Vec128<uint16_t, N> v,
2441
+ Vec128<uint16_t, N> bits) {
2442
+ const RebindToSigned<DFromV<decltype(v)>> di;
2443
+ const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw;
2444
+ return Vec128<uint16_t, N>(vrshl_u16(v.raw, neg_bits));
2445
+ }
2446
+
2447
+ HWY_API Vec128<uint32_t> RoundingShr(Vec128<uint32_t> v,
2448
+ Vec128<uint32_t> bits) {
2449
+ const RebindToSigned<DFromV<decltype(v)>> di;
2450
+ const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw;
2451
+ return Vec128<uint32_t>(vrshlq_u32(v.raw, neg_bits));
2452
+ }
2453
+ template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
2454
+ HWY_API Vec128<uint32_t, N> RoundingShr(Vec128<uint32_t, N> v,
2455
+ Vec128<uint32_t, N> bits) {
2456
+ const RebindToSigned<DFromV<decltype(v)>> di;
2457
+ const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw;
2458
+ return Vec128<uint32_t, N>(vrshl_u32(v.raw, neg_bits));
2459
+ }
2460
+
2461
+ HWY_API Vec128<uint64_t> RoundingShr(Vec128<uint64_t> v,
2462
+ Vec128<uint64_t> bits) {
2463
+ const RebindToSigned<DFromV<decltype(v)>> di;
2464
+ const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw;
2465
+ return Vec128<uint64_t>(vrshlq_u64(v.raw, neg_bits));
2466
+ }
2467
+ HWY_API Vec64<uint64_t> RoundingShr(Vec64<uint64_t> v, Vec64<uint64_t> bits) {
2468
+ const RebindToSigned<DFromV<decltype(v)>> di;
2469
+ const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw;
2470
+ return Vec64<uint64_t>(vrshl_u64(v.raw, neg_bits));
2471
+ }
2472
+
2473
+ HWY_API Vec128<int8_t> RoundingShr(Vec128<int8_t> v, Vec128<int8_t> bits) {
2474
+ return Vec128<int8_t>(vrshlq_s8(v.raw, Neg(bits).raw));
2475
+ }
2476
+ template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
2477
+ HWY_API Vec128<int8_t, N> RoundingShr(Vec128<int8_t, N> v,
2478
+ Vec128<int8_t, N> bits) {
2479
+ return Vec128<int8_t, N>(vrshl_s8(v.raw, Neg(bits).raw));
2480
+ }
2481
+
2482
+ HWY_API Vec128<int16_t> RoundingShr(Vec128<int16_t> v, Vec128<int16_t> bits) {
2483
+ return Vec128<int16_t>(vrshlq_s16(v.raw, Neg(bits).raw));
2484
+ }
2485
+ template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
2486
+ HWY_API Vec128<int16_t, N> RoundingShr(Vec128<int16_t, N> v,
2487
+ Vec128<int16_t, N> bits) {
2488
+ return Vec128<int16_t, N>(vrshl_s16(v.raw, Neg(bits).raw));
2489
+ }
2490
+
2491
+ HWY_API Vec128<int32_t> RoundingShr(Vec128<int32_t> v, Vec128<int32_t> bits) {
2492
+ return Vec128<int32_t>(vrshlq_s32(v.raw, Neg(bits).raw));
2493
+ }
2494
+ template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
2495
+ HWY_API Vec128<int32_t, N> RoundingShr(Vec128<int32_t, N> v,
2496
+ Vec128<int32_t, N> bits) {
2497
+ return Vec128<int32_t, N>(vrshl_s32(v.raw, Neg(bits).raw));
2498
+ }
2499
+
2500
+ HWY_API Vec128<int64_t> RoundingShr(Vec128<int64_t> v, Vec128<int64_t> bits) {
2501
+ return Vec128<int64_t>(vrshlq_s64(v.raw, Neg(bits).raw));
2502
+ }
2503
+ HWY_API Vec64<int64_t> RoundingShr(Vec64<int64_t> v, Vec64<int64_t> bits) {
2504
+ return Vec64<int64_t>(vrshl_s64(v.raw, Neg(bits).raw));
2505
+ }
2506
+
2337
2507
  // ------------------------------ ShiftLeftSame (Shl)
2338
2508
 
2339
2509
  template <typename T, size_t N>
@@ -2345,6 +2515,13 @@ HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
2345
2515
  return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits));
2346
2516
  }
2347
2517
 
2518
+ // ------------------------------ RoundingShiftRightSame (RoundingShr)
2519
+
2520
+ template <typename T, size_t N>
2521
+ HWY_API Vec128<T, N> RoundingShiftRightSame(const Vec128<T, N> v, int bits) {
2522
+ return RoundingShr(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
2523
+ }
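
A minimal static-dispatch sketch of how the new rounding shift is used, assuming the bundled Highway headers are on the include path and the compiled-in baseline target provides the op (natively via vrshr/vrshl on NEON as added here, otherwise through generic_ops-inl.h):

    #include <cstdint>
    #include <cstdio>

    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    int main() {
      const hn::CappedTag<int32_t, 4> d;  // at most four int32 lanes
      HWY_ALIGN int32_t in[4] = {1, 3, 101, -7};
      HWY_ALIGN int32_t out[4] = {0, 0, 0, 0};
      const auto v = hn::Load(d, in);
      // Rounds to nearest instead of truncating: 3 -> 2, 101 -> 51, -7 -> -3.
      hn::Store(hn::RoundingShiftRightSame(v, 1), d, out);
      for (size_t i = 0; i < hn::Lanes(d); ++i) printf("%d\n", out[i]);
      return 0;
    }
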
2524
+
2348
2525
  // ------------------------------ Int/float multiplication
2349
2526
 
2350
2527
  // Per-target flag to prevent generic_ops-inl.h from defining 8-bit operator*.
@@ -2356,9 +2533,31 @@ HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
2356
2533
 
2357
2534
  // All except ui64
2358
2535
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator*, vmul, _, 2)
2359
- HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator*, vmul, _, 2)
2360
2536
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
2361
2537
 
2538
+ template <size_t N>
2539
+ HWY_API Vec128<int8_t, N> operator*(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
2540
+ const DFromV<decltype(a)> d;
2541
+ const RebindToUnsigned<decltype(d)> du;
2542
+ return BitCast(d, BitCast(du, a) * BitCast(du, b));
2543
+ }
2544
+
2545
+ template <size_t N>
2546
+ HWY_API Vec128<int16_t, N> operator*(Vec128<int16_t, N> a,
2547
+ Vec128<int16_t, N> b) {
2548
+ const DFromV<decltype(a)> d;
2549
+ const RebindToUnsigned<decltype(d)> du;
2550
+ return BitCast(d, BitCast(du, a) * BitCast(du, b));
2551
+ }
2552
+
2553
+ template <size_t N>
2554
+ HWY_API Vec128<int32_t, N> operator*(Vec128<int32_t, N> a,
2555
+ Vec128<int32_t, N> b) {
2556
+ const DFromV<decltype(a)> d;
2557
+ const RebindToUnsigned<decltype(d)> du;
2558
+ return BitCast(d, BitCast(du, a) * BitCast(du, b));
2559
+ }
2560
+
2362
2561
  // ------------------------------ Integer multiplication
2363
2562
 
2364
2563
  // Returns the upper sizeof(T)*8 bits of a * b in each lane.
@@ -2490,7 +2689,7 @@ HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
2490
2689
  // ------------------------------ Floating-point division
2491
2690
 
2492
2691
  // Emulate missing intrinsic
2493
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
2692
+ #if HWY_HAVE_FLOAT64 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
2494
2693
  HWY_INLINE float64x1_t vrecpe_f64(float64x1_t raw) {
2495
2694
  const CappedTag<double, 1> d;
2496
2695
  const Twice<decltype(d)> dt;
@@ -2788,26 +2987,6 @@ HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
2788
2987
  return Or(o, And(a1, a2));
2789
2988
  }
2790
2989
 
2791
- // ------------------------------ IfVecThenElse
2792
- template <typename T, size_t N>
2793
- HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
2794
- Vec128<T, N> no) {
2795
- return IfThenElse(MaskFromVec(mask), yes, no);
2796
- }
2797
-
2798
- // ------------------------------ BitwiseIfThenElse
2799
-
2800
- #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
2801
- #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
2802
- #else
2803
- #define HWY_NATIVE_BITWISE_IF_THEN_ELSE
2804
- #endif
2805
-
2806
- template <class V>
2807
- HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
2808
- return IfVecThenElse(mask, yes, no);
2809
- }
2810
-
2811
2990
  // ------------------------------ Operator overloads (internal-only if float)
2812
2991
 
2813
2992
  template <typename T, size_t N>
@@ -2927,14 +3106,6 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1)
2927
3106
 
2928
3107
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1)
2929
3108
 
2930
- // ------------------------------ CopySign
2931
- template <typename T, size_t N>
2932
- HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
2933
- static_assert(IsFloat<T>(), "Only makes sense for floating-point");
2934
- const DFromV<decltype(magn)> d;
2935
- return BitwiseIfThenElse(SignBit(d), sign, magn);
2936
- }
2937
-
2938
3109
  // ------------------------------ CopySignToAbs
2939
3110
  template <typename T, size_t N>
2940
3111
  HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
@@ -2981,6 +3152,21 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
2981
3152
 
2982
3153
  // ------------------------------ IfThenElse
2983
3154
 
3155
+ // Workaround for incorrect codegen.
3156
+ #if HWY_ARCH_ARM_V7
3157
+
3158
+ template <class V, class D = DFromV<V>>
3159
+ HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
3160
+ const RebindToUnsigned<D> du;
3161
+ using VU = VFromD<decltype(du)>;
3162
+ const VU no_u = BitCast(du, no);
3163
+ const VU diff_u = BitCast(du, yes) ^ no_u;
3164
+ const VU mask_u = BitCast(du, VecFromMask(D(), mask));
3165
+ return BitCast(D(), no_u ^ (diff_u & mask_u));
3166
+ }
3167
+
3168
+ #else // normal VBSL instruction
3169
+
2984
3170
  #define HWY_NEON_BUILD_TPL_HWY_IF
2985
3171
  #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
2986
3172
  #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
@@ -2990,6 +3176,8 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
2990
3176
 
2991
3177
  HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
2992
3178
 
3179
+ #endif // HWY_ARCH_ARM_V7
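
The Armv7 path above falls back to the classic XOR select, result = no ^ ((yes ^ no) & mask), instead of VBSL. A scalar illustration of the identity, with plain uint32_t values standing in for lanes:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t yes = 0xAAAAAAAAu, no = 0x55555555u;
      const uint32_t mask = 0xFFFF0000u;  // "true" in the upper 16 bits only
      // Where a mask bit is 1 the result bit comes from yes, otherwise from no.
      const uint32_t sel = no ^ ((yes ^ no) & mask);
      printf("%08X\n", sel);  // AAAA5555
      return 0;
    }
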
3180
+
2993
3181
  #if HWY_HAVE_FLOAT16
2994
3182
  #define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>)
2995
3183
  #else
@@ -3045,6 +3233,33 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
3045
3233
  return IfThenElse(m, yes, no);
3046
3234
  }
3047
3235
 
3236
+ template <typename T, size_t N>
3237
+ HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
3238
+ Vec128<T, N> no) {
3239
+ return IfThenElse(MaskFromVec(mask), yes, no);
3240
+ }
3241
+
3242
+ // ------------------------------ BitwiseIfThenElse
3243
+
3244
+ #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
3245
+ #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
3246
+ #else
3247
+ #define HWY_NATIVE_BITWISE_IF_THEN_ELSE
3248
+ #endif
3249
+
3250
+ template <class V>
3251
+ HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
3252
+ return IfVecThenElse(mask, yes, no);
3253
+ }
3254
+
3255
+ // ------------------------------ CopySign (BitwiseIfThenElse)
3256
+ template <typename T, size_t N>
3257
+ HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
3258
+ static_assert(IsFloat<T>(), "Only makes sense for floating-point");
3259
+ const DFromV<decltype(magn)> d;
3260
+ return BitwiseIfThenElse(SignBit(d), sign, magn);
3261
+ }
3262
+
3048
3263
  // ------------------------------ Mask logical
3049
3264
 
3050
3265
  template <typename T, size_t N>
@@ -3275,21 +3490,19 @@ HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
3275
3490
  #undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
3276
3491
  #undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
3277
3492
 
3278
- // ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
3493
+ // ------------------------------ Abs i64 (IfNegativeThenElse, Neg)
3279
3494
  HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
3280
3495
  #if HWY_ARCH_ARM_A64
3281
3496
  return Vec128<int64_t>(vabsq_s64(v.raw));
3282
3497
  #else
3283
- const auto zero = Zero(DFromV<decltype(v)>());
3284
- return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
3498
+ return IfNegativeThenElse(v, Neg(v), v);
3285
3499
  #endif
3286
3500
  }
3287
3501
  HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
3288
3502
  #if HWY_ARCH_ARM_A64
3289
3503
  return Vec64<int64_t>(vabs_s64(v.raw));
3290
3504
  #else
3291
- const auto zero = Zero(DFromV<decltype(v)>());
3292
- return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
3505
+ return IfNegativeThenElse(v, Neg(v), v);
3293
3506
  #endif
3294
3507
  }
3295
3508
 
@@ -3298,7 +3511,7 @@ HWY_API Vec128<int64_t> SaturatedAbs(const Vec128<int64_t> v) {
3298
3511
  return Vec128<int64_t>(vqabsq_s64(v.raw));
3299
3512
  #else
3300
3513
  const auto zero = Zero(DFromV<decltype(v)>());
3301
- return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
3514
+ return IfNegativeThenElse(v, SaturatedSub(zero, v), v);
3302
3515
  #endif
3303
3516
  }
3304
3517
  HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
@@ -3306,7 +3519,7 @@ HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
3306
3519
  return Vec64<int64_t>(vqabs_s64(v.raw));
3307
3520
  #else
3308
3521
  const auto zero = Zero(DFromV<decltype(v)>());
3309
- return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
3522
+ return IfNegativeThenElse(v, SaturatedSub(zero, v), v);
3310
3523
  #endif
3311
3524
  }
3312
3525
 
@@ -3442,6 +3655,28 @@ HWY_API Vec128<double> Max(Vec128<double> a, Vec128<double> b) {
3442
3655
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
3443
3656
  #endif // HWY_ARCH_ARM_A64
3444
3657
 
3658
+ // ------------------------------ MinNumber and MaxNumber
3659
+
3660
+ #if !HWY_ARCH_ARM_A64
3661
+
3662
+ #ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
3663
+ #undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
3664
+ #else
3665
+ #define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
3666
+ #endif
3667
+
3668
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
3669
+ HWY_API V MinNumber(V a, V b) {
3670
+ return Min(IfThenElse(IsNaN(a), b, a), IfThenElse(IsNaN(b), a, b));
3671
+ }
3672
+
3673
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
3674
+ HWY_API V MaxNumber(V a, V b) {
3675
+ return Max(IfThenElse(IsNaN(a), b, a), IfThenElse(IsNaN(b), a, b));
3676
+ }
3677
+
3678
+ #endif
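
MinNumber and MaxNumber differ from Min and Max only when exactly one operand is NaN: the non-NaN operand ("the number") is returned. A scalar analogue using std::fmin/std::fmax, which follow the same rule:

    #include <cmath>
    #include <cstdio>

    int main() {
      const double a = 3.0, b = std::nan("");
      // fmin/fmax return the non-NaN operand when exactly one input is NaN,
      // matching the MinNumber/MaxNumber fallback above.
      printf("%g\n", std::fmin(a, b));  // 3
      printf("%g\n", std::fmax(b, a));  // 3
      return 0;
    }
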
3679
+
3445
3680
  // ================================================== MEMORY
3446
3681
 
3447
3682
  // ------------------------------ Load 128
@@ -5077,8 +5312,101 @@ HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
5077
5312
 
5078
5313
  #endif
5079
5314
 
5315
+ // ------------------------------ CeilInt/FloorInt
5316
+ #if HWY_ARCH_ARM_A64
5317
+
5318
+ #ifdef HWY_NATIVE_CEIL_FLOOR_INT
5319
+ #undef HWY_NATIVE_CEIL_FLOOR_INT
5320
+ #else
5321
+ #define HWY_NATIVE_CEIL_FLOOR_INT
5322
+ #endif
5323
+
5324
+ #if HWY_HAVE_FLOAT16
5325
+ HWY_API Vec128<int16_t> CeilInt(const Vec128<float16_t> v) {
5326
+ return Vec128<int16_t>(vcvtpq_s16_f16(v.raw));
5327
+ }
5328
+
5329
+ template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
5330
+ HWY_API Vec128<int16_t, N> CeilInt(const Vec128<float16_t, N> v) {
5331
+ return Vec128<int16_t, N>(vcvtp_s16_f16(v.raw));
5332
+ }
5333
+
5334
+ HWY_API Vec128<int16_t> FloorInt(const Vec128<float16_t> v) {
5335
+ return Vec128<int16_t>(vcvtmq_s16_f16(v.raw));
5336
+ }
5337
+
5338
+ template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
5339
+ HWY_API Vec128<int16_t, N> FloorInt(const Vec128<float16_t, N> v) {
5340
+ return Vec128<int16_t, N>(vcvtm_s16_f16(v.raw));
5341
+ }
5342
+ #endif // HWY_HAVE_FLOAT16
5343
+
5344
+ HWY_API Vec128<int32_t> CeilInt(const Vec128<float> v) {
5345
+ return Vec128<int32_t>(vcvtpq_s32_f32(v.raw));
5346
+ }
5347
+
5348
+ template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
5349
+ HWY_API Vec128<int32_t, N> CeilInt(const Vec128<float, N> v) {
5350
+ return Vec128<int32_t, N>(vcvtp_s32_f32(v.raw));
5351
+ }
5352
+
5353
+ HWY_API Vec128<int64_t> CeilInt(const Vec128<double> v) {
5354
+ return Vec128<int64_t>(vcvtpq_s64_f64(v.raw));
5355
+ }
5356
+
5357
+ template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
5358
+ HWY_API Vec128<int64_t, N> CeilInt(const Vec128<double, N> v) {
5359
+ #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
5360
+ // Workaround for missing vcvtp_s64_f64 intrinsic
5361
+ const DFromV<decltype(v)> d;
5362
+ const RebindToSigned<decltype(d)> di;
5363
+ const Twice<decltype(d)> dt;
5364
+ return LowerHalf(di, CeilInt(Combine(dt, v, v)));
5365
+ #else
5366
+ return Vec128<int64_t, N>(vcvtp_s64_f64(v.raw));
5367
+ #endif
5368
+ }
5369
+
5370
+ HWY_API Vec128<int32_t> FloorInt(const Vec128<float> v) {
5371
+ return Vec128<int32_t>(vcvtmq_s32_f32(v.raw));
5372
+ }
5373
+
5374
+ template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
5375
+ HWY_API Vec128<int32_t, N> FloorInt(const Vec128<float, N> v) {
5376
+ return Vec128<int32_t, N>(vcvtm_s32_f32(v.raw));
5377
+ }
5378
+
5379
+ HWY_API Vec128<int64_t> FloorInt(const Vec128<double> v) {
5380
+ return Vec128<int64_t>(vcvtmq_s64_f64(v.raw));
5381
+ }
5382
+
5383
+ template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
5384
+ HWY_API Vec128<int64_t, N> FloorInt(const Vec128<double, N> v) {
5385
+ #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
5386
+ // Workaround for missing vcvtm_s64_f64 intrinsic
5387
+ const DFromV<decltype(v)> d;
5388
+ const RebindToSigned<decltype(d)> di;
5389
+ const Twice<decltype(d)> dt;
5390
+ return LowerHalf(di, FloorInt(Combine(dt, v, v)));
5391
+ #else
5392
+ return Vec128<int64_t, N>(vcvtm_s64_f64(v.raw));
5393
+ #endif
5394
+ }
5395
+
5396
+ #endif // HWY_ARCH_ARM_A64
5397
+
5080
5398
  // ------------------------------ NearestInt (Round)
5081
5399
 
5400
+ #if HWY_HAVE_FLOAT16
5401
+ HWY_API Vec128<int16_t> NearestInt(const Vec128<float16_t> v) {
5402
+ return Vec128<int16_t>(vcvtnq_s16_f16(v.raw));
5403
+ }
5404
+ template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
5405
+ HWY_API Vec128<int16_t, N> NearestInt(const Vec128<float16_t, N> v) {
5406
+ return Vec128<int16_t, N>(vcvtn_s16_f16(v.raw));
5407
+ }
5408
+ #endif
5409
+
5082
5410
  #if HWY_ARCH_ARM_A64
5083
5411
 
5084
5412
  HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
@@ -5089,6 +5417,29 @@ HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
5089
5417
  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
5090
5418
  }
5091
5419
 
5420
+ HWY_API Vec128<int64_t> NearestInt(const Vec128<double> v) {
5421
+ return Vec128<int64_t>(vcvtnq_s64_f64(v.raw));
5422
+ }
5423
+
5424
+ template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
5425
+ HWY_API Vec128<int64_t, N> NearestInt(const Vec128<double, N> v) {
5426
+ #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
5427
+ // Workaround for missing vcvtn_s64_f64 intrinsic
5428
+ const DFromV<decltype(v)> d;
5429
+ const RebindToSigned<decltype(d)> di;
5430
+ const Twice<decltype(d)> dt;
5431
+ return LowerHalf(di, NearestInt(Combine(dt, v, v)));
5432
+ #else
5433
+ return Vec128<int64_t, N>(vcvtn_s64_f64(v.raw));
5434
+ #endif
5435
+ }
5436
+
5437
+ template <class DI32, HWY_IF_I32_D(DI32)>
5438
+ HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
5439
+ VFromD<Rebind<double, DI32>> v) {
5440
+ return DemoteTo(di32, NearestInt(v));
5441
+ }
5442
+
5092
5443
  #else
5093
5444
 
5094
5445
  template <size_t N>
@@ -5100,10 +5451,62 @@ HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
5100
5451
  #endif
5101
5452
 
5102
5453
  // ------------------------------ Floating-point classification
5454
+
5455
+ #if !HWY_COMPILER_CLANG || HWY_COMPILER_CLANG > 1801 || HWY_ARCH_ARM_V7
5103
5456
  template <typename T, size_t N>
5104
5457
  HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
5105
5458
  return v != v;
5106
5459
  }
5460
+ #else
5461
+ // Clang up to 18.1 generates less efficient code than the expected FCMEQ, see
5462
+ // https://github.com/numpy/numpy/issues/27313 and
5463
+ // https://github.com/numpy/numpy/pull/22954/files and
5464
+ // https://github.com/llvm/llvm-project/issues/59855
5465
+
5466
+ #if HWY_HAVE_FLOAT16
5467
+ template <typename T, size_t N, HWY_IF_F16(T), HWY_IF_V_SIZE(T, N, 16)>
5468
+ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
5469
+ typename Mask128<T, N>::Raw ret;
5470
+ __asm__ volatile("fcmeq %0.8h, %1.8h, %1.8h" : "=w"(ret) : "w"(v.raw));
5471
+ return Not(Mask128<T, N>(ret));
5472
+ }
5473
+ template <typename T, size_t N, HWY_IF_F16(T), HWY_IF_V_SIZE_LE(T, N, 8)>
5474
+ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
5475
+ typename Mask128<T, N>::Raw ret;
5476
+ __asm__ volatile("fcmeq %0.4h, %1.4h, %1.4h" : "=w"(ret) : "w"(v.raw));
5477
+ return Not(Mask128<T, N>(ret));
5478
+ }
5479
+ #endif // HWY_HAVE_FLOAT16
5480
+
5481
+ template <typename T, size_t N, HWY_IF_F32(T), HWY_IF_V_SIZE(T, N, 16)>
5482
+ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
5483
+ typename Mask128<T, N>::Raw ret;
5484
+ __asm__ volatile("fcmeq %0.4s, %1.4s, %1.4s" : "=w"(ret) : "w"(v.raw));
5485
+ return Not(Mask128<T, N>(ret));
5486
+ }
5487
+ template <typename T, size_t N, HWY_IF_F32(T), HWY_IF_V_SIZE_LE(T, N, 8)>
5488
+ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
5489
+ typename Mask128<T, N>::Raw ret;
5490
+ __asm__ volatile("fcmeq %0.2s, %1.2s, %1.2s" : "=w"(ret) : "w"(v.raw));
5491
+ return Not(Mask128<T, N>(ret));
5492
+ }
5493
+
5494
+ #if HWY_HAVE_FLOAT64
5495
+ template <typename T, size_t N, HWY_IF_F64(T), HWY_IF_V_SIZE(T, N, 16)>
5496
+ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
5497
+ typename Mask128<T, N>::Raw ret;
5498
+ __asm__ volatile("fcmeq %0.2d, %1.2d, %1.2d" : "=w"(ret) : "w"(v.raw));
5499
+ return Not(Mask128<T, N>(ret));
5500
+ }
5501
+ template <typename T, size_t N, HWY_IF_F64(T), HWY_IF_V_SIZE_LE(T, N, 8)>
5502
+ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
5503
+ typename Mask128<T, N>::Raw ret;
5504
+ __asm__ volatile("fcmeq %d0, %d1, %d1" : "=w"(ret) : "w"(v.raw));
5505
+ return Not(Mask128<T, N>(ret));
5506
+ }
5507
+ #endif // HWY_HAVE_FLOAT64
5508
+
5509
+ #endif // HWY_COMPILER_CLANG
5107
5510
 
5108
5511
  // ================================================== SWIZZLE
5109
5512
 
@@ -7115,6 +7518,31 @@ HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
7115
7518
 
7116
7519
  // ------------------------------ RearrangeToOddPlusEven (Combine)
7117
7520
 
7521
+ namespace detail {
7522
+ // Armv7 only provides 64-bit (half-vector) pairwise operations.
7523
+ #define HWY_NEON_DEF_PAIRWISE_OP(T, name, prefix, suffix) \
7524
+ HWY_INLINE Vec64<T> Pairwise##name(Vec64<T> a, Vec64<T> b) { \
7525
+ return Vec64<T>(prefix##_##suffix(a.raw, b.raw)); \
7526
+ }
7527
+
7528
+ // Note that Armv7 also lacks [u]int64 pairwise instructions; those are handled by
7529
+ // generic_ops-inl.h SumOfLanes etc., hence no 64-bit overloads here.
7530
+ #define HWY_NEON_DEF_PAIRWISE_OPS(name, prefix) \
7531
+ HWY_NEON_DEF_PAIRWISE_OP(uint32_t, name, prefix, u32) \
7532
+ HWY_NEON_DEF_PAIRWISE_OP(uint16_t, name, prefix, u16) \
7533
+ HWY_NEON_DEF_PAIRWISE_OP(uint8_t, name, prefix, u8) \
7534
+ HWY_NEON_DEF_PAIRWISE_OP(int32_t, name, prefix, s32) \
7535
+ HWY_NEON_DEF_PAIRWISE_OP(int16_t, name, prefix, s16) \
7536
+ HWY_NEON_DEF_PAIRWISE_OP(int8_t, name, prefix, s8) \
7537
+ HWY_NEON_DEF_PAIRWISE_OP(float32_t, name, prefix, f32)
7538
+
7539
+ HWY_NEON_DEF_PAIRWISE_OPS(Sum, vpadd)
7540
+ HWY_NEON_DEF_PAIRWISE_OPS(Min, vpmin)
7541
+ HWY_NEON_DEF_PAIRWISE_OPS(Max, vpmax)
7542
+ #undef HWY_NEON_DEF_PAIRWISE_OPS
7543
+ #undef HWY_NEON_DEF_PAIRWISE_OP
7544
+ } // namespace detail
7545
+
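For reference, the int32 Sum instantiation of the macro above expands to the following; vpadd_s32 adds adjacent pairs of the concatenated inputs:

// Expansion of HWY_NEON_DEF_PAIRWISE_OP(int32_t, Sum, vpadd, s32):
HWY_INLINE Vec64<int32_t> PairwiseSum(Vec64<int32_t> a, Vec64<int32_t> b) {
  return Vec64<int32_t>(vpadd_s32(a.raw, b.raw));  // {a[0]+a[1], b[0]+b[1]}
}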
7118
7546
  template <size_t N>
7119
7547
  HWY_API Vec128<float, N> RearrangeToOddPlusEven(Vec128<float, N> sum0,
7120
7548
  Vec128<float, N> sum1) {
@@ -7134,18 +7562,18 @@ HWY_API Vec128<int32_t> RearrangeToOddPlusEven(Vec128<int32_t> sum0,
7134
7562
  #else
7135
7563
  const Full128<int32_t> d;
7136
7564
  const Half<decltype(d)> d64;
7137
- const Vec64<int32_t> hi(
7138
- vpadd_s32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw));
7565
+ const Vec64<int32_t> hi =
7566
+ detail::PairwiseSum(LowerHalf(d64, sum1), UpperHalf(d64, sum1));
7139
7567
  const Vec64<int32_t> lo(
7140
- vpadd_s32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw));
7141
- return Combine(Full128<int32_t>(), hi, lo);
7568
+ detail::PairwiseSum(LowerHalf(d64, sum0), UpperHalf(d64, sum0)));
7569
+ return Combine(d, hi, lo);
7142
7570
  #endif
7143
7571
  }
7144
7572
 
7145
7573
  HWY_API Vec64<int32_t> RearrangeToOddPlusEven(Vec64<int32_t> sum0,
7146
7574
  Vec64<int32_t> sum1) {
7147
7575
  // vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
7148
- return Vec64<int32_t>(vpadd_s32(sum0.raw, sum1.raw));
7576
+ return detail::PairwiseSum(sum0, sum1);
7149
7577
  }
7150
7578
 
7151
7579
  HWY_API Vec32<int32_t> RearrangeToOddPlusEven(Vec32<int32_t> sum0,
@@ -7162,18 +7590,18 @@ HWY_API Vec128<uint32_t> RearrangeToOddPlusEven(Vec128<uint32_t> sum0,
7162
7590
  #else
7163
7591
  const Full128<uint32_t> d;
7164
7592
  const Half<decltype(d)> d64;
7165
- const Vec64<uint32_t> hi(
7166
- vpadd_u32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw));
7167
- const Vec64<uint32_t> lo(
7168
- vpadd_u32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw));
7169
- return Combine(Full128<uint32_t>(), hi, lo);
7593
+ const Vec64<uint32_t> hi =
7594
+ detail::PairwiseSum(LowerHalf(d64, sum1), UpperHalf(d64, sum1));
7595
+ const Vec64<uint32_t> lo =
7596
+ detail::PairwiseSum(LowerHalf(d64, sum0), UpperHalf(d64, sum0));
7597
+ return Combine(d, hi, lo);
7170
7598
  #endif
7171
7599
  }
7172
7600
 
7173
7601
  HWY_API Vec64<uint32_t> RearrangeToOddPlusEven(Vec64<uint32_t> sum0,
7174
7602
  Vec64<uint32_t> sum1) {
7175
7603
  // vmlal_u16 multiplied the lower half into sum0 and upper into sum1.
7176
- return Vec64<uint32_t>(vpadd_u32(sum0.raw, sum1.raw));
7604
+ return detail::PairwiseSum(sum0, sum1);
7177
7605
  }
7178
7606
 
7179
7607
  HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0,
@@ -7182,6 +7610,78 @@ HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0,
7182
7610
  return sum0 + sum1;
7183
7611
  }
7184
7612
 
7613
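RearrangeToOddPlusEven is normally paired with ReorderWidenMulAccumulate, e.g. for an int16 dot product. A hedged sketch (DotI16 is an illustrative name; assumes n is a multiple of the int16 lane count):

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

int32_t DotI16(const int16_t* HWY_RESTRICT a, const int16_t* HWY_RESTRICT b,
               size_t n) {
  const hn::ScalableTag<int32_t> d32;
  const hn::Repartition<int16_t, decltype(d32)> d16;
  auto sum0 = hn::Zero(d32);
  auto sum1 = hn::Zero(d32);
  for (size_t i = 0; i < n; i += hn::Lanes(d16)) {
    const auto va = hn::LoadU(d16, a + i);
    const auto vb = hn::LoadU(d16, b + i);
    // Widening multiply-accumulate into two i32 accumulators.
    sum0 = hn::ReorderWidenMulAccumulate(d32, va, vb, sum0, sum1);
  }
  // Recombine the even/odd partial sums, then reduce to a scalar.
  return hn::ReduceSum(d32, hn::RearrangeToOddPlusEven(sum0, sum1));
}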
+ // ------------------------------ SumOfMulQuadAccumulate
7614
+
7615
+ #if HWY_TARGET == HWY_NEON_BF16
7616
+
7617
+ #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
7618
+ #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
7619
+ #else
7620
+ #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
7621
+ #endif
7622
+
7623
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
7624
+ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/,
7625
+ VFromD<Repartition<int8_t, DI32>> a,
7626
+ VFromD<Repartition<int8_t, DI32>> b,
7627
+ VFromD<DI32> sum) {
7628
+ return VFromD<DI32>(vdot_s32(sum.raw, a.raw, b.raw));
7629
+ }
7630
+
7631
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
7632
+ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/,
7633
+ VFromD<Repartition<int8_t, DI32>> a,
7634
+ VFromD<Repartition<int8_t, DI32>> b,
7635
+ VFromD<DI32> sum) {
7636
+ return VFromD<DI32>(vdotq_s32(sum.raw, a.raw, b.raw));
7637
+ }
7638
+
7639
+ #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
7640
+ #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
7641
+ #else
7642
+ #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
7643
+ #endif
7644
+
7645
+ template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 8)>
7646
+ HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
7647
+ DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
7648
+ VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
7649
+ return VFromD<DU32>(vdot_u32(sum.raw, a.raw, b.raw));
7650
+ }
7651
+
7652
+ template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_D(DU32, 16)>
7653
+ HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
7654
+ DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
7655
+ VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
7656
+ return VFromD<DU32>(vdotq_u32(sum.raw, a.raw, b.raw));
7657
+ }
7658
+
7659
+ #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
7660
+ #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
7661
+ #else
7662
+ #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
7663
+ #endif
7664
+
7665
+ template <class DI32, HWY_IF_I32_D(DI32)>
7666
+ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
7667
+ DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u,
7668
+ VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
7669
+ // TODO: use vusdot[q]_s32 on NEON targets that require support for NEON I8MM
7670
+
7671
+ const RebindToUnsigned<decltype(di32)> du32;
7672
+ const Repartition<uint8_t, decltype(di32)> du8;
7673
+
7674
+ const auto b_u = BitCast(du8, b_i);
7675
+ const auto result_sum0 =
7676
+ SumOfMulQuadAccumulate(du32, a_u, b_u, BitCast(du32, sum));
7677
+ const auto result_sum1 = ShiftLeft<8>(
7678
+ SumOfMulQuadAccumulate(du32, a_u, ShiftRight<7>(b_u), Zero(du32)));
7679
+
7680
+ return BitCast(di32, Sub(result_sum0, result_sum1));
7681
+ }
7682
+
7683
+ #endif // HWY_TARGET == HWY_NEON_BF16
7684
+
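The mixed-sign overload above relies on the identity a_u*b_i = a_u*b_u - 256*a_u*(b_i < 0): reinterpreting b_i as unsigned adds 256 to each negative lane, ShiftRight<7> of those unsigned bytes recovers exactly the sign bits, and ShiftLeft<8> scales the second dot product into the correction that is subtracted. A hedged usage sketch of the plain int8 overload (DotI8 is an illustrative name; assumes n is a multiple of the int8 lane count):

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Each int32 lane accumulates the products of four adjacent byte pairs.
int32_t DotI8(const int8_t* HWY_RESTRICT a, const int8_t* HWY_RESTRICT b,
              size_t n) {
  const hn::ScalableTag<int32_t> d32;
  const hn::Repartition<int8_t, decltype(d32)> d8;
  auto sum = hn::Zero(d32);
  for (size_t i = 0; i < n; i += hn::Lanes(d8)) {
    sum = hn::SumOfMulQuadAccumulate(d32, hn::LoadU(d8, a + i),
                                     hn::LoadU(d8, b + i), sum);
  }
  return hn::ReduceSum(d32, sum);
}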
7185
7685
  // ------------------------------ WidenMulPairwiseAdd
7186
7686
 
7187
7687
  #if HWY_NEON_HAVE_F32_TO_BF16C
@@ -7588,6 +8088,17 @@ HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
7588
8088
  return v;
7589
8089
  }
7590
8090
 
8091
+ // ------------------------------ InterleaveEvenBlocks
8092
+ template <class D, class V = VFromD<D>>
8093
+ HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
8094
+ return a;
8095
+ }
8096
+ // ------------------------------ InterleaveOddBlocks
8097
+ template <class D, class V = VFromD<D>>
8098
+ HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
8099
+ return a;
8100
+ }
8101
+
7591
8102
  // ------------------------------ ReverseBlocks
7592
8103
  // Single block: no change
7593
8104
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
@@ -8374,71 +8885,47 @@ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
8374
8885
  // On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane.
8375
8886
  #else // !HWY_ARCH_ARM_A64
8376
8887
 
8377
- // Armv7 lacks N=2 and 8-bit x4, so enable generic versions of those.
8888
+ // Armv7 lacks N=2 (except 32-bit) and 8-bit x4, so enable them in generic_ops.
8378
8889
  #undef HWY_IF_SUM_OF_LANES_D
8379
8890
  #define HWY_IF_SUM_OF_LANES_D(D) \
8380
- hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
8891
+ hwy::EnableIf<(sizeof(TFromD<D>) != 4 && HWY_MAX_LANES_D(D) == 2) || \
8381
8892
  (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
8382
8893
  nullptr
8383
8894
  #undef HWY_IF_MINMAX_OF_LANES_D
8384
8895
  #define HWY_IF_MINMAX_OF_LANES_D(D) \
8385
- hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
8896
+ hwy::EnableIf<(sizeof(TFromD<D>) != 4 && HWY_MAX_LANES_D(D) == 2) || \
8386
8897
  (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
8387
8898
  nullptr
8388
8899
 
8389
8900
  // For Armv7, we implement reductions using a series of pairwise operations. This
8390
8901
  // produces the full vector result, so we express Reduce* in terms of *OfLanes.
8391
- #define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
8392
- #define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
8393
- template <class D, HWY_IF_LANES_D(D, size)> \
8394
- HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
8395
- Vec128<type##_t, size> v) { \
8396
- HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
8397
- if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
8398
- if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
8399
- return Vec128<type##_t, size>(tmp); \
8400
- }
8401
8902
 
8402
- // For the wide versions, the pairwise operations produce a half-length vector.
8403
- // We produce that `tmp` and then Combine.
8404
- #define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
8405
- suffix) \
8406
- template <class D, HWY_IF_LANES_D(D, size)> \
8407
- HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
8408
- Vec128<type##_t, size> v) { \
8409
- HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
8410
- tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
8411
- vget_low_##suffix(v.raw)); \
8412
- if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
8413
- if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
8414
- if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
8415
- return Vec128<type##_t, size>(vcombine_##suffix(tmp, tmp)); \
8903
+ #define HWY_NEON_DEF_PAIRWISE_REDUCTION(name) \
8904
+ /* generic_ops-inl.h handles 64-bit types. */ \
8905
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_T_SIZE_D(D, 8)> \
8906
+ HWY_API VFromD<D> name##OfLanes(D d, VFromD<D> v) { \
8907
+ HWY_LANES_CONSTEXPR size_t N = Lanes(d); \
8908
+ VFromD<D> tmp = detail::Pairwise##name(v, v); \
8909
+ if ((N / 2) > 1) tmp = detail::Pairwise##name(tmp, tmp); \
8910
+ if ((N / 4) > 1) tmp = detail::Pairwise##name(tmp, tmp); \
8911
+ return tmp; \
8912
+ } \
8913
+ /* Armv7 lacks q (full-vector) instructions, so first reduce 128-bit v */ \
8914
+ /* into a half-vector, then reduce that. */ \
8915
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 8)> \
8916
+ HWY_API VFromD<D> name##OfLanes(D d, VFromD<D> v) { \
8917
+ const Half<D> dh; \
8918
+ VFromD<decltype(dh)> upper = UpperHalf(dh, v); \
8919
+ VFromD<decltype(dh)> lower = LowerHalf(dh, v); \
8920
+ VFromD<decltype(dh)> half = detail::Pairwise##name(upper, lower); \
8921
+ half = name##OfLanes(dh, half); \
8922
+ return Combine(d, half, half); \
8416
8923
  }
8417
8924
 
8418
- #define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
8419
- HWY_NEON_DEF_PAIRWISE_REDUCTION(uint32, 2, name, prefix, u32) \
8420
- HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16) \
8421
- HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8) \
8422
- HWY_NEON_DEF_PAIRWISE_REDUCTION(int32, 2, name, prefix, s32) \
8423
- HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16) \
8424
- HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8) \
8425
- HWY_NEON_DEF_PAIRWISE_REDUCTION(float32, 2, name, prefix, f32) \
8426
- HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint32, 4, 2, name, prefix, u32) \
8427
- HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \
8428
- HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8) \
8429
- HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int32, 4, 2, name, prefix, s32) \
8430
- HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16) \
8431
- HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8) \
8432
- HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(float32, 4, 2, name, prefix, f32)
8433
-
8434
- HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Sum, vpadd)
8435
- HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Min, vpmin)
8436
- HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax)
8437
-
8438
- #undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
8439
- #undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
8925
+ HWY_NEON_DEF_PAIRWISE_REDUCTION(Sum)
8926
+ HWY_NEON_DEF_PAIRWISE_REDUCTION(Min)
8927
+ HWY_NEON_DEF_PAIRWISE_REDUCTION(Max)
8440
8928
  #undef HWY_NEON_DEF_PAIRWISE_REDUCTION
8441
- #undef HWY_NEON_BUILD_TYPE_T
8442
8929
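As the surrounding comments note, the *OfLanes ops broadcast the reduction to every lane, whereas ReduceSum (provided by generic_ops-inl.h) yields the scalar directly. A hedged sketch (MinPlusSum is an illustrative name; reads one full vector):

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

uint16_t MinPlusSum(const uint16_t* HWY_RESTRICT p) {
  const hn::ScalableTag<uint16_t> d;
  const auto v = hn::LoadU(d, p);
  // MinOfLanes broadcasts the minimum to all lanes; GetLane extracts lane 0.
  const uint16_t min = hn::GetLane(hn::MinOfLanes(d, v));
  // ReduceSum returns the scalar sum without a broadcast.
  const uint16_t sum = hn::ReduceSum(d, v);
  return static_cast<uint16_t>(min + sum);
}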
 
8443
8930
  // GetLane(SumsOf4(v)) is more efficient on ArmV7 NEON than the default
8444
8931
  // N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h
@@ -8562,14 +9049,22 @@ HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
8562
9049
  return nib & ((1ull << (d.MaxBytes() * 4)) - 1);
8563
9050
  }
8564
9051
 
8565
- template <typename T>
8566
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
9052
+ // Returns the lowest N for the BitsFromMask result.
9053
+ template <class D>
9054
+ constexpr uint64_t OnlyActive(D d, uint64_t bits) {
9055
+ return (d.MaxBytes() >= 8) ? bits : (bits & ((1ull << d.MaxLanes()) - 1));
9056
+ }
9057
+
9058
+ } // namespace detail
9059
+
9060
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 16)>
9061
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
8567
9062
  alignas(16) static constexpr uint8_t kSliceLanes[16] = {
8568
9063
  1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
8569
9064
  };
8570
- const Full128<uint8_t> du;
9065
+ const RebindToUnsigned<D> du;
8571
9066
  const Vec128<uint8_t> values =
8572
- BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
9067
+ BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
8573
9068
 
8574
9069
  #if HWY_ARCH_ARM_A64
8575
9070
  // Can't vaddv - we need two separate bytes (16 bits).
@@ -8586,126 +9081,114 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
8586
9081
  #endif
8587
9082
  }
8588
9083
 
8589
- template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
8590
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
9084
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)>
9085
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
8591
9086
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
8592
9087
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
8593
9088
  alignas(8) static constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
8594
9089
  0x10, 0x20, 0x40, 0x80};
8595
- const DFromM<decltype(mask)> d;
8596
9090
  const RebindToUnsigned<decltype(d)> du;
8597
- const Vec128<uint8_t, N> slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
8598
- const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
9091
+ using VU = VFromD<decltype(du)>;
9092
+ const VU slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
9093
+ const VU values = BitCast(du, VecFromMask(d, mask)) & slice;
8599
9094
 
8600
9095
  #if HWY_ARCH_ARM_A64
8601
- return vaddv_u8(values.raw);
9096
+ return detail::OnlyActive(d, vaddv_u8(values.raw));
8602
9097
  #else
8603
9098
  const uint16x4_t x2 = vpaddl_u8(values.raw);
8604
9099
  const uint32x2_t x4 = vpaddl_u16(x2);
8605
9100
  const uint64x1_t x8 = vpaddl_u32(x4);
8606
- return vget_lane_u64(x8, 0);
9101
+ return detail::OnlyActive(d, vget_lane_u64(x8, 0));
8607
9102
  #endif
8608
9103
  }
8609
9104
 
8610
- template <typename T>
8611
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T> mask) {
9105
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 16)>
9106
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
8612
9107
  alignas(16) static constexpr uint16_t kSliceLanes[8] = {
8613
9108
  1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80};
8614
- const Full128<T> d;
8615
- const Full128<uint16_t> du;
9109
+ const RebindToUnsigned<D> du;
8616
9110
  const Vec128<uint16_t> values =
8617
9111
  BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
8618
9112
  #if HWY_ARCH_ARM_A64
8619
- return vaddvq_u16(values.raw);
9113
+ return detail::OnlyActive(d, vaddvq_u16(values.raw));
8620
9114
  #else
8621
9115
  const uint32x4_t x2 = vpaddlq_u16(values.raw);
8622
9116
  const uint64x2_t x4 = vpaddlq_u32(x2);
8623
- return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
9117
+ return detail::OnlyActive(d, vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1));
8624
9118
  #endif
8625
9119
  }
8626
9120
 
8627
- template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
8628
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
9121
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 8)>
9122
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
8629
9123
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
8630
9124
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
8631
9125
  alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
8632
- const DFromM<decltype(mask)> d;
8633
9126
  const RebindToUnsigned<decltype(d)> du;
8634
- const Vec128<uint16_t, N> slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
8635
- const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
9127
+ using VU = VFromD<decltype(du)>;
9128
+ const VU slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
9129
+ const VU values = BitCast(du, VecFromMask(d, mask)) & slice;
8636
9130
  #if HWY_ARCH_ARM_A64
8637
- return vaddv_u16(values.raw);
9131
+ return detail::OnlyActive(d, vaddv_u16(values.raw));
8638
9132
  #else
8639
9133
  const uint32x2_t x2 = vpaddl_u16(values.raw);
8640
9134
  const uint64x1_t x4 = vpaddl_u32(x2);
8641
- return vget_lane_u64(x4, 0);
9135
+ return detail::OnlyActive(d, vget_lane_u64(x4, 0));
8642
9136
  #endif
8643
9137
  }
8644
9138
 
8645
- template <typename T>
8646
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T> mask) {
9139
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 16)>
9140
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
8647
9141
  alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
8648
- const Full128<T> d;
8649
- const Full128<uint32_t> du;
9142
+ const RebindToUnsigned<D> du;
8650
9143
  const Vec128<uint32_t> values =
8651
9144
  BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
8652
9145
  #if HWY_ARCH_ARM_A64
8653
- return vaddvq_u32(values.raw);
9146
+ return detail::OnlyActive(d, vaddvq_u32(values.raw));
8654
9147
  #else
8655
9148
  const uint64x2_t x2 = vpaddlq_u32(values.raw);
8656
- return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
9149
+ return detail::OnlyActive(d, vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1));
8657
9150
  #endif
8658
9151
  }
8659
9152
 
8660
- template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
8661
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
9153
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 8)>
9154
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
8662
9155
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
8663
9156
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
8664
9157
  alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2};
8665
- const DFromM<decltype(mask)> d;
8666
9158
  const RebindToUnsigned<decltype(d)> du;
8667
- const Vec128<uint32_t, N> slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
8668
- const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
9159
+ using VU = VFromD<decltype(du)>;
9160
+ const VU slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
9161
+ const VU values = BitCast(du, VecFromMask(d, mask)) & slice;
8669
9162
  #if HWY_ARCH_ARM_A64
8670
- return vaddv_u32(values.raw);
9163
+ return detail::OnlyActive(d, vaddv_u32(values.raw));
8671
9164
  #else
8672
9165
  const uint64x1_t x2 = vpaddl_u32(values.raw);
8673
- return vget_lane_u64(x2, 0);
9166
+ return detail::OnlyActive(d, vget_lane_u64(x2, 0));
8674
9167
  #endif
8675
9168
  }
8676
9169
 
8677
- template <typename T>
8678
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T> m) {
9170
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 16)>
9171
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
8679
9172
  alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2};
8680
- const Full128<T> d;
8681
- const Full128<uint64_t> du;
9173
+ const RebindToUnsigned<decltype(d)> du;
8682
9174
  const Vec128<uint64_t> values =
8683
- BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
9175
+ BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
8684
9176
  #if HWY_ARCH_ARM_A64
8685
- return vaddvq_u64(values.raw);
9177
+ return detail::OnlyActive(d, vaddvq_u64(values.raw));
8686
9178
  #else
8687
- return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
9179
+ return detail::OnlyActive(
9180
+ d, vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1));
8688
9181
  #endif
8689
9182
  }
8690
9183
 
8691
- template <typename T>
8692
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, 1> m) {
8693
- const Full64<T> d;
8694
- const Full64<uint64_t> du;
8695
- const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
9184
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 8)>
9185
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
9186
+ const RebindToUnsigned<decltype(d)> du;
9187
+ const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, mask)) & Set(du, 1);
8696
9188
  return vget_lane_u64(values.raw, 0);
8697
9189
  }
8698
9190
 
8699
- // Returns the lowest N for the BitsFromMask result.
8700
- template <typename T, size_t N>
8701
- constexpr uint64_t OnlyActive(uint64_t bits) {
8702
- return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
8703
- }
8704
-
8705
- template <typename T, size_t N>
8706
- HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) {
8707
- return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
8708
- }
9191
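With the overloads above, BitsFromMask takes the tag as its first argument and OnlyActive guarantees that bits at and above Lanes(d) are zero. A hedged sketch (LanesAboveZero is an illustrative name):

#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Bit i of the result is set iff lane i of the input is positive.
uint64_t LanesAboveZero(const float* HWY_RESTRICT p) {
  const hn::ScalableTag<float> d;
  const auto v = hn::LoadU(d, p);
  return hn::BitsFromMask(d, hn::Gt(v, hn::Zero(d)));
}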
+ namespace detail {
8709
9192
 
8710
9193
  // Returns number of lanes whose mask is set.
8711
9194
  //
@@ -8825,7 +9308,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
8825
9308
  // `p` points to at least 8 writable bytes.
8826
9309
  template <class D>
8827
9310
  HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
8828
- const uint64_t mask_bits = detail::BitsFromMask(mask);
9311
+ const uint64_t mask_bits = BitsFromMask(d, mask);
8829
9312
  const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
8830
9313
  CopyBytes<kNumBytes>(&mask_bits, bits);
8831
9314
  return kNumBytes;
@@ -9313,7 +9796,8 @@ HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
9313
9796
  // General case, 2 or 4 byte lanes
9314
9797
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
9315
9798
  HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
9316
- return detail::Compress(v, detail::BitsFromMask(mask));
9799
+ const DFromV<decltype(v)> d;
9800
+ return detail::Compress(v, BitsFromMask(d, mask));
9317
9801
  }
9318
9802
 
9319
9803
  // Single lane: no-op
@@ -9337,12 +9821,13 @@ HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
9337
9821
  // General case, 2 or 4 byte lanes
9338
9822
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
9339
9823
  HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
9824
+ const DFromV<decltype(v)> d;
9340
9825
  // For partial vectors, we cannot pull the Not() into the table because
9341
9826
  // BitsFromMask clears the upper bits.
9342
9827
  if (N < 16 / sizeof(T)) {
9343
- return detail::Compress(v, detail::BitsFromMask(Not(mask)));
9828
+ return detail::Compress(v, BitsFromMask(d, Not(mask)));
9344
9829
  }
9345
- return detail::CompressNot(v, detail::BitsFromMask(mask));
9830
+ return detail::CompressNot(v, BitsFromMask(d, mask));
9346
9831
  }
9347
9832
 
9348
9833
  // ------------------------------ CompressBlocksNot
@@ -9370,7 +9855,7 @@ HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v,
9370
9855
  template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
9371
9856
  HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
9372
9857
  TFromD<D>* HWY_RESTRICT unaligned) {
9373
- const uint64_t mask_bits = detail::BitsFromMask(mask);
9858
+ const uint64_t mask_bits = BitsFromMask(d, mask);
9374
9859
  StoreU(detail::Compress(v, mask_bits), d, unaligned);
9375
9860
  return PopCount(mask_bits);
9376
9861
  }
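CompressStore fuses the compression and the store and returns the number of selected lanes, which suits stream filtering. A hedged sketch (KeepPositive is an illustrative name; assumes n is a multiple of the lane count, and that `out` has one vector of slack beyond the returned count because a full vector is written each iteration):

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

size_t KeepPositive(const float* HWY_RESTRICT in, size_t n,
                    float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  size_t written = 0;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, in + i);
    written += hn::CompressStore(v, hn::Gt(v, hn::Zero(d)), d, out + written);
  }
  return written;
}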
@@ -9380,7 +9865,7 @@ template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
9380
9865
  HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
9381
9866
  TFromD<D>* HWY_RESTRICT unaligned) {
9382
9867
  const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
9383
- const uint64_t mask_bits = detail::BitsFromMask(m);
9868
+ const uint64_t mask_bits = BitsFromMask(d, m);
9384
9869
  const size_t count = PopCount(mask_bits);
9385
9870
  const MFromD<D> store_mask = RebindMask(d, FirstN(du, count));
9386
9871
  const VFromD<decltype(du)> compressed =
@@ -9420,17 +9905,22 @@ namespace detail {
9420
9905
  #define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
9421
9906
 
9422
9907
  #if HWY_ARCH_ARM_A64
9423
- #define HWY_IF_LOAD_INT(D) HWY_IF_V_SIZE_GT_D(D, 4)
9424
- #define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
9908
+ #define HWY_IF_LOAD_INT(D) \
9909
+ HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D)
9910
+ #define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
9911
+ HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
9912
+ HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
9425
9913
  #else
9426
- // Exclude 64x2 and f64x1, which are only supported on aarch64
9914
+ // Exclude 64x2 and f64x1, which are only supported on aarch64; also exclude any
9915
+ // emulated types.
9427
9916
  #define HWY_IF_LOAD_INT(D) \
9428
- HWY_IF_V_SIZE_GT_D(D, 4), \
9917
+ HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D), \
9429
9918
  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
9430
9919
  nullptr
9431
9920
  #define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
9432
9921
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
9433
9922
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
9923
+ HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
9434
9924
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
9435
9925
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
9436
9926
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
@@ -9480,7 +9970,8 @@ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
9480
9970
  }
9481
9971
 
9482
9972
  // <= 32 bits: avoid loading more than N bytes by copying to buffer
9483
- template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
9973
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
9974
+ typename T = TFromD<D>>
9484
9975
  HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
9485
9976
  VFromD<D>& v0, VFromD<D>& v1) {
9486
9977
  // The smallest vector registers are 64-bits and we want space for two.
@@ -9494,7 +9985,8 @@ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
9494
9985
 
9495
9986
  #if HWY_ARCH_ARM_V7
9496
9987
  // 64x2: split into two 64x1
9497
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
9988
+ template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
9989
+ HWY_NEON_IF_NOT_EMULATED_D(D)>
9498
9990
  HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
9499
9991
  Vec128<T>& v1) {
9500
9992
  const Half<decltype(d)> dh;
@@ -9519,7 +10011,8 @@ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
9519
10011
  }
9520
10012
 
9521
10013
  // <= 32 bits: avoid writing more than N bytes by copying to buffer
9522
- template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
10014
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
10015
+ typename T = TFromD<D>>
9523
10016
  HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
9524
10017
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
9525
10018
  // The smallest vector registers are 64-bits and we want space for three.
@@ -9534,7 +10027,8 @@ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
9534
10027
 
9535
10028
  #if HWY_ARCH_ARM_V7
9536
10029
  // 64x2: split into two 64x1
9537
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
10030
+ template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
10031
+ HWY_NEON_IF_NOT_EMULATED_D(D)>
9538
10032
  HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
9539
10033
  Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
9540
10034
  const Half<decltype(d)> dh;
@@ -9562,7 +10056,8 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
9562
10056
  }
9563
10057
 
9564
10058
  // <= 32 bits: avoid writing more than N bytes by copying to buffer
9565
- template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
10059
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
10060
+ typename T = TFromD<D>>
9566
10061
  HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
9567
10062
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
9568
10063
  VFromD<D>& v3) {
@@ -9578,7 +10073,8 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
9578
10073
 
9579
10074
  #if HWY_ARCH_ARM_V7
9580
10075
  // 64x2: split into two 64x1
9581
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
10076
+ template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
10077
+ HWY_NEON_IF_NOT_EMULATED_D(D)>
9582
10078
  HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
9583
10079
  Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
9584
10080
  Vec128<T>& v3) {
@@ -9605,17 +10101,22 @@ namespace detail {
9605
10101
  #define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
9606
10102
 
9607
10103
  #if HWY_ARCH_ARM_A64
9608
- #define HWY_IF_STORE_INT(D) HWY_IF_V_SIZE_GT_D(D, 4)
9609
- #define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
10104
+ #define HWY_IF_STORE_INT(D) \
10105
+ HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D)
10106
+ #define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
10107
+ HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
10108
+ HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
9610
10109
  #else
9611
- // Exclude 64x2 and f64x1, which are only supported on aarch64
10110
+ // Exclude 64x2 and f64x1, which are only supported on aarch64; also exclude any
10111
+ // emulated types.
9612
10112
  #define HWY_IF_STORE_INT(D) \
9613
- HWY_IF_V_SIZE_GT_D(D, 4), \
10113
+ HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D), \
9614
10114
  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
9615
10115
  nullptr
9616
10116
  #define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
9617
10117
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
9618
10118
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
10119
+ HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
9619
10120
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
9620
10121
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
9621
10122
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
@@ -9650,7 +10151,8 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
9650
10151
  }
9651
10152
 
9652
10153
  // <= 32 bits: avoid writing more than N bytes by copying to buffer
9653
- template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
10154
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
10155
+ typename T = TFromD<D>>
9654
10156
  HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
9655
10157
  T* HWY_RESTRICT unaligned) {
9656
10158
  alignas(16) T buf[2 * 8 / sizeof(T)];
@@ -9661,7 +10163,8 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
9661
10163
 
9662
10164
  #if HWY_ARCH_ARM_V7
9663
10165
  // 64x2: split into two 64x1
9664
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
10166
+ template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
10167
+ HWY_NEON_IF_NOT_EMULATED_D(D)>
9665
10168
  HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d,
9666
10169
  T* HWY_RESTRICT unaligned) {
9667
10170
  const Half<decltype(d)> dh;
@@ -9682,7 +10185,8 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
9682
10185
  }
9683
10186
 
9684
10187
  // <= 32 bits: avoid writing more than N bytes by copying to buffer
9685
- template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
10188
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
10189
+ typename T = TFromD<D>>
9686
10190
  HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
9687
10191
  T* HWY_RESTRICT unaligned) {
9688
10192
  alignas(16) T buf[3 * 8 / sizeof(T)];
@@ -9693,7 +10197,8 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
9693
10197
 
9694
10198
  #if HWY_ARCH_ARM_V7
9695
10199
  // 64x2: split into two 64x1
9696
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
10200
+ template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
10201
+ HWY_NEON_IF_NOT_EMULATED_D(D)>
9697
10202
  HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
9698
10203
  T* HWY_RESTRICT unaligned) {
9699
10204
  const Half<decltype(d)> dh;
@@ -9714,7 +10219,8 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
9714
10219
  }
9715
10220
 
9716
10221
  // <= 32 bits: avoid writing more than N bytes by copying to buffer
9717
- template <class D, HWY_IF_V_SIZE_LE_D(D, 4), typename T = TFromD<D>>
10222
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
10223
+ typename T = TFromD<D>>
9718
10224
  HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
9719
10225
  VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
9720
10226
  alignas(16) T buf[4 * 8 / sizeof(T)];
@@ -9725,7 +10231,8 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
9725
10231
 
9726
10232
  #if HWY_ARCH_ARM_V7
9727
10233
  // 64x2: split into two 64x1
9728
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
10234
+ template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
10235
+ HWY_NEON_IF_NOT_EMULATED_D(D)>
9729
10236
  HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
9730
10237
  Vec128<T> v3, D d, T* HWY_RESTRICT unaligned) {
9731
10238
  const Half<decltype(d)> dh;
@@ -9740,6 +10247,9 @@ HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
9740
10247
 
9741
10248
  #undef HWY_IF_STORE_INT
9742
10249
 
10250
+ // Fall back on generic Load/StoreInterleaved[234] for any emulated types.
10251
+ // Requires that HWY_GENERIC_IF_EMULATED_D mirrors HWY_NEON_IF_EMULATED_D.
10252
+
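These interleaved loads/stores back ops such as LoadInterleaved3/StoreInterleaved3, typically used for packed RGB data. A hedged sketch (HalveGreen is an illustrative name; assumes num_pixels is a multiple of the lane count):

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Deinterleave RGB bytes, halve the green channel, reinterleave in place.
void HalveGreen(uint8_t* HWY_RESTRICT rgb, size_t num_pixels) {
  const hn::ScalableTag<uint8_t> d;
  hn::VFromD<decltype(d)> r, g, b;
  for (size_t i = 0; i < num_pixels; i += hn::Lanes(d)) {
    hn::LoadInterleaved3(d, rgb + 3 * i, r, g, b);
    g = hn::ShiftRight<1>(g);
    hn::StoreInterleaved3(r, g, b, d, rgb + 3 * i);
  }
}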
9743
10253
  // ------------------------------ Additional mask logical operations
9744
10254
  template <class T>
9745
10255
  HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
@@ -10066,6 +10576,7 @@ namespace detail { // for code folding
10066
10576
  #undef HWY_NEON_DEF_FUNCTION_UINTS
10067
10577
  #undef HWY_NEON_EVAL
10068
10578
  #undef HWY_NEON_IF_EMULATED_D
10579
+ #undef HWY_NEON_IF_NOT_EMULATED_D
10069
10580
  } // namespace detail
10070
10581
 
10071
10582
  // NOLINTNEXTLINE(google-readability-namespace-comments)