@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +2 -0
- package/include/aom/aomcx.h +106 -25
- package/include/ffi.h +3 -3
- package/include/freetype2/freetype/config/ftconfig.h +1 -1
- package/include/freetype2/freetype/config/ftheader.h +1 -1
- package/include/freetype2/freetype/config/ftoption.h +37 -12
- package/include/freetype2/freetype/config/ftstdlib.h +1 -1
- package/include/freetype2/freetype/config/integer-types.h +29 -2
- package/include/freetype2/freetype/config/mac-support.h +1 -1
- package/include/freetype2/freetype/config/public-macros.h +3 -3
- package/include/freetype2/freetype/freetype.h +51 -47
- package/include/freetype2/freetype/ftadvanc.h +1 -1
- package/include/freetype2/freetype/ftbbox.h +1 -1
- package/include/freetype2/freetype/ftbdf.h +1 -1
- package/include/freetype2/freetype/ftbitmap.h +1 -1
- package/include/freetype2/freetype/ftbzip2.h +1 -1
- package/include/freetype2/freetype/ftcache.h +1 -1
- package/include/freetype2/freetype/ftcid.h +1 -1
- package/include/freetype2/freetype/ftcolor.h +13 -4
- package/include/freetype2/freetype/ftdriver.h +3 -3
- package/include/freetype2/freetype/fterrdef.h +1 -1
- package/include/freetype2/freetype/fterrors.h +1 -1
- package/include/freetype2/freetype/ftfntfmt.h +1 -1
- package/include/freetype2/freetype/ftgasp.h +1 -1
- package/include/freetype2/freetype/ftglyph.h +1 -1
- package/include/freetype2/freetype/ftgxval.h +1 -1
- package/include/freetype2/freetype/ftgzip.h +1 -1
- package/include/freetype2/freetype/ftimage.h +6 -2
- package/include/freetype2/freetype/ftincrem.h +1 -1
- package/include/freetype2/freetype/ftlcdfil.h +1 -1
- package/include/freetype2/freetype/ftlist.h +1 -1
- package/include/freetype2/freetype/ftlogging.h +184 -0
- package/include/freetype2/freetype/ftlzw.h +1 -1
- package/include/freetype2/freetype/ftmac.h +1 -1
- package/include/freetype2/freetype/ftmm.h +159 -103
- package/include/freetype2/freetype/ftmodapi.h +1 -1
- package/include/freetype2/freetype/ftmoderr.h +1 -1
- package/include/freetype2/freetype/ftotval.h +1 -1
- package/include/freetype2/freetype/ftoutln.h +1 -1
- package/include/freetype2/freetype/ftparams.h +1 -1
- package/include/freetype2/freetype/ftpfr.h +1 -1
- package/include/freetype2/freetype/ftrender.h +1 -1
- package/include/freetype2/freetype/ftsizes.h +1 -1
- package/include/freetype2/freetype/ftsnames.h +1 -1
- package/include/freetype2/freetype/ftstroke.h +1 -1
- package/include/freetype2/freetype/ftsynth.h +1 -1
- package/include/freetype2/freetype/ftsystem.h +1 -1
- package/include/freetype2/freetype/fttrigon.h +1 -1
- package/include/freetype2/freetype/fttypes.h +1 -1
- package/include/freetype2/freetype/ftwinfnt.h +2 -3
- package/include/freetype2/freetype/otsvg.h +1 -1
- package/include/freetype2/freetype/t1tables.h +1 -1
- package/include/freetype2/freetype/ttnameid.h +129 -129
- package/include/freetype2/freetype/tttables.h +8 -5
- package/include/freetype2/freetype/tttags.h +1 -1
- package/include/freetype2/ft2build.h +1 -1
- package/include/glib-2.0/gio/gdbuserror.h +9 -8
- package/include/glib-2.0/gio/ginetaddress.h +12 -0
- package/include/glib-2.0/gio/gioenums.h +9 -2
- package/include/glib-2.0/glib/gstring.h +2 -2
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/gobject/glib-types.h +1 -1
- package/include/glib-2.0/gobject/gparam.h +1 -1
- package/include/glib-2.0/gobject/gvalue.h +78 -35
- package/include/harfbuzz/hb-script-list.h +12 -0
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/libxml2/libxml/valid.h +0 -3
- package/include/libxml2/libxml/xmlerror.h +1 -1
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/vips/connection.h +4 -4
- package/include/vips/version.h +4 -4
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +13 -13
|
@@ -72,10 +72,12 @@ struct Vec1 {
|
|
|
72
72
|
|
|
73
73
|
// 0 or FF..FF, same size as Vec1.
|
|
74
74
|
template <typename T>
|
|
75
|
-
|
|
75
|
+
struct Mask1 {
|
|
76
76
|
using Raw = hwy::MakeUnsigned<T>;
|
|
77
77
|
|
|
78
|
-
|
|
78
|
+
using PrivateT = T; // only for DFromM
|
|
79
|
+
static constexpr size_t kPrivateN = 1; // only for DFromM
|
|
80
|
+
|
|
79
81
|
static HWY_INLINE Mask1<T> FromBool(bool b) {
|
|
80
82
|
Mask1<T> mask;
|
|
81
83
|
mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
|
|
@@ -88,6 +90,9 @@ class Mask1 {
|
|
|
88
90
|
template <class V>
|
|
89
91
|
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
|
|
90
92
|
|
|
93
|
+
template <class M>
|
|
94
|
+
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
|
|
95
|
+
|
|
91
96
|
template <class V>
|
|
92
97
|
using TFromV = typename V::PrivateT;
|
|
93
98
|
|
|
@@ -288,13 +293,6 @@ HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
|
|
|
288
293
|
template <class D>
|
|
289
294
|
using MFromD = decltype(MaskFromVec(VFromD<D>()));
|
|
290
295
|
|
|
291
|
-
template <typename T>
|
|
292
|
-
Vec1<T> VecFromMask(const Mask1<T> mask) {
|
|
293
|
-
Vec1<T> v;
|
|
294
|
-
CopySameSize(&mask, &v);
|
|
295
|
-
return v;
|
|
296
|
-
}
|
|
297
|
-
|
|
298
296
|
template <class D, typename T = TFromD<D>>
|
|
299
297
|
Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) {
|
|
300
298
|
Vec1<T> v;
|
|
@@ -302,6 +300,11 @@ Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) {
|
|
|
302
300
|
return v;
|
|
303
301
|
}
|
|
304
302
|
|
|
303
|
+
template <class D>
|
|
304
|
+
uint64_t BitsFromMask(D, MFromD<D> mask) {
|
|
305
|
+
return mask.bits ? 1 : 0;
|
|
306
|
+
}
|
|
307
|
+
|
|
305
308
|
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
|
|
306
309
|
HWY_API Mask1<T> FirstN(D /*tag*/, size_t n) {
|
|
307
310
|
return Mask1<T>::FromBool(n != 0);
|
|
@@ -607,13 +610,23 @@ HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
|
|
|
607
610
|
|
|
608
611
|
// Returns (a + b + 1) / 2
|
|
609
612
|
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
613
|
+
#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
|
|
614
|
+
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
|
|
615
|
+
#else
|
|
616
|
+
#define HWY_NATIVE_AVERAGE_ROUND_UI32
|
|
617
|
+
#endif
|
|
618
|
+
|
|
619
|
+
#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
|
|
620
|
+
#undef HWY_NATIVE_AVERAGE_ROUND_UI64
|
|
621
|
+
#else
|
|
622
|
+
#define HWY_NATIVE_AVERAGE_ROUND_UI64
|
|
623
|
+
#endif
|
|
624
|
+
|
|
625
|
+
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
626
|
+
HWY_API Vec1<T> AverageRound(const Vec1<T> a, const Vec1<T> b) {
|
|
627
|
+
const T a_val = a.raw;
|
|
628
|
+
const T b_val = b.raw;
|
|
629
|
+
return Vec1<T>(static_cast<T>((a_val | b_val) - ScalarShr(a_val ^ b_val, 1)));
|
|
617
630
|
}
|
|
618
631
|
|
|
619
632
|
// ------------------------------ Absolute value
|
|
@@ -721,6 +734,11 @@ HWY_API Vec1<MakeWide<T>> MulEven(const Vec1<T> a, const Vec1<T> b) {
|
|
|
721
734
|
return Vec1<TW>(static_cast<TW>(a_wide * b.raw));
|
|
722
735
|
}
|
|
723
736
|
|
|
737
|
+
template <class T>
|
|
738
|
+
HWY_API Vec1<MakeWide<T>> MulOdd(const Vec1<T>, const Vec1<T>) {
|
|
739
|
+
static_assert(sizeof(T) == 0, "There are no odd lanes");
|
|
740
|
+
}
|
|
741
|
+
|
|
724
742
|
// Approximate reciprocal
|
|
725
743
|
HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
|
|
726
744
|
// Zero inputs are allowed, but callers are responsible for replacing the
|
|
@@ -831,9 +849,9 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
|
|
|
831
849
|
}
|
|
832
850
|
|
|
833
851
|
// Round-to-nearest even.
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
using TI =
|
|
852
|
+
template <class T, HWY_IF_FLOAT3264(T)>
|
|
853
|
+
HWY_API Vec1<MakeSigned<T>> NearestInt(const Vec1<T> v) {
|
|
854
|
+
using TI = MakeSigned<T>;
|
|
837
855
|
|
|
838
856
|
const T abs = Abs(v).raw;
|
|
839
857
|
const bool is_sign = ScalarSignBit(v.raw);
|
|
@@ -843,12 +861,39 @@ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
|
|
|
843
861
|
if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
|
|
844
862
|
return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
|
|
845
863
|
}
|
|
846
|
-
return Vec1<
|
|
864
|
+
return Vec1<TI>(ConvertScalarTo<TI>(v.raw));
|
|
847
865
|
}
|
|
848
866
|
const T bias =
|
|
849
867
|
ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5);
|
|
850
868
|
const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
|
|
851
|
-
if (rounded == 0) return Vec1<
|
|
869
|
+
if (rounded == 0) return Vec1<TI>(0);
|
|
870
|
+
TI offset = 0;
|
|
871
|
+
// Round to even
|
|
872
|
+
if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
|
|
873
|
+
ConvertScalarTo<T>(0.5)) {
|
|
874
|
+
offset = is_sign ? -1 : 1;
|
|
875
|
+
}
|
|
876
|
+
return Vec1<TI>(rounded - offset);
|
|
877
|
+
}
|
|
878
|
+
|
|
879
|
+
// Round-to-nearest even.
|
|
880
|
+
template <class DI32, HWY_IF_I32_D(DI32)>
|
|
881
|
+
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 /*di32*/, const Vec1<double> v) {
|
|
882
|
+
using T = double;
|
|
883
|
+
using TI = int32_t;
|
|
884
|
+
|
|
885
|
+
const T abs = Abs(v).raw;
|
|
886
|
+
const bool is_sign = ScalarSignBit(v.raw);
|
|
887
|
+
|
|
888
|
+
// Check if too large to cast or NaN
|
|
889
|
+
if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
|
|
890
|
+
return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
const T bias =
|
|
894
|
+
ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5);
|
|
895
|
+
const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
|
|
896
|
+
if (rounded == 0) return Vec1<TI>(0);
|
|
852
897
|
TI offset = 0;
|
|
853
898
|
// Round to even
|
|
854
899
|
if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
|
|
@@ -1612,12 +1657,22 @@ HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) {
|
|
|
1612
1657
|
}
|
|
1613
1658
|
|
|
1614
1659
|
// ------------------------------ SwapAdjacentBlocks
|
|
1615
|
-
|
|
1616
1660
|
template <typename T>
|
|
1617
1661
|
HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) {
|
|
1618
1662
|
return v;
|
|
1619
1663
|
}
|
|
1620
1664
|
|
|
1665
|
+
// ------------------------------ InterleaveEvenBlocks
|
|
1666
|
+
template <class D, class V = VFromD<D>>
|
|
1667
|
+
HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
|
|
1668
|
+
return a;
|
|
1669
|
+
}
|
|
1670
|
+
// ------------------------------ InterleaveOddBlocks
|
|
1671
|
+
template <class D, class V = VFromD<D>>
|
|
1672
|
+
HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
|
|
1673
|
+
return a;
|
|
1674
|
+
}
|
|
1675
|
+
|
|
1621
1676
|
// ------------------------------ TableLookupLanes
|
|
1622
1677
|
|
|
1623
1678
|
// Returned by SetTableIndices for use by TableLookupLanes.
|
|
@@ -68,10 +68,17 @@
|
|
|
68
68
|
#define HWY_TARGET_IS_PPC 0
|
|
69
69
|
#endif
|
|
70
70
|
|
|
71
|
+
#undef HWY_TARGET_IS_AVX10_2
|
|
72
|
+
#if HWY_TARGET == HWY_AVX10_2
|
|
73
|
+
#define HWY_TARGET_IS_AVX10_2 1
|
|
74
|
+
#else
|
|
75
|
+
#define HWY_TARGET_IS_AVX10_2 0
|
|
76
|
+
#endif
|
|
77
|
+
|
|
71
78
|
// Supported on all targets except RVV (requires GCC 14 or upcoming Clang)
|
|
72
79
|
#if HWY_TARGET == HWY_RVV && \
|
|
73
80
|
((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
|
|
74
|
-
(HWY_COMPILER_CLANG))
|
|
81
|
+
(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1700))
|
|
75
82
|
#define HWY_HAVE_TUPLE 0
|
|
76
83
|
#else
|
|
77
84
|
#define HWY_HAVE_TUPLE 1
|
|
@@ -133,13 +140,28 @@
|
|
|
133
140
|
// Include previous targets, which are the half-vectors of the next target.
|
|
134
141
|
#define HWY_TARGET_STR_AVX2 \
|
|
135
142
|
HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
143
|
+
|
|
144
|
+
#if (HWY_COMPILER_GCC_ACTUAL >= 1400 && HWY_COMPILER_GCC_ACTUAL < 1600) || \
|
|
145
|
+
HWY_COMPILER_CLANG >= 1800
|
|
146
|
+
#define HWY_TARGET_STR_AVX3_VL512 ",evex512"
|
|
147
|
+
#else
|
|
148
|
+
#define HWY_TARGET_STR_AVX3_VL512
|
|
149
|
+
#endif
|
|
150
|
+
|
|
151
|
+
#define HWY_TARGET_STR_AVX3_256 \
|
|
152
|
+
HWY_TARGET_STR_AVX2 \
|
|
153
|
+
",avx512f,avx512cd,avx512vl,avx512dq,avx512bw" HWY_TARGET_STR_AVX3_VL512
|
|
154
|
+
|
|
155
|
+
#define HWY_TARGET_STR_AVX3 HWY_TARGET_STR_AVX3_256 HWY_TARGET_STR_AVX3_VL512
|
|
156
|
+
|
|
157
|
+
#define HWY_TARGET_STR_AVX3_DL_256 \
|
|
158
|
+
HWY_TARGET_STR_AVX3_256 \
|
|
140
159
|
",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
|
|
141
160
|
"avx512vpopcntdq,gfni"
|
|
142
161
|
|
|
162
|
+
#define HWY_TARGET_STR_AVX3_DL \
|
|
163
|
+
HWY_TARGET_STR_AVX3_DL_256 HWY_TARGET_STR_AVX3_VL512
|
|
164
|
+
|
|
143
165
|
// Force-disable for compilers that do not properly support avx512bf16.
|
|
144
166
|
#if !defined(HWY_AVX3_DISABLE_AVX512BF16) && \
|
|
145
167
|
(HWY_COMPILER_CLANGCL || \
|
|
@@ -149,12 +171,30 @@
|
|
|
149
171
|
#endif
|
|
150
172
|
|
|
151
173
|
#if !defined(HWY_AVX3_DISABLE_AVX512BF16)
|
|
152
|
-
#define
|
|
174
|
+
#define HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
|
|
175
|
+
#else
|
|
176
|
+
#define HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_DL
|
|
177
|
+
#endif
|
|
178
|
+
|
|
179
|
+
#define HWY_TARGET_STR_AVX3_ZEN4 \
|
|
180
|
+
HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_VL512
|
|
181
|
+
|
|
182
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1400
|
|
183
|
+
#define HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_ZEN4_256 ",avx512fp16"
|
|
153
184
|
#else
|
|
154
|
-
#define
|
|
185
|
+
#define HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_ZEN4_256
|
|
155
186
|
#endif
|
|
156
187
|
|
|
157
|
-
#define HWY_TARGET_STR_AVX3_SPR
|
|
188
|
+
#define HWY_TARGET_STR_AVX3_SPR \
|
|
189
|
+
HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_VL512
|
|
190
|
+
|
|
191
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 1500
|
|
192
|
+
#define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2"
|
|
193
|
+
#elif HWY_COMPILER_CLANG >= 2000
|
|
194
|
+
#define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2-512"
|
|
195
|
+
#else
|
|
196
|
+
#define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR
|
|
197
|
+
#endif
|
|
158
198
|
|
|
159
199
|
#if defined(HWY_DISABLE_PPC8_CRYPTO)
|
|
160
200
|
#define HWY_TARGET_STR_PPC8_CRYPTO ""
|
|
@@ -277,9 +317,10 @@
|
|
|
277
317
|
#define HWY_TARGET_STR HWY_TARGET_STR_AVX2
|
|
278
318
|
|
|
279
319
|
//-----------------------------------------------------------------------------
|
|
280
|
-
// AVX3[_DL]
|
|
281
|
-
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL ||
|
|
282
|
-
HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
|
|
320
|
+
// AVX3[_DL]/AVX10
|
|
321
|
+
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
|
|
322
|
+
HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR || \
|
|
323
|
+
HWY_TARGET == HWY_AVX10_2
|
|
283
324
|
|
|
284
325
|
#define HWY_ALIGN alignas(64)
|
|
285
326
|
#define HWY_MAX_BYTES 64
|
|
@@ -287,10 +328,9 @@
|
|
|
287
328
|
|
|
288
329
|
#define HWY_HAVE_SCALABLE 0
|
|
289
330
|
#define HWY_HAVE_INTEGER64 1
|
|
290
|
-
#if HWY_TARGET
|
|
331
|
+
#if HWY_TARGET <= HWY_AVX3_SPR && \
|
|
332
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1901) && \
|
|
291
333
|
HWY_HAVE_SCALAR_F16_TYPE
|
|
292
|
-
// TODO: enable F16 for AVX3_SPR target with Clang once compilation issues are
|
|
293
|
-
// fixed
|
|
294
334
|
#define HWY_HAVE_FLOAT16 1
|
|
295
335
|
#else
|
|
296
336
|
#define HWY_HAVE_FLOAT16 0
|
|
@@ -304,7 +344,12 @@
|
|
|
304
344
|
#define HWY_NATIVE_DOT_BF16 0
|
|
305
345
|
#endif
|
|
306
346
|
#define HWY_CAP_GE256 1
|
|
347
|
+
|
|
348
|
+
#if HWY_MAX_BYTES >= 64
|
|
307
349
|
#define HWY_CAP_GE512 1
|
|
350
|
+
#else
|
|
351
|
+
#define HWY_CAP_GE512 0
|
|
352
|
+
#endif
|
|
308
353
|
|
|
309
354
|
#if HWY_TARGET == HWY_AVX3
|
|
310
355
|
|
|
@@ -326,6 +371,11 @@
|
|
|
326
371
|
#define HWY_NAMESPACE N_AVX3_SPR
|
|
327
372
|
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR
|
|
328
373
|
|
|
374
|
+
#elif HWY_TARGET == HWY_AVX10_2
|
|
375
|
+
|
|
376
|
+
#define HWY_NAMESPACE N_AVX10_2
|
|
377
|
+
#define HWY_TARGET_STR HWY_TARGET_STR_AVX10_2
|
|
378
|
+
|
|
329
379
|
#else
|
|
330
380
|
#error "Logic error"
|
|
331
381
|
#endif // HWY_TARGET
|
|
@@ -403,6 +453,29 @@
|
|
|
403
453
|
// NEON
|
|
404
454
|
#elif HWY_TARGET_IS_NEON
|
|
405
455
|
|
|
456
|
+
// Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
|
|
457
|
+
#undef HWY_NEON_HAVE_BFLOAT16
|
|
458
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE && \
|
|
459
|
+
((HWY_TARGET == HWY_NEON_BF16 && \
|
|
460
|
+
(!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
|
|
461
|
+
defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
|
|
462
|
+
#define HWY_NEON_HAVE_BFLOAT16 1
|
|
463
|
+
#else
|
|
464
|
+
#define HWY_NEON_HAVE_BFLOAT16 0
|
|
465
|
+
#endif
|
|
466
|
+
|
|
467
|
+
// HWY_NEON_HAVE_F32_TO_BF16C is defined if NEON vcvt_bf16_f32 and
|
|
468
|
+
// vbfdot_f32 are available, even if the __bf16 type is disabled due to
|
|
469
|
+
// GCC/Clang bugs.
|
|
470
|
+
#undef HWY_NEON_HAVE_F32_TO_BF16C
|
|
471
|
+
#if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
|
|
472
|
+
(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
|
|
473
|
+
(HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
|
|
474
|
+
#define HWY_NEON_HAVE_F32_TO_BF16C 1
|
|
475
|
+
#else
|
|
476
|
+
#define HWY_NEON_HAVE_F32_TO_BF16C 0
|
|
477
|
+
#endif
|
|
478
|
+
|
|
406
479
|
#define HWY_ALIGN alignas(16)
|
|
407
480
|
#define HWY_MAX_BYTES 16
|
|
408
481
|
#define HWY_LANES(T) (16 / sizeof(T))
|
|
@@ -428,7 +501,8 @@
|
|
|
428
501
|
#else
|
|
429
502
|
#define HWY_NATIVE_FMA 0
|
|
430
503
|
#endif
|
|
431
|
-
|
|
504
|
+
|
|
505
|
+
#if HWY_NEON_HAVE_F32_TO_BF16C
|
|
432
506
|
#define HWY_NATIVE_DOT_BF16 1
|
|
433
507
|
#else
|
|
434
508
|
#define HWY_NATIVE_DOT_BF16 0
|
|
@@ -480,7 +554,12 @@
|
|
|
480
554
|
#endif
|
|
481
555
|
|
|
482
556
|
#if HWY_TARGET == HWY_NEON_WITHOUT_AES
|
|
557
|
+
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
|
|
558
|
+
// Prevents inadvertent use of SVE by GCC 13.4 and earlier, see #2689.
|
|
559
|
+
#define HWY_TARGET_STR "+nosve"
|
|
560
|
+
#else
|
|
483
561
|
// Do not define HWY_TARGET_STR (no pragma).
|
|
562
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
484
563
|
#elif HWY_TARGET == HWY_NEON
|
|
485
564
|
#define HWY_TARGET_STR HWY_TARGET_STR_NEON
|
|
486
565
|
#elif HWY_TARGET == HWY_NEON_BF16
|
|
@@ -586,7 +665,7 @@
|
|
|
586
665
|
#define HWY_HAVE_SCALABLE 0
|
|
587
666
|
#define HWY_HAVE_INTEGER64 1
|
|
588
667
|
#define HWY_HAVE_FLOAT16 0
|
|
589
|
-
#define HWY_HAVE_FLOAT64
|
|
668
|
+
#define HWY_HAVE_FLOAT64 1
|
|
590
669
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
591
670
|
#define HWY_NATIVE_FMA 0
|
|
592
671
|
#define HWY_NATIVE_DOT_BF16 0
|
|
@@ -629,8 +708,50 @@
|
|
|
629
708
|
|
|
630
709
|
#define HWY_NAMESPACE N_RVV
|
|
631
710
|
|
|
711
|
+
#if HWY_COMPILER_CLANG >= 1900
|
|
712
|
+
// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#181-zvl-minimum-vector-length-standard-extensions
|
|
713
|
+
#define HWY_TARGET_STR "arch=+v"
|
|
714
|
+
#else
|
|
715
|
+
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
|
|
716
|
+
#endif
|
|
717
|
+
|
|
718
|
+
//-----------------------------------------------------------------------------
|
|
719
|
+
// LSX/LASX
|
|
720
|
+
#elif HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
|
|
721
|
+
|
|
722
|
+
#if HWY_TARGET == HWY_LSX
|
|
723
|
+
#define HWY_ALIGN alignas(16)
|
|
724
|
+
#define HWY_MAX_BYTES 16
|
|
725
|
+
#else
|
|
726
|
+
#define HWY_ALIGN alignas(32)
|
|
727
|
+
#define HWY_MAX_BYTES 32
|
|
728
|
+
#endif
|
|
729
|
+
|
|
730
|
+
#define HWY_LANES(T) (HWY_MAX_BYTES / sizeof(T))
|
|
731
|
+
|
|
732
|
+
#define HWY_HAVE_SCALABLE 0
|
|
733
|
+
#define HWY_HAVE_INTEGER64 1
|
|
734
|
+
#define HWY_HAVE_FLOAT16 0
|
|
735
|
+
#define HWY_HAVE_FLOAT64 1
|
|
736
|
+
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
737
|
+
#define HWY_NATIVE_FMA 1
|
|
738
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
739
|
+
|
|
740
|
+
#if HWY_TARGET == HWY_LSX
|
|
741
|
+
#define HWY_CAP_GE256 0
|
|
742
|
+
#else
|
|
743
|
+
#define HWY_CAP_GE256 1
|
|
744
|
+
#endif
|
|
745
|
+
|
|
746
|
+
#define HWY_CAP_GE512 0
|
|
747
|
+
|
|
748
|
+
#if HWY_TARGET == HWY_LSX
|
|
749
|
+
#define HWY_NAMESPACE N_LSX
|
|
750
|
+
#else
|
|
751
|
+
#define HWY_NAMESPACE N_LASX
|
|
752
|
+
#endif
|
|
753
|
+
|
|
632
754
|
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
|
|
633
|
-
// (rv64gcv is not a valid target)
|
|
634
755
|
|
|
635
756
|
//-----------------------------------------------------------------------------
|
|
636
757
|
// EMU128
|
|
@@ -152,9 +152,20 @@ constexpr size_t ScaleByPower(size_t N, int pow2) {
|
|
|
152
152
|
return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
|
|
153
153
|
}
|
|
154
154
|
|
|
155
|
+
template <typename T>
|
|
156
|
+
HWY_INLINE void MaybePoison(T* HWY_RESTRICT unaligned, size_t count) {
|
|
157
|
+
#if HWY_IS_MSAN
|
|
158
|
+
__msan_poison(unaligned, count * sizeof(T));
|
|
159
|
+
#else
|
|
160
|
+
(void)unaligned;
|
|
161
|
+
(void)count;
|
|
162
|
+
#endif
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
// This can be useful for working around MSAN limitations. For example, prior
|
|
166
|
+
// to Clang 16, it did not understand AVX-512 CompressStore.
|
|
155
167
|
template <typename T>
|
|
156
168
|
HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) {
|
|
157
|
-
// Workaround for MSAN not marking compressstore as initialized (b/233326619)
|
|
158
169
|
#if HWY_IS_MSAN
|
|
159
170
|
__msan_unpoison(unaligned, count * sizeof(T));
|
|
160
171
|
#else
|
|
@@ -448,13 +459,32 @@ HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
|
|
|
448
459
|
return HWY_MAX_LANES_D(D);
|
|
449
460
|
}
|
|
450
461
|
|
|
451
|
-
#
|
|
462
|
+
#undef HWY_HAVE_CONSTEXPR_LANES
|
|
463
|
+
#undef HWY_LANES_CONSTEXPR
|
|
452
464
|
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
465
|
+
#if HWY_HAVE_SCALABLE
|
|
466
|
+
#define HWY_HAVE_CONSTEXPR_LANES 0
|
|
467
|
+
#define HWY_LANES_CONSTEXPR
|
|
468
|
+
#else
|
|
469
|
+
|
|
470
|
+
// We want Lanes() to be constexpr where possible, so that compilers are able to
|
|
471
|
+
// precompute offsets. However, user code must not depend on the constexpr,
|
|
472
|
+
// because that will fail for RISC-V V and Arm SVE. To achieve both, we mark it
|
|
473
|
+
// as non-constexpr in debug builds, but not sanitizers, because we typically
|
|
474
|
+
// want them to see the same code.
|
|
475
|
+
#if HWY_IS_DEBUG_BUILD && !HWY_IS_SANITIZER
|
|
476
|
+
#define HWY_HAVE_CONSTEXPR_LANES 0
|
|
477
|
+
#define HWY_LANES_CONSTEXPR
|
|
478
|
+
#else
|
|
479
|
+
#define HWY_HAVE_CONSTEXPR_LANES 1
|
|
480
|
+
#define HWY_LANES_CONSTEXPR constexpr
|
|
481
|
+
#endif
|
|
482
|
+
|
|
483
|
+
// Returns actual vector length, used when advancing loop counters. The
|
|
484
|
+
// non-constexpr implementations are defined in their target's header. For a
|
|
485
|
+
// guaranteed-constexpr upper bound, use `MaxLanes(d)`.
|
|
456
486
|
template <class D>
|
|
457
|
-
HWY_INLINE HWY_MAYBE_UNUSED
|
|
487
|
+
HWY_INLINE HWY_MAYBE_UNUSED HWY_LANES_CONSTEXPR size_t Lanes(D) {
|
|
458
488
|
return HWY_MAX_LANES_D(D);
|
|
459
489
|
}
|
|
460
490
|
|
|
@@ -621,8 +651,11 @@ HWY_API bool IsAligned(D d, T* ptr) {
|
|
|
621
651
|
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
622
652
|
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
623
653
|
#define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
654
|
+
#define HWY_IF_FLOAT3264_V(V) HWY_IF_FLOAT3264(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
624
655
|
#define HWY_IF_SPECIAL_FLOAT_V(V) \
|
|
625
656
|
HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
657
|
+
#define HWY_IF_FLOAT_OR_SPECIAL_V(V) \
|
|
658
|
+
HWY_IF_FLOAT_OR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
626
659
|
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \
|
|
627
660
|
HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
628
661
|
|
|
@@ -633,7 +666,7 @@ HWY_API bool IsAligned(D d, T* ptr) {
|
|
|
633
666
|
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \
|
|
634
667
|
HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromV<V>, bit_array)
|
|
635
668
|
|
|
636
|
-
#define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(DFromV<V>)
|
|
669
|
+
#define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(hwy::HWY_NAMESPACE::DFromV<V>)
|
|
637
670
|
#define HWY_IF_V_SIZE_V(V, bytes) \
|
|
638
671
|
HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
|
|
639
672
|
#define HWY_IF_V_SIZE_LE_V(V, bytes) \
|
|
@@ -656,15 +689,22 @@ HWY_API bool IsAligned(D d, T* ptr) {
|
|
|
656
689
|
#define HWY_IF_MINMAX_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)
|
|
657
690
|
|
|
658
691
|
#undef HWY_IF_ADDSUB_V
|
|
659
|
-
#define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
|
|
692
|
+
#define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(hwy::HWY_NAMESPACE::DFromV<V>, 1)
|
|
660
693
|
|
|
661
694
|
#undef HWY_IF_MULADDSUB_V
|
|
662
|
-
#define HWY_IF_MULADDSUB_V(V)
|
|
695
|
+
#define HWY_IF_MULADDSUB_V(V) \
|
|
696
|
+
HWY_IF_LANES_GT_D(hwy::HWY_NAMESPACE::DFromV<V>, 1)
|
|
697
|
+
|
|
698
|
+
#undef HWY_IF_PAIRWISE_ADD_128_D
|
|
699
|
+
#define HWY_IF_PAIRWISE_ADD_128_D(D) HWY_IF_V_SIZE_GT_D(D, 8)
|
|
700
|
+
|
|
701
|
+
#undef HWY_IF_PAIRWISE_SUB_128_D
|
|
702
|
+
#define HWY_IF_PAIRWISE_SUB_128_D(D) HWY_IF_V_SIZE_GT_D(D, 8)
|
|
663
703
|
|
|
664
704
|
// HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V is used to disable the default
|
|
665
705
|
// implementation of unsigned to signed DemoteTo/ReorderDemote2To in
|
|
666
706
|
// generic_ops-inl.h for at least some of the unsigned to signed demotions on
|
|
667
|
-
// SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2
|
|
707
|
+
// SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2/LSX/LASX
|
|
668
708
|
|
|
669
709
|
#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
|
|
670
710
|
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) void* = nullptr
|