@img/sharp-libvips-dev 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
|
@@ -111,9 +111,6 @@ HWY_API Vec1<T> Zero(D /* tag */) {
|
|
|
111
111
|
template <class D>
|
|
112
112
|
using VFromD = decltype(Zero(D()));
|
|
113
113
|
|
|
114
|
-
// ------------------------------ Tuple (VFromD)
|
|
115
|
-
#include "hwy/ops/tuple-inl.h"
|
|
116
|
-
|
|
117
114
|
// ------------------------------ Set
|
|
118
115
|
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
|
|
119
116
|
HWY_API Vec1<T> Set(D /* tag */, const T2 t) {
|
|
@@ -335,8 +332,7 @@ HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
|
|
|
335
332
|
// ------------------------------ BroadcastSignBit
|
|
336
333
|
template <typename T>
|
|
337
334
|
HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
|
|
338
|
-
|
|
339
|
-
return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
|
|
335
|
+
return Vec1<T>(ScalarShr(v.raw, sizeof(T) * 8 - 1));
|
|
340
336
|
}
|
|
341
337
|
|
|
342
338
|
// ------------------------------ PopulationCount
|
|
@@ -380,15 +376,6 @@ HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
|
|
|
380
376
|
return vi.raw < 0 ? yes : no;
|
|
381
377
|
}
|
|
382
378
|
|
|
383
|
-
template <typename T>
|
|
384
|
-
HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
|
|
385
|
-
const DFromV<decltype(v)> d;
|
|
386
|
-
const RebindToSigned<decltype(d)> di;
|
|
387
|
-
const auto vi = BitCast(di, v);
|
|
388
|
-
|
|
389
|
-
return vi.raw < 0 ? Vec1<T>(ConvertScalarTo<T>(0)) : v;
|
|
390
|
-
}
|
|
391
|
-
|
|
392
379
|
// ------------------------------ Mask logical
|
|
393
380
|
|
|
394
381
|
template <typename T>
|
|
@@ -473,35 +460,20 @@ HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
|
|
|
473
460
|
template <int kBits, typename T>
|
|
474
461
|
HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
|
|
475
462
|
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
|
|
476
|
-
|
|
477
|
-
// Signed right shift is now guaranteed to be arithmetic (rounding toward
|
|
478
|
-
// negative infinity, i.e. shifting in the sign bit).
|
|
479
|
-
return Vec1<T>(static_cast<T>(v.raw >> kBits));
|
|
480
|
-
#else
|
|
481
|
-
if (IsSigned<T>()) {
|
|
482
|
-
// Emulate arithmetic shift using only logical (unsigned) shifts, because
|
|
483
|
-
// signed shifts are still implementation-defined.
|
|
484
|
-
using TU = hwy::MakeUnsigned<T>;
|
|
485
|
-
const Sisd<TU> du;
|
|
486
|
-
const TU shifted = static_cast<TU>(BitCast(du, v).raw >> kBits);
|
|
487
|
-
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
|
|
488
|
-
const size_t sign_shift =
|
|
489
|
-
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
|
|
490
|
-
const TU upper = static_cast<TU>(sign << sign_shift);
|
|
491
|
-
return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
|
|
492
|
-
} else { // T is unsigned
|
|
493
|
-
return Vec1<T>(static_cast<T>(v.raw >> kBits));
|
|
494
|
-
}
|
|
495
|
-
#endif
|
|
463
|
+
return Vec1<T>(ScalarShr(v.raw, kBits));
|
|
496
464
|
}
|
|
497
465
|
|
|
498
466
|
// ------------------------------ RotateRight (ShiftRight)
|
|
499
|
-
template <int kBits, typename T>
|
|
467
|
+
template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
500
468
|
HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
|
|
469
|
+
const DFromV<decltype(v)> d;
|
|
470
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
471
|
+
|
|
501
472
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
502
|
-
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift");
|
|
473
|
+
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
503
474
|
if (kBits == 0) return v;
|
|
504
|
-
|
|
475
|
+
|
|
476
|
+
return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
|
|
505
477
|
ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
|
|
506
478
|
}
|
|
507
479
|
|
|
@@ -515,26 +487,7 @@ HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
|
|
|
515
487
|
|
|
516
488
|
template <typename T>
|
|
517
489
|
HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
|
|
518
|
-
|
|
519
|
-
// Signed right shift is now guaranteed to be arithmetic (rounding toward
|
|
520
|
-
// negative infinity, i.e. shifting in the sign bit).
|
|
521
|
-
return Vec1<T>(static_cast<T>(v.raw >> bits));
|
|
522
|
-
#else
|
|
523
|
-
if (IsSigned<T>()) {
|
|
524
|
-
// Emulate arithmetic shift using only logical (unsigned) shifts, because
|
|
525
|
-
// signed shifts are still implementation-defined.
|
|
526
|
-
using TU = hwy::MakeUnsigned<T>;
|
|
527
|
-
const Sisd<TU> du;
|
|
528
|
-
const TU shifted = static_cast<TU>(BitCast(du, v).raw >> bits);
|
|
529
|
-
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
|
|
530
|
-
const size_t sign_shift =
|
|
531
|
-
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
|
|
532
|
-
const TU upper = static_cast<TU>(sign << sign_shift);
|
|
533
|
-
return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
|
|
534
|
-
} else { // T is unsigned
|
|
535
|
-
return Vec1<T>(static_cast<T>(v.raw >> bits));
|
|
536
|
-
}
|
|
537
|
-
#endif
|
|
490
|
+
return Vec1<T>(ScalarShr(v.raw, bits));
|
|
538
491
|
}
|
|
539
492
|
|
|
540
493
|
// ------------------------------ Shl
|
|
@@ -681,8 +634,8 @@ HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
|
|
|
681
634
|
|
|
682
635
|
template <typename T, HWY_IF_FLOAT(T)>
|
|
683
636
|
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
|
|
684
|
-
if (
|
|
685
|
-
if (
|
|
637
|
+
if (ScalarIsNaN(a.raw)) return b;
|
|
638
|
+
if (ScalarIsNaN(b.raw)) return a;
|
|
686
639
|
return Vec1<T>(HWY_MIN(a.raw, b.raw));
|
|
687
640
|
}
|
|
688
641
|
|
|
@@ -693,8 +646,8 @@ HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
|
|
|
693
646
|
|
|
694
647
|
template <typename T, HWY_IF_FLOAT(T)>
|
|
695
648
|
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
|
|
696
|
-
if (
|
|
697
|
-
if (
|
|
649
|
+
if (ScalarIsNaN(a.raw)) return b;
|
|
650
|
+
if (ScalarIsNaN(b.raw)) return a;
|
|
698
651
|
return Vec1<T>(HWY_MAX(a.raw, b.raw));
|
|
699
652
|
}
|
|
700
653
|
|
|
@@ -740,16 +693,19 @@ HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
|
|
|
740
693
|
return Vec1<T>(a.raw / b.raw);
|
|
741
694
|
}
|
|
742
695
|
|
|
743
|
-
// Returns the upper
|
|
744
|
-
|
|
745
|
-
|
|
696
|
+
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
|
|
697
|
+
template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
|
|
698
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
699
|
+
HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
|
|
700
|
+
using TW = MakeWide<T>;
|
|
701
|
+
return Vec1<T>(static_cast<T>(
|
|
702
|
+
(static_cast<TW>(a.raw) * static_cast<TW>(b.raw)) >> (sizeof(T) * 8)));
|
|
746
703
|
}
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
return Vec1<
|
|
752
|
-
(static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
|
|
704
|
+
template <class T, HWY_IF_UI64(T)>
|
|
705
|
+
HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
|
|
706
|
+
T hi;
|
|
707
|
+
Mul128(a.raw, b.raw, &hi);
|
|
708
|
+
return Vec1<T>(hi);
|
|
753
709
|
}
|
|
754
710
|
|
|
755
711
|
HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
|
|
@@ -1034,12 +990,7 @@ HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
|
|
|
1034
990
|
template <typename T>
|
|
1035
991
|
HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
|
|
1036
992
|
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
|
|
1037
|
-
|
|
1038
|
-
CopySameSize(&v, &bits);
|
|
1039
|
-
bits += bits;
|
|
1040
|
-
bits >>= 1; // clear sign bit
|
|
1041
|
-
// NaN if all exponent bits are set and the mantissa is not zero.
|
|
1042
|
-
return Mask1<T>::FromBool(bits > ExponentMask<T>());
|
|
993
|
+
return Mask1<T>::FromBool(ScalarIsNaN(v.raw));
|
|
1043
994
|
}
|
|
1044
995
|
|
|
1045
996
|
// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
|
|
@@ -1158,6 +1109,9 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
|
|
|
1158
1109
|
}
|
|
1159
1110
|
}
|
|
1160
1111
|
|
|
1112
|
+
// ------------------------------ Tuples
|
|
1113
|
+
#include "hwy/ops/inside-inl.h"
|
|
1114
|
+
|
|
1161
1115
|
// ------------------------------ LoadInterleaved2/3/4
|
|
1162
1116
|
|
|
1163
1117
|
// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
|
|
@@ -1357,6 +1311,48 @@ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
|
|
|
1357
1311
|
return CastValueForF2IConv<ToT>(val);
|
|
1358
1312
|
}
|
|
1359
1313
|
|
|
1314
|
+
// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
|
|
1315
|
+
// returns static_cast<ToT>(val)
|
|
1316
|
+
//
|
|
1317
|
+
// Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
|
|
1318
|
+
// implementation-defined result if val is not within the range of ToT.
|
|
1319
|
+
template <class ToT, class FromT>
|
|
1320
|
+
HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
|
|
1321
|
+
// Prevent ubsan errors when converting float to narrower integer
|
|
1322
|
+
|
|
1323
|
+
using FromTU = MakeUnsigned<FromT>;
|
|
1324
|
+
|
|
1325
|
+
constexpr unsigned kMaxExpField =
|
|
1326
|
+
static_cast<unsigned>(MaxExponentField<FromT>());
|
|
1327
|
+
constexpr unsigned kExpBias = kMaxExpField >> 1;
|
|
1328
|
+
constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
|
|
1329
|
+
kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
|
|
1330
|
+
kMaxExpField));
|
|
1331
|
+
|
|
1332
|
+
// If ToT is signed, compare only the exponent bits of val against
|
|
1333
|
+
// kMinOutOfRangeExpField.
|
|
1334
|
+
//
|
|
1335
|
+
// Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
|
|
1336
|
+
// val against kMinOutOfRangeExpField as a negative value is outside of the
|
|
1337
|
+
// range of an unsigned integer type.
|
|
1338
|
+
const FromT val_to_compare =
|
|
1339
|
+
static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
|
|
1340
|
+
|
|
1341
|
+
// val is within the range of ToT if
|
|
1342
|
+
// (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
|
|
1343
|
+
// than kMinOutOfRangeExpField
|
|
1344
|
+
//
|
|
1345
|
+
// Otherwise, val is either outside of the range of ToT or equal to
|
|
1346
|
+
// LimitsMin<ToT>() if
|
|
1347
|
+
// (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
|
|
1348
|
+
// than or equal to kMinOutOfRangeExpField.
|
|
1349
|
+
|
|
1350
|
+
return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
|
|
1351
|
+
MantissaBits<FromT>()) < kMinOutOfRangeExpField)
|
|
1352
|
+
? static_cast<ToT>(val)
|
|
1353
|
+
: static_cast<ToT>(LimitsMin<ToT>());
|
|
1354
|
+
}
|
|
1355
|
+
|
|
1360
1356
|
} // namespace detail
|
|
1361
1357
|
|
|
1362
1358
|
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
@@ -1373,6 +1369,18 @@ HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) {
|
|
|
1373
1369
|
detail::CastValueForPromoteTo<TTo>(hwy::TypeTag<TTo>(), from.raw));
|
|
1374
1370
|
}
|
|
1375
1371
|
|
|
1372
|
+
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
1373
|
+
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
1374
|
+
#else
|
|
1375
|
+
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
1376
|
+
#endif
|
|
1377
|
+
|
|
1378
|
+
template <class DTo, HWY_IF_UI64_D(DTo)>
|
|
1379
|
+
HWY_API VFromD<DTo> PromoteInRangeTo(DTo /* tag */, Vec1<float> from) {
|
|
1380
|
+
using TTo = TFromD<DTo>;
|
|
1381
|
+
return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(from.raw));
|
|
1382
|
+
}
|
|
1383
|
+
|
|
1376
1384
|
// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
|
|
1377
1385
|
// so we overload for TFrom=double and TTo={float,int32_t}.
|
|
1378
1386
|
template <class D, HWY_IF_F32_D(D)>
|
|
@@ -1402,15 +1410,30 @@ HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
|
|
|
1402
1410
|
return Vec1<TTo>(static_cast<TTo>(from.raw));
|
|
1403
1411
|
}
|
|
1404
1412
|
|
|
1413
|
+
// Disable the default unsigned to signed DemoteTo implementation in
|
|
1414
|
+
// generic_ops-inl.h on SCALAR as the SCALAR target has a target-specific
|
|
1415
|
+
// implementation of the unsigned to signed DemoteTo op and as ReorderDemote2To
|
|
1416
|
+
// is not supported on the SCALAR target
|
|
1417
|
+
|
|
1418
|
+
// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
|
|
1419
|
+
// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
|
|
1420
|
+
// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
|
|
1421
|
+
// SFINAE to occur instead of a hard error due to a dependency on the V template
|
|
1422
|
+
// argument
|
|
1423
|
+
#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
|
|
1424
|
+
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
|
|
1425
|
+
hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
|
|
1426
|
+
|
|
1405
1427
|
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
|
|
1406
|
-
HWY_IF_UNSIGNED(TFrom),
|
|
1428
|
+
HWY_IF_UNSIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
|
|
1407
1429
|
HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
|
|
1408
1430
|
static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
|
|
1409
1431
|
static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
|
|
1410
1432
|
|
|
1433
|
+
const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
|
|
1434
|
+
|
|
1411
1435
|
// Int to int: choose closest value in TTo to `from` (avoids UB)
|
|
1412
|
-
|
|
1413
|
-
return Vec1<TTo>(static_cast<TTo>(from.raw));
|
|
1436
|
+
return Vec1<TTo>(static_cast<TTo>(HWY_MIN(from.raw, max)));
|
|
1414
1437
|
}
|
|
1415
1438
|
|
|
1416
1439
|
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
|
|
@@ -1420,6 +1443,19 @@ HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
|
|
|
1420
1443
|
return Vec1<TTo>(static_cast<TTo>(from.raw));
|
|
1421
1444
|
}
|
|
1422
1445
|
|
|
1446
|
+
#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
1447
|
+
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
1448
|
+
#else
|
|
1449
|
+
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
1450
|
+
#endif
|
|
1451
|
+
|
|
1452
|
+
template <class D32, HWY_IF_UI32_D(D32)>
|
|
1453
|
+
HWY_API VFromD<D32> DemoteInRangeTo(D32 /*d32*/,
|
|
1454
|
+
VFromD<Rebind<double, D32>> v) {
|
|
1455
|
+
using TTo = TFromD<D32>;
|
|
1456
|
+
return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
|
|
1457
|
+
}
|
|
1458
|
+
|
|
1423
1459
|
// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions;
|
|
1424
1460
|
// use this scalar version to verify the vector implementation.
|
|
1425
1461
|
#ifdef HWY_NATIVE_F16C
|
|
@@ -1448,6 +1484,12 @@ HWY_API Vec1<float16_t> DemoteTo(D /* tag */, const Vec1<float> v) {
|
|
|
1448
1484
|
return Vec1<float16_t>(F16FromF32(v.raw));
|
|
1449
1485
|
}
|
|
1450
1486
|
|
|
1487
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
1488
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
1489
|
+
#else
|
|
1490
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
1491
|
+
#endif
|
|
1492
|
+
|
|
1451
1493
|
template <class D, HWY_IF_BF16_D(D)>
|
|
1452
1494
|
HWY_API Vec1<bfloat16_t> DemoteTo(D d, const Vec1<float> v) {
|
|
1453
1495
|
return Set(d, BF16FromF32(v.raw));
|
|
@@ -1469,6 +1511,19 @@ HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
|
|
|
1469
1511
|
return Vec1<TTo>(static_cast<TTo>(from.raw));
|
|
1470
1512
|
}
|
|
1471
1513
|
|
|
1514
|
+
#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
1515
|
+
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
1516
|
+
#else
|
|
1517
|
+
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
1518
|
+
#endif
|
|
1519
|
+
|
|
1520
|
+
template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
|
|
1521
|
+
HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
|
|
1522
|
+
HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
|
|
1523
|
+
using TTo = TFromD<DI>;
|
|
1524
|
+
return VFromD<DI>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
|
|
1525
|
+
}
|
|
1526
|
+
|
|
1472
1527
|
HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
|
|
1473
1528
|
return DemoteTo(Sisd<uint8_t>(), v);
|
|
1474
1529
|
}
|
|
@@ -1956,6 +2011,35 @@ HWY_API Vec1<int32_t> WidenMulPairwiseAdd(D32 /* tag */, Vec1<int16_t> a,
|
|
|
1956
2011
|
return Vec1<int32_t>(a.raw * b.raw);
|
|
1957
2012
|
}
|
|
1958
2013
|
|
|
2014
|
+
// ------------------------------ SatWidenMulAccumFixedPoint
|
|
2015
|
+
#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
|
|
2016
|
+
#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
|
|
2017
|
+
#else
|
|
2018
|
+
#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
|
|
2019
|
+
#endif
|
|
2020
|
+
|
|
2021
|
+
template <class DI32, HWY_IF_I32_D(DI32)>
|
|
2022
|
+
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
|
|
2023
|
+
VFromD<Rebind<int16_t, DI32>> a,
|
|
2024
|
+
VFromD<Rebind<int16_t, DI32>> b,
|
|
2025
|
+
VFromD<DI32> sum) {
|
|
2026
|
+
// Multiplying static_cast<int32_t>(a.raw) by static_cast<int32_t>(b.raw)
|
|
2027
|
+
// followed by an addition of the product is okay as
|
|
2028
|
+
// (a.raw * b.raw * 2) is between -2147418112 and 2147483648 and as
|
|
2029
|
+
// a.raw * b.raw * 2 can only overflow an int32_t if both a.raw and b.raw are
|
|
2030
|
+
// equal to -32768.
|
|
2031
|
+
|
|
2032
|
+
const VFromD<DI32> product(static_cast<int32_t>(a.raw) *
|
|
2033
|
+
static_cast<int32_t>(b.raw));
|
|
2034
|
+
const VFromD<DI32> product2 = Add(product, product);
|
|
2035
|
+
|
|
2036
|
+
const auto mul_overflow =
|
|
2037
|
+
VecFromMask(di32, Eq(product2, Set(di32, LimitsMin<int32_t>())));
|
|
2038
|
+
|
|
2039
|
+
return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
|
|
2040
|
+
Add(product2, mul_overflow));
|
|
2041
|
+
}
|
|
2042
|
+
|
|
1959
2043
|
// ------------------------------ SatWidenMulPairwiseAdd
|
|
1960
2044
|
|
|
1961
2045
|
#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
|
|
@@ -1983,6 +2067,12 @@ HWY_API Vec1<int16_t> SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1<uint8_t> a,
|
|
|
1983
2067
|
|
|
1984
2068
|
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
|
1985
2069
|
|
|
2070
|
+
#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
|
|
2071
|
+
#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
|
|
2072
|
+
#else
|
|
2073
|
+
#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
|
|
2074
|
+
#endif
|
|
2075
|
+
|
|
1986
2076
|
template <class D32, HWY_IF_F32_D(D32)>
|
|
1987
2077
|
HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a,
|
|
1988
2078
|
Vec1<bfloat16_t> b,
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
// Copyright 2020 Google LLC
|
|
2
|
+
// Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
|
2
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
// SPDX-License-Identifier: BSD-3-Clause
|
|
3
5
|
//
|
|
4
6
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
7
|
// you may not use this file except in compliance with the License.
|
|
@@ -41,9 +43,31 @@
|
|
|
41
43
|
#undef HWY_HAVE_FLOAT64
|
|
42
44
|
#undef HWY_MEM_OPS_MIGHT_FAULT
|
|
43
45
|
#undef HWY_NATIVE_FMA
|
|
46
|
+
#undef HWY_NATIVE_DOT_BF16
|
|
44
47
|
#undef HWY_CAP_GE256
|
|
45
48
|
#undef HWY_CAP_GE512
|
|
46
49
|
|
|
50
|
+
#undef HWY_TARGET_IS_SVE
|
|
51
|
+
#if HWY_TARGET & HWY_ALL_SVE
|
|
52
|
+
#define HWY_TARGET_IS_SVE 1
|
|
53
|
+
#else
|
|
54
|
+
#define HWY_TARGET_IS_SVE 0
|
|
55
|
+
#endif
|
|
56
|
+
|
|
57
|
+
#undef HWY_TARGET_IS_NEON
|
|
58
|
+
#if HWY_TARGET & HWY_ALL_NEON
|
|
59
|
+
#define HWY_TARGET_IS_NEON 1
|
|
60
|
+
#else
|
|
61
|
+
#define HWY_TARGET_IS_NEON 0
|
|
62
|
+
#endif
|
|
63
|
+
|
|
64
|
+
#undef HWY_TARGET_IS_PPC
|
|
65
|
+
#if HWY_TARGET & HWY_ALL_PPC
|
|
66
|
+
#define HWY_TARGET_IS_PPC 1
|
|
67
|
+
#else
|
|
68
|
+
#define HWY_TARGET_IS_PPC 0
|
|
69
|
+
#endif
|
|
70
|
+
|
|
47
71
|
// Supported on all targets except RVV (requires GCC 14 or upcoming Clang)
|
|
48
72
|
#if HWY_TARGET == HWY_RVV && \
|
|
49
73
|
((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
|
|
@@ -116,7 +140,21 @@
|
|
|
116
140
|
",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
|
|
117
141
|
"avx512vpopcntdq,gfni"
|
|
118
142
|
|
|
119
|
-
|
|
143
|
+
// Force-disable for compilers that do not properly support avx512bf16.
|
|
144
|
+
#if !defined(HWY_AVX3_DISABLE_AVX512BF16) && \
|
|
145
|
+
(HWY_COMPILER_CLANGCL || \
|
|
146
|
+
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
|
|
147
|
+
(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900))
|
|
148
|
+
#define HWY_AVX3_DISABLE_AVX512BF16
|
|
149
|
+
#endif
|
|
150
|
+
|
|
151
|
+
#if !defined(HWY_AVX3_DISABLE_AVX512BF16)
|
|
152
|
+
#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
|
|
153
|
+
#else
|
|
154
|
+
#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL
|
|
155
|
+
#endif
|
|
156
|
+
|
|
157
|
+
#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16"
|
|
120
158
|
|
|
121
159
|
#if defined(HWY_DISABLE_PPC8_CRYPTO)
|
|
122
160
|
#define HWY_TARGET_STR_PPC8_CRYPTO ""
|
|
@@ -164,6 +202,7 @@
|
|
|
164
202
|
#define HWY_HAVE_FLOAT64 1
|
|
165
203
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
166
204
|
#define HWY_NATIVE_FMA 0
|
|
205
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
167
206
|
#define HWY_CAP_GE256 0
|
|
168
207
|
#define HWY_CAP_GE512 0
|
|
169
208
|
|
|
@@ -183,6 +222,7 @@
|
|
|
183
222
|
#define HWY_HAVE_FLOAT64 1
|
|
184
223
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
185
224
|
#define HWY_NATIVE_FMA 0
|
|
225
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
186
226
|
#define HWY_CAP_GE256 0
|
|
187
227
|
#define HWY_CAP_GE512 0
|
|
188
228
|
|
|
@@ -203,6 +243,7 @@
|
|
|
203
243
|
#define HWY_HAVE_FLOAT64 1
|
|
204
244
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
205
245
|
#define HWY_NATIVE_FMA 0
|
|
246
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
206
247
|
#define HWY_CAP_GE256 0
|
|
207
248
|
#define HWY_CAP_GE512 0
|
|
208
249
|
|
|
@@ -228,6 +269,7 @@
|
|
|
228
269
|
#else
|
|
229
270
|
#define HWY_NATIVE_FMA 1
|
|
230
271
|
#endif
|
|
272
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
231
273
|
|
|
232
274
|
#define HWY_CAP_GE256 1
|
|
233
275
|
#define HWY_CAP_GE512 0
|
|
@@ -256,6 +298,11 @@
|
|
|
256
298
|
#define HWY_HAVE_FLOAT64 1
|
|
257
299
|
#define HWY_MEM_OPS_MIGHT_FAULT 0
|
|
258
300
|
#define HWY_NATIVE_FMA 1
|
|
301
|
+
#if (HWY_TARGET <= HWY_AVX3_ZEN4) && !defined(HWY_AVX3_DISABLE_AVX512BF16)
|
|
302
|
+
#define HWY_NATIVE_DOT_BF16 1
|
|
303
|
+
#else
|
|
304
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
305
|
+
#endif
|
|
259
306
|
#define HWY_CAP_GE256 1
|
|
260
307
|
#define HWY_CAP_GE512 1
|
|
261
308
|
|
|
@@ -272,8 +319,7 @@
|
|
|
272
319
|
#elif HWY_TARGET == HWY_AVX3_ZEN4
|
|
273
320
|
|
|
274
321
|
#define HWY_NAMESPACE N_AVX3_ZEN4
|
|
275
|
-
|
|
276
|
-
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL
|
|
322
|
+
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_ZEN4
|
|
277
323
|
|
|
278
324
|
#elif HWY_TARGET == HWY_AVX3_SPR
|
|
279
325
|
|
|
@@ -286,8 +332,7 @@
|
|
|
286
332
|
|
|
287
333
|
//-----------------------------------------------------------------------------
|
|
288
334
|
// PPC8, PPC9, PPC10
|
|
289
|
-
#elif
|
|
290
|
-
HWY_TARGET == HWY_PPC10
|
|
335
|
+
#elif HWY_TARGET_IS_PPC
|
|
291
336
|
|
|
292
337
|
#define HWY_ALIGN alignas(16)
|
|
293
338
|
#define HWY_MAX_BYTES 16
|
|
@@ -299,6 +344,7 @@
|
|
|
299
344
|
#define HWY_HAVE_FLOAT64 1
|
|
300
345
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
301
346
|
#define HWY_NATIVE_FMA 1
|
|
347
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
302
348
|
#define HWY_CAP_GE256 0
|
|
303
349
|
#define HWY_CAP_GE512 0
|
|
304
350
|
|
|
@@ -319,7 +365,7 @@
|
|
|
319
365
|
|
|
320
366
|
#else
|
|
321
367
|
#error "Logic error"
|
|
322
|
-
#endif // HWY_TARGET
|
|
368
|
+
#endif // HWY_TARGET
|
|
323
369
|
|
|
324
370
|
//-----------------------------------------------------------------------------
|
|
325
371
|
// Z14, Z15
|
|
@@ -335,6 +381,7 @@
|
|
|
335
381
|
#define HWY_HAVE_FLOAT64 1
|
|
336
382
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
337
383
|
#define HWY_NATIVE_FMA 1
|
|
384
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
338
385
|
#define HWY_CAP_GE256 0
|
|
339
386
|
#define HWY_CAP_GE512 0
|
|
340
387
|
|
|
@@ -354,7 +401,7 @@
|
|
|
354
401
|
|
|
355
402
|
//-----------------------------------------------------------------------------
|
|
356
403
|
// NEON
|
|
357
|
-
#elif
|
|
404
|
+
#elif HWY_TARGET_IS_NEON
|
|
358
405
|
|
|
359
406
|
#define HWY_ALIGN alignas(16)
|
|
360
407
|
#define HWY_MAX_BYTES 16
|
|
@@ -362,7 +409,7 @@
|
|
|
362
409
|
|
|
363
410
|
#define HWY_HAVE_SCALABLE 0
|
|
364
411
|
#define HWY_HAVE_INTEGER64 1
|
|
365
|
-
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
|
412
|
+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || HWY_TARGET == HWY_NEON_BF16
|
|
366
413
|
#define HWY_HAVE_FLOAT16 1
|
|
367
414
|
#else
|
|
368
415
|
#define HWY_HAVE_FLOAT16 0
|
|
@@ -376,20 +423,29 @@
|
|
|
376
423
|
|
|
377
424
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
378
425
|
|
|
379
|
-
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
|
|
426
|
+
#if defined(__ARM_FEATURE_FMA) || defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
|
|
380
427
|
#define HWY_NATIVE_FMA 1
|
|
381
428
|
#else
|
|
382
429
|
#define HWY_NATIVE_FMA 0
|
|
383
430
|
#endif
|
|
431
|
+
#if HWY_NEON_HAVE_F32_TO_BF16C || HWY_TARGET == HWY_NEON_BF16
|
|
432
|
+
#define HWY_NATIVE_DOT_BF16 1
|
|
433
|
+
#else
|
|
434
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
435
|
+
#endif
|
|
384
436
|
|
|
385
437
|
#define HWY_CAP_GE256 0
|
|
386
438
|
#define HWY_CAP_GE512 0
|
|
387
439
|
|
|
388
440
|
#if HWY_TARGET == HWY_NEON_WITHOUT_AES
|
|
389
441
|
#define HWY_NAMESPACE N_NEON_WITHOUT_AES
|
|
390
|
-
#
|
|
442
|
+
#elif HWY_TARGET == HWY_NEON
|
|
391
443
|
#define HWY_NAMESPACE N_NEON
|
|
392
|
-
#
|
|
444
|
+
#elif HWY_TARGET == HWY_NEON_BF16
|
|
445
|
+
#define HWY_NAMESPACE N_NEON_BF16
|
|
446
|
+
#else
|
|
447
|
+
#error "Logic error, missing case"
|
|
448
|
+
#endif // HWY_TARGET
|
|
393
449
|
|
|
394
450
|
// Can use pragmas instead of -march compiler flag
|
|
395
451
|
#if HWY_HAVE_RUNTIME_DISPATCH
|
|
@@ -404,21 +460,43 @@
|
|
|
404
460
|
|
|
405
461
|
#else // !HWY_ARCH_ARM_V7
|
|
406
462
|
|
|
463
|
+
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1300) || \
|
|
464
|
+
(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1300)
|
|
465
|
+
// GCC 12 or earlier and Clang 12 or earlier require +crypto be added to the
|
|
466
|
+
// target string to enable AArch64 AES intrinsics
|
|
467
|
+
#define HWY_TARGET_STR_NEON "+crypto"
|
|
468
|
+
#else
|
|
469
|
+
#define HWY_TARGET_STR_NEON "+aes"
|
|
470
|
+
#endif
|
|
471
|
+
|
|
472
|
+
// Clang >= 16 requires +fullfp16 instead of fp16, but Apple Clang 15 = 1600
|
|
473
|
+
// fails to parse unless the string starts with armv8, whereas 1700 refuses it.
|
|
474
|
+
#if HWY_COMPILER_CLANG >= 1700
|
|
475
|
+
#define HWY_TARGET_STR_FP16 "+fullfp16"
|
|
476
|
+
#elif HWY_COMPILER_CLANG >= 1600 && defined(__apple_build_version__)
|
|
477
|
+
#define HWY_TARGET_STR_FP16 "armv8.4-a+fullfp16"
|
|
478
|
+
#else
|
|
479
|
+
#define HWY_TARGET_STR_FP16 "+fp16"
|
|
480
|
+
#endif
|
|
481
|
+
|
|
407
482
|
#if HWY_TARGET == HWY_NEON_WITHOUT_AES
|
|
408
483
|
// Do not define HWY_TARGET_STR (no pragma).
|
|
484
|
+
#elif HWY_TARGET == HWY_NEON
|
|
485
|
+
#define HWY_TARGET_STR HWY_TARGET_STR_NEON
|
|
486
|
+
#elif HWY_TARGET == HWY_NEON_BF16
|
|
487
|
+
#define HWY_TARGET_STR HWY_TARGET_STR_FP16 "+bf16+dotprod" HWY_TARGET_STR_NEON
|
|
409
488
|
#else
|
|
410
|
-
#
|
|
411
|
-
#endif // HWY_TARGET
|
|
489
|
+
#error "Logic error, missing case"
|
|
490
|
+
#endif // HWY_TARGET
|
|
412
491
|
|
|
413
|
-
#endif // HWY_ARCH_ARM_V7
|
|
492
|
+
#endif // !HWY_ARCH_ARM_V7
|
|
414
493
|
#else // !HWY_HAVE_RUNTIME_DISPATCH
|
|
415
494
|
// HWY_TARGET_STR remains undefined
|
|
416
495
|
#endif
|
|
417
496
|
|
|
418
497
|
//-----------------------------------------------------------------------------
|
|
419
498
|
// SVE[2]
|
|
420
|
-
#elif
|
|
421
|
-
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
|
|
499
|
+
#elif HWY_TARGET_IS_SVE
|
|
422
500
|
|
|
423
501
|
// SVE only requires lane alignment, not natural alignment of the entire vector.
|
|
424
502
|
#define HWY_ALIGN alignas(8)
|
|
@@ -432,6 +510,11 @@
|
|
|
432
510
|
#define HWY_HAVE_FLOAT64 1
|
|
433
511
|
#define HWY_MEM_OPS_MIGHT_FAULT 0
|
|
434
512
|
#define HWY_NATIVE_FMA 1
|
|
513
|
+
#if HWY_SVE_HAVE_BF16_FEATURE
|
|
514
|
+
#define HWY_NATIVE_DOT_BF16 1
|
|
515
|
+
#else
|
|
516
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
517
|
+
#endif
|
|
435
518
|
#define HWY_CAP_GE256 0
|
|
436
519
|
#define HWY_CAP_GE512 0
|
|
437
520
|
|
|
@@ -459,9 +542,9 @@
|
|
|
459
542
|
// Static dispatch with -march=armv8-a+sve2+aes, or no baseline, hence dynamic
|
|
460
543
|
// dispatch, which checks for AES support at runtime.
|
|
461
544
|
#if defined(__ARM_FEATURE_SVE2_AES) || (HWY_BASELINE_SVE2 == 0)
|
|
462
|
-
#define HWY_TARGET_STR "+sve2-aes"
|
|
545
|
+
#define HWY_TARGET_STR "+sve2+sve2-aes,+sve"
|
|
463
546
|
#else // SVE2 without AES
|
|
464
|
-
#define HWY_TARGET_STR "+sve2"
|
|
547
|
+
#define HWY_TARGET_STR "+sve2,+sve"
|
|
465
548
|
#endif
|
|
466
549
|
#else // not SVE2 target
|
|
467
550
|
#define HWY_TARGET_STR "+sve"
|
|
@@ -484,6 +567,7 @@
|
|
|
484
567
|
#define HWY_HAVE_FLOAT64 1
|
|
485
568
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
486
569
|
#define HWY_NATIVE_FMA 0
|
|
570
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
487
571
|
#define HWY_CAP_GE256 0
|
|
488
572
|
#define HWY_CAP_GE512 0
|
|
489
573
|
|
|
@@ -505,6 +589,7 @@
|
|
|
505
589
|
#define HWY_HAVE_FLOAT64 0
|
|
506
590
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
507
591
|
#define HWY_NATIVE_FMA 0
|
|
592
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
508
593
|
#define HWY_CAP_GE256 1
|
|
509
594
|
#define HWY_CAP_GE512 0
|
|
510
595
|
|
|
@@ -532,6 +617,7 @@
|
|
|
532
617
|
#define HWY_HAVE_FLOAT64 1
|
|
533
618
|
#define HWY_MEM_OPS_MIGHT_FAULT 0
|
|
534
619
|
#define HWY_NATIVE_FMA 1
|
|
620
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
535
621
|
#define HWY_CAP_GE256 0
|
|
536
622
|
#define HWY_CAP_GE512 0
|
|
537
623
|
|
|
@@ -560,6 +646,7 @@
|
|
|
560
646
|
#define HWY_HAVE_FLOAT64 1
|
|
561
647
|
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
|
562
648
|
#define HWY_NATIVE_FMA 0
|
|
649
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
563
650
|
#define HWY_CAP_GE256 0
|
|
564
651
|
#define HWY_CAP_GE512 0
|
|
565
652
|
|
|
@@ -581,6 +668,7 @@
|
|
|
581
668
|
#define HWY_HAVE_FLOAT64 1
|
|
582
669
|
#define HWY_MEM_OPS_MIGHT_FAULT 0
|
|
583
670
|
#define HWY_NATIVE_FMA 0
|
|
671
|
+
#define HWY_NATIVE_DOT_BF16 0
|
|
584
672
|
#define HWY_CAP_GE256 0
|
|
585
673
|
#define HWY_CAP_GE512 0
|
|
586
674
|
|