@img/sharp-libvips-dev 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -111,9 +111,6 @@ HWY_API Vec1<T> Zero(D /* tag */) {
111
111
  template <class D>
112
112
  using VFromD = decltype(Zero(D()));
113
113
 
114
- // ------------------------------ Tuple (VFromD)
115
- #include "hwy/ops/tuple-inl.h"
116
-
117
114
  // ------------------------------ Set
118
115
  template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
119
116
  HWY_API Vec1<T> Set(D /* tag */, const T2 t) {
@@ -335,8 +332,7 @@ HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
335
332
  // ------------------------------ BroadcastSignBit
336
333
  template <typename T>
337
334
  HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
338
- // This is used inside ShiftRight, so we cannot implement in terms of it.
339
- return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
335
+ return Vec1<T>(ScalarShr(v.raw, sizeof(T) * 8 - 1));
340
336
  }
341
337
 
342
338
  // ------------------------------ PopulationCount
@@ -380,15 +376,6 @@ HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
380
376
  return vi.raw < 0 ? yes : no;
381
377
  }
382
378
 
383
- template <typename T>
384
- HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
385
- const DFromV<decltype(v)> d;
386
- const RebindToSigned<decltype(d)> di;
387
- const auto vi = BitCast(di, v);
388
-
389
- return vi.raw < 0 ? Vec1<T>(ConvertScalarTo<T>(0)) : v;
390
- }
391
-
392
379
  // ------------------------------ Mask logical
393
380
 
394
381
  template <typename T>
@@ -473,35 +460,20 @@ HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
473
460
  template <int kBits, typename T>
474
461
  HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
475
462
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
476
- #if __cplusplus >= 202002L
477
- // Signed right shift is now guaranteed to be arithmetic (rounding toward
478
- // negative infinity, i.e. shifting in the sign bit).
479
- return Vec1<T>(static_cast<T>(v.raw >> kBits));
480
- #else
481
- if (IsSigned<T>()) {
482
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
483
- // signed shifts are still implementation-defined.
484
- using TU = hwy::MakeUnsigned<T>;
485
- const Sisd<TU> du;
486
- const TU shifted = static_cast<TU>(BitCast(du, v).raw >> kBits);
487
- const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
488
- const size_t sign_shift =
489
- static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
490
- const TU upper = static_cast<TU>(sign << sign_shift);
491
- return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
492
- } else { // T is unsigned
493
- return Vec1<T>(static_cast<T>(v.raw >> kBits));
494
- }
495
- #endif
463
+ return Vec1<T>(ScalarShr(v.raw, kBits));
496
464
  }
497
465
 
498
466
  // ------------------------------ RotateRight (ShiftRight)
499
- template <int kBits, typename T>
467
+ template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
500
468
  HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
469
+ const DFromV<decltype(v)> d;
470
+ const RebindToUnsigned<decltype(d)> du;
471
+
501
472
  constexpr size_t kSizeInBits = sizeof(T) * 8;
502
- static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift");
473
+ static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
503
474
  if (kBits == 0) return v;
504
- return Or(ShiftRight<kBits>(v),
475
+
476
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
505
477
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
506
478
  }
507
479
 
@@ -515,26 +487,7 @@ HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
515
487
 
516
488
  template <typename T>
517
489
  HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
518
- #if __cplusplus >= 202002L
519
- // Signed right shift is now guaranteed to be arithmetic (rounding toward
520
- // negative infinity, i.e. shifting in the sign bit).
521
- return Vec1<T>(static_cast<T>(v.raw >> bits));
522
- #else
523
- if (IsSigned<T>()) {
524
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
525
- // signed shifts are still implementation-defined.
526
- using TU = hwy::MakeUnsigned<T>;
527
- const Sisd<TU> du;
528
- const TU shifted = static_cast<TU>(BitCast(du, v).raw >> bits);
529
- const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
530
- const size_t sign_shift =
531
- static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
532
- const TU upper = static_cast<TU>(sign << sign_shift);
533
- return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
534
- } else { // T is unsigned
535
- return Vec1<T>(static_cast<T>(v.raw >> bits));
536
- }
537
- #endif
490
+ return Vec1<T>(ScalarShr(v.raw, bits));
538
491
  }
539
492
 
540
493
  // ------------------------------ Shl
@@ -681,8 +634,8 @@ HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
681
634
 
682
635
  template <typename T, HWY_IF_FLOAT(T)>
683
636
  HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
684
- if (isnan(a.raw)) return b;
685
- if (isnan(b.raw)) return a;
637
+ if (ScalarIsNaN(a.raw)) return b;
638
+ if (ScalarIsNaN(b.raw)) return a;
686
639
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
687
640
  }
688
641
 
@@ -693,8 +646,8 @@ HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
693
646
 
694
647
  template <typename T, HWY_IF_FLOAT(T)>
695
648
  HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
696
- if (isnan(a.raw)) return b;
697
- if (isnan(b.raw)) return a;
649
+ if (ScalarIsNaN(a.raw)) return b;
650
+ if (ScalarIsNaN(b.raw)) return a;
698
651
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
699
652
  }
700
653
 
@@ -740,16 +693,19 @@ HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
740
693
  return Vec1<T>(a.raw / b.raw);
741
694
  }
742
695
 
743
- // Returns the upper 16 bits of a * b in each lane.
744
- HWY_API Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
745
- return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
696
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
697
+ template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
698
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
699
+ HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
700
+ using TW = MakeWide<T>;
701
+ return Vec1<T>(static_cast<T>(
702
+ (static_cast<TW>(a.raw) * static_cast<TW>(b.raw)) >> (sizeof(T) * 8)));
746
703
  }
747
- HWY_API Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a, const Vec1<uint16_t> b) {
748
- // Cast to uint32_t first to prevent overflow. Otherwise the result of
749
- // uint16_t * uint16_t is in "int" which may overflow. In practice the result
750
- // is the same but this way it is also defined.
751
- return Vec1<uint16_t>(static_cast<uint16_t>(
752
- (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
704
+ template <class T, HWY_IF_UI64(T)>
705
+ HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
706
+ T hi;
707
+ Mul128(a.raw, b.raw, &hi);
708
+ return Vec1<T>(hi);
753
709
  }
754
710
 
755
711
  HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
@@ -1034,12 +990,7 @@ HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
1034
990
  template <typename T>
1035
991
  HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
1036
992
  // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
1037
- MakeUnsigned<T> bits;
1038
- CopySameSize(&v, &bits);
1039
- bits += bits;
1040
- bits >>= 1; // clear sign bit
1041
- // NaN if all exponent bits are set and the mantissa is not zero.
1042
- return Mask1<T>::FromBool(bits > ExponentMask<T>());
993
+ return Mask1<T>::FromBool(ScalarIsNaN(v.raw));
1043
994
  }
1044
995
 
1045
996
  // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
@@ -1158,6 +1109,9 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
1158
1109
  }
1159
1110
  }
1160
1111
 
1112
+ // ------------------------------ Tuples
1113
+ #include "hwy/ops/inside-inl.h"
1114
+
1161
1115
  // ------------------------------ LoadInterleaved2/3/4
1162
1116
 
1163
1117
  // Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
@@ -1357,6 +1311,48 @@ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
1357
1311
  return CastValueForF2IConv<ToT>(val);
1358
1312
  }
1359
1313
 
1314
+ // If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
1315
+ // returns static_cast<ToT>(val)
1316
+ //
1317
+ // Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
1318
+ // implementation-defined result if val is not within the range of ToT.
1319
+ template <class ToT, class FromT>
1320
+ HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
1321
+ // Prevent ubsan errors when converting float to narrower integer
1322
+
1323
+ using FromTU = MakeUnsigned<FromT>;
1324
+
1325
+ constexpr unsigned kMaxExpField =
1326
+ static_cast<unsigned>(MaxExponentField<FromT>());
1327
+ constexpr unsigned kExpBias = kMaxExpField >> 1;
1328
+ constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
1329
+ kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
1330
+ kMaxExpField));
1331
+
1332
+ // If ToT is signed, compare only the exponent bits of val against
1333
+ // kMinOutOfRangeExpField.
1334
+ //
1335
+ // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
1336
+ // val against kMinOutOfRangeExpField as a negative value is outside of the
1337
+ // range of an unsigned integer type.
1338
+ const FromT val_to_compare =
1339
+ static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
1340
+
1341
+ // val is within the range of ToT if
1342
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
1343
+ // than kMinOutOfRangeExpField
1344
+ //
1345
+ // Otherwise, val is either outside of the range of ToT or equal to
1346
+ // LimitsMin<ToT>() if
1347
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
1348
+ // than or equal to kMinOutOfRangeExpField.
1349
+
1350
+ return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1351
+ MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1352
+ ? static_cast<ToT>(val)
1353
+ : static_cast<ToT>(LimitsMin<ToT>());
1354
+ }
1355
+
1360
1356
  } // namespace detail
1361
1357
 
1362
1358
  #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
@@ -1373,6 +1369,18 @@ HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) {
1373
1369
  detail::CastValueForPromoteTo<TTo>(hwy::TypeTag<TTo>(), from.raw));
1374
1370
  }
1375
1371
 
1372
+ #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1373
+ #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1374
+ #else
1375
+ #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1376
+ #endif
1377
+
1378
+ template <class DTo, HWY_IF_UI64_D(DTo)>
1379
+ HWY_API VFromD<DTo> PromoteInRangeTo(DTo /* tag */, Vec1<float> from) {
1380
+ using TTo = TFromD<DTo>;
1381
+ return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(from.raw));
1382
+ }
1383
+
1376
1384
  // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
1377
1385
  // so we overload for TFrom=double and TTo={float,int32_t}.
1378
1386
  template <class D, HWY_IF_F32_D(D)>
@@ -1402,15 +1410,30 @@ HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
1402
1410
  return Vec1<TTo>(static_cast<TTo>(from.raw));
1403
1411
  }
1404
1412
 
1413
+ // Disable the default unsigned to signed DemoteTo implementation in
1414
+ // generic_ops-inl.h on SCALAR as the SCALAR target has a target-specific
1415
+ // implementation of the unsigned to signed DemoteTo op and as ReorderDemote2To
1416
+ // is not supported on the SCALAR target
1417
+
1418
+ // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
1419
+ // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
1420
+ // !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
1421
+ // SFINAE to occur instead of a hard error due to a dependency on the V template
1422
+ // argument
1423
+ #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
1424
+ #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
1425
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
1426
+
1405
1427
  template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
1406
- HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED_D(DTo)>
1428
+ HWY_IF_UNSIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
1407
1429
  HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
1408
1430
  static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
1409
1431
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1410
1432
 
1433
+ const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
1434
+
1411
1435
  // Int to int: choose closest value in TTo to `from` (avoids UB)
1412
- from.raw = HWY_MIN(from.raw, LimitsMax<TTo>());
1413
- return Vec1<TTo>(static_cast<TTo>(from.raw));
1436
+ return Vec1<TTo>(static_cast<TTo>(HWY_MIN(from.raw, max)));
1414
1437
  }
1415
1438
 
1416
1439
  template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
@@ -1420,6 +1443,19 @@ HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
1420
1443
  return Vec1<TTo>(static_cast<TTo>(from.raw));
1421
1444
  }
1422
1445
 
1446
+ #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1447
+ #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1448
+ #else
1449
+ #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
1450
+ #endif
1451
+
1452
+ template <class D32, HWY_IF_UI32_D(D32)>
1453
+ HWY_API VFromD<D32> DemoteInRangeTo(D32 /*d32*/,
1454
+ VFromD<Rebind<double, D32>> v) {
1455
+ using TTo = TFromD<D32>;
1456
+ return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
1457
+ }
1458
+
1423
1459
  // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions;
1424
1460
  // use this scalar version to verify the vector implementation.
1425
1461
  #ifdef HWY_NATIVE_F16C
@@ -1448,6 +1484,12 @@ HWY_API Vec1<float16_t> DemoteTo(D /* tag */, const Vec1<float> v) {
1448
1484
  return Vec1<float16_t>(F16FromF32(v.raw));
1449
1485
  }
1450
1486
 
1487
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
1488
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
1489
+ #else
1490
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
1491
+ #endif
1492
+
1451
1493
  template <class D, HWY_IF_BF16_D(D)>
1452
1494
  HWY_API Vec1<bfloat16_t> DemoteTo(D d, const Vec1<float> v) {
1453
1495
  return Set(d, BF16FromF32(v.raw));
@@ -1469,6 +1511,19 @@ HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
1469
1511
  return Vec1<TTo>(static_cast<TTo>(from.raw));
1470
1512
  }
1471
1513
 
1514
+ #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1515
+ #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1516
+ #else
1517
+ #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
1518
+ #endif
1519
+
1520
+ template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
1521
+ HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
1522
+ HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
1523
+ using TTo = TFromD<DI>;
1524
+ return VFromD<DI>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
1525
+ }
1526
+
1472
1527
  HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
1473
1528
  return DemoteTo(Sisd<uint8_t>(), v);
1474
1529
  }
@@ -1956,6 +2011,35 @@ HWY_API Vec1<int32_t> WidenMulPairwiseAdd(D32 /* tag */, Vec1<int16_t> a,
1956
2011
  return Vec1<int32_t>(a.raw * b.raw);
1957
2012
  }
1958
2013
 
2014
+ // ------------------------------ SatWidenMulAccumFixedPoint
2015
+ #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
2016
+ #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
2017
+ #else
2018
+ #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
2019
+ #endif
2020
+
2021
+ template <class DI32, HWY_IF_I32_D(DI32)>
2022
+ HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
2023
+ VFromD<Rebind<int16_t, DI32>> a,
2024
+ VFromD<Rebind<int16_t, DI32>> b,
2025
+ VFromD<DI32> sum) {
2026
+ // Multiplying static_cast<int32_t>(a.raw) by static_cast<int32_t>(b.raw)
2027
+ // followed by an addition of the product is okay as
2028
+ // (a.raw * b.raw * 2) is between -2147418112 and 2147483648 and as
2029
+ // a.raw * b.raw * 2 can only overflow an int32_t if both a.raw and b.raw are
2030
+ // equal to -32768.
2031
+
2032
+ const VFromD<DI32> product(static_cast<int32_t>(a.raw) *
2033
+ static_cast<int32_t>(b.raw));
2034
+ const VFromD<DI32> product2 = Add(product, product);
2035
+
2036
+ const auto mul_overflow =
2037
+ VecFromMask(di32, Eq(product2, Set(di32, LimitsMin<int32_t>())));
2038
+
2039
+ return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
2040
+ Add(product2, mul_overflow));
2041
+ }
2042
+
1959
2043
  // ------------------------------ SatWidenMulPairwiseAdd
1960
2044
 
1961
2045
  #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
@@ -1983,6 +2067,12 @@ HWY_API Vec1<int16_t> SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1<uint8_t> a,
1983
2067
 
1984
2068
  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
1985
2069
 
2070
+ #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
2071
+ #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
2072
+ #else
2073
+ #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
2074
+ #endif
2075
+
1986
2076
  template <class D32, HWY_IF_F32_D(D32)>
1987
2077
  HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a,
1988
2078
  Vec1<bfloat16_t> b,
@@ -1,5 +1,7 @@
1
1
  // Copyright 2020 Google LLC
2
+ // Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2
3
  // SPDX-License-Identifier: Apache-2.0
4
+ // SPDX-License-Identifier: BSD-3-Clause
3
5
  //
4
6
  // Licensed under the Apache License, Version 2.0 (the "License");
5
7
  // you may not use this file except in compliance with the License.
@@ -41,9 +43,31 @@
41
43
  #undef HWY_HAVE_FLOAT64
42
44
  #undef HWY_MEM_OPS_MIGHT_FAULT
43
45
  #undef HWY_NATIVE_FMA
46
+ #undef HWY_NATIVE_DOT_BF16
44
47
  #undef HWY_CAP_GE256
45
48
  #undef HWY_CAP_GE512
46
49
 
50
+ #undef HWY_TARGET_IS_SVE
51
+ #if HWY_TARGET & HWY_ALL_SVE
52
+ #define HWY_TARGET_IS_SVE 1
53
+ #else
54
+ #define HWY_TARGET_IS_SVE 0
55
+ #endif
56
+
57
+ #undef HWY_TARGET_IS_NEON
58
+ #if HWY_TARGET & HWY_ALL_NEON
59
+ #define HWY_TARGET_IS_NEON 1
60
+ #else
61
+ #define HWY_TARGET_IS_NEON 0
62
+ #endif
63
+
64
+ #undef HWY_TARGET_IS_PPC
65
+ #if HWY_TARGET & HWY_ALL_PPC
66
+ #define HWY_TARGET_IS_PPC 1
67
+ #else
68
+ #define HWY_TARGET_IS_PPC 0
69
+ #endif
70
+
47
71
  // Supported on all targets except RVV (requires GCC 14 or upcoming Clang)
48
72
  #if HWY_TARGET == HWY_RVV && \
49
73
  ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
@@ -116,7 +140,21 @@
116
140
  ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
117
141
  "avx512vpopcntdq,gfni"
118
142
 
119
- #define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_DL ",avx512fp16"
143
+ // Force-disable for compilers that do not properly support avx512bf16.
144
+ #if !defined(HWY_AVX3_DISABLE_AVX512BF16) && \
145
+ (HWY_COMPILER_CLANGCL || \
146
+ (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
147
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900))
148
+ #define HWY_AVX3_DISABLE_AVX512BF16
149
+ #endif
150
+
151
+ #if !defined(HWY_AVX3_DISABLE_AVX512BF16)
152
+ #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
153
+ #else
154
+ #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL
155
+ #endif
156
+
157
+ #define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16"
120
158
 
121
159
  #if defined(HWY_DISABLE_PPC8_CRYPTO)
122
160
  #define HWY_TARGET_STR_PPC8_CRYPTO ""
@@ -164,6 +202,7 @@
164
202
  #define HWY_HAVE_FLOAT64 1
165
203
  #define HWY_MEM_OPS_MIGHT_FAULT 1
166
204
  #define HWY_NATIVE_FMA 0
205
+ #define HWY_NATIVE_DOT_BF16 0
167
206
  #define HWY_CAP_GE256 0
168
207
  #define HWY_CAP_GE512 0
169
208
 
@@ -183,6 +222,7 @@
183
222
  #define HWY_HAVE_FLOAT64 1
184
223
  #define HWY_MEM_OPS_MIGHT_FAULT 1
185
224
  #define HWY_NATIVE_FMA 0
225
+ #define HWY_NATIVE_DOT_BF16 0
186
226
  #define HWY_CAP_GE256 0
187
227
  #define HWY_CAP_GE512 0
188
228
 
@@ -203,6 +243,7 @@
203
243
  #define HWY_HAVE_FLOAT64 1
204
244
  #define HWY_MEM_OPS_MIGHT_FAULT 1
205
245
  #define HWY_NATIVE_FMA 0
246
+ #define HWY_NATIVE_DOT_BF16 0
206
247
  #define HWY_CAP_GE256 0
207
248
  #define HWY_CAP_GE512 0
208
249
 
@@ -228,6 +269,7 @@
228
269
  #else
229
270
  #define HWY_NATIVE_FMA 1
230
271
  #endif
272
+ #define HWY_NATIVE_DOT_BF16 0
231
273
 
232
274
  #define HWY_CAP_GE256 1
233
275
  #define HWY_CAP_GE512 0
@@ -256,6 +298,11 @@
256
298
  #define HWY_HAVE_FLOAT64 1
257
299
  #define HWY_MEM_OPS_MIGHT_FAULT 0
258
300
  #define HWY_NATIVE_FMA 1
301
+ #if (HWY_TARGET <= HWY_AVX3_ZEN4) && !defined(HWY_AVX3_DISABLE_AVX512BF16)
302
+ #define HWY_NATIVE_DOT_BF16 1
303
+ #else
304
+ #define HWY_NATIVE_DOT_BF16 0
305
+ #endif
259
306
  #define HWY_CAP_GE256 1
260
307
  #define HWY_CAP_GE512 1
261
308
 
@@ -272,8 +319,7 @@
272
319
  #elif HWY_TARGET == HWY_AVX3_ZEN4
273
320
 
274
321
  #define HWY_NAMESPACE N_AVX3_ZEN4
275
- // Currently the same as HWY_AVX3_DL: both support Icelake.
276
- #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL
322
+ #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_ZEN4
277
323
 
278
324
  #elif HWY_TARGET == HWY_AVX3_SPR
279
325
 
@@ -286,8 +332,7 @@
286
332
 
287
333
  //-----------------------------------------------------------------------------
288
334
  // PPC8, PPC9, PPC10
289
- #elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \
290
- HWY_TARGET == HWY_PPC10
335
+ #elif HWY_TARGET_IS_PPC
291
336
 
292
337
  #define HWY_ALIGN alignas(16)
293
338
  #define HWY_MAX_BYTES 16
@@ -299,6 +344,7 @@
299
344
  #define HWY_HAVE_FLOAT64 1
300
345
  #define HWY_MEM_OPS_MIGHT_FAULT 1
301
346
  #define HWY_NATIVE_FMA 1
347
+ #define HWY_NATIVE_DOT_BF16 0
302
348
  #define HWY_CAP_GE256 0
303
349
  #define HWY_CAP_GE512 0
304
350
 
@@ -319,7 +365,7 @@
319
365
 
320
366
  #else
321
367
  #error "Logic error"
322
- #endif // HWY_TARGET == HWY_PPC10
368
+ #endif // HWY_TARGET
323
369
 
324
370
  //-----------------------------------------------------------------------------
325
371
  // Z14, Z15
@@ -335,6 +381,7 @@
335
381
  #define HWY_HAVE_FLOAT64 1
336
382
  #define HWY_MEM_OPS_MIGHT_FAULT 1
337
383
  #define HWY_NATIVE_FMA 1
384
+ #define HWY_NATIVE_DOT_BF16 0
338
385
  #define HWY_CAP_GE256 0
339
386
  #define HWY_CAP_GE512 0
340
387
 
@@ -354,7 +401,7 @@
354
401
 
355
402
  //-----------------------------------------------------------------------------
356
403
  // NEON
357
- #elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
404
+ #elif HWY_TARGET_IS_NEON
358
405
 
359
406
  #define HWY_ALIGN alignas(16)
360
407
  #define HWY_MAX_BYTES 16
@@ -362,7 +409,7 @@
362
409
 
363
410
  #define HWY_HAVE_SCALABLE 0
364
411
  #define HWY_HAVE_INTEGER64 1
365
- #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
412
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || HWY_TARGET == HWY_NEON_BF16
366
413
  #define HWY_HAVE_FLOAT16 1
367
414
  #else
368
415
  #define HWY_HAVE_FLOAT16 0
@@ -376,20 +423,29 @@
376
423
 
377
424
  #define HWY_MEM_OPS_MIGHT_FAULT 1
378
425
 
379
- #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
426
+ #if defined(__ARM_FEATURE_FMA) || defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
380
427
  #define HWY_NATIVE_FMA 1
381
428
  #else
382
429
  #define HWY_NATIVE_FMA 0
383
430
  #endif
431
+ #if HWY_NEON_HAVE_F32_TO_BF16C || HWY_TARGET == HWY_NEON_BF16
432
+ #define HWY_NATIVE_DOT_BF16 1
433
+ #else
434
+ #define HWY_NATIVE_DOT_BF16 0
435
+ #endif
384
436
 
385
437
  #define HWY_CAP_GE256 0
386
438
  #define HWY_CAP_GE512 0
387
439
 
388
440
  #if HWY_TARGET == HWY_NEON_WITHOUT_AES
389
441
  #define HWY_NAMESPACE N_NEON_WITHOUT_AES
390
- #else
442
+ #elif HWY_TARGET == HWY_NEON
391
443
  #define HWY_NAMESPACE N_NEON
392
- #endif
444
+ #elif HWY_TARGET == HWY_NEON_BF16
445
+ #define HWY_NAMESPACE N_NEON_BF16
446
+ #else
447
+ #error "Logic error, missing case"
448
+ #endif // HWY_TARGET
393
449
 
394
450
  // Can use pragmas instead of -march compiler flag
395
451
  #if HWY_HAVE_RUNTIME_DISPATCH
@@ -404,21 +460,43 @@
404
460
 
405
461
  #else // !HWY_ARCH_ARM_V7
406
462
 
463
+ #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1300) || \
464
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1300)
465
+ // GCC 12 or earlier and Clang 12 or earlier require +crypto be added to the
466
+ // target string to enable AArch64 AES intrinsics
467
+ #define HWY_TARGET_STR_NEON "+crypto"
468
+ #else
469
+ #define HWY_TARGET_STR_NEON "+aes"
470
+ #endif
471
+
472
+ // Clang >= 16 requires +fullfp16 instead of fp16, but Apple Clang 15 = 1600
473
+ // fails to parse unless the string starts with armv8, whereas 1700 refuses it.
474
+ #if HWY_COMPILER_CLANG >= 1700
475
+ #define HWY_TARGET_STR_FP16 "+fullfp16"
476
+ #elif HWY_COMPILER_CLANG >= 1600 && defined(__apple_build_version__)
477
+ #define HWY_TARGET_STR_FP16 "armv8.4-a+fullfp16"
478
+ #else
479
+ #define HWY_TARGET_STR_FP16 "+fp16"
480
+ #endif
481
+
407
482
  #if HWY_TARGET == HWY_NEON_WITHOUT_AES
408
483
  // Do not define HWY_TARGET_STR (no pragma).
484
+ #elif HWY_TARGET == HWY_NEON
485
+ #define HWY_TARGET_STR HWY_TARGET_STR_NEON
486
+ #elif HWY_TARGET == HWY_NEON_BF16
487
+ #define HWY_TARGET_STR HWY_TARGET_STR_FP16 "+bf16+dotprod" HWY_TARGET_STR_NEON
409
488
  #else
410
- #define HWY_TARGET_STR "+crypto"
411
- #endif // HWY_TARGET == HWY_NEON_WITHOUT_AES
489
+ #error "Logic error, missing case"
490
+ #endif // HWY_TARGET
412
491
 
413
- #endif // HWY_ARCH_ARM_V7
492
+ #endif // !HWY_ARCH_ARM_V7
414
493
  #else // !HWY_HAVE_RUNTIME_DISPATCH
415
494
  // HWY_TARGET_STR remains undefined
416
495
  #endif
417
496
 
418
497
  //-----------------------------------------------------------------------------
419
498
  // SVE[2]
420
- #elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
421
- HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
499
+ #elif HWY_TARGET_IS_SVE
422
500
 
423
501
  // SVE only requires lane alignment, not natural alignment of the entire vector.
424
502
  #define HWY_ALIGN alignas(8)
@@ -432,6 +510,11 @@
432
510
  #define HWY_HAVE_FLOAT64 1
433
511
  #define HWY_MEM_OPS_MIGHT_FAULT 0
434
512
  #define HWY_NATIVE_FMA 1
513
+ #if HWY_SVE_HAVE_BF16_FEATURE
514
+ #define HWY_NATIVE_DOT_BF16 1
515
+ #else
516
+ #define HWY_NATIVE_DOT_BF16 0
517
+ #endif
435
518
  #define HWY_CAP_GE256 0
436
519
  #define HWY_CAP_GE512 0
437
520
 
@@ -459,9 +542,9 @@
459
542
  // Static dispatch with -march=armv8-a+sve2+aes, or no baseline, hence dynamic
460
543
  // dispatch, which checks for AES support at runtime.
461
544
  #if defined(__ARM_FEATURE_SVE2_AES) || (HWY_BASELINE_SVE2 == 0)
462
- #define HWY_TARGET_STR "+sve2-aes"
545
+ #define HWY_TARGET_STR "+sve2+sve2-aes,+sve"
463
546
  #else // SVE2 without AES
464
- #define HWY_TARGET_STR "+sve2"
547
+ #define HWY_TARGET_STR "+sve2,+sve"
465
548
  #endif
466
549
  #else // not SVE2 target
467
550
  #define HWY_TARGET_STR "+sve"
@@ -484,6 +567,7 @@
484
567
  #define HWY_HAVE_FLOAT64 1
485
568
  #define HWY_MEM_OPS_MIGHT_FAULT 1
486
569
  #define HWY_NATIVE_FMA 0
570
+ #define HWY_NATIVE_DOT_BF16 0
487
571
  #define HWY_CAP_GE256 0
488
572
  #define HWY_CAP_GE512 0
489
573
 
@@ -505,6 +589,7 @@
505
589
  #define HWY_HAVE_FLOAT64 0
506
590
  #define HWY_MEM_OPS_MIGHT_FAULT 1
507
591
  #define HWY_NATIVE_FMA 0
592
+ #define HWY_NATIVE_DOT_BF16 0
508
593
  #define HWY_CAP_GE256 1
509
594
  #define HWY_CAP_GE512 0
510
595
 
@@ -532,6 +617,7 @@
532
617
  #define HWY_HAVE_FLOAT64 1
533
618
  #define HWY_MEM_OPS_MIGHT_FAULT 0
534
619
  #define HWY_NATIVE_FMA 1
620
+ #define HWY_NATIVE_DOT_BF16 0
535
621
  #define HWY_CAP_GE256 0
536
622
  #define HWY_CAP_GE512 0
537
623
 
@@ -560,6 +646,7 @@
560
646
  #define HWY_HAVE_FLOAT64 1
561
647
  #define HWY_MEM_OPS_MIGHT_FAULT 1
562
648
  #define HWY_NATIVE_FMA 0
649
+ #define HWY_NATIVE_DOT_BF16 0
563
650
  #define HWY_CAP_GE256 0
564
651
  #define HWY_CAP_GE512 0
565
652
 
@@ -581,6 +668,7 @@
581
668
  #define HWY_HAVE_FLOAT64 1
582
669
  #define HWY_MEM_OPS_MIGHT_FAULT 0
583
670
  #define HWY_NATIVE_FMA 0
671
+ #define HWY_NATIVE_DOT_BF16 0
584
672
  #define HWY_CAP_GE256 0
585
673
  #define HWY_CAP_GE512 0
586
674