@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/include/aom/aom_decoder.h +1 -1
  2. package/include/aom/aom_encoder.h +2 -0
  3. package/include/aom/aomcx.h +106 -25
  4. package/include/ffi.h +3 -3
  5. package/include/freetype2/freetype/config/ftconfig.h +1 -1
  6. package/include/freetype2/freetype/config/ftheader.h +1 -1
  7. package/include/freetype2/freetype/config/ftoption.h +37 -12
  8. package/include/freetype2/freetype/config/ftstdlib.h +1 -1
  9. package/include/freetype2/freetype/config/integer-types.h +29 -2
  10. package/include/freetype2/freetype/config/mac-support.h +1 -1
  11. package/include/freetype2/freetype/config/public-macros.h +3 -3
  12. package/include/freetype2/freetype/freetype.h +51 -47
  13. package/include/freetype2/freetype/ftadvanc.h +1 -1
  14. package/include/freetype2/freetype/ftbbox.h +1 -1
  15. package/include/freetype2/freetype/ftbdf.h +1 -1
  16. package/include/freetype2/freetype/ftbitmap.h +1 -1
  17. package/include/freetype2/freetype/ftbzip2.h +1 -1
  18. package/include/freetype2/freetype/ftcache.h +1 -1
  19. package/include/freetype2/freetype/ftcid.h +1 -1
  20. package/include/freetype2/freetype/ftcolor.h +13 -4
  21. package/include/freetype2/freetype/ftdriver.h +3 -3
  22. package/include/freetype2/freetype/fterrdef.h +1 -1
  23. package/include/freetype2/freetype/fterrors.h +1 -1
  24. package/include/freetype2/freetype/ftfntfmt.h +1 -1
  25. package/include/freetype2/freetype/ftgasp.h +1 -1
  26. package/include/freetype2/freetype/ftglyph.h +1 -1
  27. package/include/freetype2/freetype/ftgxval.h +1 -1
  28. package/include/freetype2/freetype/ftgzip.h +1 -1
  29. package/include/freetype2/freetype/ftimage.h +6 -2
  30. package/include/freetype2/freetype/ftincrem.h +1 -1
  31. package/include/freetype2/freetype/ftlcdfil.h +1 -1
  32. package/include/freetype2/freetype/ftlist.h +1 -1
  33. package/include/freetype2/freetype/ftlogging.h +184 -0
  34. package/include/freetype2/freetype/ftlzw.h +1 -1
  35. package/include/freetype2/freetype/ftmac.h +1 -1
  36. package/include/freetype2/freetype/ftmm.h +159 -103
  37. package/include/freetype2/freetype/ftmodapi.h +1 -1
  38. package/include/freetype2/freetype/ftmoderr.h +1 -1
  39. package/include/freetype2/freetype/ftotval.h +1 -1
  40. package/include/freetype2/freetype/ftoutln.h +1 -1
  41. package/include/freetype2/freetype/ftparams.h +1 -1
  42. package/include/freetype2/freetype/ftpfr.h +1 -1
  43. package/include/freetype2/freetype/ftrender.h +1 -1
  44. package/include/freetype2/freetype/ftsizes.h +1 -1
  45. package/include/freetype2/freetype/ftsnames.h +1 -1
  46. package/include/freetype2/freetype/ftstroke.h +1 -1
  47. package/include/freetype2/freetype/ftsynth.h +1 -1
  48. package/include/freetype2/freetype/ftsystem.h +1 -1
  49. package/include/freetype2/freetype/fttrigon.h +1 -1
  50. package/include/freetype2/freetype/fttypes.h +1 -1
  51. package/include/freetype2/freetype/ftwinfnt.h +2 -3
  52. package/include/freetype2/freetype/otsvg.h +1 -1
  53. package/include/freetype2/freetype/t1tables.h +1 -1
  54. package/include/freetype2/freetype/ttnameid.h +129 -129
  55. package/include/freetype2/freetype/tttables.h +8 -5
  56. package/include/freetype2/freetype/tttags.h +1 -1
  57. package/include/freetype2/ft2build.h +1 -1
  58. package/include/glib-2.0/gio/gdbuserror.h +9 -8
  59. package/include/glib-2.0/gio/ginetaddress.h +12 -0
  60. package/include/glib-2.0/gio/gioenums.h +9 -2
  61. package/include/glib-2.0/glib/gstring.h +2 -2
  62. package/include/glib-2.0/glib/gunicode.h +1 -1
  63. package/include/glib-2.0/gobject/glib-types.h +1 -1
  64. package/include/glib-2.0/gobject/gparam.h +1 -1
  65. package/include/glib-2.0/gobject/gvalue.h +78 -35
  66. package/include/harfbuzz/hb-script-list.h +12 -0
  67. package/include/harfbuzz/hb-version.h +3 -3
  68. package/include/hwy/abort.h +2 -19
  69. package/include/hwy/aligned_allocator.h +11 -7
  70. package/include/hwy/auto_tune.h +504 -0
  71. package/include/hwy/base.h +425 -104
  72. package/include/hwy/cache_control.h +16 -0
  73. package/include/hwy/detect_compiler_arch.h +32 -1
  74. package/include/hwy/detect_targets.h +251 -67
  75. package/include/hwy/foreach_target.h +35 -0
  76. package/include/hwy/highway.h +185 -76
  77. package/include/hwy/nanobenchmark.h +1 -19
  78. package/include/hwy/ops/arm_neon-inl.h +969 -458
  79. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  80. package/include/hwy/ops/emu128-inl.h +97 -11
  81. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  82. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  83. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  84. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  85. package/include/hwy/ops/rvv-inl.h +546 -51
  86. package/include/hwy/ops/scalar-inl.h +77 -22
  87. package/include/hwy/ops/set_macros-inl.h +138 -17
  88. package/include/hwy/ops/shared-inl.h +50 -10
  89. package/include/hwy/ops/wasm_128-inl.h +137 -92
  90. package/include/hwy/ops/x86_128-inl.h +773 -214
  91. package/include/hwy/ops/x86_256-inl.h +712 -255
  92. package/include/hwy/ops/x86_512-inl.h +429 -753
  93. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  94. package/include/hwy/per_target.h +2 -1
  95. package/include/hwy/profiler.h +622 -486
  96. package/include/hwy/targets.h +62 -20
  97. package/include/hwy/timer-inl.h +8 -160
  98. package/include/hwy/timer.h +170 -3
  99. package/include/hwy/x86_cpuid.h +81 -0
  100. package/include/libheif/heif_cxx.h +25 -5
  101. package/include/libheif/heif_regions.h +5 -5
  102. package/include/libheif/heif_version.h +2 -2
  103. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  104. package/include/libxml2/libxml/valid.h +0 -3
  105. package/include/libxml2/libxml/xmlerror.h +1 -1
  106. package/include/libxml2/libxml/xmlversion.h +4 -4
  107. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  108. package/include/pango-1.0/pango/pango-features.h +3 -3
  109. package/include/pango-1.0/pango/pango-font.h +30 -0
  110. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  111. package/include/vips/connection.h +4 -4
  112. package/include/vips/version.h +4 -4
  113. package/include/zlib.h +3 -3
  114. package/package.json +1 -1
  115. package/versions.json +13 -13
@@ -72,10 +72,12 @@ struct Vec1 {
72
72
 
73
73
  // 0 or FF..FF, same size as Vec1.
74
74
  template <typename T>
75
- class Mask1 {
75
+ struct Mask1 {
76
76
  using Raw = hwy::MakeUnsigned<T>;
77
77
 
78
- public:
78
+ using PrivateT = T; // only for DFromM
79
+ static constexpr size_t kPrivateN = 1; // only for DFromM
80
+
79
81
  static HWY_INLINE Mask1<T> FromBool(bool b) {
80
82
  Mask1<T> mask;
81
83
  mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
@@ -88,6 +90,9 @@ class Mask1 {
88
90
  template <class V>
89
91
  using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
90
92
 
93
+ template <class M>
94
+ using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
95
+
91
96
  template <class V>
92
97
  using TFromV = typename V::PrivateT;
93
98
 
@@ -288,13 +293,6 @@ HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
288
293
  template <class D>
289
294
  using MFromD = decltype(MaskFromVec(VFromD<D>()));
290
295
 
291
- template <typename T>
292
- Vec1<T> VecFromMask(const Mask1<T> mask) {
293
- Vec1<T> v;
294
- CopySameSize(&mask, &v);
295
- return v;
296
- }
297
-
298
296
  template <class D, typename T = TFromD<D>>
299
297
  Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) {
300
298
  Vec1<T> v;
@@ -302,6 +300,11 @@ Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) {
302
300
  return v;
303
301
  }
304
302
 
303
+ template <class D>
304
+ uint64_t BitsFromMask(D, MFromD<D> mask) {
305
+ return mask.bits ? 1 : 0;
306
+ }
307
+
305
308
  template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
306
309
  HWY_API Mask1<T> FirstN(D /*tag*/, size_t n) {
307
310
  return Mask1<T>::FromBool(n != 0);
@@ -607,13 +610,23 @@ HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
607
610
 
608
611
  // Returns (a + b + 1) / 2
609
612
 
610
- HWY_API Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
611
- const Vec1<uint8_t> b) {
612
- return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
613
- }
614
- HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
615
- const Vec1<uint16_t> b) {
616
- return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
613
+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
614
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI32
615
+ #else
616
+ #define HWY_NATIVE_AVERAGE_ROUND_UI32
617
+ #endif
618
+
619
+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
620
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI64
621
+ #else
622
+ #define HWY_NATIVE_AVERAGE_ROUND_UI64
623
+ #endif
624
+
625
+ template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
626
+ HWY_API Vec1<T> AverageRound(const Vec1<T> a, const Vec1<T> b) {
627
+ const T a_val = a.raw;
628
+ const T b_val = b.raw;
629
+ return Vec1<T>(static_cast<T>((a_val | b_val) - ScalarShr(a_val ^ b_val, 1)));
617
630
  }
618
631
 
619
632
  // ------------------------------ Absolute value
@@ -721,6 +734,11 @@ HWY_API Vec1<MakeWide<T>> MulEven(const Vec1<T> a, const Vec1<T> b) {
721
734
  return Vec1<TW>(static_cast<TW>(a_wide * b.raw));
722
735
  }
723
736
 
737
+ template <class T>
738
+ HWY_API Vec1<MakeWide<T>> MulOdd(const Vec1<T>, const Vec1<T>) {
739
+ static_assert(sizeof(T) == 0, "There are no odd lanes");
740
+ }
741
+
724
742
  // Approximate reciprocal
725
743
  HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
726
744
  // Zero inputs are allowed, but callers are responsible for replacing the
@@ -831,9 +849,9 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
831
849
  }
832
850
 
833
851
  // Round-to-nearest even.
834
- HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
835
- using T = float;
836
- using TI = int32_t;
852
+ template <class T, HWY_IF_FLOAT3264(T)>
853
+ HWY_API Vec1<MakeSigned<T>> NearestInt(const Vec1<T> v) {
854
+ using TI = MakeSigned<T>;
837
855
 
838
856
  const T abs = Abs(v).raw;
839
857
  const bool is_sign = ScalarSignBit(v.raw);
@@ -843,12 +861,39 @@ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
843
861
  if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
844
862
  return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
845
863
  }
846
- return Vec1<int32_t>(ConvertScalarTo<TI>(v.raw));
864
+ return Vec1<TI>(ConvertScalarTo<TI>(v.raw));
847
865
  }
848
866
  const T bias =
849
867
  ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5);
850
868
  const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
851
- if (rounded == 0) return Vec1<int32_t>(0);
869
+ if (rounded == 0) return Vec1<TI>(0);
870
+ TI offset = 0;
871
+ // Round to even
872
+ if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
873
+ ConvertScalarTo<T>(0.5)) {
874
+ offset = is_sign ? -1 : 1;
875
+ }
876
+ return Vec1<TI>(rounded - offset);
877
+ }
878
+
879
+ // Round-to-nearest even.
880
+ template <class DI32, HWY_IF_I32_D(DI32)>
881
+ HWY_API VFromD<DI32> DemoteToNearestInt(DI32 /*di32*/, const Vec1<double> v) {
882
+ using T = double;
883
+ using TI = int32_t;
884
+
885
+ const T abs = Abs(v).raw;
886
+ const bool is_sign = ScalarSignBit(v.raw);
887
+
888
+ // Check if too large to cast or NaN
889
+ if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
890
+ return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
891
+ }
892
+
893
+ const T bias =
894
+ ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5);
895
+ const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
896
+ if (rounded == 0) return Vec1<TI>(0);
852
897
  TI offset = 0;
853
898
  // Round to even
854
899
  if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
@@ -1612,12 +1657,22 @@ HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) {
1612
1657
  }
1613
1658
 
1614
1659
  // ------------------------------ SwapAdjacentBlocks
1615
-
1616
1660
  template <typename T>
1617
1661
  HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) {
1618
1662
  return v;
1619
1663
  }
1620
1664
 
1665
+ // ------------------------------ InterleaveEvenBlocks
1666
+ template <class D, class V = VFromD<D>>
1667
+ HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
1668
+ return a;
1669
+ }
1670
+ // ------------------------------ InterleaveOddBlocks
1671
+ template <class D, class V = VFromD<D>>
1672
+ HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
1673
+ return a;
1674
+ }
1675
+
1621
1676
  // ------------------------------ TableLookupLanes
1622
1677
 
1623
1678
  // Returned by SetTableIndices for use by TableLookupLanes.
@@ -68,10 +68,17 @@
68
68
  #define HWY_TARGET_IS_PPC 0
69
69
  #endif
70
70
 
71
+ #undef HWY_TARGET_IS_AVX10_2
72
+ #if HWY_TARGET == HWY_AVX10_2
73
+ #define HWY_TARGET_IS_AVX10_2 1
74
+ #else
75
+ #define HWY_TARGET_IS_AVX10_2 0
76
+ #endif
77
+
71
78
  // Supported on all targets except RVV (requires GCC 14 or upcoming Clang)
72
79
  #if HWY_TARGET == HWY_RVV && \
73
80
  ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
74
- (HWY_COMPILER_CLANG))
81
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1700))
75
82
  #define HWY_HAVE_TUPLE 0
76
83
  #else
77
84
  #define HWY_HAVE_TUPLE 1
@@ -133,13 +140,28 @@
133
140
  // Include previous targets, which are the half-vectors of the next target.
134
141
  #define HWY_TARGET_STR_AVX2 \
135
142
  HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
136
- #define HWY_TARGET_STR_AVX3 \
137
- HWY_TARGET_STR_AVX2 ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw"
138
- #define HWY_TARGET_STR_AVX3_DL \
139
- HWY_TARGET_STR_AVX3 \
143
+
144
+ #if (HWY_COMPILER_GCC_ACTUAL >= 1400 && HWY_COMPILER_GCC_ACTUAL < 1600) || \
145
+ HWY_COMPILER_CLANG >= 1800
146
+ #define HWY_TARGET_STR_AVX3_VL512 ",evex512"
147
+ #else
148
+ #define HWY_TARGET_STR_AVX3_VL512
149
+ #endif
150
+
151
+ #define HWY_TARGET_STR_AVX3_256 \
152
+ HWY_TARGET_STR_AVX2 \
153
+ ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw" HWY_TARGET_STR_AVX3_VL512
154
+
155
+ #define HWY_TARGET_STR_AVX3 HWY_TARGET_STR_AVX3_256 HWY_TARGET_STR_AVX3_VL512
156
+
157
+ #define HWY_TARGET_STR_AVX3_DL_256 \
158
+ HWY_TARGET_STR_AVX3_256 \
140
159
  ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
141
160
  "avx512vpopcntdq,gfni"
142
161
 
162
+ #define HWY_TARGET_STR_AVX3_DL \
163
+ HWY_TARGET_STR_AVX3_DL_256 HWY_TARGET_STR_AVX3_VL512
164
+
143
165
  // Force-disable for compilers that do not properly support avx512bf16.
144
166
  #if !defined(HWY_AVX3_DISABLE_AVX512BF16) && \
145
167
  (HWY_COMPILER_CLANGCL || \
@@ -149,12 +171,30 @@
149
171
  #endif
150
172
 
151
173
  #if !defined(HWY_AVX3_DISABLE_AVX512BF16)
152
- #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
174
+ #define HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
175
+ #else
176
+ #define HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_DL
177
+ #endif
178
+
179
+ #define HWY_TARGET_STR_AVX3_ZEN4 \
180
+ HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_VL512
181
+
182
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1400
183
+ #define HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_ZEN4_256 ",avx512fp16"
153
184
  #else
154
- #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL
185
+ #define HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_ZEN4_256
155
186
  #endif
156
187
 
157
- #define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16"
188
+ #define HWY_TARGET_STR_AVX3_SPR \
189
+ HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_VL512
190
+
191
+ #if HWY_COMPILER_GCC_ACTUAL >= 1500
192
+ #define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2"
193
+ #elif HWY_COMPILER_CLANG >= 2000
194
+ #define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2-512"
195
+ #else
196
+ #define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR
197
+ #endif
158
198
 
159
199
  #if defined(HWY_DISABLE_PPC8_CRYPTO)
160
200
  #define HWY_TARGET_STR_PPC8_CRYPTO ""
@@ -277,9 +317,10 @@
277
317
  #define HWY_TARGET_STR HWY_TARGET_STR_AVX2
278
318
 
279
319
  //-----------------------------------------------------------------------------
280
- // AVX3[_DL]
281
- #elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
282
- HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
320
+ // AVX3[_DL]/AVX10
321
+ #elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
322
+ HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR || \
323
+ HWY_TARGET == HWY_AVX10_2
283
324
 
284
325
  #define HWY_ALIGN alignas(64)
285
326
  #define HWY_MAX_BYTES 64
@@ -287,10 +328,9 @@
287
328
 
288
329
  #define HWY_HAVE_SCALABLE 0
289
330
  #define HWY_HAVE_INTEGER64 1
290
- #if HWY_TARGET == HWY_AVX3_SPR && HWY_COMPILER_GCC_ACTUAL && \
331
+ #if HWY_TARGET <= HWY_AVX3_SPR && \
332
+ (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1901) && \
291
333
  HWY_HAVE_SCALAR_F16_TYPE
292
- // TODO: enable F16 for AVX3_SPR target with Clang once compilation issues are
293
- // fixed
294
334
  #define HWY_HAVE_FLOAT16 1
295
335
  #else
296
336
  #define HWY_HAVE_FLOAT16 0
@@ -304,7 +344,12 @@
304
344
  #define HWY_NATIVE_DOT_BF16 0
305
345
  #endif
306
346
  #define HWY_CAP_GE256 1
347
+
348
+ #if HWY_MAX_BYTES >= 64
307
349
  #define HWY_CAP_GE512 1
350
+ #else
351
+ #define HWY_CAP_GE512 0
352
+ #endif
308
353
 
309
354
  #if HWY_TARGET == HWY_AVX3
310
355
 
@@ -326,6 +371,11 @@
326
371
  #define HWY_NAMESPACE N_AVX3_SPR
327
372
  #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR
328
373
 
374
+ #elif HWY_TARGET == HWY_AVX10_2
375
+
376
+ #define HWY_NAMESPACE N_AVX10_2
377
+ #define HWY_TARGET_STR HWY_TARGET_STR_AVX10_2
378
+
329
379
  #else
330
380
  #error "Logic error"
331
381
  #endif // HWY_TARGET
@@ -403,6 +453,29 @@
403
453
  // NEON
404
454
  #elif HWY_TARGET_IS_NEON
405
455
 
456
+ // Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
457
+ #undef HWY_NEON_HAVE_BFLOAT16
458
+ #if HWY_HAVE_SCALAR_BF16_TYPE && \
459
+ ((HWY_TARGET == HWY_NEON_BF16 && \
460
+ (!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
461
+ defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
462
+ #define HWY_NEON_HAVE_BFLOAT16 1
463
+ #else
464
+ #define HWY_NEON_HAVE_BFLOAT16 0
465
+ #endif
466
+
467
+ // HWY_NEON_HAVE_F32_TO_BF16C is defined if NEON vcvt_bf16_f32 and
468
+ // vbfdot_f32 are available, even if the __bf16 type is disabled due to
469
+ // GCC/Clang bugs.
470
+ #undef HWY_NEON_HAVE_F32_TO_BF16C
471
+ #if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
472
+ (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
473
+ (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
474
+ #define HWY_NEON_HAVE_F32_TO_BF16C 1
475
+ #else
476
+ #define HWY_NEON_HAVE_F32_TO_BF16C 0
477
+ #endif
478
+
406
479
  #define HWY_ALIGN alignas(16)
407
480
  #define HWY_MAX_BYTES 16
408
481
  #define HWY_LANES(T) (16 / sizeof(T))
@@ -428,7 +501,8 @@
428
501
  #else
429
502
  #define HWY_NATIVE_FMA 0
430
503
  #endif
431
- #if HWY_NEON_HAVE_F32_TO_BF16C || HWY_TARGET == HWY_NEON_BF16
504
+
505
+ #if HWY_NEON_HAVE_F32_TO_BF16C
432
506
  #define HWY_NATIVE_DOT_BF16 1
433
507
  #else
434
508
  #define HWY_NATIVE_DOT_BF16 0
@@ -480,7 +554,12 @@
480
554
  #endif
481
555
 
482
556
  #if HWY_TARGET == HWY_NEON_WITHOUT_AES
557
+ #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
558
+ // Prevents inadvertent use of SVE by GCC 13.4 and earlier, see #2689.
559
+ #define HWY_TARGET_STR "+nosve"
560
+ #else
483
561
  // Do not define HWY_TARGET_STR (no pragma).
562
+ #endif // HWY_COMPILER_GCC_ACTUAL
484
563
  #elif HWY_TARGET == HWY_NEON
485
564
  #define HWY_TARGET_STR HWY_TARGET_STR_NEON
486
565
  #elif HWY_TARGET == HWY_NEON_BF16
@@ -586,7 +665,7 @@
586
665
  #define HWY_HAVE_SCALABLE 0
587
666
  #define HWY_HAVE_INTEGER64 1
588
667
  #define HWY_HAVE_FLOAT16 0
589
- #define HWY_HAVE_FLOAT64 0
668
+ #define HWY_HAVE_FLOAT64 1
590
669
  #define HWY_MEM_OPS_MIGHT_FAULT 1
591
670
  #define HWY_NATIVE_FMA 0
592
671
  #define HWY_NATIVE_DOT_BF16 0
@@ -629,8 +708,50 @@
629
708
 
630
709
  #define HWY_NAMESPACE N_RVV
631
710
 
711
+ #if HWY_COMPILER_CLANG >= 1900
712
+ // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#181-zvl-minimum-vector-length-standard-extensions
713
+ #define HWY_TARGET_STR "arch=+v"
714
+ #else
715
+ // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
716
+ #endif
717
+
718
+ //-----------------------------------------------------------------------------
719
+ // LSX/LASX
720
+ #elif HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
721
+
722
+ #if HWY_TARGET == HWY_LSX
723
+ #define HWY_ALIGN alignas(16)
724
+ #define HWY_MAX_BYTES 16
725
+ #else
726
+ #define HWY_ALIGN alignas(32)
727
+ #define HWY_MAX_BYTES 32
728
+ #endif
729
+
730
+ #define HWY_LANES(T) (HWY_MAX_BYTES / sizeof(T))
731
+
732
+ #define HWY_HAVE_SCALABLE 0
733
+ #define HWY_HAVE_INTEGER64 1
734
+ #define HWY_HAVE_FLOAT16 0
735
+ #define HWY_HAVE_FLOAT64 1
736
+ #define HWY_MEM_OPS_MIGHT_FAULT 1
737
+ #define HWY_NATIVE_FMA 1
738
+ #define HWY_NATIVE_DOT_BF16 0
739
+
740
+ #if HWY_TARGET == HWY_LSX
741
+ #define HWY_CAP_GE256 0
742
+ #else
743
+ #define HWY_CAP_GE256 1
744
+ #endif
745
+
746
+ #define HWY_CAP_GE512 0
747
+
748
+ #if HWY_TARGET == HWY_LSX
749
+ #define HWY_NAMESPACE N_LSX
750
+ #else
751
+ #define HWY_NAMESPACE N_LASX
752
+ #endif
753
+
632
754
  // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
633
- // (rv64gcv is not a valid target)
634
755
 
635
756
  //-----------------------------------------------------------------------------
636
757
  // EMU128
@@ -152,9 +152,20 @@ constexpr size_t ScaleByPower(size_t N, int pow2) {
152
152
  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
153
153
  }
154
154
 
155
+ template <typename T>
156
+ HWY_INLINE void MaybePoison(T* HWY_RESTRICT unaligned, size_t count) {
157
+ #if HWY_IS_MSAN
158
+ __msan_poison(unaligned, count * sizeof(T));
159
+ #else
160
+ (void)unaligned;
161
+ (void)count;
162
+ #endif
163
+ }
164
+
165
+ // This can be useful for working around MSAN limitations. For example, prior
166
+ // to Clang 16, it did not understand AVX-512 CompressStore.
155
167
  template <typename T>
156
168
  HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) {
157
- // Workaround for MSAN not marking compressstore as initialized (b/233326619)
158
169
  #if HWY_IS_MSAN
159
170
  __msan_unpoison(unaligned, count * sizeof(T));
160
171
  #else
@@ -448,13 +459,32 @@ HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
448
459
  return HWY_MAX_LANES_D(D);
449
460
  }
450
461
 
451
- #if !HWY_HAVE_SCALABLE
462
+ #undef HWY_HAVE_CONSTEXPR_LANES
463
+ #undef HWY_LANES_CONSTEXPR
452
464
 
453
- // If non-scalable, this is constexpr; otherwise the target's header defines a
454
- // non-constexpr version of this function. This is the actual vector length,
455
- // used when advancing loop counters.
465
+ #if HWY_HAVE_SCALABLE
466
+ #define HWY_HAVE_CONSTEXPR_LANES 0
467
+ #define HWY_LANES_CONSTEXPR
468
+ #else
469
+
470
+ // We want Lanes() to be constexpr where possible, so that compilers are able to
471
+ // precompute offsets. However, user code must not depend on the constexpr,
472
+ // because that will fail for RISC-V V and Arm SVE. To achieve both, we mark it
473
+ // as non-constexpr in debug builds, but not sanitizers, because we typically
474
+ // want them to see the same code.
475
+ #if HWY_IS_DEBUG_BUILD && !HWY_IS_SANITIZER
476
+ #define HWY_HAVE_CONSTEXPR_LANES 0
477
+ #define HWY_LANES_CONSTEXPR
478
+ #else
479
+ #define HWY_HAVE_CONSTEXPR_LANES 1
480
+ #define HWY_LANES_CONSTEXPR constexpr
481
+ #endif
482
+
483
+ // Returns actual vector length, used when advancing loop counters. The
484
+ // non-constexpr implementations are defined in their target's header. For a
485
+ // guaranteed-constexpr upper bound, use `MaxLanes(d)`.
456
486
  template <class D>
457
- HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t Lanes(D) {
487
+ HWY_INLINE HWY_MAYBE_UNUSED HWY_LANES_CONSTEXPR size_t Lanes(D) {
458
488
  return HWY_MAX_LANES_D(D);
459
489
  }
460
490
 
@@ -621,8 +651,11 @@ HWY_API bool IsAligned(D d, T* ptr) {
621
651
  #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
622
652
  #define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
623
653
  #define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
654
+ #define HWY_IF_FLOAT3264_V(V) HWY_IF_FLOAT3264(hwy::HWY_NAMESPACE::TFromV<V>)
624
655
  #define HWY_IF_SPECIAL_FLOAT_V(V) \
625
656
  HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
657
+ #define HWY_IF_FLOAT_OR_SPECIAL_V(V) \
658
+ HWY_IF_FLOAT_OR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)
626
659
  #define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \
627
660
  HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)
628
661
 
@@ -633,7 +666,7 @@ HWY_API bool IsAligned(D d, T* ptr) {
633
666
  #define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \
634
667
  HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromV<V>, bit_array)
635
668
 
636
- #define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(DFromV<V>)
669
+ #define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(hwy::HWY_NAMESPACE::DFromV<V>)
637
670
  #define HWY_IF_V_SIZE_V(V, bytes) \
638
671
  HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
639
672
  #define HWY_IF_V_SIZE_LE_V(V, bytes) \
@@ -656,15 +689,22 @@ HWY_API bool IsAligned(D d, T* ptr) {
656
689
  #define HWY_IF_MINMAX_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)
657
690
 
658
691
  #undef HWY_IF_ADDSUB_V
659
- #define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
692
+ #define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(hwy::HWY_NAMESPACE::DFromV<V>, 1)
660
693
 
661
694
  #undef HWY_IF_MULADDSUB_V
662
- #define HWY_IF_MULADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
695
+ #define HWY_IF_MULADDSUB_V(V) \
696
+ HWY_IF_LANES_GT_D(hwy::HWY_NAMESPACE::DFromV<V>, 1)
697
+
698
+ #undef HWY_IF_PAIRWISE_ADD_128_D
699
+ #define HWY_IF_PAIRWISE_ADD_128_D(D) HWY_IF_V_SIZE_GT_D(D, 8)
700
+
701
+ #undef HWY_IF_PAIRWISE_SUB_128_D
702
+ #define HWY_IF_PAIRWISE_SUB_128_D(D) HWY_IF_V_SIZE_GT_D(D, 8)
663
703
 
664
704
  // HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V is used to disable the default
665
705
  // implementation of unsigned to signed DemoteTo/ReorderDemote2To in
666
706
  // generic_ops-inl.h for at least some of the unsigned to signed demotions on
667
- // SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2
707
+ // SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2/LSX/LASX
668
708
 
669
709
  #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
670
710
  #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) void* = nullptr