@img/sharp-libvips-dev 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/include/expat.h +21 -10
  2. package/include/expat_config.h +11 -5
  3. package/include/ffi.h +12 -25
  4. package/include/freetype2/freetype/config/ftoption.h +1 -1
  5. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  6. package/include/glib-2.0/gio/gapplication.h +6 -0
  7. package/include/glib-2.0/gio/giotypes.h +0 -1
  8. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  9. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  10. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  11. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  12. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  13. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  14. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  15. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  16. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  17. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  18. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  19. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  20. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  21. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  22. package/include/glib-2.0/girepository/girepository.h +53 -62
  23. package/include/glib-2.0/girepository/girffi.h +8 -7
  24. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  25. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  26. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  27. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  28. package/include/glib-2.0/girepository/gitypes.h +52 -104
  29. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  30. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  31. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  32. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  33. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  34. package/include/glib-2.0/glib/gbitlock.h +31 -0
  35. package/include/glib-2.0/glib/gmessages.h +8 -0
  36. package/include/glib-2.0/glib/gslice.h +2 -0
  37. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  38. package/include/glib-2.0/glib/gthread.h +191 -3
  39. package/include/glib-2.0/glib-unix.h +7 -1
  40. package/include/glib-2.0/gobject/genums.h +6 -6
  41. package/include/glib-2.0/gobject/glib-types.h +11 -0
  42. package/include/glib-2.0/gobject/gsignal.h +16 -6
  43. package/include/hwy/aligned_allocator.h +171 -6
  44. package/include/hwy/base.h +1765 -543
  45. package/include/hwy/cache_control.h +24 -6
  46. package/include/hwy/detect_compiler_arch.h +23 -2
  47. package/include/hwy/detect_targets.h +56 -13
  48. package/include/hwy/foreach_target.h +24 -0
  49. package/include/hwy/highway.h +20 -3
  50. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  51. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  52. package/include/hwy/ops/emu128-inl.h +271 -196
  53. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  54. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  55. package/include/hwy/ops/rvv-inl.h +1043 -311
  56. package/include/hwy/ops/scalar-inl.h +189 -159
  57. package/include/hwy/ops/set_macros-inl.h +66 -6
  58. package/include/hwy/ops/shared-inl.h +175 -56
  59. package/include/hwy/ops/wasm_128-inl.h +153 -136
  60. package/include/hwy/ops/x86_128-inl.h +1647 -646
  61. package/include/hwy/ops/x86_256-inl.h +1003 -370
  62. package/include/hwy/ops/x86_512-inl.h +948 -353
  63. package/include/hwy/per_target.h +4 -0
  64. package/include/hwy/profiler.h +648 -0
  65. package/include/hwy/robust_statistics.h +2 -2
  66. package/include/hwy/targets.h +18 -11
  67. package/include/hwy/timer.h +11 -0
  68. package/include/libpng16/png.h +32 -29
  69. package/include/libpng16/pngconf.h +2 -2
  70. package/include/libpng16/pnglibconf.h +7 -2
  71. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  72. package/include/libxml2/libxml/parser.h +16 -7
  73. package/include/libxml2/libxml/xmlIO.h +0 -1
  74. package/include/libxml2/libxml/xmlversion.h +4 -4
  75. package/include/pango-1.0/pango/pango-features.h +3 -3
  76. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  77. package/include/pixman-1/pixman-version.h +2 -2
  78. package/include/png.h +32 -29
  79. package/include/pngconf.h +2 -2
  80. package/include/pnglibconf.h +7 -2
  81. package/include/vips/connection.h +9 -3
  82. package/include/vips/util.h +0 -9
  83. package/include/vips/version.h +4 -4
  84. package/package.json +1 -1
  85. package/versions.json +11 -11
@@ -16,7 +16,10 @@
16
16
  // Single-element vectors and operations.
17
17
  // External include guard in highway.h - see comment there.
18
18
 
19
- #include <cmath> // std::abs, std::isnan
19
+ #include "hwy/base.h"
20
+ #ifndef HWY_NO_LIBCXX
21
+ #include <math.h> // sqrtf
22
+ #endif
20
23
 
21
24
  #include "hwy/ops/shared-inl.h"
22
25
 
@@ -49,6 +52,9 @@ struct Vec128 {
49
52
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
50
53
  return *this = (*this - other);
51
54
  }
55
+ HWY_INLINE Vec128& operator%=(const Vec128 other) {
56
+ return *this = (*this % other);
57
+ }
52
58
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
53
59
  return *this = (*this & other);
54
60
  }
@@ -105,7 +111,7 @@ using VFromD = decltype(Zero(D()));
105
111
  template <class D, class VFrom>
106
112
  HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
107
113
  VFromD<D> to;
108
- CopySameSize(&v, &to);
114
+ CopySameSize(&v.raw, &to.raw);
109
115
  return to;
110
116
  }
111
117
 
@@ -122,7 +128,7 @@ HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) {
122
128
  constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);
123
129
 
124
130
  VFromD<D> to = Zero(d);
125
- CopyBytes<kCopyByteLen>(&v, &to);
131
+ CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
126
132
  return to;
127
133
  }
128
134
 
@@ -145,7 +151,7 @@ template <class D, typename T2>
145
151
  HWY_API VFromD<D> Set(D d, const T2 t) {
146
152
  VFromD<D> v;
147
153
  for (size_t i = 0; i < MaxLanes(d); ++i) {
148
- v.raw[i] = static_cast<TFromD<D>>(t);
154
+ v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
149
155
  }
150
156
  return v;
151
157
  }
@@ -156,14 +162,79 @@ HWY_API VFromD<D> Undefined(D d) {
156
162
  return Zero(d);
157
163
  }
158
164
 
165
+ // ------------------------------ Dup128VecFromValues
166
+
167
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
168
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
169
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
170
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
171
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
172
+ TFromD<D> t11, TFromD<D> t12,
173
+ TFromD<D> t13, TFromD<D> t14,
174
+ TFromD<D> t15) {
175
+ VFromD<D> result;
176
+ result.raw[0] = t0;
177
+ result.raw[1] = t1;
178
+ result.raw[2] = t2;
179
+ result.raw[3] = t3;
180
+ result.raw[4] = t4;
181
+ result.raw[5] = t5;
182
+ result.raw[6] = t6;
183
+ result.raw[7] = t7;
184
+ result.raw[8] = t8;
185
+ result.raw[9] = t9;
186
+ result.raw[10] = t10;
187
+ result.raw[11] = t11;
188
+ result.raw[12] = t12;
189
+ result.raw[13] = t13;
190
+ result.raw[14] = t14;
191
+ result.raw[15] = t15;
192
+ return result;
193
+ }
194
+
195
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
196
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
197
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
198
+ TFromD<D> t5, TFromD<D> t6,
199
+ TFromD<D> t7) {
200
+ VFromD<D> result;
201
+ result.raw[0] = t0;
202
+ result.raw[1] = t1;
203
+ result.raw[2] = t2;
204
+ result.raw[3] = t3;
205
+ result.raw[4] = t4;
206
+ result.raw[5] = t5;
207
+ result.raw[6] = t6;
208
+ result.raw[7] = t7;
209
+ return result;
210
+ }
211
+
212
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
213
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
214
+ TFromD<D> t2, TFromD<D> t3) {
215
+ VFromD<D> result;
216
+ result.raw[0] = t0;
217
+ result.raw[1] = t1;
218
+ result.raw[2] = t2;
219
+ result.raw[3] = t3;
220
+ return result;
221
+ }
222
+
223
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
224
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
225
+ VFromD<D> result;
226
+ result.raw[0] = t0;
227
+ result.raw[1] = t1;
228
+ return result;
229
+ }
230
+
159
231
  // ------------------------------ Iota
160
232
 
161
233
  template <class D, typename T = TFromD<D>, typename T2>
162
234
  HWY_API VFromD<D> Iota(D d, T2 first) {
163
235
  VFromD<D> v;
164
236
  for (size_t i = 0; i < MaxLanes(d); ++i) {
165
- v.raw[i] =
166
- AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
237
+ v.raw[i] = AddWithWraparound(static_cast<T>(first), i);
167
238
  }
168
239
  return v;
169
240
  }
@@ -286,7 +357,7 @@ template <typename T, size_t N>
286
357
  HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
287
358
  // This is used inside ShiftRight, so we cannot implement in terms of it.
288
359
  for (size_t i = 0; i < N; ++i) {
289
- v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0);
360
+ v.raw[i] = static_cast<T>(v.raw[i] < 0 ? -1 : 0);
290
361
  }
291
362
  return v;
292
363
  }
@@ -297,7 +368,7 @@ HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
297
368
  template <typename T, size_t N>
298
369
  HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
299
370
  Mask128<T, N> mask;
300
- CopySameSize(&v, &mask);
371
+ CopySameSize(&v.raw, &mask.bits);
301
372
  return mask;
302
373
  }
303
374
 
@@ -307,20 +378,15 @@ using MFromD = decltype(MaskFromVec(VFromD<D>()));
307
378
  template <class DTo, class MFrom>
308
379
  HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
309
380
  MFromD<DTo> to;
310
- CopySameSize(&mask, &to);
381
+ CopySameSize(&mask.bits, &to.bits);
311
382
  return to;
312
383
  }
313
384
 
314
- template <typename T, size_t N>
315
- Vec128<T, N> VecFromMask(Mask128<T, N> mask) {
316
- Vec128<T, N> v;
317
- CopySameSize(&mask, &v);
318
- return v;
319
- }
320
-
321
385
  template <class D>
322
386
  VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
323
- return VecFromMask(mask);
387
+ VFromD<D> v;
388
+ CopySameSize(&mask.bits, &v.raw);
389
+ return v;
324
390
  }
325
391
 
326
392
  template <class D>
@@ -336,19 +402,20 @@ HWY_API MFromD<D> FirstN(D d, size_t n) {
336
402
  template <typename T, size_t N>
337
403
  HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
338
404
  Vec128<T, N> no) {
339
- return IfVecThenElse(VecFromMask(mask), yes, no);
405
+ const DFromV<decltype(yes)> d;
406
+ return IfVecThenElse(VecFromMask(d, mask), yes, no);
340
407
  }
341
408
 
342
409
  template <typename T, size_t N>
343
410
  HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
344
411
  const DFromV<decltype(yes)> d;
345
- return IfVecThenElse(VecFromMask(mask), yes, Zero(d));
412
+ return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
346
413
  }
347
414
 
348
415
  template <typename T, size_t N>
349
416
  HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
350
417
  const DFromV<decltype(no)> d;
351
- return IfVecThenElse(VecFromMask(mask), Zero(d), no);
418
+ return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
352
419
  }
353
420
 
354
421
  template <typename T, size_t N>
@@ -374,7 +441,8 @@ HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
374
441
 
375
442
  template <typename T, size_t N>
376
443
  HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
377
- return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
444
+ const Simd<T, N, 0> d;
445
+ return MaskFromVec(Not(VecFromMask(d, m)));
378
446
  }
379
447
 
380
448
  template <typename T, size_t N>
@@ -614,6 +682,15 @@ HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
614
682
  return sums;
615
683
  }
616
684
 
685
+ template <size_t N>
686
+ HWY_API Vec128<int64_t, (N + 7) / 8> SumsOf8(Vec128<int8_t, N> v) {
687
+ Vec128<int64_t, (N + 7) / 8> sums;
688
+ for (size_t i = 0; i < N; ++i) {
689
+ sums.raw[i / 8] += v.raw[i];
690
+ }
691
+ return sums;
692
+ }
693
+
617
694
  // ------------------------------ SaturatedAdd
618
695
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
619
696
  HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
@@ -652,34 +729,14 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
652
729
 
653
730
  // ------------------------------ Abs
654
731
 
655
- // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
656
- namespace detail {
657
-
658
732
  template <typename T, size_t N>
659
- HWY_INLINE Vec128<T, N> Abs(SignedTag /*tag*/, Vec128<T, N> a) {
733
+ HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
660
734
  for (size_t i = 0; i < N; ++i) {
661
- const T s = a.raw[i];
662
- const T min = hwy::LimitsMin<T>();
663
- a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
735
+ a.raw[i] = ScalarAbs(a.raw[i]);
664
736
  }
665
737
  return a;
666
738
  }
667
739
 
668
- template <typename T, size_t N>
669
- HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
670
- for (size_t i = 0; i < N; ++i) {
671
- v.raw[i] = std::abs(v.raw[i]);
672
- }
673
- return v;
674
- }
675
-
676
- } // namespace detail
677
-
678
- template <typename T, size_t N>
679
- HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
680
- return detail::Abs(hwy::TypeTag<T>(), a);
681
- }
682
-
683
740
  // ------------------------------ Min/Max
684
741
 
685
742
  // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
@@ -706,9 +763,9 @@ template <typename T, size_t N>
706
763
  HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
707
764
  Vec128<T, N> b) {
708
765
  for (size_t i = 0; i < N; ++i) {
709
- if (std::isnan(a.raw[i])) {
766
+ if (ScalarIsNaN(a.raw[i])) {
710
767
  a.raw[i] = b.raw[i];
711
- } else if (std::isnan(b.raw[i])) {
768
+ } else if (ScalarIsNaN(b.raw[i])) {
712
769
  // no change
713
770
  } else {
714
771
  a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
@@ -720,9 +777,9 @@ template <typename T, size_t N>
720
777
  HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
721
778
  Vec128<T, N> b) {
722
779
  for (size_t i = 0; i < N; ++i) {
723
- if (std::isnan(a.raw[i])) {
780
+ if (ScalarIsNaN(a.raw[i])) {
724
781
  a.raw[i] = b.raw[i];
725
- } else if (std::isnan(b.raw[i])) {
782
+ } else if (ScalarIsNaN(b.raw[i])) {
726
783
  // no change
727
784
  } else {
728
785
  a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
@@ -825,7 +882,7 @@ HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
825
882
  return detail::Mul(hwy::TypeTag<T>(), a, b);
826
883
  }
827
884
 
828
- template <typename T, size_t N>
885
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
829
886
  HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
830
887
  for (size_t i = 0; i < N; ++i) {
831
888
  a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
@@ -900,7 +957,7 @@ HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
900
957
  // Zero inputs are allowed, but callers are responsible for replacing the
901
958
  // return value with something else (typically using IfThenElse). This check
902
959
  // avoids a ubsan error. The result is arbitrary.
903
- v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
960
+ v.raw[i] = (ScalarAbs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
904
961
  }
905
962
  return v;
906
963
  }
@@ -913,25 +970,25 @@ HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
913
970
 
914
971
  // ------------------------------ Floating-point multiply-add variants
915
972
 
916
- template <typename T, size_t N>
973
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
917
974
  HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
918
975
  Vec128<T, N> add) {
919
976
  return mul * x + add;
920
977
  }
921
978
 
922
- template <typename T, size_t N>
979
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
923
980
  HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
924
981
  Vec128<T, N> add) {
925
982
  return add - mul * x;
926
983
  }
927
984
 
928
- template <typename T, size_t N>
985
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
929
986
  HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
930
987
  Vec128<T, N> sub) {
931
988
  return mul * x - sub;
932
989
  }
933
990
 
934
- template <typename T, size_t N>
991
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
935
992
  HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
936
993
  Vec128<T, N> sub) {
937
994
  return Neg(mul) * x - sub;
@@ -943,21 +1000,52 @@ template <size_t N>
943
1000
  HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
944
1001
  for (size_t i = 0; i < N; ++i) {
945
1002
  const float half = v.raw[i] * 0.5f;
946
- uint32_t bits;
947
- CopySameSize(&v.raw[i], &bits);
948
1003
  // Initial guess based on log2(f)
949
- bits = 0x5F3759DF - (bits >> 1);
950
- CopySameSize(&bits, &v.raw[i]);
1004
+ v.raw[i] = BitCastScalar<float>(static_cast<uint32_t>(
1005
+ 0x5F3759DF - (BitCastScalar<uint32_t>(v.raw[i]) >> 1)));
951
1006
  // One Newton-Raphson iteration
952
1007
  v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
953
1008
  }
954
1009
  return v;
955
1010
  }
956
1011
 
1012
+ namespace detail {
1013
+
1014
+ static HWY_INLINE float ScalarSqrt(float v) {
1015
+ #if defined(HWY_NO_LIBCXX)
1016
+ #if HWY_COMPILER_GCC_ACTUAL
1017
+ return __builtin_sqrt(v);
1018
+ #else
1019
+ uint32_t bits = BitCastScalar<uint32_t>(v);
1020
+ // Coarse approximation, letting the exponent LSB leak into the mantissa
1021
+ bits = (1 << 29) + (bits >> 1) - (1 << 22);
1022
+ return BitCastScalar<float>(bits);
1023
+ #endif // !HWY_COMPILER_GCC_ACTUAL
1024
+ #else
1025
+ return sqrtf(v);
1026
+ #endif // !HWY_NO_LIBCXX
1027
+ }
1028
+ static HWY_INLINE double ScalarSqrt(double v) {
1029
+ #if defined(HWY_NO_LIBCXX)
1030
+ #if HWY_COMPILER_GCC_ACTUAL
1031
+ return __builtin_sqrt(v);
1032
+ #else
1033
+ uint64_t bits = BitCastScalar<uint64_t>(v);
1034
+ // Coarse approximation, letting the exponent LSB leak into the mantissa
1035
+ bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
1036
+ return BitCastScalar<double>(bits);
1037
+ #endif // !HWY_COMPILER_GCC_ACTUAL
1038
+ #else
1039
+ return sqrt(v);
1040
+ #endif // HWY_NO_LIBCXX
1041
+ }
1042
+
1043
+ } // namespace detail
1044
+
957
1045
  template <typename T, size_t N>
958
1046
  HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
959
1047
  for (size_t i = 0; i < N; ++i) {
960
- v.raw[i] = std::sqrt(v.raw[i]);
1048
+ v.raw[i] = detail::ScalarSqrt(v.raw[i]);
961
1049
  }
962
1050
  return v;
963
1051
  }
@@ -967,21 +1055,23 @@ HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
967
1055
  template <typename T, size_t N>
968
1056
  HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
969
1057
  using TI = MakeSigned<T>;
1058
+ const T k0 = ConvertScalarTo<T>(0);
970
1059
  const Vec128<T, N> a = Abs(v);
971
1060
  for (size_t i = 0; i < N; ++i) {
972
1061
  if (!(a.raw[i] < MantissaEnd<T>())) { // Huge or NaN
973
1062
  continue;
974
1063
  }
975
- const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
976
- const TI rounded = static_cast<TI>(v.raw[i] + bias);
1064
+ const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
1065
+ const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
977
1066
  if (rounded == 0) {
978
- v.raw[i] = v.raw[i] < 0 ? T{-0} : T{0};
1067
+ v.raw[i] = v.raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
979
1068
  continue;
980
1069
  }
981
- const T rounded_f = static_cast<T>(rounded);
1070
+ const T rounded_f = ConvertScalarTo<T>(rounded);
982
1071
  // Round to even
983
- if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
984
- v.raw[i] = static_cast<T>(rounded - (v.raw[i] < T(0) ? -1 : 1));
1072
+ if ((rounded & 1) &&
1073
+ ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
1074
+ v.raw[i] = ConvertScalarTo<T>(rounded - (v.raw[i] < k0 ? -1 : 1));
985
1075
  continue;
986
1076
  }
987
1077
  v.raw[i] = rounded_f;
@@ -994,30 +1084,32 @@ template <size_t N>
994
1084
  HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
995
1085
  using T = float;
996
1086
  using TI = int32_t;
1087
+ const T k0 = ConvertScalarTo<T>(0);
997
1088
 
998
1089
  const Vec128<float, N> abs = Abs(v);
999
1090
  Vec128<int32_t, N> ret;
1000
1091
  for (size_t i = 0; i < N; ++i) {
1001
- const bool signbit = std::signbit(v.raw[i]);
1092
+ const bool signbit = ScalarSignBit(v.raw[i]);
1002
1093
 
1003
1094
  if (!(abs.raw[i] < MantissaEnd<T>())) { // Huge or NaN
1004
1095
  // Check if too large to cast or NaN
1005
- if (!(abs.raw[i] <= static_cast<T>(LimitsMax<TI>()))) {
1096
+ if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
1006
1097
  ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
1007
1098
  continue;
1008
1099
  }
1009
1100
  ret.raw[i] = static_cast<TI>(v.raw[i]);
1010
1101
  continue;
1011
1102
  }
1012
- const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
1013
- const TI rounded = static_cast<TI>(v.raw[i] + bias);
1103
+ const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
1104
+ const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
1014
1105
  if (rounded == 0) {
1015
1106
  ret.raw[i] = 0;
1016
1107
  continue;
1017
1108
  }
1018
- const T rounded_f = static_cast<T>(rounded);
1109
+ const T rounded_f = ConvertScalarTo<T>(rounded);
1019
1110
  // Round to even
1020
- if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
1111
+ if ((rounded & 1) &&
1112
+ ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
1021
1113
  ret.raw[i] = rounded - (signbit ? -1 : 1);
1022
1114
  continue;
1023
1115
  }
@@ -1056,8 +1148,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
1056
1148
  for (size_t i = 0; i < N; ++i) {
1057
1149
  const bool positive = v.raw[i] > Float(0.0);
1058
1150
 
1059
- Bits bits;
1060
- CopySameSize(&v.raw[i], &bits);
1151
+ Bits bits = BitCastScalar<Bits>(v.raw[i]);
1061
1152
 
1062
1153
  const int exponent =
1063
1154
  static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1077,7 +1168,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
1077
1168
  if (positive) bits += (kMantissaMask + 1) >> exponent;
1078
1169
  bits &= ~mantissa_mask;
1079
1170
 
1080
- CopySameSize(&bits, &v.raw[i]);
1171
+ v.raw[i] = BitCastScalar<Float>(bits);
1081
1172
  }
1082
1173
  return v;
1083
1174
  }
@@ -1094,8 +1185,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
1094
1185
  for (size_t i = 0; i < N; ++i) {
1095
1186
  const bool negative = v.raw[i] < Float(0.0);
1096
1187
 
1097
- Bits bits;
1098
- CopySameSize(&v.raw[i], &bits);
1188
+ Bits bits = BitCastScalar<Bits>(v.raw[i]);
1099
1189
 
1100
1190
  const int exponent =
1101
1191
  static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1115,7 +1205,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
1115
1205
  if (negative) bits += (kMantissaMask + 1) >> exponent;
1116
1206
  bits &= ~mantissa_mask;
1117
1207
 
1118
- CopySameSize(&bits, &v.raw[i]);
1208
+ v.raw[i] = BitCastScalar<Float>(bits);
1119
1209
  }
1120
1210
  return v;
1121
1211
  }
@@ -1127,44 +1217,11 @@ HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
1127
1217
  Mask128<T, N> ret;
1128
1218
  for (size_t i = 0; i < N; ++i) {
1129
1219
  // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
1130
- MakeUnsigned<T> bits;
1131
- CopySameSize(&v.raw[i], &bits);
1132
- bits += bits;
1133
- bits >>= 1; // clear sign bit
1134
- // NaN if all exponent bits are set and the mantissa is not zero.
1135
- ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
1220
+ ret.bits[i] = Mask128<T, N>::FromBool(ScalarIsNaN(v.raw[i]));
1136
1221
  }
1137
1222
  return ret;
1138
1223
  }
1139
1224
 
1140
- template <typename T, size_t N>
1141
- HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
1142
- static_assert(IsFloat<T>(), "Only for float");
1143
- const DFromV<decltype(v)> d;
1144
- const RebindToSigned<decltype(d)> di;
1145
- const VFromD<decltype(di)> vi = BitCast(di, v);
1146
- // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1147
- return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
1148
- }
1149
-
1150
- // Returns whether normal/subnormal/zero.
1151
- template <typename T, size_t N>
1152
- HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
1153
- static_assert(IsFloat<T>(), "Only for float");
1154
- const DFromV<decltype(v)> d;
1155
- const RebindToUnsigned<decltype(d)> du;
1156
- const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
1157
- using VI = VFromD<decltype(di)>;
1158
- using VU = VFromD<decltype(du)>;
1159
- const VU vu = BitCast(du, v);
1160
- // 'Shift left' to clear the sign bit, then right so we can compare with the
1161
- // max exponent (cannot compare with MaxExponentTimes2 directly because it is
1162
- // negative and non-negative floats would be greater).
1163
- const VI exp =
1164
- BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
1165
- return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
1166
- }
1167
-
1168
1225
  // ================================================== COMPARE
1169
1226
 
1170
1227
  template <typename T, size_t N>
@@ -1510,67 +1567,59 @@ HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
1510
1567
  namespace detail {
1511
1568
 
1512
1569
  template <class ToT, class FromT>
1513
- HWY_INLINE ToT CastValueForF2IConv(hwy::UnsignedTag /* to_type_tag */,
1514
- FromT val) {
1515
- // Prevent ubsan errors when converting float to narrower integer
1516
-
1517
- // If LimitsMax<ToT>() can be exactly represented in FromT,
1518
- // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
1519
-
1520
- // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
1521
- // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>() + 1, which can
1522
- // be exactly represented in FromT.
1523
- constexpr FromT kSmallestOutOfToTRangePosVal =
1524
- (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 1)
1525
- ? static_cast<FromT>(LimitsMax<ToT>())
1526
- : static_cast<FromT>(
1527
- static_cast<FromT>(ToT{1} << (sizeof(ToT) * 8 - 1)) * FromT(2));
1528
-
1529
- if (std::signbit(val)) {
1530
- return ToT{0};
1531
- } else if (std::isinf(val) || val >= kSmallestOutOfToTRangePosVal) {
1532
- return LimitsMax<ToT>();
1533
- } else {
1534
- return static_cast<ToT>(val);
1535
- }
1536
- }
1537
-
1538
- template <class ToT, class FromT>
1539
- HWY_INLINE ToT CastValueForF2IConv(hwy::SignedTag /* to_type_tag */,
1540
- FromT val) {
1570
+ HWY_INLINE ToT CastValueForF2IConv(FromT val) {
1541
1571
  // Prevent ubsan errors when converting float to narrower integer
1542
1572
 
1543
- // If LimitsMax<ToT>() can be exactly represented in FromT,
1544
- // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
1545
-
1546
- // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
1547
- // kSmallestOutOfToTRangePosVal is equal to -LimitsMin<ToT>(), which can
1548
- // be exactly represented in FromT.
1549
- constexpr FromT kSmallestOutOfToTRangePosVal =
1550
- (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 2)
1551
- ? static_cast<FromT>(LimitsMax<ToT>())
1552
- : static_cast<FromT>(-static_cast<FromT>(LimitsMin<ToT>()));
1553
-
1554
- if (std::isinf(val) || std::fabs(val) >= kSmallestOutOfToTRangePosVal) {
1555
- return std::signbit(val) ? LimitsMin<ToT>() : LimitsMax<ToT>();
1556
- } else {
1557
- return static_cast<ToT>(val);
1558
- }
1573
+ using FromTU = MakeUnsigned<FromT>;
1574
+ using ToTU = MakeUnsigned<ToT>;
1575
+
1576
+ constexpr unsigned kMaxExpField =
1577
+ static_cast<unsigned>(MaxExponentField<FromT>());
1578
+ constexpr unsigned kExpBias = kMaxExpField >> 1;
1579
+ constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
1580
+ kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
1581
+ kMaxExpField));
1582
+
1583
+ // If ToT is signed, compare only the exponent bits of val against
1584
+ // kMinOutOfRangeExpField.
1585
+ //
1586
+ // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
1587
+ // val against kMinOutOfRangeExpField as a negative value is outside of the
1588
+ // range of an unsigned integer type.
1589
+ const FromT val_to_compare =
1590
+ static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
1591
+
1592
+ // val is within the range of ToT if
1593
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
1594
+ // than kMinOutOfRangeExpField
1595
+ //
1596
+ // Otherwise, val is either outside of the range of ToT or equal to
1597
+ // LimitsMin<ToT>() if
1598
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
1599
+ // than or equal to kMinOutOfRangeExpField.
1600
+
1601
+ return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1602
+ MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1603
+ ? static_cast<ToT>(val)
1604
+ : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
1605
+ static_cast<ToTU>(ScalarSignBit(val)));
1559
1606
  }
1560
1607
 
1561
1608
  template <class ToT, class ToTypeTag, class FromT>
1562
1609
  HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
1563
- return static_cast<ToT>(val);
1610
+ return ConvertScalarTo<ToT>(val);
1564
1611
  }
1565
1612
 
1566
1613
  template <class ToT>
1567
- HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag to_type_tag, float val) {
1568
- return CastValueForF2IConv<ToT>(to_type_tag, val);
1614
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
1615
+ float val) {
1616
+ return CastValueForF2IConv<ToT>(val);
1569
1617
  }
1570
1618
 
1571
1619
  template <class ToT>
1572
- HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag to_type_tag, float val) {
1573
- return CastValueForF2IConv<ToT>(to_type_tag, val);
1620
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
1621
+ float val) {
1622
+ return CastValueForF2IConv<ToT>(val);
1574
1623
  }
1575
1624
 
1576
1625
  } // namespace detail
@@ -1594,10 +1643,10 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
1594
1643
  VFromD<D> ret;
1595
1644
  for (size_t i = 0; i < MaxLanes(d); ++i) {
1596
1645
  // Prevent ubsan errors when converting float to narrower integer/float
1597
- if (std::isinf(from.raw[i]) ||
1598
- std::fabs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
1599
- ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<float>()
1600
- : HighestValue<float>();
1646
+ if (ScalarIsInf(from.raw[i]) ||
1647
+ ScalarAbs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
1648
+ ret.raw[i] = ScalarSignBit(from.raw[i]) ? LowestValue<float>()
1649
+ : HighestValue<float>();
1601
1650
  continue;
1602
1651
  }
1603
1652
  ret.raw[i] = static_cast<float>(from.raw[i]);
@@ -1609,8 +1658,7 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
1609
1658
  VFromD<D> ret;
1610
1659
  for (size_t i = 0; i < MaxLanes(d); ++i) {
1611
1660
  // Prevent ubsan errors when converting double to narrower integer/int32_t
1612
- ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(
1613
- hwy::TypeTag<TFromD<D>>(), from.raw[i]);
1661
+ ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
1614
1662
  }
1615
1663
  return ret;
1616
1664
  }
@@ -1715,23 +1763,20 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
1715
1763
  return ReorderDemote2To(dn, a, b);
1716
1764
  }
1717
1765
 
1718
- template <class DN, HWY_IF_BF16_D(DN), class V, HWY_IF_F32_D(DFromV<V>),
1766
+ template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), class V,
1767
+ HWY_IF_F32_D(DFromV<V>),
1719
1768
  HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1720
1769
  HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
1721
- const RebindToUnsigned<DFromV<decltype(a)>> du32;
1722
- const size_t NW = Lanes(du32);
1723
- VFromD<Repartition<uint16_t, DN>> ret;
1724
-
1725
- const auto a_bits = BitCast(du32, a);
1726
- const auto b_bits = BitCast(du32, b);
1727
-
1770
+ const size_t NW = Lanes(dn) / 2;
1771
+ using TN = TFromD<DN>;
1772
+ VFromD<DN> ret;
1728
1773
  for (size_t i = 0; i < NW; ++i) {
1729
- ret.raw[i] = static_cast<uint16_t>(a_bits.raw[i] >> 16);
1774
+ ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
1730
1775
  }
1731
1776
  for (size_t i = 0; i < NW; ++i) {
1732
- ret.raw[NW + i] = static_cast<uint16_t>(b_bits.raw[i] >> 16);
1777
+ ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
1733
1778
  }
1734
- return BitCast(dn, ret);
1779
+ return ret;
1735
1780
  }
1736
1781
 
1737
1782
  namespace detail {
@@ -1780,7 +1825,7 @@ HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/,
1780
1825
 
1781
1826
  for (size_t i = 0; i < N; ++i) {
1782
1827
  // float## -> int##: return closest representable value
1783
- ret.raw[i] = CastValueForF2IConv<ToT>(hwy::TypeTag<ToT>(), from.raw[i]);
1828
+ ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
1784
1829
  }
1785
1830
  return ret;
1786
1831
  }
@@ -1980,8 +2025,16 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
1980
2025
  return ret;
1981
2026
  }
1982
2027
 
2028
+ // 2023-11-23: workaround for incorrect codegen (reduction_test fails for
2029
+ // SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
2030
+ #if HWY_ARCH_RVV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
2031
+ #define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
2032
+ #else
2033
+ #define HWY_EMU128_CONCAT_INLINE HWY_API
2034
+ #endif
2035
+
1983
2036
  template <class D>
1984
- HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
2037
+ HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
1985
2038
  const Half<decltype(d)> dh;
1986
2039
  VFromD<D> ret;
1987
2040
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
@@ -2349,8 +2402,8 @@ HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
2349
2402
  }
2350
2403
 
2351
2404
  // Additional overload for the optional tag.
2352
- template <class V>
2353
- HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2405
+ template <class D>
2406
+ HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
2354
2407
  return InterleaveLower(a, b);
2355
2408
  }
2356
2409
 
@@ -2416,6 +2469,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
2416
2469
  return m;
2417
2470
  }
2418
2471
 
2472
+ template <class D>
2473
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
2474
+ MFromD<D> m;
2475
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2476
+ m.bits[i] = MFromD<D>::FromBool(((mask_bits >> i) & 1u) != 0);
2477
+ }
2478
+ return m;
2479
+ }
2480
+
2419
2481
  // `p` points to at least 8 writable bytes.
2420
2482
  template <class D>
2421
2483
  HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
@@ -2517,7 +2579,7 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, const Mask128<T, N> mask) {
2517
2579
  if (mask.bits[i]) {
2518
2580
  ret.raw[i] = v.raw[in_pos++];
2519
2581
  } else {
2520
- ret.raw[i] = T(); // zero, also works for float16_t
2582
+ ret.raw[i] = ConvertScalarTo<T>(0);
2521
2583
  }
2522
2584
  }
2523
2585
  return ret;
@@ -2754,15 +2816,13 @@ HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) {
2754
2816
 
2755
2817
  // ================================================== REDUCTIONS
2756
2818
 
2757
- template <class D, typename T = TFromD<D>>
2758
- HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
2759
- T sum = T{0};
2760
- for (size_t i = 0; i < MaxLanes(d); ++i) {
2761
- sum += v.raw[i];
2762
- }
2763
- return Set(d, sum);
2764
- }
2765
- template <class D, typename T = TFromD<D>>
2819
+ #ifdef HWY_NATIVE_REDUCE_SCALAR
2820
+ #undef HWY_NATIVE_REDUCE_SCALAR
2821
+ #else
2822
+ #define HWY_NATIVE_REDUCE_SCALAR
2823
+ #endif
2824
+
2825
+ template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2766
2826
  HWY_API T ReduceSum(D d, VFromD<D> v) {
2767
2827
  T sum = T{0};
2768
2828
  for (size_t i = 0; i < MaxLanes(d); ++i) {
@@ -2770,21 +2830,36 @@ HWY_API T ReduceSum(D d, VFromD<D> v) {
2770
2830
  }
2771
2831
  return sum;
2772
2832
  }
2773
- template <class D, typename T = TFromD<D>>
2774
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
2833
+ template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2834
+ HWY_API T ReduceMin(D d, VFromD<D> v) {
2775
2835
  T min = HighestValue<T>();
2776
2836
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2777
2837
  min = HWY_MIN(min, v.raw[i]);
2778
2838
  }
2779
- return Set(d, min);
2839
+ return min;
2780
2840
  }
2781
- template <class D, typename T = TFromD<D>>
2782
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
2841
+ template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2842
+ HWY_API T ReduceMax(D d, VFromD<D> v) {
2783
2843
  T max = LowestValue<T>();
2784
2844
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2785
2845
  max = HWY_MAX(max, v.raw[i]);
2786
2846
  }
2787
- return Set(d, max);
2847
+ return max;
2848
+ }
2849
+
2850
+ // ------------------------------ SumOfLanes
2851
+
2852
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
2853
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
2854
+ return Set(d, ReduceSum(d, v));
2855
+ }
2856
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
2857
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
2858
+ return Set(d, ReduceMin(d, v));
2859
+ }
2860
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
2861
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
2862
+ return Set(d, ReduceMax(d, v));
2788
2863
  }
2789
2864
 
2790
2865
  // ================================================== OPS WITH DEPENDENCIES