@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
@@ -16,6 +16,7 @@
  // Single-element vectors and operations.
  // External include guard in highway.h - see comment there.

+ #include <stdint.h>
  #ifndef HWY_NO_LIBCXX
  #include <math.h> // sqrtf
  #endif
@@ -53,6 +54,9 @@ struct Vec1 {
  HWY_INLINE Vec1& operator-=(const Vec1 other) {
  return *this = (*this - other);
  }
+ HWY_INLINE Vec1& operator%=(const Vec1 other) {
+ return *this = (*this % other);
+ }
  HWY_INLINE Vec1& operator&=(const Vec1 other) {
  return *this = (*this & other);
  }
@@ -101,17 +105,12 @@ HWY_API Vec1<TTo> BitCast(DTo /* tag */, Vec1<TFrom> v) {

  template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
  HWY_API Vec1<T> Zero(D /* tag */) {
- Vec1<T> v;
- ZeroBytes<sizeof(v.raw)>(&v.raw);
- return v;
+ return Vec1<T>(ConvertScalarTo<T>(0));
  }

  template <class D>
  using VFromD = decltype(Zero(D()));

- // ------------------------------ Tuple (VFromD)
- #include "hwy/ops/tuple-inl.h"
-
  // ------------------------------ Set
  template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
  HWY_API Vec1<T> Set(D /* tag */, const T2 t) {
@@ -137,7 +136,7 @@ HWY_API VFromD<D> ResizeBitCast(D /* tag */, FromV v) {
  using TFrom = TFromV<FromV>;
  using TTo = TFromD<D>;
  constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo));
- TTo to = TTo{0};
+ TTo to{};
  CopyBytes<kCopyLen>(&v.raw, &to);
  return VFromD<D>(to);
  }
@@ -156,6 +155,39 @@ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,

  } // namespace detail

+ // ------------------------------ Dup128VecFromValues
+
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
+ TFromD<D> /*t2*/, TFromD<D> /*t3*/,
+ TFromD<D> /*t4*/, TFromD<D> /*t5*/,
+ TFromD<D> /*t6*/, TFromD<D> /*t7*/,
+ TFromD<D> /*t8*/, TFromD<D> /*t9*/,
+ TFromD<D> /*t10*/, TFromD<D> /*t11*/,
+ TFromD<D> /*t12*/, TFromD<D> /*t13*/,
+ TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
+ return VFromD<D>(t0);
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
+ TFromD<D> /*t2*/, TFromD<D> /*t3*/,
+ TFromD<D> /*t4*/, TFromD<D> /*t5*/,
+ TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
+ return VFromD<D>(t0);
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
+ TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
+ return VFromD<D>(t0);
+ }
+
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/) {
+ return VFromD<D>(t0);
+ }
+
  // ================================================== LOGICAL

  // ------------------------------ Not
@@ -300,8 +332,7 @@ HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
  // ------------------------------ BroadcastSignBit
  template <typename T>
  HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
- // This is used inside ShiftRight, so we cannot implement in terms of it.
- return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
+ return Vec1<T>(ScalarShr(v.raw, sizeof(T) * 8 - 1));
  }

  // ------------------------------ PopulationCount
@@ -328,12 +359,12 @@ HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,

  template <typename T>
  HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
- return mask.bits ? yes : Vec1<T>(0);
+ return mask.bits ? yes : Vec1<T>(ConvertScalarTo<T>(0));
  }

  template <typename T>
  HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
- return mask.bits ? Vec1<T>(0) : no;
+ return mask.bits ? Vec1<T>(ConvertScalarTo<T>(0)) : no;
  }

  template <typename T>
@@ -345,11 +376,6 @@ HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
  return vi.raw < 0 ? yes : no;
  }

- template <typename T>
- HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
- return v.raw < 0 ? Vec1<T>(0) : v;
- }
-
  // ------------------------------ Mask logical

  template <typename T>
@@ -407,6 +433,19 @@ HWY_API Mask1<T> SetAtOrBeforeFirst(Mask1<T> /*mask*/) {
  return Mask1<T>::FromBool(true);
  }

+ // ------------------------------ LowerHalfOfMask
+
+ #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
+ #undef HWY_NATIVE_LOWER_HALF_OF_MASK
+ #else
+ #define HWY_NATIVE_LOWER_HALF_OF_MASK
+ #endif
+
+ template <class D>
+ HWY_API MFromD<D> LowerHalfOfMask(D /*d*/, MFromD<D> m) {
+ return m;
+ }
+
  // ================================================== SHIFTS

  // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
@@ -421,35 +460,20 @@ HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
  template <int kBits, typename T>
  HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
- #if __cplusplus >= 202002L
- // Signed right shift is now guaranteed to be arithmetic (rounding toward
- // negative infinity, i.e. shifting in the sign bit).
- return Vec1<T>(static_cast<T>(v.raw >> kBits));
- #else
- if (IsSigned<T>()) {
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
- // signed shifts are still implementation-defined.
- using TU = hwy::MakeUnsigned<T>;
- const Sisd<TU> du;
- const TU shifted = static_cast<TU>(BitCast(du, v).raw >> kBits);
- const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
- const size_t sign_shift =
- static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
- const TU upper = static_cast<TU>(sign << sign_shift);
- return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
- } else { // T is unsigned
- return Vec1<T>(static_cast<T>(v.raw >> kBits));
- }
- #endif
+ return Vec1<T>(ScalarShr(v.raw, kBits));
  }

  // ------------------------------ RotateRight (ShiftRight)
- template <int kBits, typename T>
+ template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
+ const DFromV<decltype(v)> d;
+ const RebindToUnsigned<decltype(d)> du;
+
  constexpr size_t kSizeInBits = sizeof(T) * 8;
- static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift");
+ static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
- return Or(ShiftRight<kBits>(v),
+
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
  }

@@ -463,26 +487,7 @@ HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {

  template <typename T>
  HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
- #if __cplusplus >= 202002L
- // Signed right shift is now guaranteed to be arithmetic (rounding toward
- // negative infinity, i.e. shifting in the sign bit).
- return Vec1<T>(static_cast<T>(v.raw >> bits));
- #else
- if (IsSigned<T>()) {
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
- // signed shifts are still implementation-defined.
- using TU = hwy::MakeUnsigned<T>;
- const Sisd<TU> du;
- const TU shifted = static_cast<TU>(BitCast(du, v).raw >> bits);
- const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
- const size_t sign_shift =
- static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
- const TU upper = static_cast<TU>(sign << sign_shift);
- return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
- } else { // T is unsigned
- return Vec1<T>(static_cast<T>(v.raw >> bits));
- }
- #endif
+ return Vec1<T>(ScalarShr(v.raw, bits));
  }

  // ------------------------------ Shl
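The two hunks above collapse the pre-C++20 emulation of arithmetic right shift into a single ScalarShr call (a helper presumably defined elsewhere in this release, e.g. in the much-enlarged base.h). The sketch below is not Highway code; it only illustrates, under that assumption, the sign-extending shift that the removed emulation implemented and that signed ShiftRight/ShiftRightSame keep providing:

    #include <cstdint>
    #include <cstdio>

    // Sign-extending (arithmetic) right shift built from unsigned shifts only,
    // mirroring the emulation removed from ShiftRight/ShiftRightSame above.
    int32_t ArithmeticShr(int32_t v, int bits) {
      const uint32_t u = static_cast<uint32_t>(v);
      const uint32_t shifted = u >> bits;
      const uint32_t sign = (v < 0) ? ~uint32_t{0} : 0u;  // all ones if negative
      // Two smaller shifts avoid an undefined full-width shift when bits == 0.
      const uint32_t upper = (sign << (31 - bits)) << 1;
      return static_cast<int32_t>(shifted | upper);
    }

    int main() {
      std::printf("%d %d\n", ArithmeticShr(-16, 2), ArithmeticShr(16, 2));  // -4 4
      return 0;
    }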
@@ -528,10 +533,22 @@ HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {

  // ------------------------------ SumsOf8

+ HWY_API Vec1<int64_t> SumsOf8(const Vec1<int8_t> v) {
+ return Vec1<int64_t>(v.raw);
+ }
  HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
  return Vec1<uint64_t>(v.raw);
  }

+ // ------------------------------ SumsOf2
+
+ template <class T>
+ HWY_API Vec1<MakeWide<T>> SumsOf2(const Vec1<T> v) {
+ const DFromV<decltype(v)> d;
+ const Rebind<MakeWide<T>, decltype(d)> dw;
+ return PromoteTo(dw, v);
+ }
+
  // ------------------------------ SaturatedAdd

  // Returns a + b clamped to the destination range.
@@ -603,57 +620,12 @@ HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,

  template <typename T>
  HWY_API Vec1<T> Abs(const Vec1<T> a) {
- const T i = a.raw;
- if (i >= 0 || i == hwy::LimitsMin<T>()) return a;
- return Vec1<T>(static_cast<T>(-i & T{-1}));
- }
- HWY_API Vec1<float> Abs(Vec1<float> a) {
- int32_t i;
- CopyBytes<sizeof(i)>(&a.raw, &i);
- i &= 0x7FFFFFFF;
- CopyBytes<sizeof(i)>(&i, &a.raw);
- return a;
- }
- HWY_API Vec1<double> Abs(Vec1<double> a) {
- int64_t i;
- CopyBytes<sizeof(i)>(&a.raw, &i);
- i &= 0x7FFFFFFFFFFFFFFFL;
- CopyBytes<sizeof(i)>(&i, &a.raw);
- return a;
+ return Vec1<T>(ScalarAbs(a.raw));
  }

  // ------------------------------ Min/Max

  // <cmath> may be unavailable, so implement our own.
- namespace detail {
-
- static inline float Abs(float f) {
- uint32_t i;
- CopyBytes<4>(&f, &i);
- i &= 0x7FFFFFFFu;
- CopyBytes<4>(&i, &f);
- return f;
- }
- static inline double Abs(double f) {
- uint64_t i;
- CopyBytes<8>(&f, &i);
- i &= 0x7FFFFFFFFFFFFFFFull;
- CopyBytes<8>(&i, &f);
- return f;
- }
-
- static inline bool SignBit(float f) {
- uint32_t i;
- CopyBytes<4>(&f, &i);
- return (i >> 31) != 0;
- }
- static inline bool SignBit(double f) {
- uint64_t i;
- CopyBytes<8>(&f, &i);
- return (i >> 63) != 0;
- }
-
- } // namespace detail

  template <typename T, HWY_IF_NOT_FLOAT(T)>
  HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
@@ -662,8 +634,8 @@ HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {

  template <typename T, HWY_IF_FLOAT(T)>
  HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
- if (isnan(a.raw)) return b;
- if (isnan(b.raw)) return a;
+ if (ScalarIsNaN(a.raw)) return b;
+ if (ScalarIsNaN(b.raw)) return a;
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
  }

@@ -674,8 +646,8 @@ HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {

  template <typename T, HWY_IF_FLOAT(T)>
  HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
- if (isnan(a.raw)) return b;
- if (isnan(b.raw)) return a;
+ if (ScalarIsNaN(a.raw)) return b;
+ if (ScalarIsNaN(b.raw)) return a;
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
  }

@@ -716,21 +688,24 @@ HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
  static_cast<uint64_t>(b.raw)));
  }

- template <typename T>
+ template <typename T, HWY_IF_FLOAT(T)>
  HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(a.raw / b.raw);
  }

- // Returns the upper 16 bits of a * b in each lane.
- HWY_API Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
- return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
+ template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+ HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
+ using TW = MakeWide<T>;
+ return Vec1<T>(static_cast<T>(
+ (static_cast<TW>(a.raw) * static_cast<TW>(b.raw)) >> (sizeof(T) * 8)));
  }
- HWY_API Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a, const Vec1<uint16_t> b) {
- // Cast to uint32_t first to prevent overflow. Otherwise the result of
- // uint16_t * uint16_t is in "int" which may overflow. In practice the result
- // is the same but this way it is also defined.
- return Vec1<uint16_t>(static_cast<uint16_t>(
- (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
+ template <class T, HWY_IF_UI64(T)>
+ HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
+ T hi;
+ Mul128(a.raw, b.raw, &hi);
+ return Vec1<T>(hi);
  }

  HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
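The generalized MulHigh above computes the product in a type twice as wide and keeps only the upper sizeof(T)*8 bits (via Mul128 for 64-bit lanes). A standalone illustration of that arithmetic for uint16_t lanes, independent of Highway:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint16_t a = 0xFFFF, b = 0xFFFF;
      const uint32_t wide = uint32_t{a} * uint32_t{b};        // 0xFFFE0001
      const uint16_t hi = static_cast<uint16_t>(wide >> 16);  // upper half: 0xFFFE
      std::printf("0x%04X * 0x%04X = 0x%08X, upper half = 0x%04X\n", a, b, wide, hi);
      return 0;
    }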
@@ -763,23 +738,23 @@ HWY_API Vec1<T> AbsDiff(const Vec1<T> a, const Vec1<T> b) {

  // ------------------------------ Floating-point multiply-add variants

- template <typename T>
+ template <typename T, HWY_IF_FLOAT(T)>
  HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
  return mul * x + add;
  }

- template <typename T>
+ template <typename T, HWY_IF_FLOAT(T)>
  HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
  const Vec1<T> add) {
  return add - mul * x;
  }

- template <typename T>
+ template <typename T, HWY_IF_FLOAT(T)>
  HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
  return mul * x - sub;
  }

- template <typename T>
+ template <typename T, HWY_IF_FLOAT(T)>
  HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
  const Vec1<T> sub) {
  return Neg(mul) * x - sub;
@@ -842,14 +817,17 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
  if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN
  return v;
  }
- const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
- const TI rounded = static_cast<TI>(v.raw + bias);
- if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
+ const T k0 = ConvertScalarTo<T>(0);
+ const T bias = ConvertScalarTo<T>(v.raw < k0 ? -0.5 : 0.5);
+ const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
+ if (rounded == 0) return CopySignToAbs(Vec1<T>(k0), v);
+ TI offset = 0;
  // Round to even
- if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
- return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
+ if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
+ ConvertScalarTo<T>(0.5)) {
+ offset = v.raw < k0 ? -1 : 1;
  }
- return Vec1<T>(static_cast<T>(rounded));
+ return Vec1<T>(ConvertScalarTo<T>(rounded - offset));
  }

  // Round-to-nearest even.
@@ -858,23 +836,26 @@ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
  using TI = int32_t;

  const T abs = Abs(v).raw;
- const bool is_sign = detail::SignBit(v.raw);
+ const bool is_sign = ScalarSignBit(v.raw);

  if (!(abs < MantissaEnd<T>())) { // Huge or NaN
  // Check if too large to cast or NaN
- if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
+ if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
  return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
  }
- return Vec1<int32_t>(static_cast<TI>(v.raw));
+ return Vec1<int32_t>(ConvertScalarTo<TI>(v.raw));
  }
- const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
- const TI rounded = static_cast<TI>(v.raw + bias);
+ const T bias =
+ ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5);
+ const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
  if (rounded == 0) return Vec1<int32_t>(0);
+ TI offset = 0;
  // Round to even
- if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
- return Vec1<TI>(rounded - (is_sign ? -1 : 1));
+ if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
+ ConvertScalarTo<T>(0.5)) {
+ offset = is_sign ? -1 : 1;
  }
- return Vec1<TI>(rounded);
+ return Vec1<TI>(rounded - offset);
  }

  template <typename T>
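The reworked Round and NearestInt above keep round-half-to-even semantics but express the tie correction through an offset instead of an early return. A minimal standalone version of the same scheme (illustrative only, not the Highway implementation):

    #include <cmath>
    #include <cstdio>

    // Round-half-to-even, following the bias/offset structure used above.
    long RoundHalfToEven(double v) {
      const double bias = v < 0.0 ? -0.5 : 0.5;
      const long rounded = static_cast<long>(v + bias);
      if (rounded == 0) return 0;
      long offset = 0;
      // Tie (distance exactly 0.5) and rounded is odd: step to the even neighbor.
      if ((rounded & 1) && std::fabs(static_cast<double>(rounded) - v) == 0.5) {
        offset = v < 0.0 ? -1 : 1;
      }
      return rounded - offset;
    }

    int main() {
      // 2.5 -> 2, 3.5 -> 4, -2.5 -> -2
      std::printf("%ld %ld %ld\n", RoundHalfToEven(2.5), RoundHalfToEven(3.5),
                  RoundHalfToEven(-2.5));
      return 0;
    }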
@@ -883,9 +864,9 @@ HWY_API Vec1<T> Trunc(const Vec1<T> v) {
  if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN
  return v;
  }
- const TI truncated = static_cast<TI>(v.raw);
+ const TI truncated = ConvertScalarTo<TI>(v.raw);
  if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
- return Vec1<T>(static_cast<T>(truncated));
+ return Vec1<T>(ConvertScalarTo<T>(truncated));
  }

  template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
@@ -1009,14 +990,16 @@ HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
  template <typename T>
  HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
  // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
- MakeUnsigned<T> bits;
- CopySameSize(&v, &bits);
- bits += bits;
- bits >>= 1; // clear sign bit
- // NaN if all exponent bits are set and the mantissa is not zero.
- return Mask1<T>::FromBool(bits > ExponentMask<T>());
+ return Mask1<T>::FromBool(ScalarIsNaN(v.raw));
  }

+ // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
+ #ifdef HWY_NATIVE_ISINF
+ #undef HWY_NATIVE_ISINF
+ #else
+ #define HWY_NATIVE_ISINF
+ #endif
+
  HWY_API Mask1<float> IsInf(const Vec1<float> v) {
  const Sisd<float> d;
  const RebindToUnsigned<decltype(d)> du;
@@ -1126,6 +1109,9 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
  }
  }

+ // ------------------------------ Tuples
+ #include "hwy/ops/inside-inl.h"
+
  // ------------------------------ LoadInterleaved2/3/4

  // Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
@@ -1205,8 +1191,9 @@ HWY_API void Stream(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) {
  template <class D, typename T = TFromD<D>, typename TI>
  HWY_API void ScatterOffset(Vec1<T> v, D d, T* base, Vec1<TI> offset) {
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
- uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
- Store(v, d, reinterpret_cast<T*>(base8));
+ const intptr_t addr =
+ reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
+ Store(v, d, reinterpret_cast<T*>(addr));
  }

  template <class D, typename T = TFromD<D>, typename TI>
@@ -1231,27 +1218,36 @@ HWY_API void MaskedScatterIndex(Vec1<T> v, Mask1<T> m, D d,
  #define HWY_NATIVE_GATHER
  #endif

- template <class D, typename T = TFromD<D>, typename TI>
- HWY_API Vec1<T> GatherOffset(D d, const T* base, Vec1<TI> offset) {
- static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
+ template <class D, typename T = TFromD<D>>
+ HWY_API Vec1<T> GatherOffset(D d, const T* base, Vec1<MakeSigned<T>> offset) {
+ HWY_DASSERT(offset.raw >= 0);
  const intptr_t addr =
  reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
  return Load(d, reinterpret_cast<const T*>(addr));
  }

- template <class D, typename T = TFromD<D>, typename TI>
- HWY_API Vec1<T> GatherIndex(D d, const T* HWY_RESTRICT base, Vec1<TI> index) {
- static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
+ template <class D, typename T = TFromD<D>>
+ HWY_API Vec1<T> GatherIndex(D d, const T* HWY_RESTRICT base,
+ Vec1<MakeSigned<T>> index) {
+ HWY_DASSERT(index.raw >= 0);
  return Load(d, base + index.raw);
  }

- template <class D, typename T = TFromD<D>, typename TI>
+ template <class D, typename T = TFromD<D>>
  HWY_API Vec1<T> MaskedGatherIndex(Mask1<T> m, D d, const T* HWY_RESTRICT base,
- Vec1<TI> index) {
- static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
+ Vec1<MakeSigned<T>> index) {
+ HWY_DASSERT(index.raw >= 0);
  return MaskedLoad(m, d, base + index.raw);
  }

+ template <class D, typename T = TFromD<D>>
+ HWY_API Vec1<T> MaskedGatherIndexOr(Vec1<T> no, Mask1<T> m, D d,
+ const T* HWY_RESTRICT base,
+ Vec1<MakeSigned<T>> index) {
+ HWY_DASSERT(index.raw >= 0);
+ return MaskedLoadOr(no, m, d, base + index.raw);
+ }
+
  // ================================================== CONVERT

  // ConvertTo and DemoteTo with floating-point input and integer output truncate
@@ -1260,73 +1256,111 @@ HWY_API Vec1<T> MaskedGatherIndex(Mask1<T> m, D d, const T* HWY_RESTRICT base,
  namespace detail {

  template <class ToT, class FromT>
- HWY_INLINE ToT CastValueForF2IConv(hwy::UnsignedTag /* to_type_tag */,
- FromT val) {
+ HWY_INLINE ToT CastValueForF2IConv(FromT val) {
  // Prevent ubsan errors when converting float to narrower integer

- // If LimitsMax<ToT>() can be exactly represented in FromT,
- // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
-
- // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
- // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>() + 1, which can
- // be exactly represented in FromT.
- constexpr FromT kSmallestOutOfToTRangePosVal =
- (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 1)
- ? static_cast<FromT>(LimitsMax<ToT>())
- : static_cast<FromT>(
- static_cast<FromT>(ToT{1} << (sizeof(ToT) * 8 - 1)) * FromT(2));
-
- if (detail::SignBit(val)) {
- return ToT{0};
- } else if (IsInf(Vec1<FromT>(val)).bits ||
- val >= kSmallestOutOfToTRangePosVal) {
- return LimitsMax<ToT>();
- } else {
- return static_cast<ToT>(val);
- }
- }
-
- template <class ToT, class FromT>
- HWY_INLINE ToT CastValueForF2IConv(hwy::SignedTag /* to_type_tag */,
- FromT val) {
- // Prevent ubsan errors when converting float to narrower integer
-
- // If LimitsMax<ToT>() can be exactly represented in FromT,
- // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
-
- // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
- // kSmallestOutOfToTRangePosVal is equal to -LimitsMin<ToT>(), which can
- // be exactly represented in FromT.
- constexpr FromT kSmallestOutOfToTRangePosVal =
- (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 2)
- ? static_cast<FromT>(LimitsMax<ToT>())
- : static_cast<FromT>(-static_cast<FromT>(LimitsMin<ToT>()));
-
- if (IsInf(Vec1<FromT>(val)).bits ||
- detail::Abs(val) >= kSmallestOutOfToTRangePosVal) {
- return detail::SignBit(val) ? LimitsMin<ToT>() : LimitsMax<ToT>();
- } else {
- return static_cast<ToT>(val);
- }
+ using FromTU = MakeUnsigned<FromT>;
+ using ToTU = MakeUnsigned<ToT>;
+
+ constexpr unsigned kMaxExpField =
+ static_cast<unsigned>(MaxExponentField<FromT>());
+ constexpr unsigned kExpBias = kMaxExpField >> 1;
+ constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
+ kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
+ kMaxExpField));
+
+ // If ToT is signed, compare only the exponent bits of val against
+ // kMinOutOfRangeExpField.
+ //
+ // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
+ // val against kMinOutOfRangeExpField as a negative value is outside of the
+ // range of an unsigned integer type.
+ const FromT val_to_compare =
+ static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
+
+ // val is within the range of ToT if
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
+ // than kMinOutOfRangeExpField
+ //
+ // Otherwise, val is either outside of the range of ToT or equal to
+ // LimitsMin<ToT>() if
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
+ // than or equal to kMinOutOfRangeExpField.
+
+ return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
+ MantissaBits<FromT>()) < kMinOutOfRangeExpField)
+ ? static_cast<ToT>(val)
+ : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
+ static_cast<ToTU>(ScalarSignBit(val)));
  }

  template <class ToT, class ToTypeTag, class FromT>
  HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
- return static_cast<ToT>(val);
+ return ConvertScalarTo<ToT>(val);
  }

  template <class ToT>
- HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag to_type_tag, float val) {
- return CastValueForF2IConv<ToT>(to_type_tag, val);
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
+ float val) {
+ return CastValueForF2IConv<ToT>(val);
  }

  template <class ToT>
- HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag to_type_tag, float val) {
- return CastValueForF2IConv<ToT>(to_type_tag, val);
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
+ float val) {
+ return CastValueForF2IConv<ToT>(val);
+ }
+
+ // If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
+ // returns static_cast<ToT>(val)
+ //
+ // Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
+ // implementation-defined result if val is not within the range of ToT.
+ template <class ToT, class FromT>
+ HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
+ // Prevent ubsan errors when converting float to narrower integer
+
+ using FromTU = MakeUnsigned<FromT>;
+
+ constexpr unsigned kMaxExpField =
+ static_cast<unsigned>(MaxExponentField<FromT>());
+ constexpr unsigned kExpBias = kMaxExpField >> 1;
+ constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
+ kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
+ kMaxExpField));
+
+ // If ToT is signed, compare only the exponent bits of val against
+ // kMinOutOfRangeExpField.
+ //
+ // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
+ // val against kMinOutOfRangeExpField as a negative value is outside of the
+ // range of an unsigned integer type.
+ const FromT val_to_compare =
+ static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
+
+ // val is within the range of ToT if
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
+ // than kMinOutOfRangeExpField
+ //
+ // Otherwise, val is either outside of the range of ToT or equal to
+ // LimitsMin<ToT>() if
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
+ // than or equal to kMinOutOfRangeExpField.
+
+ return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
+ MantissaBits<FromT>()) < kMinOutOfRangeExpField)
+ ? static_cast<ToT>(val)
+ : static_cast<ToT>(LimitsMin<ToT>());
  }

  } // namespace detail

+ #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
+ #undef HWY_NATIVE_PROMOTE_F16_TO_F64
+ #else
+ #define HWY_NATIVE_PROMOTE_F16_TO_F64
+ #endif
+
  template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
  HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting");
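The rewritten CastValueForF2IConv (and the new CastValueForInRangeF2IConv) above replace the old per-tag range constants with a single comparison of the exponent field against kMinOutOfRangeExpField. A standalone check of that test (not Highway code) for FromT = float and ToT = int32_t, where kMinOutOfRangeExpField = min(127 + 32 - 1, 0xFF) = 158, meaning any magnitude of at least 2^31 saturates:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      const unsigned kMaxExpField = 0xFF;                  // 8 exponent bits in float
      const unsigned kExpBias = kMaxExpField >> 1;         // 127
      const unsigned kMinOutOfRange = kExpBias + 32 - 1;   // 158 for signed 32-bit

      const float vals[] = {2147483520.0f,   // 2^31 - 128: largest float below 2^31
                            2147483648.0f};  // 2^31: first out-of-range magnitude
      for (const float f : vals) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));              // stand-in for BitCastScalar
        const unsigned exp_field = (bits >> 23) & kMaxExpField;
        std::printf("%.1f: exponent field %u -> %s\n", static_cast<double>(f),
                    exp_field, exp_field < kMinOutOfRange ? "in range" : "saturates");
      }
      return 0;
    }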
@@ -1335,6 +1369,18 @@ HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) {
  detail::CastValueForPromoteTo<TTo>(hwy::TypeTag<TTo>(), from.raw));
  }

+ #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+ #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+ #else
+ #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+ #endif
+
+ template <class DTo, HWY_IF_UI64_D(DTo)>
+ HWY_API VFromD<DTo> PromoteInRangeTo(DTo /* tag */, Vec1<float> from) {
+ using TTo = TFromD<DTo>;
+ return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(from.raw));
+ }
+
  // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
  // so we overload for TFrom=double and TTo={float,int32_t}.
  template <class D, HWY_IF_F32_D(D)>
@@ -1342,16 +1388,15 @@ HWY_API Vec1<float> DemoteTo(D /* tag */, Vec1<double> from) {
  // Prevent ubsan errors when converting float to narrower integer/float
  if (IsInf(from).bits ||
  Abs(from).raw > static_cast<double>(HighestValue<float>())) {
- return Vec1<float>(detail::SignBit(from.raw) ? LowestValue<float>()
- : HighestValue<float>());
+ return Vec1<float>(ScalarSignBit(from.raw) ? LowestValue<float>()
+ : HighestValue<float>());
  }
  return Vec1<float>(static_cast<float>(from.raw));
  }
  template <class D, HWY_IF_UI32_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec1<double> from) {
  // Prevent ubsan errors when converting int32_t to narrower integer/int32_t
- return Vec1<TFromD<D>>(detail::CastValueForF2IConv<TFromD<D>>(
- hwy::TypeTag<TFromD<D>>(), from.raw));
+ return Vec1<TFromD<D>>(detail::CastValueForF2IConv<TFromD<D>>(from.raw));
  }

  template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
@@ -1365,15 +1410,30 @@ HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
  return Vec1<TTo>(static_cast<TTo>(from.raw));
  }

+ // Disable the default unsigned to signed DemoteTo implementation in
+ // generic_ops-inl.h on SCALAR as the SCALAR target has a target-specific
+ // implementation of the unsigned to signed DemoteTo op and as ReorderDemote2To
+ // is not supported on the SCALAR target
+
+ // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
+ // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
+ // !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
+ // SFINAE to occur instead of a hard error due to a dependency on the V template
+ // argument
+ #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
+ #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
+
  template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
- HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED_D(DTo)>
+ HWY_IF_UNSIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
  HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

+ const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
+
  // Int to int: choose closest value in TTo to `from` (avoids UB)
- from.raw = HWY_MIN(from.raw, LimitsMax<TTo>());
- return Vec1<TTo>(static_cast<TTo>(from.raw));
+ return Vec1<TTo>(static_cast<TTo>(HWY_MIN(from.raw, max)));
  }

  template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
@@ -1383,6 +1443,19 @@ HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
  return Vec1<TTo>(static_cast<TTo>(from.raw));
  }

+ #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+ #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+ #else
+ #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+ #endif
+
+ template <class D32, HWY_IF_UI32_D(D32)>
+ HWY_API VFromD<D32> DemoteInRangeTo(D32 /*d32*/,
+ VFromD<Rebind<double, D32>> v) {
+ using TTo = TFromD<D32>;
+ return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
+ }
+
  // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions;
  // use this scalar version to verify the vector implementation.
  #ifdef HWY_NATIVE_F16C
@@ -1401,11 +1474,22 @@ HWY_API Vec1<float> PromoteTo(D d, const Vec1<bfloat16_t> v) {
  return Set(d, F32FromBF16(v.raw));
  }

+ template <class DTo, typename TFrom>
+ HWY_API VFromD<DTo> PromoteEvenTo(DTo d_to, Vec1<TFrom> v) {
+ return PromoteTo(d_to, v);
+ }
+
  template <class D, HWY_IF_F16_D(D)>
  HWY_API Vec1<float16_t> DemoteTo(D /* tag */, const Vec1<float> v) {
  return Vec1<float16_t>(F16FromF32(v.raw));
  }

+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
+ #else
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
+ #endif
+
  template <class D, HWY_IF_BF16_D(D)>
  HWY_API Vec1<bfloat16_t> DemoteTo(D d, const Vec1<float> v) {
  return Set(d, BF16FromF32(v.raw));
@@ -1416,8 +1500,7 @@ template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
  HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
  // float## -> int##: return closest representable value.
- return Vec1<TTo>(
- detail::CastValueForF2IConv<TTo>(hwy::TypeTag<TTo>(), from.raw));
+ return Vec1<TTo>(detail::CastValueForF2IConv<TTo>(from.raw));
  }

  template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
@@ -1428,6 +1511,19 @@ HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
  return Vec1<TTo>(static_cast<TTo>(from.raw));
  }

+ #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+ #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+ #else
+ #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+ #endif
+
+ template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
+ HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
+ HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
+ using TTo = TFromD<DI>;
+ return VFromD<DI>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
+ }
+
  HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
  return DemoteTo(Sisd<uint8_t>(), v);
  }
@@ -1792,6 +1888,11 @@ HWY_API Mask1<T> LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) {
  return Mask1<T>::FromBool((bits[0] & 1) != 0);
  }

+ template <class D, HWY_IF_LANES_D(D, 1)>
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D /*d*/, unsigned mask_bits) {
+ return MFromD<D>::FromBool((mask_bits & 1) != 0);
+ }
+
  // `p` points to at least 8 writable bytes.
  template <class D, typename T = TFromD<D>>
  HWY_API size_t StoreMaskBits(D d, const Mask1<T> mask, uint8_t* bits) {
@@ -1910,6 +2011,35 @@ HWY_API Vec1<int32_t> WidenMulPairwiseAdd(D32 /* tag */, Vec1<int16_t> a,
  return Vec1<int32_t>(a.raw * b.raw);
  }

+ // ------------------------------ SatWidenMulAccumFixedPoint
+ #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+ #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+ #else
+ #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+ #endif
+
+ template <class DI32, HWY_IF_I32_D(DI32)>
+ HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
+ VFromD<Rebind<int16_t, DI32>> a,
+ VFromD<Rebind<int16_t, DI32>> b,
+ VFromD<DI32> sum) {
+ // Multiplying static_cast<int32_t>(a.raw) by static_cast<int32_t>(b.raw)
+ // followed by an addition of the product is okay as
+ // (a.raw * b.raw * 2) is between -2147418112 and 2147483648 and as
+ // a.raw * b.raw * 2 can only overflow an int32_t if both a.raw and b.raw are
+ // equal to -32768.
+
+ const VFromD<DI32> product(static_cast<int32_t>(a.raw) *
+ static_cast<int32_t>(b.raw));
+ const VFromD<DI32> product2 = Add(product, product);
+
+ const auto mul_overflow =
+ VecFromMask(di32, Eq(product2, Set(di32, LimitsMin<int32_t>())));
+
+ return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
+ Add(product2, mul_overflow));
+ }
+
  // ------------------------------ SatWidenMulPairwiseAdd

  #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
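The overflow comment inside SatWidenMulAccumFixedPoint above can be verified with plain integer arithmetic: for int16_t operands, the doubled product only leaves the int32_t range in the single case a == b == -32768. A standalone check:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Extremes of (a * b * 2) over all int16_t operands a and b.
      const long long lo = 2LL * (-32768) * 32767;     // -2147418112: still fits in int32_t
      const long long hi = 2LL * (-32768) * (-32768);  //  2147483648: INT32_MAX + 1, the lone overflow
      std::printf("lowest doubled product  = %lld\n", lo);
      std::printf("highest doubled product = %lld (INT32_MAX = %d)\n", hi, INT32_MAX);
      return 0;
    }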
@@ -1937,6 +2067,12 @@ HWY_API Vec1<int16_t> SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1<uint8_t> a,

  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

+ #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+ #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+ #else
+ #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+ #endif
+
  template <class D32, HWY_IF_F32_D(D32)>
  HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a,
  Vec1<bfloat16_t> b,
@@ -1971,23 +2107,7 @@ HWY_API Vec1<TW> RearrangeToOddPlusEven(Vec1<TW> sum0, Vec1<TW> /* sum1 */) {

  // ================================================== REDUCTIONS

- // Sum of all lanes, i.e. the only one.
- template <class D, typename T = TFromD<D>>
- HWY_API Vec1<T> SumOfLanes(D /* tag */, const Vec1<T> v) {
- return v;
- }
- template <class D, typename T = TFromD<D>>
- HWY_API T ReduceSum(D /* tag */, const Vec1<T> v) {
- return GetLane(v);
- }
- template <class D, typename T = TFromD<D>>
- HWY_API Vec1<T> MinOfLanes(D /* tag */, const Vec1<T> v) {
- return v;
- }
- template <class D, typename T = TFromD<D>>
- HWY_API Vec1<T> MaxOfLanes(D /* tag */, const Vec1<T> v) {
- return v;
- }
+ // Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum.

  // NOLINTNEXTLINE(google-readability-namespace-comments)
  } // namespace HWY_NAMESPACE