@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
@@ -16,7 +16,11 @@
16
16
  // Single-element vectors and operations.
17
17
  // External include guard in highway.h - see comment there.
18
18
 
19
- #include <cmath> // std::abs, std::isnan
19
+ #include "hwy/base.h"
20
+
21
+ #ifndef HWY_NO_LIBCXX
22
+ #include <math.h> // sqrtf
23
+ #endif
20
24
 
21
25
  #include "hwy/ops/shared-inl.h"
22
26
 
@@ -49,6 +53,9 @@ struct Vec128 {
49
53
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
50
54
  return *this = (*this - other);
51
55
  }
56
+ HWY_INLINE Vec128& operator%=(const Vec128 other) {
57
+ return *this = (*this % other);
58
+ }
52
59
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
53
60
  return *this = (*this & other);
54
61
  }
@@ -97,15 +104,12 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
97
104
  template <class D>
98
105
  using VFromD = decltype(Zero(D()));
99
106
 
100
- // ------------------------------ Tuple (VFromD)
101
- #include "hwy/ops/tuple-inl.h"
102
-
103
107
  // ------------------------------ BitCast
104
108
 
105
109
  template <class D, class VFrom>
106
110
  HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
107
111
  VFromD<D> to;
108
- CopySameSize(&v, &to);
112
+ CopySameSize(&v.raw, &to.raw);
109
113
  return to;
110
114
  }
111
115
 
@@ -122,7 +126,7 @@ HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) {
122
126
  constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);
123
127
 
124
128
  VFromD<D> to = Zero(d);
125
- CopyBytes<kCopyByteLen>(&v, &to);
129
+ CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
126
130
  return to;
127
131
  }
128
132
 
@@ -145,7 +149,7 @@ template <class D, typename T2>
145
149
  HWY_API VFromD<D> Set(D d, const T2 t) {
146
150
  VFromD<D> v;
147
151
  for (size_t i = 0; i < MaxLanes(d); ++i) {
148
- v.raw[i] = static_cast<TFromD<D>>(t);
152
+ v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
149
153
  }
150
154
  return v;
151
155
  }
@@ -156,14 +160,79 @@ HWY_API VFromD<D> Undefined(D d) {
156
160
  return Zero(d);
157
161
  }
158
162
 
163
+ // ------------------------------ Dup128VecFromValues
164
+
165
+ template <class D, HWY_IF_T_SIZE_D(D, 1)>
166
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
167
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
168
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
169
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
170
+ TFromD<D> t11, TFromD<D> t12,
171
+ TFromD<D> t13, TFromD<D> t14,
172
+ TFromD<D> t15) {
173
+ VFromD<D> result;
174
+ result.raw[0] = t0;
175
+ result.raw[1] = t1;
176
+ result.raw[2] = t2;
177
+ result.raw[3] = t3;
178
+ result.raw[4] = t4;
179
+ result.raw[5] = t5;
180
+ result.raw[6] = t6;
181
+ result.raw[7] = t7;
182
+ result.raw[8] = t8;
183
+ result.raw[9] = t9;
184
+ result.raw[10] = t10;
185
+ result.raw[11] = t11;
186
+ result.raw[12] = t12;
187
+ result.raw[13] = t13;
188
+ result.raw[14] = t14;
189
+ result.raw[15] = t15;
190
+ return result;
191
+ }
192
+
193
+ template <class D, HWY_IF_T_SIZE_D(D, 2)>
194
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
195
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
196
+ TFromD<D> t5, TFromD<D> t6,
197
+ TFromD<D> t7) {
198
+ VFromD<D> result;
199
+ result.raw[0] = t0;
200
+ result.raw[1] = t1;
201
+ result.raw[2] = t2;
202
+ result.raw[3] = t3;
203
+ result.raw[4] = t4;
204
+ result.raw[5] = t5;
205
+ result.raw[6] = t6;
206
+ result.raw[7] = t7;
207
+ return result;
208
+ }
209
+
210
+ template <class D, HWY_IF_T_SIZE_D(D, 4)>
211
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
212
+ TFromD<D> t2, TFromD<D> t3) {
213
+ VFromD<D> result;
214
+ result.raw[0] = t0;
215
+ result.raw[1] = t1;
216
+ result.raw[2] = t2;
217
+ result.raw[3] = t3;
218
+ return result;
219
+ }
220
+
221
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
222
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
223
+ VFromD<D> result;
224
+ result.raw[0] = t0;
225
+ result.raw[1] = t1;
226
+ return result;
227
+ }
228
+
159
229
  // ------------------------------ Iota
160
230
 
161
231
  template <class D, typename T = TFromD<D>, typename T2>
162
232
  HWY_API VFromD<D> Iota(D d, T2 first) {
163
233
  VFromD<D> v;
164
234
  for (size_t i = 0; i < MaxLanes(d); ++i) {
165
- v.raw[i] =
166
- AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
235
+ v.raw[i] = AddWithWraparound(static_cast<T>(first), i);
167
236
  }
168
237
  return v;
169
238
  }
@@ -284,9 +353,8 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
284
353
  // ------------------------------ BroadcastSignBit
285
354
  template <typename T, size_t N>
286
355
  HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
287
- // This is used inside ShiftRight, so we cannot implement in terms of it.
288
356
  for (size_t i = 0; i < N; ++i) {
289
- v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0);
357
+ v.raw[i] = ScalarShr(v.raw[i], sizeof(T) * 8 - 1);
290
358
  }
291
359
  return v;
292
360
  }
@@ -297,7 +365,7 @@ HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
297
365
  template <typename T, size_t N>
298
366
  HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
299
367
  Mask128<T, N> mask;
300
- CopySameSize(&v, &mask);
368
+ CopySameSize(&v.raw, &mask.bits);
301
369
  return mask;
302
370
  }
303
371
 
@@ -307,20 +375,15 @@ using MFromD = decltype(MaskFromVec(VFromD<D>()));
307
375
  template <class DTo, class MFrom>
308
376
  HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
309
377
  MFromD<DTo> to;
310
- CopySameSize(&mask, &to);
378
+ CopySameSize(&mask.bits, &to.bits);
311
379
  return to;
312
380
  }
313
381
 
314
- template <typename T, size_t N>
315
- Vec128<T, N> VecFromMask(Mask128<T, N> mask) {
316
- Vec128<T, N> v;
317
- CopySameSize(&mask, &v);
318
- return v;
319
- }
320
-
321
382
  template <class D>
322
383
  VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
323
- return VecFromMask(mask);
384
+ VFromD<D> v;
385
+ CopySameSize(&mask.bits, &v.raw);
386
+ return v;
324
387
  }
325
388
 
326
389
  template <class D>
@@ -336,19 +399,20 @@ HWY_API MFromD<D> FirstN(D d, size_t n) {
336
399
  template <typename T, size_t N>
337
400
  HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
338
401
  Vec128<T, N> no) {
339
- return IfVecThenElse(VecFromMask(mask), yes, no);
402
+ const DFromV<decltype(yes)> d;
403
+ return IfVecThenElse(VecFromMask(d, mask), yes, no);
340
404
  }
341
405
 
342
406
  template <typename T, size_t N>
343
407
  HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
344
408
  const DFromV<decltype(yes)> d;
345
- return IfVecThenElse(VecFromMask(mask), yes, Zero(d));
409
+ return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
346
410
  }
347
411
 
348
412
  template <typename T, size_t N>
349
413
  HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
350
414
  const DFromV<decltype(no)> d;
351
- return IfVecThenElse(VecFromMask(mask), Zero(d), no);
415
+ return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
352
416
  }
353
417
 
354
418
  template <typename T, size_t N>
@@ -364,17 +428,12 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
364
428
  return v;
365
429
  }
366
430
 
367
- template <typename T, size_t N>
368
- HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
369
- const DFromV<decltype(v)> d;
370
- return IfNegativeThenElse(v, Zero(d), v);
371
- }
372
-
373
431
  // ------------------------------ Mask logical
374
432
 
375
433
  template <typename T, size_t N>
376
434
  HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
377
- return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
435
+ const Simd<T, N, 0> d;
436
+ return MaskFromVec(Not(VecFromMask(d, m)));
378
437
  }
379
438
 
380
439
  template <typename T, size_t N>
@@ -426,41 +485,26 @@ HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
426
485
  template <int kBits, typename T, size_t N>
427
486
  HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
428
487
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
429
- #if __cplusplus >= 202002L
430
488
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
431
489
  // negative infinity, i.e. shifting in the sign bit).
432
490
  for (size_t i = 0; i < N; ++i) {
433
- v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
434
- }
435
- #else
436
- if (IsSigned<T>()) {
437
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
438
- // signed shifts are still implementation-defined.
439
- using TU = hwy::MakeUnsigned<T>;
440
- for (size_t i = 0; i < N; ++i) {
441
- const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
442
- const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
443
- const size_t sign_shift =
444
- static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
445
- const TU upper = static_cast<TU>(sign << sign_shift);
446
- v.raw[i] = static_cast<T>(shifted | upper);
447
- }
448
- } else { // T is unsigned
449
- for (size_t i = 0; i < N; ++i) {
450
- v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
451
- }
491
+ v.raw[i] = ScalarShr(v.raw[i], kBits);
452
492
  }
453
- #endif
493
+
454
494
  return v;
455
495
  }
456
496
 
457
497
  // ------------------------------ RotateRight (ShiftRight)
458
- template <int kBits, typename T, size_t N>
498
+ template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
459
499
  HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
500
+ const DFromV<decltype(v)> d;
501
+ const RebindToUnsigned<decltype(d)> du;
502
+
460
503
  constexpr size_t kSizeInBits = sizeof(T) * 8;
461
504
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
462
505
  if (kBits == 0) return v;
463
- return Or(ShiftRight<kBits>(v),
506
+
507
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
464
508
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
465
509
  }
466
510
 
@@ -477,31 +521,10 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
477
521
 
478
522
  template <typename T, size_t N>
479
523
  HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
480
- #if __cplusplus >= 202002L
481
- // Signed right shift is now guaranteed to be arithmetic (rounding toward
482
- // negative infinity, i.e. shifting in the sign bit).
483
524
  for (size_t i = 0; i < N; ++i) {
484
- v.raw[i] = static_cast<T>(v.raw[i] >> bits);
485
- }
486
- #else
487
- if (IsSigned<T>()) {
488
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
489
- // signed shifts are still implementation-defined.
490
- using TU = hwy::MakeUnsigned<T>;
491
- for (size_t i = 0; i < N; ++i) {
492
- const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
493
- const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
494
- const size_t sign_shift =
495
- static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
496
- const TU upper = static_cast<TU>(sign << sign_shift);
497
- v.raw[i] = static_cast<T>(shifted | upper);
498
- }
499
- } else {
500
- for (size_t i = 0; i < N; ++i) {
501
- v.raw[i] = static_cast<T>(v.raw[i] >> bits); // unsigned, logical shift
502
- }
525
+ v.raw[i] = ScalarShr(v.raw[i], bits);
503
526
  }
504
- #endif
527
+
505
528
  return v;
506
529
  }
507
530
 
@@ -519,32 +542,10 @@ HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
519
542
 
520
543
  template <typename T, size_t N>
521
544
  HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
522
- #if __cplusplus >= 202002L
523
- // Signed right shift is now guaranteed to be arithmetic (rounding toward
524
- // negative infinity, i.e. shifting in the sign bit).
525
545
  for (size_t i = 0; i < N; ++i) {
526
- v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
527
- }
528
- #else
529
- if (IsSigned<T>()) {
530
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
531
- // signed shifts are still implementation-defined.
532
- using TU = hwy::MakeUnsigned<T>;
533
- for (size_t i = 0; i < N; ++i) {
534
- const TU shifted =
535
- static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
536
- const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
537
- const size_t sign_shift = static_cast<size_t>(
538
- static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
539
- const TU upper = static_cast<TU>(sign << sign_shift);
540
- v.raw[i] = static_cast<T>(shifted | upper);
541
- }
542
- } else { // T is unsigned
543
- for (size_t i = 0; i < N; ++i) {
544
- v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
545
- }
546
+ v.raw[i] = ScalarShr(v.raw[i], static_cast<int>(bits.raw[i]));
546
547
  }
547
- #endif
548
+
548
549
  return v;
549
550
  }
550
551
 
@@ -614,6 +615,15 @@ HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
614
615
  return sums;
615
616
  }
616
617
 
618
+ template <size_t N>
619
+ HWY_API Vec128<int64_t, (N + 7) / 8> SumsOf8(Vec128<int8_t, N> v) {
620
+ Vec128<int64_t, (N + 7) / 8> sums;
621
+ for (size_t i = 0; i < N; ++i) {
622
+ sums.raw[i / 8] += v.raw[i];
623
+ }
624
+ return sums;
625
+ }
626
+
617
627
  // ------------------------------ SaturatedAdd
618
628
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
619
629
  HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
@@ -652,34 +662,14 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
652
662
 
653
663
  // ------------------------------ Abs
654
664
 
655
- // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
656
- namespace detail {
657
-
658
665
  template <typename T, size_t N>
659
- HWY_INLINE Vec128<T, N> Abs(SignedTag /*tag*/, Vec128<T, N> a) {
666
+ HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
660
667
  for (size_t i = 0; i < N; ++i) {
661
- const T s = a.raw[i];
662
- const T min = hwy::LimitsMin<T>();
663
- a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
668
+ a.raw[i] = ScalarAbs(a.raw[i]);
664
669
  }
665
670
  return a;
666
671
  }
667
672
 
668
- template <typename T, size_t N>
669
- HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
670
- for (size_t i = 0; i < N; ++i) {
671
- v.raw[i] = std::abs(v.raw[i]);
672
- }
673
- return v;
674
- }
675
-
676
- } // namespace detail
677
-
678
- template <typename T, size_t N>
679
- HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
680
- return detail::Abs(hwy::TypeTag<T>(), a);
681
- }
682
-
683
673
  // ------------------------------ Min/Max
684
674
 
685
675
  // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
@@ -706,9 +696,9 @@ template <typename T, size_t N>
706
696
  HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
707
697
  Vec128<T, N> b) {
708
698
  for (size_t i = 0; i < N; ++i) {
709
- if (std::isnan(a.raw[i])) {
699
+ if (ScalarIsNaN(a.raw[i])) {
710
700
  a.raw[i] = b.raw[i];
711
- } else if (std::isnan(b.raw[i])) {
701
+ } else if (ScalarIsNaN(b.raw[i])) {
712
702
  // no change
713
703
  } else {
714
704
  a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
@@ -720,9 +710,9 @@ template <typename T, size_t N>
720
710
  HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
721
711
  Vec128<T, N> b) {
722
712
  for (size_t i = 0; i < N; ++i) {
723
- if (std::isnan(a.raw[i])) {
713
+ if (ScalarIsNaN(a.raw[i])) {
724
714
  a.raw[i] = b.raw[i];
725
- } else if (std::isnan(b.raw[i])) {
715
+ } else if (ScalarIsNaN(b.raw[i])) {
726
716
  // no change
727
717
  } else {
728
718
  a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
@@ -825,7 +815,7 @@ HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
825
815
  return detail::Mul(hwy::TypeTag<T>(), a, b);
826
816
  }
827
817
 
828
- template <typename T, size_t N>
818
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
829
819
  HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
830
820
  for (size_t i = 0; i < N; ++i) {
831
821
  a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
@@ -833,26 +823,36 @@ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
833
823
  return a;
834
824
  }
835
825
 
836
- // Returns the upper 16 bits of a * b in each lane.
837
- template <size_t N>
838
- HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
826
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
827
+ template <class T, size_t N,
828
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
829
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
830
+ HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
831
+ using TW = MakeWide<T>;
839
832
  for (size_t i = 0; i < N; ++i) {
840
- a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
833
+ a.raw[i] = static_cast<T>(
834
+ (static_cast<TW>(a.raw[i]) * static_cast<TW>(b.raw[i])) >>
835
+ (sizeof(T) * 8));
841
836
  }
842
837
  return a;
843
838
  }
844
- template <size_t N>
845
- HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
846
- Vec128<uint16_t, N> b) {
847
- for (size_t i = 0; i < N; ++i) {
848
- // Cast to uint32_t first to prevent overflow. Otherwise the result of
849
- // uint16_t * uint16_t is in "int" which may overflow. In practice the
850
- // result is the same but this way it is also defined.
851
- a.raw[i] = static_cast<uint16_t>(
852
- (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
853
- 16);
854
- }
855
- return a;
839
+
840
+ template <class T, HWY_IF_UI64(T)>
841
+ HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
842
+ T hi;
843
+ Mul128(GetLane(a), GetLane(b), &hi);
844
+ return Set(Full64<T>(), hi);
845
+ }
846
+
847
+ template <class T, HWY_IF_UI64(T)>
848
+ HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
849
+ T hi_0;
850
+ T hi_1;
851
+
852
+ Mul128(GetLane(a), GetLane(b), &hi_0);
853
+ Mul128(ExtractLane(a, 1), ExtractLane(b, 1), &hi_1);
854
+
855
+ return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
856
856
  }
857
857
 
858
858
  template <size_t N>
@@ -900,7 +900,7 @@ HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
900
900
  // Zero inputs are allowed, but callers are responsible for replacing the
901
901
  // return value with something else (typically using IfThenElse). This check
902
902
  // avoids a ubsan error. The result is arbitrary.
903
- v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
903
+ v.raw[i] = (ScalarAbs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
904
904
  }
905
905
  return v;
906
906
  }
@@ -913,25 +913,25 @@ HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
913
913
 
914
914
  // ------------------------------ Floating-point multiply-add variants
915
915
 
916
- template <typename T, size_t N>
916
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
917
917
  HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
918
918
  Vec128<T, N> add) {
919
919
  return mul * x + add;
920
920
  }
921
921
 
922
- template <typename T, size_t N>
922
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
923
923
  HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
924
924
  Vec128<T, N> add) {
925
925
  return add - mul * x;
926
926
  }
927
927
 
928
- template <typename T, size_t N>
928
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
929
929
  HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
930
930
  Vec128<T, N> sub) {
931
931
  return mul * x - sub;
932
932
  }
933
933
 
934
- template <typename T, size_t N>
934
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
935
935
  HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
936
936
  Vec128<T, N> sub) {
937
937
  return Neg(mul) * x - sub;
@@ -943,21 +943,52 @@ template <size_t N>
943
943
  HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
944
944
  for (size_t i = 0; i < N; ++i) {
945
945
  const float half = v.raw[i] * 0.5f;
946
- uint32_t bits;
947
- CopySameSize(&v.raw[i], &bits);
948
946
  // Initial guess based on log2(f)
949
- bits = 0x5F3759DF - (bits >> 1);
950
- CopySameSize(&bits, &v.raw[i]);
947
+ v.raw[i] = BitCastScalar<float>(static_cast<uint32_t>(
948
+ 0x5F3759DF - (BitCastScalar<uint32_t>(v.raw[i]) >> 1)));
951
949
  // One Newton-Raphson iteration
952
950
  v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
953
951
  }
954
952
  return v;
955
953
  }
956
954
 
955
+ namespace detail {
956
+
957
+ static HWY_INLINE float ScalarSqrt(float v) {
958
+ #if defined(HWY_NO_LIBCXX)
959
+ #if HWY_COMPILER_GCC_ACTUAL
960
+ return __builtin_sqrt(v);
961
+ #else
962
+ uint32_t bits = BitCastScalar<uint32_t>(v);
963
+ // Coarse approximation, letting the exponent LSB leak into the mantissa
964
+ bits = (1 << 29) + (bits >> 1) - (1 << 22);
965
+ return BitCastScalar<float>(bits);
966
+ #endif // !HWY_COMPILER_GCC_ACTUAL
967
+ #else
968
+ return sqrtf(v);
969
+ #endif // !HWY_NO_LIBCXX
970
+ }
971
+ static HWY_INLINE double ScalarSqrt(double v) {
972
+ #if defined(HWY_NO_LIBCXX)
973
+ #if HWY_COMPILER_GCC_ACTUAL
974
+ return __builtin_sqrt(v);
975
+ #else
976
+ uint64_t bits = BitCastScalar<uint64_t>(v);
977
+ // Coarse approximation, letting the exponent LSB leak into the mantissa
978
+ bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
979
+ return BitCastScalar<double>(bits);
980
+ #endif // !HWY_COMPILER_GCC_ACTUAL
981
+ #else
982
+ return sqrt(v);
983
+ #endif // HWY_NO_LIBCXX
984
+ }
985
+
986
+ } // namespace detail
987
+
957
988
  template <typename T, size_t N>
958
989
  HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
959
990
  for (size_t i = 0; i < N; ++i) {
960
- v.raw[i] = std::sqrt(v.raw[i]);
991
+ v.raw[i] = detail::ScalarSqrt(v.raw[i]);
961
992
  }
962
993
  return v;
963
994
  }
@@ -967,21 +998,23 @@ HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
967
998
  template <typename T, size_t N>
968
999
  HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
969
1000
  using TI = MakeSigned<T>;
1001
+ const T k0 = ConvertScalarTo<T>(0);
970
1002
  const Vec128<T, N> a = Abs(v);
971
1003
  for (size_t i = 0; i < N; ++i) {
972
1004
  if (!(a.raw[i] < MantissaEnd<T>())) { // Huge or NaN
973
1005
  continue;
974
1006
  }
975
- const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
976
- const TI rounded = static_cast<TI>(v.raw[i] + bias);
1007
+ const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
1008
+ const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
977
1009
  if (rounded == 0) {
978
- v.raw[i] = v.raw[i] < 0 ? T{-0} : T{0};
1010
+ v.raw[i] = v.raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
979
1011
  continue;
980
1012
  }
981
- const T rounded_f = static_cast<T>(rounded);
1013
+ const T rounded_f = ConvertScalarTo<T>(rounded);
982
1014
  // Round to even
983
- if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
984
- v.raw[i] = static_cast<T>(rounded - (v.raw[i] < T(0) ? -1 : 1));
1015
+ if ((rounded & 1) &&
1016
+ ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
1017
+ v.raw[i] = ConvertScalarTo<T>(rounded - (v.raw[i] < k0 ? -1 : 1));
985
1018
  continue;
986
1019
  }
987
1020
  v.raw[i] = rounded_f;
@@ -994,30 +1027,32 @@ template <size_t N>
994
1027
  HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
995
1028
  using T = float;
996
1029
  using TI = int32_t;
1030
+ const T k0 = ConvertScalarTo<T>(0);
997
1031
 
998
1032
  const Vec128<float, N> abs = Abs(v);
999
1033
  Vec128<int32_t, N> ret;
1000
1034
  for (size_t i = 0; i < N; ++i) {
1001
- const bool signbit = std::signbit(v.raw[i]);
1035
+ const bool signbit = ScalarSignBit(v.raw[i]);
1002
1036
 
1003
1037
  if (!(abs.raw[i] < MantissaEnd<T>())) { // Huge or NaN
1004
1038
  // Check if too large to cast or NaN
1005
- if (!(abs.raw[i] <= static_cast<T>(LimitsMax<TI>()))) {
1039
+ if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
1006
1040
  ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
1007
1041
  continue;
1008
1042
  }
1009
1043
  ret.raw[i] = static_cast<TI>(v.raw[i]);
1010
1044
  continue;
1011
1045
  }
1012
- const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
1013
- const TI rounded = static_cast<TI>(v.raw[i] + bias);
1046
+ const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
1047
+ const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
1014
1048
  if (rounded == 0) {
1015
1049
  ret.raw[i] = 0;
1016
1050
  continue;
1017
1051
  }
1018
- const T rounded_f = static_cast<T>(rounded);
1052
+ const T rounded_f = ConvertScalarTo<T>(rounded);
1019
1053
  // Round to even
1020
- if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
1054
+ if ((rounded & 1) &&
1055
+ ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
1021
1056
  ret.raw[i] = rounded - (signbit ? -1 : 1);
1022
1057
  continue;
1023
1058
  }
@@ -1056,8 +1091,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
1056
1091
  for (size_t i = 0; i < N; ++i) {
1057
1092
  const bool positive = v.raw[i] > Float(0.0);
1058
1093
 
1059
- Bits bits;
1060
- CopySameSize(&v.raw[i], &bits);
1094
+ Bits bits = BitCastScalar<Bits>(v.raw[i]);
1061
1095
 
1062
1096
  const int exponent =
1063
1097
  static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1077,7 +1111,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
1077
1111
  if (positive) bits += (kMantissaMask + 1) >> exponent;
1078
1112
  bits &= ~mantissa_mask;
1079
1113
 
1080
- CopySameSize(&bits, &v.raw[i]);
1114
+ v.raw[i] = BitCastScalar<Float>(bits);
1081
1115
  }
1082
1116
  return v;
1083
1117
  }
@@ -1094,8 +1128,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
1094
1128
  for (size_t i = 0; i < N; ++i) {
1095
1129
  const bool negative = v.raw[i] < Float(0.0);
1096
1130
 
1097
- Bits bits;
1098
- CopySameSize(&v.raw[i], &bits);
1131
+ Bits bits = BitCastScalar<Bits>(v.raw[i]);
1099
1132
 
1100
1133
  const int exponent =
1101
1134
  static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1115,7 +1148,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
1115
1148
  if (negative) bits += (kMantissaMask + 1) >> exponent;
1116
1149
  bits &= ~mantissa_mask;
1117
1150
 
1118
- CopySameSize(&bits, &v.raw[i]);
1151
+ v.raw[i] = BitCastScalar<Float>(bits);
1119
1152
  }
1120
1153
  return v;
1121
1154
  }
@@ -1127,44 +1160,11 @@ HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
1127
1160
  Mask128<T, N> ret;
1128
1161
  for (size_t i = 0; i < N; ++i) {
1129
1162
  // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
1130
- MakeUnsigned<T> bits;
1131
- CopySameSize(&v.raw[i], &bits);
1132
- bits += bits;
1133
- bits >>= 1; // clear sign bit
1134
- // NaN if all exponent bits are set and the mantissa is not zero.
1135
- ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
1163
+ ret.bits[i] = Mask128<T, N>::FromBool(ScalarIsNaN(v.raw[i]));
1136
1164
  }
1137
1165
  return ret;
1138
1166
  }
1139
1167
 
1140
- template <typename T, size_t N>
1141
- HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
1142
- static_assert(IsFloat<T>(), "Only for float");
1143
- const DFromV<decltype(v)> d;
1144
- const RebindToSigned<decltype(d)> di;
1145
- const VFromD<decltype(di)> vi = BitCast(di, v);
1146
- // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1147
- return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
1148
- }
1149
-
1150
- // Returns whether normal/subnormal/zero.
1151
- template <typename T, size_t N>
1152
- HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
1153
- static_assert(IsFloat<T>(), "Only for float");
1154
- const DFromV<decltype(v)> d;
1155
- const RebindToUnsigned<decltype(d)> du;
1156
- const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
1157
- using VI = VFromD<decltype(di)>;
1158
- using VU = VFromD<decltype(du)>;
1159
- const VU vu = BitCast(du, v);
1160
- // 'Shift left' to clear the sign bit, then right so we can compare with the
1161
- // max exponent (cannot compare with MaxExponentTimes2 directly because it is
1162
- // negative and non-negative floats would be greater).
1163
- const VI exp =
1164
- BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
1165
- return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
1166
- }
1167
-
1168
1168
  // ================================================== COMPARE
1169
1169
 
1170
1170
  template <typename T, size_t N>
@@ -1400,177 +1400,387 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
1400
1400
  CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
1401
1401
  }
1402
1402
 
1403
- // ------------------------------ LoadInterleaved2/3/4
1403
+ // ================================================== COMBINE
1404
1404
 
1405
- // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
1406
- // We implement those here because scalar code is likely faster than emulation
1407
- // via shuffles.
1408
- #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1409
- #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1410
- #else
1411
- #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1412
- #endif
1405
+ template <typename T, size_t N>
1406
+ HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
1407
+ Vec128<T, N / 2> ret;
1408
+ CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
1409
+ return ret;
1410
+ }
1413
1411
 
1414
- template <class D, typename T = TFromD<D>>
1415
- HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
1416
- VFromD<D>& v0, VFromD<D>& v1) {
1417
- alignas(16) T buf0[MaxLanes(d)];
1418
- alignas(16) T buf1[MaxLanes(d)];
1419
- for (size_t i = 0; i < MaxLanes(d); ++i) {
1420
- buf0[i] = *unaligned++;
1421
- buf1[i] = *unaligned++;
1422
- }
1423
- v0 = Load(d, buf0);
1424
- v1 = Load(d, buf1);
1412
+ template <class D>
1413
+ HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
1414
+ return LowerHalf(v);
1425
1415
  }
1426
1416
 
1427
- template <class D, typename T = TFromD<D>>
1428
- HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
1429
- VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1430
- alignas(16) T buf0[MaxLanes(d)];
1431
- alignas(16) T buf1[MaxLanes(d)];
1432
- alignas(16) T buf2[MaxLanes(d)];
1433
- for (size_t i = 0; i < MaxLanes(d); ++i) {
1434
- buf0[i] = *unaligned++;
1435
- buf1[i] = *unaligned++;
1436
- buf2[i] = *unaligned++;
1437
- }
1438
- v0 = Load(d, buf0);
1439
- v1 = Load(d, buf1);
1440
- v2 = Load(d, buf2);
1417
+ template <class D>
1418
+ HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
1419
+ VFromD<D> ret;
1420
+ CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
1421
+ return ret;
1441
1422
  }
1442
1423
 
1443
- template <class D, typename T = TFromD<D>>
1444
- HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
1445
- VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
1446
- VFromD<D>& v3) {
1447
- alignas(16) T buf0[MaxLanes(d)];
1448
- alignas(16) T buf1[MaxLanes(d)];
1449
- alignas(16) T buf2[MaxLanes(d)];
1450
- alignas(16) T buf3[MaxLanes(d)];
1451
- for (size_t i = 0; i < MaxLanes(d); ++i) {
1452
- buf0[i] = *unaligned++;
1453
- buf1[i] = *unaligned++;
1454
- buf2[i] = *unaligned++;
1455
- buf3[i] = *unaligned++;
1456
- }
1457
- v0 = Load(d, buf0);
1458
- v1 = Load(d, buf1);
1459
- v2 = Load(d, buf2);
1460
- v3 = Load(d, buf3);
1424
+ template <class D>
1425
+ HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
1426
+ const Half<decltype(d)> dh;
1427
+ VFromD<D> ret; // zero-initialized
1428
+ CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
1429
+ return ret;
1461
1430
  }
1462
1431
 
1463
- // ------------------------------ StoreInterleaved2/3/4
1432
+ template <class D, class VH = VFromD<Half<D>>>
1433
+ HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
1434
+ const Half<decltype(d)> dh;
1435
+ VFromD<D> ret;
1436
+ CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
1437
+ CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
1438
+ return ret;
1439
+ }
1464
1440
 
1465
1441
  template <class D>
1466
- HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
1467
- TFromD<D>* HWY_RESTRICT unaligned) {
1468
- for (size_t i = 0; i < MaxLanes(d); ++i) {
1469
- *unaligned++ = v0.raw[i];
1470
- *unaligned++ = v1.raw[i];
1471
- }
1442
+ HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
1443
+ const Half<decltype(d)> dh;
1444
+ VFromD<D> ret;
1445
+ CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
1446
+ CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
1447
+ return ret;
1472
1448
  }
1473
1449
 
1474
1450
  template <class D>
1475
- HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1476
- TFromD<D>* HWY_RESTRICT unaligned) {
1477
- for (size_t i = 0; i < MaxLanes(d); ++i) {
1478
- *unaligned++ = v0.raw[i];
1479
- *unaligned++ = v1.raw[i];
1480
- *unaligned++ = v2.raw[i];
1481
- }
1451
+ HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1452
+ const Half<decltype(d)> dh;
1453
+ VFromD<D> ret;
1454
+ CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
1455
+ CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
1456
+ return ret;
1482
1457
  }
1483
1458
 
1484
1459
  template <class D>
1485
- HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
1486
- VFromD<D> v3, D d,
1487
- TFromD<D>* HWY_RESTRICT unaligned) {
1488
- for (size_t i = 0; i < MaxLanes(d); ++i) {
1489
- *unaligned++ = v0.raw[i];
1490
- *unaligned++ = v1.raw[i];
1491
- *unaligned++ = v2.raw[i];
1492
- *unaligned++ = v3.raw[i];
1493
- }
1460
+ HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1461
+ const Half<decltype(d)> dh;
1462
+ VFromD<D> ret;
1463
+ CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
1464
+ CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
1465
+ return ret;
1494
1466
  }
1495
1467
 
1496
- // ------------------------------ Stream
1497
1468
  template <class D>
1498
- HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
1499
- Store(v, d, aligned);
1469
+ HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
1470
+ const Half<decltype(d)> dh;
1471
+ VFromD<D> ret;
1472
+ CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
1473
+ CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
1474
+ return ret;
1500
1475
  }
1501
1476
 
1502
- // ------------------------------ Scatter in generic_ops-inl.h
1503
- // ------------------------------ Gather in generic_ops-inl.h
1477
+ template <class D>
1478
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
1479
+ const Half<decltype(d)> dh;
1480
+ VFromD<D> ret;
1481
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1482
+ ret.raw[i] = lo.raw[2 * i];
1483
+ }
1484
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1485
+ ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
1486
+ }
1487
+ return ret;
1488
+ }
1504
1489
 
1505
- // ================================================== CONVERT
1490
+ // 2023-11-23: workaround for incorrect codegen (reduction_test fails for
1491
+ // SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
1492
+ #if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
1493
+ #define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
1494
+ #else
1495
+ #define HWY_EMU128_CONCAT_INLINE HWY_API
1496
+ #endif
1506
1497
 
1507
- // ConvertTo and DemoteTo with floating-point input and integer output truncate
1508
- // (rounding toward zero).
1498
+ template <class D>
1499
+ HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
1500
+ const Half<decltype(d)> dh;
1501
+ VFromD<D> ret;
1502
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1503
+ ret.raw[i] = lo.raw[2 * i + 1];
1504
+ }
1505
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1506
+ ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
1507
+ }
1508
+ return ret;
1509
+ }
1509
1510
 
1510
- namespace detail {
1511
+ // ------------------------------ CombineShiftRightBytes
1512
+ template <int kBytes, class D>
1513
+ HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
1514
+ VFromD<D> ret;
1515
+ const uint8_t* HWY_RESTRICT lo8 =
1516
+ reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
1517
+ uint8_t* HWY_RESTRICT ret8 =
1518
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
1519
+ CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
1520
+ CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
1521
+ return ret;
1522
+ }
1511
1523
 
1512
- template <class ToT, class FromT>
1513
- HWY_INLINE ToT CastValueForF2IConv(hwy::UnsignedTag /* to_type_tag */,
1514
- FromT val) {
1515
- // Prevent ubsan errors when converting float to narrower integer
1524
+ // ------------------------------ ShiftLeftBytes
1525
+
1526
+ template <int kBytes, class D>
1527
+ HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
1528
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1529
+ VFromD<D> ret;
1530
+ uint8_t* HWY_RESTRICT ret8 =
1531
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
1532
+ ZeroBytes<kBytes>(ret8);
1533
+ CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
1534
+ return ret;
1535
+ }
1536
+
1537
+ template <int kBytes, typename T, size_t N>
1538
+ HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
1539
+ return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
1540
+ }
1541
+
1542
+ // ------------------------------ ShiftLeftLanes
1516
1543
 
1517
- // If LimitsMax<ToT>() can be exactly represented in FromT,
1518
- // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
1544
+ template <int kLanes, class D, typename T = TFromD<D>>
1545
+ HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
1546
+ const Repartition<uint8_t, decltype(d)> d8;
1547
+ return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
1548
+ }
1549
+
1550
+ template <int kLanes, typename T, size_t N>
1551
+ HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
1552
+ return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
1553
+ }
1554
+
1555
+ // ------------------------------ ShiftRightBytes
1556
+ template <int kBytes, class D>
1557
+ HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
1558
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1559
+ VFromD<D> ret;
1560
+ const uint8_t* HWY_RESTRICT v8 =
1561
+ reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
1562
+ uint8_t* HWY_RESTRICT ret8 =
1563
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
1564
+ CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
1565
+ ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
1566
+ return ret;
1567
+ }
1568
+
1569
+ // ------------------------------ ShiftRightLanes
1570
+ template <int kLanes, class D>
1571
+ HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
1572
+ const Repartition<uint8_t, decltype(d)> d8;
1573
+ constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
1574
+ return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
1575
+ }
1576
+
1577
+ // ------------------------------ Tuples, PromoteEvenTo/PromoteOddTo
1578
+ #include "hwy/ops/inside-inl.h"
1579
+
1580
+ // ------------------------------ LoadInterleaved2/3/4
1581
+
1582
+ // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
1583
+ // We implement those here because scalar code is likely faster than emulation
1584
+ // via shuffles.
1585
+ #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1586
+ #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
1587
+ #else
1588
+ #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
1589
+ #endif
1590
+
1591
+ template <class D, typename T = TFromD<D>>
1592
+ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
1593
+ VFromD<D>& v0, VFromD<D>& v1) {
1594
+ alignas(16) T buf0[MaxLanes(d)];
1595
+ alignas(16) T buf1[MaxLanes(d)];
1596
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1597
+ buf0[i] = *unaligned++;
1598
+ buf1[i] = *unaligned++;
1599
+ }
1600
+ v0 = Load(d, buf0);
1601
+ v1 = Load(d, buf1);
1602
+ }
1519
1603
 
1520
- // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
1521
- // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>() + 1, which can
1522
- // be exactly represented in FromT.
1523
- constexpr FromT kSmallestOutOfToTRangePosVal =
1524
- (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 1)
1525
- ? static_cast<FromT>(LimitsMax<ToT>())
1526
- : static_cast<FromT>(
1527
- static_cast<FromT>(ToT{1} << (sizeof(ToT) * 8 - 1)) * FromT(2));
1604
+ template <class D, typename T = TFromD<D>>
1605
+ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
1606
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
1607
+ alignas(16) T buf0[MaxLanes(d)];
1608
+ alignas(16) T buf1[MaxLanes(d)];
1609
+ alignas(16) T buf2[MaxLanes(d)];
1610
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1611
+ buf0[i] = *unaligned++;
1612
+ buf1[i] = *unaligned++;
1613
+ buf2[i] = *unaligned++;
1614
+ }
1615
+ v0 = Load(d, buf0);
1616
+ v1 = Load(d, buf1);
1617
+ v2 = Load(d, buf2);
1618
+ }
1528
1619
 
1529
- if (std::signbit(val)) {
1530
- return ToT{0};
1531
- } else if (std::isinf(val) || val >= kSmallestOutOfToTRangePosVal) {
1532
- return LimitsMax<ToT>();
1533
- } else {
1534
- return static_cast<ToT>(val);
1620
+ template <class D, typename T = TFromD<D>>
1621
+ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
1622
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
1623
+ VFromD<D>& v3) {
1624
+ alignas(16) T buf0[MaxLanes(d)];
1625
+ alignas(16) T buf1[MaxLanes(d)];
1626
+ alignas(16) T buf2[MaxLanes(d)];
1627
+ alignas(16) T buf3[MaxLanes(d)];
1628
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1629
+ buf0[i] = *unaligned++;
1630
+ buf1[i] = *unaligned++;
1631
+ buf2[i] = *unaligned++;
1632
+ buf3[i] = *unaligned++;
1535
1633
  }
1634
+ v0 = Load(d, buf0);
1635
+ v1 = Load(d, buf1);
1636
+ v2 = Load(d, buf2);
1637
+ v3 = Load(d, buf3);
1536
1638
  }
1537
1639
 
1538
- template <class ToT, class FromT>
1539
- HWY_INLINE ToT CastValueForF2IConv(hwy::SignedTag /* to_type_tag */,
1540
- FromT val) {
1541
- // Prevent ubsan errors when converting float to narrower integer
1640
+ // ------------------------------ StoreInterleaved2/3/4
1542
1641
 
1543
- // If LimitsMax<ToT>() can be exactly represented in FromT,
1544
- // kSmallestOutOfToTRangePosVal is equal to LimitsMax<ToT>().
1642
+ template <class D>
1643
+ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
1644
+ TFromD<D>* HWY_RESTRICT unaligned) {
1645
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1646
+ *unaligned++ = v0.raw[i];
1647
+ *unaligned++ = v1.raw[i];
1648
+ }
1649
+ }
1545
1650
 
1546
- // Otherwise, if LimitsMax<ToT>() cannot be exactly represented in FromT,
1547
- // kSmallestOutOfToTRangePosVal is equal to -LimitsMin<ToT>(), which can
1548
- // be exactly represented in FromT.
1549
- constexpr FromT kSmallestOutOfToTRangePosVal =
1550
- (sizeof(ToT) * 8 <= static_cast<size_t>(MantissaBits<FromT>()) + 2)
1551
- ? static_cast<FromT>(LimitsMax<ToT>())
1552
- : static_cast<FromT>(-static_cast<FromT>(LimitsMin<ToT>()));
1651
+ template <class D>
1652
+ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1653
+ TFromD<D>* HWY_RESTRICT unaligned) {
1654
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1655
+ *unaligned++ = v0.raw[i];
1656
+ *unaligned++ = v1.raw[i];
1657
+ *unaligned++ = v2.raw[i];
1658
+ }
1659
+ }
1553
1660
 
1554
- if (std::isinf(val) || std::fabs(val) >= kSmallestOutOfToTRangePosVal) {
1555
- return std::signbit(val) ? LimitsMin<ToT>() : LimitsMax<ToT>();
1556
- } else {
1557
- return static_cast<ToT>(val);
1661
+ template <class D>
1662
+ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
1663
+ VFromD<D> v3, D d,
1664
+ TFromD<D>* HWY_RESTRICT unaligned) {
1665
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
1666
+ *unaligned++ = v0.raw[i];
1667
+ *unaligned++ = v1.raw[i];
1668
+ *unaligned++ = v2.raw[i];
1669
+ *unaligned++ = v3.raw[i];
1558
1670
  }
1559
1671
  }
1560
1672
 
1673
+ // ------------------------------ Stream
1674
+ template <class D>
1675
+ HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
1676
+ Store(v, d, aligned);
1677
+ }
1678
+
1679
+ // ------------------------------ Scatter in generic_ops-inl.h
1680
+ // ------------------------------ Gather in generic_ops-inl.h
1681
+
1682
+ // ================================================== CONVERT
1683
+
1684
+ // ConvertTo and DemoteTo with floating-point input and integer output truncate
1685
+ // (rounding toward zero).
1686
+
1687
+ namespace detail {
1688
+
1689
+ template <class ToT, class FromT>
1690
+ HWY_INLINE ToT CastValueForF2IConv(FromT val) {
1691
+ // Prevent ubsan errors when converting float to narrower integer
1692
+
1693
+ using FromTU = MakeUnsigned<FromT>;
1694
+ using ToTU = MakeUnsigned<ToT>;
1695
+
1696
+ constexpr unsigned kMaxExpField =
1697
+ static_cast<unsigned>(MaxExponentField<FromT>());
1698
+ constexpr unsigned kExpBias = kMaxExpField >> 1;
1699
+ constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
1700
+ kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
1701
+ kMaxExpField));
1702
+
1703
+ // If ToT is signed, compare only the exponent bits of val against
1704
+ // kMinOutOfRangeExpField.
1705
+ //
1706
+ // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
1707
+ // val against kMinOutOfRangeExpField as a negative value is outside of the
1708
+ // range of an unsigned integer type.
1709
+ const FromT val_to_compare =
1710
+ static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
1711
+
1712
+ // val is within the range of ToT if
1713
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
1714
+ // than kMinOutOfRangeExpField
1715
+ //
1716
+ // Otherwise, val is either outside of the range of ToT or equal to
1717
+ // LimitsMin<ToT>() if
1718
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
1719
+ // than or equal to kMinOutOfRangeExpField.
1720
+
1721
+ return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1722
+ MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1723
+ ? static_cast<ToT>(val)
1724
+ : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
1725
+ static_cast<ToTU>(ScalarSignBit(val)));
1726
+ }
1727
+
1561
1728
  template <class ToT, class ToTypeTag, class FromT>
1562
1729
  HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
1563
- return static_cast<ToT>(val);
1730
+ return ConvertScalarTo<ToT>(val);
1564
1731
  }
1565
1732
 
1566
1733
  template <class ToT>
1567
- HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag to_type_tag, float val) {
1568
- return CastValueForF2IConv<ToT>(to_type_tag, val);
1734
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
1735
+ float val) {
1736
+ return CastValueForF2IConv<ToT>(val);
1569
1737
  }
1570
1738
 
1571
1739
  template <class ToT>
1572
- HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag to_type_tag, float val) {
1573
- return CastValueForF2IConv<ToT>(to_type_tag, val);
1740
+ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
1741
+ float val) {
1742
+ return CastValueForF2IConv<ToT>(val);
1743
+ }
1744
+ // If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
1745
+ // returns static_cast<ToT>(val)
1746
+ //
1747
+ // Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
1748
+ // implementation-defined result if val is not within the range of ToT.
1749
+ template <class ToT, class FromT>
1750
+ HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
1751
+ // Prevent ubsan errors when converting float to narrower integer
1752
+
1753
+ using FromTU = MakeUnsigned<FromT>;
1754
+
1755
+ constexpr unsigned kMaxExpField =
1756
+ static_cast<unsigned>(MaxExponentField<FromT>());
1757
+ constexpr unsigned kExpBias = kMaxExpField >> 1;
1758
+ constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
1759
+ kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
1760
+ kMaxExpField));
1761
+
1762
+ // If ToT is signed, compare only the exponent bits of val against
1763
+ // kMinOutOfRangeExpField.
1764
+ //
1765
+ // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
1766
+ // val against kMinOutOfRangeExpField as a negative value is outside of the
1767
+ // range of an unsigned integer type.
1768
+ const FromT val_to_compare =
1769
+ static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
1770
+
1771
+ // val is within the range of ToT if
1772
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
1773
+ // than kMinOutOfRangeExpField
1774
+ //
1775
+ // Otherwise, val is either outside of the range of ToT or equal to
1776
+ // LimitsMin<ToT>() if
1777
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
1778
+ // than or equal to kMinOutOfRangeExpField.
1779
+
1780
+ return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1781
+ MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1782
+ ? static_cast<ToT>(val)
1783
+ : static_cast<ToT>(LimitsMin<ToT>());
1574
1784
  }
1575
1785
 
1576
1786
  } // namespace detail
@@ -1587,6 +1797,21 @@ HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1587
1797
  return ret;
1588
1798
  }
1589
1799
 
1800
+ #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1801
+ #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1802
+ #else
1803
+ #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1804
+ #endif
1805
+
1806
+ template <class D64, HWY_IF_UI64_D(D64)>
1807
+ HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
1808
+ VFromD<D64> ret;
1809
+ for (size_t i = 0; i < MaxLanes(d64); ++i) {
1810
+ ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
1811
+ }
1812
+ return ret;
1813
+ }
1814
+
1590
1815
  // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
1591
1816
  // so we overload for TFrom=double and ToT={float,int32_t}.
1592
1817
  template <class D, HWY_IF_F32_D(D)>
@@ -1594,10 +1819,10 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
1594
1819
  VFromD<D> ret;
1595
1820
  for (size_t i = 0; i < MaxLanes(d); ++i) {
1596
1821
  // Prevent ubsan errors when converting float to narrower integer/float
1597
- if (std::isinf(from.raw[i]) ||
1598
- std::fabs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
1599
- ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<float>()
1600
- : HighestValue<float>();
1822
+ if (ScalarIsInf(from.raw[i]) ||
1823
+ ScalarAbs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
1824
+ ret.raw[i] = ScalarSignBit(from.raw[i]) ? LowestValue<float>()
1825
+ : HighestValue<float>();
1601
1826
  continue;
1602
1827
  }
1603
1828
  ret.raw[i] = static_cast<float>(from.raw[i]);
@@ -1609,8 +1834,7 @@ HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
1609
1834
  VFromD<D> ret;
1610
1835
  for (size_t i = 0; i < MaxLanes(d); ++i) {
1611
1836
  // Prevent ubsan errors when converting double to narrower integer/int32_t
1612
- ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(
1613
- hwy::TypeTag<TFromD<D>>(), from.raw[i]);
1837
+ ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
1614
1838
  }
1615
1839
  return ret;
1616
1840
  }
@@ -1631,17 +1855,32 @@ HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
1631
1855
  return ret;
1632
1856
  }
1633
1857
 
1858
+ // Disable the default unsigned to signed DemoteTo/ReorderDemote2To
1859
+ // implementations in generic_ops-inl.h on EMU128 as the EMU128 target has
1860
+ // target-specific implementations of the unsigned to signed DemoteTo and
1861
+ // ReorderDemote2To ops
1862
+
1863
+ // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
1864
+ // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
1865
+ // !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
1866
+ // SFINAE to occur instead of a hard error due to a dependency on the V template
1867
+ // argument
1868
+ #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
1869
+ #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
1870
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
1871
+
1634
1872
  template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
1635
- HWY_IF_UNSIGNED_D(DTo)>
1873
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
1636
1874
  HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
1637
1875
  using TTo = TFromD<DTo>;
1638
1876
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1639
1877
 
1878
+ const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
1879
+
1640
1880
  VFromD<DTo> ret;
1641
1881
  for (size_t i = 0; i < N; ++i) {
1642
1882
  // Int to int: choose closest value in ToT to `from` (avoids UB)
1643
- from.raw[i] = HWY_MIN(from.raw[i], LimitsMax<TTo>());
1644
- ret.raw[i] = static_cast<TTo>(from.raw[i]);
1883
+ ret.raw[i] = static_cast<TTo>(HWY_MIN(from.raw[i], max));
1645
1884
  }
1646
1885
  return ret;
1647
1886
  }
@@ -1689,14 +1928,15 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1689
1928
  return ret;
1690
1929
  }
1691
1930
 
1692
- template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
1693
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1931
+ template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), class V,
1932
+ HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1694
1933
  HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1695
1934
  HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1696
1935
  const RepartitionToWide<decltype(dn)> dw;
1697
1936
  const size_t NW = Lanes(dw);
1698
1937
  using TN = TFromD<DN>;
1699
- const TN max = LimitsMax<TN>();
1938
+ using TN_U = MakeUnsigned<TN>;
1939
+ const TN_U max = static_cast<TN_U>(LimitsMax<TN>());
1700
1940
  VFromD<DN> ret;
1701
1941
  for (size_t i = 0; i < NW; ++i) {
1702
1942
  ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
@@ -1715,23 +1955,20 @@ HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
1715
1955
  return ReorderDemote2To(dn, a, b);
1716
1956
  }
1717
1957
 
1718
- template <class DN, HWY_IF_BF16_D(DN), class V, HWY_IF_F32_D(DFromV<V>),
1958
+ template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), class V,
1959
+ HWY_IF_F32_D(DFromV<V>),
1719
1960
  HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1720
1961
  HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
1721
- const RebindToUnsigned<DFromV<decltype(a)>> du32;
1722
- const size_t NW = Lanes(du32);
1723
- VFromD<Repartition<uint16_t, DN>> ret;
1724
-
1725
- const auto a_bits = BitCast(du32, a);
1726
- const auto b_bits = BitCast(du32, b);
1727
-
1962
+ const size_t NW = Lanes(dn) / 2;
1963
+ using TN = TFromD<DN>;
1964
+ VFromD<DN> ret;
1728
1965
  for (size_t i = 0; i < NW; ++i) {
1729
- ret.raw[i] = static_cast<uint16_t>(a_bits.raw[i] >> 16);
1966
+ ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
1730
1967
  }
1731
1968
  for (size_t i = 0; i < NW; ++i) {
1732
- ret.raw[NW + i] = static_cast<uint16_t>(b_bits.raw[i] >> 16);
1969
+ ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
1733
1970
  }
1734
- return BitCast(dn, ret);
1971
+ return ret;
1735
1972
  }
1736
1973
 
1737
1974
  namespace detail {
@@ -1758,6 +1995,12 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
1758
1995
  return ret;
1759
1996
  }
1760
1997
 
1998
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
1999
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
2000
+ #else
2001
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
2002
+ #endif
2003
+
1761
2004
  template <class D, HWY_IF_BF16_D(D), size_t N>
1762
2005
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
1763
2006
  VFromD<D> ret;
@@ -1767,6 +2010,21 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
1767
2010
  return ret;
1768
2011
  }
1769
2012
 
2013
+ #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
2014
+ #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
2015
+ #else
2016
+ #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
2017
+ #endif
2018
+
2019
+ template <class D32, HWY_IF_UI32_D(D32)>
2020
+ HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
2021
+ VFromD<D32> ret;
2022
+ for (size_t i = 0; i < MaxLanes(d32); ++i) {
2023
+ ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
2024
+ }
2025
+ return ret;
2026
+ }
2027
+
1770
2028
  // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
1771
2029
  namespace detail {
1772
2030
 
@@ -1780,7 +2038,7 @@ HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/,
1780
2038
 
1781
2039
  for (size_t i = 0; i < N; ++i) {
1782
2040
  // float## -> int##: return closest representable value
1783
- ret.raw[i] = CastValueForF2IConv<ToT>(hwy::TypeTag<ToT>(), from.raw[i]);
2041
+ ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
1784
2042
  }
1785
2043
  return ret;
1786
2044
  }
@@ -1806,6 +2064,22 @@ HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1806
2064
  return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
1807
2065
  }
1808
2066
 
2067
+ #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
2068
+ #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
2069
+ #else
2070
+ #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
2071
+ #endif
2072
+
2073
+ template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
2074
+ HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
2075
+ HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
2076
+ VFromD<DI> ret;
2077
+ for (size_t i = 0; i < MaxLanes(di); i++) {
2078
+ ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
2079
+ }
2080
+ return ret;
2081
+ }
2082
+
1809
2083
  template <size_t N>
1810
2084
  HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
1811
2085
  return DemoteTo(Simd<uint8_t, N, 0>(), v);
@@ -1893,172 +2167,6 @@ HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
1893
2167
  return ret;
1894
2168
  }
1895
2169
 
1896
- // ================================================== COMBINE
1897
-
1898
- template <typename T, size_t N>
1899
- HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
1900
- Vec128<T, N / 2> ret;
1901
- CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
1902
- return ret;
1903
- }
1904
-
1905
- template <class D>
1906
- HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
1907
- return LowerHalf(v);
1908
- }
1909
-
1910
- template <class D>
1911
- HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
1912
- VFromD<D> ret;
1913
- CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
1914
- return ret;
1915
- }
1916
-
1917
- template <class D>
1918
- HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
1919
- const Half<decltype(d)> dh;
1920
- VFromD<D> ret; // zero-initialized
1921
- CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
1922
- return ret;
1923
- }
1924
-
1925
- template <class D, class VH = VFromD<Half<D>>>
1926
- HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
1927
- const Half<decltype(d)> dh;
1928
- VFromD<D> ret;
1929
- CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
1930
- CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
1931
- return ret;
1932
- }
1933
-
1934
- template <class D>
1935
- HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
1936
- const Half<decltype(d)> dh;
1937
- VFromD<D> ret;
1938
- CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
1939
- CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
1940
- return ret;
1941
- }
1942
-
1943
- template <class D>
1944
- HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1945
- const Half<decltype(d)> dh;
1946
- VFromD<D> ret;
1947
- CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
1948
- CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
1949
- return ret;
1950
- }
1951
-
1952
- template <class D>
1953
- HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1954
- const Half<decltype(d)> dh;
1955
- VFromD<D> ret;
1956
- CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
1957
- CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
1958
- return ret;
1959
- }
1960
-
1961
- template <class D>
1962
- HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
1963
- const Half<decltype(d)> dh;
1964
- VFromD<D> ret;
1965
- CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
1966
- CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
1967
- return ret;
1968
- }
1969
-
1970
- template <class D>
1971
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
1972
- const Half<decltype(d)> dh;
1973
- VFromD<D> ret;
1974
- for (size_t i = 0; i < MaxLanes(dh); ++i) {
1975
- ret.raw[i] = lo.raw[2 * i];
1976
- }
1977
- for (size_t i = 0; i < MaxLanes(dh); ++i) {
1978
- ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
1979
- }
1980
- return ret;
1981
- }
1982
-
1983
- template <class D>
1984
- HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
1985
- const Half<decltype(d)> dh;
1986
- VFromD<D> ret;
1987
- for (size_t i = 0; i < MaxLanes(dh); ++i) {
1988
- ret.raw[i] = lo.raw[2 * i + 1];
1989
- }
1990
- for (size_t i = 0; i < MaxLanes(dh); ++i) {
1991
- ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
1992
- }
1993
- return ret;
1994
- }
1995
-
1996
- // ------------------------------ CombineShiftRightBytes
1997
- template <int kBytes, class D>
1998
- HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
1999
- VFromD<D> ret;
2000
- const uint8_t* HWY_RESTRICT lo8 =
2001
- reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
2002
- uint8_t* HWY_RESTRICT ret8 =
2003
- reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2004
- CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
2005
- CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
2006
- return ret;
2007
- }
2008
-
2009
- // ------------------------------ ShiftLeftBytes
2010
-
2011
- template <int kBytes, class D>
2012
- HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
2013
- static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2014
- VFromD<D> ret;
2015
- uint8_t* HWY_RESTRICT ret8 =
2016
- reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2017
- ZeroBytes<kBytes>(ret8);
2018
- CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
2019
- return ret;
2020
- }
2021
-
2022
- template <int kBytes, typename T, size_t N>
2023
- HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
2024
- return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
2025
- }
2026
-
2027
- // ------------------------------ ShiftLeftLanes
2028
-
2029
- template <int kLanes, class D, typename T = TFromD<D>>
2030
- HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
2031
- const Repartition<uint8_t, decltype(d)> d8;
2032
- return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2033
- }
2034
-
2035
- template <int kLanes, typename T, size_t N>
2036
- HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
2037
- return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
2038
- }
2039
-
2040
- // ------------------------------ ShiftRightBytes
2041
- template <int kBytes, class D>
2042
- HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
2043
- static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2044
- VFromD<D> ret;
2045
- const uint8_t* HWY_RESTRICT v8 =
2046
- reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
2047
- uint8_t* HWY_RESTRICT ret8 =
2048
- reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2049
- CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
2050
- ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
2051
- return ret;
2052
- }
2053
-
2054
- // ------------------------------ ShiftRightLanes
2055
- template <int kLanes, class D>
2056
- HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
2057
- const Repartition<uint8_t, decltype(d)> d8;
2058
- constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
2059
- return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
2060
- }
2061
-
2062
2170
  // ================================================== SWIZZLE
2063
2171
 
2064
2172
  template <typename T, size_t N>
@@ -2101,6 +2209,24 @@ HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
2101
2209
  return odd;
2102
2210
  }
2103
2211
 
2212
+ template <class D>
2213
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
2214
+ constexpr size_t N = HWY_MAX_LANES_D(D);
2215
+ for (size_t i = 1; i < N; i += 2) {
2216
+ a.raw[i] = b.raw[i - 1];
2217
+ }
2218
+ return a;
2219
+ }
2220
+
2221
+ template <class D>
2222
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
2223
+ constexpr size_t N = HWY_MAX_LANES_D(D);
2224
+ for (size_t i = 1; i < N; i += 2) {
2225
+ b.raw[i - 1] = a.raw[i];
2226
+ }
2227
+ return b;
2228
+ }
2229
+
2104
2230
  template <typename T, size_t N>
2105
2231
  HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
2106
2232
  return even;
@@ -2349,8 +2475,8 @@ HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
2349
2475
  }
2350
2476
 
2351
2477
  // Additional overload for the optional tag.
2352
- template <class V>
2353
- HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
2478
+ template <class D>
2479
+ HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
2354
2480
  return InterleaveLower(a, b);
2355
2481
  }
2356
2482
 
@@ -2416,6 +2542,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
2416
2542
  return m;
2417
2543
  }
2418
2544
 
2545
+ template <class D>
2546
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
2547
+ MFromD<D> m;
2548
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2549
+ m.bits[i] = MFromD<D>::FromBool(((mask_bits >> i) & 1u) != 0);
2550
+ }
2551
+ return m;
2552
+ }
2553
+
2419
2554
  // `p` points to at least 8 writable bytes.
2420
2555
  template <class D>
2421
2556
  HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
@@ -2517,7 +2652,7 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, const Mask128<T, N> mask) {
2517
2652
  if (mask.bits[i]) {
2518
2653
  ret.raw[i] = v.raw[in_pos++];
2519
2654
  } else {
2520
- ret.raw[i] = T(); // zero, also works for float16_t
2655
+ ret.raw[i] = ConvertScalarTo<T>(0);
2521
2656
  }
2522
2657
  }
2523
2658
  return ret;
@@ -2662,88 +2797,26 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
2662
2797
 
2663
2798
  // ------------------------------ WidenMulPairwiseAdd
2664
2799
 
2665
- template <class D, HWY_IF_F32_D(D), class VBF16>
2666
- HWY_API VFromD<D> WidenMulPairwiseAdd(D df32, VBF16 a, VBF16 b) {
2667
- const Rebind<uint32_t, decltype(df32)> du32;
2668
- using VU32 = VFromD<decltype(du32)>;
2669
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
2670
- // Avoid ZipLower/Upper so this also works on big-endian systems.
2671
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2672
- const VU32 ao = And(BitCast(du32, a), odd);
2673
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2674
- const VU32 bo = And(BitCast(du32, b), odd);
2675
- return Mul(BitCast(df32, ae), BitCast(df32, be)) +
2676
- Mul(BitCast(df32, ao), BitCast(df32, bo));
2800
+ template <class DF, HWY_IF_F32_D(DF), class VBF>
2801
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
2802
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
2803
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
2677
2804
  }
2678
2805
 
2679
- template <class D, HWY_IF_I32_D(D), class VI16>
2680
- HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
2681
- using VI32 = VFromD<decltype(d32)>;
2682
- // Manual sign extension requires two shifts for even lanes.
2683
- const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
2684
- const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
2685
- const VI32 ao = ShiftRight<16>(BitCast(d32, a));
2686
- const VI32 bo = ShiftRight<16>(BitCast(d32, b));
2687
- return Add(Mul(ae, be), Mul(ao, bo));
2688
- }
2689
-
2690
- template <class D, HWY_IF_U32_D(D), class VU16>
2691
- HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VU16 a, VU16 b) {
2692
- const auto lo16_mask = Set(du32, 0x0000FFFFu);
2693
-
2694
- const auto a0 = And(BitCast(du32, a), lo16_mask);
2695
- const auto b0 = And(BitCast(du32, b), lo16_mask);
2696
-
2697
- const auto a1 = ShiftRight<16>(BitCast(du32, a));
2698
- const auto b1 = ShiftRight<16>(BitCast(du32, b));
2699
-
2700
- return Add(Mul(a0, b0), Mul(a1, b1));
2806
+ template <class D, HWY_IF_UI32_D(D), class V16>
2807
+ HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
2808
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
2809
+ Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
2701
2810
  }
2702
2811
 
2703
2812
  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2704
2813
 
2705
- template <class D, HWY_IF_F32_D(D), size_t N, class VBF16>
2706
- HWY_API VFromD<D> ReorderWidenMulAccumulate(D df32, VBF16 a, VBF16 b,
2707
- const Vec128<float, N> sum0,
2708
- Vec128<float, N>& sum1) {
2709
- const Rebind<uint32_t, decltype(df32)> du32;
2710
- using VU32 = VFromD<decltype(du32)>;
2711
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
2712
- // Avoid ZipLower/Upper so this also works on big-endian systems.
2713
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2714
- const VU32 ao = And(BitCast(du32, a), odd);
2715
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2716
- const VU32 bo = And(BitCast(du32, b), odd);
2717
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
2718
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
2719
- }
2720
-
2721
- template <class D, HWY_IF_I32_D(D), size_t N, class VI16>
2722
- HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, VI16 a, VI16 b,
2723
- const Vec128<int32_t, N> sum0,
2724
- Vec128<int32_t, N>& sum1) {
2725
- using VI32 = VFromD<decltype(d32)>;
2726
- // Manual sign extension requires two shifts for even lanes.
2727
- const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
2728
- const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
2729
- const VI32 ao = ShiftRight<16>(BitCast(d32, a));
2730
- const VI32 bo = ShiftRight<16>(BitCast(d32, b));
2731
- sum1 = Add(Mul(ao, bo), sum1);
2732
- return Add(Mul(ae, be), sum0);
2733
- }
2734
-
2735
- template <class D, HWY_IF_U32_D(D), size_t N, class VU16>
2736
- HWY_API VFromD<D> ReorderWidenMulAccumulate(D du32, VU16 a, VU16 b,
2737
- const Vec128<uint32_t, N> sum0,
2738
- Vec128<uint32_t, N>& sum1) {
2739
- using VU32 = VFromD<decltype(du32)>;
2740
- const VU32 lo16_mask = Set(du32, uint32_t{0x0000FFFFu});
2741
- const VU32 ae = And(BitCast(du32, a), lo16_mask);
2742
- const VU32 be = And(BitCast(du32, b), lo16_mask);
2743
- const VU32 ao = ShiftRight<16>(BitCast(du32, a));
2744
- const VU32 bo = ShiftRight<16>(BitCast(du32, b));
2745
- sum1 = Add(Mul(ao, bo), sum1);
2746
- return Add(Mul(ae, be), sum0);
2814
+ template <class D, HWY_IF_UI32_D(D), class V16>
2815
+ HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
2816
+ const VFromD<D> sum0,
2817
+ VFromD<D>& sum1) {
2818
+ sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
2819
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
2747
2820
  }
2748
2821
 
2749
2822
  // ------------------------------ RearrangeToOddPlusEven
@@ -2754,15 +2827,13 @@ HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) {
2754
2827
 
2755
2828
  // ================================================== REDUCTIONS
2756
2829
 
2757
- template <class D, typename T = TFromD<D>>
2758
- HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
2759
- T sum = T{0};
2760
- for (size_t i = 0; i < MaxLanes(d); ++i) {
2761
- sum += v.raw[i];
2762
- }
2763
- return Set(d, sum);
2764
- }
2765
- template <class D, typename T = TFromD<D>>
2830
+ #ifdef HWY_NATIVE_REDUCE_SCALAR
2831
+ #undef HWY_NATIVE_REDUCE_SCALAR
2832
+ #else
2833
+ #define HWY_NATIVE_REDUCE_SCALAR
2834
+ #endif
2835
+
2836
+ template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2766
2837
  HWY_API T ReduceSum(D d, VFromD<D> v) {
2767
2838
  T sum = T{0};
2768
2839
  for (size_t i = 0; i < MaxLanes(d); ++i) {
@@ -2770,39 +2841,56 @@ HWY_API T ReduceSum(D d, VFromD<D> v) {
2770
2841
  }
2771
2842
  return sum;
2772
2843
  }
2773
- template <class D, typename T = TFromD<D>>
2774
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
2844
+ template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2845
+ HWY_API T ReduceMin(D d, VFromD<D> v) {
2775
2846
  T min = HighestValue<T>();
2776
2847
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2777
2848
  min = HWY_MIN(min, v.raw[i]);
2778
2849
  }
2779
- return Set(d, min);
2850
+ return min;
2780
2851
  }
2781
- template <class D, typename T = TFromD<D>>
2782
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
2852
+ template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
2853
+ HWY_API T ReduceMax(D d, VFromD<D> v) {
2783
2854
  T max = LowestValue<T>();
2784
2855
  for (size_t i = 0; i < MaxLanes(d); ++i) {
2785
2856
  max = HWY_MAX(max, v.raw[i]);
2786
2857
  }
2787
- return Set(d, max);
2858
+ return max;
2859
+ }
2860
+
2861
+ // ------------------------------ SumOfLanes
2862
+
2863
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
2864
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
2865
+ return Set(d, ReduceSum(d, v));
2866
+ }
2867
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
2868
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
2869
+ return Set(d, ReduceMin(d, v));
2870
+ }
2871
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
2872
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
2873
+ return Set(d, ReduceMax(d, v));
2788
2874
  }
2789
2875
 
2790
2876
  // ================================================== OPS WITH DEPENDENCIES
2791
2877
 
2792
2878
  // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
2793
2879
 
2794
- HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
2795
- alignas(16) uint64_t mul[2];
2880
+ template <class T, HWY_IF_UI64(T)>
2881
+ HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
2882
+ alignas(16) T mul[2];
2796
2883
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
2797
- return Load(Full128<uint64_t>(), mul);
2884
+ return Load(Full128<T>(), mul);
2798
2885
  }
2799
2886
 
2800
- HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
2801
- alignas(16) uint64_t mul[2];
2802
- const Half<Full128<uint64_t>> d2;
2887
+ template <class T, HWY_IF_UI64(T)>
2888
+ HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
2889
+ alignas(16) T mul[2];
2890
+ const Half<Full128<T>> d2;
2803
2891
  mul[0] =
2804
2892
  Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
2805
- return Load(Full128<uint64_t>(), mul);
2893
+ return Load(Full128<T>(), mul);
2806
2894
  }
2807
2895
 
2808
2896
  // NOLINTNEXTLINE(google-readability-namespace-comments)