@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
@@ -92,6 +92,9 @@ class Vec128 {
92
92
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
93
93
  return *this = (*this - other);
94
94
  }
95
+ HWY_INLINE Vec128& operator%=(const Vec128 other) {
96
+ return *this = (*this % other);
97
+ }
95
98
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
96
99
  return *this = (*this & other);
97
100
  }
@@ -151,9 +154,6 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
151
154
  template <class D>
152
155
  using VFromD = decltype(Zero(D()));
153
156
 
154
- // ------------------------------ Tuple (VFromD)
155
- #include "hwy/ops/tuple-inl.h"
156
-
157
157
  // ------------------------------ BitCast
158
158
 
159
159
  namespace detail {
@@ -213,25 +213,29 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
213
213
  HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
214
214
  return VFromD<D>{wasm_i8x16_splat(static_cast<int8_t>(t))};
215
215
  }
216
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
216
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
217
217
  HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
218
218
  return VFromD<D>{wasm_i16x8_splat(static_cast<int16_t>(t))};
219
219
  }
220
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
220
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
221
221
  HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
222
222
  return VFromD<D>{wasm_i32x4_splat(static_cast<int32_t>(t))};
223
223
  }
224
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
224
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
225
225
  HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
226
226
  return VFromD<D>{wasm_i64x2_splat(static_cast<int64_t>(t))};
227
227
  }
228
228
 
229
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_SPECIAL_FLOAT_D(D)>
230
+ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
231
+ return VFromD<D>{wasm_i16x8_splat(BitCastScalar<int16_t>(t))};
232
+ }
229
233
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
230
- HWY_API VFromD<D> Set(D /* tag */, const float t) {
234
+ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
231
235
  return VFromD<D>{wasm_f32x4_splat(t)};
232
236
  }
233
237
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
234
- HWY_API VFromD<D> Set(D /* tag */, const double t) {
238
+ HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
235
239
  return VFromD<D>{wasm_f64x2_splat(t)};
236
240
  }
237
241
 
@@ -251,12 +255,99 @@ template <class D, typename T = TFromD<D>, typename T2>
251
255
  HWY_API VFromD<D> Iota(D d, const T2 first) {
252
256
  HWY_ALIGN T lanes[MaxLanes(d)];
253
257
  for (size_t i = 0; i < MaxLanes(d); ++i) {
254
- lanes[i] =
255
- AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
258
+ lanes[i] = AddWithWraparound(static_cast<T>(first), i);
256
259
  }
257
260
  return Load(d, lanes);
258
261
  }
259
262
 
263
+ // ------------------------------ Dup128VecFromValues
264
+ template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
265
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
266
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
267
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
268
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
269
+ TFromD<D> t11, TFromD<D> t12,
270
+ TFromD<D> t13, TFromD<D> t14,
271
+ TFromD<D> t15) {
272
+ return VFromD<D>{wasm_i8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
273
+ t11, t12, t13, t14, t15)};
274
+ }
275
+
276
+ template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
277
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
278
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
279
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
280
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
281
+ TFromD<D> t11, TFromD<D> t12,
282
+ TFromD<D> t13, TFromD<D> t14,
283
+ TFromD<D> t15) {
284
+ return VFromD<D>{wasm_u8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
285
+ t11, t12, t13, t14, t15)};
286
+ }
287
+
288
+ template <class D, HWY_IF_I16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
289
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
290
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
291
+ TFromD<D> t5, TFromD<D> t6,
292
+ TFromD<D> t7) {
293
+ return VFromD<D>{wasm_i16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
294
+ }
295
+
296
+ template <class D, HWY_IF_U16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
297
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
298
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
299
+ TFromD<D> t5, TFromD<D> t6,
300
+ TFromD<D> t7) {
301
+ return VFromD<D>{wasm_u16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
302
+ }
303
+
304
+ template <class D, HWY_IF_SPECIAL_FLOAT_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
305
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
306
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
307
+ TFromD<D> t5, TFromD<D> t6,
308
+ TFromD<D> t7) {
309
+ const RebindToSigned<decltype(d)> di;
310
+ return BitCast(d,
311
+ Dup128VecFromValues(
312
+ di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
313
+ BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
314
+ BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
315
+ BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
316
+ }
317
+
318
+ template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
319
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
320
+ TFromD<D> t2, TFromD<D> t3) {
321
+ return VFromD<D>{wasm_i32x4_make(t0, t1, t2, t3)};
322
+ }
323
+
324
+ template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
325
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
326
+ TFromD<D> t2, TFromD<D> t3) {
327
+ return VFromD<D>{wasm_u32x4_make(t0, t1, t2, t3)};
328
+ }
329
+
330
+ template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
331
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
332
+ TFromD<D> t2, TFromD<D> t3) {
333
+ return VFromD<D>{wasm_f32x4_make(t0, t1, t2, t3)};
334
+ }
335
+
336
+ template <class D, HWY_IF_I64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
337
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
338
+ return VFromD<D>{wasm_i64x2_make(t0, t1)};
339
+ }
340
+
341
+ template <class D, HWY_IF_U64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
342
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
343
+ return VFromD<D>{wasm_u64x2_make(t0, t1)};
344
+ }
345
+
346
+ template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
347
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
348
+ return VFromD<D>{wasm_f64x2_make(t0, t1)};
349
+ }
350
+
260
351
  // ================================================== ARITHMETIC
261
352
 
262
353
  // ------------------------------ Addition
@@ -560,12 +651,16 @@ HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
560
651
  }
561
652
 
562
653
  // ------------------------------ RotateRight (ShiftRight, Or)
563
- template <int kBits, typename T, size_t N>
654
+ template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
564
655
  HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
656
+ const DFromV<decltype(v)> d;
657
+ const RebindToUnsigned<decltype(d)> du;
658
+
565
659
  constexpr size_t kSizeInBits = sizeof(T) * 8;
566
660
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
661
+
567
662
  if (kBits == 0) return v;
568
- return Or(ShiftRight<kBits>(v),
663
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
569
664
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
570
665
  }
571
666
 
@@ -823,7 +918,25 @@ HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
823
918
  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
824
919
  }
825
920
 
826
- // Returns the upper 16 bits of a * b in each lane.
921
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
922
+ template <size_t N>
923
+ HWY_API Vec128<uint8_t, N> MulHigh(const Vec128<uint8_t, N> a,
924
+ const Vec128<uint8_t, N> b) {
925
+ const auto l = wasm_u16x8_extmul_low_u8x16(a.raw, b.raw);
926
+ const auto h = wasm_u16x8_extmul_high_u8x16(a.raw, b.raw);
927
+ // TODO(eustas): shift-right + narrow?
928
+ return Vec128<uint8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
929
+ 17, 19, 21, 23, 25, 27, 29, 31)};
930
+ }
931
+ template <size_t N>
932
+ HWY_API Vec128<int8_t, N> MulHigh(const Vec128<int8_t, N> a,
933
+ const Vec128<int8_t, N> b) {
934
+ const auto l = wasm_i16x8_extmul_low_i8x16(a.raw, b.raw);
935
+ const auto h = wasm_i16x8_extmul_high_i8x16(a.raw, b.raw);
936
+ // TODO(eustas): shift-right + narrow?
937
+ return Vec128<int8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
938
+ 17, 19, 21, 23, 25, 27, 29, 31)};
939
+ }
827
940
  template <size_t N>
828
941
  HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
829
942
  const Vec128<uint16_t, N> b) {
@@ -842,6 +955,22 @@ HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
842
955
  return Vec128<int16_t, N>{
843
956
  wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
844
957
  }
958
+ template <size_t N>
959
+ HWY_API Vec128<uint32_t, N> MulHigh(const Vec128<uint32_t, N> a,
960
+ const Vec128<uint32_t, N> b) {
961
+ const auto l = wasm_u64x2_extmul_low_u32x4(a.raw, b.raw);
962
+ const auto h = wasm_u64x2_extmul_high_u32x4(a.raw, b.raw);
963
+ // TODO(eustas): shift-right + narrow?
964
+ return Vec128<uint32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
965
+ }
966
+ template <size_t N>
967
+ HWY_API Vec128<int32_t, N> MulHigh(const Vec128<int32_t, N> a,
968
+ const Vec128<int32_t, N> b) {
969
+ const auto l = wasm_i64x2_extmul_low_i32x4(a.raw, b.raw);
970
+ const auto h = wasm_i64x2_extmul_high_i32x4(a.raw, b.raw);
971
+ // TODO(eustas): shift-right + narrow?
972
+ return Vec128<int32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
973
+ }
845
974
 
846
975
  template <size_t N>
847
976
  HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
@@ -977,25 +1106,25 @@ HWY_API Vec128<T, N> AbsDiff(const Vec128<T, N> a, const Vec128<T, N> b) {
977
1106
 
978
1107
  // ------------------------------ Floating-point multiply-add variants
979
1108
 
980
- template <typename T, size_t N>
1109
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
981
1110
  HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
982
1111
  Vec128<T, N> add) {
983
1112
  return mul * x + add;
984
1113
  }
985
1114
 
986
- template <typename T, size_t N>
1115
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
987
1116
  HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
988
1117
  Vec128<T, N> add) {
989
1118
  return add - mul * x;
990
1119
  }
991
1120
 
992
- template <typename T, size_t N>
1121
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
993
1122
  HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
994
1123
  Vec128<T, N> sub) {
995
1124
  return mul * x - sub;
996
1125
  }
997
1126
 
998
- template <typename T, size_t N>
1127
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
999
1128
  HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
1000
1129
  Vec128<T, N> sub) {
1001
1130
  return Neg(mul) * x - sub;
@@ -1071,10 +1200,10 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
1071
1200
  template <typename T, size_t N, HWY_IF_FLOAT(T)>
1072
1201
  HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
1073
1202
  const DFromV<decltype(v)> d;
1074
- const RebindToSigned<decltype(d)> di;
1075
- const VFromD<decltype(di)> vi = BitCast(di, v);
1203
+ const RebindToUnsigned<decltype(d)> du;
1204
+ const VFromD<decltype(du)> vu = BitCast(du, v);
1076
1205
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
1077
- return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
1206
+ return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2<T>())));
1078
1207
  }
1079
1208
 
1080
1209
  // Returns whether normal/subnormal/zero.
@@ -1528,13 +1657,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
1528
1657
  return IfThenElse(MaskFromVec(v), yes, no);
1529
1658
  }
1530
1659
 
1531
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
1532
- HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
1533
- const DFromV<decltype(v)> d;
1534
- const auto zero = Zero(d);
1535
- return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
1536
- }
1537
-
1538
1660
  // ------------------------------ Mask logical
1539
1661
 
1540
1662
  template <typename T, size_t N>
@@ -1815,9 +1937,7 @@ template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
1815
1937
  HWY_IF_NOT_SPECIAL_FLOAT(T)>
1816
1938
  HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
1817
1939
  const int16_t lane = wasm_i16x8_extract_lane(v.raw, kLane);
1818
- T ret;
1819
- CopySameSize(&lane, &ret); // for float16_t
1820
- return ret;
1940
+ return static_cast<T>(lane);
1821
1941
  }
1822
1942
  template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
1823
1943
  HWY_IF_SPECIAL_FLOAT(T)>
@@ -1826,10 +1946,7 @@ HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
1826
1946
  const RebindToUnsigned<decltype(d)> du;
1827
1947
 
1828
1948
  const uint16_t bits = ExtractLane<kLane>(BitCast(du, v));
1829
-
1830
- T ret;
1831
- CopySameSize(&bits, &ret);
1832
- return ret;
1949
+ return BitCastScalar<T>(bits);
1833
1950
  }
1834
1951
  template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
1835
1952
  HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
@@ -2038,7 +2155,7 @@ template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
2038
2155
  HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
2039
2156
  static_assert(kLane < N, "Lane index out of bounds");
2040
2157
  return Vec128<T, N>{
2041
- wasm_i16x8_replace_lane(v.raw, kLane, static_cast<int16_t>(t))};
2158
+ wasm_i16x8_replace_lane(v.raw, kLane, BitCastScalar<int16_t>(t))};
2042
2159
  }
2043
2160
 
2044
2161
  template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
@@ -3002,6 +3119,13 @@ HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
3002
3119
  return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
3003
3120
  }
3004
3121
 
3122
+ template <class T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_SPECIAL_FLOAT(T)>
3123
+ HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
3124
+ const DFromV<decltype(a)> d;
3125
+ const RebindToUnsigned<decltype(d)> du;
3126
+ return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
3127
+ }
3128
+
3005
3129
  // Additional overload for the optional tag (all vector lengths).
3006
3130
  template <class D>
3007
3131
  HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
@@ -3710,6 +3834,50 @@ HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
3710
3834
  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3711
3835
  }
3712
3836
 
3837
+ // ------------------------------ InterleaveEven
3838
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3839
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3840
+ return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 2, 18, 4, 20, 6, 22,
3841
+ 8, 24, 10, 26, 12, 28, 14, 30)};
3842
+ }
3843
+
3844
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3845
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3846
+ return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 2, 10, 4, 12, 6, 14)};
3847
+ }
3848
+
3849
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
3850
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3851
+ return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 2, 6)};
3852
+ }
3853
+
3854
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
3855
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3856
+ return InterleaveLower(a, b);
3857
+ }
3858
+
3859
+ // ------------------------------ InterleaveOdd
3860
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3861
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3862
+ return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 17, 3, 19, 5, 21, 7, 23,
3863
+ 9, 25, 11, 27, 13, 29, 15, 31)};
3864
+ }
3865
+
3866
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3867
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3868
+ return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 9, 3, 11, 5, 13, 7, 15)};
3869
+ }
3870
+
3871
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
3872
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3873
+ return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 5, 3, 7)};
3874
+ }
3875
+
3876
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
3877
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3878
+ return InterleaveUpper(d, a, b);
3879
+ }
3880
+
3713
3881
  // ------------------------------ OddEvenBlocks
3714
3882
  template <typename T, size_t N>
3715
3883
  HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
@@ -3986,6 +4154,9 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
3986
4154
  return PromoteTo(d, UpperHalf(dh, v));
3987
4155
  }
3988
4156
 
4157
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
4158
+ #include "hwy/ops/inside-inl.h"
4159
+
3989
4160
  // ------------------------------ Demotions (full -> part w/ narrow lanes)
3990
4161
 
3991
4162
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
@@ -4035,15 +4206,6 @@ HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) {
4035
4206
  return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF))));
4036
4207
  }
4037
4208
 
4038
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
4039
- HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
4040
- const Rebind<int32_t, decltype(dbf16)> di32;
4041
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4042
- const Rebind<uint16_t, decltype(dbf16)> du16;
4043
- const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4044
- return BitCast(dbf16, DemoteTo(du16, bits_in_32));
4045
- }
4046
-
4047
4209
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
4048
4210
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
4049
4211
  return VFromD<D>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
@@ -4114,15 +4276,6 @@ HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
4114
4276
  return DemoteTo(df32, adj_f64_val);
4115
4277
  }
4116
4278
 
4117
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D),
4118
- class V32 = VFromD<Repartition<float, D>>>
4119
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
4120
- const RebindToUnsigned<decltype(dbf16)> du16;
4121
- const Repartition<uint32_t, decltype(dbf16)> du32;
4122
- const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
4123
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4124
- }
4125
-
4126
4279
  // Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
4127
4280
  // above 2*N.
4128
4281
  template <class D, HWY_IF_I16_D(D)>
@@ -4469,12 +4622,6 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
4469
4622
  return ReorderDemote2To(d, a, b);
4470
4623
  }
4471
4624
 
4472
- template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
4473
- HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
4474
- const RebindToUnsigned<decltype(dbf16)> du16;
4475
- return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
4476
- }
4477
-
4478
4625
  // ------------------------------ ConvertTo
4479
4626
 
4480
4627
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
@@ -4675,6 +4822,31 @@ HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
4675
4822
  return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
4676
4823
  }
4677
4824
 
4825
+ template <size_t N>
4826
+ HWY_API Vec128<int64_t, N / 8> SumsOf8(const Vec128<int8_t, N> v) {
4827
+ const DFromV<decltype(v)> di8;
4828
+ const RepartitionToWide<decltype(di8)> di16;
4829
+ const RepartitionToWide<decltype(di16)> di32;
4830
+ const RepartitionToWide<decltype(di32)> di64;
4831
+ const RebindToUnsigned<decltype(di32)> du32;
4832
+ const RebindToUnsigned<decltype(di64)> du64;
4833
+ using VI16 = VFromD<decltype(di16)>;
4834
+
4835
+ const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
4836
+ const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
4837
+ const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
4838
+
4839
+ const VI16 sDC_zz_98_zz_54_zz_10_zz =
4840
+ BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
4841
+ const VI16 sFC_xx_B8_xx_74_xx_30_xx =
4842
+ Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
4843
+ const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
4844
+ BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
4845
+ const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
4846
+ Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
4847
+ return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
4848
+ }
4849
+
4678
4850
  // ------------------------------ LoadMaskBits (TestBit)
4679
4851
 
4680
4852
  namespace detail {
@@ -4729,6 +4901,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
4729
4901
  return detail::LoadMaskBits(d, mask_bits);
4730
4902
  }
4731
4903
 
4904
+ // ------------------------------ Dup128MaskFromMaskBits
4905
+
4906
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
4907
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4908
+ constexpr size_t kN = MaxLanes(d);
4909
+ if (kN < 8) mask_bits &= (1u << kN) - 1;
4910
+ return detail::LoadMaskBits(d, mask_bits);
4911
+ }
4912
+
4732
4913
  // ------------------------------ Mask
4733
4914
 
4734
4915
  namespace detail {
@@ -5593,59 +5774,47 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
5593
5774
 
5594
5775
  // ------------------------------ MulEven/Odd (Load)
5595
5776
 
5596
- HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
5597
- const Vec128<uint64_t> b) {
5598
- alignas(16) uint64_t mul[2];
5599
- mul[0] =
5600
- Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
5601
- static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
5602
- return Load(Full128<uint64_t>(), mul);
5777
+ template <class T, HWY_IF_UI64(T)>
5778
+ HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
5779
+ alignas(16) T mul[2];
5780
+ mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 0)),
5781
+ static_cast<T>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
5782
+ return Load(Full128<T>(), mul);
5603
5783
  }
5604
5784
 
5605
- HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
5606
- const Vec128<uint64_t> b) {
5607
- alignas(16) uint64_t mul[2];
5608
- mul[0] =
5609
- Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
5610
- static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
5611
- return Load(Full128<uint64_t>(), mul);
5785
+ template <class T, HWY_IF_UI64(T)>
5786
+ HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
5787
+ alignas(16) T mul[2];
5788
+ mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 1)),
5789
+ static_cast<T>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
5790
+ return Load(Full128<T>(), mul);
5612
5791
  }
5613
5792
 
5614
- // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
5793
+ // ------------------------------ I64/U64 MulHigh (GetLane)
5794
+ template <class T, HWY_IF_UI64(T)>
5795
+ HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
5796
+ T hi;
5797
+ Mul128(GetLane(a), GetLane(b), &hi);
5798
+ return Set(Full64<T>(), hi);
5799
+ }
5800
+
5801
+ template <class T, HWY_IF_UI64(T)>
5802
+ HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
5803
+ T hi_0;
5804
+ T hi_1;
5805
+ Mul128(GetLane(a), GetLane(b), &hi_0);
5806
+ Mul128(detail::ExtractLane<1>(a), detail::ExtractLane<1>(b), &hi_1);
5807
+ return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
5808
+ }
5809
+
5810
+ // ------------------------------ WidenMulPairwiseAdd (MulAdd, PromoteEvenTo)
5615
5811
 
5616
5812
  // Generic for all vector lengths.
5617
- template <class D32, HWY_IF_F32_D(D32),
5618
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
5619
- HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
5620
- const Rebind<uint32_t, decltype(df32)> du32;
5621
- using VU32 = VFromD<decltype(du32)>;
5622
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
5623
- // Using shift/and instead of Zip leads to the odd/even order that
5624
- // RearrangeToOddPlusEven prefers.
5625
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5626
- const VU32 ao = And(BitCast(du32, a), odd);
5627
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5628
- const VU32 bo = And(BitCast(du32, b), odd);
5629
- return Mul(BitCast(df32, ae), BitCast(df32, be)) +
5630
- Mul(BitCast(df32, ao), BitCast(df32, bo));
5631
- }
5632
-
5633
- template <class D32, HWY_IF_F32_D(D32),
5634
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
5635
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
5636
- const VFromD<D32> sum0,
5637
- VFromD<D32>& sum1) {
5638
- const Rebind<uint32_t, decltype(df32)> du32;
5639
- using VU32 = VFromD<decltype(du32)>;
5640
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
5641
- // Using shift/and instead of Zip leads to the odd/even order that
5642
- // RearrangeToOddPlusEven prefers.
5643
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5644
- const VU32 ao = And(BitCast(du32, a), odd);
5645
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5646
- const VU32 bo = And(BitCast(du32, b), odd);
5647
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
5648
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
5813
+ template <class DF, HWY_IF_F32_D(DF),
5814
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
5815
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
5816
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
5817
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
5649
5818
  }
5650
5819
 
5651
5820
  // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
@@ -5659,35 +5828,18 @@ HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
5659
5828
  template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
5660
5829
  class VU16 = VFromD<RepartitionToNarrow<DU32>>>
5661
5830
  HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) {
5662
- const auto lo16_mask = Set(du32, 0x0000FFFFu);
5663
-
5664
- const auto a0 = And(BitCast(du32, a), lo16_mask);
5665
- const auto b0 = And(BitCast(du32, b), lo16_mask);
5666
-
5667
- const auto a1 = ShiftRight<16>(BitCast(du32, a));
5668
- const auto b1 = ShiftRight<16>(BitCast(du32, b));
5669
-
5670
- return MulAdd(a1, b1, a0 * b0);
5831
+ return MulAdd(PromoteEvenTo(du32, a), PromoteEvenTo(du32, b),
5832
+ Mul(PromoteOddTo(du32, a), PromoteOddTo(du32, b)));
5671
5833
  }
5672
5834
 
5673
- // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
5674
- // safe.
5675
- template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
5835
+ // ------------------------------ ReorderWidenMulAccumulate
5836
+
5837
+ template <class D32, HWY_IF_UI32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
5676
5838
  class V16 = VFromD<RepartitionToNarrow<D32>>>
5677
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b,
5839
+ HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d32, V16 a, V16 b,
5678
5840
  const VFromD<D32> sum0,
5679
5841
  VFromD<D32>& /*sum1*/) {
5680
- return sum0 + WidenMulPairwiseAdd(d, a, b);
5681
- }
5682
-
5683
- // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
5684
- // safe.
5685
- template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
5686
- class VU16 = VFromD<RepartitionToNarrow<DU32>>>
5687
- HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b,
5688
- const VFromD<DU32> sum0,
5689
- VFromD<DU32>& /*sum1*/) {
5690
- return sum0 + WidenMulPairwiseAdd(d, a, b);
5842
+ return sum0 + WidenMulPairwiseAdd(d32, a, b);
5691
5843
  }
5692
5844
 
5693
5845
  // ------------------------------ RearrangeToOddPlusEven
@@ -5711,120 +5863,7 @@ HWY_API Vec128<float, N> RearrangeToOddPlusEven(const Vec128<float, N> sum0,
5711
5863
 
5712
5864
  // ------------------------------ Reductions
5713
5865
 
5714
- namespace detail {
5715
-
5716
- // N=1: no-op
5717
- template <typename T>
5718
- HWY_INLINE Vec128<T, 1> SumOfLanes(Vec128<T, 1> v) {
5719
- return v;
5720
- }
5721
- template <typename T>
5722
- HWY_INLINE Vec128<T, 1> MinOfLanes(Vec128<T, 1> v) {
5723
- return v;
5724
- }
5725
- template <typename T>
5726
- HWY_INLINE Vec128<T, 1> MaxOfLanes(Vec128<T, 1> v) {
5727
- return v;
5728
- }
5729
-
5730
- // N=2
5731
- template <typename T>
5732
- HWY_INLINE Vec128<T, 2> SumOfLanes(Vec128<T, 2> v10) {
5733
- const DFromV<decltype(v10)> d;
5734
- return Add(v10, Reverse2(d, v10));
5735
- }
5736
- template <typename T>
5737
- HWY_INLINE Vec128<T, 2> MinOfLanes(Vec128<T, 2> v10) {
5738
- const DFromV<decltype(v10)> d;
5739
- return Min(v10, Reverse2(d, v10));
5740
- }
5741
- template <typename T>
5742
- HWY_INLINE Vec128<T, 2> MaxOfLanes(Vec128<T, 2> v10) {
5743
- const DFromV<decltype(v10)> d;
5744
- return Max(v10, Reverse2(d, v10));
5745
- }
5746
-
5747
- // N=4 (only 16/32-bit, else >128-bit)
5748
- template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
5749
- HWY_INLINE Vec128<T, 4> SumOfLanes(Vec128<T, 4> v3210) {
5750
- using V = decltype(v3210);
5751
- const DFromV<V> d;
5752
- const V v0123 = Reverse4(d, v3210);
5753
- const V v03_12_12_03 = Add(v3210, v0123);
5754
- const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
5755
- return Add(v03_12_12_03, v12_03_03_12);
5756
- }
5757
- template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
5758
- HWY_INLINE Vec128<T, 4> MinOfLanes(Vec128<T, 4> v3210) {
5759
- using V = decltype(v3210);
5760
- const DFromV<V> d;
5761
- const V v0123 = Reverse4(d, v3210);
5762
- const V v03_12_12_03 = Min(v3210, v0123);
5763
- const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
5764
- return Min(v03_12_12_03, v12_03_03_12);
5765
- }
5766
- template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
5767
- HWY_INLINE Vec128<T, 4> MaxOfLanes(Vec128<T, 4> v3210) {
5768
- using V = decltype(v3210);
5769
- const DFromV<V> d;
5770
- const V v0123 = Reverse4(d, v3210);
5771
- const V v03_12_12_03 = Max(v3210, v0123);
5772
- const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
5773
- return Max(v03_12_12_03, v12_03_03_12);
5774
- }
5775
-
5776
- // N=8 (only 16-bit, else >128-bit)
5777
- template <typename T, HWY_IF_T_SIZE(T, 2)>
5778
- HWY_INLINE Vec128<T, 8> SumOfLanes(Vec128<T, 8> v76543210) {
5779
- using V = decltype(v76543210);
5780
- const DFromV<V> d;
5781
- // The upper half is reversed from the lower half; omit for brevity.
5782
- const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210));
5783
- const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07));
5784
- return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
5785
- }
5786
- template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
5787
- HWY_INLINE Vec128<T, 8> MinOfLanes(Vec128<T, 8> v76543210) {
5788
- using V = decltype(v76543210);
5789
- const DFromV<V> d;
5790
- // The upper half is reversed from the lower half; omit for brevity.
5791
- const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210));
5792
- const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07));
5793
- return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
5794
- }
5795
- template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
5796
- HWY_INLINE Vec128<T, 8> MaxOfLanes(Vec128<T, 8> v76543210) {
5797
- using V = decltype(v76543210);
5798
- const DFromV<V> d;
5799
- // The upper half is reversed from the lower half; omit for brevity.
5800
- const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210));
5801
- const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07));
5802
- return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
5803
- }
5804
-
5805
- template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
5806
- HWY_INLINE T ReduceSum(Vec128<T, N> v) {
5807
- return GetLane(SumOfLanes(v));
5808
- }
5809
-
5810
- } // namespace detail
5811
-
5812
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5813
- HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
5814
- return detail::SumOfLanes(v);
5815
- }
5816
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5817
- HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
5818
- return detail::ReduceSum(v);
5819
- }
5820
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5821
- HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
5822
- return detail::MinOfLanes(v);
5823
- }
5824
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5825
- HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
5826
- return detail::MaxOfLanes(v);
5827
- }
5866
+ // Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum.
5828
5867
 
5829
5868
  // ------------------------------ Lt128
5830
5869