@img/sharp-libvips-dev 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +2 -2
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -30
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +218 -6
- package/include/hwy/base.h +1935 -512
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +105 -10
- package/include/hwy/detect_targets.h +146 -37
- package/include/hwy/foreach_target.h +36 -1
- package/include/hwy/highway.h +222 -50
- package/include/hwy/ops/arm_neon-inl.h +2055 -894
- package/include/hwy/ops/arm_sve-inl.h +1476 -348
- package/include/hwy/ops/emu128-inl.h +711 -623
- package/include/hwy/ops/generic_ops-inl.h +4431 -2157
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
- package/include/hwy/ops/rvv-inl.h +1556 -536
- package/include/hwy/ops/scalar-inl.h +353 -233
- package/include/hwy/ops/set_macros-inl.h +171 -23
- package/include/hwy/ops/shared-inl.h +198 -56
- package/include/hwy/ops/wasm_128-inl.h +283 -244
- package/include/hwy/ops/x86_128-inl.h +3673 -1357
- package/include/hwy/ops/x86_256-inl.h +1737 -663
- package/include/hwy/ops/x86_512-inl.h +1697 -500
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +40 -32
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +16 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +8 -3
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +73 -58
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -5
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +2 -2
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +8 -3
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +1 -11
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +18 -19
|
@@ -92,6 +92,9 @@ class Vec128 {
|
|
|
92
92
|
HWY_INLINE Vec128& operator-=(const Vec128 other) {
|
|
93
93
|
return *this = (*this - other);
|
|
94
94
|
}
|
|
95
|
+
HWY_INLINE Vec128& operator%=(const Vec128 other) {
|
|
96
|
+
return *this = (*this % other);
|
|
97
|
+
}
|
|
95
98
|
HWY_INLINE Vec128& operator&=(const Vec128 other) {
|
|
96
99
|
return *this = (*this & other);
|
|
97
100
|
}
|
|
@@ -151,9 +154,6 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
|
151
154
|
template <class D>
|
|
152
155
|
using VFromD = decltype(Zero(D()));
|
|
153
156
|
|
|
154
|
-
// ------------------------------ Tuple (VFromD)
|
|
155
|
-
#include "hwy/ops/tuple-inl.h"
|
|
156
|
-
|
|
157
157
|
// ------------------------------ BitCast
|
|
158
158
|
|
|
159
159
|
namespace detail {
|
|
@@ -213,25 +213,29 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
|
213
213
|
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
214
214
|
return VFromD<D>{wasm_i8x16_splat(static_cast<int8_t>(t))};
|
|
215
215
|
}
|
|
216
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
|
|
216
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
|
|
217
217
|
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
218
218
|
return VFromD<D>{wasm_i16x8_splat(static_cast<int16_t>(t))};
|
|
219
219
|
}
|
|
220
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
|
|
220
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
|
|
221
221
|
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
222
222
|
return VFromD<D>{wasm_i32x4_splat(static_cast<int32_t>(t))};
|
|
223
223
|
}
|
|
224
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
|
|
224
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
225
225
|
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
226
226
|
return VFromD<D>{wasm_i64x2_splat(static_cast<int64_t>(t))};
|
|
227
227
|
}
|
|
228
228
|
|
|
229
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_SPECIAL_FLOAT_D(D)>
|
|
230
|
+
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
231
|
+
return VFromD<D>{wasm_i16x8_splat(BitCastScalar<int16_t>(t))};
|
|
232
|
+
}
|
|
229
233
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
230
|
-
HWY_API VFromD<D> Set(D /* tag */,
|
|
234
|
+
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
231
235
|
return VFromD<D>{wasm_f32x4_splat(t)};
|
|
232
236
|
}
|
|
233
237
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
|
|
234
|
-
HWY_API VFromD<D> Set(D /* tag */,
|
|
238
|
+
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
|
|
235
239
|
return VFromD<D>{wasm_f64x2_splat(t)};
|
|
236
240
|
}
|
|
237
241
|
|
|
@@ -251,12 +255,99 @@ template <class D, typename T = TFromD<D>, typename T2>
|
|
|
251
255
|
HWY_API VFromD<D> Iota(D d, const T2 first) {
|
|
252
256
|
HWY_ALIGN T lanes[MaxLanes(d)];
|
|
253
257
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
254
|
-
lanes[i] =
|
|
255
|
-
AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
|
|
258
|
+
lanes[i] = AddWithWraparound(static_cast<T>(first), i);
|
|
256
259
|
}
|
|
257
260
|
return Load(d, lanes);
|
|
258
261
|
}
|
|
259
262
|
|
|
263
|
+
// ------------------------------ Dup128VecFromValues
|
|
264
|
+
template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
265
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
266
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
267
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
268
|
+
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
269
|
+
TFromD<D> t11, TFromD<D> t12,
|
|
270
|
+
TFromD<D> t13, TFromD<D> t14,
|
|
271
|
+
TFromD<D> t15) {
|
|
272
|
+
return VFromD<D>{wasm_i8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
|
|
273
|
+
t11, t12, t13, t14, t15)};
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
277
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
278
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
279
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
280
|
+
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
281
|
+
TFromD<D> t11, TFromD<D> t12,
|
|
282
|
+
TFromD<D> t13, TFromD<D> t14,
|
|
283
|
+
TFromD<D> t15) {
|
|
284
|
+
return VFromD<D>{wasm_u8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
|
|
285
|
+
t11, t12, t13, t14, t15)};
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
template <class D, HWY_IF_I16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
289
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
290
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
291
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
292
|
+
TFromD<D> t7) {
|
|
293
|
+
return VFromD<D>{wasm_i16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
template <class D, HWY_IF_U16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
297
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
298
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
299
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
300
|
+
TFromD<D> t7) {
|
|
301
|
+
return VFromD<D>{wasm_u16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
template <class D, HWY_IF_SPECIAL_FLOAT_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
305
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
306
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
307
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
308
|
+
TFromD<D> t7) {
|
|
309
|
+
const RebindToSigned<decltype(d)> di;
|
|
310
|
+
return BitCast(d,
|
|
311
|
+
Dup128VecFromValues(
|
|
312
|
+
di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
|
|
313
|
+
BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
|
|
314
|
+
BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
|
|
315
|
+
BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
319
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
320
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
321
|
+
return VFromD<D>{wasm_i32x4_make(t0, t1, t2, t3)};
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
325
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
326
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
327
|
+
return VFromD<D>{wasm_u32x4_make(t0, t1, t2, t3)};
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
331
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
|
|
332
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
333
|
+
return VFromD<D>{wasm_f32x4_make(t0, t1, t2, t3)};
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
template <class D, HWY_IF_I64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
337
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
338
|
+
return VFromD<D>{wasm_i64x2_make(t0, t1)};
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
template <class D, HWY_IF_U64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
342
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
343
|
+
return VFromD<D>{wasm_u64x2_make(t0, t1)};
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
347
|
+
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
|
|
348
|
+
return VFromD<D>{wasm_f64x2_make(t0, t1)};
|
|
349
|
+
}
|
|
350
|
+
|
|
260
351
|
// ================================================== ARITHMETIC
|
|
261
352
|
|
|
262
353
|
// ------------------------------ Addition
|
|
@@ -560,12 +651,16 @@ HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
|
|
|
560
651
|
}
|
|
561
652
|
|
|
562
653
|
// ------------------------------ RotateRight (ShiftRight, Or)
|
|
563
|
-
template <int kBits, typename T, size_t N>
|
|
654
|
+
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
564
655
|
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
|
|
656
|
+
const DFromV<decltype(v)> d;
|
|
657
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
658
|
+
|
|
565
659
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
566
660
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
661
|
+
|
|
567
662
|
if (kBits == 0) return v;
|
|
568
|
-
return Or(ShiftRight<kBits>(v),
|
|
663
|
+
return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
|
|
569
664
|
ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
|
|
570
665
|
}
|
|
571
666
|
|
|
@@ -823,7 +918,25 @@ HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
|
|
|
823
918
|
return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
|
|
824
919
|
}
|
|
825
920
|
|
|
826
|
-
// Returns the upper
|
|
921
|
+
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
|
|
922
|
+
template <size_t N>
|
|
923
|
+
HWY_API Vec128<uint8_t, N> MulHigh(const Vec128<uint8_t, N> a,
|
|
924
|
+
const Vec128<uint8_t, N> b) {
|
|
925
|
+
const auto l = wasm_u16x8_extmul_low_u8x16(a.raw, b.raw);
|
|
926
|
+
const auto h = wasm_u16x8_extmul_high_u8x16(a.raw, b.raw);
|
|
927
|
+
// TODO(eustas): shift-right + narrow?
|
|
928
|
+
return Vec128<uint8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
|
|
929
|
+
17, 19, 21, 23, 25, 27, 29, 31)};
|
|
930
|
+
}
|
|
931
|
+
template <size_t N>
|
|
932
|
+
HWY_API Vec128<int8_t, N> MulHigh(const Vec128<int8_t, N> a,
|
|
933
|
+
const Vec128<int8_t, N> b) {
|
|
934
|
+
const auto l = wasm_i16x8_extmul_low_i8x16(a.raw, b.raw);
|
|
935
|
+
const auto h = wasm_i16x8_extmul_high_i8x16(a.raw, b.raw);
|
|
936
|
+
// TODO(eustas): shift-right + narrow?
|
|
937
|
+
return Vec128<int8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
|
|
938
|
+
17, 19, 21, 23, 25, 27, 29, 31)};
|
|
939
|
+
}
|
|
827
940
|
template <size_t N>
|
|
828
941
|
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
|
|
829
942
|
const Vec128<uint16_t, N> b) {
|
|
@@ -842,6 +955,22 @@ HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
|
|
|
842
955
|
return Vec128<int16_t, N>{
|
|
843
956
|
wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
|
|
844
957
|
}
|
|
958
|
+
template <size_t N>
|
|
959
|
+
HWY_API Vec128<uint32_t, N> MulHigh(const Vec128<uint32_t, N> a,
|
|
960
|
+
const Vec128<uint32_t, N> b) {
|
|
961
|
+
const auto l = wasm_u64x2_extmul_low_u32x4(a.raw, b.raw);
|
|
962
|
+
const auto h = wasm_u64x2_extmul_high_u32x4(a.raw, b.raw);
|
|
963
|
+
// TODO(eustas): shift-right + narrow?
|
|
964
|
+
return Vec128<uint32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
|
|
965
|
+
}
|
|
966
|
+
template <size_t N>
|
|
967
|
+
HWY_API Vec128<int32_t, N> MulHigh(const Vec128<int32_t, N> a,
|
|
968
|
+
const Vec128<int32_t, N> b) {
|
|
969
|
+
const auto l = wasm_i64x2_extmul_low_i32x4(a.raw, b.raw);
|
|
970
|
+
const auto h = wasm_i64x2_extmul_high_i32x4(a.raw, b.raw);
|
|
971
|
+
// TODO(eustas): shift-right + narrow?
|
|
972
|
+
return Vec128<int32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
|
|
973
|
+
}
|
|
845
974
|
|
|
846
975
|
template <size_t N>
|
|
847
976
|
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
|
|
@@ -977,25 +1106,25 @@ HWY_API Vec128<T, N> AbsDiff(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
|
977
1106
|
|
|
978
1107
|
// ------------------------------ Floating-point multiply-add variants
|
|
979
1108
|
|
|
980
|
-
template <typename T, size_t N>
|
|
1109
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
981
1110
|
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
|
|
982
1111
|
Vec128<T, N> add) {
|
|
983
1112
|
return mul * x + add;
|
|
984
1113
|
}
|
|
985
1114
|
|
|
986
|
-
template <typename T, size_t N>
|
|
1115
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
987
1116
|
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
|
|
988
1117
|
Vec128<T, N> add) {
|
|
989
1118
|
return add - mul * x;
|
|
990
1119
|
}
|
|
991
1120
|
|
|
992
|
-
template <typename T, size_t N>
|
|
1121
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
993
1122
|
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
|
|
994
1123
|
Vec128<T, N> sub) {
|
|
995
1124
|
return mul * x - sub;
|
|
996
1125
|
}
|
|
997
1126
|
|
|
998
|
-
template <typename T, size_t N>
|
|
1127
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
999
1128
|
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
|
|
1000
1129
|
Vec128<T, N> sub) {
|
|
1001
1130
|
return Neg(mul) * x - sub;
|
|
@@ -1071,10 +1200,10 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
|
|
|
1071
1200
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
1072
1201
|
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
|
|
1073
1202
|
const DFromV<decltype(v)> d;
|
|
1074
|
-
const
|
|
1075
|
-
const VFromD<decltype(
|
|
1203
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1204
|
+
const VFromD<decltype(du)> vu = BitCast(du, v);
|
|
1076
1205
|
// 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
|
|
1077
|
-
return RebindMask(d, Eq(Add(
|
|
1206
|
+
return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2<T>())));
|
|
1078
1207
|
}
|
|
1079
1208
|
|
|
1080
1209
|
// Returns whether normal/subnormal/zero.
|
|
@@ -1528,13 +1657,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
|
1528
1657
|
return IfThenElse(MaskFromVec(v), yes, no);
|
|
1529
1658
|
}
|
|
1530
1659
|
|
|
1531
|
-
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
1532
|
-
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
|
|
1533
|
-
const DFromV<decltype(v)> d;
|
|
1534
|
-
const auto zero = Zero(d);
|
|
1535
|
-
return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
|
|
1536
|
-
}
|
|
1537
|
-
|
|
1538
1660
|
// ------------------------------ Mask logical
|
|
1539
1661
|
|
|
1540
1662
|
template <typename T, size_t N>
|
|
@@ -1815,9 +1937,7 @@ template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
|
|
|
1815
1937
|
HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
1816
1938
|
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
|
|
1817
1939
|
const int16_t lane = wasm_i16x8_extract_lane(v.raw, kLane);
|
|
1818
|
-
T
|
|
1819
|
-
CopySameSize(&lane, &ret); // for float16_t
|
|
1820
|
-
return ret;
|
|
1940
|
+
return static_cast<T>(lane);
|
|
1821
1941
|
}
|
|
1822
1942
|
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
|
|
1823
1943
|
HWY_IF_SPECIAL_FLOAT(T)>
|
|
@@ -1826,10 +1946,7 @@ HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
|
|
|
1826
1946
|
const RebindToUnsigned<decltype(d)> du;
|
|
1827
1947
|
|
|
1828
1948
|
const uint16_t bits = ExtractLane<kLane>(BitCast(du, v));
|
|
1829
|
-
|
|
1830
|
-
T ret;
|
|
1831
|
-
CopySameSize(&bits, &ret);
|
|
1832
|
-
return ret;
|
|
1949
|
+
return BitCastScalar<T>(bits);
|
|
1833
1950
|
}
|
|
1834
1951
|
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
|
|
1835
1952
|
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
|
|
@@ -2038,7 +2155,7 @@ template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
|
|
|
2038
2155
|
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
|
|
2039
2156
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
2040
2157
|
return Vec128<T, N>{
|
|
2041
|
-
wasm_i16x8_replace_lane(v.raw, kLane,
|
|
2158
|
+
wasm_i16x8_replace_lane(v.raw, kLane, BitCastScalar<int16_t>(t))};
|
|
2042
2159
|
}
|
|
2043
2160
|
|
|
2044
2161
|
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
|
|
@@ -3002,6 +3119,13 @@ HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
|
|
|
3002
3119
|
return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
|
|
3003
3120
|
}
|
|
3004
3121
|
|
|
3122
|
+
template <class T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_SPECIAL_FLOAT(T)>
|
|
3123
|
+
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
|
|
3124
|
+
const DFromV<decltype(a)> d;
|
|
3125
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
3126
|
+
return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
|
|
3127
|
+
}
|
|
3128
|
+
|
|
3005
3129
|
// Additional overload for the optional tag (all vector lengths).
|
|
3006
3130
|
template <class D>
|
|
3007
3131
|
HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
|
|
@@ -3710,6 +3834,50 @@ HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
|
|
|
3710
3834
|
return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
|
|
3711
3835
|
}
|
|
3712
3836
|
|
|
3837
|
+
// ------------------------------ InterleaveEven
|
|
3838
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
3839
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3840
|
+
return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 2, 18, 4, 20, 6, 22,
|
|
3841
|
+
8, 24, 10, 26, 12, 28, 14, 30)};
|
|
3842
|
+
}
|
|
3843
|
+
|
|
3844
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
3845
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3846
|
+
return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 2, 10, 4, 12, 6, 14)};
|
|
3847
|
+
}
|
|
3848
|
+
|
|
3849
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
|
|
3850
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3851
|
+
return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 2, 6)};
|
|
3852
|
+
}
|
|
3853
|
+
|
|
3854
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
|
|
3855
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3856
|
+
return InterleaveLower(a, b);
|
|
3857
|
+
}
|
|
3858
|
+
|
|
3859
|
+
// ------------------------------ InterleaveOdd
|
|
3860
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
3861
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3862
|
+
return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 17, 3, 19, 5, 21, 7, 23,
|
|
3863
|
+
9, 25, 11, 27, 13, 29, 15, 31)};
|
|
3864
|
+
}
|
|
3865
|
+
|
|
3866
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
3867
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3868
|
+
return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 9, 3, 11, 5, 13, 7, 15)};
|
|
3869
|
+
}
|
|
3870
|
+
|
|
3871
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
|
|
3872
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3873
|
+
return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 5, 3, 7)};
|
|
3874
|
+
}
|
|
3875
|
+
|
|
3876
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
|
|
3877
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3878
|
+
return InterleaveUpper(d, a, b);
|
|
3879
|
+
}
|
|
3880
|
+
|
|
3713
3881
|
// ------------------------------ OddEvenBlocks
|
|
3714
3882
|
template <typename T, size_t N>
|
|
3715
3883
|
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
@@ -3986,6 +4154,9 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
|
|
|
3986
4154
|
return PromoteTo(d, UpperHalf(dh, v));
|
|
3987
4155
|
}
|
|
3988
4156
|
|
|
4157
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
4158
|
+
#include "hwy/ops/inside-inl.h"
|
|
4159
|
+
|
|
3989
4160
|
// ------------------------------ Demotions (full -> part w/ narrow lanes)
|
|
3990
4161
|
|
|
3991
4162
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
|
|
@@ -4035,15 +4206,6 @@ HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) {
|
|
|
4035
4206
|
return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF))));
|
|
4036
4207
|
}
|
|
4037
4208
|
|
|
4038
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
4039
|
-
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
|
|
4040
|
-
const Rebind<int32_t, decltype(dbf16)> di32;
|
|
4041
|
-
const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
|
|
4042
|
-
const Rebind<uint16_t, decltype(dbf16)> du16;
|
|
4043
|
-
const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
|
|
4044
|
-
return BitCast(dbf16, DemoteTo(du16, bits_in_32));
|
|
4045
|
-
}
|
|
4046
|
-
|
|
4047
4209
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
|
|
4048
4210
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
|
|
4049
4211
|
return VFromD<D>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
|
|
@@ -4114,15 +4276,6 @@ HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
|
|
|
4114
4276
|
return DemoteTo(df32, adj_f64_val);
|
|
4115
4277
|
}
|
|
4116
4278
|
|
|
4117
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D),
|
|
4118
|
-
class V32 = VFromD<Repartition<float, D>>>
|
|
4119
|
-
HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
|
|
4120
|
-
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4121
|
-
const Repartition<uint32_t, decltype(dbf16)> du32;
|
|
4122
|
-
const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
|
|
4123
|
-
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
|
4124
|
-
}
|
|
4125
|
-
|
|
4126
4279
|
// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
|
|
4127
4280
|
// above 2*N.
|
|
4128
4281
|
template <class D, HWY_IF_I16_D(D)>
|
|
@@ -4469,12 +4622,6 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
|
4469
4622
|
return ReorderDemote2To(d, a, b);
|
|
4470
4623
|
}
|
|
4471
4624
|
|
|
4472
|
-
template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
|
|
4473
|
-
HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
|
|
4474
|
-
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4475
|
-
return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
|
|
4476
|
-
}
|
|
4477
|
-
|
|
4478
4625
|
// ------------------------------ ConvertTo
|
|
4479
4626
|
|
|
4480
4627
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
@@ -4675,6 +4822,31 @@ HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
|
|
|
4675
4822
|
return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
|
|
4676
4823
|
}
|
|
4677
4824
|
|
|
4825
|
+
template <size_t N>
|
|
4826
|
+
HWY_API Vec128<int64_t, N / 8> SumsOf8(const Vec128<int8_t, N> v) {
|
|
4827
|
+
const DFromV<decltype(v)> di8;
|
|
4828
|
+
const RepartitionToWide<decltype(di8)> di16;
|
|
4829
|
+
const RepartitionToWide<decltype(di16)> di32;
|
|
4830
|
+
const RepartitionToWide<decltype(di32)> di64;
|
|
4831
|
+
const RebindToUnsigned<decltype(di32)> du32;
|
|
4832
|
+
const RebindToUnsigned<decltype(di64)> du64;
|
|
4833
|
+
using VI16 = VFromD<decltype(di16)>;
|
|
4834
|
+
|
|
4835
|
+
const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
|
|
4836
|
+
const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
|
|
4837
|
+
const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
|
|
4838
|
+
|
|
4839
|
+
const VI16 sDC_zz_98_zz_54_zz_10_zz =
|
|
4840
|
+
BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
|
|
4841
|
+
const VI16 sFC_xx_B8_xx_74_xx_30_xx =
|
|
4842
|
+
Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
|
|
4843
|
+
const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
|
|
4844
|
+
BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
|
|
4845
|
+
const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
|
|
4846
|
+
Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
|
|
4847
|
+
return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
|
|
4848
|
+
}
|
|
4849
|
+
|
|
4678
4850
|
// ------------------------------ LoadMaskBits (TestBit)
|
|
4679
4851
|
|
|
4680
4852
|
namespace detail {
|
|
@@ -4729,6 +4901,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
|
|
|
4729
4901
|
return detail::LoadMaskBits(d, mask_bits);
|
|
4730
4902
|
}
|
|
4731
4903
|
|
|
4904
|
+
// ------------------------------ Dup128MaskFromMaskBits
|
|
4905
|
+
|
|
4906
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
4907
|
+
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
4908
|
+
constexpr size_t kN = MaxLanes(d);
|
|
4909
|
+
if (kN < 8) mask_bits &= (1u << kN) - 1;
|
|
4910
|
+
return detail::LoadMaskBits(d, mask_bits);
|
|
4911
|
+
}
|
|
4912
|
+
|
|
4732
4913
|
// ------------------------------ Mask
|
|
4733
4914
|
|
|
4734
4915
|
namespace detail {
|
|
@@ -5593,59 +5774,47 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
|
|
|
5593
5774
|
|
|
5594
5775
|
// ------------------------------ MulEven/Odd (Load)
|
|
5595
5776
|
|
|
5596
|
-
|
|
5597
|
-
|
|
5598
|
-
alignas(16)
|
|
5599
|
-
mul[0] =
|
|
5600
|
-
|
|
5601
|
-
|
|
5602
|
-
return Load(Full128<uint64_t>(), mul);
|
|
5777
|
+
template <class T, HWY_IF_UI64(T)>
|
|
5778
|
+
HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
|
|
5779
|
+
alignas(16) T mul[2];
|
|
5780
|
+
mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 0)),
|
|
5781
|
+
static_cast<T>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
|
|
5782
|
+
return Load(Full128<T>(), mul);
|
|
5603
5783
|
}
|
|
5604
5784
|
|
|
5605
|
-
|
|
5606
|
-
|
|
5607
|
-
alignas(16)
|
|
5608
|
-
mul[0] =
|
|
5609
|
-
|
|
5610
|
-
|
|
5611
|
-
return Load(Full128<uint64_t>(), mul);
|
|
5785
|
+
template <class T, HWY_IF_UI64(T)>
|
|
5786
|
+
HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
|
|
5787
|
+
alignas(16) T mul[2];
|
|
5788
|
+
mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 1)),
|
|
5789
|
+
static_cast<T>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
|
|
5790
|
+
return Load(Full128<T>(), mul);
|
|
5612
5791
|
}
|
|
5613
5792
|
|
|
5614
|
-
// ------------------------------
|
|
5793
|
+
// ------------------------------ I64/U64 MulHigh (GetLane)
|
|
5794
|
+
template <class T, HWY_IF_UI64(T)>
|
|
5795
|
+
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
|
|
5796
|
+
T hi;
|
|
5797
|
+
Mul128(GetLane(a), GetLane(b), &hi);
|
|
5798
|
+
return Set(Full64<T>(), hi);
|
|
5799
|
+
}
|
|
5800
|
+
|
|
5801
|
+
template <class T, HWY_IF_UI64(T)>
|
|
5802
|
+
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
|
|
5803
|
+
T hi_0;
|
|
5804
|
+
T hi_1;
|
|
5805
|
+
Mul128(GetLane(a), GetLane(b), &hi_0);
|
|
5806
|
+
Mul128(detail::ExtractLane<1>(a), detail::ExtractLane<1>(b), &hi_1);
|
|
5807
|
+
return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
|
|
5808
|
+
}
|
|
5809
|
+
|
|
5810
|
+
// ------------------------------ WidenMulPairwiseAdd (MulAdd, PromoteEvenTo)
|
|
5615
5811
|
|
|
5616
5812
|
// Generic for all vector lengths.
|
|
5617
|
-
template <class
|
|
5618
|
-
class
|
|
5619
|
-
HWY_API VFromD<
|
|
5620
|
-
|
|
5621
|
-
|
|
5622
|
-
const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
|
|
5623
|
-
// Using shift/and instead of Zip leads to the odd/even order that
|
|
5624
|
-
// RearrangeToOddPlusEven prefers.
|
|
5625
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
5626
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
5627
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
5628
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
5629
|
-
return Mul(BitCast(df32, ae), BitCast(df32, be)) +
|
|
5630
|
-
Mul(BitCast(df32, ao), BitCast(df32, bo));
|
|
5631
|
-
}
|
|
5632
|
-
|
|
5633
|
-
template <class D32, HWY_IF_F32_D(D32),
|
|
5634
|
-
class V16 = VFromD<Repartition<bfloat16_t, D32>>>
|
|
5635
|
-
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
|
|
5636
|
-
const VFromD<D32> sum0,
|
|
5637
|
-
VFromD<D32>& sum1) {
|
|
5638
|
-
const Rebind<uint32_t, decltype(df32)> du32;
|
|
5639
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
5640
|
-
const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
|
|
5641
|
-
// Using shift/and instead of Zip leads to the odd/even order that
|
|
5642
|
-
// RearrangeToOddPlusEven prefers.
|
|
5643
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
5644
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
5645
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
5646
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
5647
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
5648
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
5813
|
+
template <class DF, HWY_IF_F32_D(DF),
|
|
5814
|
+
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
|
|
5815
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
|
|
5816
|
+
return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
|
|
5817
|
+
Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
|
|
5649
5818
|
}
|
|
5650
5819
|
|
|
5651
5820
|
// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
|
|
@@ -5659,35 +5828,18 @@ HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
|
|
|
5659
5828
|
template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
|
|
5660
5829
|
class VU16 = VFromD<RepartitionToNarrow<DU32>>>
|
|
5661
5830
|
HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) {
|
|
5662
|
-
|
|
5663
|
-
|
|
5664
|
-
const auto a0 = And(BitCast(du32, a), lo16_mask);
|
|
5665
|
-
const auto b0 = And(BitCast(du32, b), lo16_mask);
|
|
5666
|
-
|
|
5667
|
-
const auto a1 = ShiftRight<16>(BitCast(du32, a));
|
|
5668
|
-
const auto b1 = ShiftRight<16>(BitCast(du32, b));
|
|
5669
|
-
|
|
5670
|
-
return MulAdd(a1, b1, a0 * b0);
|
|
5831
|
+
return MulAdd(PromoteEvenTo(du32, a), PromoteEvenTo(du32, b),
|
|
5832
|
+
Mul(PromoteOddTo(du32, a), PromoteOddTo(du32, b)));
|
|
5671
5833
|
}
|
|
5672
5834
|
|
|
5673
|
-
//
|
|
5674
|
-
|
|
5675
|
-
template <class D32,
|
|
5835
|
+
// ------------------------------ ReorderWidenMulAccumulate
|
|
5836
|
+
|
|
5837
|
+
template <class D32, HWY_IF_UI32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
|
|
5676
5838
|
class V16 = VFromD<RepartitionToNarrow<D32>>>
|
|
5677
|
-
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32
|
|
5839
|
+
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d32, V16 a, V16 b,
|
|
5678
5840
|
const VFromD<D32> sum0,
|
|
5679
5841
|
VFromD<D32>& /*sum1*/) {
|
|
5680
|
-
return sum0 + WidenMulPairwiseAdd(
|
|
5681
|
-
}
|
|
5682
|
-
|
|
5683
|
-
// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
|
|
5684
|
-
// safe.
|
|
5685
|
-
template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
|
|
5686
|
-
class VU16 = VFromD<RepartitionToNarrow<DU32>>>
|
|
5687
|
-
HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b,
|
|
5688
|
-
const VFromD<DU32> sum0,
|
|
5689
|
-
VFromD<DU32>& /*sum1*/) {
|
|
5690
|
-
return sum0 + WidenMulPairwiseAdd(d, a, b);
|
|
5842
|
+
return sum0 + WidenMulPairwiseAdd(d32, a, b);
|
|
5691
5843
|
}
|
|
5692
5844
|
|
|
5693
5845
|
// ------------------------------ RearrangeToOddPlusEven
|
|
@@ -5711,120 +5863,7 @@ HWY_API Vec128<float, N> RearrangeToOddPlusEven(const Vec128<float, N> sum0,
|
|
|
5711
5863
|
|
|
5712
5864
|
// ------------------------------ Reductions
|
|
5713
5865
|
|
|
5714
|
-
|
|
5715
|
-
|
|
5716
|
-
// N=1: no-op
|
|
5717
|
-
template <typename T>
|
|
5718
|
-
HWY_INLINE Vec128<T, 1> SumOfLanes(Vec128<T, 1> v) {
|
|
5719
|
-
return v;
|
|
5720
|
-
}
|
|
5721
|
-
template <typename T>
|
|
5722
|
-
HWY_INLINE Vec128<T, 1> MinOfLanes(Vec128<T, 1> v) {
|
|
5723
|
-
return v;
|
|
5724
|
-
}
|
|
5725
|
-
template <typename T>
|
|
5726
|
-
HWY_INLINE Vec128<T, 1> MaxOfLanes(Vec128<T, 1> v) {
|
|
5727
|
-
return v;
|
|
5728
|
-
}
|
|
5729
|
-
|
|
5730
|
-
// N=2
|
|
5731
|
-
template <typename T>
|
|
5732
|
-
HWY_INLINE Vec128<T, 2> SumOfLanes(Vec128<T, 2> v10) {
|
|
5733
|
-
const DFromV<decltype(v10)> d;
|
|
5734
|
-
return Add(v10, Reverse2(d, v10));
|
|
5735
|
-
}
|
|
5736
|
-
template <typename T>
|
|
5737
|
-
HWY_INLINE Vec128<T, 2> MinOfLanes(Vec128<T, 2> v10) {
|
|
5738
|
-
const DFromV<decltype(v10)> d;
|
|
5739
|
-
return Min(v10, Reverse2(d, v10));
|
|
5740
|
-
}
|
|
5741
|
-
template <typename T>
|
|
5742
|
-
HWY_INLINE Vec128<T, 2> MaxOfLanes(Vec128<T, 2> v10) {
|
|
5743
|
-
const DFromV<decltype(v10)> d;
|
|
5744
|
-
return Max(v10, Reverse2(d, v10));
|
|
5745
|
-
}
|
|
5746
|
-
|
|
5747
|
-
// N=4 (only 16/32-bit, else >128-bit)
|
|
5748
|
-
template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
5749
|
-
HWY_INLINE Vec128<T, 4> SumOfLanes(Vec128<T, 4> v3210) {
|
|
5750
|
-
using V = decltype(v3210);
|
|
5751
|
-
const DFromV<V> d;
|
|
5752
|
-
const V v0123 = Reverse4(d, v3210);
|
|
5753
|
-
const V v03_12_12_03 = Add(v3210, v0123);
|
|
5754
|
-
const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
|
|
5755
|
-
return Add(v03_12_12_03, v12_03_03_12);
|
|
5756
|
-
}
|
|
5757
|
-
template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
5758
|
-
HWY_INLINE Vec128<T, 4> MinOfLanes(Vec128<T, 4> v3210) {
|
|
5759
|
-
using V = decltype(v3210);
|
|
5760
|
-
const DFromV<V> d;
|
|
5761
|
-
const V v0123 = Reverse4(d, v3210);
|
|
5762
|
-
const V v03_12_12_03 = Min(v3210, v0123);
|
|
5763
|
-
const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
|
|
5764
|
-
return Min(v03_12_12_03, v12_03_03_12);
|
|
5765
|
-
}
|
|
5766
|
-
template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
5767
|
-
HWY_INLINE Vec128<T, 4> MaxOfLanes(Vec128<T, 4> v3210) {
|
|
5768
|
-
using V = decltype(v3210);
|
|
5769
|
-
const DFromV<V> d;
|
|
5770
|
-
const V v0123 = Reverse4(d, v3210);
|
|
5771
|
-
const V v03_12_12_03 = Max(v3210, v0123);
|
|
5772
|
-
const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
|
|
5773
|
-
return Max(v03_12_12_03, v12_03_03_12);
|
|
5774
|
-
}
|
|
5775
|
-
|
|
5776
|
-
// N=8 (only 16-bit, else >128-bit)
|
|
5777
|
-
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
5778
|
-
HWY_INLINE Vec128<T, 8> SumOfLanes(Vec128<T, 8> v76543210) {
|
|
5779
|
-
using V = decltype(v76543210);
|
|
5780
|
-
const DFromV<V> d;
|
|
5781
|
-
// The upper half is reversed from the lower half; omit for brevity.
|
|
5782
|
-
const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210));
|
|
5783
|
-
const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07));
|
|
5784
|
-
return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
|
|
5785
|
-
}
|
|
5786
|
-
template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
5787
|
-
HWY_INLINE Vec128<T, 8> MinOfLanes(Vec128<T, 8> v76543210) {
|
|
5788
|
-
using V = decltype(v76543210);
|
|
5789
|
-
const DFromV<V> d;
|
|
5790
|
-
// The upper half is reversed from the lower half; omit for brevity.
|
|
5791
|
-
const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210));
|
|
5792
|
-
const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07));
|
|
5793
|
-
return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
|
|
5794
|
-
}
|
|
5795
|
-
template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
|
|
5796
|
-
HWY_INLINE Vec128<T, 8> MaxOfLanes(Vec128<T, 8> v76543210) {
|
|
5797
|
-
using V = decltype(v76543210);
|
|
5798
|
-
const DFromV<V> d;
|
|
5799
|
-
// The upper half is reversed from the lower half; omit for brevity.
|
|
5800
|
-
const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210));
|
|
5801
|
-
const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07));
|
|
5802
|
-
return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
|
|
5803
|
-
}
|
|
5804
|
-
|
|
5805
|
-
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
|
|
5806
|
-
HWY_INLINE T ReduceSum(Vec128<T, N> v) {
|
|
5807
|
-
return GetLane(SumOfLanes(v));
|
|
5808
|
-
}
|
|
5809
|
-
|
|
5810
|
-
} // namespace detail
|
|
5811
|
-
|
|
5812
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
5813
|
-
HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
|
|
5814
|
-
return detail::SumOfLanes(v);
|
|
5815
|
-
}
|
|
5816
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
5817
|
-
HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
|
|
5818
|
-
return detail::ReduceSum(v);
|
|
5819
|
-
}
|
|
5820
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
5821
|
-
HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
|
|
5822
|
-
return detail::MinOfLanes(v);
|
|
5823
|
-
}
|
|
5824
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
|
|
5825
|
-
HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
|
|
5826
|
-
return detail::MaxOfLanes(v);
|
|
5827
|
-
}
|
|
5866
|
+
// Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum.
|
|
5828
5867
|
|
|
5829
5868
|
// ------------------------------ Lt128
|
|
5830
5869
|
|