@img/sharp-libvips-dev 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
|
@@ -26,6 +26,7 @@
|
|
|
26
26
|
#endif
|
|
27
27
|
|
|
28
28
|
#include "hwy/detect_compiler_arch.h"
|
|
29
|
+
#include "hwy/detect_targets.h"
|
|
29
30
|
|
|
30
31
|
// Separate header because foreach_target.h re-enables its include guard.
|
|
31
32
|
#include "hwy/ops/set_macros-inl.h"
|
|
@@ -61,6 +62,10 @@ namespace HWY_NAMESPACE {
|
|
|
61
62
|
// We therefore pass by const& only on GCC and (Windows or aarch64). This alias
|
|
62
63
|
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
|
|
63
64
|
// and possibly also other functions that are not inlined.
|
|
65
|
+
//
|
|
66
|
+
// Even better is to avoid passing vector arguments to non-inlined functions,
|
|
67
|
+
// because the SVE and RISC-V ABIs are still works in progress and may lead to
|
|
68
|
+
// incorrect codegen.
|
|
64
69
|
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
|
|
65
70
|
template <class V>
|
|
66
71
|
using VecArg = const V&;
|
|
@@ -529,6 +534,8 @@ HWY_API bool IsAligned(D d, T* ptr) {
|
|
|
529
534
|
|
|
530
535
|
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
|
|
531
536
|
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
|
|
537
|
+
#define HWY_IF_NOT_UNSIGNED_D(D) \
|
|
538
|
+
HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
|
|
532
539
|
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
|
|
533
540
|
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
|
|
534
541
|
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
|
|
@@ -609,6 +616,8 @@ HWY_API bool IsAligned(D d, T* ptr) {
|
|
|
609
616
|
|
|
610
617
|
// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
|
|
611
618
|
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
619
|
+
#define HWY_IF_NOT_UNSIGNED_V(V) \
|
|
620
|
+
HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
612
621
|
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
613
622
|
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
614
623
|
#define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
|
|
@@ -646,6 +655,20 @@ HWY_API bool IsAligned(D d, T* ptr) {
|
|
|
646
655
|
#undef HWY_IF_MINMAX_OF_LANES_D
|
|
647
656
|
#define HWY_IF_MINMAX_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)
|
|
648
657
|
|
|
658
|
+
#undef HWY_IF_ADDSUB_V
|
|
659
|
+
#define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
|
|
660
|
+
|
|
661
|
+
#undef HWY_IF_MULADDSUB_V
|
|
662
|
+
#define HWY_IF_MULADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
|
|
663
|
+
|
|
664
|
+
// HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V is used to disable the default
|
|
665
|
+
// implementation of unsigned to signed DemoteTo/ReorderDemote2To in
|
|
666
|
+
// generic_ops-inl.h for at least some of the unsigned to signed demotions on
|
|
667
|
+
// SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2
|
|
668
|
+
|
|
669
|
+
#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
|
|
670
|
+
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) void* = nullptr
|
|
671
|
+
|
|
649
672
|
// Old names (deprecated)
|
|
650
673
|
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes)
|
|
651
674
|
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes)
|
|
@@ -154,9 +154,6 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
|
154
154
|
template <class D>
|
|
155
155
|
using VFromD = decltype(Zero(D()));
|
|
156
156
|
|
|
157
|
-
// ------------------------------ Tuple (VFromD)
|
|
158
|
-
#include "hwy/ops/tuple-inl.h"
|
|
159
|
-
|
|
160
157
|
// ------------------------------ BitCast
|
|
161
158
|
|
|
162
159
|
namespace detail {
|
|
@@ -654,12 +651,16 @@ HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
|
|
|
654
651
|
}
|
|
655
652
|
|
|
656
653
|
// ------------------------------ RotateRight (ShiftRight, Or)
|
|
657
|
-
template <int kBits, typename T, size_t N>
|
|
654
|
+
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
658
655
|
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
|
|
656
|
+
const DFromV<decltype(v)> d;
|
|
657
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
658
|
+
|
|
659
659
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
660
660
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
661
|
+
|
|
661
662
|
if (kBits == 0) return v;
|
|
662
|
-
return Or(ShiftRight<kBits>(v),
|
|
663
|
+
return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
|
|
663
664
|
ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
|
|
664
665
|
}
|
|
665
666
|
|
|
@@ -917,7 +918,25 @@ HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
|
|
|
917
918
|
return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
|
|
918
919
|
}
|
|
919
920
|
|
|
920
|
-
// Returns the upper
|
|
921
|
+
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
|
|
922
|
+
template <size_t N>
|
|
923
|
+
HWY_API Vec128<uint8_t, N> MulHigh(const Vec128<uint8_t, N> a,
|
|
924
|
+
const Vec128<uint8_t, N> b) {
|
|
925
|
+
const auto l = wasm_u16x8_extmul_low_u8x16(a.raw, b.raw);
|
|
926
|
+
const auto h = wasm_u16x8_extmul_high_u8x16(a.raw, b.raw);
|
|
927
|
+
// TODO(eustas): shift-right + narrow?
|
|
928
|
+
return Vec128<uint8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
|
|
929
|
+
17, 19, 21, 23, 25, 27, 29, 31)};
|
|
930
|
+
}
|
|
931
|
+
template <size_t N>
|
|
932
|
+
HWY_API Vec128<int8_t, N> MulHigh(const Vec128<int8_t, N> a,
|
|
933
|
+
const Vec128<int8_t, N> b) {
|
|
934
|
+
const auto l = wasm_i16x8_extmul_low_i8x16(a.raw, b.raw);
|
|
935
|
+
const auto h = wasm_i16x8_extmul_high_i8x16(a.raw, b.raw);
|
|
936
|
+
// TODO(eustas): shift-right + narrow?
|
|
937
|
+
return Vec128<int8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
|
|
938
|
+
17, 19, 21, 23, 25, 27, 29, 31)};
|
|
939
|
+
}
|
|
921
940
|
template <size_t N>
|
|
922
941
|
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
|
|
923
942
|
const Vec128<uint16_t, N> b) {
|
|
@@ -936,6 +955,22 @@ HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
|
|
|
936
955
|
return Vec128<int16_t, N>{
|
|
937
956
|
wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
|
|
938
957
|
}
|
|
958
|
+
template <size_t N>
|
|
959
|
+
HWY_API Vec128<uint32_t, N> MulHigh(const Vec128<uint32_t, N> a,
|
|
960
|
+
const Vec128<uint32_t, N> b) {
|
|
961
|
+
const auto l = wasm_u64x2_extmul_low_u32x4(a.raw, b.raw);
|
|
962
|
+
const auto h = wasm_u64x2_extmul_high_u32x4(a.raw, b.raw);
|
|
963
|
+
// TODO(eustas): shift-right + narrow?
|
|
964
|
+
return Vec128<uint32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
|
|
965
|
+
}
|
|
966
|
+
template <size_t N>
|
|
967
|
+
HWY_API Vec128<int32_t, N> MulHigh(const Vec128<int32_t, N> a,
|
|
968
|
+
const Vec128<int32_t, N> b) {
|
|
969
|
+
const auto l = wasm_i64x2_extmul_low_i32x4(a.raw, b.raw);
|
|
970
|
+
const auto h = wasm_i64x2_extmul_high_i32x4(a.raw, b.raw);
|
|
971
|
+
// TODO(eustas): shift-right + narrow?
|
|
972
|
+
return Vec128<int32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
|
|
973
|
+
}
|
|
939
974
|
|
|
940
975
|
template <size_t N>
|
|
941
976
|
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
|
|
@@ -1622,13 +1657,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
|
1622
1657
|
return IfThenElse(MaskFromVec(v), yes, no);
|
|
1623
1658
|
}
|
|
1624
1659
|
|
|
1625
|
-
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
1626
|
-
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
|
|
1627
|
-
const DFromV<decltype(v)> d;
|
|
1628
|
-
const auto zero = Zero(d);
|
|
1629
|
-
return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
|
|
1630
|
-
}
|
|
1631
|
-
|
|
1632
1660
|
// ------------------------------ Mask logical
|
|
1633
1661
|
|
|
1634
1662
|
template <typename T, size_t N>
|
|
@@ -3806,6 +3834,50 @@ HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
|
|
|
3806
3834
|
return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
|
|
3807
3835
|
}
|
|
3808
3836
|
|
|
3837
|
+
// ------------------------------ InterleaveEven
|
|
3838
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
3839
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3840
|
+
return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 2, 18, 4, 20, 6, 22,
|
|
3841
|
+
8, 24, 10, 26, 12, 28, 14, 30)};
|
|
3842
|
+
}
|
|
3843
|
+
|
|
3844
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
3845
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3846
|
+
return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 2, 10, 4, 12, 6, 14)};
|
|
3847
|
+
}
|
|
3848
|
+
|
|
3849
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
|
|
3850
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3851
|
+
return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 2, 6)};
|
|
3852
|
+
}
|
|
3853
|
+
|
|
3854
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
|
|
3855
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3856
|
+
return InterleaveLower(a, b);
|
|
3857
|
+
}
|
|
3858
|
+
|
|
3859
|
+
// ------------------------------ InterleaveOdd
|
|
3860
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
|
|
3861
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3862
|
+
return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 17, 3, 19, 5, 21, 7, 23,
|
|
3863
|
+
9, 25, 11, 27, 13, 29, 15, 31)};
|
|
3864
|
+
}
|
|
3865
|
+
|
|
3866
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
|
|
3867
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3868
|
+
return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 9, 3, 11, 5, 13, 7, 15)};
|
|
3869
|
+
}
|
|
3870
|
+
|
|
3871
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
|
|
3872
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
3873
|
+
return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 5, 3, 7)};
|
|
3874
|
+
}
|
|
3875
|
+
|
|
3876
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
|
|
3877
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
3878
|
+
return InterleaveUpper(d, a, b);
|
|
3879
|
+
}
|
|
3880
|
+
|
|
3809
3881
|
// ------------------------------ OddEvenBlocks
|
|
3810
3882
|
template <typename T, size_t N>
|
|
3811
3883
|
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
@@ -4082,6 +4154,9 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
|
|
|
4082
4154
|
return PromoteTo(d, UpperHalf(dh, v));
|
|
4083
4155
|
}
|
|
4084
4156
|
|
|
4157
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
4158
|
+
#include "hwy/ops/inside-inl.h"
|
|
4159
|
+
|
|
4085
4160
|
// ------------------------------ Demotions (full -> part w/ narrow lanes)
|
|
4086
4161
|
|
|
4087
4162
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
|
|
@@ -4131,15 +4206,6 @@ HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) {
|
|
|
4131
4206
|
return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF))));
|
|
4132
4207
|
}
|
|
4133
4208
|
|
|
4134
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
4135
|
-
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
|
|
4136
|
-
const Rebind<int32_t, decltype(dbf16)> di32;
|
|
4137
|
-
const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
|
|
4138
|
-
const Rebind<uint16_t, decltype(dbf16)> du16;
|
|
4139
|
-
const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
|
|
4140
|
-
return BitCast(dbf16, DemoteTo(du16, bits_in_32));
|
|
4141
|
-
}
|
|
4142
|
-
|
|
4143
4209
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
|
|
4144
4210
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
|
|
4145
4211
|
return VFromD<D>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
|
|
@@ -4210,15 +4276,6 @@ HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
|
|
|
4210
4276
|
return DemoteTo(df32, adj_f64_val);
|
|
4211
4277
|
}
|
|
4212
4278
|
|
|
4213
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D),
|
|
4214
|
-
class V32 = VFromD<Repartition<float, D>>>
|
|
4215
|
-
HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
|
|
4216
|
-
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4217
|
-
const Repartition<uint32_t, decltype(dbf16)> du32;
|
|
4218
|
-
const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
|
|
4219
|
-
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
|
4220
|
-
}
|
|
4221
|
-
|
|
4222
4279
|
// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
|
|
4223
4280
|
// above 2*N.
|
|
4224
4281
|
template <class D, HWY_IF_I16_D(D)>
|
|
@@ -4565,12 +4622,6 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
|
4565
4622
|
return ReorderDemote2To(d, a, b);
|
|
4566
4623
|
}
|
|
4567
4624
|
|
|
4568
|
-
template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
|
|
4569
|
-
HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
|
|
4570
|
-
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
4571
|
-
return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
|
|
4572
|
-
}
|
|
4573
|
-
|
|
4574
4625
|
// ------------------------------ ConvertTo
|
|
4575
4626
|
|
|
4576
4627
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
|
|
@@ -5723,59 +5774,47 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
|
|
|
5723
5774
|
|
|
5724
5775
|
// ------------------------------ MulEven/Odd (Load)
|
|
5725
5776
|
|
|
5726
|
-
|
|
5727
|
-
|
|
5728
|
-
alignas(16)
|
|
5729
|
-
mul[0] =
|
|
5730
|
-
|
|
5731
|
-
|
|
5732
|
-
|
|
5777
|
+
template <class T, HWY_IF_UI64(T)>
|
|
5778
|
+
HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
|
|
5779
|
+
alignas(16) T mul[2];
|
|
5780
|
+
mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 0)),
|
|
5781
|
+
static_cast<T>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
|
|
5782
|
+
return Load(Full128<T>(), mul);
|
|
5783
|
+
}
|
|
5784
|
+
|
|
5785
|
+
template <class T, HWY_IF_UI64(T)>
|
|
5786
|
+
HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
|
|
5787
|
+
alignas(16) T mul[2];
|
|
5788
|
+
mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 1)),
|
|
5789
|
+
static_cast<T>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
|
|
5790
|
+
return Load(Full128<T>(), mul);
|
|
5733
5791
|
}
|
|
5734
5792
|
|
|
5735
|
-
|
|
5736
|
-
|
|
5737
|
-
|
|
5738
|
-
|
|
5739
|
-
|
|
5740
|
-
|
|
5741
|
-
return Load(Full128<uint64_t>(), mul);
|
|
5793
|
+
// ------------------------------ I64/U64 MulHigh (GetLane)
|
|
5794
|
+
template <class T, HWY_IF_UI64(T)>
|
|
5795
|
+
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
|
|
5796
|
+
T hi;
|
|
5797
|
+
Mul128(GetLane(a), GetLane(b), &hi);
|
|
5798
|
+
return Set(Full64<T>(), hi);
|
|
5742
5799
|
}
|
|
5743
5800
|
|
|
5744
|
-
|
|
5801
|
+
template <class T, HWY_IF_UI64(T)>
|
|
5802
|
+
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
|
|
5803
|
+
T hi_0;
|
|
5804
|
+
T hi_1;
|
|
5805
|
+
Mul128(GetLane(a), GetLane(b), &hi_0);
|
|
5806
|
+
Mul128(detail::ExtractLane<1>(a), detail::ExtractLane<1>(b), &hi_1);
|
|
5807
|
+
return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
|
|
5808
|
+
}
|
|
5809
|
+
|
|
5810
|
+
// ------------------------------ WidenMulPairwiseAdd (MulAdd, PromoteEvenTo)
|
|
5745
5811
|
|
|
5746
5812
|
// Generic for all vector lengths.
|
|
5747
|
-
template <class
|
|
5748
|
-
class
|
|
5749
|
-
HWY_API VFromD<
|
|
5750
|
-
|
|
5751
|
-
|
|
5752
|
-
const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
|
|
5753
|
-
// Using shift/and instead of Zip leads to the odd/even order that
|
|
5754
|
-
// RearrangeToOddPlusEven prefers.
|
|
5755
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
5756
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
5757
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
5758
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
5759
|
-
return Mul(BitCast(df32, ae), BitCast(df32, be)) +
|
|
5760
|
-
Mul(BitCast(df32, ao), BitCast(df32, bo));
|
|
5761
|
-
}
|
|
5762
|
-
|
|
5763
|
-
template <class D32, HWY_IF_F32_D(D32),
|
|
5764
|
-
class V16 = VFromD<Repartition<bfloat16_t, D32>>>
|
|
5765
|
-
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
|
|
5766
|
-
const VFromD<D32> sum0,
|
|
5767
|
-
VFromD<D32>& sum1) {
|
|
5768
|
-
const Rebind<uint32_t, decltype(df32)> du32;
|
|
5769
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
5770
|
-
const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
|
|
5771
|
-
// Using shift/and instead of Zip leads to the odd/even order that
|
|
5772
|
-
// RearrangeToOddPlusEven prefers.
|
|
5773
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
5774
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
5775
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
5776
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
5777
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
5778
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
5813
|
+
template <class DF, HWY_IF_F32_D(DF),
|
|
5814
|
+
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
|
|
5815
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
|
|
5816
|
+
return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
|
|
5817
|
+
Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
|
|
5779
5818
|
}
|
|
5780
5819
|
|
|
5781
5820
|
// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
|
|
@@ -5789,35 +5828,18 @@ HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
|
|
|
5789
5828
|
template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
|
|
5790
5829
|
class VU16 = VFromD<RepartitionToNarrow<DU32>>>
|
|
5791
5830
|
HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) {
|
|
5792
|
-
|
|
5793
|
-
|
|
5794
|
-
const auto a0 = And(BitCast(du32, a), lo16_mask);
|
|
5795
|
-
const auto b0 = And(BitCast(du32, b), lo16_mask);
|
|
5796
|
-
|
|
5797
|
-
const auto a1 = ShiftRight<16>(BitCast(du32, a));
|
|
5798
|
-
const auto b1 = ShiftRight<16>(BitCast(du32, b));
|
|
5799
|
-
|
|
5800
|
-
return MulAdd(a1, b1, a0 * b0);
|
|
5831
|
+
return MulAdd(PromoteEvenTo(du32, a), PromoteEvenTo(du32, b),
|
|
5832
|
+
Mul(PromoteOddTo(du32, a), PromoteOddTo(du32, b)));
|
|
5801
5833
|
}
|
|
5802
5834
|
|
|
5803
|
-
//
|
|
5804
|
-
|
|
5805
|
-
template <class D32,
|
|
5835
|
+
// ------------------------------ ReorderWidenMulAccumulate
|
|
5836
|
+
|
|
5837
|
+
template <class D32, HWY_IF_UI32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
|
|
5806
5838
|
class V16 = VFromD<RepartitionToNarrow<D32>>>
|
|
5807
|
-
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32
|
|
5839
|
+
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d32, V16 a, V16 b,
|
|
5808
5840
|
const VFromD<D32> sum0,
|
|
5809
5841
|
VFromD<D32>& /*sum1*/) {
|
|
5810
|
-
return sum0 + WidenMulPairwiseAdd(
|
|
5811
|
-
}
|
|
5812
|
-
|
|
5813
|
-
// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
|
|
5814
|
-
// safe.
|
|
5815
|
-
template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
|
|
5816
|
-
class VU16 = VFromD<RepartitionToNarrow<DU32>>>
|
|
5817
|
-
HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b,
|
|
5818
|
-
const VFromD<DU32> sum0,
|
|
5819
|
-
VFromD<DU32>& /*sum1*/) {
|
|
5820
|
-
return sum0 + WidenMulPairwiseAdd(d, a, b);
|
|
5842
|
+
return sum0 + WidenMulPairwiseAdd(d32, a, b);
|
|
5821
5843
|
}
|
|
5822
5844
|
|
|
5823
5845
|
// ------------------------------ RearrangeToOddPlusEven
|