@img/sharp-libvips-dev 1.0.2 → 1.0.3

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -26,6 +26,7 @@
26
26
  #endif
27
27
 
28
28
  #include "hwy/detect_compiler_arch.h"
29
+ #include "hwy/detect_targets.h"
29
30
 
30
31
  // Separate header because foreach_target.h re-enables its include guard.
31
32
  #include "hwy/ops/set_macros-inl.h"
@@ -61,6 +62,10 @@ namespace HWY_NAMESPACE {
61
62
  // We therefore pass by const& only on GCC and (Windows or aarch64). This alias
62
63
  // must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
63
64
  // and possibly also other functions that are not inlined.
65
+ //
66
+ // Even better is to avoid passing vector arguments to non-inlined functions,
67
+ // because the SVE and RISC-V ABIs are still works in progress and may lead to
68
+ // incorrect codegen.
64
69
  #if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
65
70
  template <class V>
66
71
  using VecArg = const V&;
@@ -529,6 +534,8 @@ HWY_API bool IsAligned(D d, T* ptr) {
529
534
 
530
535
  // Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
531
536
  #define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
537
+ #define HWY_IF_NOT_UNSIGNED_D(D) \
538
+ HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
532
539
  #define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
533
540
  #define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
534
541
  #define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
@@ -609,6 +616,8 @@ HWY_API bool IsAligned(D d, T* ptr) {
609
616
 
610
617
  // Same, but with a vector argument. ops/*-inl.h define their own TFromV.
611
618
  #define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
619
+ #define HWY_IF_NOT_UNSIGNED_V(V) \
620
+ HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
612
621
  #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
613
622
  #define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
614
623
  #define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
@@ -646,6 +655,20 @@ HWY_API bool IsAligned(D d, T* ptr) {
646
655
  #undef HWY_IF_MINMAX_OF_LANES_D
647
656
  #define HWY_IF_MINMAX_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)
648
657
 
658
+ #undef HWY_IF_ADDSUB_V
659
+ #define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
660
+
661
+ #undef HWY_IF_MULADDSUB_V
662
+ #define HWY_IF_MULADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
663
+
664
+ // HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V is used to disable the default
665
+ // implementation of unsigned to signed DemoteTo/ReorderDemote2To in
666
+ // generic_ops-inl.h for at least some of the unsigned to signed demotions on
667
+ // SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2
668
+
669
+ #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
670
+ #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) void* = nullptr
671
+
649
672
  // Old names (deprecated)
650
673
  #define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes)
651
674
  #define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes)
@@ -154,9 +154,6 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
154
154
  template <class D>
155
155
  using VFromD = decltype(Zero(D()));
156
156
 
157
- // ------------------------------ Tuple (VFromD)
158
- #include "hwy/ops/tuple-inl.h"
159
-
160
157
  // ------------------------------ BitCast
161
158
 
162
159
  namespace detail {
@@ -654,12 +651,16 @@ HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
654
651
  }
655
652
 
656
653
  // ------------------------------ RotateRight (ShiftRight, Or)
657
- template <int kBits, typename T, size_t N>
654
+ template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
658
655
  HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
656
+ const DFromV<decltype(v)> d;
657
+ const RebindToUnsigned<decltype(d)> du;
658
+
659
659
  constexpr size_t kSizeInBits = sizeof(T) * 8;
660
660
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
661
+
661
662
  if (kBits == 0) return v;
662
- return Or(ShiftRight<kBits>(v),
663
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
663
664
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
664
665
  }
665
666
 
@@ -917,7 +918,25 @@ HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
917
918
  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
918
919
  }
919
920
 
920
- // Returns the upper 16 bits of a * b in each lane.
921
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
922
+ template <size_t N>
923
+ HWY_API Vec128<uint8_t, N> MulHigh(const Vec128<uint8_t, N> a,
924
+ const Vec128<uint8_t, N> b) {
925
+ const auto l = wasm_u16x8_extmul_low_u8x16(a.raw, b.raw);
926
+ const auto h = wasm_u16x8_extmul_high_u8x16(a.raw, b.raw);
927
+ // TODO(eustas): shift-right + narrow?
928
+ return Vec128<uint8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
929
+ 17, 19, 21, 23, 25, 27, 29, 31)};
930
+ }
931
+ template <size_t N>
932
+ HWY_API Vec128<int8_t, N> MulHigh(const Vec128<int8_t, N> a,
933
+ const Vec128<int8_t, N> b) {
934
+ const auto l = wasm_i16x8_extmul_low_i8x16(a.raw, b.raw);
935
+ const auto h = wasm_i16x8_extmul_high_i8x16(a.raw, b.raw);
936
+ // TODO(eustas): shift-right + narrow?
937
+ return Vec128<int8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
938
+ 17, 19, 21, 23, 25, 27, 29, 31)};
939
+ }
921
940
  template <size_t N>
922
941
  HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
923
942
  const Vec128<uint16_t, N> b) {
@@ -936,6 +955,22 @@ HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
936
955
  return Vec128<int16_t, N>{
937
956
  wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
938
957
  }
958
+ template <size_t N>
959
+ HWY_API Vec128<uint32_t, N> MulHigh(const Vec128<uint32_t, N> a,
960
+ const Vec128<uint32_t, N> b) {
961
+ const auto l = wasm_u64x2_extmul_low_u32x4(a.raw, b.raw);
962
+ const auto h = wasm_u64x2_extmul_high_u32x4(a.raw, b.raw);
963
+ // TODO(eustas): shift-right + narrow?
964
+ return Vec128<uint32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
965
+ }
966
+ template <size_t N>
967
+ HWY_API Vec128<int32_t, N> MulHigh(const Vec128<int32_t, N> a,
968
+ const Vec128<int32_t, N> b) {
969
+ const auto l = wasm_i64x2_extmul_low_i32x4(a.raw, b.raw);
970
+ const auto h = wasm_i64x2_extmul_high_i32x4(a.raw, b.raw);
971
+ // TODO(eustas): shift-right + narrow?
972
+ return Vec128<int32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
973
+ }
939
974
 
940
975
  template <size_t N>
941
976
  HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
@@ -1622,13 +1657,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
1622
1657
  return IfThenElse(MaskFromVec(v), yes, no);
1623
1658
  }
1624
1659
 
1625
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
1626
- HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
1627
- const DFromV<decltype(v)> d;
1628
- const auto zero = Zero(d);
1629
- return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
1630
- }
1631
-
1632
1660
  // ------------------------------ Mask logical
1633
1661
 
1634
1662
  template <typename T, size_t N>
@@ -3806,6 +3834,50 @@ HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
3806
3834
  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
3807
3835
  }
3808
3836
 
3837
+ // ------------------------------ InterleaveEven
3838
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3839
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3840
+ return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 2, 18, 4, 20, 6, 22,
3841
+ 8, 24, 10, 26, 12, 28, 14, 30)};
3842
+ }
3843
+
3844
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3845
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3846
+ return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 2, 10, 4, 12, 6, 14)};
3847
+ }
3848
+
3849
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
3850
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3851
+ return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 2, 6)};
3852
+ }
3853
+
3854
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
3855
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3856
+ return InterleaveLower(a, b);
3857
+ }
3858
+
3859
+ // ------------------------------ InterleaveOdd
3860
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
3861
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3862
+ return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 17, 3, 19, 5, 21, 7, 23,
3863
+ 9, 25, 11, 27, 13, 29, 15, 31)};
3864
+ }
3865
+
3866
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
3867
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3868
+ return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 9, 3, 11, 5, 13, 7, 15)};
3869
+ }
3870
+
3871
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
3872
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3873
+ return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 5, 3, 7)};
3874
+ }
3875
+
3876
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
3877
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
3878
+ return InterleaveUpper(d, a, b);
3879
+ }
3880
+
3809
3881
  // ------------------------------ OddEvenBlocks
3810
3882
  template <typename T, size_t N>
3811
3883
  HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
@@ -4082,6 +4154,9 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
4082
4154
  return PromoteTo(d, UpperHalf(dh, v));
4083
4155
  }
4084
4156
 
4157
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
4158
+ #include "hwy/ops/inside-inl.h"
4159
+
4085
4160
  // ------------------------------ Demotions (full -> part w/ narrow lanes)
4086
4161
 
4087
4162
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
@@ -4131,15 +4206,6 @@ HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) {
4131
4206
  return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF))));
4132
4207
  }
4133
4208
 
4134
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
4135
- HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
4136
- const Rebind<int32_t, decltype(dbf16)> di32;
4137
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4138
- const Rebind<uint16_t, decltype(dbf16)> du16;
4139
- const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4140
- return BitCast(dbf16, DemoteTo(du16, bits_in_32));
4141
- }
4142
-
4143
4209
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
4144
4210
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
4145
4211
  return VFromD<D>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
@@ -4210,15 +4276,6 @@ HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
4210
4276
  return DemoteTo(df32, adj_f64_val);
4211
4277
  }
4212
4278
 
4213
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D),
4214
- class V32 = VFromD<Repartition<float, D>>>
4215
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
4216
- const RebindToUnsigned<decltype(dbf16)> du16;
4217
- const Repartition<uint32_t, decltype(dbf16)> du32;
4218
- const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
4219
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4220
- }
4221
-
4222
4279
  // Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
4223
4280
  // above 2*N.
4224
4281
  template <class D, HWY_IF_I16_D(D)>
@@ -4565,12 +4622,6 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
4565
4622
  return ReorderDemote2To(d, a, b);
4566
4623
  }
4567
4624
 
4568
- template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
4569
- HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
4570
- const RebindToUnsigned<decltype(dbf16)> du16;
4571
- return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
4572
- }
4573
-
4574
4625
  // ------------------------------ ConvertTo
4575
4626
 
4576
4627
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
@@ -5723,59 +5774,47 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
5723
5774
 
5724
5775
  // ------------------------------ MulEven/Odd (Load)
5725
5776
 
5726
- HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
5727
- const Vec128<uint64_t> b) {
5728
- alignas(16) uint64_t mul[2];
5729
- mul[0] =
5730
- Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
5731
- static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
5732
- return Load(Full128<uint64_t>(), mul);
5777
+ template <class T, HWY_IF_UI64(T)>
5778
+ HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
5779
+ alignas(16) T mul[2];
5780
+ mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 0)),
5781
+ static_cast<T>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
5782
+ return Load(Full128<T>(), mul);
5783
+ }
5784
+
5785
+ template <class T, HWY_IF_UI64(T)>
5786
+ HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
5787
+ alignas(16) T mul[2];
5788
+ mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 1)),
5789
+ static_cast<T>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
5790
+ return Load(Full128<T>(), mul);
5733
5791
  }
5734
5792
 
5735
- HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
5736
- const Vec128<uint64_t> b) {
5737
- alignas(16) uint64_t mul[2];
5738
- mul[0] =
5739
- Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
5740
- static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
5741
- return Load(Full128<uint64_t>(), mul);
5793
+ // ------------------------------ I64/U64 MulHigh (GetLane)
5794
+ template <class T, HWY_IF_UI64(T)>
5795
+ HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
5796
+ T hi;
5797
+ Mul128(GetLane(a), GetLane(b), &hi);
5798
+ return Set(Full64<T>(), hi);
5742
5799
  }
5743
5800
 
5744
- // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
5801
+ template <class T, HWY_IF_UI64(T)>
5802
+ HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
5803
+ T hi_0;
5804
+ T hi_1;
5805
+ Mul128(GetLane(a), GetLane(b), &hi_0);
5806
+ Mul128(detail::ExtractLane<1>(a), detail::ExtractLane<1>(b), &hi_1);
5807
+ return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
5808
+ }
5809
+
5810
+ // ------------------------------ WidenMulPairwiseAdd (MulAdd, PromoteEvenTo)
5745
5811
 
5746
5812
  // Generic for all vector lengths.
5747
- template <class D32, HWY_IF_F32_D(D32),
5748
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
5749
- HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
5750
- const Rebind<uint32_t, decltype(df32)> du32;
5751
- using VU32 = VFromD<decltype(du32)>;
5752
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
5753
- // Using shift/and instead of Zip leads to the odd/even order that
5754
- // RearrangeToOddPlusEven prefers.
5755
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5756
- const VU32 ao = And(BitCast(du32, a), odd);
5757
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5758
- const VU32 bo = And(BitCast(du32, b), odd);
5759
- return Mul(BitCast(df32, ae), BitCast(df32, be)) +
5760
- Mul(BitCast(df32, ao), BitCast(df32, bo));
5761
- }
5762
-
5763
- template <class D32, HWY_IF_F32_D(D32),
5764
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
5765
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
5766
- const VFromD<D32> sum0,
5767
- VFromD<D32>& sum1) {
5768
- const Rebind<uint32_t, decltype(df32)> du32;
5769
- using VU32 = VFromD<decltype(du32)>;
5770
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
5771
- // Using shift/and instead of Zip leads to the odd/even order that
5772
- // RearrangeToOddPlusEven prefers.
5773
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5774
- const VU32 ao = And(BitCast(du32, a), odd);
5775
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5776
- const VU32 bo = And(BitCast(du32, b), odd);
5777
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
5778
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
5813
+ template <class DF, HWY_IF_F32_D(DF),
5814
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
5815
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
5816
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
5817
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
5779
5818
  }
5780
5819
 
5781
5820
  // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
@@ -5789,35 +5828,18 @@ HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
5789
5828
  template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
5790
5829
  class VU16 = VFromD<RepartitionToNarrow<DU32>>>
5791
5830
  HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) {
5792
- const auto lo16_mask = Set(du32, 0x0000FFFFu);
5793
-
5794
- const auto a0 = And(BitCast(du32, a), lo16_mask);
5795
- const auto b0 = And(BitCast(du32, b), lo16_mask);
5796
-
5797
- const auto a1 = ShiftRight<16>(BitCast(du32, a));
5798
- const auto b1 = ShiftRight<16>(BitCast(du32, b));
5799
-
5800
- return MulAdd(a1, b1, a0 * b0);
5831
+ return MulAdd(PromoteEvenTo(du32, a), PromoteEvenTo(du32, b),
5832
+ Mul(PromoteOddTo(du32, a), PromoteOddTo(du32, b)));
5801
5833
  }
5802
5834
 
5803
- // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
5804
- // safe.
5805
- template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
5835
+ // ------------------------------ ReorderWidenMulAccumulate
5836
+
5837
+ template <class D32, HWY_IF_UI32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
5806
5838
  class V16 = VFromD<RepartitionToNarrow<D32>>>
5807
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b,
5839
+ HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d32, V16 a, V16 b,
5808
5840
  const VFromD<D32> sum0,
5809
5841
  VFromD<D32>& /*sum1*/) {
5810
- return sum0 + WidenMulPairwiseAdd(d, a, b);
5811
- }
5812
-
5813
- // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
5814
- // safe.
5815
- template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
5816
- class VU16 = VFromD<RepartitionToNarrow<DU32>>>
5817
- HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b,
5818
- const VFromD<DU32> sum0,
5819
- VFromD<DU32>& /*sum1*/) {
5820
- return sum0 + WidenMulPairwiseAdd(d, a, b);
5842
+ return sum0 + WidenMulPairwiseAdd(d32, a, b);
5821
5843
  }
5822
5844
 
5823
5845
  // ------------------------------ RearrangeToOddPlusEven