@img/sharp-libvips-dev 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -43,12 +43,23 @@
43
43
 
44
44
  // HWY_SVE_HAVE_BF16_VEC is defined to 1 if the SVE svbfloat16_t vector type
45
45
  // is supported, even if HWY_SVE_HAVE_BF16_FEATURE (= intrinsics) is 0.
46
- #if HWY_SVE_HAVE_BF16_FEATURE || HWY_COMPILER_GCC_ACTUAL >= 1000
46
+ #if HWY_SVE_HAVE_BF16_FEATURE || \
47
+ (HWY_COMPILER_CLANG >= 1200 && defined(__ARM_FEATURE_SVE_BF16)) || \
48
+ HWY_COMPILER_GCC_ACTUAL >= 1000
47
49
  #define HWY_SVE_HAVE_BF16_VEC 1
48
50
  #else
49
51
  #define HWY_SVE_HAVE_BF16_VEC 0
50
52
  #endif
51
53
 
54
+ // HWY_SVE_HAVE_F32_TO_BF16C is defined to 1 if the SVE svcvt_bf16_f32_x
55
+ // and svcvtnt_bf16_f32_x intrinsics are available, even if the __bf16 type
56
+ // is disabled
57
+ #if HWY_SVE_HAVE_BF16_VEC && defined(__ARM_FEATURE_SVE_BF16)
58
+ #define HWY_SVE_HAVE_F32_TO_BF16C 1
59
+ #else
60
+ #define HWY_SVE_HAVE_F32_TO_BF16C 0
61
+ #endif
62
+
52
63
  HWY_BEFORE_NAMESPACE();
53
64
  namespace hwy {
54
65
  namespace HWY_NAMESPACE {
@@ -99,7 +110,13 @@ namespace detail { // for code folding
99
110
  #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \
100
111
  HWY_SVE_FOREACH_BF16_UNCONDITIONAL(X_MACRO, NAME, OP)
101
112
  // We have both f16 and bf16, so nothing is emulated.
102
- #define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<false>* = nullptr
113
+
114
+ // NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
115
+ // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
116
+ // !hwy::IsSame<D, D>() is always false and as !hwy::IsSame<D, D>() will cause
117
+ // SFINAE to occur instead of a hard error due to a dependency on the D template
118
+ // argument
119
+ #define HWY_SVE_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
103
120
  #define HWY_SVE_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
104
121
  #else
105
122
  #define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP)
@@ -302,7 +319,9 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
302
319
  return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit)); \
303
320
  }
304
321
  HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
305
- HWY_SVE_FOREACH_BF16(HWY_SVE_FIRSTN, FirstN, whilelt)
322
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
323
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_FIRSTN, FirstN, whilelt)
324
+ #endif
306
325
 
307
326
  template <class D, HWY_SVE_IF_EMULATED_D(D)>
308
327
  svbool_t FirstN(D /* tag */, size_t count) {
@@ -327,7 +346,7 @@ namespace detail {
327
346
  }
328
347
 
329
348
  HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) // return all-true
330
- HWY_SVE_FOREACH_BF16(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)
349
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)
331
350
  #undef HWY_SVE_WRAP_PTRUE
332
351
 
333
352
  HWY_API svbool_t PFalse() { return svpfalse_b(); }
@@ -433,29 +452,24 @@ HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
433
452
  HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
434
453
  HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)
435
454
 
436
- #undef HWY_SVE_CAST_NOP
437
- #undef HWY_SVE_CAST
438
-
455
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
456
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CAST, _, reinterpret)
457
+ #else // !(HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC)
439
458
  template <class V, HWY_SVE_IF_EMULATED_D(DFromV<V>)>
440
459
  HWY_INLINE svuint8_t BitCastToByte(V v) {
441
- #if HWY_SVE_HAVE_BF16_VEC
442
- return svreinterpret_u8_bf16(v);
443
- #else
444
460
  const RebindToUnsigned<DFromV<V>> du;
445
461
  return BitCastToByte(BitCast(du, v));
446
- #endif
447
462
  }
448
463
 
449
464
  template <class D, HWY_SVE_IF_EMULATED_D(D)>
450
465
  HWY_INLINE VFromD<D> BitCastFromByte(D d, svuint8_t v) {
451
- #if HWY_SVE_HAVE_BF16_VEC
452
- (void)d;
453
- return svreinterpret_bf16_u8(v);
454
- #else
455
466
  const RebindToUnsigned<decltype(d)> du;
456
467
  return BitCastFromByte(du, v);
457
- #endif
458
468
  }
469
+ #endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
470
+
471
+ #undef HWY_SVE_CAST_NOP
472
+ #undef HWY_SVE_CAST
459
473
 
460
474
  } // namespace detail
461
475
 
@@ -474,6 +488,9 @@ HWY_API VFromD<D> BitCast(D d, FromV v) {
474
488
  }
475
489
 
476
490
  HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
491
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
492
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_UNDEFINED, Undefined, undef)
493
+ #endif
477
494
 
478
495
  template <class D, HWY_SVE_IF_EMULATED_D(D)>
479
496
  VFromD<D> Undefined(D d) {
@@ -506,7 +523,9 @@ VFromD<D> Undefined(D d) {
506
523
  }
507
524
 
508
525
  HWY_SVE_FOREACH(HWY_SVE_CREATE, Create, create)
509
- HWY_SVE_FOREACH_BF16(HWY_SVE_CREATE, Create, create)
526
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
527
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CREATE, Create, create)
528
+ #endif
510
529
  #undef HWY_SVE_CREATE
511
530
 
512
531
  template <class D>
@@ -531,7 +550,9 @@ using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D())));
531
550
  }
532
551
 
533
552
  HWY_SVE_FOREACH(HWY_SVE_GET, Get, get)
534
- HWY_SVE_FOREACH_BF16(HWY_SVE_GET, Get, get)
553
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
554
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_GET, Get, get)
555
+ #endif
535
556
  #undef HWY_SVE_GET
536
557
 
537
558
  #define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
@@ -552,7 +573,9 @@ HWY_SVE_FOREACH_BF16(HWY_SVE_GET, Get, get)
552
573
  }
553
574
 
554
575
  HWY_SVE_FOREACH(HWY_SVE_SET, Set, set)
555
- HWY_SVE_FOREACH_BF16(HWY_SVE_SET, Set, set)
576
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
577
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_SET, Set, set)
578
+ #endif
556
579
  #undef HWY_SVE_SET
557
580
 
558
581
  // ------------------------------ ResizeBitCast
@@ -613,10 +636,14 @@ HWY_API svfloat16_t Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
613
636
  return svdupq_n_f16(t0, t1, t2, t3, t4, t5, t6, t7);
614
637
  }
615
638
 
616
- template <class D, HWY_SVE_IF_EMULATED_D(D)>
639
+ template <class D, HWY_IF_BF16_D(D)>
617
640
  HWY_API VBF16 Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, TFromD<D> t2,
618
641
  TFromD<D> t3, TFromD<D> t4, TFromD<D> t5,
619
642
  TFromD<D> t6, TFromD<D> t7) {
643
+ #if HWY_SVE_HAVE_BF16_FEATURE
644
+ (void)d;
645
+ return svdupq_n_bf16(t0, t1, t2, t3, t4, t5, t6, t7);
646
+ #else
620
647
  const RebindToUnsigned<decltype(d)> du;
621
648
  return BitCast(
622
649
  d, Dup128VecFromValues(
@@ -624,6 +651,7 @@ HWY_API VBF16 Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, TFromD<D> t2,
624
651
  BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
625
652
  BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
626
653
  BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
654
+ #endif
627
655
  }
628
656
 
629
657
  template <class D, HWY_IF_I32_D(D)>
@@ -683,6 +711,10 @@ HWY_API V And(const V a, const V b) {
683
711
 
684
712
  // ------------------------------ Or
685
713
 
714
+ namespace detail {
715
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, OrN, orr_n)
716
+ } // namespace detail
717
+
686
718
  HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)
687
719
 
688
720
  template <class V, HWY_IF_FLOAT_V(V)>
@@ -1012,14 +1044,15 @@ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)
1012
1044
 
1013
1045
  // ------------------------------ ShiftLeft[Same]
1014
1046
 
1015
- #define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
1016
- template <int kBits> \
1017
- HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1018
- return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
1019
- } \
1020
- HWY_API HWY_SVE_V(BASE, BITS) \
1021
- NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
1022
- return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \
1047
+ #define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
1048
+ template <int kBits> \
1049
+ HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1050
+ return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
1051
+ } \
1052
+ HWY_API HWY_SVE_V(BASE, BITS) \
1053
+ NAME##Same(HWY_SVE_V(BASE, BITS) v, int bits) { \
1054
+ return sv##OP##_##CHAR##BITS##_x( \
1055
+ HWY_SVE_PTRUE(BITS), v, static_cast<HWY_SVE_T(uint, BITS)>(bits)); \
1023
1056
  }
1024
1057
 
1025
1058
  HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)
@@ -1033,15 +1066,35 @@ HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)
1033
1066
 
1034
1067
  // ------------------------------ RotateRight
1035
1068
 
1036
- // TODO(janwas): svxar on SVE2
1037
- template <int kBits, class V>
1069
+ #if HWY_SVE_HAVE_2
1070
+
1071
+ #define HWY_SVE_ROTATE_RIGHT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
1072
+ template <int kBits> \
1073
+ HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1074
+ if (kBits == 0) return v; \
1075
+ return sv##OP##_##CHAR##BITS(v, Zero(DFromV<decltype(v)>()), \
1076
+ HWY_MAX(kBits, 1)); \
1077
+ }
1078
+
1079
+ HWY_SVE_FOREACH_U(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
1080
+ HWY_SVE_FOREACH_I(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
1081
+
1082
+ #undef HWY_SVE_ROTATE_RIGHT_N
1083
+
1084
+ #else // !HWY_SVE_HAVE_2
1085
+ template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
1038
1086
  HWY_API V RotateRight(const V v) {
1087
+ const DFromV<decltype(v)> d;
1088
+ const RebindToUnsigned<decltype(d)> du;
1089
+
1039
1090
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
1040
1091
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
1041
1092
  if (kBits == 0) return v;
1042
- return Or(ShiftRight<kBits>(v),
1093
+
1094
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
1043
1095
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
1044
1096
  }
1097
+ #endif
1045
1098
 
1046
1099
  // ------------------------------ Shl/r
1047
1100
 
@@ -1089,11 +1142,7 @@ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
1089
1142
  HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Mul, mul)
1090
1143
 
1091
1144
  // ------------------------------ MulHigh
1092
- HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
1093
- // Not part of API, used internally:
1094
- HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
1095
- HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
1096
- HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
1145
+ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
1097
1146
 
1098
1147
  // ------------------------------ MulFixedPoint15
1099
1148
  HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
@@ -1275,6 +1324,7 @@ HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
1275
1324
  }
1276
1325
 
1277
1326
  HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
1327
+ HWY_SVE_FOREACH_BF16(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
1278
1328
  #undef HWY_SVE_IF_THEN_ELSE
1279
1329
 
1280
1330
  template <class V, class D = DFromV<V>, HWY_SVE_IF_EMULATED_D(D)>
@@ -1562,6 +1612,22 @@ HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
1562
1612
  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
1563
1613
  }
1564
1614
 
1615
+ // ------------------------------ IsNegative (Lt)
1616
+ #ifdef HWY_NATIVE_IS_NEGATIVE
1617
+ #undef HWY_NATIVE_IS_NEGATIVE
1618
+ #else
1619
+ #define HWY_NATIVE_IS_NEGATIVE
1620
+ #endif
1621
+
1622
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
1623
+ HWY_API svbool_t IsNegative(V v) {
1624
+ const DFromV<decltype(v)> d;
1625
+ const RebindToSigned<decltype(d)> di;
1626
+ using TI = TFromD<decltype(di)>;
1627
+
1628
+ return detail::LtN(BitCast(di, v), static_cast<TI>(0));
1629
+ }
1630
+
1565
1631
  // ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)
1566
1632
 
1567
1633
  #if HWY_SVE_HAVE_2
@@ -2486,6 +2552,29 @@ HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svuint64_t v) {
2486
2552
  return TruncateTo(dn, vn);
2487
2553
  }
2488
2554
 
2555
+ // ------------------------------ Unsigned to signed demotions
2556
+
2557
+ // Disable the default unsigned to signed DemoteTo/ReorderDemote2To
2558
+ // implementations in generic_ops-inl.h on SVE/SVE2 as the SVE/SVE2 targets have
2559
+ // target-specific implementations of the unsigned to signed DemoteTo and
2560
+ // ReorderDemote2To ops
2561
+
2562
+ // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
2563
+ // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
2564
+ // !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
2565
+ // SFINAE to occur instead of a hard error due to a dependency on the V template
2566
+ // argument
2567
+ #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
2568
+ #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
2569
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
2570
+
2571
+ template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
2572
+ HWY_IF_T_SIZE_LE_D(D, sizeof(TFromV<V>) - 1)>
2573
+ HWY_API VFromD<D> DemoteTo(D dn, V v) {
2574
+ const RebindToUnsigned<D> dn_u;
2575
+ return BitCast(dn, TruncateTo(dn_u, detail::SaturateU<TFromD<D>>(v)));
2576
+ }
2577
+
2489
2578
  // ------------------------------ ConcatEven/ConcatOdd
2490
2579
 
2491
2580
  // WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
@@ -2499,14 +2588,22 @@ namespace detail {
2499
2588
  }
2500
2589
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
2501
2590
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
2502
- HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
2503
- HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
2591
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2592
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull,
2593
+ uzp1)
2594
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull,
2595
+ uzp2)
2596
+ #endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2504
2597
  #if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
2505
2598
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
2506
2599
  HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
2507
- HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
2508
- HWY_SVE_FOREACH_BF16(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
2509
- #endif
2600
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2601
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND,
2602
+ ConcatEvenBlocks, uzp1q)
2603
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks,
2604
+ uzp2q)
2605
+ #endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
2606
+ #endif // defined(__ARM_FEATURE_SVE_MATMUL_FP64)
2510
2607
  #undef HWY_SVE_CONCAT_EVERY_SECOND
2511
2608
 
2512
2609
  // Used to slide up / shift whole register left; mask indicates which range
@@ -2551,6 +2648,18 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
2551
2648
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
2552
2649
  }
2553
2650
 
2651
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
2652
+
2653
+ // Signed to signed PromoteEvenTo: 1 instruction instead of 2 in generic-inl.h.
2654
+ // Might as well also enable unsigned to unsigned, though it is just an And.
2655
+ namespace detail {
2656
+ HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, extb)
2657
+ HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, exth)
2658
+ HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGPV, NativePromoteEvenTo, extw)
2659
+ } // namespace detail
2660
+
2661
+ #include "hwy/ops/inside-inl.h"
2662
+
2554
2663
  // ------------------------------ DemoteTo F
2555
2664
 
2556
2665
  // We already toggled HWY_NATIVE_F16C above.
@@ -2576,10 +2685,46 @@ HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat64_t v) {
2576
2685
  in_even); // lower half
2577
2686
  }
2578
2687
 
2688
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
2689
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
2690
+ #else
2691
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
2692
+ #endif
2693
+
2694
+ #if !HWY_SVE_HAVE_F32_TO_BF16C
2695
+ namespace detail {
2696
+
2697
+ // Round a F32 value to the nearest BF16 value, with the result returned as the
2698
+ // rounded F32 value bitcasted to an U32
2699
+
2700
+ // RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
2701
+ // NaN F32 values from being converted to an infinity
2702
+ HWY_INLINE svuint32_t RoundF32ForDemoteToBF16(svfloat32_t v) {
2703
+ const DFromV<decltype(v)> df32;
2704
+ const RebindToUnsigned<decltype(df32)> du32;
2705
+
2706
+ const auto is_non_nan = Eq(v, v);
2707
+ const auto bits32 = BitCast(du32, v);
2708
+
2709
+ const auto round_incr =
2710
+ detail::AddN(detail::AndN(ShiftRight<16>(bits32), 1u), 0x7FFFu);
2711
+ return MaskedAddOr(detail::OrN(bits32, 0x00400000u), is_non_nan, bits32,
2712
+ round_incr);
2713
+ }
2714
+
2715
+ } // namespace detail
2716
+ #endif // !HWY_SVE_HAVE_F32_TO_BF16C
2717
+
2579
2718
  template <size_t N, int kPow2>
2580
2719
  HWY_API VBF16 DemoteTo(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t v) {
2581
- const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
2582
- return BitCast(dbf16, detail::ConcatOddFull(in_even, in_even)); // lower half
2720
+ #if HWY_SVE_HAVE_F32_TO_BF16C
2721
+ const VBF16 in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), v);
2722
+ return detail::ConcatEvenFull(in_even, in_even);
2723
+ #else
2724
+ const svuint16_t in_odd =
2725
+ BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(v));
2726
+ return BitCast(dbf16, detail::ConcatOddFull(in_odd, in_odd)); // lower half
2727
+ #endif
2583
2728
  }
2584
2729
 
2585
2730
  template <size_t N, int kPow2>
@@ -2620,32 +2765,31 @@ HWY_API svfloat32_t DemoteTo(Simd<float, N, kPow2> d, const svuint64_t v) {
2620
2765
  // ------------------------------ ConvertTo F
2621
2766
 
2622
2767
  #define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
2623
- /* signed integers */ \
2768
+ /* Float from signed */ \
2624
2769
  template <size_t N, int kPow2> \
2625
2770
  HWY_API HWY_SVE_V(BASE, BITS) \
2626
2771
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
2627
2772
  return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2628
2773
  } \
2629
- /* unsigned integers */ \
2774
+ /* Float from unsigned */ \
2630
2775
  template <size_t N, int kPow2> \
2631
2776
  HWY_API HWY_SVE_V(BASE, BITS) \
2632
2777
  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
2633
2778
  return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2634
2779
  } \
2635
- /* Truncates (rounds toward zero). */ \
2780
+ /* Signed from float, rounding toward zero */ \
2636
2781
  template <size_t N, int kPow2> \
2637
2782
  HWY_API HWY_SVE_V(int, BITS) \
2638
2783
  NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
2639
2784
  return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2640
2785
  } \
2641
- /* Truncates to unsigned (rounds toward zero). */ \
2786
+ /* Unsigned from float, rounding toward zero */ \
2642
2787
  template <size_t N, int kPow2> \
2643
2788
  HWY_API HWY_SVE_V(uint, BITS) \
2644
2789
  NAME(HWY_SVE_D(uint, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
2645
2790
  return sv##OP##_u##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
2646
2791
  }
2647
2792
 
2648
- // API only requires f32 but we provide f64 for use by Iota.
2649
2793
  HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
2650
2794
  #undef HWY_SVE_CONVERT
2651
2795
 
@@ -3124,13 +3268,15 @@ HWY_API TFromV<V> ExtractLane(V v, size_t i) {
3124
3268
  }
3125
3269
 
3126
3270
  // ------------------------------ InsertLane (IfThenElse)
3127
- template <class V>
3128
- HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
3271
+ template <class V, typename T>
3272
+ HWY_API V InsertLane(const V v, size_t i, T t) {
3273
+ static_assert(sizeof(TFromV<V>) == sizeof(T), "Lane size mismatch");
3129
3274
  const DFromV<V> d;
3130
3275
  const RebindToSigned<decltype(d)> di;
3131
3276
  using TI = TFromD<decltype(di)>;
3132
3277
  const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast<TI>(i));
3133
- return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
3278
+ return IfThenElse(RebindMask(d, is_i),
3279
+ Set(d, hwy::ConvertScalarTo<TFromV<V>>(t)), v);
3134
3280
  }
3135
3281
 
3136
3282
  // ------------------------------ DupEven
@@ -3185,6 +3331,18 @@ HWY_API V OddEven(const V odd, const V even) {
3185
3331
 
3186
3332
  #endif // HWY_TARGET
3187
3333
 
3334
+ // ------------------------------ InterleaveEven
3335
+ template <class D>
3336
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3337
+ return detail::InterleaveEven(a, b);
3338
+ }
3339
+
3340
+ // ------------------------------ InterleaveOdd
3341
+ template <class D>
3342
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3343
+ return detail::InterleaveOdd(a, b);
3344
+ }
3345
+
3188
3346
  // ------------------------------ OddEvenBlocks
3189
3347
  template <class V>
3190
3348
  HWY_API V OddEvenBlocks(const V odd, const V even) {
@@ -3239,7 +3397,9 @@ HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
3239
3397
  }
3240
3398
 
3241
3399
  HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
3242
- HWY_SVE_FOREACH_BF16(HWY_SVE_TABLE, TableLookupLanes, tbl)
3400
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3401
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE, TableLookupLanes, tbl)
3402
+ #endif
3243
3403
  #undef HWY_SVE_TABLE
3244
3404
 
3245
3405
  #if HWY_SVE_HAVE_2
@@ -3251,7 +3411,10 @@ namespace detail {
3251
3411
  }
3252
3412
 
3253
3413
  HWY_SVE_FOREACH(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
3254
- HWY_SVE_FOREACH_BF16(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2)
3414
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3415
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_TABLE2, NativeTwoTableLookupLanes,
3416
+ tbl2)
3417
+ #endif
3255
3418
  #undef HWY_SVE_TABLE
3256
3419
  } // namespace detail
3257
3420
  #endif // HWY_SVE_HAVE_2
@@ -3323,7 +3486,9 @@ namespace detail {
3323
3486
  }
3324
3487
 
3325
3488
  HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
3326
- HWY_SVE_FOREACH_BF16(HWY_SVE_REVERSE, ReverseFull, rev)
3489
+ #if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC
3490
+ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_REVERSE, ReverseFull, rev)
3491
+ #endif
3327
3492
  #undef HWY_SVE_REVERSE
3328
3493
 
3329
3494
  } // namespace detail
@@ -4077,6 +4242,95 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4077
4242
 
4078
4243
  // ================================================== Ops with dependencies
4079
4244
 
4245
+ // ------------------------------ AddSub (Reverse2)
4246
+
4247
+ // NOTE: svcadd_f*_x(HWY_SVE_PTRUE(BITS), a, b, 90) computes a[i] - b[i + 1] in
4248
+ // the even lanes and a[i] + b[i - 1] in the odd lanes.
4249
+
4250
+ #define HWY_SVE_ADDSUB_F(BASE, CHAR, BITS, HALF, NAME, OP) \
4251
+ HWY_API HWY_SVE_V(BASE, BITS) \
4252
+ NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
4253
+ const DFromV<decltype(b)> d; \
4254
+ return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, Reverse2(d, b), \
4255
+ 90); \
4256
+ }
4257
+
4258
+ HWY_SVE_FOREACH_F(HWY_SVE_ADDSUB_F, AddSub, cadd)
4259
+
4260
+ #undef HWY_SVE_ADDSUB_F
4261
+
4262
+ // NOTE: svcadd_s*(a, b, 90) and svcadd_u*(a, b, 90) compute a[i] - b[i + 1] in
4263
+ // the even lanes and a[i] + b[i - 1] in the odd lanes.
4264
+
4265
+ #if HWY_SVE_HAVE_2
4266
+ #define HWY_SVE_ADDSUB_UI(BASE, CHAR, BITS, HALF, NAME, OP) \
4267
+ HWY_API HWY_SVE_V(BASE, BITS) \
4268
+ NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
4269
+ const DFromV<decltype(b)> d; \
4270
+ return sv##OP##_##CHAR##BITS(a, Reverse2(d, b), 90); \
4271
+ }
4272
+
4273
+ HWY_SVE_FOREACH_UI(HWY_SVE_ADDSUB_UI, AddSub, cadd)
4274
+
4275
+ #undef HWY_SVE_ADDSUB_UI
4276
+
4277
+ // Disable the default implementation of AddSub in generic_ops-inl.h on SVE2
4278
+ #undef HWY_IF_ADDSUB_V
4279
+ #define HWY_IF_ADDSUB_V(V) \
4280
+ HWY_IF_LANES_GT_D(DFromV<V>, 1), \
4281
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
4282
+
4283
+ #else // !HWY_SVE_HAVE_2
4284
+
4285
+ // Disable the default implementation of AddSub in generic_ops-inl.h for
4286
+ // floating-point vectors on SVE, but enable the default implementation of
4287
+ // AddSub in generic_ops-inl.h for integer vectors on SVE that do not support
4288
+ // SVE2
4289
+ #undef HWY_IF_ADDSUB_V
4290
+ #define HWY_IF_ADDSUB_V(V) \
4291
+ HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
4292
+
4293
+ #endif // HWY_SVE_HAVE_2
4294
+
4295
+ // ------------------------------ MulAddSub (AddSub)
4296
+
4297
+ template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_FLOAT_V(V)>
4298
+ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4299
+ using T = TFromV<V>;
4300
+
4301
+ const DFromV<V> d;
4302
+ const T neg_zero = ConvertScalarTo<T>(-0.0f);
4303
+
4304
+ return MulAdd(mul, x, AddSub(Set(d, neg_zero), sub_or_add));
4305
+ }
4306
+
4307
+ #if HWY_SVE_HAVE_2
4308
+
4309
+ // Disable the default implementation of MulAddSub in generic_ops-inl.h on SVE2
4310
+ #undef HWY_IF_MULADDSUB_V
4311
+ #define HWY_IF_MULADDSUB_V(V) \
4312
+ HWY_IF_LANES_GT_D(DFromV<V>, 1), \
4313
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
4314
+
4315
+ template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
4316
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4317
+ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4318
+ const DFromV<V> d;
4319
+ return MulAdd(mul, x, AddSub(Zero(d), sub_or_add));
4320
+ }
4321
+
4322
+ #else // !HWY_SVE_HAVE_2
4323
+
4324
+ // Disable the default implementation of MulAddSub in generic_ops-inl.h for
4325
+ // floating-point vectors on SVE, but enable the default implementation of
4326
+ // AddSub in generic_ops-inl.h for integer vectors on SVE targets that do not
4327
+ // support SVE2
4328
+ #undef HWY_IF_MULADDSUB_V
4329
+ #define HWY_IF_MULADDSUB_V(V) \
4330
+ HWY_IF_LANES_GT_D(DFromV<V>, 1), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)
4331
+
4332
+ #endif // HWY_SVE_HAVE_2
4333
+
4080
4334
  // ------------------------------ PromoteTo bfloat16 (ZipLower)
4081
4335
  template <size_t N, int kPow2>
4082
4336
  HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32, VBF16 v) {
@@ -4209,10 +4463,17 @@ HWY_INLINE VFromD<D> PromoteOddTo(ToTypeTag to_type_tag,
4209
4463
  template <size_t N, int kPow2>
4210
4464
  HWY_API VBF16 ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
4211
4465
  svfloat32_t b) {
4212
- const RebindToUnsigned<decltype(dbf16)> du16;
4213
- const Repartition<uint32_t, decltype(dbf16)> du32;
4214
- const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
4215
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4466
+ #if HWY_SVE_HAVE_F32_TO_BF16C
4467
+ const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
4468
+ return svcvtnt_bf16_f32_x(b_in_even, detail::PTrue(dbf16), a);
4469
+ #else
4470
+ (void)dbf16;
4471
+ const auto a_in_odd =
4472
+ BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(a));
4473
+ const auto b_in_odd =
4474
+ BitCast(ScalableTag<uint16_t>(), detail::RoundF32ForDemoteToBF16(b));
4475
+ return BitCast(dbf16, detail::InterleaveOdd(b_in_odd, a_in_odd));
4476
+ #endif
4216
4477
  }
4217
4478
 
4218
4479
  template <size_t N, int kPow2>
@@ -4350,6 +4611,14 @@ HWY_API svuint32_t ReorderDemote2To(Simd<uint32_t, N, kPow2> d32, svuint64_t a,
4350
4611
  #endif
4351
4612
  }
4352
4613
 
4614
+ template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
4615
+ HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>) / 2)>
4616
+ HWY_API VFromD<D> ReorderDemote2To(D dn, V a, V b) {
4617
+ const auto clamped_a = BitCast(dn, detail::SaturateU<TFromD<D>>(a));
4618
+ const auto clamped_b = BitCast(dn, detail::SaturateU<TFromD<D>>(b));
4619
+ return detail::InterleaveEven(clamped_a, clamped_b);
4620
+ }
4621
+
4353
4622
  template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
4354
4623
  HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4355
4624
  HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2)>
@@ -4360,10 +4629,20 @@ HWY_API VFromD<D> OrderedDemote2To(D dn, V a, V b) {
4360
4629
  return Combine(dn, demoted_b, demoted_a);
4361
4630
  }
4362
4631
 
4363
- template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
4364
- HWY_API VFromD<D> OrderedDemote2To(D dn, svfloat32_t a, svfloat32_t b) {
4365
- const Half<decltype(dn)> dnh;
4366
- return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
4632
+ template <size_t N, int kPow2>
4633
+ HWY_API VBF16 OrderedDemote2To(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t a,
4634
+ svfloat32_t b) {
4635
+ #if HWY_SVE_HAVE_F32_TO_BF16C
4636
+ (void)dbf16;
4637
+ const VBF16 a_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), a);
4638
+ const VBF16 b_in_even = svcvt_bf16_f32_x(detail::PTrue(dbf16), b);
4639
+ return ConcatEven(dbf16, b_in_even, a_in_even);
4640
+ #else
4641
+ const RebindToUnsigned<decltype(dbf16)> du16;
4642
+ const svuint16_t a_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
4643
+ const svuint16_t b_in_odd = BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
4644
+ return BitCast(dbf16, ConcatOdd(du16, b_in_odd, a_in_odd)); // lower half
4645
+ #endif
4367
4646
  }
4368
4647
 
4369
4648
  // ------------------------------ I8/U8/I16/U16 Div
@@ -4401,12 +4680,6 @@ HWY_API V MaskedModOr(V no, M m, V a, V b) {
4401
4680
  return IfThenElse(m, Mod(a, b), no);
4402
4681
  }
4403
4682
 
4404
- // ------------------------------ ZeroIfNegative (Lt, IfThenElse)
4405
- template <class V>
4406
- HWY_API V ZeroIfNegative(const V v) {
4407
- return IfThenZeroElse(detail::LtN(v, 0), v);
4408
- }
4409
-
4410
4683
  // ------------------------------ BroadcastSignBit (ShiftRight)
4411
4684
  template <class V>
4412
4685
  HWY_API V BroadcastSignBit(const V v) {
@@ -4417,11 +4690,7 @@ HWY_API V BroadcastSignBit(const V v) {
4417
4690
  template <class V>
4418
4691
  HWY_API V IfNegativeThenElse(V v, V yes, V no) {
4419
4692
  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
4420
- const DFromV<V> d;
4421
- const RebindToSigned<decltype(d)> di;
4422
-
4423
- const svbool_t m = detail::LtN(BitCast(di, v), 0);
4424
- return IfThenElse(m, yes, no);
4693
+ return IfThenElse(IsNegative(v), yes, no);
4425
4694
  }
4426
4695
 
4427
4696
  // ------------------------------ AverageRound (ShiftRight)
@@ -5445,12 +5714,24 @@ HWY_API VFromD<DW> MulOdd(const V a, const V b) {
5445
5714
  #endif
5446
5715
  }
5447
5716
 
5717
+ HWY_API svint64_t MulEven(const svint64_t a, const svint64_t b) {
5718
+ const auto lo = Mul(a, b);
5719
+ const auto hi = MulHigh(a, b);
5720
+ return detail::InterleaveEven(lo, hi);
5721
+ }
5722
+
5448
5723
  HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
5449
5724
  const auto lo = Mul(a, b);
5450
5725
  const auto hi = MulHigh(a, b);
5451
5726
  return detail::InterleaveEven(lo, hi);
5452
5727
  }
5453
5728
 
5729
+ HWY_API svint64_t MulOdd(const svint64_t a, const svint64_t b) {
5730
+ const auto lo = Mul(a, b);
5731
+ const auto hi = MulHigh(a, b);
5732
+ return detail::InterleaveOdd(lo, hi);
5733
+ }
5734
+
5454
5735
  HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
5455
5736
  const auto lo = Mul(a, b);
5456
5737
  const auto hi = MulHigh(a, b);
@@ -5460,23 +5741,14 @@ HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
5460
5741
  // ------------------------------ WidenMulPairwiseAdd
5461
5742
 
5462
5743
  template <size_t N, int kPow2>
5463
- HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df32, VBF16 a,
5744
+ HWY_API svfloat32_t WidenMulPairwiseAdd(Simd<float, N, kPow2> df, VBF16 a,
5464
5745
  VBF16 b) {
5465
- #if HWY_SVE_HAVE_BF16_FEATURE
5466
- const svfloat32_t even = svbfmlalb_f32(Zero(df32), a, b);
5746
+ #if HWY_SVE_HAVE_F32_TO_BF16C
5747
+ const svfloat32_t even = svbfmlalb_f32(Zero(df), a, b);
5467
5748
  return svbfmlalt_f32(even, a, b);
5468
5749
  #else
5469
- const RebindToUnsigned<decltype(df32)> du32;
5470
- // Using shift/and instead of Zip leads to the odd/even order that
5471
- // RearrangeToOddPlusEven prefers.
5472
- using VU32 = VFromD<decltype(du32)>;
5473
- const VU32 odd = Set(du32, 0xFFFF0000u);
5474
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5475
- const VU32 ao = And(BitCast(du32, a), odd);
5476
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5477
- const VU32 bo = And(BitCast(du32, b), odd);
5478
- return MulAdd(BitCast(df32, ae), BitCast(df32, be),
5479
- Mul(BitCast(df32, ao), BitCast(df32, bo)));
5750
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
5751
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
5480
5752
  #endif // HWY_SVE_HAVE_F32_TO_BF16C
5481
5753
  }
5482
5754
 
@@ -5487,14 +5759,8 @@ HWY_API svint32_t WidenMulPairwiseAdd(Simd<int32_t, N, kPow2> d32, svint16_t a,
5487
5759
  (void)d32;
5488
5760
  return svmlalt_s32(svmullb_s32(a, b), a, b);
5489
5761
  #else
5490
- const svbool_t pg = detail::PTrue(d32);
5491
- // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
5492
- // Fortunately SVE has sign-extension for the even lanes.
5493
- const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a));
5494
- const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
5495
- const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
5496
- const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
5497
- return svmla_s32_x(pg, svmul_s32_x(pg, ao, bo), ae, be);
5762
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
5763
+ Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
5498
5764
  #endif
5499
5765
  }
5500
5766
 
@@ -5505,43 +5771,59 @@ HWY_API svuint32_t WidenMulPairwiseAdd(Simd<uint32_t, N, kPow2> d32,
5505
5771
  (void)d32;
5506
5772
  return svmlalt_u32(svmullb_u32(a, b), a, b);
5507
5773
  #else
5508
- const svbool_t pg = detail::PTrue(d32);
5509
- // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
5510
- // Fortunately SVE has sign-extension for the even lanes.
5511
- const svuint32_t ae = svexth_u32_x(pg, BitCast(d32, a));
5512
- const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b));
5513
- const svuint32_t ao = ShiftRight<16>(BitCast(d32, a));
5514
- const svuint32_t bo = ShiftRight<16>(BitCast(d32, b));
5515
- return svmla_u32_x(pg, svmul_u32_x(pg, ao, bo), ae, be);
5774
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
5775
+ Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
5516
5776
  #endif
5517
5777
  }
5518
5778
 
5779
+ // ------------------------------ SatWidenMulAccumFixedPoint
5780
+
5781
+ #if HWY_SVE_HAVE_2
5782
+
5783
+ #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5784
+ #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5785
+ #else
5786
+ #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
5787
+ #endif
5788
+
5789
+ template <class DI32, HWY_IF_I32_D(DI32)>
5790
+ HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
5791
+ VFromD<Rebind<int16_t, DI32>> a,
5792
+ VFromD<Rebind<int16_t, DI32>> b,
5793
+ VFromD<DI32> sum) {
5794
+ return svqdmlalb_s32(sum, detail::ZipLowerSame(a, a),
5795
+ detail::ZipLowerSame(b, b));
5796
+ }
5797
+
5798
+ #endif // HWY_SVE_HAVE_2
5799
+
5519
5800
  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
5520
5801
 
5521
- template <size_t N, int kPow2>
5522
- HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
5523
- VBF16 a, VBF16 b,
5524
- const svfloat32_t sum0,
5525
- svfloat32_t& sum1) {
5526
5802
  #if HWY_SVE_HAVE_BF16_FEATURE
5527
- (void)df32;
5528
- sum1 = svbfmlalt_f32(sum1, a, b);
5529
- return svbfmlalb_f32(sum0, a, b);
5803
+
5804
+ // NOTE: we currently do not use SVE BFDOT for bf16 ReorderWidenMulAccumulate
5805
+ // because, apparently unlike NEON, it uses round to odd unless the additional
5806
+ // FEAT_EBF16 feature is available and enabled.
5807
+ #ifdef HWY_NATIVE_MUL_EVEN_BF16
5808
+ #undef HWY_NATIVE_MUL_EVEN_BF16
5530
5809
  #else
5531
- const RebindToUnsigned<decltype(df32)> du32;
5532
- // Using shift/and instead of Zip leads to the odd/even order that
5533
- // RearrangeToOddPlusEven prefers.
5534
- using VU32 = VFromD<decltype(du32)>;
5535
- const VU32 odd = Set(du32, 0xFFFF0000u);
5536
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5537
- const VU32 ao = And(BitCast(du32, a), odd);
5538
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5539
- const VU32 bo = And(BitCast(du32, b), odd);
5540
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
5541
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
5542
- #endif // HWY_SVE_HAVE_BF16_FEATURE
5810
+ #define HWY_NATIVE_MUL_EVEN_BF16
5811
+ #endif
5812
+
5813
+ template <size_t N, int kPow2>
5814
+ HWY_API svfloat32_t MulEvenAdd(Simd<float, N, kPow2> /* d */, VBF16 a, VBF16 b,
5815
+ const svfloat32_t c) {
5816
+ return svbfmlalb_f32(c, a, b);
5543
5817
  }
5544
5818
 
5819
+ template <size_t N, int kPow2>
5820
+ HWY_API svfloat32_t MulOddAdd(Simd<float, N, kPow2> /* d */, VBF16 a, VBF16 b,
5821
+ const svfloat32_t c) {
5822
+ return svbfmlalt_f32(c, a, b);
5823
+ }
5824
+
5825
+ #endif // HWY_SVE_HAVE_BF16_FEATURE
5826
+
5545
5827
  template <size_t N, int kPow2>
5546
5828
  HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
5547
5829
  svint16_t a, svint16_t b,
@@ -5552,15 +5834,10 @@ HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
5552
5834
  sum1 = svmlalt_s32(sum1, a, b);
5553
5835
  return svmlalb_s32(sum0, a, b);
5554
5836
  #else
5555
- const svbool_t pg = detail::PTrue(d32);
5556
- // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
5557
- // Fortunately SVE has sign-extension for the even lanes.
5558
- const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a));
5559
- const svint32_t be = svexth_s32_x(pg, BitCast(d32, b));
5560
- const svint32_t ao = ShiftRight<16>(BitCast(d32, a));
5561
- const svint32_t bo = ShiftRight<16>(BitCast(d32, b));
5562
- sum1 = svmla_s32_x(pg, sum1, ao, bo);
5563
- return svmla_s32_x(pg, sum0, ae, be);
5837
+ // Lane order within sum0/1 is undefined, hence we can avoid the
5838
+ // longer-latency lane-crossing PromoteTo by using PromoteEvenTo/PromoteOddTo.
5839
+ sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
5840
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
5564
5841
  #endif
5565
5842
  }
5566
5843
 
@@ -5574,15 +5851,10 @@ HWY_API svuint32_t ReorderWidenMulAccumulate(Simd<uint32_t, N, kPow2> d32,
5574
5851
  sum1 = svmlalt_u32(sum1, a, b);
5575
5852
  return svmlalb_u32(sum0, a, b);
5576
5853
  #else
5577
- const svbool_t pg = detail::PTrue(d32);
5578
- // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers.
5579
- // Fortunately SVE has sign-extension for the even lanes.
5580
- const svuint32_t ae = svexth_u32_x(pg, BitCast(d32, a));
5581
- const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b));
5582
- const svuint32_t ao = ShiftRight<16>(BitCast(d32, a));
5583
- const svuint32_t bo = ShiftRight<16>(BitCast(d32, b));
5584
- sum1 = svmla_u32_x(pg, sum1, ao, bo);
5585
- return svmla_u32_x(pg, sum0, ae, be);
5854
+ // Lane order within sum0/1 is undefined, hence we can avoid the
5855
+ // longer-latency lane-crossing PromoteTo by using PromoteEvenTo/PromoteOddTo.
5856
+ sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
5857
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
5586
5858
  #endif
5587
5859
  }
5588
5860