@img/sharp-libvips-dev 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -1,5 +1,7 @@
1
1
  // Copyright 2019 Google LLC
2
+ // Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2
3
  // SPDX-License-Identifier: Apache-2.0
4
+ // SPDX-License-Identifier: BSD-3-Clause
3
5
  //
4
6
  // Licensed under the Apache License, Version 2.0 (the "License");
5
7
  // you may not use this file except in compliance with the License.
@@ -21,16 +23,12 @@
21
23
 
22
24
  #include "hwy/ops/shared-inl.h"
23
25
 
24
- HWY_BEFORE_NAMESPACE();
25
-
26
- // Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
27
- // the same target attribute as our code, see #834.
28
26
  HWY_DIAGNOSTICS(push)
29
27
  HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
30
28
  #include <arm_neon.h> // NOLINT(build/include_order)
31
29
  HWY_DIAGNOSTICS(pop)
32
30
 
33
- // Must come after arm_neon.h.
31
+ HWY_BEFORE_NAMESPACE();
34
32
  namespace hwy {
35
33
  namespace HWY_NAMESPACE {
36
34
 
@@ -143,13 +141,29 @@ namespace detail { // for code folding and Raw128
143
141
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
144
142
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
145
143
 
146
- #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
147
- (HWY_COMPILER_GCC_ACTUAL >= 1300 || HWY_COMPILER_CLANG >= 1100)
144
+ // Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
145
+ #undef HWY_NEON_HAVE_BFLOAT16
146
+ #if HWY_HAVE_SCALAR_BF16_TYPE && \
147
+ ((HWY_TARGET == HWY_NEON_BF16 && \
148
+ (!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
149
+ defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
148
150
  #define HWY_NEON_HAVE_BFLOAT16 1
149
151
  #else
150
152
  #define HWY_NEON_HAVE_BFLOAT16 0
151
153
  #endif
152
154
 
155
+ // HWY_NEON_HAVE_F32_TO_BF16C is 1 if the NEON vcvt_bf16_f32 and
156
+ // vbfdot_f32 intrinsics are available, even if the __bf16 type is disabled
157
+ // due to GCC/Clang bugs; otherwise it is 0.
158
+ #undef HWY_NEON_HAVE_F32_TO_BF16C
159
+ #if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
160
+ (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
161
+ (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
162
+ #define HWY_NEON_HAVE_F32_TO_BF16C 1
163
+ #else
164
+ #define HWY_NEON_HAVE_F32_TO_BF16C 0
165
+ #endif
166
+
153
167
  // bfloat16_t
154
168
  #if HWY_NEON_HAVE_BFLOAT16
155
169
  #define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
@@ -185,7 +199,12 @@ namespace detail { // for code folding and Raw128
185
199
  #elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
186
200
  #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
187
201
  #elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
188
- #define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<false>* = nullptr
202
+ // NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
203
+ // hwy::EnableIf<false>* = nullptr to avoid compiler errors. Both conditions
204
+ // are always false, but !hwy::IsSame<D, D>() depends on the D template
205
+ // argument, so it causes SFINAE to occur instead of the hard error that a
206
+ // non-dependent condition would produce.
207
+ #define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
189
208
  #else
190
209
  #error "Logic error, handled all four cases"
191
210
  #endif
@@ -945,8 +964,12 @@ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
945
964
 
946
965
  template <class D>
947
966
  HWY_API VFromD<D> Undefined(D /*tag*/) {
967
+ #if HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
968
+ return VFromD<D>{__builtin_nondeterministic_value(Zero(D()).raw)};
969
+ #else
948
970
  VFromD<D> v;
949
971
  return v;
972
+ #endif
950
973
  }
951
974
 
952
975
  HWY_DIAGNOSTICS(pop)
@@ -1292,9 +1315,6 @@ HWY_API VFromD<D> Iota(D d, const T2 first) {
1292
1315
  #endif
1293
1316
  }
1294
1317
 
1295
- // ------------------------------ Tuple (VFromD)
1296
- #include "hwy/ops/tuple-inl.h"
1297
-
1298
1318
  // ------------------------------ Combine
1299
1319
 
1300
1320
  // Full result
@@ -1616,6 +1636,14 @@ namespace detail {
1616
1636
  #define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
1617
1637
 
1618
1638
  HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
1639
+ HWY_NEON_DEF_FUNCTION_BFLOAT_16(GetLane, vget, _lane_, HWY_GET)
1640
+
1641
+ template <size_t kLane, class V, HWY_NEON_IF_EMULATED_D(DFromV<V>)>
1642
+ static HWY_INLINE HWY_MAYBE_UNUSED TFromV<V> GetLane(V v) {
1643
+ const DFromV<decltype(v)> d;
1644
+ const RebindToUnsigned<decltype(d)> du;
1645
+ return BitCastScalar<TFromV<V>>(GetLane<kLane>(BitCast(du, v)));
1646
+ }
1619
1647
 
1620
1648
  #undef HWY_NEON_BUILD_TPL_HWY_GET
1621
1649
  #undef HWY_NEON_BUILD_RET_HWY_GET
@@ -1762,6 +1790,7 @@ namespace detail {
1762
1790
  #define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
1763
1791
 
1764
1792
  HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
1793
+ HWY_NEON_DEF_FUNCTION_BFLOAT_16(InsertLane, vset, _lane_, HWY_INSERT)
1765
1794
 
1766
1795
  #undef HWY_NEON_BUILD_TPL_HWY_INSERT
1767
1796
  #undef HWY_NEON_BUILD_RET_HWY_INSERT
@@ -2133,12 +2162,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
2133
2162
  #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2134
2163
 
2135
2164
  // ------------------------------ RotateRight (ShiftRight, Or)
2136
- template <int kBits, typename T, size_t N>
2165
+ template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
2137
2166
  HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
2167
+ const DFromV<decltype(v)> d;
2168
+ const RebindToUnsigned<decltype(d)> du;
2169
+
2138
2170
  constexpr size_t kSizeInBits = sizeof(T) * 8;
2139
2171
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
2140
2172
  if (kBits == 0) return v;
2141
- return Or(ShiftRight<kBits>(v),
2173
+
2174
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
2142
2175
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
2143
2176
  }
2144
2177
 
@@ -2328,7 +2361,39 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
2328
2361
 
2329
2362
  // ------------------------------ Integer multiplication
2330
2363
 
2331
- // Returns the upper 16 bits of a * b in each lane.
2364
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
2365
+ HWY_API Vec128<int8_t> MulHigh(Vec128<int8_t> a, Vec128<int8_t> b) {
2366
+ int16x8_t rlo = vmull_s8(vget_low_s8(a.raw), vget_low_s8(b.raw));
2367
+ #if HWY_ARCH_ARM_A64
2368
+ int16x8_t rhi = vmull_high_s8(a.raw, b.raw);
2369
+ #else
2370
+ int16x8_t rhi = vmull_s8(vget_high_s8(a.raw), vget_high_s8(b.raw));
2371
+ #endif
2372
+ return Vec128<int8_t>(
2373
+ vuzp2q_s8(vreinterpretq_s8_s16(rlo), vreinterpretq_s8_s16(rhi)));
2374
+ }
2375
+ HWY_API Vec128<uint8_t> MulHigh(Vec128<uint8_t> a, Vec128<uint8_t> b) {
2376
+ uint16x8_t rlo = vmull_u8(vget_low_u8(a.raw), vget_low_u8(b.raw));
2377
+ #if HWY_ARCH_ARM_A64
2378
+ uint16x8_t rhi = vmull_high_u8(a.raw, b.raw);
2379
+ #else
2380
+ uint16x8_t rhi = vmull_u8(vget_high_u8(a.raw), vget_high_u8(b.raw));
2381
+ #endif
2382
+ return Vec128<uint8_t>(
2383
+ vuzp2q_u8(vreinterpretq_u8_u16(rlo), vreinterpretq_u8_u16(rhi)));
2384
+ }
2385
+
2386
+ template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
2387
+ HWY_API Vec128<int8_t, N> MulHigh(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
2388
+ int8x16_t hi_lo = vreinterpretq_s8_s16(vmull_s8(a.raw, b.raw));
2389
+ return Vec128<int8_t, N>(vget_low_s8(vuzp2q_s8(hi_lo, hi_lo)));
2390
+ }
2391
+ template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
2392
+ HWY_API Vec128<uint8_t, N> MulHigh(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
2393
+ uint8x16_t hi_lo = vreinterpretq_u8_u16(vmull_u8(a.raw, b.raw));
2394
+ return Vec128<uint8_t, N>(vget_low_u8(vuzp2q_u8(hi_lo, hi_lo)));
2395
+ }
2396
+
2332
2397
  HWY_API Vec128<int16_t> MulHigh(Vec128<int16_t> a, Vec128<int16_t> b) {
2333
2398
  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
2334
2399
  #if HWY_ARCH_ARM_A64
@@ -2362,6 +2427,57 @@ HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
2362
2427
  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
2363
2428
  }
2364
2429
 
2430
+ HWY_API Vec128<int32_t> MulHigh(Vec128<int32_t> a, Vec128<int32_t> b) {
2431
+ int64x2_t rlo = vmull_s32(vget_low_s32(a.raw), vget_low_s32(b.raw));
2432
+ #if HWY_ARCH_ARM_A64
2433
+ int64x2_t rhi = vmull_high_s32(a.raw, b.raw);
2434
+ #else
2435
+ int64x2_t rhi = vmull_s32(vget_high_s32(a.raw), vget_high_s32(b.raw));
2436
+ #endif
2437
+ return Vec128<int32_t>(
2438
+ vuzp2q_s32(vreinterpretq_s32_s64(rlo), vreinterpretq_s32_s64(rhi)));
2439
+ }
2440
+ HWY_API Vec128<uint32_t> MulHigh(Vec128<uint32_t> a, Vec128<uint32_t> b) {
2441
+ uint64x2_t rlo = vmull_u32(vget_low_u32(a.raw), vget_low_u32(b.raw));
2442
+ #if HWY_ARCH_ARM_A64
2443
+ uint64x2_t rhi = vmull_high_u32(a.raw, b.raw);
2444
+ #else
2445
+ uint64x2_t rhi = vmull_u32(vget_high_u32(a.raw), vget_high_u32(b.raw));
2446
+ #endif
2447
+ return Vec128<uint32_t>(
2448
+ vuzp2q_u32(vreinterpretq_u32_u64(rlo), vreinterpretq_u32_u64(rhi)));
2449
+ }
2450
+
2451
+ template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
2452
+ HWY_API Vec128<int32_t, N> MulHigh(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
2453
+ int32x4_t hi_lo = vreinterpretq_s32_s64(vmull_s32(a.raw, b.raw));
2454
+ return Vec128<int32_t, N>(vget_low_s32(vuzp2q_s32(hi_lo, hi_lo)));
2455
+ }
2456
+ template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
2457
+ HWY_API Vec128<uint32_t, N> MulHigh(Vec128<uint32_t, N> a,
2458
+ Vec128<uint32_t, N> b) {
2459
+ uint32x4_t hi_lo = vreinterpretq_u32_u64(vmull_u32(a.raw, b.raw));
2460
+ return Vec128<uint32_t, N>(vget_low_u32(vuzp2q_u32(hi_lo, hi_lo)));
2461
+ }
2462
+
2463
+ template <class T, HWY_IF_UI64(T)>
2464
+ HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
2465
+ T hi_0;
2466
+ T hi_1;
2467
+
2468
+ Mul128(GetLane(a), GetLane(b), &hi_0);
2469
+ Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi_1);
2470
+
2471
+ return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
2472
+ }
2473
+
2474
+ template <class T, HWY_IF_UI64(T)>
2475
+ HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
2476
+ T hi;
2477
+ Mul128(GetLane(a), GetLane(b), &hi);
2478
+ return Set(Full64<T>(), hi);
2479
+ }
2480
+
2365
2481
  HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
2366
2482
  return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
2367
2483
  }
@@ -2467,7 +2583,7 @@ HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
2467
2583
 
2468
2584
  namespace detail {
2469
2585
 
2470
- #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
2586
+ #if HWY_NATIVE_FMA
2471
2587
  // Wrappers for changing argument order to what intrinsics expect.
2472
2588
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(MulAdd, vfma, _, 3)
2473
2589
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(NegMulAdd, vfms, _, 3)
@@ -2485,7 +2601,7 @@ HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> add, Vec128<float, N> mul,
2485
2601
  return add - mul * x;
2486
2602
  }
2487
2603
 
2488
- #endif // defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
2604
+ #endif // HWY_NATIVE_FMA
2489
2605
  } // namespace detail
2490
2606
 
2491
2607
  template <typename T, size_t N, HWY_IF_FLOAT(T)>
@@ -2874,14 +2990,21 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
2874
2990
 
2875
2991
  HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
2876
2992
 
2877
- template <class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
2878
- HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
2993
+ #if HWY_HAVE_FLOAT16
2994
+ #define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>)
2995
+ #else
2996
+ #define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_SPECIAL_FLOAT_V(V)
2997
+ #endif
2998
+
2999
+ template <class V, HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V)>
3000
+ HWY_API V IfThenElse(MFromD<DFromV<V>> mask, V yes, V no) {
2879
3001
  const DFromV<decltype(yes)> d;
2880
3002
  const RebindToUnsigned<decltype(d)> du;
2881
3003
  return BitCast(
2882
3004
  d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
2883
3005
  }
2884
3006
 
3007
+ #undef HWY_NEON_IF_EMULATED_IF_THEN_ELSE
2885
3008
  #undef HWY_NEON_BUILD_TPL_HWY_IF
2886
3009
  #undef HWY_NEON_BUILD_RET_HWY_IF
2887
3010
  #undef HWY_NEON_BUILD_PARAM_HWY_IF
@@ -2922,12 +3045,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2922
3045
  return IfThenElse(m, yes, no);
2923
3046
  }
2924
3047
 
2925
- template <typename T, size_t N>
2926
- HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2927
- const auto zero = Zero(DFromV<decltype(v)>());
2928
- return Max(zero, v);
2929
- }
2930
-
2931
3048
  // ------------------------------ Mask logical
2932
3049
 
2933
3050
  template <typename T, size_t N>
@@ -3894,16 +4011,15 @@ HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<int64_t> v) {
3894
4011
 
3895
4012
  template <class D, HWY_IF_F64_D(D)>
3896
4013
  HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<uint64_t> v) {
3897
- return Vec128<double>(vcvtq_f64_u64(ZeroIfNegative(v).raw));
4014
+ return Vec128<double>(vcvtq_f64_u64(v.raw));
3898
4015
  }
3899
4016
  template <class D, HWY_IF_F64_D(D)>
3900
4017
  HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
3901
4018
  // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
3902
- const auto non_neg_v = ZeroIfNegative(v);
3903
4019
  #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3904
- return Set(Full64<double>(), static_cast<double>(GetLane(non_neg_v)));
4020
+ return Set(Full64<double>(), static_cast<double>(GetLane(v)));
3905
4021
  #else
3906
- return Vec64<double>(vcvt_f64_u64(non_neg_v.raw));
4022
+ return Vec64<double>(vcvt_f64_u64(v.raw));
3907
4023
  #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
3908
4024
  }
3909
4025
 
@@ -4379,8 +4495,36 @@ HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
4379
4495
  lo32_or_mask);
4380
4496
  }
4381
4497
 
4498
+ #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
4499
+ #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
4500
+ #else
4501
+ #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
4502
+ #endif
4503
+
4504
+ template <class D, HWY_IF_UI64_D(D)>
4505
+ HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
4506
+ const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
4507
+ const RebindToFloat<decltype(d32)> df32;
4508
+ const RebindToUnsigned<decltype(d32)> du32;
4509
+ const Repartition<uint8_t, decltype(d32)> du32_as_du8;
4510
+
4511
+ constexpr uint32_t kExpAdjDecr =
4512
+ 0xFFFFFF9Du + static_cast<uint32_t>(!IsSigned<TFromD<D>>());
4513
+
4514
+ const auto exponent_adj = BitCast(
4515
+ du32, SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
4516
+ BitCast(du32_as_du8, Set(du32, kExpAdjDecr))));
4517
+ const auto adj_v =
4518
+ BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
4519
+
4520
+ return PromoteTo(d64, ConvertTo(d32, adj_v)) << PromoteTo(d64, exponent_adj);
4521
+ }
4522
+
4382
4523
  #endif // HWY_HAVE_FLOAT64
4383
4524
 
4525
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
4526
+ #include "hwy/ops/inside-inl.h"
4527
+
4384
4528
  // ------------------------------ PromoteUpperTo
4385
4529
 
4386
4530
  #if HWY_ARCH_ARM_A64
@@ -4650,14 +4794,45 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
4650
4794
 
4651
4795
  #endif // HWY_NEON_HAVE_F16C
4652
4796
 
4653
- template <class D, HWY_IF_BF16_D(D)>
4654
- HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
4655
- const Rebind<int32_t, decltype(dbf16)> di32;
4656
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4657
- const Rebind<uint16_t, decltype(dbf16)> du16;
4658
- const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4659
- return BitCast(dbf16, DemoteTo(du16, bits_in_32));
4797
+ #if HWY_NEON_HAVE_F32_TO_BF16C
4798
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
4799
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
4800
+ #else
4801
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
4802
+ #endif
4803
+
4804
+ namespace detail {
4805
+ #if HWY_NEON_HAVE_BFLOAT16
4806
+ // If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
4807
+ // bfloat16x4_t or bfloat16x8_t.
4808
+ static HWY_INLINE bfloat16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
4809
+ return raw;
4810
+ }
4811
+ #else
4812
+ // If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
4813
+ // detail::Vec128<bfloat16_t, N>::type is a uint16x4_t or uint16x8_t vector,
4814
+ // which works around compiler bugs present in GCC 13 or earlier and Clang 16
4815
+ // or earlier on AArch64.
4816
+
4817
+ // The bfloat16x4_t vector returned by vcvt_bf16_f32 needs to be bitcast to
4818
+ // a uint16x4_t vector if HWY_NEON_HAVE_F32_TO_BF16C &&
4819
+ // !HWY_NEON_HAVE_BFLOAT16 is true.
4820
+ static HWY_INLINE uint16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
4821
+ return vreinterpret_u16_bf16(raw);
4660
4822
  }
4823
+ #endif
4824
+ } // namespace detail
4825
+
4826
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
4827
+ HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
4828
+ return VFromD<D>(detail::BitCastFromRawNeonBF16(vcvt_bf16_f32(v.raw)));
4829
+ }
4830
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_BF16_D(D)>
4831
+ HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
4832
+ return VFromD<D>(detail::BitCastFromRawNeonBF16(
4833
+ vcvt_bf16_f32(vcombine_f32(v.raw, v.raw))));
4834
+ }
4835
+ #endif // HWY_NEON_HAVE_F32_TO_BF16C
4661
4836
 
4662
4837
  #if HWY_HAVE_FLOAT64
4663
4838
 
@@ -4972,13 +5147,18 @@ HWY_API Vec64<float16_t> LowerHalf(Vec128<float16_t> v) {
4972
5147
  return Vec64<float16_t>(vget_low_f16(v.raw));
4973
5148
  }
4974
5149
  #endif // HWY_HAVE_FLOAT16
5150
+ #if HWY_NEON_HAVE_BFLOAT16
5151
+ HWY_API Vec64<bfloat16_t> LowerHalf(Vec128<bfloat16_t> v) {
5152
+ return Vec64<bfloat16_t>(vget_low_bf16(v.raw));
5153
+ }
5154
+ #endif // HWY_NEON_HAVE_BFLOAT16
4975
5155
  #if HWY_HAVE_FLOAT64
4976
5156
  HWY_API Vec64<double> LowerHalf(Vec128<double> v) {
4977
5157
  return Vec64<double>(vget_low_f64(v.raw));
4978
5158
  }
4979
5159
  #endif // HWY_HAVE_FLOAT64
4980
5160
 
4981
- template <class V, HWY_IF_SPECIAL_FLOAT_V(V), HWY_IF_V_SIZE_V(V, 16)>
5161
+ template <class V, HWY_NEON_IF_EMULATED_D(DFromV<V>), HWY_IF_V_SIZE_V(V, 16)>
4982
5162
  HWY_API VFromD<Half<DFromV<V>>> LowerHalf(V v) {
4983
5163
  const Full128<uint16_t> du;
4984
5164
  const Half<DFromV<V>> dh;
@@ -5178,6 +5358,12 @@ HWY_API Vec64<float16_t> UpperHalf(D /* tag */, Vec128<float16_t> v) {
5178
5358
  return Vec64<float16_t>(vget_high_f16(v.raw));
5179
5359
  }
5180
5360
  #endif
5361
+ #if HWY_NEON_HAVE_BFLOAT16
5362
+ template <class D, HWY_IF_BF16_D(D)>
5363
+ HWY_API Vec64<bfloat16_t> UpperHalf(D /* tag */, Vec128<bfloat16_t> v) {
5364
+ return Vec64<bfloat16_t>(vget_high_bf16(v.raw));
5365
+ }
5366
+ #endif // HWY_NEON_HAVE_BFLOAT16
5181
5367
  template <class D, HWY_IF_F32_D(D)>
5182
5368
  HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
5183
5369
  return Vec64<float>(vget_high_f32(v.raw));
@@ -5309,6 +5495,20 @@ HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
5309
5495
  }
5310
5496
  #endif // HWY_HAVE_FLOAT16
5311
5497
 
5498
+ #if HWY_NEON_HAVE_BFLOAT16
5499
+ template <int kLane>
5500
+ HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
5501
+ static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5502
+ return Vec128<bfloat16_t>(vdupq_laneq_bf16(v.raw, kLane));
5503
+ }
5504
+ template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
5505
+ HWY_IF_LANES_GT(N, 1)>
5506
+ HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
5507
+ static_assert(0 <= kLane && kLane < N, "Invalid lane");
5508
+ return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
5509
+ }
5510
+ #endif // HWY_NEON_HAVE_BFLOAT16
5511
+
5312
5512
  template <int kLane>
5313
5513
  HWY_API Vec128<float> Broadcast(Vec128<float> v) {
5314
5514
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
@@ -5416,7 +5616,26 @@ HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
5416
5616
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5417
5617
  return Vec128<float16_t>(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane)));
5418
5618
  }
5619
+ template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
5620
+ HWY_IF_LANES_GT(N, 1)>
5621
+ HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
5622
+ static_assert(0 <= kLane && kLane < N, "Invalid lane");
5623
+ return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
5624
+ }
5419
5625
  #endif // HWY_HAVE_FLOAT16
5626
+ #if HWY_NEON_HAVE_BFLOAT16
5627
+ template <int kLane>
5628
+ HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
5629
+ static_assert(0 <= kLane && kLane < 8, "Invalid lane");
5630
+ return Vec128<bfloat16_t>(vdupq_n_bf16(vgetq_lane_bf16(v.raw, kLane)));
5631
+ }
5632
+ template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
5633
+ HWY_IF_LANES_GT(N, 1)>
5634
+ HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
5635
+ static_assert(0 <= kLane && kLane < N, "Invalid lane");
5636
+ return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
5637
+ }
5638
+ #endif // HWY_NEON_HAVE_BFLOAT16
5420
5639
  template <int kLane>
5421
5640
  HWY_API Vec128<float> Broadcast(Vec128<float> v) {
5422
5641
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
@@ -5431,6 +5650,14 @@ HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
5431
5650
 
5432
5651
  #endif // HWY_ARCH_ARM_A64
5433
5652
 
5653
+ template <int kLane, typename V, HWY_NEON_IF_EMULATED_D(DFromV<V>),
5654
+ HWY_IF_LANES_GT_D(DFromV<V>, 1)>
5655
+ HWY_API V Broadcast(V v) {
5656
+ const DFromV<V> d;
5657
+ const RebindToUnsigned<decltype(d)> du;
5658
+ return BitCast(d, Broadcast<kLane>(BitCast(du, v)));
5659
+ }
5660
+
5434
5661
  // ------------------------------ TableLookupLanes
5435
5662
 
5436
5663
  // Returned by SetTableIndices for use by TableLookupLanes.
@@ -6268,16 +6495,514 @@ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
6268
6495
  return detail::SlideDownLanes(v, amt);
6269
6496
  }
6270
6497
 
6498
+ // ------------------------------- WidenHighMulAdd
6499
+
6500
+ #ifdef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
6501
+ #undef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
6502
+ #else
6503
+ #define HWY_NATIVE_WIDEN_HIGH_MUL_ADD
6504
+ #endif
6505
+
6506
+ namespace detail {
6507
+
6508
+ template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
6509
+ HWY_IF_LANES_GT_D(DN, 2)>
6510
+ HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
6511
+ VFromD<DN> x, VFromD<D> add) {
6512
+ #if HWY_ARCH_ARM_A64
6513
+ return Vec128<uint64_t>(vmlal_high_u32(add.raw, mul.raw, x.raw));
6514
+ #else
6515
+ const Full64<uint32_t> dh;
6516
+ return Vec128<uint64_t>(
6517
+ vmlal_u32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
6518
+ #endif
6519
+ }
6520
+
6521
+ template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
6522
+ HWY_IF_LANES_LE_D(DN, 2)>
6523
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6524
+ VFromD<DN> x, VFromD<D> add) {
6525
+ Vec128<uint64_t> mulResult = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
6526
+ return UpperHalf(d, mulResult) + add;
6527
+ }
6528
+
6529
+ template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
6530
+ HWY_IF_LANES_GT_D(DN, 2)>
6531
+ HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
6532
+ VFromD<DN> x, VFromD<D> add) {
6533
+ #if HWY_ARCH_ARM_A64
6534
+ return Vec128<int64_t>(vmlal_high_s32(add.raw, mul.raw, x.raw));
6535
+ #else
6536
+ const Full64<int32_t> dh;
6537
+ return Vec128<int64_t>(
6538
+ vmlal_s32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
6539
+ #endif
6540
+ }
6541
+
6542
+ template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
6543
+ HWY_IF_LANES_LE_D(DN, 2)>
6544
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6545
+ VFromD<DN> x, VFromD<D> add) {
6546
+ Vec128<int64_t> mulResult = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
6547
+ return UpperHalf(d, mulResult) + add;
6548
+ }
6549
+
6550
+ template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
6551
+ HWY_IF_LANES_GT_D(DN, 4)>
6552
+ HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
6553
+ VFromD<DN> x, VFromD<D> add) {
6554
+ #if HWY_ARCH_ARM_A64
6555
+ return Vec128<int32_t>(vmlal_high_s16(add.raw, mul.raw, x.raw));
6556
+ #else
6557
+ const Full64<int16_t> dh;
6558
+ return Vec128<int32_t>(
6559
+ vmlal_s16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
6560
+ #endif
6561
+ }
6562
+
6563
+ template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
6564
+ HWY_IF_LANES_D(DN, 4)>
6565
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6566
+ VFromD<DN> x, VFromD<D> add) {
6567
+ Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
6568
+ Vec64<int32_t> hi = UpperHalf(d, widen);
6569
+ return hi + add;
6570
+ }
6571
+
6572
+ template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
6573
+ HWY_IF_LANES_D(DN, 2)>
6574
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6575
+ VFromD<DN> x, VFromD<D> add) {
6576
+ Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
6577
+ Vec32<int32_t> hi = UpperHalf(d, Vec64<int32_t>(vget_high_s32(widen.raw)));
6578
+ return hi + add;
6579
+ }
6580
+
6581
+ template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
6582
+ HWY_IF_LANES_GT_D(DN, 4)>
6583
+ HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
6584
+ VFromD<DN> x, VFromD<D> add) {
6585
+ #if HWY_ARCH_ARM_A64
6586
+ return Vec128<uint32_t>(vmlal_high_u16(add.raw, mul.raw, x.raw));
6587
+ #else
6588
+ const Full64<uint16_t> dh;
6589
+ return Vec128<uint32_t>(
6590
+ vmlal_u16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
6591
+ #endif
6592
+ }
6593
+
6594
+ template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
6595
+ HWY_IF_LANES_D(DN, 4)>
6596
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6597
+ VFromD<DN> x, VFromD<D> add) {
6598
+ Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
6599
+ VFromD<D> hi = UpperHalf(d, widen);
6600
+ return hi + add;
6601
+ }
6602
+
6603
+ template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
6604
+ class DN = RepartitionToNarrow<D>>
6605
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6606
+ VFromD<DN> x, VFromD<D> add) {
6607
+ Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
6608
+ VFromD<D> hi = UpperHalf(d, Vec64<uint32_t>(vget_high_u32(widen.raw)));
6609
+ return hi + add;
6610
+ }
6611
+
6612
+ template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
6613
+ HWY_IF_LANES_GT_D(DN, 8)>
6614
+ HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
6615
+ VFromD<DN> x, VFromD<D> add) {
6616
+ #if HWY_ARCH_ARM_A64
6617
+ return Vec128<uint16_t>(vmlal_high_u8(add.raw, mul.raw, x.raw));
6618
+ #else
6619
+ const Full64<uint8_t> dh;
6620
+ return Vec128<uint16_t>(
6621
+ vmlal_u8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
6622
+ #endif
6623
+ }
6624
+
6625
+ template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
6626
+ HWY_IF_LANES_D(DN, 8)>
6627
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6628
+ VFromD<DN> x, VFromD<D> add) {
6629
+ Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
6630
+ VFromD<D> hi = UpperHalf(d, widen);
6631
+ return hi + add;
6632
+ }
6633
+
6634
+ template<class D, HWY_IF_U16(TFromD<D>), class DN = RepartitionToNarrow<D>,
6635
+ HWY_IF_LANES_LE_D(DN, 4)>
6636
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6637
+ VFromD<DN> x, VFromD<D> add) {
6638
+ Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
6639
+ const Twice<decltype(d)> d16F;
6640
+ VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_u16(widen.raw)));
6641
+ return hi + add;
6642
+ }
6643
+
6644
+ template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
6645
+ HWY_IF_LANES_GT_D(DN, 8)>
6646
+ HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
6647
+ VFromD<DN> x, VFromD<D> add) {
6648
+ #if HWY_ARCH_ARM_A64
6649
+ return Vec128<int16_t>(vmlal_high_s8(add.raw, mul.raw, x.raw));
6650
+ #else
6651
+ const Full64<int8_t> dh;
6652
+ return Vec128<int16_t>(
6653
+ vmlal_s8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
6654
+ #endif
6655
+ }
6656
+
6657
+ template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
6658
+ HWY_IF_LANES_D(DN, 8)>
6659
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6660
+ VFromD<DN> x, VFromD<D> add) {
6661
+ Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
6662
+ VFromD<D> hi = UpperHalf(d, widen);
6663
+ return hi + add;
6664
+ }
6665
+
6666
+ template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
6667
+ HWY_IF_LANES_LE_D(DN, 4)>
6668
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6669
+ VFromD<DN> x, VFromD<D> add) {
6670
+ Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
6671
+ const Twice<decltype(d)> d16F;
6672
+ VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_s16(widen.raw)));
6673
+ return hi + add;
6674
+ }
6675
+
6676
+ #if 0
6677
+ #if HWY_HAVE_FLOAT16
6678
+ template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 4),
6679
+ class DN = RepartitionToNarrow<D>>
6680
+ HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
6681
+ VFromD<DN> x, VFromD<D> add) {
6682
+ return VFromD<D>(vfmlalq_high_f16(add.raw, mul.raw, x.raw));
6683
+ }
6684
+
6685
+ template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 2),
6686
+ class DN = RepartitionToNarrow<D>>
6687
+ HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
6688
+ VFromD<DN> x, VFromD<D> add) {
6689
+ return Vec64<float32_t>(vfmlal_high_f16(add.raw, mul.raw, x.raw));
6690
+ }
6691
+
6692
+ template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
6693
+ class DN = RepartitionToNarrow<D>>
6694
+ HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
6695
+ VFromD<DN> x, VFromD<D> add) {
6696
+ return MulAdd(add, PromoteUpperTo(d, mul), PromoteUpperTo(d, x));
6697
+ }
6698
+ #endif
6699
+ #endif
6700
+
6701
+ } // namespace detail
6702
+
6703
+ // ------------------------------- WidenMulAdd
6704
+
6705
+ #ifdef HWY_NATIVE_WIDEN_MUL_ADD
6706
+ #undef HWY_NATIVE_WIDEN_MUL_ADD
6707
+ #else
6708
+ #define HWY_NATIVE_WIDEN_MUL_ADD
6709
+ #endif
6710
+
6711
+ namespace detail {
6712
+
6713
+ template<class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 4),
6714
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
6715
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6716
+ VFromD<DN> x, VFromD<D> add) {
6717
+ return Vec128<uint16_t>(vmlal_u8(add.raw, mul.raw, x.raw));
6718
+ }
6719
+
6720
+ template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, 4),
6721
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
6722
+ HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
6723
+ VFromD<D> add) {
6724
+ return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
6725
+ }
6726
+
6727
+ template<class D, HWY_IF_I16_D(D), HWY_IF_LANES_GT_D(D, 4),
6728
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
6729
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6730
+ VFromD<DN> x, VFromD<D> add) {
6731
+ return VFromD<D>(vmlal_s8(add.raw, mul.raw, x.raw));
6732
+ }
6733
+
6734
+ template <class D, HWY_IF_I16_D(D), HWY_IF_LANES_LE_D(D, 4),
6735
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
6736
+ HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
6737
+ VFromD<D> add) {
6738
+ return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
6739
+ }
6740
+
6741
+ template<class D, HWY_IF_I32_D(D),
6742
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
6743
+ HWY_IF_LANES_GT_D(DN, 2)>
6744
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6745
+ VFromD<DN> x, VFromD<D> add) {
6746
+ return Vec128<int32_t>(vmlal_s16(add.raw, mul.raw, x.raw));
6747
+ }
6748
+
6749
+ template<class D, HWY_IF_I32_D(D),
6750
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
6751
+ HWY_IF_LANES_D(DN, 2)>
6752
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6753
+ VFromD<DN> x, VFromD<D> add) {
6754
+ Vec128<int32_t> mulRs = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
6755
+ const VFromD<D> mul10 = LowerHalf(mulRs);
6756
+ return add + mul10;
6757
+ }
6758
+
6759
+ template<class D, HWY_IF_I32_D(D),
6760
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
6761
+ HWY_IF_LANES_D(D, 1)>
6762
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6763
+ VFromD<DN> x, VFromD<D> add) {
6764
+ Vec64<int32_t> mulRs = LowerHalf(Vec128<int32_t>(vmull_s16(mul.raw, x.raw)));
6765
+ const Vec32<int32_t> mul10(LowerHalf(mulRs));
6766
+ return add + mul10;
6767
+ }
6768
+
6769
+ template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_GT_D(D, 2),
6770
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
6771
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6772
+ VFromD<DN> x, VFromD<D> add) {
6773
+ return Vec128<uint32_t>(vmlal_u16(add.raw, mul.raw, x.raw));
6774
+ }
6775
+
6776
+ template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 2),
6777
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
6778
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6779
+ VFromD<DN> x, VFromD<D> add) {
6780
+ Vec128<uint32_t> mulRs = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
6781
+ const Vec64<uint32_t> mul10(LowerHalf(mulRs));
6782
+ return add + mul10;
6783
+ }
6784
+
6785
+ template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
6786
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
6787
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6788
+ VFromD<DN> x, VFromD<D> add) {
6789
+ Vec64<uint32_t> mulRs =
6790
+ LowerHalf(Vec128<uint32_t>(vmull_u16(mul.raw, x.raw)));
6791
+ const Vec32<uint32_t> mul10(LowerHalf(mulRs));
6792
+ return add + mul10;
6793
+ }
6794
+
6795
+ template<class D, HWY_IF_I64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
6796
+ HWY_IF_LANES_D(DN, 2)>
6797
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6798
+ VFromD<DN> x, VFromD<D> add) {
6799
+ return VFromD<D>(vmlal_s32(add.raw, mul.raw, x.raw));
6800
+ }
6801
+
6802
+ template<class D, HWY_IF_I64_D(D), HWY_IF_LANES_D(D, 1),
6803
+ class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
6804
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6805
+ VFromD<DN> x, VFromD<D> add) {
6806
+ Vec128<int64_t> mulRs = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
6807
+ const VFromD<D> mul10(LowerHalf(mulRs));
6808
+ return add + mul10;
6809
+ }
6810
+
6811
+ template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
6812
+ HWY_IF_LANES_D(DN, 2)>
6813
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6814
+ VFromD<DN> x, VFromD<D> add) {
6815
+ return VFromD<D>(vmlal_u32(add.raw, mul.raw, x.raw));
6816
+ }
6817
+
6818
+ template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
6819
+ HWY_IF_LANES_D(DN, 1)>
6820
+ HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
6821
+ VFromD<DN> x, VFromD<D> add) {
6822
+ Vec128<uint64_t> mulRs = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
6823
+ const VFromD<D> mul10(LowerHalf(mulRs));
6824
+ return add + mul10;
6825
+ }
6826
+
6827
+ #if 0
6828
+ #if HWY_HAVE_FLOAT16
6829
+ template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
6830
+ HWY_IF_LANES_D(D, 4)>
6831
+ HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
6832
+ VFromD<DN> x, VFromD<D> add) {
6833
+ return VFromD<D>(vfmlalq_low_f16(add.raw, mul.raw, x.raw));
6834
+ }
6835
+
6836
+ template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
6837
+ HWY_IF_LANES_D(DN, 4)>
6838
+ HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
6839
+ VFromD<DN> x, VFromD<D> add) {
6840
+ return Vec64<float32_t>(vfmlal_low_f16(add.raw, mul.raw, x.raw));
6841
+ }
6842
+
6843
+ template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
6844
+ class DN = RepartitionToNarrow<D>>
6845
+ HWY_API VFromD<D> WidenLowMulAdd(D d, VFromD<DN> mul,
6846
+ VFromD<DN> x, VFromD<D> add) {
6847
+ return MulAdd(add, PromoteLowerTo(d, mul), PromoteLowerTo(d, x));
6848
+ }
6849
+ #endif
6850
+ #endif
6851
+
6852
+ } // namespace detail
6853
+
6854
+ // ------------------------------ WidenMulAccumulate
6855
+
6856
+ #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
6857
+ #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
6858
+ #else
6859
+ #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
6860
+ #endif
6861
+
6862
+ template<class D, HWY_IF_INTEGER(TFromD<D>), class DN = RepartitionToNarrow<D>>
6863
+ HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
6864
+ VFromD<D> low, VFromD<D>& high) {
6865
+ high = detail::WidenHighMulAdd(d, mul, x, high);
6866
+ return detail::WidenMulAdd(d, LowerHalf(mul), LowerHalf(x), low);
6867
+ }
6868
+
6869
+ #if 0
6870
+ #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
6871
+ #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
6872
+ #else
6873
+ #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
6874
+ #endif
6875
+
6876
+ #if HWY_HAVE_FLOAT16
6877
+
6878
+ template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
6879
+ HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
6880
+ VFromD<D> low, VFromD<D>& high) {
6881
+ high = detail::WidenHighMulAdd(d, mul, x, high);
6882
+ return detail::WidenLowMulAdd(d, mul, x, low);
6883
+ }
6884
+
6885
+ #endif
6886
+ #endif
6887
+
6888
+ // ------------------------------ SatWidenMulAccumFixedPoint
6889
+
6890
+ #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
6891
+ #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
6892
+ #else
6893
+ #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
6894
+ #endif
6895
+
6896
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
6897
+ HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
6898
+ VFromD<Rebind<int16_t, DI32>> a,
6899
+ VFromD<Rebind<int16_t, DI32>> b,
6900
+ VFromD<DI32> sum) {
6901
+ return VFromD<DI32>(vqdmlal_s16(sum.raw, a.raw, b.raw));
6902
+ }
6903
+
6904
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
6905
+ HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
6906
+ VFromD<Rebind<int16_t, DI32>> a,
6907
+ VFromD<Rebind<int16_t, DI32>> b,
6908
+ VFromD<DI32> sum) {
6909
+ const Full128<TFromD<DI32>> di32_full;
6910
+ const Rebind<int16_t, decltype(di32_full)> di16_full64;
6911
+ return ResizeBitCast(
6912
+ di32, SatWidenMulAccumFixedPoint(di32_full, ResizeBitCast(di16_full64, a),
6913
+ ResizeBitCast(di16_full64, b),
6914
+ ResizeBitCast(di32_full, sum)));
6915
+ }
6916
+
6271
6917
  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
6272
6918
 
6919
+ #if HWY_NEON_HAVE_F32_TO_BF16C
6920
+
6921
+ #ifdef HWY_NATIVE_MUL_EVEN_BF16
6922
+ #undef HWY_NATIVE_MUL_EVEN_BF16
6923
+ #else
6924
+ #define HWY_NATIVE_MUL_EVEN_BF16
6925
+ #endif
6926
+
6927
+ #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
6928
+ #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
6929
+ #else
6930
+ #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
6931
+ #endif
6932
+
6933
+ namespace detail {
6273
6934
  #if HWY_NEON_HAVE_BFLOAT16
6935
+ // If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
6936
+ // bfloat16x4_t or bfloat16x8_t.
6937
+ static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(bfloat16x4_t raw) {
6938
+ return raw;
6939
+ }
6940
+ static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(bfloat16x8_t raw) {
6941
+ return raw;
6942
+ }
6943
+ #else
6944
+ // If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
6945
+ // detail::Vec128<bfloat16_t, N>::type is uint16x4_t or uint16x8_t vector to
6946
+ // work around compiler bugs that are there with GCC 13 or earlier or Clang 16
6947
+ // or earlier on AArch64.
6948
+
6949
+ // The uint16x4_t or uint16x8_t vector needs to be bitcast to a bfloat16x4_t
6950
+ // or a bfloat16x8_t vector for the vbfdot_f32 and vbfdotq_f32 intrinsics if
6951
+ // HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true.
6952
+ static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(uint16x4_t raw) {
6953
+ return vreinterpret_bf16_u16(raw);
6954
+ }
6955
+ static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(uint16x8_t raw) {
6956
+ return vreinterpretq_bf16_u16(raw);
6957
+ }
6958
+ #endif
6959
+ } // namespace detail
6960
+
6961
+ template <class D, HWY_IF_V_SIZE_D(D, 16)>
6962
+ HWY_API Vec128<float> MulEvenAdd(D /*d32*/, Vec128<bfloat16_t> a,
6963
+ Vec128<bfloat16_t> b, const Vec128<float> c) {
6964
+ return Vec128<float>(vbfmlalbq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
6965
+ detail::BitCastToRawNeonBF16(b.raw)));
6966
+ }
6967
+
6968
+ template <class D, HWY_IF_V_SIZE_D(D, 16)>
6969
+ HWY_API Vec128<float> MulOddAdd(D /*d32*/, Vec128<bfloat16_t> a,
6970
+ Vec128<bfloat16_t> b, const Vec128<float> c) {
6971
+ return Vec128<float>(vbfmlaltq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
6972
+ detail::BitCastToRawNeonBF16(b.raw)));
6973
+ }
6274
6974
 
6275
6975
  template <class D, HWY_IF_V_SIZE_D(D, 16)>
6276
6976
  HWY_API Vec128<float> ReorderWidenMulAccumulate(D /*d32*/, Vec128<bfloat16_t> a,
6277
6977
  Vec128<bfloat16_t> b,
6278
6978
  const Vec128<float> sum0,
6279
6979
  Vec128<float>& /*sum1*/) {
6280
- return Vec128<float>(vbfdotq_f32(sum0.raw, a.raw, b.raw));
6980
+ return Vec128<float>(vbfdotq_f32(sum0.raw,
6981
+ detail::BitCastToRawNeonBF16(a.raw),
6982
+ detail::BitCastToRawNeonBF16(b.raw)));
6983
+ }
6984
+
6985
+ // There is no non-q version of these instructions.
6986
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6987
+ HWY_API VFromD<D> MulEvenAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
6988
+ VFromD<Repartition<bfloat16_t, D>> b,
6989
+ const VFromD<D> c) {
6990
+ const Full128<float> d32f;
6991
+ const Full128<bfloat16_t> d16f;
6992
+ return ResizeBitCast(
6993
+ d32, MulEvenAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
6994
+ ResizeBitCast(d32f, c)));
6995
+ }
6996
+
6997
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6998
+ HWY_API VFromD<D> MulOddAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
6999
+ VFromD<Repartition<bfloat16_t, D>> b,
7000
+ const VFromD<D> c) {
7001
+ const Full128<float> d32f;
7002
+ const Full128<bfloat16_t> d16f;
7003
+ return ResizeBitCast(
7004
+ d32, MulOddAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
7005
+ ResizeBitCast(d32f, c)));
6281
7006
  }
6282
7007
 
6283
7008
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
@@ -6285,28 +7010,11 @@ HWY_API VFromD<D> ReorderWidenMulAccumulate(
6285
7010
  D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
6286
7011
  VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> sum0,
6287
7012
  VFromD<D>& /*sum1*/) {
6288
- return VFromD<D>(vbfdot_f32(sum0.raw, a.raw, b.raw));
7013
+ return VFromD<D>(vbfdot_f32(sum0.raw, detail::BitCastToRawNeonBF16(a.raw),
7014
+ detail::BitCastToRawNeonBF16(b.raw)));
6289
7015
  }
6290
7016
 
6291
- #else
6292
-
6293
- template <class D32, HWY_IF_F32_D(D32),
6294
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
6295
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
6296
- const VFromD<D32> sum0,
6297
- VFromD<D32>& sum1) {
6298
- const RebindToUnsigned<decltype(df32)> du32;
6299
- using VU32 = VFromD<decltype(du32)>;
6300
- const VU32 odd = Set(du32, 0xFFFF0000u);
6301
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
6302
- const VU32 ao = And(BitCast(du32, a), odd);
6303
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
6304
- const VU32 bo = And(BitCast(du32, b), odd);
6305
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
6306
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
6307
- }
6308
-
6309
- #endif // HWY_NEON_HAVE_BFLOAT16
7017
+ #endif // HWY_NEON_HAVE_F32_TO_BF16C
6310
7018
 
6311
7019
  template <class D, HWY_IF_I32_D(D)>
6312
7020
  HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(D /*d32*/, Vec128<int16_t> a,
@@ -6476,37 +7184,34 @@ HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0,
6476
7184
 
6477
7185
  // ------------------------------ WidenMulPairwiseAdd
6478
7186
 
6479
- #if HWY_NEON_HAVE_BFLOAT16
7187
+ #if HWY_NEON_HAVE_F32_TO_BF16C
6480
7188
 
6481
- template <class D, HWY_IF_V_SIZE_D(D, 16)>
6482
- HWY_API Vec128<float> WidenMulPairwiseAdd(D d32, Vec128<bfloat16_t> a,
7189
+ template <class DF, HWY_IF_V_SIZE_D(DF, 16)>
7190
+ HWY_API Vec128<float> WidenMulPairwiseAdd(DF df, Vec128<bfloat16_t> a,
6483
7191
  Vec128<bfloat16_t> b) {
6484
- return Vec128<float>(vbfdotq_f32(Zero(d32).raw, a.raw, b.raw));
7192
+ return Vec128<float>(vbfdotq_f32(Zero(df).raw,
7193
+ detail::BitCastToRawNeonBF16(a.raw),
7194
+ detail::BitCastToRawNeonBF16(b.raw)));
6485
7195
  }
6486
7196
 
6487
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
6488
- HWY_API VFromD<D> WidenMulPairwiseAdd(D d32,
6489
- VFromD<Repartition<bfloat16_t, D>> a,
6490
- VFromD<Repartition<bfloat16_t, D>> b) {
6491
- return VFromD<D>(vbfdot_f32(Zero(d32).raw, a.raw, b.raw));
7197
+ template <class DF, HWY_IF_V_SIZE_LE_D(DF, 8)>
7198
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
7199
+ VFromD<Repartition<bfloat16_t, DF>> a,
7200
+ VFromD<Repartition<bfloat16_t, DF>> b) {
7201
+ return VFromD<DF>(vbfdot_f32(Zero(df).raw,
7202
+ detail::BitCastToRawNeonBF16(a.raw),
7203
+ detail::BitCastToRawNeonBF16(b.raw)));
6492
7204
  }
6493
7205
 
6494
7206
  #else
6495
- template <class D32, HWY_IF_F32_D(D32)>
6496
- HWY_API VFromD<D32> WidenMulPairwiseAdd(
6497
- D32 df32, VFromD<Repartition<bfloat16_t, D32>> a,
6498
- VFromD<Repartition<bfloat16_t, D32>> b) {
6499
- const RebindToUnsigned<decltype(df32)> du32;
6500
- using VU32 = VFromD<decltype(du32)>;
6501
- const VU32 odd = Set(du32, 0xFFFF0000u);
6502
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
6503
- const VU32 ao = And(BitCast(du32, a), odd);
6504
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
6505
- const VU32 bo = And(BitCast(du32, b), odd);
6506
- return MulAdd(BitCast(df32, ae), BitCast(df32, be),
6507
- Mul(BitCast(df32, ao), BitCast(df32, bo)));
7207
+ template <class DF, HWY_IF_F32_D(DF)>
7208
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
7209
+ VFromD<Repartition<bfloat16_t, DF>> a,
7210
+ VFromD<Repartition<bfloat16_t, DF>> b) {
7211
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
7212
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
6508
7213
  }
6509
- #endif // HWY_NEON_HAVE_BFLOAT16
7214
+ #endif // HWY_NEON_HAVE_F32_TO_BF16C
6510
7215
 
6511
7216
  template <class D, HWY_IF_I32_D(D)>
6512
7217
  HWY_API Vec128<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<int16_t> a,
@@ -6841,6 +7546,36 @@ HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
6841
7546
  return IfThenElse(MaskFromVec(vec), b, a);
6842
7547
  }
6843
7548
 
7549
+ // ------------------------------ InterleaveEven
7550
+ template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
7551
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
7552
+ #if HWY_ARCH_ARM_A64
7553
+ return detail::InterleaveEven(a, b);
7554
+ #else
7555
+ return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[0]);
7556
+ #endif
7557
+ }
7558
+
7559
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
7560
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
7561
+ return InterleaveLower(a, b);
7562
+ }
7563
+
7564
+ // ------------------------------ InterleaveOdd
7565
+ template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
7566
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
7567
+ #if HWY_ARCH_ARM_A64
7568
+ return detail::InterleaveOdd(a, b);
7569
+ #else
7570
+ return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[1]);
7571
+ #endif
7572
+ }
7573
+
7574
+ template <class D, HWY_IF_T_SIZE_D(D, 8)>
7575
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
7576
+ return InterleaveUpper(d, a, b);
7577
+ }
7578
+
6844
7579
  // ------------------------------ OddEvenBlocks
6845
7580
  template <typename T, size_t N>
6846
7581
  HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
@@ -6862,12 +7597,14 @@ HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
6862
7597
 
6863
7598
  // ------------------------------ ReorderDemote2To (OddEven)
6864
7599
 
6865
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D),
6866
- class V32 = VFromD<Repartition<float, D>>>
6867
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
6868
- const RebindToUnsigned<decltype(dbf16)> du16;
6869
- return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
7600
+ #if HWY_NEON_HAVE_F32_TO_BF16C
7601
+ template <class D, HWY_IF_BF16_D(D)>
7602
+ HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
7603
+ VFromD<Repartition<float, D>> b) {
7604
+ const Half<decltype(dbf16)> dh_bf16;
7605
+ return Combine(dbf16, DemoteTo(dh_bf16, b), DemoteTo(dh_bf16, a));
6870
7606
  }
7607
+ #endif // HWY_NEON_HAVE_F32_TO_BF16C
6871
7608
 
6872
7609
  template <class D, HWY_IF_I32_D(D)>
6873
7610
  HWY_API Vec128<int32_t> ReorderDemote2To(D d32, Vec128<int64_t> a,
@@ -7083,16 +7820,19 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
7083
7820
  return ReorderDemote2To(d, a, b);
7084
7821
  }
7085
7822
 
7086
- template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
7087
- HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
7823
+ #if HWY_NEON_HAVE_F32_TO_BF16C
7824
+ template <class D, HWY_IF_BF16_D(D)>
7825
+ HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
7826
+ VFromD<Repartition<float, D>> b) {
7088
7827
  return ReorderDemote2To(dbf16, a, b);
7089
7828
  }
7829
+ #endif // HWY_NEON_HAVE_F32_TO_BF16C
7090
7830
 
7091
7831
  // ================================================== CRYPTO
7092
7832
 
7093
7833
  // (aarch64 or Arm7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH).
7094
7834
  // Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*.
7095
- #if HWY_TARGET == HWY_NEON
7835
+ #if HWY_TARGET != HWY_NEON_WITHOUT_AES
7096
7836
 
7097
7837
  #ifdef HWY_NATIVE_AES
7098
7838
  #undef HWY_NATIVE_AES
@@ -7143,7 +7883,7 @@ HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
7143
7883
  (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
7144
7884
  }
7145
7885
 
7146
- #endif // HWY_TARGET == HWY_NEON
7886
+ #endif // HWY_TARGET != HWY_NEON_WITHOUT_AES
7147
7887
 
7148
7888
  // ================================================== MISC
7149
7889
 
@@ -7318,10 +8058,11 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
7318
8058
  vget_low_u64(vmull_u32(a_packed, b_packed)));
7319
8059
  }
7320
8060
 
7321
- HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
7322
- uint64_t hi;
7323
- uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
7324
- return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
8061
+ template <class T, HWY_IF_UI64(T)>
8062
+ HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
8063
+ T hi;
8064
+ T lo = Mul128(GetLane(a), GetLane(b), &hi);
8065
+ return Dup128VecFromValues(Full128<T>(), lo, hi);
7325
8066
  }
7326
8067
 
7327
8068
  // Multiplies odd lanes (1, 3 ..) and places the double-wide result into
@@ -7424,10 +8165,11 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
7424
8165
  vget_low_u64(vmull_u32(a_packed, b_packed)));
7425
8166
  }
7426
8167
 
7427
- HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
7428
- uint64_t hi;
7429
- uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
7430
- return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
8168
+ template <class T, HWY_IF_UI64(T)>
8169
+ HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
8170
+ T hi;
8171
+ T lo = Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi);
8172
+ return Dup128VecFromValues(Full128<T>(), lo, hi);
7431
8173
  }
7432
8174
 
7433
8175
  // ------------------------------ TableLookupBytes (Combine, LowerHalf)
@@ -7492,7 +8234,7 @@ HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
7492
8234
 
7493
8235
  // ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes)
7494
8236
 
7495
- #if HWY_TARGET == HWY_NEON
8237
+ #if HWY_TARGET != HWY_NEON_WITHOUT_AES
7496
8238
  template <uint8_t kRcon>
7497
8239
  HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
7498
8240
  alignas(16) static constexpr uint8_t kRconXorMask[16] = {
@@ -7505,7 +8247,7 @@ HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
7505
8247
  const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask));
7506
8248
  return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle));
7507
8249
  }
7508
- #endif // HWY_TARGET == HWY_NEON
8250
+ #endif // HWY_TARGET != HWY_NEON_WITHOUT_AES
7509
8251
 
7510
8252
  // ------------------------------ Scatter in generic_ops-inl.h
7511
8253
  // ------------------------------ Gather in generic_ops-inl.h