@img/sharp-libvips-dev 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
// Copyright 2019 Google LLC
|
|
2
|
+
// Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
|
2
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
// SPDX-License-Identifier: BSD-3-Clause
|
|
3
5
|
//
|
|
4
6
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
7
|
// you may not use this file except in compliance with the License.
|
|
@@ -21,16 +23,12 @@
|
|
|
21
23
|
|
|
22
24
|
#include "hwy/ops/shared-inl.h"
|
|
23
25
|
|
|
24
|
-
HWY_BEFORE_NAMESPACE();
|
|
25
|
-
|
|
26
|
-
// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
|
|
27
|
-
// the same target attribute as our code, see #834.
|
|
28
26
|
HWY_DIAGNOSTICS(push)
|
|
29
27
|
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
|
|
30
28
|
#include <arm_neon.h> // NOLINT(build/include_order)
|
|
31
29
|
HWY_DIAGNOSTICS(pop)
|
|
32
30
|
|
|
33
|
-
|
|
31
|
+
HWY_BEFORE_NAMESPACE();
|
|
34
32
|
namespace hwy {
|
|
35
33
|
namespace HWY_NAMESPACE {
|
|
36
34
|
|
|
@@ -143,13 +141,29 @@ namespace detail { // for code folding and Raw128
|
|
|
143
141
|
HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
|
|
144
142
|
HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
|
|
145
143
|
|
|
146
|
-
|
|
147
|
-
|
|
144
|
+
// Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
|
|
145
|
+
#undef HWY_NEON_HAVE_BFLOAT16
|
|
146
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE && \
|
|
147
|
+
((HWY_TARGET == HWY_NEON_BF16 && \
|
|
148
|
+
(!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
|
|
149
|
+
defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
|
|
148
150
|
#define HWY_NEON_HAVE_BFLOAT16 1
|
|
149
151
|
#else
|
|
150
152
|
#define HWY_NEON_HAVE_BFLOAT16 0
|
|
151
153
|
#endif
|
|
152
154
|
|
|
155
|
+
// HWY_NEON_HAVE_F32_TO_BF16C is defined if NEON vcvt_bf16_f32 and
|
|
156
|
+
// vbfdot_f32 are available, even if the __bf16 type is disabled due to
|
|
157
|
+
// GCC/Clang bugs.
|
|
158
|
+
#undef HWY_NEON_HAVE_F32_TO_BF16C
|
|
159
|
+
#if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
|
|
160
|
+
(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
|
|
161
|
+
(HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
|
|
162
|
+
#define HWY_NEON_HAVE_F32_TO_BF16C 1
|
|
163
|
+
#else
|
|
164
|
+
#define HWY_NEON_HAVE_F32_TO_BF16C 0
|
|
165
|
+
#endif
|
|
166
|
+
|
|
153
167
|
// bfloat16_t
|
|
154
168
|
#if HWY_NEON_HAVE_BFLOAT16
|
|
155
169
|
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
|
|
@@ -185,7 +199,12 @@ namespace detail { // for code folding and Raw128
|
|
|
185
199
|
#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
|
|
186
200
|
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
|
|
187
201
|
#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
|
|
188
|
-
|
|
202
|
+
// NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
|
|
203
|
+
// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
|
|
204
|
+
// !hwy::IsSame<D, D>() is always false and as !hwy::IsSame<D, D>() will cause
|
|
205
|
+
// SFINAE to occur instead of a hard error due to a dependency on the D template
|
|
206
|
+
// argument
|
|
207
|
+
#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
|
|
189
208
|
#else
|
|
190
209
|
#error "Logic error, handled all four cases"
|
|
191
210
|
#endif
|
|
@@ -945,8 +964,12 @@ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
|
|
|
945
964
|
|
|
946
965
|
template <class D>
|
|
947
966
|
HWY_API VFromD<D> Undefined(D /*tag*/) {
|
|
967
|
+
#if HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
|
|
968
|
+
return VFromD<D>{__builtin_nondeterministic_value(Zero(D()).raw)};
|
|
969
|
+
#else
|
|
948
970
|
VFromD<D> v;
|
|
949
971
|
return v;
|
|
972
|
+
#endif
|
|
950
973
|
}
|
|
951
974
|
|
|
952
975
|
HWY_DIAGNOSTICS(pop)
|
|
@@ -1292,9 +1315,6 @@ HWY_API VFromD<D> Iota(D d, const T2 first) {
|
|
|
1292
1315
|
#endif
|
|
1293
1316
|
}
|
|
1294
1317
|
|
|
1295
|
-
// ------------------------------ Tuple (VFromD)
|
|
1296
|
-
#include "hwy/ops/tuple-inl.h"
|
|
1297
|
-
|
|
1298
1318
|
// ------------------------------ Combine
|
|
1299
1319
|
|
|
1300
1320
|
// Full result
|
|
@@ -1616,6 +1636,14 @@ namespace detail {
|
|
|
1616
1636
|
#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
|
|
1617
1637
|
|
|
1618
1638
|
HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
|
|
1639
|
+
HWY_NEON_DEF_FUNCTION_BFLOAT_16(GetLane, vget, _lane_, HWY_GET)
|
|
1640
|
+
|
|
1641
|
+
template <size_t kLane, class V, HWY_NEON_IF_EMULATED_D(DFromV<V>)>
|
|
1642
|
+
static HWY_INLINE HWY_MAYBE_UNUSED TFromV<V> GetLane(V v) {
|
|
1643
|
+
const DFromV<decltype(v)> d;
|
|
1644
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1645
|
+
return BitCastScalar<TFromV<V>>(GetLane<kLane>(BitCast(du, v)));
|
|
1646
|
+
}
|
|
1619
1647
|
|
|
1620
1648
|
#undef HWY_NEON_BUILD_TPL_HWY_GET
|
|
1621
1649
|
#undef HWY_NEON_BUILD_RET_HWY_GET
|
|
@@ -1762,6 +1790,7 @@ namespace detail {
|
|
|
1762
1790
|
#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
|
|
1763
1791
|
|
|
1764
1792
|
HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
|
|
1793
|
+
HWY_NEON_DEF_FUNCTION_BFLOAT_16(InsertLane, vset, _lane_, HWY_INSERT)
|
|
1765
1794
|
|
|
1766
1795
|
#undef HWY_NEON_BUILD_TPL_HWY_INSERT
|
|
1767
1796
|
#undef HWY_NEON_BUILD_RET_HWY_INSERT
|
|
@@ -2133,12 +2162,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
|
|
|
2133
2162
|
#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
|
|
2134
2163
|
|
|
2135
2164
|
// ------------------------------ RotateRight (ShiftRight, Or)
|
|
2136
|
-
template <int kBits, typename T, size_t N>
|
|
2165
|
+
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
2137
2166
|
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
|
|
2167
|
+
const DFromV<decltype(v)> d;
|
|
2168
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
2169
|
+
|
|
2138
2170
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
2139
2171
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
2140
2172
|
if (kBits == 0) return v;
|
|
2141
|
-
|
|
2173
|
+
|
|
2174
|
+
return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
|
|
2142
2175
|
ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
|
|
2143
2176
|
}
|
|
2144
2177
|
|
|
@@ -2328,7 +2361,39 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
|
|
|
2328
2361
|
|
|
2329
2362
|
// ------------------------------ Integer multiplication
|
|
2330
2363
|
|
|
2331
|
-
// Returns the upper
|
|
2364
|
+
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
|
|
2365
|
+
HWY_API Vec128<int8_t> MulHigh(Vec128<int8_t> a, Vec128<int8_t> b) {
|
|
2366
|
+
int16x8_t rlo = vmull_s8(vget_low_s8(a.raw), vget_low_s8(b.raw));
|
|
2367
|
+
#if HWY_ARCH_ARM_A64
|
|
2368
|
+
int16x8_t rhi = vmull_high_s8(a.raw, b.raw);
|
|
2369
|
+
#else
|
|
2370
|
+
int16x8_t rhi = vmull_s8(vget_high_s8(a.raw), vget_high_s8(b.raw));
|
|
2371
|
+
#endif
|
|
2372
|
+
return Vec128<int8_t>(
|
|
2373
|
+
vuzp2q_s8(vreinterpretq_s8_s16(rlo), vreinterpretq_s8_s16(rhi)));
|
|
2374
|
+
}
|
|
2375
|
+
HWY_API Vec128<uint8_t> MulHigh(Vec128<uint8_t> a, Vec128<uint8_t> b) {
|
|
2376
|
+
uint16x8_t rlo = vmull_u8(vget_low_u8(a.raw), vget_low_u8(b.raw));
|
|
2377
|
+
#if HWY_ARCH_ARM_A64
|
|
2378
|
+
uint16x8_t rhi = vmull_high_u8(a.raw, b.raw);
|
|
2379
|
+
#else
|
|
2380
|
+
uint16x8_t rhi = vmull_u8(vget_high_u8(a.raw), vget_high_u8(b.raw));
|
|
2381
|
+
#endif
|
|
2382
|
+
return Vec128<uint8_t>(
|
|
2383
|
+
vuzp2q_u8(vreinterpretq_u8_u16(rlo), vreinterpretq_u8_u16(rhi)));
|
|
2384
|
+
}
|
|
2385
|
+
|
|
2386
|
+
template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
|
|
2387
|
+
HWY_API Vec128<int8_t, N> MulHigh(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
|
|
2388
|
+
int8x16_t hi_lo = vreinterpretq_s8_s16(vmull_s8(a.raw, b.raw));
|
|
2389
|
+
return Vec128<int8_t, N>(vget_low_s8(vuzp2q_s8(hi_lo, hi_lo)));
|
|
2390
|
+
}
|
|
2391
|
+
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
|
|
2392
|
+
HWY_API Vec128<uint8_t, N> MulHigh(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
|
|
2393
|
+
uint8x16_t hi_lo = vreinterpretq_u8_u16(vmull_u8(a.raw, b.raw));
|
|
2394
|
+
return Vec128<uint8_t, N>(vget_low_u8(vuzp2q_u8(hi_lo, hi_lo)));
|
|
2395
|
+
}
|
|
2396
|
+
|
|
2332
2397
|
HWY_API Vec128<int16_t> MulHigh(Vec128<int16_t> a, Vec128<int16_t> b) {
|
|
2333
2398
|
int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
|
|
2334
2399
|
#if HWY_ARCH_ARM_A64
|
|
@@ -2362,6 +2427,57 @@ HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
|
|
|
2362
2427
|
return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
|
|
2363
2428
|
}
|
|
2364
2429
|
|
|
2430
|
+
HWY_API Vec128<int32_t> MulHigh(Vec128<int32_t> a, Vec128<int32_t> b) {
|
|
2431
|
+
int64x2_t rlo = vmull_s32(vget_low_s32(a.raw), vget_low_s32(b.raw));
|
|
2432
|
+
#if HWY_ARCH_ARM_A64
|
|
2433
|
+
int64x2_t rhi = vmull_high_s32(a.raw, b.raw);
|
|
2434
|
+
#else
|
|
2435
|
+
int64x2_t rhi = vmull_s32(vget_high_s32(a.raw), vget_high_s32(b.raw));
|
|
2436
|
+
#endif
|
|
2437
|
+
return Vec128<int32_t>(
|
|
2438
|
+
vuzp2q_s32(vreinterpretq_s32_s64(rlo), vreinterpretq_s32_s64(rhi)));
|
|
2439
|
+
}
|
|
2440
|
+
HWY_API Vec128<uint32_t> MulHigh(Vec128<uint32_t> a, Vec128<uint32_t> b) {
|
|
2441
|
+
uint64x2_t rlo = vmull_u32(vget_low_u32(a.raw), vget_low_u32(b.raw));
|
|
2442
|
+
#if HWY_ARCH_ARM_A64
|
|
2443
|
+
uint64x2_t rhi = vmull_high_u32(a.raw, b.raw);
|
|
2444
|
+
#else
|
|
2445
|
+
uint64x2_t rhi = vmull_u32(vget_high_u32(a.raw), vget_high_u32(b.raw));
|
|
2446
|
+
#endif
|
|
2447
|
+
return Vec128<uint32_t>(
|
|
2448
|
+
vuzp2q_u32(vreinterpretq_u32_u64(rlo), vreinterpretq_u32_u64(rhi)));
|
|
2449
|
+
}
|
|
2450
|
+
|
|
2451
|
+
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
|
|
2452
|
+
HWY_API Vec128<int32_t, N> MulHigh(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
|
|
2453
|
+
int32x4_t hi_lo = vreinterpretq_s32_s64(vmull_s32(a.raw, b.raw));
|
|
2454
|
+
return Vec128<int32_t, N>(vget_low_s32(vuzp2q_s32(hi_lo, hi_lo)));
|
|
2455
|
+
}
|
|
2456
|
+
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
|
|
2457
|
+
HWY_API Vec128<uint32_t, N> MulHigh(Vec128<uint32_t, N> a,
|
|
2458
|
+
Vec128<uint32_t, N> b) {
|
|
2459
|
+
uint32x4_t hi_lo = vreinterpretq_u32_u64(vmull_u32(a.raw, b.raw));
|
|
2460
|
+
return Vec128<uint32_t, N>(vget_low_u32(vuzp2q_u32(hi_lo, hi_lo)));
|
|
2461
|
+
}
|
|
2462
|
+
|
|
2463
|
+
template <class T, HWY_IF_UI64(T)>
|
|
2464
|
+
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
|
|
2465
|
+
T hi_0;
|
|
2466
|
+
T hi_1;
|
|
2467
|
+
|
|
2468
|
+
Mul128(GetLane(a), GetLane(b), &hi_0);
|
|
2469
|
+
Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi_1);
|
|
2470
|
+
|
|
2471
|
+
return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
|
|
2472
|
+
}
|
|
2473
|
+
|
|
2474
|
+
template <class T, HWY_IF_UI64(T)>
|
|
2475
|
+
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
|
|
2476
|
+
T hi;
|
|
2477
|
+
Mul128(GetLane(a), GetLane(b), &hi);
|
|
2478
|
+
return Set(Full64<T>(), hi);
|
|
2479
|
+
}
|
|
2480
|
+
|
|
2365
2481
|
HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
|
|
2366
2482
|
return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
|
|
2367
2483
|
}
|
|
@@ -2467,7 +2583,7 @@ HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
|
|
|
2467
2583
|
|
|
2468
2584
|
namespace detail {
|
|
2469
2585
|
|
|
2470
|
-
#if
|
|
2586
|
+
#if HWY_NATIVE_FMA
|
|
2471
2587
|
// Wrappers for changing argument order to what intrinsics expect.
|
|
2472
2588
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(MulAdd, vfma, _, 3)
|
|
2473
2589
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(NegMulAdd, vfms, _, 3)
|
|
@@ -2485,7 +2601,7 @@ HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> add, Vec128<float, N> mul,
|
|
|
2485
2601
|
return add - mul * x;
|
|
2486
2602
|
}
|
|
2487
2603
|
|
|
2488
|
-
#endif //
|
|
2604
|
+
#endif // HWY_NATIVE_FMA
|
|
2489
2605
|
} // namespace detail
|
|
2490
2606
|
|
|
2491
2607
|
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
@@ -2874,14 +2990,21 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
|
|
|
2874
2990
|
|
|
2875
2991
|
HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
|
|
2876
2992
|
|
|
2877
|
-
|
|
2878
|
-
|
|
2993
|
+
#if HWY_HAVE_FLOAT16
|
|
2994
|
+
#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>)
|
|
2995
|
+
#else
|
|
2996
|
+
#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_SPECIAL_FLOAT_V(V)
|
|
2997
|
+
#endif
|
|
2998
|
+
|
|
2999
|
+
template <class V, HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V)>
|
|
3000
|
+
HWY_API V IfThenElse(MFromD<DFromV<V>> mask, V yes, V no) {
|
|
2879
3001
|
const DFromV<decltype(yes)> d;
|
|
2880
3002
|
const RebindToUnsigned<decltype(d)> du;
|
|
2881
3003
|
return BitCast(
|
|
2882
3004
|
d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
|
|
2883
3005
|
}
|
|
2884
3006
|
|
|
3007
|
+
#undef HWY_NEON_IF_EMULATED_IF_THEN_ELSE
|
|
2885
3008
|
#undef HWY_NEON_BUILD_TPL_HWY_IF
|
|
2886
3009
|
#undef HWY_NEON_BUILD_RET_HWY_IF
|
|
2887
3010
|
#undef HWY_NEON_BUILD_PARAM_HWY_IF
|
|
@@ -2922,12 +3045,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
|
2922
3045
|
return IfThenElse(m, yes, no);
|
|
2923
3046
|
}
|
|
2924
3047
|
|
|
2925
|
-
template <typename T, size_t N>
|
|
2926
|
-
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
|
|
2927
|
-
const auto zero = Zero(DFromV<decltype(v)>());
|
|
2928
|
-
return Max(zero, v);
|
|
2929
|
-
}
|
|
2930
|
-
|
|
2931
3048
|
// ------------------------------ Mask logical
|
|
2932
3049
|
|
|
2933
3050
|
template <typename T, size_t N>
|
|
@@ -3894,16 +4011,15 @@ HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<int64_t> v) {
|
|
|
3894
4011
|
|
|
3895
4012
|
template <class D, HWY_IF_F64_D(D)>
|
|
3896
4013
|
HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<uint64_t> v) {
|
|
3897
|
-
return Vec128<double>(vcvtq_f64_u64(
|
|
4014
|
+
return Vec128<double>(vcvtq_f64_u64(v.raw));
|
|
3898
4015
|
}
|
|
3899
4016
|
template <class D, HWY_IF_F64_D(D)>
|
|
3900
4017
|
HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
|
|
3901
4018
|
// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
|
|
3902
|
-
const auto non_neg_v = ZeroIfNegative(v);
|
|
3903
4019
|
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
|
|
3904
|
-
return Set(Full64<double>(), static_cast<double>(GetLane(
|
|
4020
|
+
return Set(Full64<double>(), static_cast<double>(GetLane(v)));
|
|
3905
4021
|
#else
|
|
3906
|
-
return Vec64<double>(vcvt_f64_u64(
|
|
4022
|
+
return Vec64<double>(vcvt_f64_u64(v.raw));
|
|
3907
4023
|
#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
|
|
3908
4024
|
}
|
|
3909
4025
|
|
|
@@ -4379,8 +4495,36 @@ HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
|
|
|
4379
4495
|
lo32_or_mask);
|
|
4380
4496
|
}
|
|
4381
4497
|
|
|
4498
|
+
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
4499
|
+
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
4500
|
+
#else
|
|
4501
|
+
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
4502
|
+
#endif
|
|
4503
|
+
|
|
4504
|
+
template <class D, HWY_IF_UI64_D(D)>
|
|
4505
|
+
HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
|
|
4506
|
+
const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
|
|
4507
|
+
const RebindToFloat<decltype(d32)> df32;
|
|
4508
|
+
const RebindToUnsigned<decltype(d32)> du32;
|
|
4509
|
+
const Repartition<uint8_t, decltype(d32)> du32_as_du8;
|
|
4510
|
+
|
|
4511
|
+
constexpr uint32_t kExpAdjDecr =
|
|
4512
|
+
0xFFFFFF9Du + static_cast<uint32_t>(!IsSigned<TFromD<D>>());
|
|
4513
|
+
|
|
4514
|
+
const auto exponent_adj = BitCast(
|
|
4515
|
+
du32, SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
|
|
4516
|
+
BitCast(du32_as_du8, Set(du32, kExpAdjDecr))));
|
|
4517
|
+
const auto adj_v =
|
|
4518
|
+
BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
|
|
4519
|
+
|
|
4520
|
+
return PromoteTo(d64, ConvertTo(d32, adj_v)) << PromoteTo(d64, exponent_adj);
|
|
4521
|
+
}
|
|
4522
|
+
|
|
4382
4523
|
#endif // HWY_HAVE_FLOAT64
|
|
4383
4524
|
|
|
4525
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
4526
|
+
#include "hwy/ops/inside-inl.h"
|
|
4527
|
+
|
|
4384
4528
|
// ------------------------------ PromoteUpperTo
|
|
4385
4529
|
|
|
4386
4530
|
#if HWY_ARCH_ARM_A64
|
|
@@ -4650,14 +4794,45 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
|
4650
4794
|
|
|
4651
4795
|
#endif // HWY_NEON_HAVE_F16C
|
|
4652
4796
|
|
|
4653
|
-
|
|
4654
|
-
|
|
4655
|
-
|
|
4656
|
-
|
|
4657
|
-
|
|
4658
|
-
|
|
4659
|
-
|
|
4797
|
+
#if HWY_NEON_HAVE_F32_TO_BF16C
|
|
4798
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
4799
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
4800
|
+
#else
|
|
4801
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
4802
|
+
#endif
|
|
4803
|
+
|
|
4804
|
+
namespace detail {
|
|
4805
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
4806
|
+
// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
|
|
4807
|
+
// bfloat16x4_t or bfloat16x8_t.
|
|
4808
|
+
static HWY_INLINE bfloat16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
|
|
4809
|
+
return raw;
|
|
4810
|
+
}
|
|
4811
|
+
#else
|
|
4812
|
+
// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
|
|
4813
|
+
// detail::Vec128<bfloat16_t, N>::type is uint16x4_t or uint16x8_t vector to
|
|
4814
|
+
// work around compiler bugs that are there with GCC 13 or earlier or Clang 16
|
|
4815
|
+
// or earlier on AArch64.
|
|
4816
|
+
|
|
4817
|
+
// The bfloat16x4_t vector returned by vcvt_bf16_f32 needs to be bitcasted to
|
|
4818
|
+
// an uint16x4_t vector if HWY_NEON_HAVE_F32_TO_BF16C &&
|
|
4819
|
+
// !HWY_NEON_HAVE_BFLOAT16 is true.
|
|
4820
|
+
static HWY_INLINE uint16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
|
|
4821
|
+
return vreinterpret_u16_bf16(raw);
|
|
4660
4822
|
}
|
|
4823
|
+
#endif
|
|
4824
|
+
} // namespace detail
|
|
4825
|
+
|
|
4826
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
4827
|
+
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
|
|
4828
|
+
return VFromD<D>(detail::BitCastFromRawNeonBF16(vcvt_bf16_f32(v.raw)));
|
|
4829
|
+
}
|
|
4830
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_BF16_D(D)>
|
|
4831
|
+
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
|
|
4832
|
+
return VFromD<D>(detail::BitCastFromRawNeonBF16(
|
|
4833
|
+
vcvt_bf16_f32(vcombine_f32(v.raw, v.raw))));
|
|
4834
|
+
}
|
|
4835
|
+
#endif // HWY_NEON_HAVE_F32_TO_BF16C
|
|
4661
4836
|
|
|
4662
4837
|
#if HWY_HAVE_FLOAT64
|
|
4663
4838
|
|
|
@@ -4972,13 +5147,18 @@ HWY_API Vec64<float16_t> LowerHalf(Vec128<float16_t> v) {
|
|
|
4972
5147
|
return Vec64<float16_t>(vget_low_f16(v.raw));
|
|
4973
5148
|
}
|
|
4974
5149
|
#endif // HWY_HAVE_FLOAT16
|
|
5150
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
5151
|
+
HWY_API Vec64<bfloat16_t> LowerHalf(Vec128<bfloat16_t> v) {
|
|
5152
|
+
return Vec64<bfloat16_t>(vget_low_bf16(v.raw));
|
|
5153
|
+
}
|
|
5154
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
4975
5155
|
#if HWY_HAVE_FLOAT64
|
|
4976
5156
|
HWY_API Vec64<double> LowerHalf(Vec128<double> v) {
|
|
4977
5157
|
return Vec64<double>(vget_low_f64(v.raw));
|
|
4978
5158
|
}
|
|
4979
5159
|
#endif // HWY_HAVE_FLOAT64
|
|
4980
5160
|
|
|
4981
|
-
template <class V,
|
|
5161
|
+
template <class V, HWY_NEON_IF_EMULATED_D(DFromV<V>), HWY_IF_V_SIZE_V(V, 16)>
|
|
4982
5162
|
HWY_API VFromD<Half<DFromV<V>>> LowerHalf(V v) {
|
|
4983
5163
|
const Full128<uint16_t> du;
|
|
4984
5164
|
const Half<DFromV<V>> dh;
|
|
@@ -5178,6 +5358,12 @@ HWY_API Vec64<float16_t> UpperHalf(D /* tag */, Vec128<float16_t> v) {
|
|
|
5178
5358
|
return Vec64<float16_t>(vget_high_f16(v.raw));
|
|
5179
5359
|
}
|
|
5180
5360
|
#endif
|
|
5361
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
5362
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
5363
|
+
HWY_API Vec64<bfloat16_t> UpperHalf(D /* tag */, Vec128<bfloat16_t> v) {
|
|
5364
|
+
return Vec64<bfloat16_t>(vget_high_bf16(v.raw));
|
|
5365
|
+
}
|
|
5366
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
5181
5367
|
template <class D, HWY_IF_F32_D(D)>
|
|
5182
5368
|
HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
|
|
5183
5369
|
return Vec64<float>(vget_high_f32(v.raw));
|
|
@@ -5309,6 +5495,20 @@ HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
|
|
|
5309
5495
|
}
|
|
5310
5496
|
#endif // HWY_HAVE_FLOAT16
|
|
5311
5497
|
|
|
5498
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
5499
|
+
template <int kLane>
|
|
5500
|
+
HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
|
|
5501
|
+
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
|
|
5502
|
+
return Vec128<bfloat16_t>(vdupq_laneq_bf16(v.raw, kLane));
|
|
5503
|
+
}
|
|
5504
|
+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
|
|
5505
|
+
HWY_IF_LANES_GT(N, 1)>
|
|
5506
|
+
HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
|
|
5507
|
+
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
5508
|
+
return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
|
|
5509
|
+
}
|
|
5510
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
5511
|
+
|
|
5312
5512
|
template <int kLane>
|
|
5313
5513
|
HWY_API Vec128<float> Broadcast(Vec128<float> v) {
|
|
5314
5514
|
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
|
|
@@ -5416,7 +5616,26 @@ HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
|
|
|
5416
5616
|
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
|
|
5417
5617
|
return Vec128<float16_t>(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane)));
|
|
5418
5618
|
}
|
|
5619
|
+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
|
|
5620
|
+
HWY_IF_LANES_GT(N, 1)>
|
|
5621
|
+
HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
|
|
5622
|
+
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
5623
|
+
return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
|
|
5624
|
+
}
|
|
5419
5625
|
#endif // HWY_HAVE_FLOAT16
|
|
5626
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
5627
|
+
template <int kLane>
|
|
5628
|
+
HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
|
|
5629
|
+
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
|
|
5630
|
+
return Vec128<bfloat16_t>(vdupq_n_bf16(vgetq_lane_bf16(v.raw, kLane)));
|
|
5631
|
+
}
|
|
5632
|
+
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
|
|
5633
|
+
HWY_IF_LANES_GT(N, 1)>
|
|
5634
|
+
HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
|
|
5635
|
+
static_assert(0 <= kLane && kLane < N, "Invalid lane");
|
|
5636
|
+
return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
|
|
5637
|
+
}
|
|
5638
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
5420
5639
|
template <int kLane>
|
|
5421
5640
|
HWY_API Vec128<float> Broadcast(Vec128<float> v) {
|
|
5422
5641
|
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
|
|
@@ -5431,6 +5650,14 @@ HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
|
|
|
5431
5650
|
|
|
5432
5651
|
#endif // HWY_ARCH_ARM_A64
|
|
5433
5652
|
|
|
5653
|
+
template <int kLane, typename V, HWY_NEON_IF_EMULATED_D(DFromV<V>),
|
|
5654
|
+
HWY_IF_LANES_GT_D(DFromV<V>, 1)>
|
|
5655
|
+
HWY_API V Broadcast(V v) {
|
|
5656
|
+
const DFromV<V> d;
|
|
5657
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
5658
|
+
return BitCast(d, Broadcast<kLane>(BitCast(du, v)));
|
|
5659
|
+
}
|
|
5660
|
+
|
|
5434
5661
|
// ------------------------------ TableLookupLanes
|
|
5435
5662
|
|
|
5436
5663
|
// Returned by SetTableIndices for use by TableLookupLanes.
|
|
@@ -6268,16 +6495,514 @@ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
|
|
|
6268
6495
|
return detail::SlideDownLanes(v, amt);
|
|
6269
6496
|
}
|
|
6270
6497
|
|
|
6498
|
+
// ------------------------------- WidenHighMulAdd
|
|
6499
|
+
|
|
6500
|
+
#ifdef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
|
|
6501
|
+
#undef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
|
|
6502
|
+
#else
|
|
6503
|
+
#define HWY_NATIVE_WIDEN_HIGH_MUL_ADD
|
|
6504
|
+
#endif
|
|
6505
|
+
|
|
6506
|
+
namespace detail {
|
|
6507
|
+
|
|
6508
|
+
template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
|
|
6509
|
+
HWY_IF_LANES_GT_D(DN, 2)>
|
|
6510
|
+
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6511
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6512
|
+
#if HWY_ARCH_ARM_A64
|
|
6513
|
+
return Vec128<uint64_t>(vmlal_high_u32(add.raw, mul.raw, x.raw));
|
|
6514
|
+
#else
|
|
6515
|
+
const Full64<uint32_t> dh;
|
|
6516
|
+
return Vec128<uint64_t>(
|
|
6517
|
+
vmlal_u32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
|
|
6518
|
+
#endif
|
|
6519
|
+
}
|
|
6520
|
+
|
|
6521
|
+
template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
|
|
6522
|
+
HWY_IF_LANES_LE_D(DN, 2)>
|
|
6523
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6524
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6525
|
+
Vec128<uint64_t> mulResult = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
|
|
6526
|
+
return UpperHalf(d, mulResult) + add;
|
|
6527
|
+
}
|
|
6528
|
+
|
|
6529
|
+
template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
|
|
6530
|
+
HWY_IF_LANES_GT_D(DN, 2)>
|
|
6531
|
+
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6532
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6533
|
+
#if HWY_ARCH_ARM_A64
|
|
6534
|
+
return Vec128<int64_t>(vmlal_high_s32(add.raw, mul.raw, x.raw));
|
|
6535
|
+
#else
|
|
6536
|
+
const Full64<int32_t> dh;
|
|
6537
|
+
return Vec128<int64_t>(
|
|
6538
|
+
vmlal_s32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
|
|
6539
|
+
#endif
|
|
6540
|
+
}
|
|
6541
|
+
|
|
6542
|
+
template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
|
|
6543
|
+
HWY_IF_LANES_LE_D(DN, 2)>
|
|
6544
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6545
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6546
|
+
Vec128<int64_t> mulResult = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
|
|
6547
|
+
return UpperHalf(d, mulResult) + add;
|
|
6548
|
+
}
|
|
6549
|
+
|
|
6550
|
+
template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
|
|
6551
|
+
HWY_IF_LANES_GT_D(DN, 4)>
|
|
6552
|
+
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6553
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6554
|
+
#if HWY_ARCH_ARM_A64
|
|
6555
|
+
return Vec128<int32_t>(vmlal_high_s16(add.raw, mul.raw, x.raw));
|
|
6556
|
+
#else
|
|
6557
|
+
const Full64<int16_t> dh;
|
|
6558
|
+
return Vec128<int32_t>(
|
|
6559
|
+
vmlal_s16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
|
|
6560
|
+
#endif
|
|
6561
|
+
}
|
|
6562
|
+
|
|
6563
|
+
template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
|
|
6564
|
+
HWY_IF_LANES_D(DN, 4)>
|
|
6565
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6566
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6567
|
+
Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
|
|
6568
|
+
Vec64<int32_t> hi = UpperHalf(d, widen);
|
|
6569
|
+
return hi + add;
|
|
6570
|
+
}
|
|
6571
|
+
|
|
6572
|
+
template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
|
|
6573
|
+
HWY_IF_LANES_D(DN, 2)>
|
|
6574
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6575
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6576
|
+
Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
|
|
6577
|
+
Vec32<int32_t> hi = UpperHalf(d, Vec64<int32_t>(vget_high_s32(widen.raw)));
|
|
6578
|
+
return hi + add;
|
|
6579
|
+
}
|
|
6580
|
+
|
|
6581
|
+
template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
|
|
6582
|
+
HWY_IF_LANES_GT_D(DN, 4)>
|
|
6583
|
+
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6584
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6585
|
+
#if HWY_ARCH_ARM_A64
|
|
6586
|
+
return Vec128<uint32_t>(vmlal_high_u16(add.raw, mul.raw, x.raw));
|
|
6587
|
+
#else
|
|
6588
|
+
const Full64<uint16_t> dh;
|
|
6589
|
+
return Vec128<uint32_t>(
|
|
6590
|
+
vmlal_u16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
|
|
6591
|
+
#endif
|
|
6592
|
+
}
|
|
6593
|
+
|
|
6594
|
+
template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
|
|
6595
|
+
HWY_IF_LANES_D(DN, 4)>
|
|
6596
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6597
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6598
|
+
Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
|
|
6599
|
+
VFromD<D> hi = UpperHalf(d, widen);
|
|
6600
|
+
return hi + add;
|
|
6601
|
+
}
|
|
6602
|
+
|
|
6603
|
+
template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
|
|
6604
|
+
class DN = RepartitionToNarrow<D>>
|
|
6605
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6606
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6607
|
+
Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
|
|
6608
|
+
VFromD<D> hi = UpperHalf(d, Vec64<uint32_t>(vget_high_u32(widen.raw)));
|
|
6609
|
+
return hi + add;
|
|
6610
|
+
}
|
|
6611
|
+
|
|
6612
|
+
template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
|
|
6613
|
+
HWY_IF_LANES_GT_D(DN, 8)>
|
|
6614
|
+
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6615
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6616
|
+
#if HWY_ARCH_ARM_A64
|
|
6617
|
+
return Vec128<uint16_t>(vmlal_high_u8(add.raw, mul.raw, x.raw));
|
|
6618
|
+
#else
|
|
6619
|
+
const Full64<uint8_t> dh;
|
|
6620
|
+
return Vec128<uint16_t>(
|
|
6621
|
+
vmlal_u8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
|
|
6622
|
+
#endif
|
|
6623
|
+
}
|
|
6624
|
+
|
|
6625
|
+
template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
|
|
6626
|
+
HWY_IF_LANES_D(DN, 8)>
|
|
6627
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6628
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6629
|
+
Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
|
|
6630
|
+
VFromD<D> hi = UpperHalf(d, widen);
|
|
6631
|
+
return hi + add;
|
|
6632
|
+
}
|
|
6633
|
+
|
|
6634
|
+
template<class D, HWY_IF_U16(TFromD<D>), class DN = RepartitionToNarrow<D>,
|
|
6635
|
+
HWY_IF_LANES_LE_D(DN, 4)>
|
|
6636
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6637
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6638
|
+
Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
|
|
6639
|
+
const Twice<decltype(d)> d16F;
|
|
6640
|
+
VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_u16(widen.raw)));
|
|
6641
|
+
return hi + add;
|
|
6642
|
+
}
|
|
6643
|
+
|
|
6644
|
+
template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
|
|
6645
|
+
HWY_IF_LANES_GT_D(DN, 8)>
|
|
6646
|
+
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6647
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6648
|
+
#if HWY_ARCH_ARM_A64
|
|
6649
|
+
return Vec128<int16_t>(vmlal_high_s8(add.raw, mul.raw, x.raw));
|
|
6650
|
+
#else
|
|
6651
|
+
const Full64<int8_t> dh;
|
|
6652
|
+
return Vec128<int16_t>(
|
|
6653
|
+
vmlal_s8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
|
|
6654
|
+
#endif
|
|
6655
|
+
}
|
|
6656
|
+
|
|
6657
|
+
template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
|
|
6658
|
+
HWY_IF_LANES_D(DN, 8)>
|
|
6659
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6660
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6661
|
+
Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
|
|
6662
|
+
VFromD<D> hi = UpperHalf(d, widen);
|
|
6663
|
+
return hi + add;
|
|
6664
|
+
}
|
|
6665
|
+
|
|
6666
|
+
template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
|
|
6667
|
+
HWY_IF_LANES_LE_D(DN, 4)>
|
|
6668
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6669
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6670
|
+
Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
|
|
6671
|
+
const Twice<decltype(d)> d16F;
|
|
6672
|
+
VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_s16(widen.raw)));
|
|
6673
|
+
return hi + add;
|
|
6674
|
+
}
|
|
6675
|
+
|
|
6676
|
+
#if 0
|
|
6677
|
+
#if HWY_HAVE_FLOAT16
|
|
6678
|
+
template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 4),
|
|
6679
|
+
class DN = RepartitionToNarrow<D>>
|
|
6680
|
+
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6681
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6682
|
+
return VFromD<D>(vfmlalq_high_f16(add.raw, mul.raw, x.raw));
|
|
6683
|
+
}
|
|
6684
|
+
|
|
6685
|
+
template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 2),
|
|
6686
|
+
class DN = RepartitionToNarrow<D>>
|
|
6687
|
+
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6688
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6689
|
+
return Vec64<float32_t>(vfmlal_high_f16(add.raw, mul.raw, x.raw));
|
|
6690
|
+
}
|
|
6691
|
+
|
|
6692
|
+
template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
|
|
6693
|
+
class DN = RepartitionToNarrow<D>>
|
|
6694
|
+
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
|
|
6695
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6696
|
+
return MulAdd(add, PromoteUpperTo(d, mul), PromoteUpperTo(d, x));
|
|
6697
|
+
}
|
|
6698
|
+
#endif
|
|
6699
|
+
#endif
|
|
6700
|
+
|
|
6701
|
+
} // namespace detail
|
|
6702
|
+
|
|
6703
|
+
// ------------------------------- WidenMulAdd
|
|
6704
|
+
|
|
6705
|
+
#ifdef HWY_NATIVE_WIDEN_MUL_ADD
|
|
6706
|
+
#undef HWY_NATIVE_WIDEN_MUL_ADD
|
|
6707
|
+
#else
|
|
6708
|
+
#define HWY_NATIVE_WIDEN_MUL_ADD
|
|
6709
|
+
#endif
|
|
6710
|
+
|
|
6711
|
+
namespace detail {
|
|
6712
|
+
|
|
6713
|
+
template<class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 4),
|
|
6714
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
|
|
6715
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6716
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6717
|
+
return Vec128<uint16_t>(vmlal_u8(add.raw, mul.raw, x.raw));
|
|
6718
|
+
}
|
|
6719
|
+
|
|
6720
|
+
template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, 4),
|
|
6721
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
|
|
6722
|
+
HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
|
|
6723
|
+
VFromD<D> add) {
|
|
6724
|
+
return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
|
|
6725
|
+
}
|
|
6726
|
+
|
|
6727
|
+
template<class D, HWY_IF_I16_D(D), HWY_IF_LANES_GT_D(D, 4),
|
|
6728
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
|
|
6729
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6730
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6731
|
+
return VFromD<D>(vmlal_s8(add.raw, mul.raw, x.raw));
|
|
6732
|
+
}
|
|
6733
|
+
|
|
6734
|
+
template <class D, HWY_IF_I16_D(D), HWY_IF_LANES_LE_D(D, 4),
|
|
6735
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
|
|
6736
|
+
HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
|
|
6737
|
+
VFromD<D> add) {
|
|
6738
|
+
return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
|
|
6739
|
+
}
|
|
6740
|
+
|
|
6741
|
+
template<class D, HWY_IF_I32_D(D),
|
|
6742
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
|
|
6743
|
+
HWY_IF_LANES_GT_D(DN, 2)>
|
|
6744
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6745
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6746
|
+
return Vec128<int32_t>(vmlal_s16(add.raw, mul.raw, x.raw));
|
|
6747
|
+
}
|
|
6748
|
+
|
|
6749
|
+
template<class D, HWY_IF_I32_D(D),
|
|
6750
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
|
|
6751
|
+
HWY_IF_LANES_D(DN, 2)>
|
|
6752
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6753
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6754
|
+
Vec128<int32_t> mulRs = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
|
|
6755
|
+
const VFromD<D> mul10 = LowerHalf(mulRs);
|
|
6756
|
+
return add + mul10;
|
|
6757
|
+
}
|
|
6758
|
+
|
|
6759
|
+
template<class D, HWY_IF_I32_D(D),
|
|
6760
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
|
|
6761
|
+
HWY_IF_LANES_D(D, 1)>
|
|
6762
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6763
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6764
|
+
Vec64<int32_t> mulRs = LowerHalf(Vec128<int32_t>(vmull_s16(mul.raw, x.raw)));
|
|
6765
|
+
const Vec32<int32_t> mul10(LowerHalf(mulRs));
|
|
6766
|
+
return add + mul10;
|
|
6767
|
+
}
|
|
6768
|
+
|
|
6769
|
+
template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_GT_D(D, 2),
|
|
6770
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
|
|
6771
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6772
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6773
|
+
return Vec128<uint32_t>(vmlal_u16(add.raw, mul.raw, x.raw));
|
|
6774
|
+
}
|
|
6775
|
+
|
|
6776
|
+
template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 2),
|
|
6777
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
|
|
6778
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6779
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6780
|
+
Vec128<uint32_t> mulRs = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
|
|
6781
|
+
const Vec64<uint32_t> mul10(LowerHalf(mulRs));
|
|
6782
|
+
return add + mul10;
|
|
6783
|
+
}
|
|
6784
|
+
|
|
6785
|
+
template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
|
|
6786
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
|
|
6787
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6788
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6789
|
+
Vec64<uint32_t> mulRs =
|
|
6790
|
+
LowerHalf(Vec128<uint32_t>(vmull_u16(mul.raw, x.raw)));
|
|
6791
|
+
const Vec32<uint32_t> mul10(LowerHalf(mulRs));
|
|
6792
|
+
return add + mul10;
|
|
6793
|
+
}
|
|
6794
|
+
|
|
6795
|
+
template<class D, HWY_IF_I64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
|
|
6796
|
+
HWY_IF_LANES_D(DN, 2)>
|
|
6797
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6798
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6799
|
+
return VFromD<D>(vmlal_s32(add.raw, mul.raw, x.raw));
|
|
6800
|
+
}
|
|
6801
|
+
|
|
6802
|
+
template<class D, HWY_IF_I64_D(D), HWY_IF_LANES_D(D, 1),
|
|
6803
|
+
class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
|
|
6804
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6805
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6806
|
+
Vec128<int64_t> mulRs = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
|
|
6807
|
+
const VFromD<D> mul10(LowerHalf(mulRs));
|
|
6808
|
+
return add + mul10;
|
|
6809
|
+
}
|
|
6810
|
+
|
|
6811
|
+
template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
|
|
6812
|
+
HWY_IF_LANES_D(DN, 2)>
|
|
6813
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6814
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6815
|
+
return VFromD<D>(vmlal_u32(add.raw, mul.raw, x.raw));
|
|
6816
|
+
}
|
|
6817
|
+
|
|
6818
|
+
template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
|
|
6819
|
+
HWY_IF_LANES_D(DN, 1)>
|
|
6820
|
+
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6821
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6822
|
+
Vec128<uint64_t> mulRs = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
|
|
6823
|
+
const VFromD<D> mul10(LowerHalf(mulRs));
|
|
6824
|
+
return add + mul10;
|
|
6825
|
+
}
|
|
6826
|
+
|
|
6827
|
+
#if 0
|
|
6828
|
+
#if HWY_HAVE_FLOAT16
|
|
6829
|
+
template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
|
|
6830
|
+
HWY_IF_LANES_D(D, 4)>
|
|
6831
|
+
HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6832
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6833
|
+
return VFromD<D>(vfmlalq_low_f16(add.raw, mul.raw, x.raw));
|
|
6834
|
+
}
|
|
6835
|
+
|
|
6836
|
+
template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
|
|
6837
|
+
HWY_IF_LANES_D(DN, 4)>
|
|
6838
|
+
HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
|
|
6839
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6840
|
+
return Vec64<float32_t>(vfmlal_low_f16(add.raw, mul.raw, x.raw));
|
|
6841
|
+
}
|
|
6842
|
+
|
|
6843
|
+
template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
|
|
6844
|
+
class DN = RepartitionToNarrow<D>>
|
|
6845
|
+
HWY_API VFromD<D> WidenLowMulAdd(D d, VFromD<DN> mul,
|
|
6846
|
+
VFromD<DN> x, VFromD<D> add) {
|
|
6847
|
+
return MulAdd(add, PromoteLowerTo(d, mul), PromoteLowerTo(d, x));
|
|
6848
|
+
}
|
|
6849
|
+
#endif
|
|
6850
|
+
#endif
|
|
6851
|
+
|
|
6852
|
+
} // namespace detail
|
|
6853
|
+
|
|
6854
|
+
// ------------------------------ WidenMulAccumulate
|
|
6855
|
+
|
|
6856
|
+
#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
|
|
6857
|
+
#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
|
|
6858
|
+
#else
|
|
6859
|
+
#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
|
|
6860
|
+
#endif
|
|
6861
|
+
|
|
6862
|
+
template<class D, HWY_IF_INTEGER(TFromD<D>), class DN = RepartitionToNarrow<D>>
|
|
6863
|
+
HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
|
|
6864
|
+
VFromD<D> low, VFromD<D>& high) {
|
|
6865
|
+
high = detail::WidenHighMulAdd(d, mul, x, high);
|
|
6866
|
+
return detail::WidenMulAdd(d, LowerHalf(mul), LowerHalf(x), low);
|
|
6867
|
+
}
|
|
6868
|
+
|
|
6869
|
+
#if 0
|
|
6870
|
+
#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
|
|
6871
|
+
#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
|
|
6872
|
+
#else
|
|
6873
|
+
#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
|
|
6874
|
+
#endif
|
|
6875
|
+
|
|
6876
|
+
#if HWY_HAVE_FLOAT16
|
|
6877
|
+
|
|
6878
|
+
template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
|
|
6879
|
+
HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
|
|
6880
|
+
VFromD<D> low, VFromD<D>& high) {
|
|
6881
|
+
high = detail::WidenHighMulAdd(d, mul, x, high);
|
|
6882
|
+
return detail::WidenLowMulAdd(d, mul, x, low);
|
|
6883
|
+
}
|
|
6884
|
+
|
|
6885
|
+
#endif
|
|
6886
|
+
#endif
|
|
6887
|
+
|
|
6888
|
+
// ------------------------------ SatWidenMulAccumFixedPoint
|
|
6889
|
+
|
|
6890
|
+
#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
|
|
6891
|
+
#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
|
|
6892
|
+
#else
|
|
6893
|
+
#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
|
|
6894
|
+
#endif
|
|
6895
|
+
|
|
6896
|
+
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
|
|
6897
|
+
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
|
|
6898
|
+
VFromD<Rebind<int16_t, DI32>> a,
|
|
6899
|
+
VFromD<Rebind<int16_t, DI32>> b,
|
|
6900
|
+
VFromD<DI32> sum) {
|
|
6901
|
+
return VFromD<DI32>(vqdmlal_s16(sum.raw, a.raw, b.raw));
|
|
6902
|
+
}
|
|
6903
|
+
|
|
6904
|
+
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
|
|
6905
|
+
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
|
|
6906
|
+
VFromD<Rebind<int16_t, DI32>> a,
|
|
6907
|
+
VFromD<Rebind<int16_t, DI32>> b,
|
|
6908
|
+
VFromD<DI32> sum) {
|
|
6909
|
+
const Full128<TFromD<DI32>> di32_full;
|
|
6910
|
+
const Rebind<int16_t, decltype(di32_full)> di16_full64;
|
|
6911
|
+
return ResizeBitCast(
|
|
6912
|
+
di32, SatWidenMulAccumFixedPoint(di32_full, ResizeBitCast(di16_full64, a),
|
|
6913
|
+
ResizeBitCast(di16_full64, b),
|
|
6914
|
+
ResizeBitCast(di32_full, sum)));
|
|
6915
|
+
}
|
|
6916
|
+
|
|
6271
6917
|
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
|
6272
6918
|
|
|
6919
|
+
#if HWY_NEON_HAVE_F32_TO_BF16C
|
|
6920
|
+
|
|
6921
|
+
#ifdef HWY_NATIVE_MUL_EVEN_BF16
|
|
6922
|
+
#undef HWY_NATIVE_MUL_EVEN_BF16
|
|
6923
|
+
#else
|
|
6924
|
+
#define HWY_NATIVE_MUL_EVEN_BF16
|
|
6925
|
+
#endif
|
|
6926
|
+
|
|
6927
|
+
#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
|
|
6928
|
+
#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
|
|
6929
|
+
#else
|
|
6930
|
+
#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
|
|
6931
|
+
#endif
|
|
6932
|
+
|
|
6933
|
+
namespace detail {
|
|
6273
6934
|
#if HWY_NEON_HAVE_BFLOAT16
|
|
6935
|
+
// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
|
|
6936
|
+
// bfloat16x4_t or bfloat16x8_t.
|
|
6937
|
+
static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(bfloat16x4_t raw) {
|
|
6938
|
+
return raw;
|
|
6939
|
+
}
|
|
6940
|
+
static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(bfloat16x8_t raw) {
|
|
6941
|
+
return raw;
|
|
6942
|
+
}
|
|
6943
|
+
#else
|
|
6944
|
+
// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
|
|
6945
|
+
// detail::Vec128<bfloat16_t, N>::type is uint16x4_t or uint16x8_t vector to
|
|
6946
|
+
// work around compiler bugs that are there with GCC 13 or earlier or Clang 16
|
|
6947
|
+
// or earlier on AArch64.
|
|
6948
|
+
|
|
6949
|
+
// The uint16x4_t or uint16x8_t vector needs to be bitcast to a bfloat16x4_t
|
|
6950
|
+
// or a bfloat16x8_t vector for the vbfdot_f32 and vbfdotq_f32 intrinsics if
|
|
6951
|
+
// HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true.
|
|
6952
|
+
static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(uint16x4_t raw) {
|
|
6953
|
+
return vreinterpret_bf16_u16(raw);
|
|
6954
|
+
}
|
|
6955
|
+
static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(uint16x8_t raw) {
|
|
6956
|
+
return vreinterpretq_bf16_u16(raw);
|
|
6957
|
+
}
|
|
6958
|
+
#endif
|
|
6959
|
+
} // namespace detail
|
|
6960
|
+
|
|
6961
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
6962
|
+
HWY_API Vec128<float> MulEvenAdd(D /*d32*/, Vec128<bfloat16_t> a,
|
|
6963
|
+
Vec128<bfloat16_t> b, const Vec128<float> c) {
|
|
6964
|
+
return Vec128<float>(vbfmlalbq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
|
|
6965
|
+
detail::BitCastToRawNeonBF16(b.raw)));
|
|
6966
|
+
}
|
|
6967
|
+
|
|
6968
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
6969
|
+
HWY_API Vec128<float> MulOddAdd(D /*d32*/, Vec128<bfloat16_t> a,
|
|
6970
|
+
Vec128<bfloat16_t> b, const Vec128<float> c) {
|
|
6971
|
+
return Vec128<float>(vbfmlaltq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
|
|
6972
|
+
detail::BitCastToRawNeonBF16(b.raw)));
|
|
6973
|
+
}
|
|
6274
6974
|
|
|
6275
6975
|
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
6276
6976
|
HWY_API Vec128<float> ReorderWidenMulAccumulate(D /*d32*/, Vec128<bfloat16_t> a,
|
|
6277
6977
|
Vec128<bfloat16_t> b,
|
|
6278
6978
|
const Vec128<float> sum0,
|
|
6279
6979
|
Vec128<float>& /*sum1*/) {
|
|
6280
|
-
return Vec128<float>(vbfdotq_f32(sum0.raw,
|
|
6980
|
+
return Vec128<float>(vbfdotq_f32(sum0.raw,
|
|
6981
|
+
detail::BitCastToRawNeonBF16(a.raw),
|
|
6982
|
+
detail::BitCastToRawNeonBF16(b.raw)));
|
|
6983
|
+
}
|
|
6984
|
+
|
|
6985
|
+
// There is no non-q version of these instructions.
|
|
6986
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
6987
|
+
HWY_API VFromD<D> MulEvenAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
|
|
6988
|
+
VFromD<Repartition<bfloat16_t, D>> b,
|
|
6989
|
+
const VFromD<D> c) {
|
|
6990
|
+
const Full128<float> d32f;
|
|
6991
|
+
const Full128<bfloat16_t> d16f;
|
|
6992
|
+
return ResizeBitCast(
|
|
6993
|
+
d32, MulEvenAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
|
|
6994
|
+
ResizeBitCast(d32f, c)));
|
|
6995
|
+
}
|
|
6996
|
+
|
|
6997
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
6998
|
+
HWY_API VFromD<D> MulOddAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
|
|
6999
|
+
VFromD<Repartition<bfloat16_t, D>> b,
|
|
7000
|
+
const VFromD<D> c) {
|
|
7001
|
+
const Full128<float> d32f;
|
|
7002
|
+
const Full128<bfloat16_t> d16f;
|
|
7003
|
+
return ResizeBitCast(
|
|
7004
|
+
d32, MulOddAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
|
|
7005
|
+
ResizeBitCast(d32f, c)));
|
|
6281
7006
|
}
|
|
6282
7007
|
|
|
6283
7008
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
@@ -6285,28 +7010,11 @@ HWY_API VFromD<D> ReorderWidenMulAccumulate(
|
|
|
6285
7010
|
D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
|
|
6286
7011
|
VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> sum0,
|
|
6287
7012
|
VFromD<D>& /*sum1*/) {
|
|
6288
|
-
return VFromD<D>(vbfdot_f32(sum0.raw, a.raw,
|
|
7013
|
+
return VFromD<D>(vbfdot_f32(sum0.raw, detail::BitCastToRawNeonBF16(a.raw),
|
|
7014
|
+
detail::BitCastToRawNeonBF16(b.raw)));
|
|
6289
7015
|
}
|
|
6290
7016
|
|
|
6291
|
-
#
|
|
6292
|
-
|
|
6293
|
-
template <class D32, HWY_IF_F32_D(D32),
|
|
6294
|
-
class V16 = VFromD<Repartition<bfloat16_t, D32>>>
|
|
6295
|
-
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
|
|
6296
|
-
const VFromD<D32> sum0,
|
|
6297
|
-
VFromD<D32>& sum1) {
|
|
6298
|
-
const RebindToUnsigned<decltype(df32)> du32;
|
|
6299
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
6300
|
-
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
6301
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
6302
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
6303
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
6304
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
6305
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
6306
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
6307
|
-
}
|
|
6308
|
-
|
|
6309
|
-
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
7017
|
+
#endif // HWY_NEON_HAVE_F32_TO_BF16C
|
|
6310
7018
|
|
|
6311
7019
|
template <class D, HWY_IF_I32_D(D)>
|
|
6312
7020
|
HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(D /*d32*/, Vec128<int16_t> a,
|
|
@@ -6476,37 +7184,34 @@ HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0,
|
|
|
6476
7184
|
|
|
6477
7185
|
// ------------------------------ WidenMulPairwiseAdd
|
|
6478
7186
|
|
|
6479
|
-
#if
|
|
7187
|
+
#if HWY_NEON_HAVE_F32_TO_BF16C
|
|
6480
7188
|
|
|
6481
|
-
template <class
|
|
6482
|
-
HWY_API Vec128<float> WidenMulPairwiseAdd(
|
|
7189
|
+
template <class DF, HWY_IF_V_SIZE_D(DF, 16)>
|
|
7190
|
+
HWY_API Vec128<float> WidenMulPairwiseAdd(DF df, Vec128<bfloat16_t> a,
|
|
6483
7191
|
Vec128<bfloat16_t> b) {
|
|
6484
|
-
return Vec128<float>(vbfdotq_f32(Zero(
|
|
7192
|
+
return Vec128<float>(vbfdotq_f32(Zero(df).raw,
|
|
7193
|
+
detail::BitCastToRawNeonBF16(a.raw),
|
|
7194
|
+
detail::BitCastToRawNeonBF16(b.raw)));
|
|
6485
7195
|
}
|
|
6486
7196
|
|
|
6487
|
-
template <class
|
|
6488
|
-
HWY_API VFromD<
|
|
6489
|
-
|
|
6490
|
-
|
|
6491
|
-
return VFromD<
|
|
7197
|
+
template <class DF, HWY_IF_V_SIZE_LE_D(DF, 8)>
|
|
7198
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
|
|
7199
|
+
VFromD<Repartition<bfloat16_t, DF>> a,
|
|
7200
|
+
VFromD<Repartition<bfloat16_t, DF>> b) {
|
|
7201
|
+
return VFromD<DF>(vbfdot_f32(Zero(df).raw,
|
|
7202
|
+
detail::BitCastToRawNeonBF16(a.raw),
|
|
7203
|
+
detail::BitCastToRawNeonBF16(b.raw)));
|
|
6492
7204
|
}
|
|
6493
7205
|
|
|
6494
7206
|
#else
|
|
6495
|
-
template <class
|
|
6496
|
-
HWY_API VFromD<
|
|
6497
|
-
|
|
6498
|
-
|
|
6499
|
-
|
|
6500
|
-
|
|
6501
|
-
const VU32 odd = Set(du32, 0xFFFF0000u);
|
|
6502
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
6503
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
6504
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
6505
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
6506
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be),
|
|
6507
|
-
Mul(BitCast(df32, ao), BitCast(df32, bo)));
|
|
7207
|
+
template <class DF, HWY_IF_F32_D(DF)>
|
|
7208
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
|
|
7209
|
+
VFromD<Repartition<bfloat16_t, DF>> a,
|
|
7210
|
+
VFromD<Repartition<bfloat16_t, DF>> b) {
|
|
7211
|
+
return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
|
|
7212
|
+
Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
|
|
6508
7213
|
}
|
|
6509
|
-
#endif //
|
|
7214
|
+
#endif // HWY_NEON_HAVE_F32_TO_BF16C
|
|
6510
7215
|
|
|
6511
7216
|
template <class D, HWY_IF_I32_D(D)>
|
|
6512
7217
|
HWY_API Vec128<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<int16_t> a,
|
|
@@ -6841,6 +7546,36 @@ HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
|
|
|
6841
7546
|
return IfThenElse(MaskFromVec(vec), b, a);
|
|
6842
7547
|
}
|
|
6843
7548
|
|
|
7549
|
+
// ------------------------------ InterleaveEven
|
|
7550
|
+
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
|
|
7551
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
7552
|
+
#if HWY_ARCH_ARM_A64
|
|
7553
|
+
return detail::InterleaveEven(a, b);
|
|
7554
|
+
#else
|
|
7555
|
+
return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[0]);
|
|
7556
|
+
#endif
|
|
7557
|
+
}
|
|
7558
|
+
|
|
7559
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
7560
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
7561
|
+
return InterleaveLower(a, b);
|
|
7562
|
+
}
|
|
7563
|
+
|
|
7564
|
+
// ------------------------------ InterleaveOdd
|
|
7565
|
+
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
|
|
7566
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
7567
|
+
#if HWY_ARCH_ARM_A64
|
|
7568
|
+
return detail::InterleaveOdd(a, b);
|
|
7569
|
+
#else
|
|
7570
|
+
return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[1]);
|
|
7571
|
+
#endif
|
|
7572
|
+
}
|
|
7573
|
+
|
|
7574
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
7575
|
+
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
|
|
7576
|
+
return InterleaveUpper(d, a, b);
|
|
7577
|
+
}
|
|
7578
|
+
|
|
6844
7579
|
// ------------------------------ OddEvenBlocks
|
|
6845
7580
|
template <typename T, size_t N>
|
|
6846
7581
|
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
@@ -6862,12 +7597,14 @@ HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
|
|
|
6862
7597
|
|
|
6863
7598
|
// ------------------------------ ReorderDemote2To (OddEven)
|
|
6864
7599
|
|
|
6865
|
-
|
|
6866
|
-
|
|
6867
|
-
HWY_API VFromD<D> ReorderDemote2To(D dbf16,
|
|
6868
|
-
|
|
6869
|
-
|
|
7600
|
+
#if HWY_NEON_HAVE_F32_TO_BF16C
|
|
7601
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
7602
|
+
HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
|
|
7603
|
+
VFromD<Repartition<float, D>> b) {
|
|
7604
|
+
const Half<decltype(dbf16)> dh_bf16;
|
|
7605
|
+
return Combine(dbf16, DemoteTo(dh_bf16, b), DemoteTo(dh_bf16, a));
|
|
6870
7606
|
}
|
|
7607
|
+
#endif // HWY_NEON_HAVE_F32_TO_BF16C
|
|
6871
7608
|
|
|
6872
7609
|
template <class D, HWY_IF_I32_D(D)>
|
|
6873
7610
|
HWY_API Vec128<int32_t> ReorderDemote2To(D d32, Vec128<int64_t> a,
|
|
@@ -7083,16 +7820,19 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
|
|
|
7083
7820
|
return ReorderDemote2To(d, a, b);
|
|
7084
7821
|
}
|
|
7085
7822
|
|
|
7086
|
-
|
|
7087
|
-
|
|
7823
|
+
#if HWY_NEON_HAVE_F32_TO_BF16C
|
|
7824
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
7825
|
+
HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
|
|
7826
|
+
VFromD<Repartition<float, D>> b) {
|
|
7088
7827
|
return ReorderDemote2To(dbf16, a, b);
|
|
7089
7828
|
}
|
|
7829
|
+
#endif // HWY_NEON_HAVE_F32_TO_BF16C
|
|
7090
7830
|
|
|
7091
7831
|
// ================================================== CRYPTO
|
|
7092
7832
|
|
|
7093
7833
|
// (aarch64 or Arm7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH).
|
|
7094
7834
|
// Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*.
|
|
7095
|
-
#if HWY_TARGET
|
|
7835
|
+
#if HWY_TARGET != HWY_NEON_WITHOUT_AES
|
|
7096
7836
|
|
|
7097
7837
|
#ifdef HWY_NATIVE_AES
|
|
7098
7838
|
#undef HWY_NATIVE_AES
|
|
@@ -7143,7 +7883,7 @@ HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
|
|
|
7143
7883
|
(uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
|
|
7144
7884
|
}
|
|
7145
7885
|
|
|
7146
|
-
#endif // HWY_TARGET
|
|
7886
|
+
#endif // HWY_TARGET != HWY_NEON_WITHOUT_AES
|
|
7147
7887
|
|
|
7148
7888
|
// ================================================== MISC
|
|
7149
7889
|
|
|
@@ -7318,10 +8058,11 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
|
|
|
7318
8058
|
vget_low_u64(vmull_u32(a_packed, b_packed)));
|
|
7319
8059
|
}
|
|
7320
8060
|
|
|
7321
|
-
|
|
7322
|
-
|
|
7323
|
-
|
|
7324
|
-
|
|
8061
|
+
template <class T, HWY_IF_UI64(T)>
|
|
8062
|
+
HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
|
|
8063
|
+
T hi;
|
|
8064
|
+
T lo = Mul128(GetLane(a), GetLane(b), &hi);
|
|
8065
|
+
return Dup128VecFromValues(Full128<T>(), lo, hi);
|
|
7325
8066
|
}
|
|
7326
8067
|
|
|
7327
8068
|
// Multiplies odd lanes (1, 3 ..) and places the double-wide result into
|
|
@@ -7424,10 +8165,11 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
|
|
|
7424
8165
|
vget_low_u64(vmull_u32(a_packed, b_packed)));
|
|
7425
8166
|
}
|
|
7426
8167
|
|
|
7427
|
-
|
|
7428
|
-
|
|
7429
|
-
|
|
7430
|
-
|
|
8168
|
+
template <class T, HWY_IF_UI64(T)>
|
|
8169
|
+
HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
|
|
8170
|
+
T hi;
|
|
8171
|
+
T lo = Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi);
|
|
8172
|
+
return Dup128VecFromValues(Full128<T>(), lo, hi);
|
|
7431
8173
|
}
|
|
7432
8174
|
|
|
7433
8175
|
// ------------------------------ TableLookupBytes (Combine, LowerHalf)
|
|
@@ -7492,7 +8234,7 @@ HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
|
|
|
7492
8234
|
|
|
7493
8235
|
// ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes)
|
|
7494
8236
|
|
|
7495
|
-
#if HWY_TARGET
|
|
8237
|
+
#if HWY_TARGET != HWY_NEON_WITHOUT_AES
|
|
7496
8238
|
template <uint8_t kRcon>
|
|
7497
8239
|
HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
|
|
7498
8240
|
alignas(16) static constexpr uint8_t kRconXorMask[16] = {
|
|
@@ -7505,7 +8247,7 @@ HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
|
|
|
7505
8247
|
const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask));
|
|
7506
8248
|
return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle));
|
|
7507
8249
|
}
|
|
7508
|
-
#endif // HWY_TARGET
|
|
8250
|
+
#endif // HWY_TARGET != HWY_NEON_WITHOUT_AES
|
|
7509
8251
|
|
|
7510
8252
|
// ------------------------------ Scatter in generic_ops-inl.h
|
|
7511
8253
|
// ------------------------------ Gather in generic_ops-inl.h
|