@img/sharp-libvips-dev 1.0.1 → 1.0.3
This diff shows the changes between publicly available package versions as they appear in their public registry. It is provided for informational purposes only.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +2 -2
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -30
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +218 -6
- package/include/hwy/base.h +1935 -512
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +105 -10
- package/include/hwy/detect_targets.h +146 -37
- package/include/hwy/foreach_target.h +36 -1
- package/include/hwy/highway.h +222 -50
- package/include/hwy/ops/arm_neon-inl.h +2055 -894
- package/include/hwy/ops/arm_sve-inl.h +1476 -348
- package/include/hwy/ops/emu128-inl.h +711 -623
- package/include/hwy/ops/generic_ops-inl.h +4431 -2157
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
- package/include/hwy/ops/rvv-inl.h +1556 -536
- package/include/hwy/ops/scalar-inl.h +353 -233
- package/include/hwy/ops/set_macros-inl.h +171 -23
- package/include/hwy/ops/shared-inl.h +198 -56
- package/include/hwy/ops/wasm_128-inl.h +283 -244
- package/include/hwy/ops/x86_128-inl.h +3673 -1357
- package/include/hwy/ops/x86_256-inl.h +1737 -663
- package/include/hwy/ops/x86_512-inl.h +1697 -500
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +40 -32
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +16 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +8 -3
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +73 -58
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -5
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +2 -2
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +8 -3
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +1 -11
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +18 -19
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
// Copyright 2019 Google LLC
|
|
2
|
+
// Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
|
2
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
// SPDX-License-Identifier: BSD-3-Clause
|
|
3
5
|
//
|
|
4
6
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
7
|
// you may not use this file except in compliance with the License.
|
|
@@ -21,16 +23,12 @@
|
|
|
21
23
|
|
|
22
24
|
#include "hwy/ops/shared-inl.h"
|
|
23
25
|
|
|
24
|
-
HWY_BEFORE_NAMESPACE();
|
|
25
|
-
|
|
26
|
-
// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
|
|
27
|
-
// the same target attribute as our code, see #834.
|
|
28
26
|
HWY_DIAGNOSTICS(push)
|
|
29
27
|
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
|
|
30
28
|
#include <arm_neon.h> // NOLINT(build/include_order)
|
|
31
29
|
HWY_DIAGNOSTICS(pop)
|
|
32
30
|
|
|
33
|
-
|
|
31
|
+
HWY_BEFORE_NAMESPACE();
|
|
34
32
|
namespace hwy {
|
|
35
33
|
namespace HWY_NAMESPACE {
|
|
36
34
|
|
|
@@ -143,12 +141,29 @@ namespace detail { // for code folding and Raw128
|
|
|
143
141
|
HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
|
|
144
142
|
HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
|
|
145
143
|
|
|
146
|
-
|
|
144
|
+
// Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
|
|
145
|
+
#undef HWY_NEON_HAVE_BFLOAT16
|
|
146
|
+
#if HWY_HAVE_SCALAR_BF16_TYPE && \
|
|
147
|
+
((HWY_TARGET == HWY_NEON_BF16 && \
|
|
148
|
+
(!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
|
|
149
|
+
defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
|
|
147
150
|
#define HWY_NEON_HAVE_BFLOAT16 1
|
|
148
151
|
#else
|
|
149
152
|
#define HWY_NEON_HAVE_BFLOAT16 0
|
|
150
153
|
#endif
|
|
151
154
|
|
|
155
|
+
// HWY_NEON_HAVE_F32_TO_BF16C is defined if NEON vcvt_bf16_f32 and
|
|
156
|
+
// vbfdot_f32 are available, even if the __bf16 type is disabled due to
|
|
157
|
+
// GCC/Clang bugs.
|
|
158
|
+
#undef HWY_NEON_HAVE_F32_TO_BF16C
|
|
159
|
+
#if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
|
|
160
|
+
(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
|
|
161
|
+
(HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
|
|
162
|
+
#define HWY_NEON_HAVE_F32_TO_BF16C 1
|
|
163
|
+
#else
|
|
164
|
+
#define HWY_NEON_HAVE_F32_TO_BF16C 0
|
|
165
|
+
#endif
|
|
166
|
+
|
|
152
167
|
// bfloat16_t
|
|
153
168
|
#if HWY_NEON_HAVE_BFLOAT16
|
|
154
169
|
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
|
|
@@ -160,7 +175,7 @@ namespace detail { // for code folding and Raw128
|
|
|
160
175
|
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
|
|
161
176
|
#endif
|
|
162
177
|
|
|
163
|
-
// Used for conversion instructions if
|
|
178
|
+
// Used for conversion instructions if HWY_NEON_HAVE_F16C.
|
|
164
179
|
#define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
|
|
165
180
|
args) \
|
|
166
181
|
HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args) \
|
|
@@ -176,6 +191,24 @@ namespace detail { // for code folding and Raw128
|
|
|
176
191
|
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
|
|
177
192
|
#endif
|
|
178
193
|
|
|
194
|
+
// Enable generic functions for whichever of (f16, bf16) are not supported.
|
|
195
|
+
#if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
|
|
196
|
+
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
|
|
197
|
+
#elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
|
|
198
|
+
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
|
|
199
|
+
#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
|
|
200
|
+
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
|
|
201
|
+
#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
|
|
202
|
+
// NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
|
|
203
|
+
// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
|
|
204
|
+
// !hwy::IsSame<D, D>() is always false and as !hwy::IsSame<D, D>() will cause
|
|
205
|
+
// SFINAE to occur instead of a hard error due to a dependency on the D template
|
|
206
|
+
// argument
|
|
207
|
+
#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
|
|
208
|
+
#else
|
|
209
|
+
#error "Logic error, handled all four cases"
|
|
210
|
+
#endif
|
|
211
|
+
|
|
179
212
|
// float
|
|
180
213
|
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
|
|
181
214
|
HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
|
|
@@ -397,39 +430,6 @@ struct Tuple2<int64_t, N> {
|
|
|
397
430
|
int64x1x2_t raw;
|
|
398
431
|
};
|
|
399
432
|
|
|
400
|
-
template <>
|
|
401
|
-
struct Tuple2<float16_t, 8> {
|
|
402
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
403
|
-
float16x8x2_t raw;
|
|
404
|
-
#else
|
|
405
|
-
uint16x8x2_t raw;
|
|
406
|
-
#endif
|
|
407
|
-
};
|
|
408
|
-
template <size_t N>
|
|
409
|
-
struct Tuple2<float16_t, N> {
|
|
410
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
411
|
-
float16x4x2_t raw;
|
|
412
|
-
#else
|
|
413
|
-
uint16x4x2_t raw;
|
|
414
|
-
#endif
|
|
415
|
-
};
|
|
416
|
-
template <>
|
|
417
|
-
struct Tuple2<bfloat16_t, 8> {
|
|
418
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
419
|
-
bfloat16x8x2_t raw;
|
|
420
|
-
#else
|
|
421
|
-
uint16x8x2_t raw;
|
|
422
|
-
#endif
|
|
423
|
-
};
|
|
424
|
-
template <size_t N>
|
|
425
|
-
struct Tuple2<bfloat16_t, N> {
|
|
426
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
427
|
-
bfloat16x4x2_t raw;
|
|
428
|
-
#else
|
|
429
|
-
uint16x4x2_t raw;
|
|
430
|
-
#endif
|
|
431
|
-
};
|
|
432
|
-
|
|
433
433
|
template <>
|
|
434
434
|
struct Tuple2<float32_t, 4> {
|
|
435
435
|
float32x4x2_t raw;
|
|
@@ -514,39 +514,6 @@ struct Tuple3<int64_t, N> {
|
|
|
514
514
|
int64x1x3_t raw;
|
|
515
515
|
};
|
|
516
516
|
|
|
517
|
-
template <>
|
|
518
|
-
struct Tuple3<float16_t, 8> {
|
|
519
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
520
|
-
float16x8x3_t raw;
|
|
521
|
-
#else
|
|
522
|
-
uint16x8x3_t raw;
|
|
523
|
-
#endif
|
|
524
|
-
};
|
|
525
|
-
template <size_t N>
|
|
526
|
-
struct Tuple3<float16_t, N> {
|
|
527
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
528
|
-
float16x4x3_t raw;
|
|
529
|
-
#else
|
|
530
|
-
uint16x4x3_t raw;
|
|
531
|
-
#endif
|
|
532
|
-
};
|
|
533
|
-
template <>
|
|
534
|
-
struct Tuple3<bfloat16_t, 8> {
|
|
535
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
536
|
-
bfloat16x8x3_t raw;
|
|
537
|
-
#else
|
|
538
|
-
uint16x8x3_t raw;
|
|
539
|
-
#endif
|
|
540
|
-
};
|
|
541
|
-
template <size_t N>
|
|
542
|
-
struct Tuple3<bfloat16_t, N> {
|
|
543
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
544
|
-
bfloat16x4x3_t raw;
|
|
545
|
-
#else
|
|
546
|
-
uint16x4x3_t raw;
|
|
547
|
-
#endif
|
|
548
|
-
};
|
|
549
|
-
|
|
550
517
|
template <>
|
|
551
518
|
struct Tuple3<float32_t, 4> {
|
|
552
519
|
float32x4x3_t raw;
|
|
@@ -631,39 +598,6 @@ struct Tuple4<int64_t, N> {
|
|
|
631
598
|
int64x1x4_t raw;
|
|
632
599
|
};
|
|
633
600
|
|
|
634
|
-
template <>
|
|
635
|
-
struct Tuple4<float16_t, 8> {
|
|
636
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
637
|
-
float16x8x4_t raw;
|
|
638
|
-
#else
|
|
639
|
-
uint16x8x4_t raw;
|
|
640
|
-
#endif
|
|
641
|
-
};
|
|
642
|
-
template <size_t N>
|
|
643
|
-
struct Tuple4<float16_t, N> {
|
|
644
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
645
|
-
float16x4x4_t raw;
|
|
646
|
-
#else
|
|
647
|
-
uint16x4x4_t raw;
|
|
648
|
-
#endif
|
|
649
|
-
};
|
|
650
|
-
template <>
|
|
651
|
-
struct Tuple4<bfloat16_t, 8> {
|
|
652
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
653
|
-
bfloat16x8x4_t raw;
|
|
654
|
-
#else
|
|
655
|
-
uint16x8x4_t raw;
|
|
656
|
-
#endif
|
|
657
|
-
};
|
|
658
|
-
template <size_t N>
|
|
659
|
-
struct Tuple4<bfloat16_t, N> {
|
|
660
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
661
|
-
bfloat16x4x4_t raw;
|
|
662
|
-
#else
|
|
663
|
-
uint16x4x4_t raw;
|
|
664
|
-
#endif
|
|
665
|
-
};
|
|
666
|
-
|
|
667
601
|
template <>
|
|
668
602
|
struct Tuple4<float32_t, 4> {
|
|
669
603
|
float32x4x4_t raw;
|
|
@@ -686,215 +620,213 @@ struct Tuple4<float64_t, N> {
|
|
|
686
620
|
template <typename T, size_t N>
|
|
687
621
|
struct Raw128;
|
|
688
622
|
|
|
689
|
-
// 128
|
|
690
623
|
template <>
|
|
691
624
|
struct Raw128<uint8_t, 16> {
|
|
692
625
|
using type = uint8x16_t;
|
|
693
626
|
};
|
|
627
|
+
template <size_t N>
|
|
628
|
+
struct Raw128<uint8_t, N> {
|
|
629
|
+
using type = uint8x8_t;
|
|
630
|
+
};
|
|
694
631
|
|
|
695
632
|
template <>
|
|
696
633
|
struct Raw128<uint16_t, 8> {
|
|
697
634
|
using type = uint16x8_t;
|
|
698
635
|
};
|
|
636
|
+
template <size_t N>
|
|
637
|
+
struct Raw128<uint16_t, N> {
|
|
638
|
+
using type = uint16x4_t;
|
|
639
|
+
};
|
|
699
640
|
|
|
700
641
|
template <>
|
|
701
642
|
struct Raw128<uint32_t, 4> {
|
|
702
643
|
using type = uint32x4_t;
|
|
703
644
|
};
|
|
645
|
+
template <size_t N>
|
|
646
|
+
struct Raw128<uint32_t, N> {
|
|
647
|
+
using type = uint32x2_t;
|
|
648
|
+
};
|
|
704
649
|
|
|
705
650
|
template <>
|
|
706
651
|
struct Raw128<uint64_t, 2> {
|
|
707
652
|
using type = uint64x2_t;
|
|
708
653
|
};
|
|
654
|
+
template <>
|
|
655
|
+
struct Raw128<uint64_t, 1> {
|
|
656
|
+
using type = uint64x1_t;
|
|
657
|
+
};
|
|
709
658
|
|
|
710
659
|
template <>
|
|
711
660
|
struct Raw128<int8_t, 16> {
|
|
712
661
|
using type = int8x16_t;
|
|
713
662
|
};
|
|
663
|
+
template <size_t N>
|
|
664
|
+
struct Raw128<int8_t, N> {
|
|
665
|
+
using type = int8x8_t;
|
|
666
|
+
};
|
|
714
667
|
|
|
715
668
|
template <>
|
|
716
669
|
struct Raw128<int16_t, 8> {
|
|
717
670
|
using type = int16x8_t;
|
|
718
671
|
};
|
|
672
|
+
template <size_t N>
|
|
673
|
+
struct Raw128<int16_t, N> {
|
|
674
|
+
using type = int16x4_t;
|
|
675
|
+
};
|
|
719
676
|
|
|
720
677
|
template <>
|
|
721
678
|
struct Raw128<int32_t, 4> {
|
|
722
679
|
using type = int32x4_t;
|
|
723
680
|
};
|
|
681
|
+
template <size_t N>
|
|
682
|
+
struct Raw128<int32_t, N> {
|
|
683
|
+
using type = int32x2_t;
|
|
684
|
+
};
|
|
724
685
|
|
|
725
686
|
template <>
|
|
726
687
|
struct Raw128<int64_t, 2> {
|
|
727
688
|
using type = int64x2_t;
|
|
728
689
|
};
|
|
729
|
-
|
|
730
|
-
template <>
|
|
731
|
-
struct Raw128<float16_t, 8> {
|
|
732
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
733
|
-
using type = float16x8_t;
|
|
734
|
-
#else
|
|
735
|
-
using type = uint16x8_t;
|
|
736
|
-
#endif
|
|
737
|
-
};
|
|
738
|
-
|
|
739
690
|
template <>
|
|
740
|
-
struct Raw128<
|
|
741
|
-
|
|
742
|
-
using type = bfloat16x8_t;
|
|
743
|
-
#else
|
|
744
|
-
using type = uint16x8_t;
|
|
745
|
-
#endif
|
|
691
|
+
struct Raw128<int64_t, 1> {
|
|
692
|
+
using type = int64x1_t;
|
|
746
693
|
};
|
|
747
694
|
|
|
748
695
|
template <>
|
|
749
696
|
struct Raw128<float, 4> {
|
|
750
697
|
using type = float32x4_t;
|
|
751
698
|
};
|
|
699
|
+
template <size_t N>
|
|
700
|
+
struct Raw128<float, N> {
|
|
701
|
+
using type = float32x2_t;
|
|
702
|
+
};
|
|
752
703
|
|
|
753
704
|
#if HWY_HAVE_FLOAT64
|
|
754
705
|
template <>
|
|
755
706
|
struct Raw128<double, 2> {
|
|
756
707
|
using type = float64x2_t;
|
|
757
708
|
};
|
|
758
|
-
#endif // HWY_HAVE_FLOAT64
|
|
759
|
-
|
|
760
|
-
// 64
|
|
761
709
|
template <>
|
|
762
|
-
struct Raw128<
|
|
763
|
-
using type =
|
|
710
|
+
struct Raw128<double, 1> {
|
|
711
|
+
using type = float64x1_t;
|
|
764
712
|
};
|
|
713
|
+
#endif // HWY_HAVE_FLOAT64
|
|
765
714
|
|
|
766
|
-
|
|
767
|
-
struct Raw128<uint16_t, 4> {
|
|
768
|
-
using type = uint16x4_t;
|
|
769
|
-
};
|
|
715
|
+
#if HWY_NEON_HAVE_F16C
|
|
770
716
|
|
|
771
717
|
template <>
|
|
772
|
-
struct
|
|
773
|
-
|
|
718
|
+
struct Tuple2<float16_t, 8> {
|
|
719
|
+
float16x8x2_t raw;
|
|
774
720
|
};
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
using type = uint64x1_t;
|
|
721
|
+
template <size_t N>
|
|
722
|
+
struct Tuple2<float16_t, N> {
|
|
723
|
+
float16x4x2_t raw;
|
|
779
724
|
};
|
|
780
725
|
|
|
781
726
|
template <>
|
|
782
|
-
struct
|
|
783
|
-
|
|
727
|
+
struct Tuple3<float16_t, 8> {
|
|
728
|
+
float16x8x3_t raw;
|
|
784
729
|
};
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
using type = int16x4_t;
|
|
730
|
+
template <size_t N>
|
|
731
|
+
struct Tuple3<float16_t, N> {
|
|
732
|
+
float16x4x3_t raw;
|
|
789
733
|
};
|
|
790
734
|
|
|
791
735
|
template <>
|
|
792
|
-
struct
|
|
793
|
-
|
|
736
|
+
struct Tuple4<float16_t, 8> {
|
|
737
|
+
float16x8x4_t raw;
|
|
794
738
|
};
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
using type = int64x1_t;
|
|
739
|
+
template <size_t N>
|
|
740
|
+
struct Tuple4<float16_t, N> {
|
|
741
|
+
float16x4x4_t raw;
|
|
799
742
|
};
|
|
800
743
|
|
|
801
744
|
template <>
|
|
802
|
-
struct Raw128<float16_t,
|
|
803
|
-
|
|
745
|
+
struct Raw128<float16_t, 8> {
|
|
746
|
+
using type = float16x8_t;
|
|
747
|
+
};
|
|
748
|
+
template <size_t N>
|
|
749
|
+
struct Raw128<float16_t, N> {
|
|
804
750
|
using type = float16x4_t;
|
|
805
|
-
#else
|
|
806
|
-
using type = uint16x4_t;
|
|
807
|
-
#endif
|
|
808
751
|
};
|
|
809
752
|
|
|
810
|
-
|
|
811
|
-
|
|
753
|
+
#else // !HWY_NEON_HAVE_F16C
|
|
754
|
+
|
|
755
|
+
template <size_t N>
|
|
756
|
+
struct Tuple2<float16_t, N> : public Tuple2<uint16_t, N> {};
|
|
757
|
+
template <size_t N>
|
|
758
|
+
struct Tuple3<float16_t, N> : public Tuple3<uint16_t, N> {};
|
|
759
|
+
template <size_t N>
|
|
760
|
+
struct Tuple4<float16_t, N> : public Tuple4<uint16_t, N> {};
|
|
761
|
+
template <size_t N>
|
|
762
|
+
struct Raw128<float16_t, N> : public Raw128<uint16_t, N> {};
|
|
763
|
+
|
|
764
|
+
#endif // HWY_NEON_HAVE_F16C
|
|
765
|
+
|
|
812
766
|
#if HWY_NEON_HAVE_BFLOAT16
|
|
813
|
-
using type = bfloat16x4_t;
|
|
814
|
-
#else
|
|
815
|
-
using type = uint16x4_t;
|
|
816
|
-
#endif
|
|
817
|
-
};
|
|
818
767
|
|
|
819
768
|
template <>
|
|
820
|
-
struct
|
|
821
|
-
|
|
769
|
+
struct Tuple2<bfloat16_t, 8> {
|
|
770
|
+
bfloat16x8x2_t raw;
|
|
822
771
|
};
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
struct Raw128<double, 1> {
|
|
827
|
-
using type = float64x1_t;
|
|
772
|
+
template <size_t N>
|
|
773
|
+
struct Tuple2<bfloat16_t, N> {
|
|
774
|
+
bfloat16x4x2_t raw;
|
|
828
775
|
};
|
|
829
|
-
#endif // HWY_HAVE_FLOAT64
|
|
830
776
|
|
|
831
|
-
// 32 (same as 64)
|
|
832
777
|
template <>
|
|
833
|
-
struct
|
|
778
|
+
struct Tuple3<bfloat16_t, 8> {
|
|
779
|
+
bfloat16x8x3_t raw;
|
|
780
|
+
};
|
|
781
|
+
template <size_t N>
|
|
782
|
+
struct Tuple3<bfloat16_t, N> {
|
|
783
|
+
bfloat16x4x3_t raw;
|
|
784
|
+
};
|
|
834
785
|
|
|
835
786
|
template <>
|
|
836
|
-
struct
|
|
787
|
+
struct Tuple4<bfloat16_t, 8> {
|
|
788
|
+
bfloat16x8x4_t raw;
|
|
789
|
+
};
|
|
790
|
+
template <size_t N>
|
|
791
|
+
struct Tuple4<bfloat16_t, N> {
|
|
792
|
+
bfloat16x4x4_t raw;
|
|
793
|
+
};
|
|
837
794
|
|
|
838
795
|
template <>
|
|
839
|
-
struct Raw128<
|
|
796
|
+
struct Raw128<bfloat16_t, 8> {
|
|
797
|
+
using type = bfloat16x8_t;
|
|
798
|
+
};
|
|
799
|
+
template <size_t N>
|
|
800
|
+
struct Raw128<bfloat16_t, N> {
|
|
801
|
+
using type = bfloat16x4_t;
|
|
802
|
+
};
|
|
840
803
|
|
|
841
|
-
|
|
842
|
-
struct Raw128<int8_t, 4> : public Raw128<int8_t, 8> {};
|
|
804
|
+
#else // !HWY_NEON_HAVE_BFLOAT16
|
|
843
805
|
|
|
844
|
-
template
|
|
845
|
-
struct
|
|
806
|
+
template <size_t N>
|
|
807
|
+
struct Tuple2<bfloat16_t, N> : public Tuple2<uint16_t, N> {};
|
|
808
|
+
template <size_t N>
|
|
809
|
+
struct Tuple3<bfloat16_t, N> : public Tuple3<uint16_t, N> {};
|
|
810
|
+
template <size_t N>
|
|
811
|
+
struct Tuple4<bfloat16_t, N> : public Tuple4<uint16_t, N> {};
|
|
812
|
+
template <size_t N>
|
|
813
|
+
struct Raw128<bfloat16_t, N> : public Raw128<uint16_t, N> {};
|
|
846
814
|
|
|
847
|
-
|
|
848
|
-
struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {};
|
|
815
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
849
816
|
|
|
850
|
-
|
|
851
|
-
struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {};
|
|
817
|
+
} // namespace detail
|
|
852
818
|
|
|
853
|
-
template
|
|
854
|
-
|
|
819
|
+
template <typename T, size_t N = 16 / sizeof(T)>
|
|
820
|
+
class Vec128 {
|
|
821
|
+
public:
|
|
822
|
+
using Raw = typename detail::Raw128<T, N>::type;
|
|
823
|
+
using PrivateT = T; // only for DFromV
|
|
824
|
+
static constexpr size_t kPrivateN = N; // only for DFromV
|
|
855
825
|
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
template <>
|
|
861
|
-
struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {};
|
|
862
|
-
|
|
863
|
-
template <>
|
|
864
|
-
struct Raw128<uint16_t, 1> : public Raw128<uint16_t, 4> {};
|
|
865
|
-
|
|
866
|
-
template <>
|
|
867
|
-
struct Raw128<int8_t, 2> : public Raw128<int8_t, 8> {};
|
|
868
|
-
|
|
869
|
-
template <>
|
|
870
|
-
struct Raw128<int16_t, 1> : public Raw128<int16_t, 4> {};
|
|
871
|
-
|
|
872
|
-
template <>
|
|
873
|
-
struct Raw128<float16_t, 1> : public Raw128<float16_t, 4> {};
|
|
874
|
-
|
|
875
|
-
template <>
|
|
876
|
-
struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {};
|
|
877
|
-
|
|
878
|
-
// 8 (same as 64)
|
|
879
|
-
template <>
|
|
880
|
-
struct Raw128<uint8_t, 1> : public Raw128<uint8_t, 8> {};
|
|
881
|
-
|
|
882
|
-
template <>
|
|
883
|
-
struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {};
|
|
884
|
-
|
|
885
|
-
} // namespace detail
|
|
886
|
-
|
|
887
|
-
template <typename T, size_t N = 16 / sizeof(T)>
|
|
888
|
-
class Vec128 {
|
|
889
|
-
public:
|
|
890
|
-
using Raw = typename detail::Raw128<T, N>::type;
|
|
891
|
-
using PrivateT = T; // only for DFromV
|
|
892
|
-
static constexpr size_t kPrivateN = N; // only for DFromV
|
|
893
|
-
|
|
894
|
-
HWY_INLINE Vec128() {}
|
|
895
|
-
Vec128(const Vec128&) = default;
|
|
896
|
-
Vec128& operator=(const Vec128&) = default;
|
|
897
|
-
HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
|
|
826
|
+
HWY_INLINE Vec128() {}
|
|
827
|
+
Vec128(const Vec128&) = default;
|
|
828
|
+
Vec128& operator=(const Vec128&) = default;
|
|
829
|
+
HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
|
|
898
830
|
|
|
899
831
|
// Compound assignment. Only usable if there is a corresponding non-member
|
|
900
832
|
// binary operator overload. For example, only f32 and f64 support division.
|
|
@@ -910,6 +842,9 @@ class Vec128 {
|
|
|
910
842
|
HWY_INLINE Vec128& operator-=(const Vec128 other) {
|
|
911
843
|
return *this = (*this - other);
|
|
912
844
|
}
|
|
845
|
+
HWY_INLINE Vec128& operator%=(const Vec128 other) {
|
|
846
|
+
return *this = (*this % other);
|
|
847
|
+
}
|
|
913
848
|
HWY_INLINE Vec128& operator&=(const Vec128 other) {
|
|
914
849
|
return *this = (*this & other);
|
|
915
850
|
}
|
|
@@ -978,26 +913,22 @@ namespace detail {
|
|
|
978
913
|
#define HWY_NEON_BUILD_ARG_HWY_SET t
|
|
979
914
|
|
|
980
915
|
HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)
|
|
981
|
-
|
|
982
|
-
#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_FLOAT16C
|
|
916
|
+
#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C
|
|
983
917
|
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET)
|
|
984
918
|
#endif
|
|
919
|
+
HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)
|
|
920
|
+
|
|
921
|
+
template <class D, HWY_NEON_IF_EMULATED_D(D)>
|
|
922
|
+
HWY_API Vec128<TFromD<D>, MaxLanes(D())> NativeSet(D d, TFromD<D> t) {
|
|
923
|
+
const uint16_t tu = BitCastScalar<uint16_t>(t);
|
|
924
|
+
return Vec128<TFromD<D>, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
|
|
925
|
+
}
|
|
985
926
|
|
|
986
927
|
#undef HWY_NEON_BUILD_TPL_HWY_SET
|
|
987
928
|
#undef HWY_NEON_BUILD_RET_HWY_SET
|
|
988
929
|
#undef HWY_NEON_BUILD_PARAM_HWY_SET
|
|
989
930
|
#undef HWY_NEON_BUILD_ARG_HWY_SET
|
|
990
931
|
|
|
991
|
-
#if !HWY_NEON_HAVE_BFLOAT16
|
|
992
|
-
// BF16: return u16.
|
|
993
|
-
template <class D, HWY_IF_BF16_D(D)>
|
|
994
|
-
HWY_API Vec128<bfloat16_t, MaxLanes(D())> NativeSet(D d, bfloat16_t t) {
|
|
995
|
-
uint16_t tu;
|
|
996
|
-
CopyBytes<sizeof(tu)>(&t, &tu);
|
|
997
|
-
return Vec128<bfloat16_t, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
|
|
998
|
-
}
|
|
999
|
-
#endif // !HWY_NEON_HAVE_BFLOAT16
|
|
1000
|
-
|
|
1001
932
|
} // namespace detail
|
|
1002
933
|
|
|
1003
934
|
// Full vector. Cannot yet use VFromD because that is defined in terms of Set.
|
|
@@ -1033,165 +964,323 @@ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
|
|
|
1033
964
|
|
|
1034
965
|
template <class D>
|
|
1035
966
|
HWY_API VFromD<D> Undefined(D /*tag*/) {
|
|
967
|
+
#if HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
|
|
968
|
+
return VFromD<D>{__builtin_nondeterministic_value(Zero(D()).raw)};
|
|
969
|
+
#else
|
|
1036
970
|
VFromD<D> v;
|
|
1037
971
|
return v;
|
|
972
|
+
#endif
|
|
1038
973
|
}
|
|
1039
974
|
|
|
1040
975
|
HWY_DIAGNOSTICS(pop)
|
|
1041
976
|
|
|
977
|
+
#if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
|
|
1042
978
|
namespace detail {
|
|
1043
979
|
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
980
|
+
#pragma pack(push, 1)
|
|
981
|
+
|
|
982
|
+
template <class T>
|
|
983
|
+
struct alignas(8) Vec64ValsWrapper {
|
|
984
|
+
static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
|
|
985
|
+
static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
|
|
986
|
+
T vals[8 / sizeof(T)];
|
|
987
|
+
};
|
|
988
|
+
|
|
989
|
+
#pragma pack(pop)
|
|
990
|
+
|
|
991
|
+
} // namespace detail
|
|
992
|
+
#endif // !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
|
|
993
|
+
|
|
994
|
+
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
995
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
996
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
997
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
998
|
+
TFromD<D> /*t8*/, TFromD<D> /*t9*/,
|
|
999
|
+
TFromD<D> /*t10*/, TFromD<D> /*t11*/,
|
|
1000
|
+
TFromD<D> /*t12*/, TFromD<D> /*t13*/,
|
|
1001
|
+
TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
|
|
1047
1002
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1048
|
-
typedef
|
|
1049
|
-
|
|
1050
|
-
const
|
|
1003
|
+
typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8)));
|
|
1004
|
+
(void)d;
|
|
1005
|
+
const GccI8RawVectType raw = {
|
|
1006
|
+
static_cast<int8_t>(t0), static_cast<int8_t>(t1), static_cast<int8_t>(t2),
|
|
1007
|
+
static_cast<int8_t>(t3), static_cast<int8_t>(t4), static_cast<int8_t>(t5),
|
|
1008
|
+
static_cast<int8_t>(t6), static_cast<int8_t>(t7)};
|
|
1009
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1051
1010
|
#else
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1011
|
+
return ResizeBitCast(
|
|
1012
|
+
d, Set(Full64<uint64_t>(),
|
|
1013
|
+
BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
|
|
1014
|
+
{t0, t1, t2, t3, t4, t5, t6, t7}})));
|
|
1055
1015
|
#endif
|
|
1056
|
-
return BitCast(d, vu8_iota0);
|
|
1057
1016
|
}
|
|
1058
1017
|
|
|
1059
|
-
template <class D,
|
|
1060
|
-
|
|
1061
|
-
|
|
1018
|
+
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
1019
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1020
|
+
TFromD<D> t2, TFromD<D> t3,
|
|
1021
|
+
TFromD<D> /*t4*/, TFromD<D> /*t5*/,
|
|
1022
|
+
TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
|
|
1062
1023
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1063
|
-
typedef
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1024
|
+
typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8)));
|
|
1025
|
+
(void)d;
|
|
1026
|
+
const GccI16RawVectType raw = {
|
|
1027
|
+
static_cast<int16_t>(t0), static_cast<int16_t>(t1),
|
|
1028
|
+
static_cast<int16_t>(t2), static_cast<int16_t>(t3)};
|
|
1029
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1067
1030
|
#else
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1031
|
+
return ResizeBitCast(
|
|
1032
|
+
d, Set(Full64<uint64_t>(),
|
|
1033
|
+
BitCastScalar<uint64_t>(
|
|
1034
|
+
detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}})));
|
|
1071
1035
|
#endif
|
|
1072
|
-
return BitCast(d, vu8_iota0);
|
|
1073
1036
|
}
|
|
1074
1037
|
|
|
1075
|
-
template <class D,
|
|
1076
|
-
|
|
1077
|
-
|
|
1038
|
+
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
1039
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1040
|
+
TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
|
|
1078
1041
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1079
|
-
typedef
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1042
|
+
typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8)));
|
|
1043
|
+
(void)d;
|
|
1044
|
+
const GccI32RawVectType raw = {static_cast<int32_t>(t0),
|
|
1045
|
+
static_cast<int32_t>(t1)};
|
|
1046
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1083
1047
|
#else
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1048
|
+
return ResizeBitCast(d,
|
|
1049
|
+
Set(Full64<uint64_t>(),
|
|
1050
|
+
BitCastScalar<uint64_t>(
|
|
1051
|
+
detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
|
|
1087
1052
|
#endif
|
|
1088
1053
|
}
|
|
1089
1054
|
|
|
1090
|
-
template <class D,
|
|
1091
|
-
|
|
1092
|
-
|
|
1055
|
+
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
1056
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1057
|
+
TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
|
|
1093
1058
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1094
|
-
typedef
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(kIota0));
|
|
1059
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
|
|
1060
|
+
(void)d;
|
|
1061
|
+
const GccF32RawVectType raw = {t0, t1};
|
|
1062
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1099
1063
|
#else
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1064
|
+
return ResizeBitCast(d,
|
|
1065
|
+
Set(Full64<uint64_t>(),
|
|
1066
|
+
BitCastScalar<uint64_t>(
|
|
1067
|
+
detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
|
|
1103
1068
|
#endif
|
|
1104
1069
|
}
|
|
1105
1070
|
|
|
1106
|
-
template <class D,
|
|
1107
|
-
|
|
1108
|
-
|
|
1071
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
|
|
1072
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
|
|
1073
|
+
return Set(d, t0);
|
|
1074
|
+
}
|
|
1075
|
+
|
|
1076
|
+
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1077
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1078
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1079
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
1080
|
+
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
1081
|
+
TFromD<D> t11, TFromD<D> t12,
|
|
1082
|
+
TFromD<D> t13, TFromD<D> t14,
|
|
1083
|
+
TFromD<D> t15) {
|
|
1109
1084
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1110
|
-
typedef
|
|
1111
|
-
|
|
1112
|
-
const
|
|
1113
|
-
|
|
1085
|
+
typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
|
|
1086
|
+
(void)d;
|
|
1087
|
+
const GccI8RawVectType raw = {
|
|
1088
|
+
static_cast<int8_t>(t0), static_cast<int8_t>(t1),
|
|
1089
|
+
static_cast<int8_t>(t2), static_cast<int8_t>(t3),
|
|
1090
|
+
static_cast<int8_t>(t4), static_cast<int8_t>(t5),
|
|
1091
|
+
static_cast<int8_t>(t6), static_cast<int8_t>(t7),
|
|
1092
|
+
static_cast<int8_t>(t8), static_cast<int8_t>(t9),
|
|
1093
|
+
static_cast<int8_t>(t10), static_cast<int8_t>(t11),
|
|
1094
|
+
static_cast<int8_t>(t12), static_cast<int8_t>(t13),
|
|
1095
|
+
static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
|
|
1096
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1114
1097
|
#else
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1098
|
+
const Half<decltype(d)> dh;
|
|
1099
|
+
return Combine(d,
|
|
1100
|
+
Dup128VecFromValues(dh, t8, t9, t10, t11, t12, t13, t14, t15,
|
|
1101
|
+
t8, t9, t10, t11, t12, t13, t14, t15),
|
|
1102
|
+
Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1,
|
|
1103
|
+
t2, t3, t4, t5, t6, t7));
|
|
1118
1104
|
#endif
|
|
1119
|
-
return BitCast(d, vu32_iota0);
|
|
1120
1105
|
}
|
|
1121
1106
|
|
|
1122
|
-
template <class D,
|
|
1123
|
-
|
|
1124
|
-
|
|
1107
|
+
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1108
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1109
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1110
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
1111
|
+
TFromD<D> t7) {
|
|
1125
1112
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1126
|
-
typedef
|
|
1127
|
-
|
|
1128
|
-
const
|
|
1129
|
-
|
|
1113
|
+
typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
|
|
1114
|
+
(void)d;
|
|
1115
|
+
const GccI16RawVectType raw = {
|
|
1116
|
+
static_cast<int16_t>(t0), static_cast<int16_t>(t1),
|
|
1117
|
+
static_cast<int16_t>(t2), static_cast<int16_t>(t3),
|
|
1118
|
+
static_cast<int16_t>(t4), static_cast<int16_t>(t5),
|
|
1119
|
+
static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
|
|
1120
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1130
1121
|
#else
|
|
1131
|
-
|
|
1132
|
-
|
|
1122
|
+
const Half<decltype(d)> dh;
|
|
1123
|
+
return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7),
|
|
1124
|
+
Dup128VecFromValues(dh, t0, t1, t2, t3, t0, t1, t2, t3));
|
|
1133
1125
|
#endif
|
|
1134
|
-
return BitCast(d, vu32_iota0);
|
|
1135
1126
|
}
|
|
1136
1127
|
|
|
1137
|
-
template <class D,
|
|
1138
|
-
|
|
1128
|
+
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1129
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1130
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
1139
1131
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1140
|
-
typedef
|
|
1141
|
-
|
|
1142
|
-
|
|
1132
|
+
typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
|
|
1133
|
+
(void)d;
|
|
1134
|
+
const GccI32RawVectType raw = {
|
|
1135
|
+
static_cast<int32_t>(t0), static_cast<int32_t>(t1),
|
|
1136
|
+
static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
|
|
1137
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1143
1138
|
#else
|
|
1144
|
-
|
|
1145
|
-
return
|
|
1146
|
-
|
|
1139
|
+
const Half<decltype(d)> dh;
|
|
1140
|
+
return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
|
|
1141
|
+
Dup128VecFromValues(dh, t0, t1, t0, t1));
|
|
1147
1142
|
#endif
|
|
1148
1143
|
}
|
|
1149
1144
|
|
|
1150
|
-
template <class D,
|
|
1151
|
-
|
|
1145
|
+
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1146
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1147
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
1152
1148
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1153
1149
|
typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
|
|
1154
|
-
|
|
1155
|
-
|
|
1150
|
+
(void)d;
|
|
1151
|
+
const GccF32RawVectType raw = {t0, t1, t2, t3};
|
|
1152
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1156
1153
|
#else
|
|
1157
|
-
|
|
1158
|
-
return
|
|
1154
|
+
const Half<decltype(d)> dh;
|
|
1155
|
+
return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
|
|
1156
|
+
Dup128VecFromValues(dh, t0, t1, t0, t1));
|
|
1159
1157
|
#endif
|
|
1160
1158
|
}
|
|
1161
1159
|
|
|
1162
|
-
template <class D,
|
|
1163
|
-
|
|
1164
|
-
return Zero(d);
|
|
1165
|
-
}
|
|
1166
|
-
|
|
1167
|
-
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
1168
|
-
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1169
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
1160
|
+
template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1161
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
|
|
1170
1162
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1171
|
-
typedef
|
|
1172
|
-
|
|
1173
|
-
const
|
|
1174
|
-
|
|
1163
|
+
typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
|
|
1164
|
+
(void)d;
|
|
1165
|
+
const GccI64RawVectType raw = {static_cast<int64_t>(t0),
|
|
1166
|
+
static_cast<int64_t>(t1)};
|
|
1167
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1175
1168
|
#else
|
|
1176
|
-
|
|
1177
|
-
|
|
1169
|
+
const Half<decltype(d)> dh;
|
|
1170
|
+
return Combine(d, Set(dh, t1), Set(dh, t0));
|
|
1178
1171
|
#endif
|
|
1179
|
-
return BitCast(d, vu64_iota0);
|
|
1180
1172
|
}
|
|
1181
1173
|
|
|
1182
1174
|
#if HWY_HAVE_FLOAT64
|
|
1183
|
-
template <class D,
|
|
1184
|
-
|
|
1175
|
+
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1176
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
|
|
1185
1177
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1186
1178
|
typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
|
|
1187
|
-
|
|
1188
|
-
|
|
1179
|
+
(void)d;
|
|
1180
|
+
const GccF64RawVectType raw = {t0, t1};
|
|
1181
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1189
1182
|
#else
|
|
1190
|
-
|
|
1191
|
-
return
|
|
1183
|
+
const Half<decltype(d)> dh;
|
|
1184
|
+
return Combine(d, Set(dh, t1), Set(dh, t0));
|
|
1192
1185
|
#endif
|
|
1193
1186
|
}
|
|
1194
|
-
#endif
|
|
1187
|
+
#endif
|
|
1188
|
+
|
|
1189
|
+
// Generic for all vector lengths
|
|
1190
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
1191
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1192
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1193
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
1194
|
+
TFromD<D> t7) {
|
|
1195
|
+
const RebindToSigned<decltype(d)> di;
|
|
1196
|
+
return BitCast(d,
|
|
1197
|
+
Dup128VecFromValues(
|
|
1198
|
+
di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
|
|
1199
|
+
BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
|
|
1200
|
+
BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
|
|
1201
|
+
BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
|
|
1202
|
+
}
|
|
1203
|
+
|
|
1204
|
+
#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
|
|
1205
|
+
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
1206
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1207
|
+
TFromD<D> t2, TFromD<D> t3,
|
|
1208
|
+
TFromD<D> /*t4*/, TFromD<D> /*t5*/,
|
|
1209
|
+
TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
|
|
1210
|
+
typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8)));
|
|
1211
|
+
(void)d;
|
|
1212
|
+
const GccF16RawVectType raw = {
|
|
1213
|
+
static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
|
|
1214
|
+
static_cast<__fp16>(t3)};
|
|
1215
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1216
|
+
}
|
|
1217
|
+
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1218
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1219
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1220
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
1221
|
+
TFromD<D> t7) {
|
|
1222
|
+
typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16)));
|
|
1223
|
+
(void)d;
|
|
1224
|
+
const GccF16RawVectType raw = {
|
|
1225
|
+
static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
|
|
1226
|
+
static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5),
|
|
1227
|
+
static_cast<__fp16>(t6), static_cast<__fp16>(t7)};
|
|
1228
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1229
|
+
}
|
|
1230
|
+
#else
|
|
1231
|
+
// Generic for all vector lengths if MSVC or !HWY_NEON_HAVE_F16C
|
|
1232
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
1233
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1234
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1235
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
1236
|
+
TFromD<D> t7) {
|
|
1237
|
+
const RebindToSigned<decltype(d)> di;
|
|
1238
|
+
return BitCast(d,
|
|
1239
|
+
Dup128VecFromValues(
|
|
1240
|
+
di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
|
|
1241
|
+
BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
|
|
1242
|
+
BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
|
|
1243
|
+
BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
|
|
1244
|
+
}
|
|
1245
|
+
#endif // (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
|
|
1246
|
+
|
|
1247
|
+
namespace detail {
|
|
1248
|
+
|
|
1249
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
1250
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1251
|
+
return Dup128VecFromValues(
|
|
1252
|
+
d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4},
|
|
1253
|
+
TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9},
|
|
1254
|
+
TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14},
|
|
1255
|
+
TFromD<D>{15});
|
|
1256
|
+
}
|
|
1257
|
+
|
|
1258
|
+
template <class D, HWY_IF_UI16_D(D)>
|
|
1259
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1260
|
+
return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
|
|
1261
|
+
TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
|
|
1262
|
+
TFromD<D>{6}, TFromD<D>{7});
|
|
1263
|
+
}
|
|
1264
|
+
|
|
1265
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
1266
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1267
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1268
|
+
return BitCast(d, Dup128VecFromValues(du, uint16_t{0}, uint16_t{0x3C00},
|
|
1269
|
+
uint16_t{0x4000}, uint16_t{0x4200},
|
|
1270
|
+
uint16_t{0x4400}, uint16_t{0x4500},
|
|
1271
|
+
uint16_t{0x4600}, uint16_t{0x4700}));
|
|
1272
|
+
}
|
|
1273
|
+
|
|
1274
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
1275
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1276
|
+
return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
|
|
1277
|
+
TFromD<D>{3});
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
1281
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1282
|
+
return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1});
|
|
1283
|
+
}
|
|
1195
1284
|
|
|
1196
1285
|
#if HWY_COMPILER_MSVC
|
|
1197
1286
|
template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
|
|
@@ -1226,9 +1315,6 @@ HWY_API VFromD<D> Iota(D d, const T2 first) {
|
|
|
1226
1315
|
#endif
|
|
1227
1316
|
}
|
|
1228
1317
|
|
|
1229
|
-
// ------------------------------ Tuple (VFromD)
|
|
1230
|
-
#include "hwy/ops/tuple-inl.h"
|
|
1231
|
-
|
|
1232
1318
|
// ------------------------------ Combine
|
|
1233
1319
|
|
|
1234
1320
|
// Full result
|
|
@@ -1274,30 +1360,25 @@ HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi,
|
|
|
1274
1360
|
return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
|
|
1275
1361
|
}
|
|
1276
1362
|
|
|
1277
|
-
template <class D, HWY_IF_F16_D(D)>
|
|
1278
|
-
HWY_API Vec128<float16_t> Combine(D d, Vec64<float16_t> hi,
|
|
1279
|
-
Vec64<float16_t> lo) {
|
|
1280
1363
|
#if HWY_HAVE_FLOAT16
|
|
1281
|
-
|
|
1364
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
1365
|
+
HWY_API Vec128<float16_t> Combine(D, Vec64<float16_t> hi, Vec64<float16_t> lo) {
|
|
1282
1366
|
return Vec128<float16_t>(vcombine_f16(lo.raw, hi.raw));
|
|
1283
|
-
#else
|
|
1284
|
-
const RebindToUnsigned<D> du;
|
|
1285
|
-
const Half<decltype(du)> duh;
|
|
1286
|
-
return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
|
|
1287
|
-
#endif
|
|
1288
1367
|
}
|
|
1368
|
+
#endif // HWY_HAVE_FLOAT16
|
|
1289
1369
|
|
|
1290
|
-
template <class D, HWY_IF_BF16_D(D)>
|
|
1291
|
-
HWY_API Vec128<bfloat16_t> Combine(D d, Vec64<bfloat16_t> hi,
|
|
1292
|
-
Vec64<bfloat16_t> lo) {
|
|
1293
1370
|
#if HWY_NEON_HAVE_BFLOAT16
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1371
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
1372
|
+
HWY_API VFromD<D> Combine(D, Vec64<bfloat16_t> hi, Vec64<bfloat16_t> lo) {
|
|
1373
|
+
return VFromD<D>(vcombine_bf16(lo.raw, hi.raw));
|
|
1374
|
+
}
|
|
1375
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
1376
|
+
|
|
1377
|
+
template <class D, class DH = Half<D>, HWY_NEON_IF_EMULATED_D(D)>
|
|
1378
|
+
HWY_API VFromD<D> Combine(D d, VFromD<DH> hi, VFromD<DH> lo) {
|
|
1297
1379
|
const RebindToUnsigned<D> du;
|
|
1298
1380
|
const Half<decltype(du)> duh;
|
|
1299
1381
|
return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
|
|
1300
|
-
#endif
|
|
1301
1382
|
}
|
|
1302
1383
|
|
|
1303
1384
|
template <class D, HWY_IF_F32_D(D)>
|
|
@@ -1341,7 +1422,7 @@ HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
|
|
|
1341
1422
|
HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
|
|
1342
1423
|
|
|
1343
1424
|
#if !HWY_HAVE_FLOAT16
|
|
1344
|
-
#if
|
|
1425
|
+
#if HWY_NEON_HAVE_F16C
|
|
1345
1426
|
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
|
|
1346
1427
|
HWY_CAST_TO_U8)
|
|
1347
1428
|
#else
|
|
@@ -1349,7 +1430,7 @@ template <size_t N>
|
|
|
1349
1430
|
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
|
|
1350
1431
|
return BitCastToByte(Vec128<uint16_t, N>(v.raw));
|
|
1351
1432
|
}
|
|
1352
|
-
#endif //
|
|
1433
|
+
#endif // HWY_NEON_HAVE_F16C
|
|
1353
1434
|
#endif // !HWY_HAVE_FLOAT16
|
|
1354
1435
|
|
|
1355
1436
|
#if !HWY_NEON_HAVE_BFLOAT16
|
|
@@ -1406,14 +1487,24 @@ HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
|
|
|
1406
1487
|
return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
|
|
1407
1488
|
}
|
|
1408
1489
|
|
|
1490
|
+
// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
|
|
1409
1491
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
|
|
1410
|
-
HWY_INLINE VFromD<D> BitCastFromByte(D
|
|
1411
|
-
#if HWY_HAVE_FLOAT16 ||
|
|
1412
|
-
(void)d;
|
|
1492
|
+
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
|
|
1493
|
+
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
|
|
1413
1494
|
return VFromD<D>(vreinterpret_f16_u8(v.raw));
|
|
1414
1495
|
#else
|
|
1415
1496
|
const RebindToUnsigned<D> du;
|
|
1416
|
-
return VFromD<
|
|
1497
|
+
return VFromD<D>(BitCastFromByte(du, v).raw);
|
|
1498
|
+
#endif
|
|
1499
|
+
}
|
|
1500
|
+
|
|
1501
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
1502
|
+
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
|
|
1503
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
1504
|
+
return VFromD<D>(vreinterpret_bf16_u8(v.raw));
|
|
1505
|
+
#else
|
|
1506
|
+
const RebindToUnsigned<D> du;
|
|
1507
|
+
return VFromD<D>(BitCastFromByte(du, v).raw);
|
|
1417
1508
|
#endif
|
|
1418
1509
|
}
|
|
1419
1510
|
|
|
@@ -1461,15 +1552,6 @@ HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
|
|
|
1461
1552
|
return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
|
|
1462
1553
|
}
|
|
1463
1554
|
|
|
1464
|
-
template <class D, HWY_IF_F16_D(D)>
|
|
1465
|
-
HWY_INLINE Vec128<float16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
|
|
1466
|
-
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_FLOAT16C
|
|
1467
|
-
return Vec128<float16_t>(vreinterpretq_f16_u8(v.raw));
|
|
1468
|
-
#else
|
|
1469
|
-
return Vec128<float16_t>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
|
|
1470
|
-
#endif
|
|
1471
|
-
}
|
|
1472
|
-
|
|
1473
1555
|
template <class D, HWY_IF_F32_D(D)>
|
|
1474
1556
|
HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
|
|
1475
1557
|
return Vec128<float>(vreinterpretq_f32_u8(v.raw));
|
|
@@ -1482,11 +1564,23 @@ HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
|
|
|
1482
1564
|
}
|
|
1483
1565
|
#endif // HWY_HAVE_FLOAT64
|
|
1484
1566
|
|
|
1485
|
-
//
|
|
1567
|
+
// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
|
|
1568
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
1569
|
+
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
|
|
1570
|
+
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
|
|
1571
|
+
return VFromD<D>(vreinterpretq_f16_u8(v.raw));
|
|
1572
|
+
#else
|
|
1573
|
+
return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
|
|
1574
|
+
#endif
|
|
1575
|
+
}
|
|
1576
|
+
|
|
1486
1577
|
template <class D, HWY_IF_BF16_D(D)>
|
|
1487
|
-
HWY_INLINE VFromD<D> BitCastFromByte(D
|
|
1488
|
-
|
|
1578
|
+
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
|
|
1579
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
1580
|
+
return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
|
|
1581
|
+
#else
|
|
1489
1582
|
return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
|
|
1583
|
+
#endif
|
|
1490
1584
|
}
|
|
1491
1585
|
|
|
1492
1586
|
} // namespace detail
|
|
@@ -1542,6 +1636,14 @@ namespace detail {
|
|
|
1542
1636
|
#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
|
|
1543
1637
|
|
|
1544
1638
|
HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
|
|
1639
|
+
HWY_NEON_DEF_FUNCTION_BFLOAT_16(GetLane, vget, _lane_, HWY_GET)
|
|
1640
|
+
|
|
1641
|
+
template <size_t kLane, class V, HWY_NEON_IF_EMULATED_D(DFromV<V>)>
|
|
1642
|
+
static HWY_INLINE HWY_MAYBE_UNUSED TFromV<V> GetLane(V v) {
|
|
1643
|
+
const DFromV<decltype(v)> d;
|
|
1644
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1645
|
+
return BitCastScalar<TFromV<V>>(GetLane<kLane>(BitCast(du, v)));
|
|
1646
|
+
}
|
|
1545
1647
|
|
|
1546
1648
|
#undef HWY_NEON_BUILD_TPL_HWY_GET
|
|
1547
1649
|
#undef HWY_NEON_BUILD_RET_HWY_GET
|
|
@@ -1688,12 +1790,21 @@ namespace detail {
|
|
|
1688
1790
|
#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
|
|
1689
1791
|
|
|
1690
1792
|
HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
|
|
1793
|
+
HWY_NEON_DEF_FUNCTION_BFLOAT_16(InsertLane, vset, _lane_, HWY_INSERT)
|
|
1691
1794
|
|
|
1692
1795
|
#undef HWY_NEON_BUILD_TPL_HWY_INSERT
|
|
1693
1796
|
#undef HWY_NEON_BUILD_RET_HWY_INSERT
|
|
1694
1797
|
#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
|
|
1695
1798
|
#undef HWY_NEON_BUILD_ARG_HWY_INSERT
|
|
1696
1799
|
|
|
1800
|
+
template <size_t kLane, class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
|
|
1801
|
+
HWY_API V InsertLane(const V v, TFromD<D> t) {
|
|
1802
|
+
const D d;
|
|
1803
|
+
const RebindToUnsigned<D> du;
|
|
1804
|
+
const uint16_t tu = BitCastScalar<uint16_t>(t);
|
|
1805
|
+
return BitCast(d, InsertLane<kLane>(BitCast(du, v), tu));
|
|
1806
|
+
}
|
|
1807
|
+
|
|
1697
1808
|
} // namespace detail
|
|
1698
1809
|
|
|
1699
1810
|
// Requires one overload per vector length because InsertLane<3> may be a
|
|
@@ -1842,6 +1953,89 @@ HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
 HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
   return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
 }
+HWY_API Vec128<int64_t> SumsOf8(const Vec128<int8_t> v) {
+  return Vec128<int64_t>(vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(v.raw))));
+}
+HWY_API Vec64<int64_t> SumsOf8(const Vec64<int8_t> v) {
+  return Vec64<int64_t>(vpaddl_s32(vpaddl_s16(vpaddl_s8(v.raw))));
+}
+
+// ------------------------------ SumsOf2
+namespace detail {
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s8(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s8(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u8(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u8(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s16(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s16(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u16(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u16(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s32(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s32(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u32(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u32(v.raw));
+}
+
+}  // namespace detail

 // ------------------------------ SaturatedAdd

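As a usage sketch (not part of the package diff): SumsOf8 reduces each group of 8 adjacent bytes to one 64-bit lane via the chained vpaddl widening adds above. This assumes Highway's public static-dispatch API; the name `SumBytes16` is illustrative.

```cpp
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Sum 16 bytes: SumsOf8 yields two u64 lanes, each the sum of 8 input bytes.
uint64_t SumBytes16(const uint8_t* HWY_RESTRICT p) {
  const hn::Full128<uint8_t> d8;
  const auto sums = hn::SumsOf8(hn::LoadU(d8, p));
  return hn::GetLane(sums) + hn::ExtractLane(sums, 1);
}
```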
@@ -1922,6 +2116,31 @@ HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
 #endif
 }

+// ------------------------------ SaturatedNeg
+#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
+#undef HWY_NATIVE_SATURATED_NEG_8_16_32
+#else
+#define HWY_NATIVE_SATURATED_NEG_8_16_32
+#endif
+
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedNeg, vqneg, _, 1)
+
+#if HWY_ARCH_ARM_A64
+#ifdef HWY_NATIVE_SATURATED_NEG_64
+#undef HWY_NATIVE_SATURATED_NEG_64
+#else
+#define HWY_NATIVE_SATURATED_NEG_64
+#endif
+
+HWY_API Vec64<int64_t> SaturatedNeg(const Vec64<int64_t> v) {
+  return Vec64<int64_t>(vqneg_s64(v.raw));
+}
+
+HWY_API Vec128<int64_t> SaturatedNeg(const Vec128<int64_t> v) {
+  return Vec128<int64_t>(vqnegq_s64(v.raw));
+}
+#endif
+
 // ------------------------------ ShiftLeft

 // Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
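A hedged sketch of the semantics (not part of the package diff): vqneg clamps instead of wrapping, so negating the most negative value yields the type's maximum. Assumes a Highway version that exposes this op (the flag above opts this target into the native path); the function name is illustrative.

```cpp
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

int8_t DemoSaturatedNeg() {
  const hn::Full128<int8_t> d;
  const auto v = hn::Set(d, INT8_MIN);      // -128 in every lane
  return hn::GetLane(hn::SaturatedNeg(v));  // +127: clamps, no wraparound
}
```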
@@ -1943,12 +2162,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
 #pragma pop_macro("HWY_NEON_DEF_FUNCTION")

 // ------------------------------ RotateRight (ShiftRight, Or)
-template <int kBits, typename T, size_t N>
+template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(T) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-
+
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }

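For orientation (not part of the package diff): the rewritten body rotates via an unsigned (logical) right shift OR'd with a left shift, so sign bits no longer smear. A minimal check, assuming the public static-dispatch API; the function name is illustrative.

```cpp
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

uint32_t DemoRotateRight() {
  const hn::Full128<uint32_t> d;
  // Each lane 0x80000001 rotated right by 1 becomes 0xC0000000.
  return hn::GetLane(hn::RotateRight<1>(hn::Set(d, 0x80000001u)));
}
```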
@@ -2138,7 +2361,39 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)

 // ------------------------------ Integer multiplication

-// Returns the upper
+// Returns the upper sizeof(T)*8 bits of a * b in each lane.
+HWY_API Vec128<int8_t> MulHigh(Vec128<int8_t> a, Vec128<int8_t> b) {
+  int16x8_t rlo = vmull_s8(vget_low_s8(a.raw), vget_low_s8(b.raw));
+#if HWY_ARCH_ARM_A64
+  int16x8_t rhi = vmull_high_s8(a.raw, b.raw);
+#else
+  int16x8_t rhi = vmull_s8(vget_high_s8(a.raw), vget_high_s8(b.raw));
+#endif
+  return Vec128<int8_t>(
+      vuzp2q_s8(vreinterpretq_s8_s16(rlo), vreinterpretq_s8_s16(rhi)));
+}
+HWY_API Vec128<uint8_t> MulHigh(Vec128<uint8_t> a, Vec128<uint8_t> b) {
+  uint16x8_t rlo = vmull_u8(vget_low_u8(a.raw), vget_low_u8(b.raw));
+#if HWY_ARCH_ARM_A64
+  uint16x8_t rhi = vmull_high_u8(a.raw, b.raw);
+#else
+  uint16x8_t rhi = vmull_u8(vget_high_u8(a.raw), vget_high_u8(b.raw));
+#endif
+  return Vec128<uint8_t>(
+      vuzp2q_u8(vreinterpretq_u8_u16(rlo), vreinterpretq_u8_u16(rhi)));
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
+HWY_API Vec128<int8_t, N> MulHigh(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
+  int8x16_t hi_lo = vreinterpretq_s8_s16(vmull_s8(a.raw, b.raw));
+  return Vec128<int8_t, N>(vget_low_s8(vuzp2q_s8(hi_lo, hi_lo)));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
+HWY_API Vec128<uint8_t, N> MulHigh(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
+  uint8x16_t hi_lo = vreinterpretq_u8_u16(vmull_u8(a.raw, b.raw));
+  return Vec128<uint8_t, N>(vget_low_u8(vuzp2q_u8(hi_lo, hi_lo)));
+}
+
 HWY_API Vec128<int16_t> MulHigh(Vec128<int16_t> a, Vec128<int16_t> b) {
   int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
 #if HWY_ARCH_ARM_A64
@@ -2172,10 +2427,61 @@ HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
   return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
 }

-HWY_API Vec128<
-
-
-
+HWY_API Vec128<int32_t> MulHigh(Vec128<int32_t> a, Vec128<int32_t> b) {
+  int64x2_t rlo = vmull_s32(vget_low_s32(a.raw), vget_low_s32(b.raw));
+#if HWY_ARCH_ARM_A64
+  int64x2_t rhi = vmull_high_s32(a.raw, b.raw);
+#else
+  int64x2_t rhi = vmull_s32(vget_high_s32(a.raw), vget_high_s32(b.raw));
+#endif
+  return Vec128<int32_t>(
+      vuzp2q_s32(vreinterpretq_s32_s64(rlo), vreinterpretq_s32_s64(rhi)));
+}
+HWY_API Vec128<uint32_t> MulHigh(Vec128<uint32_t> a, Vec128<uint32_t> b) {
+  uint64x2_t rlo = vmull_u32(vget_low_u32(a.raw), vget_low_u32(b.raw));
+#if HWY_ARCH_ARM_A64
+  uint64x2_t rhi = vmull_high_u32(a.raw, b.raw);
+#else
+  uint64x2_t rhi = vmull_u32(vget_high_u32(a.raw), vget_high_u32(b.raw));
+#endif
+  return Vec128<uint32_t>(
+      vuzp2q_u32(vreinterpretq_u32_u64(rlo), vreinterpretq_u32_u64(rhi)));
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
+HWY_API Vec128<int32_t, N> MulHigh(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
+  int32x4_t hi_lo = vreinterpretq_s32_s64(vmull_s32(a.raw, b.raw));
+  return Vec128<int32_t, N>(vget_low_s32(vuzp2q_s32(hi_lo, hi_lo)));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
+HWY_API Vec128<uint32_t, N> MulHigh(Vec128<uint32_t, N> a,
+                                    Vec128<uint32_t, N> b) {
+  uint32x4_t hi_lo = vreinterpretq_u32_u64(vmull_u32(a.raw, b.raw));
+  return Vec128<uint32_t, N>(vget_low_u32(vuzp2q_u32(hi_lo, hi_lo)));
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
+  T hi_0;
+  T hi_1;
+
+  Mul128(GetLane(a), GetLane(b), &hi_0);
+  Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi_1);
+
+  return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
+  T hi;
+  Mul128(GetLane(a), GetLane(b), &hi);
+  return Set(Full64<T>(), hi);
+}
+
+HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
+  return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
+}
+template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
 HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                            Vec128<int16_t, N> b) {
   return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
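A minimal semantics sketch (not part of the package diff): MulHigh keeps only the upper half of each lane's full-width product, which the overloads above obtain via widening multiplies plus an unzip. Assumes the public static-dispatch API; names are illustrative.

```cpp
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

int16_t DemoMulHigh() {
  const hn::Full128<int16_t> d;
  const auto a = hn::Set(d, static_cast<int16_t>(16384));
  const auto b = hn::Set(d, static_cast<int16_t>(512));
  // Full product is 2^23; its upper 16 bits are 2^7 = 128.
  return hn::GetLane(hn::MulHigh(a, b));
}
```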
@@ -2277,7 +2583,7 @@ HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,

 namespace detail {

-#if
+#if HWY_NATIVE_FMA
 // Wrappers for changing argument order to what intrinsics expect.
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(MulAdd, vfma, _, 3)
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(NegMulAdd, vfms, _, 3)
@@ -2295,7 +2601,7 @@ HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> add, Vec128<float, N> mul,
   return add - mul * x;
 }

-#endif //
+#endif  // HWY_NATIVE_FMA
 }  // namespace detail

 template <typename T, size_t N, HWY_IF_FLOAT(T)>
@@ -2310,13 +2616,13 @@ HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
   return detail::NegMulAdd(add, mul, x);
 }

-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                             Vec128<T, N> sub) {
   return MulAdd(mul, x, Neg(sub));
 }

-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                                Vec128<T, N> sub) {
   return Neg(MulAdd(mul, x, sub));
@@ -2612,6 +2918,15 @@ HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Abs, vabs, _, 1)
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1)

+// ------------------------------ SaturatedAbs
+#ifdef HWY_NATIVE_SATURATED_ABS
+#undef HWY_NATIVE_SATURATED_ABS
+#else
+#define HWY_NATIVE_SATURATED_ABS
+#endif
+
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1)
+
 // ------------------------------ CopySign
 template <typename T, size_t N>
 HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
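A hedged sketch of the difference from plain Abs (not part of the package diff): vqabs clamps the one value whose absolute value is unrepresentable. Assumes a Highway version exposing SaturatedAbs; the function name is illustrative.

```cpp
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

int16_t DemoSaturatedAbs() {
  const hn::Full128<int16_t> d;
  const auto v = hn::Set(d, INT16_MIN);     // -32768; plain Abs would wrap
  return hn::GetLane(hn::SaturatedAbs(v));  // +32767
}
```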
@@ -2675,22 +2990,49 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {

 HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)

+#if HWY_HAVE_FLOAT16
+#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>)
+#else
+#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_SPECIAL_FLOAT_V(V)
+#endif
+
+template <class V, HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V)>
+HWY_API V IfThenElse(MFromD<DFromV<V>> mask, V yes, V no) {
+  const DFromV<decltype(yes)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(
+      d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
+}
+
+#undef HWY_NEON_IF_EMULATED_IF_THEN_ELSE
 #undef HWY_NEON_BUILD_TPL_HWY_IF
 #undef HWY_NEON_BUILD_RET_HWY_IF
 #undef HWY_NEON_BUILD_PARAM_HWY_IF
 #undef HWY_NEON_BUILD_ARG_HWY_IF

 // mask ? yes : 0
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
   return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
 }
+template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
+HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
+  const DFromV<decltype(yes)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
+}

 // mask ? 0 : no
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
   return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
 }
+template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
+HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
+  const DFromV<decltype(no)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
+}

 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
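For orientation, a minimal caller sketch (not part of the package diff): IfThenElseZero implements `mask ? v : 0`; the special-float overloads above reach the same result after a round trip through the unsigned lane type. Assumes the public static-dispatch API; the function name is illustrative.

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Zero out negative lanes, i.e. a vectorized ReLU.
hn::Vec128<float> ReluF32(hn::Vec128<float> v) {
  const hn::Full128<float> d;
  return hn::IfThenElseZero(hn::Ge(v, hn::Zero(d)), v);
}
```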
@@ -2703,12 +3045,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
   return IfThenElse(m, yes, no);
 }

-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
-  const auto zero = Zero(DFromV<decltype(v)>());
-  return Max(zero, v);
-}
-
 // ------------------------------ Mask logical

 template <typename T, size_t N>
@@ -2957,6 +3293,23 @@ HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
 #endif
 }

+HWY_API Vec128<int64_t> SaturatedAbs(const Vec128<int64_t> v) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int64_t>(vqabsq_s64(v.raw));
+#else
+  const auto zero = Zero(DFromV<decltype(v)>());
+  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
+#endif
+}
+HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
+#if HWY_ARCH_ARM_A64
+  return Vec64<int64_t>(vqabs_s64(v.raw));
+#else
+  const auto zero = Zero(DFromV<decltype(v)>());
+  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
+#endif
+}
+
 // ------------------------------ Min (IfThenElse, BroadcastSignBit)

 // Unsigned
@@ -3133,6 +3486,20 @@ HWY_API Vec128<int64_t> LoadU(D /* tag */,
                               const int64_t* HWY_RESTRICT unaligned) {
   return Vec128<int64_t>(vld1q_s64(unaligned));
 }
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API Vec128<float16_t> LoadU(D /* tag */,
+                                const float16_t* HWY_RESTRICT unaligned) {
+  return Vec128<float16_t>(vld1q_f16(detail::NativeLanePointer(unaligned)));
+}
+#endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
+HWY_API Vec128<bfloat16_t> LoadU(D /* tag */,
+                                 const bfloat16_t* HWY_RESTRICT unaligned) {
+  return Vec128<bfloat16_t>(vld1q_bf16(detail::NativeLanePointer(unaligned)));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) {
   return Vec128<float>(vld1q_f32(unaligned));
@@ -3179,6 +3546,18 @@ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
 HWY_API Vec64<int64_t> LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) {
   return Vec64<int64_t>(vld1_s64(p));
 }
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+HWY_API Vec64<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
+  return Vec64<float16_t>(vld1_f16(detail::NativeLanePointer(p)));
+}
+#endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
+HWY_API Vec64<bfloat16_t> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
+  return Vec64<bfloat16_t>(vld1_bf16(detail::NativeLanePointer(p)));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
 HWY_API Vec64<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
   return Vec64<float>(vld1_f32(p));
@@ -3207,14 +3586,34 @@ HWY_API Vec32<float> LoadU(D /*tag*/, const float* HWY_RESTRICT p) {
   return Vec32<float>(vld1_dup_f32(p));
 }

-
-
+// {u,i}{8,16}
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
+          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
+HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const Repartition<uint32_t, decltype(d)> d32;
+  uint32_t buf;
+  CopyBytes<4>(p, &buf);
+  return BitCast(d, LoadU(d32, &buf));
+}
+
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const Repartition<uint32_t, decltype(d)> d32;
+  uint32_t buf;
+  CopyBytes<4>(p, &buf);
+  return BitCast(d, LoadU(d32, &buf));
+}
+#endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
   const Repartition<uint32_t, decltype(d)> d32;
   uint32_t buf;
   CopyBytes<4>(p, &buf);
   return BitCast(d, LoadU(d32, &buf));
 }
+#endif  // HWY_NEON_HAVE_BFLOAT16

 // ------------------------------ Load 16

@@ -3228,6 +3627,18 @@ template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
 HWY_API VFromD<D> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
   return VFromD<D>(vld1_dup_s16(p));
 }
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
+  return VFromD<D>(vld1_dup_f16(detail::NativeLanePointer(p)));
+}
+#endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
+  return VFromD<D>(vld1_dup_bf16(detail::NativeLanePointer(p)));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16

 // 8-bit x2
 template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
@@ -3250,12 +3661,10 @@ HWY_API VFromD<D> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {

 // ------------------------------ Load misc

-
-template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
+template <class D, HWY_NEON_IF_EMULATED_D(D)>
 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
-  const RebindToUnsigned<decltype(d)>
-
-  return BitCast(d, LoadU(du16, pu16));
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
 }

 // On Arm, Load is the same as LoadU.
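For orientation, a minimal caller sketch (not part of the package diff): all of the LoadU/StoreU plumbing above exists so that partial and emulated lane types round-trip through memory like any other type. Assumes the public static-dispatch API; the function name is illustrative.

```cpp
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Copy four u16 lanes through a half-width (64-bit) vector; the 4-byte
// LoadU/StoreU overloads above handle even narrower cases the same way.
void Copy4Lanes(const uint16_t* HWY_RESTRICT from, uint16_t* HWY_RESTRICT to) {
  const hn::Full64<uint16_t> d;
  hn::StoreU(hn::LoadU(d, from), d, to);
}
```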
@@ -3324,6 +3733,20 @@ HWY_API void StoreU(Vec128<int64_t> v, D /* tag */,
                     int64_t* HWY_RESTRICT unaligned) {
   vst1q_s64(unaligned, v.raw);
 }
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API void StoreU(Vec128<float16_t> v, D /* tag */,
+                    float16_t* HWY_RESTRICT unaligned) {
+  vst1q_f16(detail::NativeLanePointer(unaligned), v.raw);
+}
+#endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
+HWY_API void StoreU(Vec128<bfloat16_t> v, D /* tag */,
+                    bfloat16_t* HWY_RESTRICT unaligned) {
+  vst1q_bf16(detail::NativeLanePointer(unaligned), v.raw);
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API void StoreU(Vec128<float> v, D /* tag */,
                     float* HWY_RESTRICT unaligned) {
@@ -3371,6 +3794,20 @@ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
 HWY_API void StoreU(Vec64<int64_t> v, D /* tag */, int64_t* HWY_RESTRICT p) {
   vst1_s64(p, v.raw);
 }
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+HWY_API void StoreU(Vec64<float16_t> v, D /* tag */,
+                    float16_t* HWY_RESTRICT p) {
+  vst1_f16(detail::NativeLanePointer(p), v.raw);
+}
+#endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
+HWY_API void StoreU(Vec64<bfloat16_t> v, D /* tag */,
+                    bfloat16_t* HWY_RESTRICT p) {
+  vst1_bf16(detail::NativeLanePointer(p), v.raw);
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
 HWY_API void StoreU(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
   vst1_f32(p, v.raw);
@@ -3397,28 +3834,31 @@ HWY_API void StoreU(Vec32<float> v, D, float* HWY_RESTRICT p) {
   vst1_lane_f32(p, v.raw, 0);
 }

-//
-template <class D, HWY_IF_V_SIZE_D(D, 4),
-
-HWY_API void StoreU(
+// {u,i}{8,16}
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
+          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
+HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   Repartition<uint32_t, decltype(d)> d32;
   uint32_t buf = GetLane(BitCast(d32, v));
   CopyBytes<4>(&buf, p);
 }

-
-
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
+HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   Repartition<uint32_t, decltype(d)> d32;
   uint32_t buf = GetLane(BitCast(d32, v));
   CopyBytes<4>(&buf, p);
 }
-
-
-
+#endif
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
+HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   Repartition<uint32_t, decltype(d)> d32;
   uint32_t buf = GetLane(BitCast(d32, v));
   CopyBytes<4>(&buf, p);
 }
+#endif  // HWY_NEON_HAVE_BFLOAT16

 // ------------------------------ Store 16

@@ -3430,6 +3870,18 @@ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)>
 HWY_API void StoreU(Vec16<int16_t> v, D, int16_t* HWY_RESTRICT p) {
   vst1_lane_s16(p, v.raw, 0);
 }
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_F16_D(D)>
+HWY_API void StoreU(Vec16<float16_t> v, D, float16_t* HWY_RESTRICT p) {
+  vst1_lane_f16(detail::NativeLanePointer(p), v.raw, 0);
+}
+#endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_BF16_D(D)>
+HWY_API void StoreU(Vec16<bfloat16_t> v, D, bfloat16_t* HWY_RESTRICT p) {
+  vst1_lane_bf16(detail::NativeLanePointer(p), v.raw, 0);
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16

 template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
@@ -3449,12 +3901,12 @@ HWY_API void StoreU(Vec128<int8_t, 1> v, D, int8_t* HWY_RESTRICT p) {
   vst1_lane_s8(p, v.raw, 0);
 }

-//
-
+// ------------------------------ Store misc
+
+template <class D, HWY_NEON_IF_EMULATED_D(D)>
 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
-  const RebindToUnsigned<decltype(d)>
-
-  return StoreU(BitCast(du16, v), du16, pu16);
+  const RebindToUnsigned<decltype(d)> du;
+  return StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
 }

 HWY_DIAGNOSTICS(push)
@@ -3541,24 +3993,6 @@ HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToUnsigned<D>> v) {
   return VFromD<D>(vcvt_f32_u32(v.raw));
 }

-// Truncates (rounds toward zero).
-template <class D, HWY_IF_I32_D(D)>
-HWY_API Vec128<int32_t> ConvertTo(D /* tag */, Vec128<float> v) {
-  return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
-HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
-  return VFromD<D>(vcvt_s32_f32(v.raw));
-}
-template <class D, HWY_IF_U32_D(D)>
-HWY_API Vec128<uint32_t> ConvertTo(D /* tag */, Vec128<float> v) {
-  return Vec128<uint32_t>(vcvtq_u32_f32(ZeroIfNegative(v).raw));
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
-HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
-  return VFromD<D>(vcvt_u32_f32(ZeroIfNegative(v).raw));
-}
-
 #if HWY_HAVE_FLOAT64

 template <class D, HWY_IF_F64_D(D)>
@@ -3577,51 +4011,168 @@ HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<int64_t> v) {

 template <class D, HWY_IF_F64_D(D)>
 HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<uint64_t> v) {
-  return Vec128<double>(vcvtq_f64_u64(
+  return Vec128<double>(vcvtq_f64_u64(v.raw));
 }
 template <class D, HWY_IF_F64_D(D)>
 HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
   // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
-  const auto non_neg_v = ZeroIfNegative(v);
 #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
-  return Set(Full64<double>(), static_cast<double>(GetLane(
+  return Set(Full64<double>(), static_cast<double>(GetLane(v)));
 #else
-  return Vec64<double>(vcvt_f64_u64(
+  return Vec64<double>(vcvt_f64_u64(v.raw));
 #endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
 }

+#endif  // HWY_HAVE_FLOAT64
+
+namespace detail {
 // Truncates (rounds toward zero).
-template <class D,
-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
+HWY_INLINE Vec128<int32_t> ConvertFToI(D /* tag */, Vec128<float> v) {
+#if HWY_COMPILER_CLANG && \
+    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+  // outside of the range of an int32_t.
+
+  int32x4_t raw_result;
+  __asm__(
+#if HWY_ARCH_ARM_A64
+      "fcvtzs %0.4s, %1.4s"
+#else
+      "vcvt.s32.f32 %0, %1"
+#endif
+      : "=w"(raw_result)
+      : "w"(v.raw));
+  return Vec128<int32_t>(raw_result);
+#else
+  return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
+HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<RebindToFloat<D>> v) {
+#if HWY_COMPILER_CLANG && \
+    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+  // outside of the range of an int32_t.
+
+  int32x2_t raw_result;
+  __asm__(
+#if HWY_ARCH_ARM_A64
+      "fcvtzs %0.2s, %1.2s"
+#else
+      "vcvt.s32.f32 %0, %1"
+#endif
+      : "=w"(raw_result)
+      : "w"(v.raw));
+  return VFromD<D>(raw_result);
+#else
+  return VFromD<D>(vcvt_s32_f32(v.raw));
+#endif
+}
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
+HWY_INLINE Vec128<uint32_t> ConvertFToU(D /* tag */, Vec128<float> v) {
+#if HWY_COMPILER_CLANG && \
+    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+  // outside of the range of an uint32_t.
+
+  uint32x4_t raw_result;
+  __asm__(
+#if HWY_ARCH_ARM_A64
+      "fcvtzu %0.4s, %1.4s"
+#else
+      "vcvt.u32.f32 %0, %1"
+#endif
+      : "=w"(raw_result)
+      : "w"(v.raw));
+  return Vec128<uint32_t>(raw_result);
+#else
+  return Vec128<uint32_t>(vcvtq_u32_f32(v.raw));
+#endif
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
+HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<RebindToFloat<D>> v) {
+#if HWY_COMPILER_CLANG && \
+    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+  // outside of the range of an uint32_t.
+
+  uint32x2_t raw_result;
+  __asm__(
+#if HWY_ARCH_ARM_A64
+      "fcvtzu %0.2s, %1.2s"
+#else
+      "vcvt.u32.f32 %0, %1"
+#endif
+      : "=w"(raw_result)
+      : "w"(v.raw));
+  return VFromD<D>(raw_result);
+#else
+  return VFromD<D>(vcvt_u32_f32(v.raw));
+#endif
+}
+
+#if HWY_HAVE_FLOAT64
+
+// Truncates (rounds toward zero).
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
+HWY_INLINE Vec128<int64_t> ConvertFToI(D /* tag */, Vec128<double> v) {
+#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
+  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+  // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
+  int64x2_t raw_result;
+  __asm__("fcvtzs %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
+  return Vec128<int64_t>(raw_result);
+#else
   return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
+#endif
 }
-template <class D, HWY_IF_I64_D(D)>
-
-
-
-
-
-
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
+HWY_INLINE Vec64<int64_t> ConvertFToI(D /* tag */, Vec64<double> v) {
+#if HWY_ARCH_ARM_A64 && \
+    ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
+     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
+  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+  // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
+  // If compiling for AArch64 NEON with GCC 6 or earlier, use inline assembly to
+  // work around the missing vcvt_s64_f64 intrinsic.
+  int64x1_t raw_result;
+  __asm__("fcvtzs %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
+  return Vec64<int64_t>(raw_result);
 #else
-  (void)di;
   return Vec64<int64_t>(vcvt_s64_f64(v.raw));
 #endif
 }
-template <class D, HWY_IF_U64_D(D)>
-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
+HWY_INLINE Vec128<uint64_t> ConvertFToU(D /* tag */, Vec128<double> v) {
+#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
+  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+  // to avoid undefined behavior if v[i] is outside of the range of an uint64_t.
+  uint64x2_t raw_result;
+  __asm__("fcvtzu %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
+  return Vec128<uint64_t>(raw_result);
+#else
   return Vec128<uint64_t>(vcvtq_u64_f64(v.raw));
+#endif
 }
-template <class D, HWY_IF_U64_D(D)>
-
-
-
-
-
-
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
+HWY_INLINE Vec64<uint64_t> ConvertFToU(D /* tag */, Vec64<double> v) {
+#if HWY_ARCH_ARM_A64 && \
+    ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
+     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
+  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+  // to avoid undefined behavior if v[i] is outside of the range of an uint64_t.
+
+  // Inline assembly is also used if compiling for AArch64 NEON with GCC 6 or
+  // earlier to work around the issue of the missing vcvt_u64_f64 intrinsic.
+  uint64x1_t raw_result;
+  __asm__("fcvtzu %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
+  return Vec64<uint64_t>(raw_result);
 #else
-  (void)du;
   return Vec64<uint64_t>(vcvt_u64_f64(v.raw));
 #endif
 }
@@ -3631,25 +4182,76 @@ HWY_API Vec64<uint64_t> ConvertTo(D du, Vec64<double> v) {
 #if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16

 // Truncates (rounds toward zero).
-template <class D, HWY_IF_I16_D(D)>
-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
+HWY_INLINE Vec128<int16_t> ConvertFToI(D /* tag */, Vec128<float16_t> v) {
+#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+  // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
+  int16x8_t raw_result;
+  __asm__("fcvtzs %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
+  return Vec128<int16_t>(raw_result);
+#else
   return Vec128<int16_t>(vcvtq_s16_f16(v.raw));
+#endif
 }
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
-
+HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+  // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
+  int16x4_t raw_result;
+  __asm__("fcvtzs %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
+  return VFromD<D>(raw_result);
+#else
   return VFromD<D>(vcvt_s16_f16(v.raw));
+#endif
 }

-template <class D, HWY_IF_U16_D(D)>
-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
+HWY_INLINE Vec128<uint16_t> ConvertFToU(D /* tag */, Vec128<float16_t> v) {
+#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+  // to avoid undefined behavior if v[i] is outside of the range of an uint16_t.
+  uint16x8_t raw_result;
+  __asm__("fcvtzu %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
+  return Vec128<uint16_t>(raw_result);
+#else
   return Vec128<uint16_t>(vcvtq_u16_f16(v.raw));
+#endif
 }
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
-
+HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+  // to avoid undefined behavior if v[i] is outside of the range of an uint16_t.
+  uint16x4_t raw_result;
+  __asm__("fcvtzu %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
+  return VFromD<D>(raw_result);
+#else
   return VFromD<D>(vcvt_u16_f16(v.raw));
+#endif
 }

 #endif  // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
+}  // namespace detail
+
+template <class D, HWY_IF_SIGNED_D(D),
+          HWY_IF_T_SIZE_ONE_OF_D(
+              D, (1 << 4) |
+                     ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
+                     (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
+HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
+  return detail::ConvertFToI(di, v);
+}
+
+template <class D, HWY_IF_UNSIGNED_D(D),
+          HWY_IF_T_SIZE_ONE_OF_D(
+              D, (1 << 4) |
+                     ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
+                     (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
+HWY_API VFromD<D> ConvertTo(D du, VFromD<RebindToFloat<D>> v) {
+  return detail::ConvertFToU(du, v);
+}

 // ------------------------------ PromoteTo (ConvertTo)

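For orientation, a minimal caller sketch (not part of the package diff): the public ConvertTo now dispatches to the ConvertFToI/ConvertFToU helpers above, which truncate toward zero and, on the listed old compilers, use inline asm so out-of-range inputs stay defined (fcvtzs/fcvtzu saturate). Assumes the public static-dispatch API; the function name is illustrative.

```cpp
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

int32_t DemoTruncate() {
  const hn::Full128<float> df;
  const hn::RebindToSigned<decltype(df)> di;
  // Rounds toward zero: -1.9f -> -1 in every lane.
  return hn::GetLane(hn::ConvertTo(di, hn::Set(df, -1.9f)));
}
```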
@@ -3782,7 +4384,7 @@ HWY_API VFromD<D> PromoteTo(D d, V v) {
   return PromoteTo(d, PromoteTo(di32, v));
 }

-#if
+#if HWY_NEON_HAVE_F16C

 // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
 #ifdef HWY_NATIVE_F16C
@@ -3800,7 +4402,7 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
   return VFromD<D>(vget_low_f32(vcvt_f32_f16(v.raw)));
 }

-#endif //
+#endif  // HWY_NEON_HAVE_F16C

 #if HWY_HAVE_FLOAT64

@@ -3893,8 +4495,36 @@ HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
                  lo32_or_mask);
 }

+#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#endif
+
+template <class D, HWY_IF_UI64_D(D)>
+HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
+  const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
+  const RebindToFloat<decltype(d32)> df32;
+  const RebindToUnsigned<decltype(d32)> du32;
+  const Repartition<uint8_t, decltype(d32)> du32_as_du8;
+
+  constexpr uint32_t kExpAdjDecr =
+      0xFFFFFF9Du + static_cast<uint32_t>(!IsSigned<TFromD<D>>());
+
+  const auto exponent_adj = BitCast(
+      du32, SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
+                         BitCast(du32_as_du8, Set(du32, kExpAdjDecr))));
+  const auto adj_v =
+      BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
+
+  return PromoteTo(d64, ConvertTo(d32, adj_v)) << PromoteTo(d64, exponent_adj);
+}
+
 #endif  // HWY_HAVE_FLOAT64

+// ------------------------------ PromoteEvenTo/PromoteOddTo
+#include "hwy/ops/inside-inl.h"
+
 // ------------------------------ PromoteUpperTo

 #if HWY_ARCH_ARM_A64
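A hedged usage sketch (not part of the package diff): PromoteInRangeTo widens f32 lanes to 64-bit integers via the exponent-adjustment trick above; per its name, results are only defined for inputs already in range of the destination type. Assumes a Highway version shipping this op; the function name is illustrative.

```cpp
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Promote two in-range f32 lanes to u64 lanes.
hn::Vec128<uint64_t> DemoPromoteInRange(hn::Vec64<float> v) {
  const hn::Full128<uint64_t> d64;
  return hn::PromoteInRangeTo(d64, v);
}
```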
@@ -3946,14 +4576,14 @@ HWY_API Vec128<int64_t> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
   return Vec128<int64_t>(vmovl_high_s32(v.raw));
 }

-#if
+#if HWY_NEON_HAVE_F16C

 template <class D, HWY_IF_F32_D(D)>
 HWY_API Vec128<float> PromoteUpperTo(D /* tag */, Vec128<float16_t> v) {
   return Vec128<float>(vcvt_high_f32_f16(v.raw));
 }

-#endif //
+#endif  // HWY_NEON_HAVE_F16C

 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) {
@@ -4149,7 +4779,7 @@ HWY_API VFromD<D> DemoteTo(D d, Vec64<uint64_t> v) {
   return DemoteTo(d, DemoteTo(du32, v));
 }

-#if
+#if HWY_NEON_HAVE_F16C

 // We already toggled HWY_NATIVE_F16C above.

@@ -4162,16 +4792,47 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
   return VFromD<D>(vcvt_f16_f32(vcombine_f32(v.raw, v.raw)));
 }

-#endif //
+#endif  // HWY_NEON_HAVE_F16C

-
-
-
-
-
-
+#if HWY_NEON_HAVE_F32_TO_BF16C
+#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
+#else
+#define HWY_NATIVE_DEMOTE_F32_TO_BF16
+#endif
+
+namespace detail {
+#if HWY_NEON_HAVE_BFLOAT16
+// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
+// bfloat16x4_t or bfloat16x8_t.
+static HWY_INLINE bfloat16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
+  return raw;
+}
+#else
+// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
+// detail::Vec128<bfloat16_t, N>::type is uint16x4_t or uint16x8_t vector to
+// work around compiler bugs that are there with GCC 13 or earlier or Clang 16
+// or earlier on AArch64.
+
+// The bfloat16x4_t vector returned by vcvt_bf16_f32 needs to be bitcasted to
+// an uint16x4_t vector if HWY_NEON_HAVE_F32_TO_BF16C &&
+// !HWY_NEON_HAVE_BFLOAT16 is true.
+static HWY_INLINE uint16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
+  return vreinterpret_u16_bf16(raw);
+}
+#endif
+}  // namespace detail
+
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
+  return VFromD<D>(detail::BitCastFromRawNeonBF16(vcvt_bf16_f32(v.raw)));
 }
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
+  return VFromD<D>(detail::BitCastFromRawNeonBF16(
+      vcvt_bf16_f32(vcombine_f32(v.raw, v.raw))));
+}
+#endif  // HWY_NEON_HAVE_F32_TO_BF16C

 #if HWY_HAVE_FLOAT64

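For orientation, a minimal caller sketch (not part of the package diff): with HWY_NEON_HAVE_F32_TO_BF16C this narrowing maps to vcvt_bf16_f32 as shown above, otherwise Highway falls back to its generic demotion. Assumes the public static-dispatch API; the function name is illustrative.

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Narrow four f32 lanes to four bf16 lanes (a half-width vector).
hn::Vec64<hwy::bfloat16_t> DemoDemoteToBF16(hn::Vec128<float> v) {
  const hn::Full64<hwy::bfloat16_t> dbf;
  return hn::DemoteTo(dbf, v);
}
```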
@@ -4184,32 +4845,10 @@ HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
   return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
 }

-template <class D,
-HWY_API
-  const
-  return
-}
-template <class D, HWY_IF_I32_D(D)>
-HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<double> v) {
-  // There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the
-  // f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4.
-  const Full128<double> ddt;
-  const Full128<int64_t> dit;
-  return Vec32<int32_t>(vqmovn_s64(ConvertTo(dit, Combine(ddt, v, v)).raw));
-}
-
-template <class D, HWY_IF_U32_D(D)>
-HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<double> v) {
-  const uint64x2_t u64 = vcvtq_u64_f64(v.raw);
-  return Vec64<uint32_t>(vqmovn_u64(u64));
-}
-template <class D, HWY_IF_U32_D(D)>
-HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<double> v) {
-  // There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the
-  // f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4.
-  const Full128<double> ddt;
-  const Full128<uint64_t> du_t;
-  return Vec32<uint32_t>(vqmovn_u64(ConvertTo(du_t, Combine(ddt, v, v)).raw));
+template <class D, HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> DemoteTo(D d32, VFromD<Rebind<double, D>> v) {
+  const Rebind<MakeWide<TFromD<D>>, D> d64;
+  return DemoteTo(d32, ConvertTo(d64, v));
 }

 #endif  // HWY_HAVE_FLOAT64
@@ -4466,31 +5105,7 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
   return v != v;
 }

-
-HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater).
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
-// ================================================== SWIZZLE
+// ================================================== SWIZZLE

 // ------------------------------ LowerHalf

@@ -4532,13 +5147,18 @@ HWY_API Vec64<float16_t> LowerHalf(Vec128<float16_t> v) {
   return Vec64<float16_t>(vget_low_f16(v.raw));
 }
 #endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+HWY_API Vec64<bfloat16_t> LowerHalf(Vec128<bfloat16_t> v) {
+  return Vec64<bfloat16_t>(vget_low_bf16(v.raw));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
 #if HWY_HAVE_FLOAT64
 HWY_API Vec64<double> LowerHalf(Vec128<double> v) {
   return Vec64<double>(vget_low_f64(v.raw));
 }
 #endif  // HWY_HAVE_FLOAT64

-template <class V,
+template <class V, HWY_NEON_IF_EMULATED_D(DFromV<V>), HWY_IF_V_SIZE_V(V, 16)>
 HWY_API VFromD<Half<DFromV<V>>> LowerHalf(V v) {
   const Full128<uint16_t> du;
   const Half<DFromV<V>> dh;
@@ -4738,6 +5358,12 @@ HWY_API Vec64<float16_t> UpperHalf(D /* tag */, Vec128<float16_t> v) {
   return Vec64<float16_t>(vget_high_f16(v.raw));
 }
 #endif
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API Vec64<bfloat16_t> UpperHalf(D /* tag */, Vec128<bfloat16_t> v) {
+  return Vec64<bfloat16_t>(vget_high_bf16(v.raw));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
 template <class D, HWY_IF_F32_D(D)>
 HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
   return Vec64<float>(vget_high_f32(v.raw));
@@ -4749,7 +5375,7 @@ HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
 }
 #endif  // HWY_HAVE_FLOAT64

-template <class D,
+template <class D, HWY_NEON_IF_EMULATED_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
 HWY_API VFromD<D> UpperHalf(D dh, VFromD<Twice<D>> v) {
   const RebindToUnsigned<Twice<decltype(dh)>> du;
   const Half<decltype(du)> duh;
@@ -4869,6 +5495,20 @@ HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
 }
 #endif  // HWY_HAVE_FLOAT16

+#if HWY_NEON_HAVE_BFLOAT16
+template <int kLane>
+HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
+  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
+  return Vec128<bfloat16_t>(vdupq_laneq_bf16(v.raw, kLane));
+}
+template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
+          HWY_IF_LANES_GT(N, 1)>
+HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
+  static_assert(0 <= kLane && kLane < N, "Invalid lane");
+  return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
+
 template <int kLane>
 HWY_API Vec128<float> Broadcast(Vec128<float> v) {
   static_assert(0 <= kLane && kLane < 4, "Invalid lane");
@@ -4976,7 +5616,26 @@ HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
   static_assert(0 <= kLane && kLane < 8, "Invalid lane");
   return Vec128<float16_t>(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane)));
 }
+template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
+          HWY_IF_LANES_GT(N, 1)>
+HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
+  static_assert(0 <= kLane && kLane < N, "Invalid lane");
+  return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
+}
 #endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <int kLane>
+HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
+  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
+  return Vec128<bfloat16_t>(vdupq_n_bf16(vgetq_lane_bf16(v.raw, kLane)));
+}
+template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
+          HWY_IF_LANES_GT(N, 1)>
+HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
+  static_assert(0 <= kLane && kLane < N, "Invalid lane");
+  return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
 template <int kLane>
 HWY_API Vec128<float> Broadcast(Vec128<float> v) {
   static_assert(0 <= kLane && kLane < 4, "Invalid lane");
@@ -4991,6 +5650,14 @@ HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {

 #endif  // HWY_ARCH_ARM_A64

+template <int kLane, typename V, HWY_NEON_IF_EMULATED_D(DFromV<V>),
+          HWY_IF_LANES_GT_D(DFromV<V>, 1)>
+HWY_API V Broadcast(V v) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Broadcast<kLane>(BitCast(du, v)));
+}
+
 // ------------------------------ TableLookupLanes

 // Returned by SetTableIndices for use by TableLookupLanes.
@@ -5393,6 +6060,16 @@ HWY_API Vec128<T> InterleaveLower(Vec128<T> a, Vec128<T> b) {
 }
 #endif
 
+#if !HWY_HAVE_FLOAT16
+template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 4)>
+HWY_API Vec128<float16_t, N> InterleaveLower(Vec128<float16_t, N> a,
+                                             Vec128<float16_t, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
+}
+#endif  // !HWY_HAVE_FLOAT16
+
 // < 64 bit parts
 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
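The fallback added above illustrates a pattern used throughout this diff: when float16 arithmetic is not natively available, lane-rearranging ops run on the bit-identical unsigned integer vector and the result is cast back. A generic sketch of the same idiom (not from the package; works for any type whose lane width has an unsigned counterpart):

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Performs InterleaveLower via the unsigned "twin" type, as in the diff above.
// Valid because interleaving only moves bits; it never interprets them.
template <class V>
HWY_ATTR V InterleaveLowerViaUnsigned(V a, V b) {
  const hn::DFromV<V> d;
  const hn::RebindToUnsigned<decltype(d)> du;
  return hn::BitCast(
      d, hn::InterleaveLower(hn::BitCast(du, a), hn::BitCast(du, b)));
}
```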
@@ -5676,158 +6353,656 @@ HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
 
 namespace detail {
 
-template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
-HWY_INLINE V SlideDownLanes(V v, size_t amt) {
-  const DFromV<decltype(v)> d;
-  using TU = UnsignedFromSize<d.MaxBytes()>;
-  const Repartition<TU, decltype(d)> du;
-  return BitCast(d,
-                 BitCast(du, v) << Set(
-                     du, static_cast<TU>(TU{0} - amt * sizeof(TFromV<V>) * 8)));
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE V SlideDownLanes(V v, size_t amt) {
+  const DFromV<decltype(v)> d;
+  using TU = UnsignedFromSize<d.MaxBytes()>;
+  const Repartition<TU, decltype(d)> du;
+  return BitCast(d,
+                 BitCast(du, v) << Set(
+                     du, static_cast<TU>(TU{0} - amt * sizeof(TFromV<V>) * 8)));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE V SlideDownLanes(V v, size_t amt) {
+  const DFromV<decltype(v)> d;
+  const Repartition<int8_t, decltype(d)> di8;
+  auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
+  idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
+  return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
+  return v;
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
+HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
+  if (__builtin_constant_p(amt)) {
+    switch (amt) {
+      case 0:
+        return v;
+      case 1:
+        return ShiftRightLanes<1>(d, v);
+    }
+  }
+#else
+  (void)d;
+#endif
+
+  return detail::SlideDownLanes(v, amt);
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
+HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
+  if (__builtin_constant_p(amt)) {
+    switch (amt) {
+      case 0:
+        return v;
+      case 1:
+        return ShiftRightLanes<1>(d, v);
+      case 2:
+        return ShiftRightLanes<2>(d, v);
+      case 3:
+        return ShiftRightLanes<3>(d, v);
+    }
+  }
+#else
+  (void)d;
+#endif
+
+  return detail::SlideDownLanes(v, amt);
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
+HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
+  if (__builtin_constant_p(amt)) {
+    switch (amt) {
+      case 0:
+        return v;
+      case 1:
+        return ShiftRightLanes<1>(d, v);
+      case 2:
+        return ShiftRightLanes<2>(d, v);
+      case 3:
+        return ShiftRightLanes<3>(d, v);
+      case 4:
+        return ShiftRightLanes<4>(d, v);
+      case 5:
+        return ShiftRightLanes<5>(d, v);
+      case 6:
+        return ShiftRightLanes<6>(d, v);
+      case 7:
+        return ShiftRightLanes<7>(d, v);
+    }
+  }
+#else
+  (void)d;
+#endif
+
+  return detail::SlideDownLanes(v, amt);
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
+HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
+  if (__builtin_constant_p(amt)) {
+    switch (amt) {
+      case 0:
+        return v;
+      case 1:
+        return ShiftRightLanes<1>(d, v);
+      case 2:
+        return ShiftRightLanes<2>(d, v);
+      case 3:
+        return ShiftRightLanes<3>(d, v);
+      case 4:
+        return ShiftRightLanes<4>(d, v);
+      case 5:
+        return ShiftRightLanes<5>(d, v);
+      case 6:
+        return ShiftRightLanes<6>(d, v);
+      case 7:
+        return ShiftRightLanes<7>(d, v);
+      case 8:
+        return ShiftRightLanes<8>(d, v);
+      case 9:
+        return ShiftRightLanes<9>(d, v);
+      case 10:
+        return ShiftRightLanes<10>(d, v);
+      case 11:
+        return ShiftRightLanes<11>(d, v);
+      case 12:
+        return ShiftRightLanes<12>(d, v);
+      case 13:
+        return ShiftRightLanes<13>(d, v);
+      case 14:
+        return ShiftRightLanes<14>(d, v);
+      case 15:
+        return ShiftRightLanes<15>(d, v);
+    }
+  }
+#else
+  (void)d;
+#endif
+
+  return detail::SlideDownLanes(v, amt);
+}
+
+// ------------------------------- WidenHighMulAdd
+
+#ifdef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
+#undef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
+#else
+#define HWY_NATIVE_WIDEN_HIGH_MUL_ADD
+#endif
+
+namespace detail {
+
+template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_GT_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<uint64_t>(vmlal_high_u32(add.raw, mul.raw, x.raw));
+#else
+  const Full64<uint32_t> dh;
+  return Vec128<uint64_t>(
+      vmlal_u32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_LE_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint64_t> mulResult = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
+  return UpperHalf(d, mulResult) + add;
+}
+
+template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_GT_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int64_t>(vmlal_high_s32(add.raw, mul.raw, x.raw));
+#else
+  const Full64<int32_t> dh;
+  return Vec128<int64_t>(
+      vmlal_s32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_LE_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int64_t> mulResult = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
+  return UpperHalf(d, mulResult) + add;
+}
+
+template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_GT_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int32_t>(vmlal_high_s16(add.raw, mul.raw, x.raw));
+#else
+  const Full64<int16_t> dh;
+  return Vec128<int32_t>(
+      vmlal_s16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
+  Vec64<int32_t> hi = UpperHalf(d, widen);
+  return hi + add;
+}
+
+template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
+  Vec32<int32_t> hi = UpperHalf(d, Vec64<int32_t>(vget_high_s32(widen.raw)));
+  return hi + add;
+}
+
+template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_GT_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<uint32_t>(vmlal_high_u16(add.raw, mul.raw, x.raw));
+#else
+  const Full64<uint16_t> dh;
+  return Vec128<uint32_t>(
+      vmlal_u16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
+  VFromD<D> hi = UpperHalf(d, widen);
+  return hi + add;
+}
+
+template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
+         class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
+  VFromD<D> hi = UpperHalf(d, Vec64<uint32_t>(vget_high_u32(widen.raw)));
+  return hi + add;
+}
+
+template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_GT_D(DN, 8)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<uint16_t>(vmlal_high_u8(add.raw, mul.raw, x.raw));
+#else
+  const Full64<uint8_t> dh;
+  return Vec128<uint16_t>(
+      vmlal_u8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_D(DN, 8)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
+  VFromD<D> hi = UpperHalf(d, widen);
+  return hi + add;
+}
+
+template<class D, HWY_IF_U16(TFromD<D>), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_LE_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
+  const Twice<decltype(d)> d16F;
+  VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_u16(widen.raw)));
+  return hi + add;
+}
+
+template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_GT_D(DN, 8)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int16_t>(vmlal_high_s8(add.raw, mul.raw, x.raw));
+#else
+  const Full64<int8_t> dh;
+  return Vec128<int16_t>(
+      vmlal_s8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_D(DN, 8)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
+  VFromD<D> hi = UpperHalf(d, widen);
+  return hi + add;
+}
+
+template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_LE_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
+  const Twice<decltype(d)> d16F;
+  VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_s16(widen.raw)));
+  return hi + add;
+}
+
+#if 0
+#if HWY_HAVE_FLOAT16
+template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 4),
+         class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vfmlalq_high_f16(add.raw, mul.raw, x.raw));
+}
+
+template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 2),
+         class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  return Vec64<float32_t>(vfmlal_high_f16(add.raw, mul.raw, x.raw));
+}
+
+template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
+         class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  return MulAdd(add, PromoteUpperTo(d, mul), PromoteUpperTo(d, x));
+}
+#endif
+#endif
+
+}  // namespace detail
+
+// ------------------------------- WidenMulAdd
+
+#ifdef HWY_NATIVE_WIDEN_MUL_ADD
+#undef HWY_NATIVE_WIDEN_MUL_ADD
+#else
+#define HWY_NATIVE_WIDEN_MUL_ADD
+#endif
+
+namespace detail {
+
+template<class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 4),
+         class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return Vec128<uint16_t>(vmlal_u8(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, 4),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
+                              VFromD<D> add) {
+  return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
+}
+
+template<class D, HWY_IF_I16_D(D), HWY_IF_LANES_GT_D(D, 4),
+         class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vmlal_s8(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_I16_D(D), HWY_IF_LANES_LE_D(D, 4),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
+                              VFromD<D> add) {
+  return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
+}
+
+template<class D, HWY_IF_I32_D(D),
+         class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+         HWY_IF_LANES_GT_D(DN, 2)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return Vec128<int32_t>(vmlal_s16(add.raw, mul.raw, x.raw));
+}
+
+template<class D, HWY_IF_I32_D(D),
+         class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+         HWY_IF_LANES_D(DN, 2)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec128<int32_t> mulRs = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
+  const VFromD<D> mul10 = LowerHalf(mulRs);
+  return add + mul10;
+}
+
+template<class D, HWY_IF_I32_D(D),
+         class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+         HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec64<int32_t> mulRs = LowerHalf(Vec128<int32_t>(vmull_s16(mul.raw, x.raw)));
+  const Vec32<int32_t> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_GT_D(D, 2),
+         class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return Vec128<uint32_t>(vmlal_u16(add.raw, mul.raw, x.raw));
+}
+
+template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 2),
+         class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint32_t> mulRs = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
+  const Vec64<uint32_t> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
+         class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec64<uint32_t> mulRs =
+      LowerHalf(Vec128<uint32_t>(vmull_u16(mul.raw, x.raw)));
+  const Vec32<uint32_t> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+template<class D, HWY_IF_I64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+         HWY_IF_LANES_D(DN, 2)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vmlal_s32(add.raw, mul.raw, x.raw));
+}
+
+template<class D, HWY_IF_I64_D(D), HWY_IF_LANES_D(D, 1),
+         class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec128<int64_t> mulRs = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
+  const VFromD<D> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+         HWY_IF_LANES_D(DN, 2)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vmlal_u32(add.raw, mul.raw, x.raw));
+}
+
+template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+         HWY_IF_LANES_D(DN, 1)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint64_t> mulRs = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
+  const VFromD<D> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+#if 0
+#if HWY_HAVE_FLOAT16
+template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_D(D, 4)>
+HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
+                                 VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vfmlalq_low_f16(add.raw, mul.raw, x.raw));
+}
+
+template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
+         HWY_IF_LANES_D(DN, 4)>
+HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
+                                 VFromD<DN> x, VFromD<D> add) {
+  return Vec64<float32_t>(vfmlal_low_f16(add.raw, mul.raw, x.raw));
 }
 
-template
-
-
-
-
-  idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
-  return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
+template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
+         class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenLowMulAdd(D d, VFromD<DN> mul,
+                                 VFromD<DN> x, VFromD<D> add) {
+  return MulAdd(add, PromoteLowerTo(d, mul), PromoteLowerTo(d, x));
 }
+#endif
+#endif
 
 }  // namespace detail
 
-
-HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
-  return v;
-}
+// ------------------------------ WidenMulAccumulate
 
-
-
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(amt)) {
-    switch (amt) {
-      case 0:
-        return v;
-      case 1:
-        return ShiftRightLanes<1>(d, v);
-    }
-  }
+#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
+#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
 #else
-
+#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
 #endif
 
-
+template<class D, HWY_IF_INTEGER(TFromD<D>), class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
+                                     VFromD<D> low, VFromD<D>& high) {
+  high = detail::WidenHighMulAdd(d, mul, x, high);
+  return detail::WidenMulAdd(d, LowerHalf(mul), LowerHalf(x), low);
 }
 
-
-
-#
-  if (__builtin_constant_p(amt)) {
-    switch (amt) {
-      case 0:
-        return v;
-      case 1:
-        return ShiftRightLanes<1>(d, v);
-      case 2:
-        return ShiftRightLanes<2>(d, v);
-      case 3:
-        return ShiftRightLanes<3>(d, v);
-    }
-  }
+#if 0
+#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
+#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
 #else
-
+#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
 #endif
 
-
+#if HWY_HAVE_FLOAT16
+
+template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
+                                     VFromD<D> low, VFromD<D>& high) {
+  high = detail::WidenHighMulAdd(d, mul, x, high);
+  return detail::WidenLowMulAdd(d, mul, x, low);
 }
 
-
-HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(amt)) {
-    switch (amt) {
-      case 0:
-        return v;
-      case 1:
-        return ShiftRightLanes<1>(d, v);
-      case 2:
-        return ShiftRightLanes<2>(d, v);
-      case 3:
-        return ShiftRightLanes<3>(d, v);
-      case 4:
-        return ShiftRightLanes<4>(d, v);
-      case 5:
-        return ShiftRightLanes<5>(d, v);
-      case 6:
-        return ShiftRightLanes<6>(d, v);
-      case 7:
-        return ShiftRightLanes<7>(d, v);
-    }
-  }
-#else
-  (void)d;
+#endif
 #endif
 
-
-}
+// ------------------------------ SatWidenMulAccumFixedPoint
 
-
-
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(amt)) {
-    switch (amt) {
-      case 0:
-        return v;
-      case 1:
-        return ShiftRightLanes<1>(d, v);
-      case 2:
-        return ShiftRightLanes<2>(d, v);
-      case 3:
-        return ShiftRightLanes<3>(d, v);
-      case 4:
-        return ShiftRightLanes<4>(d, v);
-      case 5:
-        return ShiftRightLanes<5>(d, v);
-      case 6:
-        return ShiftRightLanes<6>(d, v);
-      case 7:
-        return ShiftRightLanes<7>(d, v);
-      case 8:
-        return ShiftRightLanes<8>(d, v);
-      case 9:
-        return ShiftRightLanes<9>(d, v);
-      case 10:
-        return ShiftRightLanes<10>(d, v);
-      case 11:
-        return ShiftRightLanes<11>(d, v);
-      case 12:
-        return ShiftRightLanes<12>(d, v);
-      case 13:
-        return ShiftRightLanes<13>(d, v);
-      case 14:
-        return ShiftRightLanes<14>(d, v);
-      case 15:
-        return ShiftRightLanes<15>(d, v);
-    }
-  }
+#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
 #else
-
+#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
 #endif
 
-
+template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
+HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
+                                                VFromD<Rebind<int16_t, DI32>> a,
+                                                VFromD<Rebind<int16_t, DI32>> b,
+                                                VFromD<DI32> sum) {
+  return VFromD<DI32>(vqdmlal_s16(sum.raw, a.raw, b.raw));
+}
+
+template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
+HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
+                                                VFromD<Rebind<int16_t, DI32>> a,
+                                                VFromD<Rebind<int16_t, DI32>> b,
+                                                VFromD<DI32> sum) {
+  const Full128<TFromD<DI32>> di32_full;
+  const Rebind<int16_t, decltype(di32_full)> di16_full64;
+  return ResizeBitCast(
+      di32, SatWidenMulAccumFixedPoint(di32_full, ResizeBitCast(di16_full64, a),
+                                       ResizeBitCast(di16_full64, b),
+                                       ResizeBitCast(di32_full, sum)));
 }
 
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
 
+#if HWY_NEON_HAVE_F32_TO_BF16C
+
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
+#else
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
+namespace detail {
 #if HWY_NEON_HAVE_BFLOAT16
+// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
+// bfloat16x4_t or bfloat16x8_t.
+static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(bfloat16x4_t raw) {
+  return raw;
+}
+static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(bfloat16x8_t raw) {
+  return raw;
+}
+#else
+// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
+// detail::Vec128<bfloat16_t, N>::type is uint16x4_t or uint16x8_t vector to
+// work around compiler bugs that are there with GCC 13 or earlier or Clang 16
+// or earlier on AArch64.
+
+// The uint16x4_t or uint16x8_t vector neets to be bitcasted to a bfloat16x4_t
+// or a bfloat16x8_t vector for the vbfdot_f32 and vbfdotq_f32 intrinsics if
+// HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true
+static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(uint16x4_t raw) {
+  return vreinterpret_bf16_u16(raw);
+}
+static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(uint16x8_t raw) {
+  return vreinterpretq_bf16_u16(raw);
+}
+#endif
+}  // namespace detail
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API Vec128<float> MulEvenAdd(D /*d32*/, Vec128<bfloat16_t> a,
+                                 Vec128<bfloat16_t> b, const Vec128<float> c) {
+  return Vec128<float>(vbfmlalbq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+                                     detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API Vec128<float> MulOddAdd(D /*d32*/, Vec128<bfloat16_t> a,
+                                Vec128<bfloat16_t> b, const Vec128<float> c) {
+  return Vec128<float>(vbfmlaltq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+                                     detail::BitCastToRawNeonBF16(b.raw)));
+}
 
 template <class D, HWY_IF_V_SIZE_D(D, 16)>
 HWY_API Vec128<float> ReorderWidenMulAccumulate(D /*d32*/, Vec128<bfloat16_t> a,
                                                 Vec128<bfloat16_t> b,
                                                 const Vec128<float> sum0,
                                                 Vec128<float>& /*sum1*/) {
-  return Vec128<float>(vbfdotq_f32(sum0.raw,
+  return Vec128<float>(vbfdotq_f32(sum0.raw,
+                                   detail::BitCastToRawNeonBF16(a.raw),
+                                   detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+// There is no non-q version of these instructions.
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> MulEvenAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
+                             VFromD<Repartition<bfloat16_t, D>> b,
+                             const VFromD<D> c) {
+  const Full128<float> d32f;
+  const Full128<bfloat16_t> d16f;
+  return ResizeBitCast(
+      d32, MulEvenAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
+                      ResizeBitCast(d32f, c)));
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> MulOddAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
+                            VFromD<Repartition<bfloat16_t, D>> b,
+                            const VFromD<D> c) {
+  const Full128<float> d32f;
+  const Full128<bfloat16_t> d16f;
+  return ResizeBitCast(
+      d32, MulOddAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
+                     ResizeBitCast(d32f, c)));
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
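The hunk above introduces several widening multiply-add families. For orientation, `WidenMulAccumulate(d, mul, x, low, high)` multiplies corresponding narrow lanes of `mul` and `x`, adds the products of the lower-half lanes to `low` (returned) and of the upper-half lanes to `high` (updated in place), per the signatures added above. A usage sketch, not from the package, assuming Highway's static-dispatch boilerplate on a 128-bit NEON target:

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// i16*i16 -> i32 accumulation: the return value accumulates products of
// lanes 0..3 into 'low'; 'high' accumulates products of lanes 4..7.
HWY_ATTR hn::Vec128<int32_t> AccumulateWide(hn::Vec128<int16_t> mul,
                                            hn::Vec128<int16_t> x,
                                            hn::Vec128<int32_t> low,
                                            hn::Vec128<int32_t>& high) {
  const hn::Full128<int32_t> d32;
  return hn::WidenMulAccumulate(d32, mul, x, low, high);
}
```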
@@ -5835,28 +7010,11 @@ HWY_API VFromD<D> ReorderWidenMulAccumulate(
     D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
     VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> sum0,
     VFromD<D>& /*sum1*/) {
-  return VFromD<D>(vbfdot_f32(sum0.raw, a.raw,
-
-
-#else
-
-template <class D32, HWY_IF_F32_D(D32),
-          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
-HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
-                                              const VFromD<D32> sum0,
-                                              VFromD<D32>& sum1) {
-  const RebindToUnsigned<decltype(df32)> du32;
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 odd = Set(du32, 0xFFFF0000u);
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
-  const VU32 ao = And(BitCast(du32, a), odd);
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
-  const VU32 bo = And(BitCast(du32, b), odd);
-  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
-  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
+  return VFromD<D>(vbfdot_f32(sum0.raw, detail::BitCastToRawNeonBF16(a.raw),
+                              detail::BitCastToRawNeonBF16(b.raw)));
 }
 
-#endif  //
+#endif  // HWY_NEON_HAVE_F32_TO_BF16C
 
 template <class D, HWY_IF_I32_D(D)>
 HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(D /*d32*/, Vec128<int16_t> a,
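On targets with the BF16 dot-product extension, the overloads above lower to a single `vbfdot(q)_f32`, which sums each adjacent bf16 pair directly into `sum0`; `sum1` is untouched on this path and only exists for API parity with targets that split even/odd products. A hedged usage sketch (not from the package; requires HWY_NEON_HAVE_F32_TO_BF16C):

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Accumulates a bf16 dot product into f32 lanes; combine sum0/sum1 at the end
// with RearrangeToOddPlusEven if the target may have used both accumulators.
HWY_ATTR hn::Vec128<float> DotBF16(hn::Vec128<hwy::bfloat16_t> a,
                                   hn::Vec128<hwy::bfloat16_t> b,
                                   hn::Vec128<float> sum0,
                                   hn::Vec128<float>& sum1) {
  const hn::Full128<float> d32;
  return hn::ReorderWidenMulAccumulate(d32, a, b, sum0, sum1);
}
```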
@@ -6026,37 +7184,34 @@ HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0,
 
 // ------------------------------ WidenMulPairwiseAdd
 
-#if
+#if HWY_NEON_HAVE_F32_TO_BF16C
 
-template <class
-HWY_API Vec128<float> WidenMulPairwiseAdd(
+template <class DF, HWY_IF_V_SIZE_D(DF, 16)>
+HWY_API Vec128<float> WidenMulPairwiseAdd(DF df, Vec128<bfloat16_t> a,
                                           Vec128<bfloat16_t> b) {
-  return Vec128<float>(vbfdotq_f32(Zero(
+  return Vec128<float>(vbfdotq_f32(Zero(df).raw,
+                                   detail::BitCastToRawNeonBF16(a.raw),
+                                   detail::BitCastToRawNeonBF16(b.raw)));
 }
 
-template <class
-HWY_API VFromD<
-
-
-  return VFromD<
+template <class DF, HWY_IF_V_SIZE_LE_D(DF, 8)>
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
+                                       VFromD<Repartition<bfloat16_t, DF>> a,
+                                       VFromD<Repartition<bfloat16_t, DF>> b) {
+  return VFromD<DF>(vbfdot_f32(Zero(df).raw,
+                               detail::BitCastToRawNeonBF16(a.raw),
+                               detail::BitCastToRawNeonBF16(b.raw)));
 }
 
 #else
-template <class
-HWY_API VFromD<
-
-
-
-
-  const VU32 odd = Set(du32, 0xFFFF0000u);
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
-  const VU32 ao = And(BitCast(du32, a), odd);
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
-  const VU32 bo = And(BitCast(du32, b), odd);
-  return MulAdd(BitCast(df32, ae), BitCast(df32, be),
-                Mul(BitCast(df32, ao), BitCast(df32, bo)));
+template <class DF, HWY_IF_F32_D(DF)>
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
+                                       VFromD<Repartition<bfloat16_t, DF>> a,
+                                       VFromD<Repartition<bfloat16_t, DF>> b) {
+  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
+                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
 }
-#endif  //
+#endif  // HWY_NEON_HAVE_F32_TO_BF16C
 
 template <class D, HWY_IF_I32_D(D)>
 HWY_API Vec128<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<int16_t> a,
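`WidenMulPairwiseAdd` multiplies each adjacent pair of narrow lanes and sums the pair into one wide lane (a dot product with accumulator zero, per the `Zero(df)` above). A brief sketch, not from the package, assuming a 128-bit NEON target with bf16 support:

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Each f32 output lane i equals a[2i]*b[2i] + a[2i+1]*b[2i+1].
HWY_ATTR hn::Vec128<float> PairwiseDot(hn::Vec128<hwy::bfloat16_t> a,
                                       hn::Vec128<hwy::bfloat16_t> b) {
  const hn::Full128<float> d32;
  return hn::WidenMulPairwiseAdd(d32, a, b);
}
```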
@@ -6266,6 +7421,23 @@ namespace detail {
 // There is no vuzpq_u64.
 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatEven, vuzp1, _, 2)
 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatOdd, vuzp2, _, 2)
+
+#if !HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_INLINE Vec128<float16_t, N> ConcatEven(Vec128<float16_t, N> hi,
+                                           Vec128<float16_t, N> lo) {
+  const DFromV<decltype(hi)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo)));
+}
+template <size_t N>
+HWY_INLINE Vec128<float16_t, N> ConcatOdd(Vec128<float16_t, N> hi,
+                                          Vec128<float16_t, N> lo) {
+  const DFromV<decltype(hi)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, ConcatOdd(BitCast(du, hi), BitCast(du, lo)));
+}
+#endif  // !HWY_HAVE_FLOAT16
 }  // namespace detail
 
 // Full/half vector
@@ -6374,6 +7546,36 @@ HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   return IfThenElse(MaskFromVec(vec), b, a);
 }
 
+// ------------------------------ InterleaveEven
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+#if HWY_ARCH_ARM_A64
+  return detail::InterleaveEven(a, b);
+#else
+  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[0]);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return InterleaveLower(a, b);
+}
+
+// ------------------------------ InterleaveOdd
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+#if HWY_ARCH_ARM_A64
+  return detail::InterleaveOdd(a, b);
+#else
+  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[1]);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  return InterleaveUpper(d, a, b);
+}
+
 // ------------------------------ OddEvenBlocks
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
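The new `InterleaveEven`/`InterleaveOdd` ops above map to `vtrn1`/`vtrn2` on AArch64. A usage sketch (not from the package; assumes a 128-bit NEON target):

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// With a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}:
//   InterleaveEven -> {a0, b0, a2, b2};  InterleaveOdd -> {a1, b1, a3, b3}.
HWY_ATTR void SplitEvenOdd(hn::Vec128<uint32_t> a, hn::Vec128<uint32_t> b,
                           hn::Vec128<uint32_t>& even,
                           hn::Vec128<uint32_t>& odd) {
  const hn::Full128<uint32_t> d;
  even = hn::InterleaveEven(d, a, b);
  odd = hn::InterleaveOdd(d, a, b);
}
```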
@@ -6395,12 +7597,14 @@ HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
 
 // ------------------------------ ReorderDemote2To (OddEven)
 
-
-
-HWY_API VFromD<D> ReorderDemote2To(D dbf16,
-
-
+#if HWY_NEON_HAVE_F32_TO_BF16C
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
+                                   VFromD<Repartition<float, D>> b) {
+  const Half<decltype(dbf16)> dh_bf16;
+  return Combine(dbf16, DemoteTo(dh_bf16, b), DemoteTo(dh_bf16, a));
 }
+#endif  // HWY_NEON_HAVE_F32_TO_BF16C
 
 template <class D, HWY_IF_I32_D(D)>
 HWY_API Vec128<int32_t> ReorderDemote2To(D d32, Vec128<int64_t> a,
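Per the implementation above, this bf16 overload demotes each f32 input to a half-width bf16 vector and concatenates them (`a` in the lower half, `b` in the upper). A sketch, not from the package, and only available when HWY_NEON_HAVE_F32_TO_BF16C is set:

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Packs two f32x4 vectors into one bf16x8 vector; lanes of 'a' come first.
HWY_ATTR hn::Vec128<hwy::bfloat16_t> PackBF16(hn::Vec128<float> a,
                                              hn::Vec128<float> b) {
  const hn::Full128<hwy::bfloat16_t> dbf16;
  return hn::ReorderDemote2To(dbf16, a, b);
}
```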
@@ -6616,16 +7820,19 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
   return ReorderDemote2To(d, a, b);
 }
 
-
-
+#if HWY_NEON_HAVE_F32_TO_BF16C
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
+                                   VFromD<Repartition<float, D>> b) {
   return ReorderDemote2To(dbf16, a, b);
 }
+#endif  // HWY_NEON_HAVE_F32_TO_BF16C
 
 // ================================================== CRYPTO
 
 // (aarch64 or Arm7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH).
 // Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*.
-#if HWY_TARGET
+#if HWY_TARGET != HWY_NEON_WITHOUT_AES
 
 #ifdef HWY_NATIVE_AES
 #undef HWY_NATIVE_AES
@@ -6676,7 +7883,7 @@ HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
       (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
 }
 
-#endif  // HWY_TARGET
+#endif  // HWY_TARGET != HWY_NEON_WITHOUT_AES
 
 // ================================================== MISC
 
@@ -6851,10 +8058,11 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
       vget_low_u64(vmull_u32(a_packed, b_packed)));
 }
 
-
-
-
-
+template <class T, HWY_IF_UI64(T)>
+HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
+  T hi;
+  T lo = Mul128(GetLane(a), GetLane(b), &hi);
+  return Dup128VecFromValues(Full128<T>(), lo, hi);
 }
 
 // Multiplies odd lanes (1, 3 ..) and places the double-wide result into
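The 64-bit `MulEven` above computes the full 128-bit product of lane 0 using the scalar `Mul128` helper and returns it as a two-lane vector: lane 0 holds the low 64 bits, lane 1 the high 64 bits. A small usage sketch, not from the package:

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Full 64x64 -> 128-bit product; e.g. for a = b = 2^40, lane 0 is 0 and
// lane 1 is 2^(80-64) = 65536.
HWY_ATTR hn::Vec128<uint64_t> WideMul(uint64_t a, uint64_t b) {
  const hn::Full128<uint64_t> d;
  return hn::MulEven(hn::Set(d, a), hn::Set(d, b));
}
```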
@@ -6957,10 +8165,11 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
       vget_low_u64(vmull_u32(a_packed, b_packed)));
 }
 
-
-
-
-
+template <class T, HWY_IF_UI64(T)>
+HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
+  T hi;
+  T lo = Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi);
+  return Dup128VecFromValues(Full128<T>(), lo, hi);
 }
 
 // ------------------------------ TableLookupBytes (Combine, LowerHalf)
@@ -7025,7 +8234,7 @@ HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
 
 // ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes)
 
-#if HWY_TARGET
+#if HWY_TARGET != HWY_NEON_WITHOUT_AES
 template <uint8_t kRcon>
 HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
   alignas(16) static constexpr uint8_t kRconXorMask[16] = {
@@ -7038,51 +8247,26 @@ HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
   const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask));
   return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle));
 }
-#endif  // HWY_TARGET
+#endif  // HWY_TARGET != HWY_NEON_WITHOUT_AES
 
 // ------------------------------ Scatter in generic_ops-inl.h
 // ------------------------------ Gather in generic_ops-inl.h
 
 // ------------------------------ Reductions
 
-
-
-// N=1 for any T: no-op
-template <typename T>
-HWY_INLINE T ReduceMin(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
-  return GetLane(v);
-}
-template <typename T>
-HWY_INLINE T ReduceMax(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
-  return GetLane(v);
-}
-template <typename T>
-HWY_INLINE T ReduceSum(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
-  return GetLane(v);
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   Vec128<T, 1> v) {
-  return v;
-}
-
-// full vectors
+// On Armv8 we define ReduceSum and generic_ops defines SumOfLanes via Set.
 #if HWY_ARCH_ARM_A64
 
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+
 // TODO(janwas): use normal HWY_NEON_DEF, then FULL type list.
 #define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \
-
-
+  template <class D, HWY_IF_LANES_D(D, size)> \
+  HWY_API type##_t name(D /* tag */, Vec128<type##_t, size> v) { \
     return HWY_NEON_EVAL(prefix##infix##suffix, v.raw); \
   }
 
@@ -7125,83 +8309,110 @@ HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv)
 HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv)
 HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv)
 
+// Emulate missing UI64 and partial N=2.
+template <class D, HWY_IF_LANES_D(D, 2),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
+HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v10) {
+  return GetLane(v10) + ExtractLane(v10, 1);
+}
+
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
+HWY_API TFromD<D> ReduceMin(D /* tag */, VFromD<D> v10) {
+  return HWY_MIN(GetLane(v10), ExtractLane(v10, 1));
+}
+
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
+HWY_API TFromD<D> ReduceMax(D /* tag */, VFromD<D> v10) {
+  return HWY_MAX(GetLane(v10), ExtractLane(v10, 1));
+}
+
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
+HWY_API float16_t ReduceMin(D d, VFromD<D> v10) {
+  return GetLane(Min(v10, Reverse2(d, v10)));
+}
+
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
+HWY_API float16_t ReduceMax(D d, VFromD<D> v10) {
+  return GetLane(Max(v10, Reverse2(d, v10)));
+}
+
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 8)>
+HWY_API float16_t ReduceSum(D /* tag */, VFromD<D> v) {
   const float16x4_t x2 = vpadd_f16(v.raw, v.raw);
-  return GetLane(
+  return GetLane(VFromD<D>(vpadd_f16(x2, x2)));
 }
-
-
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API float16_t ReduceSum(D d, VFromD<D> v) {
+  const Half<decltype(d)> dh;
+  return ReduceSum(dh, LowerHalf(dh, VFromD<D>(vpaddq_f16(v.raw, v.raw))));
 }
-#endif
+#endif  // HWY_HAVE_FLOAT16
 
 #undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
 #undef HWY_NEON_DEF_REDUCTION_F16
 #undef HWY_NEON_DEF_REDUCTION_UI64
 #undef HWY_NEON_DEF_REDUCTION
 
-//
-#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2)
-#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, (1 << 8) | (1 << 2))
+// ------------------------------ SumOfLanes
 
-
-
-
-  return Set(DFromV<decltype(v)>(), ReduceMin(tag, v));
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceSum(d, v));
 }
-template <
-HWY_API
-  return Set(
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMin(d, v));
 }
-template <
-HWY_API
-  return Set(
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMax(d, v));
 }
 
-
+// On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane.
+#else  // !HWY_ARCH_ARM_A64
+
+// Armv7 lacks N=2 and 8-bit x4, so enable generic versions of those.
+#undef HWY_IF_SUM_OF_LANES_D
+#define HWY_IF_SUM_OF_LANES_D(D) \
+  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
+                (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
+      nullptr
+#undef HWY_IF_MINMAX_OF_LANES_D
+#define HWY_IF_MINMAX_OF_LANES_D(D) \
+  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
+                (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
+      nullptr
 
 // For arm7, we implement reductions using a series of pairwise operations. This
 // produces the full vector result, so we express Reduce* in terms of *OfLanes.
 #define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
-#define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128<type##_t, size>
 #define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
-
-
+  template <class D, HWY_IF_LANES_D(D, size)> \
+  HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
+                                               Vec128<type##_t, size> v) { \
     HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
     if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
     if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
-    return
-  } \
-  HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)> tag, \
-                                Vec128<type##_t, size> v) { \
-    return GetLane(name##OfLanes(tag, v)); \
+    return Vec128<type##_t, size>(tmp); \
   }
 
 // For the wide versions, the pairwise operations produce a half-length vector.
-// We produce that
-// and *OfLanes in terms of the helper.
+// We produce that `tmp` and then Combine.
 #define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
                                              suffix) \
-
-
+  template <class D, HWY_IF_LANES_D(D, size)> \
+  HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
+                                               Vec128<type##_t, size> v) { \
    HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
    tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
                            vget_low_##suffix(v.raw)); \
    if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
-    return tmp; \
-  } \
-  HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)>, \
-                                Vec128<type##_t, size> v) { \
-    const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \
-    return HWY_NEON_EVAL(vget_lane_##suffix, tmp, 0); \
-  } \
-  HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes( \
-      hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) { \
-    const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \
-    return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \
-        type, size)(vcombine_##suffix(tmp, tmp)); \
+    return Vec128<type##_t, size>(vcombine_##suffix(tmp, tmp)); \
   }
 
 #define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
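The new N=2 scalar reductions above avoid the pairwise-instruction path entirely: `ReduceSum` is literally `GetLane(v10) + ExtractLane(v10, 1)`. A tiny usage sketch, not from the package:

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Sums a two-lane u32 vector loaded from memory.
HWY_ATTR uint32_t SumPair(const uint32_t* HWY_RESTRICT p) {
  const hn::FixedTag<uint32_t, 2> d;
  return hn::ReduceSum(d, hn::LoadU(d, p));
}
```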
@@ -7227,56 +8438,22 @@ HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax)
 #undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
 #undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
 #undef HWY_NEON_DEF_PAIRWISE_REDUCTION
-#undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION
 #undef HWY_NEON_BUILD_TYPE_T

-//
-
-#
-
+// GetLane(SumsOf4(v)) is more efficient on ArmV7 NEON than the default
+// N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h
+#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+#undef HWY_NATIVE_REDUCE_SUM_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_SUM_4_UI8
 #endif

-
-
-
-template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
-HWY_API Vec128<T, 2> SumOfLanes(D /* tag */, Vec128<T, 2> v10) {
-  return v10 + Reverse2(Simd<T, 2, 0>(), v10);
-}
-
-template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
-HWY_API T ReduceSum(D d, Vec128<T, 2> v10) {
-  return GetLane(SumOfLanes(d, v10));
-}
-
-template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
-HWY_API Vec128<T, 2> MinOfLanes(D /* tag */, Vec128<T, 2> v10) {
-  return Min(v10, Reverse2(Simd<T, 2, 0>(), v10));
-}
-template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
-HWY_API Vec128<T, 2> MaxOfLanes(D /* tag */, Vec128<T, 2> v10) {
-  return Max(v10, Reverse2(Simd<T, 2, 0>(), v10));
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
+HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
+  return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
 }

-#
-#undef HWY_IF_MINMAX_REDUCTION
-
-template <class D>
-HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
-  return detail::SumOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
-}
-template <class D>
-HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
-  return detail::ReduceSum(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
-}
-template <class D>
-HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
-  return detail::MinOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
-}
-template <class D>
-HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
-  return detail::MaxOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
-}
+#endif  // HWY_ARCH_ARM_A64

 // ------------------------------ LoadMaskBits (TestBit)

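The new overload makes `ReduceSum` of a 4-lane U8/I8 vector go through `SumsOf4`, whose lane already holds the byte sum, so a single `GetLane` finishes the reduction; per the added comment, this beats the generic path on ArmV7 NEON. A hedged caller-side sketch, assuming the public Highway API (the sum wraps modulo 256 because the result type is `uint8_t`):

```cpp
// Sketch of the N=4 U8 ReduceSum case added above; assumes hwy/highway.h.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

uint8_t SumFourBytes(const uint8_t* HWY_RESTRICT p) {
  const hn::FixedTag<uint8_t, 4> d;  // exactly the N=4 U8 shape
  const auto v = hn::LoadU(d, p);
  return hn::ReduceSum(d, v);  // on this target: GetLane(SumsOf4(v))
}
```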
@@ -7345,6 +8522,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
   return detail::LoadMaskBits(d, mask_bits);
 }

+// ------------------------------ Dup128MaskFromMaskBits
+
+template <class D>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= (1u << kN) - 1;
+  return detail::LoadMaskBits(d, mask_bits);
+}
+
 // ------------------------------ Mask

 namespace detail {
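The newly added `Dup128MaskFromMaskBits` turns an integer bit pattern into a lane mask (bit i selects lane i within each 128-bit block), first clearing bits beyond the lane count. A hedged usage sketch against the public Highway API:

```cpp
// Sketch: keep only lanes 0 and 2 of a 4 x i32 vector via a bit-pattern mask.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int32_t SumEvenLanes(const int32_t* HWY_RESTRICT p) {
  const hn::FixedTag<int32_t, 4> d;
  const auto m = hn::Dup128MaskFromMaskBits(d, 0b0101u);  // lanes 0 and 2
  const auto v = hn::IfThenElseZero(m, hn::LoadU(d, p));  // zero lanes 1, 3
  return hn::ReduceSum(d, v);
}
```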
@@ -7674,7 +8860,7 @@ namespace detail {
 template <class D, HWY_IF_V_SIZE_D(D, 16)>
 HWY_INLINE Vec128<uint8_t> Load8Bytes(D /*tag*/, const uint8_t* bytes) {
   return Vec128<uint8_t>(vreinterpretq_u8_u64(
-      vld1q_dup_u64(
+      vld1q_dup_u64(HWY_RCAST_ALIGNED(const uint64_t*, bytes))));
 }

 // Load 8 bytes and return half-reg with N <= 8 bytes.
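`Load8Bytes` broadcasts a single 64-bit load into both halves of a q-register; the change only routes the pointer conversion through the `HWY_RCAST_ALIGNED` helper rather than a bare cast. For exposition only, a hedged plain-NEON equivalent of the operation being wrapped (not Highway's internal code):

```cpp
// Illustrative sketch with raw NEON intrinsics; memcpy sidesteps the
// aliasing cast that the header's helper macro encapsulates.
#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

uint8x16_t Dup8Bytes(const uint8_t* bytes) {
  uint64_t bits;
  memcpy(&bits, bytes, sizeof(bits));              // load 8 bytes
  return vreinterpretq_u8_u64(vdupq_n_u64(bits));  // broadcast to both halves
}
```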
@@ -8287,9 +9473,8 @@ HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
 template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1) {
-  auto raw = detail::LoadInterleaved2(
-
-      detail::Tuple2<T, d.MaxLanes()>());
+  auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(unaligned),
+                                      detail::Tuple2<T, d.MaxLanes()>());
   v0 = VFromD<D>(raw.val[0]);
   v1 = VFromD<D>(raw.val[1]);
 }
@@ -8301,9 +9486,8 @@ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
   // The smallest vector registers are 64-bits and we want space for two.
   alignas(16) T buf[2 * 8 / sizeof(T)] = {};
   CopyBytes<d.MaxBytes() * 2>(unaligned, buf);
-  auto raw = detail::LoadInterleaved2(
-
-      detail::Tuple2<T, d.MaxLanes()>());
+  auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(buf),
+                                      detail::Tuple2<T, d.MaxLanes()>());
   v0 = VFromD<D>(raw.val[0]);
   v1 = VFromD<D>(raw.val[1]);
 }
@@ -8315,12 +9499,8 @@ HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
                               Vec128<T>& v1) {
   const Half<decltype(d)> dh;
   VFromD<decltype(dh)> v00, v10, v01, v11;
-  LoadInterleaved2(
-
-      v10);
-  LoadInterleaved2(
-      dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 2),
-      v01, v11);
+  LoadInterleaved2(dh, detail::NativeLanePointer(unaligned), v00, v10);
+  LoadInterleaved2(dh, detail::NativeLanePointer(unaligned + 2), v01, v11);
   v0 = Combine(d, v01, v00);
   v1 = Combine(d, v11, v10);
 }
@@ -8331,9 +9511,8 @@ HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
 template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
-  auto raw = detail::LoadInterleaved3(
-
-      detail::Tuple3<T, d.MaxLanes()>());
+  auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(unaligned),
+                                      detail::Tuple3<T, d.MaxLanes()>());
   v0 = VFromD<D>(raw.val[0]);
   v1 = VFromD<D>(raw.val[1]);
   v2 = VFromD<D>(raw.val[2]);
@@ -8346,9 +9525,8 @@ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
   // The smallest vector registers are 64-bits and we want space for three.
   alignas(16) T buf[3 * 8 / sizeof(T)] = {};
   CopyBytes<d.MaxBytes() * 3>(unaligned, buf);
-  auto raw = detail::LoadInterleaved3(
-
-      detail::Tuple3<T, d.MaxLanes()>());
+  auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(buf),
+                                      detail::Tuple3<T, d.MaxLanes()>());
   v0 = VFromD<D>(raw.val[0]);
   v1 = VFromD<D>(raw.val[1]);
   v2 = VFromD<D>(raw.val[2]);
@@ -8361,12 +9539,8 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                               Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
   const Half<decltype(d)> dh;
   VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
-  LoadInterleaved3(
-
-      v10, v20);
-  LoadInterleaved3(
-      dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 3),
-      v01, v11, v21);
+  LoadInterleaved3(dh, detail::NativeLanePointer(unaligned), v00, v10, v20);
+  LoadInterleaved3(dh, detail::NativeLanePointer(unaligned + 3), v01, v11, v21);
   v0 = Combine(d, v01, v00);
   v1 = Combine(d, v11, v10);
   v2 = Combine(d, v21, v20);
@@ -8379,9 +9553,8 @@ template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                               VFromD<D>& v3) {
-  auto raw = detail::LoadInterleaved4(
-
-      detail::Tuple4<T, d.MaxLanes()>());
+  auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(unaligned),
+                                      detail::Tuple4<T, d.MaxLanes()>());
   v0 = VFromD<D>(raw.val[0]);
   v1 = VFromD<D>(raw.val[1]);
   v2 = VFromD<D>(raw.val[2]);
@@ -8395,9 +9568,8 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                               VFromD<D>& v3) {
   alignas(16) T buf[4 * 8 / sizeof(T)] = {};
   CopyBytes<d.MaxBytes() * 4>(unaligned, buf);
-  auto raw = detail::LoadInterleaved4(
-
-      detail::Tuple4<T, d.MaxLanes()>());
+  auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(buf),
+                                      detail::Tuple4<T, d.MaxLanes()>());
   v0 = VFromD<D>(raw.val[0]);
   v1 = VFromD<D>(raw.val[1]);
   v2 = VFromD<D>(raw.val[2]);
@@ -8412,12 +9584,10 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                               Vec128<T>& v3) {
   const Half<decltype(d)> dh;
   VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
-  LoadInterleaved4(
-
-
-
-      dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 4),
-      v01, v11, v21, v31);
+  LoadInterleaved4(dh, detail::NativeLanePointer(unaligned), v00, v10, v20,
+                   v30);
+  LoadInterleaved4(dh, detail::NativeLanePointer(unaligned + 4), v01, v11, v21,
+                   v31);
   v0 = Combine(d, v01, v00);
   v1 = Combine(d, v11, v10);
   v2 = Combine(d, v21, v20);
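All of the `LoadInterleaved{2,3,4}` wrappers above now obtain the intrinsic's pointer type via `detail::NativeLanePointer` instead of spelling out `reinterpret_cast<detail::NativeLaneType<T>*>` at every call site; the public signatures are unchanged. A hedged caller-side sketch of `LoadInterleaved3`, assuming only the public Highway API:

```cpp
// Deinterleave packed RGB bytes into three planar vectors (vld3 on NEON).
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void SplitRgb(const uint8_t* HWY_RESTRICT rgb, uint8_t* HWY_RESTRICT r_out) {
  const hn::FixedTag<uint8_t, 16> d;
  hn::VFromD<decltype(d)> r, g, b;
  hn::LoadInterleaved3(d, rgb, r, g, b);  // reads 48 bytes: r0 g0 b0 r1 ...
  hn::StoreU(r, d, r_out);                // the 16 red samples, contiguous
}
```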
@@ -8476,8 +9646,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
-  detail::StoreInterleaved2(
-      tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
+  detail::StoreInterleaved2(tup, detail::NativeLanePointer(unaligned));
 }

 // <= 32 bits: avoid writing more than N bytes by copying to buffer
@@ -8486,8 +9655,7 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   alignas(16) T buf[2 * 8 / sizeof(T)];
   detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
-  detail::StoreInterleaved2(tup,
-                            reinterpret_cast<detail::NativeLaneType<T>*>(buf));
+  detail::StoreInterleaved2(tup, detail::NativeLanePointer(buf));
   CopyBytes<d.MaxBytes() * 2>(buf, unaligned);
 }

@@ -8498,10 +9666,9 @@ HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   const Half<decltype(d)> dh;
   StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh,
-
-  StoreInterleaved2(
-
-      reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 2));
+                    detail::NativeLanePointer(unaligned));
+  StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh,
+                    detail::NativeLanePointer(unaligned + 2));
 }
 #endif // HWY_ARCH_ARM_V7

@@ -8511,8 +9678,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                                T* HWY_RESTRICT unaligned) {
   detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
-  detail::StoreInterleaved3(
-      tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
+  detail::StoreInterleaved3(tup, detail::NativeLanePointer(unaligned));
 }

 // <= 32 bits: avoid writing more than N bytes by copying to buffer
@@ -8521,8 +9687,7 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                                T* HWY_RESTRICT unaligned) {
   alignas(16) T buf[3 * 8 / sizeof(T)];
   detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
-  detail::StoreInterleaved3(tup,
-                            reinterpret_cast<detail::NativeLaneType<T>*>(buf));
+  detail::StoreInterleaved3(tup, detail::NativeLanePointer(buf));
   CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
 }

@@ -8533,10 +9698,9 @@ HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
                                T* HWY_RESTRICT unaligned) {
   const Half<decltype(d)> dh;
   StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
-
-  StoreInterleaved3(
-
-      reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 3));
+                    detail::NativeLanePointer(unaligned));
+  StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
+                    detail::NativeLanePointer(unaligned + 3));
 }
 #endif // HWY_ARCH_ARM_V7

@@ -8546,8 +9710,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                                VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
   detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
-  detail::StoreInterleaved4(
-      tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
+  detail::StoreInterleaved4(tup, detail::NativeLanePointer(unaligned));
 }

 // <= 32 bits: avoid writing more than N bytes by copying to buffer
@@ -8556,8 +9719,7 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                                VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
   alignas(16) T buf[4 * 8 / sizeof(T)];
   detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
-  detail::StoreInterleaved4(tup,
-                            reinterpret_cast<detail::NativeLaneType<T>*>(buf));
+  detail::StoreInterleaved4(tup, detail::NativeLanePointer(buf));
   CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
 }

@@ -8569,11 +9731,10 @@ HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
   const Half<decltype(d)> dh;
   StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
                     LowerHalf(dh, v3), dh,
-
-  StoreInterleaved4(
-
-
-      reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 4));
+                    detail::NativeLanePointer(unaligned));
+  StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
+                    UpperHalf(dh, v3), dh,
+                    detail::NativeLanePointer(unaligned + 4));
 }
 #endif // HWY_ARCH_ARM_V7

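The `StoreInterleaved{2,3,4}` wrappers receive the same `NativeLanePointer` cleanup on the store side. The inverse of the earlier load sketch, again assuming only the public Highway API:

```cpp
// Re-interleave three planar channels into r0 g0 b0 r1 g1 b1 ... (vst3).
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void PackRgb(const uint8_t* HWY_RESTRICT r, const uint8_t* HWY_RESTRICT g,
             const uint8_t* HWY_RESTRICT b, uint8_t* HWY_RESTRICT rgb) {
  const hn::FixedTag<uint8_t, 16> d;
  hn::StoreInterleaved3(hn::LoadU(d, r), hn::LoadU(d, g), hn::LoadU(d, b), d,
                        rgb);  // writes 48 interleaved bytes
}
```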
@@ -8904,7 +10065,7 @@ namespace detail { // for code folding
 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
 #undef HWY_NEON_DEF_FUNCTION_UINTS
 #undef HWY_NEON_EVAL
-
+#undef HWY_NEON_IF_EMULATED_D
 } // namespace detail

 // NOLINTNEXTLINE(google-readability-namespace-comments)