@img/sharp-libvips-dev 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/aom/aom_encoder.h +3 -3
- package/include/aom/aomcx.h +17 -8
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/fontconfig/fontconfig.h +5 -3
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
- package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
- package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
- package/include/glib-2.0/gio/gappinfo.h +0 -7
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
- package/include/glib-2.0/gio/gasyncinitable.h +0 -7
- package/include/glib-2.0/gio/gasyncresult.h +0 -6
- package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
- package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
- package/include/glib-2.0/gio/gbytesicon.h +0 -5
- package/include/glib-2.0/gio/gcancellable.h +0 -5
- package/include/glib-2.0/gio/gconverter.h +0 -7
- package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
- package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
- package/include/glib-2.0/gio/gdatagrambased.h +0 -7
- package/include/glib-2.0/gio/gdatainputstream.h +0 -6
- package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
- package/include/glib-2.0/gio/gdbusinterface.h +0 -8
- package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusmessage.h +2 -1
- package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusproxy.h +0 -8
- package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
- package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
- package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gemblem.h +0 -5
- package/include/glib-2.0/gio/gemblemedicon.h +0 -5
- package/include/glib-2.0/gio/gfile.h +0 -10
- package/include/glib-2.0/gio/gfileenumerator.h +0 -5
- package/include/glib-2.0/gio/gfileicon.h +0 -5
- package/include/glib-2.0/gio/gfileinfo.h +0 -5
- package/include/glib-2.0/gio/gfileinputstream.h +0 -8
- package/include/glib-2.0/gio/gfileiostream.h +0 -8
- package/include/glib-2.0/gio/gfilemonitor.h +0 -5
- package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
- package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
- package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
- package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
- package/include/glib-2.0/gio/gicon.h +0 -5
- package/include/glib-2.0/gio/ginitable.h +0 -7
- package/include/glib-2.0/gio/ginputstream.h +0 -5
- package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gioenums.h +6 -1
- package/include/glib-2.0/gio/giomodule.h +0 -5
- package/include/glib-2.0/gio/giostream.h +0 -5
- package/include/glib-2.0/gio/giotypes.h +5 -108
- package/include/glib-2.0/gio/gloadableicon.h +0 -6
- package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
- package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
- package/include/glib-2.0/gio/gmountoperation.h +0 -6
- package/include/glib-2.0/gio/gnetworking.h +4 -0
- package/include/glib-2.0/gio/goutputstream.h +0 -9
- package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
- package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
- package/include/glib-2.0/gio/gproxy.h +0 -7
- package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
- package/include/glib-2.0/gio/gseekable.h +0 -5
- package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
- package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
- package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
- package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
- package/include/glib-2.0/gio/gsocket.h +13 -0
- package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
- package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
- package/include/glib-2.0/gio/gtask.h +12 -0
- package/include/glib-2.0/gio/gthemedicon.h +0 -5
- package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
- package/include/glib-2.0/gio/gvfs.h +0 -5
- package/include/glib-2.0/gio/gvolume.h +2 -2
- package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
- package/include/glib-2.0/girepository/gi-visibility.h +986 -0
- package/include/glib-2.0/girepository/giarginfo.h +100 -0
- package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
- package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
- package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
- package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
- package/include/glib-2.0/girepository/gienuminfo.h +82 -0
- package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
- package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
- package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
- package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +247 -0
- package/include/glib-2.0/girepository/girffi.h +129 -0
- package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
- package/include/glib-2.0/girepository/gistructinfo.h +102 -0
- package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
- package/include/glib-2.0/girepository/gitypelib.h +61 -0
- package/include/glib-2.0/girepository/gitypes.h +421 -0
- package/include/glib-2.0/girepository/giunioninfo.h +105 -0
- package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
- package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
- package/include/glib-2.0/glib/deprecated/grel.h +0 -23
- package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
- package/include/glib-2.0/glib/gatomic.h +20 -20
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
- package/include/glib-2.0/glib/gchecksum.h +0 -10
- package/include/glib-2.0/glib/gdate.h +0 -9
- package/include/glib-2.0/glib/gdatetime.h +33 -1
- package/include/glib-2.0/glib/gdir.h +5 -0
- package/include/glib-2.0/glib/ghmac.h +0 -9
- package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +1 -0
- package/include/glib-2.0/glib/gmessages.h +11 -0
- package/include/glib-2.0/glib/gpathbuf.h +0 -7
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstdio.h +1 -1
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
- package/include/glib-2.0/glib/gtestutils.h +5 -0
- package/include/glib-2.0/glib/gthread.h +216 -3
- package/include/glib-2.0/glib/gunicode.h +12 -2
- package/include/glib-2.0/glib/gvarianttype.h +1 -10
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib/gwin32.h +4 -4
- package/include/glib-2.0/glib-unix.h +214 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gbinding.h +0 -8
- package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
- package/include/glib-2.0/gobject/gclosure.h +1 -9
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +44 -0
- package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject.h +1 -16
- package/include/glib-2.0/gobject/gparam.h +3 -12
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
- package/include/glib-2.0/gobject/gtype.h +53 -20
- package/include/glib-2.0/gobject/gtypemodule.h +0 -7
- package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
- package/include/glib-2.0/gobject/gvaluearray.h +0 -7
- package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
- package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/lcms2.h +46 -7
- package/include/lcms2_plugin.h +4 -4
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/HTMLparser.h +23 -0
- package/include/libxml2/libxml/SAX.h +0 -2
- package/include/libxml2/libxml/SAX2.h +0 -2
- package/include/libxml2/libxml/c14n.h +0 -2
- package/include/libxml2/libxml/dict.h +1 -0
- package/include/libxml2/libxml/encoding.h +16 -14
- package/include/libxml2/libxml/entities.h +4 -0
- package/include/libxml2/libxml/globals.h +15 -503
- package/include/libxml2/libxml/hash.h +57 -61
- package/include/libxml2/libxml/nanoftp.h +2 -2
- package/include/libxml2/libxml/parser.h +137 -18
- package/include/libxml2/libxml/parserInternals.h +1 -0
- package/include/libxml2/libxml/relaxng.h +2 -1
- package/include/libxml2/libxml/schemasInternals.h +1 -0
- package/include/libxml2/libxml/schematron.h +1 -0
- package/include/libxml2/libxml/threads.h +4 -11
- package/include/libxml2/libxml/tree.h +68 -20
- package/include/libxml2/libxml/uri.h +2 -1
- package/include/libxml2/libxml/valid.h +2 -0
- package/include/libxml2/libxml/xmlIO.h +65 -13
- package/include/libxml2/libxml/xmlerror.h +37 -8
- package/include/libxml2/libxml/xmlmemory.h +37 -40
- package/include/libxml2/libxml/xmlreader.h +6 -0
- package/include/libxml2/libxml/xmlregexp.h +2 -9
- package/include/libxml2/libxml/xmlsave.h +9 -0
- package/include/libxml2/libxml/xmlschemas.h +3 -0
- package/include/libxml2/libxml/xmlversion.h +28 -43
- package/include/libxml2/libxml/xpath.h +1 -1
- package/include/libxml2/libxml/xpathInternals.h +2 -1
- package/include/libxml2/libxml/xpointer.h +5 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +3 -3
- package/include/pixman-1/pixman.h +9 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/include/zconf.h +3 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +15 -15
|
@@ -143,7 +143,8 @@ namespace detail { // for code folding and Raw128
|
|
|
143
143
|
HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
|
|
144
144
|
HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
|
|
145
145
|
|
|
146
|
-
#
|
|
146
|
+
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
|
|
147
|
+
(HWY_COMPILER_GCC_ACTUAL >= 1300 || HWY_COMPILER_CLANG >= 1100)
|
|
147
148
|
#define HWY_NEON_HAVE_BFLOAT16 1
|
|
148
149
|
#else
|
|
149
150
|
#define HWY_NEON_HAVE_BFLOAT16 0
|
|
@@ -160,7 +161,7 @@ namespace detail { // for code folding and Raw128
|
|
|
160
161
|
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
|
|
161
162
|
#endif
|
|
162
163
|
|
|
163
|
-
// Used for conversion instructions if
|
|
164
|
+
// Used for conversion instructions if HWY_NEON_HAVE_F16C.
|
|
164
165
|
#define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
|
|
165
166
|
args) \
|
|
166
167
|
HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args) \
|
|
@@ -176,6 +177,19 @@ namespace detail { // for code folding and Raw128
|
|
|
176
177
|
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
|
|
177
178
|
#endif
|
|
178
179
|
|
|
180
|
+
// Enable generic functions for whichever of (f16, bf16) are not supported.
|
|
181
|
+
#if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
|
|
182
|
+
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
|
|
183
|
+
#elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
|
|
184
|
+
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
|
|
185
|
+
#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
|
|
186
|
+
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
|
|
187
|
+
#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
|
|
188
|
+
#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<false>* = nullptr
|
|
189
|
+
#else
|
|
190
|
+
#error "Logic error, handled all four cases"
|
|
191
|
+
#endif
|
|
192
|
+
|
|
179
193
|
// float
|
|
180
194
|
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
|
|
181
195
|
HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
|
|
@@ -397,39 +411,6 @@ struct Tuple2<int64_t, N> {
|
|
|
397
411
|
int64x1x2_t raw;
|
|
398
412
|
};
|
|
399
413
|
|
|
400
|
-
template <>
|
|
401
|
-
struct Tuple2<float16_t, 8> {
|
|
402
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
403
|
-
float16x8x2_t raw;
|
|
404
|
-
#else
|
|
405
|
-
uint16x8x2_t raw;
|
|
406
|
-
#endif
|
|
407
|
-
};
|
|
408
|
-
template <size_t N>
|
|
409
|
-
struct Tuple2<float16_t, N> {
|
|
410
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
411
|
-
float16x4x2_t raw;
|
|
412
|
-
#else
|
|
413
|
-
uint16x4x2_t raw;
|
|
414
|
-
#endif
|
|
415
|
-
};
|
|
416
|
-
template <>
|
|
417
|
-
struct Tuple2<bfloat16_t, 8> {
|
|
418
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
419
|
-
bfloat16x8x2_t raw;
|
|
420
|
-
#else
|
|
421
|
-
uint16x8x2_t raw;
|
|
422
|
-
#endif
|
|
423
|
-
};
|
|
424
|
-
template <size_t N>
|
|
425
|
-
struct Tuple2<bfloat16_t, N> {
|
|
426
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
427
|
-
bfloat16x4x2_t raw;
|
|
428
|
-
#else
|
|
429
|
-
uint16x4x2_t raw;
|
|
430
|
-
#endif
|
|
431
|
-
};
|
|
432
|
-
|
|
433
414
|
template <>
|
|
434
415
|
struct Tuple2<float32_t, 4> {
|
|
435
416
|
float32x4x2_t raw;
|
|
@@ -514,39 +495,6 @@ struct Tuple3<int64_t, N> {
|
|
|
514
495
|
int64x1x3_t raw;
|
|
515
496
|
};
|
|
516
497
|
|
|
517
|
-
template <>
|
|
518
|
-
struct Tuple3<float16_t, 8> {
|
|
519
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
520
|
-
float16x8x3_t raw;
|
|
521
|
-
#else
|
|
522
|
-
uint16x8x3_t raw;
|
|
523
|
-
#endif
|
|
524
|
-
};
|
|
525
|
-
template <size_t N>
|
|
526
|
-
struct Tuple3<float16_t, N> {
|
|
527
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
528
|
-
float16x4x3_t raw;
|
|
529
|
-
#else
|
|
530
|
-
uint16x4x3_t raw;
|
|
531
|
-
#endif
|
|
532
|
-
};
|
|
533
|
-
template <>
|
|
534
|
-
struct Tuple3<bfloat16_t, 8> {
|
|
535
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
536
|
-
bfloat16x8x3_t raw;
|
|
537
|
-
#else
|
|
538
|
-
uint16x8x3_t raw;
|
|
539
|
-
#endif
|
|
540
|
-
};
|
|
541
|
-
template <size_t N>
|
|
542
|
-
struct Tuple3<bfloat16_t, N> {
|
|
543
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
544
|
-
bfloat16x4x3_t raw;
|
|
545
|
-
#else
|
|
546
|
-
uint16x4x3_t raw;
|
|
547
|
-
#endif
|
|
548
|
-
};
|
|
549
|
-
|
|
550
498
|
template <>
|
|
551
499
|
struct Tuple3<float32_t, 4> {
|
|
552
500
|
float32x4x3_t raw;
|
|
@@ -631,39 +579,6 @@ struct Tuple4<int64_t, N> {
|
|
|
631
579
|
int64x1x4_t raw;
|
|
632
580
|
};
|
|
633
581
|
|
|
634
|
-
template <>
|
|
635
|
-
struct Tuple4<float16_t, 8> {
|
|
636
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
637
|
-
float16x8x4_t raw;
|
|
638
|
-
#else
|
|
639
|
-
uint16x8x4_t raw;
|
|
640
|
-
#endif
|
|
641
|
-
};
|
|
642
|
-
template <size_t N>
|
|
643
|
-
struct Tuple4<float16_t, N> {
|
|
644
|
-
#if HWY_NEON_HAVE_FLOAT16C
|
|
645
|
-
float16x4x4_t raw;
|
|
646
|
-
#else
|
|
647
|
-
uint16x4x4_t raw;
|
|
648
|
-
#endif
|
|
649
|
-
};
|
|
650
|
-
template <>
|
|
651
|
-
struct Tuple4<bfloat16_t, 8> {
|
|
652
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
653
|
-
bfloat16x8x4_t raw;
|
|
654
|
-
#else
|
|
655
|
-
uint16x8x4_t raw;
|
|
656
|
-
#endif
|
|
657
|
-
};
|
|
658
|
-
template <size_t N>
|
|
659
|
-
struct Tuple4<bfloat16_t, N> {
|
|
660
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
661
|
-
bfloat16x4x4_t raw;
|
|
662
|
-
#else
|
|
663
|
-
uint16x4x4_t raw;
|
|
664
|
-
#endif
|
|
665
|
-
};
|
|
666
|
-
|
|
667
582
|
template <>
|
|
668
583
|
struct Tuple4<float32_t, 4> {
|
|
669
584
|
float32x4x4_t raw;
|
|
@@ -686,201 +601,199 @@ struct Tuple4<float64_t, N> {
|
|
|
686
601
|
template <typename T, size_t N>
|
|
687
602
|
struct Raw128;
|
|
688
603
|
|
|
689
|
-
// 128
|
|
690
604
|
template <>
|
|
691
605
|
struct Raw128<uint8_t, 16> {
|
|
692
606
|
using type = uint8x16_t;
|
|
693
607
|
};
|
|
608
|
+
template <size_t N>
|
|
609
|
+
struct Raw128<uint8_t, N> {
|
|
610
|
+
using type = uint8x8_t;
|
|
611
|
+
};
|
|
694
612
|
|
|
695
613
|
template <>
|
|
696
614
|
struct Raw128<uint16_t, 8> {
|
|
697
615
|
using type = uint16x8_t;
|
|
698
616
|
};
|
|
617
|
+
template <size_t N>
|
|
618
|
+
struct Raw128<uint16_t, N> {
|
|
619
|
+
using type = uint16x4_t;
|
|
620
|
+
};
|
|
699
621
|
|
|
700
622
|
template <>
|
|
701
623
|
struct Raw128<uint32_t, 4> {
|
|
702
624
|
using type = uint32x4_t;
|
|
703
625
|
};
|
|
626
|
+
template <size_t N>
|
|
627
|
+
struct Raw128<uint32_t, N> {
|
|
628
|
+
using type = uint32x2_t;
|
|
629
|
+
};
|
|
704
630
|
|
|
705
631
|
template <>
|
|
706
632
|
struct Raw128<uint64_t, 2> {
|
|
707
633
|
using type = uint64x2_t;
|
|
708
634
|
};
|
|
635
|
+
template <>
|
|
636
|
+
struct Raw128<uint64_t, 1> {
|
|
637
|
+
using type = uint64x1_t;
|
|
638
|
+
};
|
|
709
639
|
|
|
710
640
|
template <>
|
|
711
641
|
struct Raw128<int8_t, 16> {
|
|
712
642
|
using type = int8x16_t;
|
|
713
643
|
};
|
|
644
|
+
template <size_t N>
|
|
645
|
+
struct Raw128<int8_t, N> {
|
|
646
|
+
using type = int8x8_t;
|
|
647
|
+
};
|
|
714
648
|
|
|
715
649
|
template <>
|
|
716
650
|
struct Raw128<int16_t, 8> {
|
|
717
651
|
using type = int16x8_t;
|
|
718
652
|
};
|
|
653
|
+
template <size_t N>
|
|
654
|
+
struct Raw128<int16_t, N> {
|
|
655
|
+
using type = int16x4_t;
|
|
656
|
+
};
|
|
719
657
|
|
|
720
658
|
template <>
|
|
721
659
|
struct Raw128<int32_t, 4> {
|
|
722
660
|
using type = int32x4_t;
|
|
723
661
|
};
|
|
662
|
+
template <size_t N>
|
|
663
|
+
struct Raw128<int32_t, N> {
|
|
664
|
+
using type = int32x2_t;
|
|
665
|
+
};
|
|
724
666
|
|
|
725
667
|
template <>
|
|
726
668
|
struct Raw128<int64_t, 2> {
|
|
727
669
|
using type = int64x2_t;
|
|
728
670
|
};
|
|
729
|
-
|
|
730
671
|
template <>
|
|
731
|
-
struct Raw128<
|
|
732
|
-
|
|
733
|
-
using type = float16x8_t;
|
|
734
|
-
#else
|
|
735
|
-
using type = uint16x8_t;
|
|
736
|
-
#endif
|
|
737
|
-
};
|
|
738
|
-
|
|
739
|
-
template <>
|
|
740
|
-
struct Raw128<bfloat16_t, 8> {
|
|
741
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
742
|
-
using type = bfloat16x8_t;
|
|
743
|
-
#else
|
|
744
|
-
using type = uint16x8_t;
|
|
745
|
-
#endif
|
|
672
|
+
struct Raw128<int64_t, 1> {
|
|
673
|
+
using type = int64x1_t;
|
|
746
674
|
};
|
|
747
675
|
|
|
748
676
|
template <>
|
|
749
677
|
struct Raw128<float, 4> {
|
|
750
678
|
using type = float32x4_t;
|
|
751
679
|
};
|
|
680
|
+
template <size_t N>
|
|
681
|
+
struct Raw128<float, N> {
|
|
682
|
+
using type = float32x2_t;
|
|
683
|
+
};
|
|
752
684
|
|
|
753
685
|
#if HWY_HAVE_FLOAT64
|
|
754
686
|
template <>
|
|
755
687
|
struct Raw128<double, 2> {
|
|
756
688
|
using type = float64x2_t;
|
|
757
689
|
};
|
|
758
|
-
#endif // HWY_HAVE_FLOAT64
|
|
759
|
-
|
|
760
|
-
// 64
|
|
761
690
|
template <>
|
|
762
|
-
struct Raw128<
|
|
763
|
-
using type =
|
|
764
|
-
};
|
|
765
|
-
|
|
766
|
-
template <>
|
|
767
|
-
struct Raw128<uint16_t, 4> {
|
|
768
|
-
using type = uint16x4_t;
|
|
769
|
-
};
|
|
770
|
-
|
|
771
|
-
template <>
|
|
772
|
-
struct Raw128<uint32_t, 2> {
|
|
773
|
-
using type = uint32x2_t;
|
|
691
|
+
struct Raw128<double, 1> {
|
|
692
|
+
using type = float64x1_t;
|
|
774
693
|
};
|
|
694
|
+
#endif // HWY_HAVE_FLOAT64
|
|
775
695
|
|
|
776
|
-
|
|
777
|
-
struct Raw128<uint64_t, 1> {
|
|
778
|
-
using type = uint64x1_t;
|
|
779
|
-
};
|
|
696
|
+
#if HWY_NEON_HAVE_F16C
|
|
780
697
|
|
|
781
698
|
template <>
|
|
782
|
-
struct
|
|
783
|
-
|
|
699
|
+
struct Tuple2<float16_t, 8> {
|
|
700
|
+
float16x8x2_t raw;
|
|
784
701
|
};
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
using type = int16x4_t;
|
|
702
|
+
template <size_t N>
|
|
703
|
+
struct Tuple2<float16_t, N> {
|
|
704
|
+
float16x4x2_t raw;
|
|
789
705
|
};
|
|
790
706
|
|
|
791
707
|
template <>
|
|
792
|
-
struct
|
|
793
|
-
|
|
708
|
+
struct Tuple3<float16_t, 8> {
|
|
709
|
+
float16x8x3_t raw;
|
|
794
710
|
};
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
using type = int64x1_t;
|
|
711
|
+
template <size_t N>
|
|
712
|
+
struct Tuple3<float16_t, N> {
|
|
713
|
+
float16x4x3_t raw;
|
|
799
714
|
};
|
|
800
715
|
|
|
801
716
|
template <>
|
|
802
|
-
struct
|
|
803
|
-
|
|
804
|
-
using type = float16x4_t;
|
|
805
|
-
#else
|
|
806
|
-
using type = uint16x4_t;
|
|
807
|
-
#endif
|
|
717
|
+
struct Tuple4<float16_t, 8> {
|
|
718
|
+
float16x8x4_t raw;
|
|
808
719
|
};
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
#if HWY_NEON_HAVE_BFLOAT16
|
|
813
|
-
using type = bfloat16x4_t;
|
|
814
|
-
#else
|
|
815
|
-
using type = uint16x4_t;
|
|
816
|
-
#endif
|
|
720
|
+
template <size_t N>
|
|
721
|
+
struct Tuple4<float16_t, N> {
|
|
722
|
+
float16x4x4_t raw;
|
|
817
723
|
};
|
|
818
724
|
|
|
819
725
|
template <>
|
|
820
|
-
struct Raw128<
|
|
821
|
-
using type =
|
|
726
|
+
struct Raw128<float16_t, 8> {
|
|
727
|
+
using type = float16x8_t;
|
|
822
728
|
};
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
struct Raw128<double, 1> {
|
|
827
|
-
using type = float64x1_t;
|
|
729
|
+
template <size_t N>
|
|
730
|
+
struct Raw128<float16_t, N> {
|
|
731
|
+
using type = float16x4_t;
|
|
828
732
|
};
|
|
829
|
-
#endif // HWY_HAVE_FLOAT64
|
|
830
|
-
|
|
831
|
-
// 32 (same as 64)
|
|
832
|
-
template <>
|
|
833
|
-
struct Raw128<uint8_t, 4> : public Raw128<uint8_t, 8> {};
|
|
834
733
|
|
|
835
|
-
|
|
836
|
-
struct Raw128<uint16_t, 2> : public Raw128<uint16_t, 4> {};
|
|
734
|
+
#else // !HWY_NEON_HAVE_F16C
|
|
837
735
|
|
|
838
|
-
template
|
|
839
|
-
struct
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
template
|
|
845
|
-
struct Raw128<
|
|
846
|
-
|
|
847
|
-
template <>
|
|
848
|
-
struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {};
|
|
849
|
-
|
|
850
|
-
template <>
|
|
851
|
-
struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {};
|
|
852
|
-
|
|
853
|
-
template <>
|
|
854
|
-
struct Raw128<bfloat16_t, 2> : public Raw128<bfloat16_t, 4> {};
|
|
736
|
+
template <size_t N>
|
|
737
|
+
struct Tuple2<float16_t, N> : public Tuple2<uint16_t, N> {};
|
|
738
|
+
template <size_t N>
|
|
739
|
+
struct Tuple3<float16_t, N> : public Tuple3<uint16_t, N> {};
|
|
740
|
+
template <size_t N>
|
|
741
|
+
struct Tuple4<float16_t, N> : public Tuple4<uint16_t, N> {};
|
|
742
|
+
template <size_t N>
|
|
743
|
+
struct Raw128<float16_t, N> : public Raw128<uint16_t, N> {};
|
|
855
744
|
|
|
856
|
-
|
|
857
|
-
struct Raw128<float, 1> : public Raw128<float, 2> {};
|
|
745
|
+
#endif // HWY_NEON_HAVE_F16C
|
|
858
746
|
|
|
859
|
-
|
|
860
|
-
template <>
|
|
861
|
-
struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {};
|
|
747
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
862
748
|
|
|
863
749
|
template <>
|
|
864
|
-
struct
|
|
750
|
+
struct Tuple2<bfloat16_t, 8> {
|
|
751
|
+
bfloat16x8x2_t raw;
|
|
752
|
+
};
|
|
753
|
+
template <size_t N>
|
|
754
|
+
struct Tuple2<bfloat16_t, N> {
|
|
755
|
+
bfloat16x4x2_t raw;
|
|
756
|
+
};
|
|
865
757
|
|
|
866
758
|
template <>
|
|
867
|
-
struct
|
|
759
|
+
struct Tuple3<bfloat16_t, 8> {
|
|
760
|
+
bfloat16x8x3_t raw;
|
|
761
|
+
};
|
|
762
|
+
template <size_t N>
|
|
763
|
+
struct Tuple3<bfloat16_t, N> {
|
|
764
|
+
bfloat16x4x3_t raw;
|
|
765
|
+
};
|
|
868
766
|
|
|
869
767
|
template <>
|
|
870
|
-
struct
|
|
768
|
+
struct Tuple4<bfloat16_t, 8> {
|
|
769
|
+
bfloat16x8x4_t raw;
|
|
770
|
+
};
|
|
771
|
+
template <size_t N>
|
|
772
|
+
struct Tuple4<bfloat16_t, N> {
|
|
773
|
+
bfloat16x4x4_t raw;
|
|
774
|
+
};
|
|
871
775
|
|
|
872
776
|
template <>
|
|
873
|
-
struct Raw128<
|
|
777
|
+
struct Raw128<bfloat16_t, 8> {
|
|
778
|
+
using type = bfloat16x8_t;
|
|
779
|
+
};
|
|
780
|
+
template <size_t N>
|
|
781
|
+
struct Raw128<bfloat16_t, N> {
|
|
782
|
+
using type = bfloat16x4_t;
|
|
783
|
+
};
|
|
874
784
|
|
|
875
|
-
|
|
876
|
-
struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {};
|
|
785
|
+
#else // !HWY_NEON_HAVE_BFLOAT16
|
|
877
786
|
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
787
|
+
template <size_t N>
|
|
788
|
+
struct Tuple2<bfloat16_t, N> : public Tuple2<uint16_t, N> {};
|
|
789
|
+
template <size_t N>
|
|
790
|
+
struct Tuple3<bfloat16_t, N> : public Tuple3<uint16_t, N> {};
|
|
791
|
+
template <size_t N>
|
|
792
|
+
struct Tuple4<bfloat16_t, N> : public Tuple4<uint16_t, N> {};
|
|
793
|
+
template <size_t N>
|
|
794
|
+
struct Raw128<bfloat16_t, N> : public Raw128<uint16_t, N> {};
|
|
881
795
|
|
|
882
|
-
|
|
883
|
-
struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {};
|
|
796
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
884
797
|
|
|
885
798
|
} // namespace detail
|
|
886
799
|
|
|
@@ -910,6 +823,9 @@ class Vec128 {
|
|
|
910
823
|
HWY_INLINE Vec128& operator-=(const Vec128 other) {
|
|
911
824
|
return *this = (*this - other);
|
|
912
825
|
}
|
|
826
|
+
HWY_INLINE Vec128& operator%=(const Vec128 other) {
|
|
827
|
+
return *this = (*this % other);
|
|
828
|
+
}
|
|
913
829
|
HWY_INLINE Vec128& operator&=(const Vec128 other) {
|
|
914
830
|
return *this = (*this & other);
|
|
915
831
|
}
|
|
@@ -978,26 +894,22 @@ namespace detail {
|
|
|
978
894
|
#define HWY_NEON_BUILD_ARG_HWY_SET t
|
|
979
895
|
|
|
980
896
|
HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)
|
|
981
|
-
|
|
982
|
-
#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_FLOAT16C
|
|
897
|
+
#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C
|
|
983
898
|
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET)
|
|
984
899
|
#endif
|
|
900
|
+
HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)
|
|
901
|
+
|
|
902
|
+
template <class D, HWY_NEON_IF_EMULATED_D(D)>
|
|
903
|
+
HWY_API Vec128<TFromD<D>, MaxLanes(D())> NativeSet(D d, TFromD<D> t) {
|
|
904
|
+
const uint16_t tu = BitCastScalar<uint16_t>(t);
|
|
905
|
+
return Vec128<TFromD<D>, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
|
|
906
|
+
}
|
|
985
907
|
|
|
986
908
|
#undef HWY_NEON_BUILD_TPL_HWY_SET
|
|
987
909
|
#undef HWY_NEON_BUILD_RET_HWY_SET
|
|
988
910
|
#undef HWY_NEON_BUILD_PARAM_HWY_SET
|
|
989
911
|
#undef HWY_NEON_BUILD_ARG_HWY_SET
|
|
990
912
|
|
|
991
|
-
#if !HWY_NEON_HAVE_BFLOAT16
|
|
992
|
-
// BF16: return u16.
|
|
993
|
-
template <class D, HWY_IF_BF16_D(D)>
|
|
994
|
-
HWY_API Vec128<bfloat16_t, MaxLanes(D())> NativeSet(D d, bfloat16_t t) {
|
|
995
|
-
uint16_t tu;
|
|
996
|
-
CopyBytes<sizeof(tu)>(&t, &tu);
|
|
997
|
-
return Vec128<bfloat16_t, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
|
|
998
|
-
}
|
|
999
|
-
#endif // !HWY_NEON_HAVE_BFLOAT16
|
|
1000
|
-
|
|
1001
913
|
} // namespace detail
|
|
1002
914
|
|
|
1003
915
|
// Full vector. Cannot yet use VFromD because that is defined in terms of Set.
|
|
@@ -1039,159 +951,313 @@ HWY_API VFromD<D> Undefined(D /*tag*/) {
|
|
|
1039
951
|
|
|
1040
952
|
HWY_DIAGNOSTICS(pop)
|
|
1041
953
|
|
|
954
|
+
#if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
|
|
1042
955
|
namespace detail {
|
|
1043
956
|
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
957
|
+
#pragma pack(push, 1)
|
|
958
|
+
|
|
959
|
+
template <class T>
|
|
960
|
+
struct alignas(8) Vec64ValsWrapper {
|
|
961
|
+
static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
|
|
962
|
+
static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
|
|
963
|
+
T vals[8 / sizeof(T)];
|
|
964
|
+
};
|
|
965
|
+
|
|
966
|
+
#pragma pack(pop)
|
|
967
|
+
|
|
968
|
+
} // namespace detail
|
|
969
|
+
#endif // !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
|
|
970
|
+
|
|
971
|
+
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
972
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
973
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
974
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
975
|
+
TFromD<D> /*t8*/, TFromD<D> /*t9*/,
|
|
976
|
+
TFromD<D> /*t10*/, TFromD<D> /*t11*/,
|
|
977
|
+
TFromD<D> /*t12*/, TFromD<D> /*t13*/,
|
|
978
|
+
TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
|
|
1047
979
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1048
|
-
typedef
|
|
1049
|
-
|
|
1050
|
-
const
|
|
980
|
+
typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8)));
|
|
981
|
+
(void)d;
|
|
982
|
+
const GccI8RawVectType raw = {
|
|
983
|
+
static_cast<int8_t>(t0), static_cast<int8_t>(t1), static_cast<int8_t>(t2),
|
|
984
|
+
static_cast<int8_t>(t3), static_cast<int8_t>(t4), static_cast<int8_t>(t5),
|
|
985
|
+
static_cast<int8_t>(t6), static_cast<int8_t>(t7)};
|
|
986
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1051
987
|
#else
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
988
|
+
return ResizeBitCast(
|
|
989
|
+
d, Set(Full64<uint64_t>(),
|
|
990
|
+
BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
|
|
991
|
+
{t0, t1, t2, t3, t4, t5, t6, t7}})));
|
|
1055
992
|
#endif
|
|
1056
|
-
return BitCast(d, vu8_iota0);
|
|
1057
993
|
}
|
|
1058
994
|
|
|
1059
|
-
template <class D,
|
|
1060
|
-
|
|
1061
|
-
|
|
995
|
+
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
996
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
997
|
+
TFromD<D> t2, TFromD<D> t3,
|
|
998
|
+
TFromD<D> /*t4*/, TFromD<D> /*t5*/,
|
|
999
|
+
TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
|
|
1062
1000
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1063
|
-
typedef
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1001
|
+
typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8)));
|
|
1002
|
+
(void)d;
|
|
1003
|
+
const GccI16RawVectType raw = {
|
|
1004
|
+
static_cast<int16_t>(t0), static_cast<int16_t>(t1),
|
|
1005
|
+
static_cast<int16_t>(t2), static_cast<int16_t>(t3)};
|
|
1006
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1067
1007
|
#else
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1008
|
+
return ResizeBitCast(
|
|
1009
|
+
d, Set(Full64<uint64_t>(),
|
|
1010
|
+
BitCastScalar<uint64_t>(
|
|
1011
|
+
detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}})));
|
|
1071
1012
|
#endif
|
|
1072
|
-
return BitCast(d, vu8_iota0);
|
|
1073
1013
|
}
|
|
1074
1014
|
|
|
1075
|
-
template <class D,
|
|
1076
|
-
|
|
1077
|
-
|
|
1015
|
+
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
1016
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1017
|
+
TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
|
|
1078
1018
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1079
|
-
typedef
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1019
|
+
typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8)));
|
|
1020
|
+
(void)d;
|
|
1021
|
+
const GccI32RawVectType raw = {static_cast<int32_t>(t0),
|
|
1022
|
+
static_cast<int32_t>(t1)};
|
|
1023
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1083
1024
|
#else
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1025
|
+
return ResizeBitCast(d,
|
|
1026
|
+
Set(Full64<uint64_t>(),
|
|
1027
|
+
BitCastScalar<uint64_t>(
|
|
1028
|
+
detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
|
|
1087
1029
|
#endif
|
|
1088
1030
|
}
|
|
1089
1031
|
|
|
1090
|
-
template <class D,
|
|
1091
|
-
|
|
1092
|
-
|
|
1032
|
+
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
1033
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1034
|
+
TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
|
|
1093
1035
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1094
|
-
typedef
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(kIota0));
|
|
1036
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
|
|
1037
|
+
(void)d;
|
|
1038
|
+
const GccF32RawVectType raw = {t0, t1};
|
|
1039
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1099
1040
|
#else
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1041
|
+
return ResizeBitCast(d,
|
|
1042
|
+
Set(Full64<uint64_t>(),
|
|
1043
|
+
BitCastScalar<uint64_t>(
|
|
1044
|
+
detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
|
|
1103
1045
|
#endif
|
|
1104
1046
|
}
|
|
1105
1047
|
|
|
1106
|
-
template <class D,
|
|
1107
|
-
|
|
1108
|
-
|
|
1048
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
|
|
1049
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
|
|
1050
|
+
return Set(d, t0);
|
|
1051
|
+
}
|
|
1052
|
+
|
|
1053
|
+
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1054
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1055
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1056
|
+
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
|
|
1057
|
+
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
|
|
1058
|
+
TFromD<D> t11, TFromD<D> t12,
|
|
1059
|
+
TFromD<D> t13, TFromD<D> t14,
|
|
1060
|
+
TFromD<D> t15) {
|
|
1109
1061
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1110
|
-
typedef
|
|
1111
|
-
|
|
1112
|
-
const
|
|
1113
|
-
|
|
1062
|
+
typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
|
|
1063
|
+
(void)d;
|
|
1064
|
+
const GccI8RawVectType raw = {
|
|
1065
|
+
static_cast<int8_t>(t0), static_cast<int8_t>(t1),
|
|
1066
|
+
static_cast<int8_t>(t2), static_cast<int8_t>(t3),
|
|
1067
|
+
static_cast<int8_t>(t4), static_cast<int8_t>(t5),
|
|
1068
|
+
static_cast<int8_t>(t6), static_cast<int8_t>(t7),
|
|
1069
|
+
static_cast<int8_t>(t8), static_cast<int8_t>(t9),
|
|
1070
|
+
static_cast<int8_t>(t10), static_cast<int8_t>(t11),
|
|
1071
|
+
static_cast<int8_t>(t12), static_cast<int8_t>(t13),
|
|
1072
|
+
static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
|
|
1073
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1114
1074
|
#else
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1075
|
+
const Half<decltype(d)> dh;
|
|
1076
|
+
return Combine(d,
|
|
1077
|
+
Dup128VecFromValues(dh, t8, t9, t10, t11, t12, t13, t14, t15,
|
|
1078
|
+
t8, t9, t10, t11, t12, t13, t14, t15),
|
|
1079
|
+
Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1,
|
|
1080
|
+
t2, t3, t4, t5, t6, t7));
|
|
1118
1081
|
#endif
|
|
1119
|
-
return BitCast(d, vu32_iota0);
|
|
1120
1082
|
}
|
|
1121
1083
|
|
|
1122
|
-
template <class D,
|
|
1123
|
-
|
|
1124
|
-
|
|
1084
|
+
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1085
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1086
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1087
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
1088
|
+
TFromD<D> t7) {
|
|
1125
1089
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1126
|
-
typedef
|
|
1127
|
-
|
|
1128
|
-
const
|
|
1129
|
-
|
|
1090
|
+
typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
|
|
1091
|
+
(void)d;
|
|
1092
|
+
const GccI16RawVectType raw = {
|
|
1093
|
+
static_cast<int16_t>(t0), static_cast<int16_t>(t1),
|
|
1094
|
+
static_cast<int16_t>(t2), static_cast<int16_t>(t3),
|
|
1095
|
+
static_cast<int16_t>(t4), static_cast<int16_t>(t5),
|
|
1096
|
+
static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
|
|
1097
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1130
1098
|
#else
|
|
1131
|
-
|
|
1132
|
-
|
|
1099
|
+
const Half<decltype(d)> dh;
|
|
1100
|
+
return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7),
|
|
1101
|
+
Dup128VecFromValues(dh, t0, t1, t2, t3, t0, t1, t2, t3));
|
|
1133
1102
|
#endif
|
|
1134
|
-
return BitCast(d, vu32_iota0);
|
|
1135
1103
|
}
|
|
1136
1104
|
|
|
1137
|
-
template <class D,
|
|
1138
|
-
|
|
1105
|
+
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1106
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1107
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
1139
1108
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1140
|
-
typedef
|
|
1141
|
-
|
|
1142
|
-
|
|
1109
|
+
typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
|
|
1110
|
+
(void)d;
|
|
1111
|
+
const GccI32RawVectType raw = {
|
|
1112
|
+
static_cast<int32_t>(t0), static_cast<int32_t>(t1),
|
|
1113
|
+
static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
|
|
1114
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1143
1115
|
#else
|
|
1144
|
-
|
|
1145
|
-
return
|
|
1146
|
-
|
|
1116
|
+
const Half<decltype(d)> dh;
|
|
1117
|
+
return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
|
|
1118
|
+
Dup128VecFromValues(dh, t0, t1, t0, t1));
|
|
1147
1119
|
#endif
|
|
1148
1120
|
}
|
|
1149
1121
|
|
|
1150
|
-
template <class D,
|
|
1151
|
-
|
|
1122
|
+
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1123
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1124
|
+
TFromD<D> t2, TFromD<D> t3) {
|
|
1152
1125
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1153
1126
|
typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
|
|
1154
|
-
|
|
1155
|
-
|
|
1127
|
+
(void)d;
|
|
1128
|
+
const GccF32RawVectType raw = {t0, t1, t2, t3};
|
|
1129
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1156
1130
|
#else
|
|
1157
|
-
|
|
1158
|
-
return
|
|
1131
|
+
const Half<decltype(d)> dh;
|
|
1132
|
+
return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
|
|
1133
|
+
Dup128VecFromValues(dh, t0, t1, t0, t1));
|
|
1159
1134
|
#endif
|
|
1160
1135
|
}
|
|
1161
1136
|
|
|
1162
|
-
template <class D,
|
|
1163
|
-
|
|
1164
|
-
return Zero(d);
|
|
1165
|
-
}
|
|
1166
|
-
|
|
1167
|
-
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
|
|
1168
|
-
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1169
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
1137
|
+
template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1138
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
|
|
1170
1139
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1171
|
-
typedef
|
|
1172
|
-
|
|
1173
|
-
const
|
|
1174
|
-
|
|
1140
|
+
typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
|
|
1141
|
+
(void)d;
|
|
1142
|
+
const GccI64RawVectType raw = {static_cast<int64_t>(t0),
|
|
1143
|
+
static_cast<int64_t>(t1)};
|
|
1144
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1175
1145
|
#else
|
|
1176
|
-
|
|
1177
|
-
|
|
1146
|
+
const Half<decltype(d)> dh;
|
|
1147
|
+
return Combine(d, Set(dh, t1), Set(dh, t0));
|
|
1178
1148
|
#endif
|
|
1179
|
-
return BitCast(d, vu64_iota0);
|
|
1180
1149
|
}
|
|
1181
1150
|
|
|
1182
1151
|
#if HWY_HAVE_FLOAT64
|
|
1183
|
-
template <class D,
|
|
1184
|
-
|
|
1152
|
+
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1153
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
|
|
1185
1154
|
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
|
|
1186
1155
|
typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
|
|
1187
|
-
|
|
1188
|
-
|
|
1156
|
+
(void)d;
|
|
1157
|
+
const GccF64RawVectType raw = {t0, t1};
|
|
1158
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1189
1159
|
#else
|
|
1190
|
-
|
|
1191
|
-
return
|
|
1160
|
+
const Half<decltype(d)> dh;
|
|
1161
|
+
return Combine(d, Set(dh, t1), Set(dh, t0));
|
|
1192
1162
|
#endif
|
|
1193
1163
|
}
|
|
1194
|
-
#endif
|
|
1164
|
+
#endif
|
|
1165
|
+
|
|
1166
|
+
// Generic for all vector lengths
|
|
1167
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
1168
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1169
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1170
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
1171
|
+
TFromD<D> t7) {
|
|
1172
|
+
const RebindToSigned<decltype(d)> di;
|
|
1173
|
+
return BitCast(d,
|
|
1174
|
+
Dup128VecFromValues(
|
|
1175
|
+
di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
|
|
1176
|
+
BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
|
|
1177
|
+
BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
|
|
1178
|
+
BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
|
|
1179
|
+
}
|
|
1180
|
+
|
|
1181
|
+
#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
|
|
1182
|
+
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
1183
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1184
|
+
TFromD<D> t2, TFromD<D> t3,
|
|
1185
|
+
TFromD<D> /*t4*/, TFromD<D> /*t5*/,
|
|
1186
|
+
TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
|
|
1187
|
+
typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8)));
|
|
1188
|
+
(void)d;
|
|
1189
|
+
const GccF16RawVectType raw = {
|
|
1190
|
+
static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
|
|
1191
|
+
static_cast<__fp16>(t3)};
|
|
1192
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1193
|
+
}
|
|
1194
|
+
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
1195
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1196
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1197
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
1198
|
+
TFromD<D> t7) {
|
|
1199
|
+
typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16)));
|
|
1200
|
+
(void)d;
|
|
1201
|
+
const GccF16RawVectType raw = {
|
|
1202
|
+
static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
|
|
1203
|
+
static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5),
|
|
1204
|
+
static_cast<__fp16>(t6), static_cast<__fp16>(t7)};
|
|
1205
|
+
return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
|
|
1206
|
+
}
|
|
1207
|
+
#else
|
|
1208
|
+
// Generic for all vector lengths if MSVC or !HWY_NEON_HAVE_F16C
|
|
1209
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
1210
|
+
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
|
|
1211
|
+
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
|
|
1212
|
+
TFromD<D> t5, TFromD<D> t6,
|
|
1213
|
+
TFromD<D> t7) {
|
|
1214
|
+
const RebindToSigned<decltype(d)> di;
|
|
1215
|
+
return BitCast(d,
|
|
1216
|
+
Dup128VecFromValues(
|
|
1217
|
+
di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
|
|
1218
|
+
BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
|
|
1219
|
+
BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
|
|
1220
|
+
BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
|
|
1221
|
+
}
|
|
1222
|
+
#endif // (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
|
|
1223
|
+
|
|
1224
|
+
namespace detail {
|
|
1225
|
+
|
|
1226
|
+
template <class D, HWY_IF_T_SIZE_D(D, 1)>
|
|
1227
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1228
|
+
return Dup128VecFromValues(
|
|
1229
|
+
d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4},
|
|
1230
|
+
TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9},
|
|
1231
|
+
TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14},
|
|
1232
|
+
TFromD<D>{15});
|
|
1233
|
+
}
|
|
1234
|
+
|
|
1235
|
+
template <class D, HWY_IF_UI16_D(D)>
|
|
1236
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1237
|
+
return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
|
|
1238
|
+
TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
|
|
1239
|
+
TFromD<D>{6}, TFromD<D>{7});
|
|
1240
|
+
}
|
|
1241
|
+
|
|
1242
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
1243
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1244
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
1245
|
+
return BitCast(d, Dup128VecFromValues(du, uint16_t{0}, uint16_t{0x3C00},
|
|
1246
|
+
uint16_t{0x4000}, uint16_t{0x4200},
|
|
1247
|
+
uint16_t{0x4400}, uint16_t{0x4500},
|
|
1248
|
+
uint16_t{0x4600}, uint16_t{0x4700}));
|
|
1249
|
+
}
|
|
1250
|
+
|
|
1251
|
+
template <class D, HWY_IF_T_SIZE_D(D, 4)>
|
|
1252
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1253
|
+
return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
|
|
1254
|
+
TFromD<D>{3});
|
|
1255
|
+
}
|
|
1256
|
+
|
|
1257
|
+
template <class D, HWY_IF_T_SIZE_D(D, 8)>
|
|
1258
|
+
HWY_INLINE VFromD<D> Iota0(D d) {
|
|
1259
|
+
return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1});
|
|
1260
|
+
}
|
|
1195
1261
|
|
|
1196
1262
|
#if HWY_COMPILER_MSVC
|
|
1197
1263
|
template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
|
|
@@ -1274,30 +1340,25 @@ HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi,
|
|
|
1274
1340
|
return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
|
|
1275
1341
|
}
|
|
1276
1342
|
|
|
1277
|
-
template <class D, HWY_IF_F16_D(D)>
|
|
1278
|
-
HWY_API Vec128<float16_t> Combine(D d, Vec64<float16_t> hi,
|
|
1279
|
-
Vec64<float16_t> lo) {
|
|
1280
1343
|
#if HWY_HAVE_FLOAT16
|
|
1281
|
-
|
|
1344
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
1345
|
+
HWY_API Vec128<float16_t> Combine(D, Vec64<float16_t> hi, Vec64<float16_t> lo) {
|
|
1282
1346
|
return Vec128<float16_t>(vcombine_f16(lo.raw, hi.raw));
|
|
1283
|
-
#else
|
|
1284
|
-
const RebindToUnsigned<D> du;
|
|
1285
|
-
const Half<decltype(du)> duh;
|
|
1286
|
-
return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
|
|
1287
|
-
#endif
|
|
1288
1347
|
}
|
|
1348
|
+
#endif // HWY_HAVE_FLOAT16
|
|
1289
1349
|
|
|
1290
|
-
template <class D, HWY_IF_BF16_D(D)>
|
|
1291
|
-
HWY_API Vec128<bfloat16_t> Combine(D d, Vec64<bfloat16_t> hi,
|
|
1292
|
-
Vec64<bfloat16_t> lo) {
|
|
1293
1350
|
#if HWY_NEON_HAVE_BFLOAT16
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1351
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
1352
|
+
HWY_API VFromD<D> Combine(D, Vec64<bfloat16_t> hi, Vec64<bfloat16_t> lo) {
|
|
1353
|
+
return VFromD<D>(vcombine_bf16(lo.raw, hi.raw));
|
|
1354
|
+
}
|
|
1355
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
1356
|
+
|
|
1357
|
+
template <class D, class DH = Half<D>, HWY_NEON_IF_EMULATED_D(D)>
|
|
1358
|
+
HWY_API VFromD<D> Combine(D d, VFromD<DH> hi, VFromD<DH> lo) {
|
|
1297
1359
|
const RebindToUnsigned<D> du;
|
|
1298
1360
|
const Half<decltype(du)> duh;
|
|
1299
1361
|
return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
|
|
1300
|
-
#endif
|
|
1301
1362
|
}
|
|
1302
1363
|
|
|
1303
1364
|
template <class D, HWY_IF_F32_D(D)>
|
|
@@ -1341,7 +1402,7 @@ HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
|
|
|
1341
1402
|
HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
|
|
1342
1403
|
|
|
1343
1404
|
#if !HWY_HAVE_FLOAT16
|
|
1344
|
-
#if
|
|
1405
|
+
#if HWY_NEON_HAVE_F16C
|
|
1345
1406
|
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
|
|
1346
1407
|
HWY_CAST_TO_U8)
|
|
1347
1408
|
#else
|
|
@@ -1349,7 +1410,7 @@ template <size_t N>
|
|
|
1349
1410
|
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
|
|
1350
1411
|
return BitCastToByte(Vec128<uint16_t, N>(v.raw));
|
|
1351
1412
|
}
|
|
1352
|
-
#endif //
|
|
1413
|
+
#endif // HWY_NEON_HAVE_F16C
|
|
1353
1414
|
#endif // !HWY_HAVE_FLOAT16
|
|
1354
1415
|
|
|
1355
1416
|
#if !HWY_NEON_HAVE_BFLOAT16
|
|
@@ -1406,14 +1467,24 @@ HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
|
|
|
1406
1467
|
return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
|
|
1407
1468
|
}
|
|
1408
1469
|
|
|
1470
|
+
// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
|
|
1409
1471
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
|
|
1410
|
-
HWY_INLINE VFromD<D> BitCastFromByte(D
|
|
1411
|
-
#if HWY_HAVE_FLOAT16 ||
|
|
1412
|
-
(void)d;
|
|
1472
|
+
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
|
|
1473
|
+
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
|
|
1413
1474
|
return VFromD<D>(vreinterpret_f16_u8(v.raw));
|
|
1414
1475
|
#else
|
|
1415
1476
|
const RebindToUnsigned<D> du;
|
|
1416
|
-
return VFromD<
|
|
1477
|
+
return VFromD<D>(BitCastFromByte(du, v).raw);
|
|
1478
|
+
#endif
|
|
1479
|
+
}
|
|
1480
|
+
|
|
1481
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
1482
|
+
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
|
|
1483
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
1484
|
+
return VFromD<D>(vreinterpret_bf16_u8(v.raw));
|
|
1485
|
+
#else
|
|
1486
|
+
const RebindToUnsigned<D> du;
|
|
1487
|
+
return VFromD<D>(BitCastFromByte(du, v).raw);
|
|
1417
1488
|
#endif
|
|
1418
1489
|
}
|
|
1419
1490
|
|
|
@@ -1461,15 +1532,6 @@ HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
|
|
|
1461
1532
|
return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
|
|
1462
1533
|
}
|
|
1463
1534
|
|
|
1464
|
-
template <class D, HWY_IF_F16_D(D)>
|
|
1465
|
-
HWY_INLINE Vec128<float16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
|
|
1466
|
-
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_FLOAT16C
|
|
1467
|
-
return Vec128<float16_t>(vreinterpretq_f16_u8(v.raw));
|
|
1468
|
-
#else
|
|
1469
|
-
return Vec128<float16_t>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
|
|
1470
|
-
#endif
|
|
1471
|
-
}
|
|
1472
|
-
|
|
1473
1535
|
template <class D, HWY_IF_F32_D(D)>
|
|
1474
1536
|
HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
|
|
1475
1537
|
return Vec128<float>(vreinterpretq_f32_u8(v.raw));
|
|
@@ -1482,11 +1544,23 @@ HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
|
|
|
1482
1544
|
}
|
|
1483
1545
|
#endif // HWY_HAVE_FLOAT64
|
|
1484
1546
|
|
|
1485
|
-
//
|
|
1547
|
+
// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
|
|
1548
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
1549
|
+
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
|
|
1550
|
+
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
|
|
1551
|
+
return VFromD<D>(vreinterpretq_f16_u8(v.raw));
|
|
1552
|
+
#else
|
|
1553
|
+
return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
|
|
1554
|
+
#endif
|
|
1555
|
+
}
|
|
1556
|
+
|
|
1486
1557
|
template <class D, HWY_IF_BF16_D(D)>
|
|
1487
|
-
HWY_INLINE VFromD<D> BitCastFromByte(D
|
|
1488
|
-
|
|
1558
|
+
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
|
|
1559
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
1560
|
+
return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
|
|
1561
|
+
#else
|
|
1489
1562
|
return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
|
|
1563
|
+
#endif
|
|
1490
1564
|
}
|
|
1491
1565
|
|
|
1492
1566
|
} // namespace detail
|
|
@@ -1694,6 +1768,14 @@ HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
|
|
|
1694
1768
|
#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
|
|
1695
1769
|
#undef HWY_NEON_BUILD_ARG_HWY_INSERT
|
|
1696
1770
|
|
|
1771
|
+
template <size_t kLane, class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
|
|
1772
|
+
HWY_API V InsertLane(const V v, TFromD<D> t) {
|
|
1773
|
+
const D d;
|
|
1774
|
+
const RebindToUnsigned<D> du;
|
|
1775
|
+
const uint16_t tu = BitCastScalar<uint16_t>(t);
|
|
1776
|
+
return BitCast(d, InsertLane<kLane>(BitCast(du, v), tu));
|
|
1777
|
+
}
|
|
1778
|
+
|
|
1697
1779
|
} // namespace detail
|
|
1698
1780
|
|
|
1699
1781
|
// Requires one overload per vector length because InsertLane<3> may be a
|
|
@@ -1842,6 +1924,89 @@ HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
|
|
|
1842
1924
|
HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
|
|
1843
1925
|
return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
|
|
1844
1926
|
}
|
|
1927
|
+
HWY_API Vec128<int64_t> SumsOf8(const Vec128<int8_t> v) {
|
|
1928
|
+
return Vec128<int64_t>(vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(v.raw))));
|
|
1929
|
+
}
|
|
1930
|
+
HWY_API Vec64<int64_t> SumsOf8(const Vec64<int8_t> v) {
|
|
1931
|
+
return Vec64<int64_t>(vpaddl_s32(vpaddl_s16(vpaddl_s8(v.raw))));
|
|
1932
|
+
}
|
|
1933
|
+
|
|
1934
|
+
// ------------------------------ SumsOf2
|
|
1935
|
+
namespace detail {
|
|
1936
|
+
|
|
1937
|
+
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
1938
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1939
|
+
hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
|
|
1940
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s8(v.raw));
|
|
1941
|
+
}
|
|
1942
|
+
|
|
1943
|
+
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
1944
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1945
|
+
hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
|
|
1946
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s8(v.raw));
|
|
1947
|
+
}
|
|
1948
|
+
|
|
1949
|
+
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
1950
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1951
|
+
hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
|
|
1952
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u8(v.raw));
|
|
1953
|
+
}
|
|
1954
|
+
|
|
1955
|
+
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
1956
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1957
|
+
hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
|
|
1958
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u8(v.raw));
|
|
1959
|
+
}
|
|
1960
|
+
|
|
1961
|
+
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
1962
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1963
|
+
hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
1964
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s16(v.raw));
|
|
1965
|
+
}
|
|
1966
|
+
|
|
1967
|
+
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
1968
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1969
|
+
hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
1970
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s16(v.raw));
|
|
1971
|
+
}
|
|
1972
|
+
|
|
1973
|
+
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
1974
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1975
|
+
hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
1976
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u16(v.raw));
|
|
1977
|
+
}
|
|
1978
|
+
|
|
1979
|
+
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
1980
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1981
|
+
hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
|
|
1982
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u16(v.raw));
|
|
1983
|
+
}
|
|
1984
|
+
|
|
1985
|
+
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
1986
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1987
|
+
hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
|
|
1988
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s32(v.raw));
|
|
1989
|
+
}
|
|
1990
|
+
|
|
1991
|
+
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
1992
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1993
|
+
hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
|
|
1994
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s32(v.raw));
|
|
1995
|
+
}
|
|
1996
|
+
|
|
1997
|
+
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
|
|
1998
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
1999
|
+
hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
|
|
2000
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u32(v.raw));
|
|
2001
|
+
}
|
|
2002
|
+
|
|
2003
|
+
template <class V, HWY_IF_V_SIZE_V(V, 16)>
|
|
2004
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
2005
|
+
hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
|
|
2006
|
+
return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u32(v.raw));
|
|
2007
|
+
}
|
|
2008
|
+
|
|
2009
|
+
} // namespace detail
|
|
1845
2010
|
|
|
1846
2011
|
// ------------------------------ SaturatedAdd
|
|
1847
2012
|
|
|
@@ -1922,6 +2087,31 @@ HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
|
|
|
1922
2087
|
#endif
|
|
1923
2088
|
}
|
|
1924
2089
|
|
|
2090
|
+
// ------------------------------ SaturatedNeg
|
|
2091
|
+
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
|
|
2092
|
+
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
|
|
2093
|
+
#else
|
|
2094
|
+
#define HWY_NATIVE_SATURATED_NEG_8_16_32
|
|
2095
|
+
#endif
|
|
2096
|
+
|
|
2097
|
+
HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedNeg, vqneg, _, 1)
|
|
2098
|
+
|
|
2099
|
+
#if HWY_ARCH_ARM_A64
|
|
2100
|
+
#ifdef HWY_NATIVE_SATURATED_NEG_64
|
|
2101
|
+
#undef HWY_NATIVE_SATURATED_NEG_64
|
|
2102
|
+
#else
|
|
2103
|
+
#define HWY_NATIVE_SATURATED_NEG_64
|
|
2104
|
+
#endif
|
|
2105
|
+
|
|
2106
|
+
HWY_API Vec64<int64_t> SaturatedNeg(const Vec64<int64_t> v) {
|
|
2107
|
+
return Vec64<int64_t>(vqneg_s64(v.raw));
|
|
2108
|
+
}
|
|
2109
|
+
|
|
2110
|
+
HWY_API Vec128<int64_t> SaturatedNeg(const Vec128<int64_t> v) {
|
|
2111
|
+
return Vec128<int64_t>(vqnegq_s64(v.raw));
|
|
2112
|
+
}
|
|
2113
|
+
#endif
|
|
2114
|
+
|
|
1925
2115
|
// ------------------------------ ShiftLeft
|
|
1926
2116
|
|
|
1927
2117
|
// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
|
|
@@ -2310,13 +2500,13 @@ HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
|
|
|
2310
2500
|
return detail::NegMulAdd(add, mul, x);
|
|
2311
2501
|
}
|
|
2312
2502
|
|
|
2313
|
-
template <typename T, size_t N>
|
|
2503
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
2314
2504
|
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
|
|
2315
2505
|
Vec128<T, N> sub) {
|
|
2316
2506
|
return MulAdd(mul, x, Neg(sub));
|
|
2317
2507
|
}
|
|
2318
2508
|
|
|
2319
|
-
template <typename T, size_t N>
|
|
2509
|
+
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
2320
2510
|
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
|
|
2321
2511
|
Vec128<T, N> sub) {
|
|
2322
2512
|
return Neg(MulAdd(mul, x, sub));
|
|
@@ -2612,6 +2802,15 @@ HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
|
|
|
2612
2802
|
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Abs, vabs, _, 1)
|
|
2613
2803
|
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1)
|
|
2614
2804
|
|
|
2805
|
+
// ------------------------------ SaturatedAbs
|
|
2806
|
+
#ifdef HWY_NATIVE_SATURATED_ABS
|
|
2807
|
+
#undef HWY_NATIVE_SATURATED_ABS
|
|
2808
|
+
#else
|
|
2809
|
+
#define HWY_NATIVE_SATURATED_ABS
|
|
2810
|
+
#endif
|
|
2811
|
+
|
|
2812
|
+
HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1)
|
|
2813
|
+
|
|
2615
2814
|
// ------------------------------ CopySign
|
|
2616
2815
|
template <typename T, size_t N>
|
|
2617
2816
|
HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
|
|
@@ -2675,22 +2874,42 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
|
|
|
2675
2874
|
|
|
2676
2875
|
HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
|
|
2677
2876
|
|
|
2877
|
+
template <class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
|
|
2878
|
+
HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
|
|
2879
|
+
const DFromV<decltype(yes)> d;
|
|
2880
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
2881
|
+
return BitCast(
|
|
2882
|
+
d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
|
|
2883
|
+
}
|
|
2884
|
+
|
|
2678
2885
|
#undef HWY_NEON_BUILD_TPL_HWY_IF
|
|
2679
2886
|
#undef HWY_NEON_BUILD_RET_HWY_IF
|
|
2680
2887
|
#undef HWY_NEON_BUILD_PARAM_HWY_IF
|
|
2681
2888
|
#undef HWY_NEON_BUILD_ARG_HWY_IF
|
|
2682
2889
|
|
|
2683
2890
|
// mask ? yes : 0
|
|
2684
|
-
template <typename T, size_t N>
|
|
2891
|
+
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
2685
2892
|
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
2686
2893
|
return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
|
|
2687
2894
|
}
|
|
2895
|
+
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
|
|
2896
|
+
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
|
|
2897
|
+
const DFromV<decltype(yes)> d;
|
|
2898
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
2899
|
+
return BitCast(d, IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
|
|
2900
|
+
}
|
|
2688
2901
|
|
|
2689
2902
|
// mask ? 0 : no
|
|
2690
|
-
template <typename T, size_t N>
|
|
2903
|
+
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
|
|
2691
2904
|
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
|
|
2692
2905
|
return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
|
|
2693
2906
|
}
|
|
2907
|
+
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
|
|
2908
|
+
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
|
|
2909
|
+
const DFromV<decltype(no)> d;
|
|
2910
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
2911
|
+
return BitCast(d, IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
|
|
2912
|
+
}
|
|
2694
2913
|
|
|
2695
2914
|
template <typename T, size_t N>
|
|
2696
2915
|
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
@@ -2957,6 +3176,23 @@ HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
|
|
|
2957
3176
|
#endif
|
|
2958
3177
|
}
|
|
2959
3178
|
|
|
3179
|
+
HWY_API Vec128<int64_t> SaturatedAbs(const Vec128<int64_t> v) {
|
|
3180
|
+
#if HWY_ARCH_ARM_A64
|
|
3181
|
+
return Vec128<int64_t>(vqabsq_s64(v.raw));
|
|
3182
|
+
#else
|
|
3183
|
+
const auto zero = Zero(DFromV<decltype(v)>());
|
|
3184
|
+
return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
|
|
3185
|
+
#endif
|
|
3186
|
+
}
|
|
3187
|
+
HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
|
|
3188
|
+
#if HWY_ARCH_ARM_A64
|
|
3189
|
+
return Vec64<int64_t>(vqabs_s64(v.raw));
|
|
3190
|
+
#else
|
|
3191
|
+
const auto zero = Zero(DFromV<decltype(v)>());
|
|
3192
|
+
return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
|
|
3193
|
+
#endif
|
|
3194
|
+
}
|
|
3195
|
+
|
|
2960
3196
|
// ------------------------------ Min (IfThenElse, BroadcastSignBit)
|
|
2961
3197
|
|
|
2962
3198
|
// Unsigned
|
|
@@ -3133,6 +3369,20 @@ HWY_API Vec128<int64_t> LoadU(D /* tag */,
|
|
|
3133
3369
|
const int64_t* HWY_RESTRICT unaligned) {
|
|
3134
3370
|
return Vec128<int64_t>(vld1q_s64(unaligned));
|
|
3135
3371
|
}
|
|
3372
|
+
#if HWY_HAVE_FLOAT16
|
|
3373
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
|
|
3374
|
+
HWY_API Vec128<float16_t> LoadU(D /* tag */,
|
|
3375
|
+
const float16_t* HWY_RESTRICT unaligned) {
|
|
3376
|
+
return Vec128<float16_t>(vld1q_f16(detail::NativeLanePointer(unaligned)));
|
|
3377
|
+
}
|
|
3378
|
+
#endif // HWY_HAVE_FLOAT16
|
|
3379
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
3380
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
|
|
3381
|
+
HWY_API Vec128<bfloat16_t> LoadU(D /* tag */,
|
|
3382
|
+
const bfloat16_t* HWY_RESTRICT unaligned) {
|
|
3383
|
+
return Vec128<bfloat16_t>(vld1q_bf16(detail::NativeLanePointer(unaligned)));
|
|
3384
|
+
}
|
|
3385
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
3136
3386
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
3137
3387
|
HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) {
|
|
3138
3388
|
return Vec128<float>(vld1q_f32(unaligned));
|
|
@@ -3179,6 +3429,18 @@ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
|
|
|
3179
3429
|
HWY_API Vec64<int64_t> LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) {
|
|
3180
3430
|
return Vec64<int64_t>(vld1_s64(p));
|
|
3181
3431
|
}
|
|
3432
|
+
#if HWY_HAVE_FLOAT16
|
|
3433
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
|
|
3434
|
+
HWY_API Vec64<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
|
|
3435
|
+
return Vec64<float16_t>(vld1_f16(detail::NativeLanePointer(p)));
|
|
3436
|
+
}
|
|
3437
|
+
#endif // HWY_HAVE_FLOAT16
|
|
3438
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
3439
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
3440
|
+
HWY_API Vec64<bfloat16_t> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
|
|
3441
|
+
return Vec64<bfloat16_t>(vld1_bf16(detail::NativeLanePointer(p)));
|
|
3442
|
+
}
|
|
3443
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
3182
3444
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
|
|
3183
3445
|
HWY_API Vec64<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
|
|
3184
3446
|
return Vec64<float>(vld1_f32(p));
|
|
@@ -3207,14 +3469,34 @@ HWY_API Vec32<float> LoadU(D /*tag*/, const float* HWY_RESTRICT p) {
|
|
|
3207
3469
|
return Vec32<float>(vld1_dup_f32(p));
|
|
3208
3470
|
}
|
|
3209
3471
|
|
|
3210
|
-
|
|
3211
|
-
|
|
3472
|
+
// {u,i}{8,16}
|
|
3473
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
|
|
3474
|
+
HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
|
|
3475
|
+
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
3476
|
+
const Repartition<uint32_t, decltype(d)> d32;
|
|
3477
|
+
uint32_t buf;
|
|
3478
|
+
CopyBytes<4>(p, &buf);
|
|
3479
|
+
return BitCast(d, LoadU(d32, &buf));
|
|
3480
|
+
}
|
|
3481
|
+
|
|
3482
|
+
#if HWY_HAVE_FLOAT16
|
|
3483
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
|
|
3484
|
+
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
3485
|
+
const Repartition<uint32_t, decltype(d)> d32;
|
|
3486
|
+
uint32_t buf;
|
|
3487
|
+
CopyBytes<4>(p, &buf);
|
|
3488
|
+
return BitCast(d, LoadU(d32, &buf));
|
|
3489
|
+
}
|
|
3490
|
+
#endif // HWY_HAVE_FLOAT16
|
|
3491
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
3492
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
|
|
3212
3493
|
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
3213
3494
|
const Repartition<uint32_t, decltype(d)> d32;
|
|
3214
3495
|
uint32_t buf;
|
|
3215
3496
|
CopyBytes<4>(p, &buf);
|
|
3216
3497
|
return BitCast(d, LoadU(d32, &buf));
|
|
3217
3498
|
}
|
|
3499
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
3218
3500
|
|
|
3219
3501
|
// ------------------------------ Load 16
|
|
3220
3502
|
|
|
@@ -3228,6 +3510,18 @@ template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
|
|
|
3228
3510
|
HWY_API VFromD<D> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
|
|
3229
3511
|
return VFromD<D>(vld1_dup_s16(p));
|
|
3230
3512
|
}
|
|
3513
|
+
#if HWY_HAVE_FLOAT16
|
|
3514
|
+
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_F16_D(D)>
|
|
3515
|
+
HWY_API VFromD<D> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
|
|
3516
|
+
return VFromD<D>(vld1_dup_f16(detail::NativeLanePointer(p)));
|
|
3517
|
+
}
|
|
3518
|
+
#endif // HWY_HAVE_FLOAT16
|
|
3519
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
3520
|
+
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_BF16_D(D)>
|
|
3521
|
+
HWY_API VFromD<D> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
|
|
3522
|
+
return VFromD<D>(vld1_dup_bf16(detail::NativeLanePointer(p)));
|
|
3523
|
+
}
|
|
3524
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
3231
3525
|
|
|
3232
3526
|
// 8-bit x2
|
|
3233
3527
|
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
|
|
@@ -3250,12 +3544,10 @@ HWY_API VFromD<D> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
|
|
|
3250
3544
|
|
|
3251
3545
|
// ------------------------------ Load misc
|
|
3252
3546
|
|
|
3253
|
-
|
|
3254
|
-
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
|
|
3547
|
+
template <class D, HWY_NEON_IF_EMULATED_D(D)>
|
|
3255
3548
|
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
|
|
3256
|
-
const RebindToUnsigned<decltype(d)>
|
|
3257
|
-
|
|
3258
|
-
return BitCast(d, LoadU(du16, pu16));
|
|
3549
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
3550
|
+
return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
|
|
3259
3551
|
}
|
|
3260
3552
|
|
|
3261
3553
|
// On Arm, Load is the same as LoadU.
|
|
@@ -3324,6 +3616,20 @@ HWY_API void StoreU(Vec128<int64_t> v, D /* tag */,
|
|
|
3324
3616
|
int64_t* HWY_RESTRICT unaligned) {
|
|
3325
3617
|
vst1q_s64(unaligned, v.raw);
|
|
3326
3618
|
}
|
|
3619
|
+
#if HWY_HAVE_FLOAT16
|
|
3620
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
|
|
3621
|
+
HWY_API void StoreU(Vec128<float16_t> v, D /* tag */,
|
|
3622
|
+
float16_t* HWY_RESTRICT unaligned) {
|
|
3623
|
+
vst1q_f16(detail::NativeLanePointer(unaligned), v.raw);
|
|
3624
|
+
}
|
|
3625
|
+
#endif // HWY_HAVE_FLOAT16
|
|
3626
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
3627
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
|
|
3628
|
+
HWY_API void StoreU(Vec128<bfloat16_t> v, D /* tag */,
|
|
3629
|
+
bfloat16_t* HWY_RESTRICT unaligned) {
|
|
3630
|
+
vst1q_bf16(detail::NativeLanePointer(unaligned), v.raw);
|
|
3631
|
+
}
|
|
3632
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
3327
3633
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
3328
3634
|
HWY_API void StoreU(Vec128<float> v, D /* tag */,
|
|
3329
3635
|
float* HWY_RESTRICT unaligned) {
|
|
@@ -3371,6 +3677,20 @@ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
|
|
|
3371
3677
|
HWY_API void StoreU(Vec64<int64_t> v, D /* tag */, int64_t* HWY_RESTRICT p) {
|
|
3372
3678
|
vst1_s64(p, v.raw);
|
|
3373
3679
|
}
|
|
3680
|
+
#if HWY_HAVE_FLOAT16
|
|
3681
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
|
|
3682
|
+
HWY_API void StoreU(Vec64<float16_t> v, D /* tag */,
|
|
3683
|
+
float16_t* HWY_RESTRICT p) {
|
|
3684
|
+
vst1_f16(detail::NativeLanePointer(p), v.raw);
|
|
3685
|
+
}
|
|
3686
|
+
#endif // HWY_HAVE_FLOAT16
|
|
3687
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
3688
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
3689
|
+
HWY_API void StoreU(Vec64<bfloat16_t> v, D /* tag */,
|
|
3690
|
+
bfloat16_t* HWY_RESTRICT p) {
|
|
3691
|
+
vst1_bf16(detail::NativeLanePointer(p), v.raw);
|
|
3692
|
+
}
|
|
3693
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
3374
3694
|
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
|
|
3375
3695
|
HWY_API void StoreU(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
|
|
3376
3696
|
vst1_f32(p, v.raw);
|
|
@@ -3397,28 +3717,31 @@ HWY_API void StoreU(Vec32<float> v, D, float* HWY_RESTRICT p) {
|
|
|
3397
3717
|
vst1_lane_f32(p, v.raw, 0);
|
|
3398
3718
|
}
|
|
3399
3719
|
|
|
3400
|
-
//
|
|
3401
|
-
template <class D, HWY_IF_V_SIZE_D(D, 4),
|
|
3402
|
-
|
|
3403
|
-
HWY_API void StoreU(
|
|
3720
|
+
// {u,i}{8,16}
|
|
3721
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
|
|
3722
|
+
HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
|
|
3723
|
+
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
3404
3724
|
Repartition<uint32_t, decltype(d)> d32;
|
|
3405
3725
|
uint32_t buf = GetLane(BitCast(d32, v));
|
|
3406
3726
|
CopyBytes<4>(&buf, p);
|
|
3407
3727
|
}
|
|
3408
3728
|
|
|
3409
|
-
|
|
3410
|
-
|
|
3729
|
+
#if HWY_HAVE_FLOAT16
|
|
3730
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
|
|
3731
|
+
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
3411
3732
|
Repartition<uint32_t, decltype(d)> d32;
|
|
3412
3733
|
uint32_t buf = GetLane(BitCast(d32, v));
|
|
3413
3734
|
CopyBytes<4>(&buf, p);
|
|
3414
3735
|
}
|
|
3415
|
-
|
|
3416
|
-
|
|
3417
|
-
|
|
3736
|
+
#endif
|
|
3737
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
3738
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
|
|
3739
|
+
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
3418
3740
|
Repartition<uint32_t, decltype(d)> d32;
|
|
3419
3741
|
uint32_t buf = GetLane(BitCast(d32, v));
|
|
3420
3742
|
CopyBytes<4>(&buf, p);
|
|
3421
3743
|
}
|
|
3744
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
3422
3745
|
|
|
3423
3746
|
// ------------------------------ Store 16
|
|
3424
3747
|
|
|
@@ -3430,6 +3753,18 @@ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)>
|
|
|
3430
3753
|
HWY_API void StoreU(Vec16<int16_t> v, D, int16_t* HWY_RESTRICT p) {
|
|
3431
3754
|
vst1_lane_s16(p, v.raw, 0);
|
|
3432
3755
|
}
|
|
3756
|
+
#if HWY_HAVE_FLOAT16
|
|
3757
|
+
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_F16_D(D)>
|
|
3758
|
+
HWY_API void StoreU(Vec16<float16_t> v, D, float16_t* HWY_RESTRICT p) {
|
|
3759
|
+
vst1_lane_f16(detail::NativeLanePointer(p), v.raw, 0);
|
|
3760
|
+
}
|
|
3761
|
+
#endif // HWY_HAVE_FLOAT16
|
|
3762
|
+
#if HWY_NEON_HAVE_BFLOAT16
|
|
3763
|
+
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_BF16_D(D)>
|
|
3764
|
+
HWY_API void StoreU(Vec16<bfloat16_t> v, D, bfloat16_t* HWY_RESTRICT p) {
|
|
3765
|
+
vst1_lane_bf16(detail::NativeLanePointer(p), v.raw, 0);
|
|
3766
|
+
}
|
|
3767
|
+
#endif // HWY_NEON_HAVE_BFLOAT16
|
|
3433
3768
|
|
|
3434
3769
|
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
|
|
3435
3770
|
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
@@ -3449,12 +3784,12 @@ HWY_API void StoreU(Vec128<int8_t, 1> v, D, int8_t* HWY_RESTRICT p) {
|
|
|
3449
3784
|
vst1_lane_s8(p, v.raw, 0);
|
|
3450
3785
|
}
|
|
3451
3786
|
|
|
3452
|
-
//
|
|
3453
|
-
|
|
3787
|
+
// ------------------------------ Store misc
|
|
3788
|
+
|
|
3789
|
+
template <class D, HWY_NEON_IF_EMULATED_D(D)>
|
|
3454
3790
|
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
3455
|
-
const RebindToUnsigned<decltype(d)>
|
|
3456
|
-
|
|
3457
|
-
return StoreU(BitCast(du16, v), du16, pu16);
|
|
3791
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
3792
|
+
return StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
|
|
3458
3793
|
}
|
|
3459
3794
|
|
|
3460
3795
|
HWY_DIAGNOSTICS(push)
|
|
@@ -3541,24 +3876,6 @@ HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToUnsigned<D>> v) {
|
|
|
3541
3876
|
return VFromD<D>(vcvt_f32_u32(v.raw));
|
|
3542
3877
|
}
|
|
3543
3878
|
|
|
3544
|
-
// Truncates (rounds toward zero).
|
|
3545
|
-
template <class D, HWY_IF_I32_D(D)>
|
|
3546
|
-
HWY_API Vec128<int32_t> ConvertTo(D /* tag */, Vec128<float> v) {
|
|
3547
|
-
return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
|
|
3548
|
-
}
|
|
3549
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
|
|
3550
|
-
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
|
|
3551
|
-
return VFromD<D>(vcvt_s32_f32(v.raw));
|
|
3552
|
-
}
|
|
3553
|
-
template <class D, HWY_IF_U32_D(D)>
|
|
3554
|
-
HWY_API Vec128<uint32_t> ConvertTo(D /* tag */, Vec128<float> v) {
|
|
3555
|
-
return Vec128<uint32_t>(vcvtq_u32_f32(ZeroIfNegative(v).raw));
|
|
3556
|
-
}
|
|
3557
|
-
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
|
|
3558
|
-
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
|
|
3559
|
-
return VFromD<D>(vcvt_u32_f32(ZeroIfNegative(v).raw));
|
|
3560
|
-
}
|
|
3561
|
-
|
|
3562
3879
|
#if HWY_HAVE_FLOAT64
|
|
3563
3880
|
|
|
3564
3881
|
template <class D, HWY_IF_F64_D(D)>
|
|
@@ -3590,38 +3907,156 @@ HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
|
|
|
3590
3907
|
#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
|
|
3591
3908
|
}
|
|
3592
3909
|
|
|
3910
|
+
#endif // HWY_HAVE_FLOAT64
|
|
3911
|
+
|
|
3912
|
+
namespace detail {
|
|
3593
3913
|
// Truncates (rounds toward zero).
|
|
3594
|
-
template <class D,
|
|
3595
|
-
|
|
3914
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
|
|
3915
|
+
HWY_INLINE Vec128<int32_t> ConvertFToI(D /* tag */, Vec128<float> v) {
|
|
3916
|
+
#if HWY_COMPILER_CLANG && \
|
|
3917
|
+
((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
|
|
3918
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
|
|
3919
|
+
// Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
|
|
3920
|
+
// outside of the range of an int32_t.
|
|
3921
|
+
|
|
3922
|
+
int32x4_t raw_result;
|
|
3923
|
+
__asm__(
|
|
3924
|
+
#if HWY_ARCH_ARM_A64
|
|
3925
|
+
"fcvtzs %0.4s, %1.4s"
|
|
3926
|
+
#else
|
|
3927
|
+
"vcvt.s32.f32 %0, %1"
|
|
3928
|
+
#endif
|
|
3929
|
+
: "=w"(raw_result)
|
|
3930
|
+
: "w"(v.raw));
|
|
3931
|
+
return Vec128<int32_t>(raw_result);
|
|
3932
|
+
#else
|
|
3933
|
+
return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
|
|
3934
|
+
#endif
|
|
3935
|
+
}
|
|
3936
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
|
|
3937
|
+
HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<RebindToFloat<D>> v) {
|
|
3938
|
+
#if HWY_COMPILER_CLANG && \
|
|
3939
|
+
((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
|
|
3940
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
|
|
3941
|
+
// Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
|
|
3942
|
+
// outside of the range of an int32_t.
|
|
3943
|
+
|
|
3944
|
+
int32x2_t raw_result;
|
|
3945
|
+
__asm__(
|
|
3946
|
+
#if HWY_ARCH_ARM_A64
|
|
3947
|
+
"fcvtzs %0.2s, %1.2s"
|
|
3948
|
+
#else
|
|
3949
|
+
"vcvt.s32.f32 %0, %1"
|
|
3950
|
+
#endif
|
|
3951
|
+
: "=w"(raw_result)
|
|
3952
|
+
: "w"(v.raw));
|
|
3953
|
+
return VFromD<D>(raw_result);
|
|
3954
|
+
#else
|
|
3955
|
+
return VFromD<D>(vcvt_s32_f32(v.raw));
|
|
3956
|
+
#endif
|
|
3957
|
+
}
|
|
3958
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
|
|
3959
|
+
HWY_INLINE Vec128<uint32_t> ConvertFToU(D /* tag */, Vec128<float> v) {
|
|
3960
|
+
#if HWY_COMPILER_CLANG && \
|
|
3961
|
+
((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
|
|
3962
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
|
|
3963
|
+
// Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
|
|
3964
|
+
// outside of the range of an uint32_t.
|
|
3965
|
+
|
|
3966
|
+
uint32x4_t raw_result;
|
|
3967
|
+
__asm__(
|
|
3968
|
+
#if HWY_ARCH_ARM_A64
|
|
3969
|
+
"fcvtzu %0.4s, %1.4s"
|
|
3970
|
+
#else
|
|
3971
|
+
"vcvt.u32.f32 %0, %1"
|
|
3972
|
+
#endif
|
|
3973
|
+
: "=w"(raw_result)
|
|
3974
|
+
: "w"(v.raw));
|
|
3975
|
+
return Vec128<uint32_t>(raw_result);
|
|
3976
|
+
#else
|
|
3977
|
+
return Vec128<uint32_t>(vcvtq_u32_f32(v.raw));
|
|
3978
|
+
#endif
|
|
3979
|
+
}
|
|
3980
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
|
|
3981
|
+
HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<RebindToFloat<D>> v) {
|
|
3982
|
+
#if HWY_COMPILER_CLANG && \
|
|
3983
|
+
((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
|
|
3984
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
|
|
3985
|
+
// Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
|
|
3986
|
+
// outside of the range of an uint32_t.
|
|
3987
|
+
|
|
3988
|
+
uint32x2_t raw_result;
|
|
3989
|
+
__asm__(
|
|
3990
|
+
#if HWY_ARCH_ARM_A64
|
|
3991
|
+
"fcvtzu %0.2s, %1.2s"
|
|
3992
|
+
#else
|
|
3993
|
+
"vcvt.u32.f32 %0, %1"
|
|
3994
|
+
#endif
|
|
3995
|
+
: "=w"(raw_result)
|
|
3996
|
+
: "w"(v.raw));
|
|
3997
|
+
return VFromD<D>(raw_result);
|
|
3998
|
+
#else
|
|
3999
|
+
return VFromD<D>(vcvt_u32_f32(v.raw));
|
|
4000
|
+
#endif
|
|
4001
|
+
}
|
|
4002
|
+
|
|
4003
|
+
#if HWY_HAVE_FLOAT64
|
|
4004
|
+
|
|
4005
|
+
// Truncates (rounds toward zero).
|
|
4006
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
|
|
4007
|
+
HWY_INLINE Vec128<int64_t> ConvertFToI(D /* tag */, Vec128<double> v) {
|
|
4008
|
+
#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
|
|
4009
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
|
|
4010
|
+
// to avoid undefined behavior if v[i] is outside of the range of an int64_t.
|
|
4011
|
+
int64x2_t raw_result;
|
|
4012
|
+
__asm__("fcvtzs %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
|
|
4013
|
+
return Vec128<int64_t>(raw_result);
|
|
4014
|
+
#else
|
|
3596
4015
|
return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
|
|
4016
|
+
#endif
|
|
3597
4017
|
}
|
|
3598
|
-
template <class D, HWY_IF_I64_D(D)>
|
|
3599
|
-
|
|
3600
|
-
|
|
3601
|
-
|
|
3602
|
-
|
|
3603
|
-
|
|
3604
|
-
|
|
3605
|
-
|
|
4018
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
|
|
4019
|
+
HWY_INLINE Vec64<int64_t> ConvertFToI(D /* tag */, Vec64<double> v) {
|
|
4020
|
+
#if HWY_ARCH_ARM_A64 && \
|
|
4021
|
+
((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
|
|
4022
|
+
(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
|
|
4023
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
|
|
4024
|
+
// to avoid undefined behavior if v[i] is outside of the range of an int64_t.
|
|
4025
|
+
// If compiling for AArch64 NEON with GCC 6 or earlier, use inline assembly to
|
|
4026
|
+
// work around the missing vcvt_s64_f64 intrinsic.
|
|
4027
|
+
int64x1_t raw_result;
|
|
4028
|
+
__asm__("fcvtzs %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
|
|
4029
|
+
return Vec64<int64_t>(raw_result);
|
|
3606
4030
|
#else
|
|
3607
|
-
(void)di;
|
|
3608
4031
|
return Vec64<int64_t>(vcvt_s64_f64(v.raw));
|
|
3609
4032
|
#endif
|
|
3610
4033
|
}
|
|
3611
|
-
template <class D, HWY_IF_U64_D(D)>
|
|
3612
|
-
|
|
4034
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
|
|
4035
|
+
HWY_INLINE Vec128<uint64_t> ConvertFToU(D /* tag */, Vec128<double> v) {
|
|
4036
|
+
#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
|
|
4037
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
|
|
4038
|
+
// to avoid undefined behavior if v[i] is outside of the range of an uint64_t.
|
|
4039
|
+
uint64x2_t raw_result;
|
|
4040
|
+
__asm__("fcvtzu %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
|
|
4041
|
+
return Vec128<uint64_t>(raw_result);
|
|
4042
|
+
#else
|
|
3613
4043
|
return Vec128<uint64_t>(vcvtq_u64_f64(v.raw));
|
|
4044
|
+
#endif
|
|
3614
4045
|
}
|
|
3615
|
-
template <class D, HWY_IF_U64_D(D)>
|
|
3616
|
-
|
|
3617
|
-
|
|
3618
|
-
|
|
3619
|
-
|
|
3620
|
-
|
|
3621
|
-
|
|
3622
|
-
|
|
4046
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
|
|
4047
|
+
HWY_INLINE Vec64<uint64_t> ConvertFToU(D /* tag */, Vec64<double> v) {
|
|
4048
|
+
#if HWY_ARCH_ARM_A64 && \
|
|
4049
|
+
((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
|
|
4050
|
+
(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
|
|
4051
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
|
|
4052
|
+
// to avoid undefined behavior if v[i] is outside of the range of an uint64_t.
|
|
4053
|
+
|
|
4054
|
+
// Inline assembly is also used if compiling for AArch64 NEON with GCC 6 or
|
|
4055
|
+
// earlier to work around the issue of the missing vcvt_u64_f64 intrinsic.
|
|
4056
|
+
uint64x1_t raw_result;
|
|
4057
|
+
__asm__("fcvtzu %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
|
|
4058
|
+
return Vec64<uint64_t>(raw_result);
|
|
3623
4059
|
#else
|
|
3624
|
-
(void)du;
|
|
3625
4060
|
return Vec64<uint64_t>(vcvt_u64_f64(v.raw));
|
|
3626
4061
|
#endif
|
|
3627
4062
|
}
|
|
@@ -3631,25 +4066,76 @@ HWY_API Vec64<uint64_t> ConvertTo(D du, Vec64<double> v) {
|
|
|
3631
4066
|
#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
|
|
3632
4067
|
|
|
3633
4068
|
// Truncates (rounds toward zero).
|
|
3634
|
-
template <class D, HWY_IF_I16_D(D)>
|
|
3635
|
-
|
|
4069
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
|
|
4070
|
+
HWY_INLINE Vec128<int16_t> ConvertFToI(D /* tag */, Vec128<float16_t> v) {
|
|
4071
|
+
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
|
|
4072
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
|
|
4073
|
+
// to avoid undefined behavior if v[i] is outside of the range of an int16_t.
|
|
4074
|
+
int16x8_t raw_result;
|
|
4075
|
+
__asm__("fcvtzs %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
|
|
4076
|
+
return Vec128<int16_t>(raw_result);
|
|
4077
|
+
#else
|
|
3636
4078
|
return Vec128<int16_t>(vcvtq_s16_f16(v.raw));
|
|
4079
|
+
#endif
|
|
3637
4080
|
}
|
|
3638
4081
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
|
|
3639
|
-
|
|
4082
|
+
HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
|
|
4083
|
+
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
|
|
4084
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
|
|
4085
|
+
// to avoid undefined behavior if v[i] is outside of the range of an int16_t.
|
|
4086
|
+
int16x4_t raw_result;
|
|
4087
|
+
__asm__("fcvtzs %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
|
|
4088
|
+
return VFromD<D>(raw_result);
|
|
4089
|
+
#else
|
|
3640
4090
|
return VFromD<D>(vcvt_s16_f16(v.raw));
|
|
4091
|
+
#endif
|
|
3641
4092
|
}
|
|
3642
4093
|
|
|
3643
|
-
template <class D, HWY_IF_U16_D(D)>
|
|
3644
|
-
|
|
4094
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
|
|
4095
|
+
HWY_INLINE Vec128<uint16_t> ConvertFToU(D /* tag */, Vec128<float16_t> v) {
|
|
4096
|
+
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
|
|
4097
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
|
|
4098
|
+
// to avoid undefined behavior if v[i] is outside of the range of an uint16_t.
|
|
4099
|
+
uint16x8_t raw_result;
|
|
4100
|
+
__asm__("fcvtzu %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
|
|
4101
|
+
return Vec128<uint16_t>(raw_result);
|
|
4102
|
+
#else
|
|
3645
4103
|
return Vec128<uint16_t>(vcvtq_u16_f16(v.raw));
|
|
4104
|
+
#endif
|
|
3646
4105
|
}
|
|
3647
4106
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
|
|
3648
|
-
|
|
4107
|
+
HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
|
|
4108
|
+
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
|
|
4109
|
+
// If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
|
|
4110
|
+
// to avoid undefined behavior if v[i] is outside of the range of an uint16_t.
|
|
4111
|
+
uint16x4_t raw_result;
|
|
4112
|
+
__asm__("fcvtzu %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
|
|
4113
|
+
return VFromD<D>(raw_result);
|
|
4114
|
+
#else
|
|
3649
4115
|
return VFromD<D>(vcvt_u16_f16(v.raw));
|
|
4116
|
+
#endif
|
|
3650
4117
|
}
|
|
3651
4118
|
|
|
3652
4119
|
#endif // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
|
|
4120
|
+
} // namespace detail
|
|
4121
|
+
|
|
4122
|
+
template <class D, HWY_IF_SIGNED_D(D),
|
|
4123
|
+
HWY_IF_T_SIZE_ONE_OF_D(
|
|
4124
|
+
D, (1 << 4) |
|
|
4125
|
+
((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
|
|
4126
|
+
(HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
|
|
4127
|
+
HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
|
|
4128
|
+
return detail::ConvertFToI(di, v);
|
|
4129
|
+
}
|
|
4130
|
+
|
|
4131
|
+
template <class D, HWY_IF_UNSIGNED_D(D),
|
|
4132
|
+
HWY_IF_T_SIZE_ONE_OF_D(
|
|
4133
|
+
D, (1 << 4) |
|
|
4134
|
+
((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
|
|
4135
|
+
(HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
|
|
4136
|
+
HWY_API VFromD<D> ConvertTo(D du, VFromD<RebindToFloat<D>> v) {
|
|
4137
|
+
return detail::ConvertFToU(du, v);
|
|
4138
|
+
}
|
|
3653
4139
|
|
|
3654
4140
|
// ------------------------------ PromoteTo (ConvertTo)
|
|
3655
4141
|
|
|
@@ -3782,7 +4268,7 @@ HWY_API VFromD<D> PromoteTo(D d, V v) {
|
|
|
3782
4268
|
return PromoteTo(d, PromoteTo(di32, v));
|
|
3783
4269
|
}
|
|
3784
4270
|
|
|
3785
|
-
#if
|
|
4271
|
+
#if HWY_NEON_HAVE_F16C
|
|
3786
4272
|
|
|
3787
4273
|
// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
|
|
3788
4274
|
#ifdef HWY_NATIVE_F16C
|
|
@@ -3800,7 +4286,7 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
|
|
|
3800
4286
|
return VFromD<D>(vget_low_f32(vcvt_f32_f16(v.raw)));
|
|
3801
4287
|
}
|
|
3802
4288
|
|
|
3803
|
-
#endif //
|
|
4289
|
+
#endif // HWY_NEON_HAVE_F16C
|
|
3804
4290
|
|
|
3805
4291
|
#if HWY_HAVE_FLOAT64
|
|
3806
4292
|
|
|
@@ -3946,14 +4432,14 @@ HWY_API Vec128<int64_t> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
|
|
|
3946
4432
|
return Vec128<int64_t>(vmovl_high_s32(v.raw));
|
|
3947
4433
|
}
|
|
3948
4434
|
|
|
3949
|
-
#if
|
|
4435
|
+
#if HWY_NEON_HAVE_F16C
|
|
3950
4436
|
|
|
3951
4437
|
template <class D, HWY_IF_F32_D(D)>
|
|
3952
4438
|
HWY_API Vec128<float> PromoteUpperTo(D /* tag */, Vec128<float16_t> v) {
|
|
3953
4439
|
return Vec128<float>(vcvt_high_f32_f16(v.raw));
|
|
3954
4440
|
}
|
|
3955
4441
|
|
|
3956
|
-
#endif //
|
|
4442
|
+
#endif // HWY_NEON_HAVE_F16C
|
|
3957
4443
|
|
|
3958
4444
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
3959
4445
|
HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) {
|
|
@@ -4149,7 +4635,7 @@ HWY_API VFromD<D> DemoteTo(D d, Vec64<uint64_t> v) {
|
|
|
4149
4635
|
return DemoteTo(d, DemoteTo(du32, v));
|
|
4150
4636
|
}
|
|
4151
4637
|
|
|
4152
|
-
#if
|
|
4638
|
+
#if HWY_NEON_HAVE_F16C
|
|
4153
4639
|
|
|
4154
4640
|
// We already toggled HWY_NATIVE_F16C above.
|
|
4155
4641
|
|
|
@@ -4162,7 +4648,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
|
4162
4648
|
return VFromD<D>(vcvt_f16_f32(vcombine_f32(v.raw, v.raw)));
|
|
4163
4649
|
}
|
|
4164
4650
|
|
|
4165
|
-
#endif //
|
|
4651
|
+
#endif // HWY_NEON_HAVE_F16C
|
|
4166
4652
|
|
|
4167
4653
|
template <class D, HWY_IF_BF16_D(D)>
|
|
4168
4654
|
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
|
|
@@ -4184,32 +4670,10 @@ HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
|
|
|
4184
4670
|
return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
|
|
4185
4671
|
}
|
|
4186
4672
|
|
|
4187
|
-
template <class D,
|
|
4188
|
-
HWY_API
|
|
4189
|
-
const
|
|
4190
|
-
return
|
|
4191
|
-
}
|
|
4192
|
-
template <class D, HWY_IF_I32_D(D)>
|
|
4193
|
-
HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<double> v) {
|
|
4194
|
-
// There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the
|
|
4195
|
-
// f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4.
|
|
4196
|
-
const Full128<double> ddt;
|
|
4197
|
-
const Full128<int64_t> dit;
|
|
4198
|
-
return Vec32<int32_t>(vqmovn_s64(ConvertTo(dit, Combine(ddt, v, v)).raw));
|
|
4199
|
-
}
|
|
4200
|
-
|
|
4201
|
-
template <class D, HWY_IF_U32_D(D)>
|
|
4202
|
-
HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<double> v) {
|
|
4203
|
-
const uint64x2_t u64 = vcvtq_u64_f64(v.raw);
|
|
4204
|
-
return Vec64<uint32_t>(vqmovn_u64(u64));
|
|
4205
|
-
}
|
|
4206
|
-
template <class D, HWY_IF_U32_D(D)>
|
|
4207
|
-
HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<double> v) {
|
|
4208
|
-
// There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the
|
|
4209
|
-
// f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4.
|
|
4210
|
-
const Full128<double> ddt;
|
|
4211
|
-
const Full128<uint64_t> du_t;
|
|
4212
|
-
return Vec32<uint32_t>(vqmovn_u64(ConvertTo(du_t, Combine(ddt, v, v)).raw));
|
|
4673
|
+
template <class D, HWY_IF_UI32_D(D)>
|
|
4674
|
+
HWY_API VFromD<D> DemoteTo(D d32, VFromD<Rebind<double, D>> v) {
|
|
4675
|
+
const Rebind<MakeWide<TFromD<D>>, D> d64;
|
|
4676
|
+
return DemoteTo(d32, ConvertTo(d64, v));
|
|
4213
4677
|
}
|
|
4214
4678
|
|
|
4215
4679
|
#endif // HWY_HAVE_FLOAT64
|
|
@@ -4466,30 +4930,6 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
|
|
|
4466
4930
|
return v != v;
|
|
4467
4931
|
}
|
|
4468
4932
|
|
|
4469
|
-
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
4470
|
-
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
|
|
4471
|
-
const DFromV<decltype(v)> d;
|
|
4472
|
-
const RebindToSigned<decltype(d)> di;
|
|
4473
|
-
const VFromD<decltype(di)> vi = BitCast(di, v);
|
|
4474
|
-
// 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
|
|
4475
|
-
return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
|
|
4476
|
-
}
|
|
4477
|
-
|
|
4478
|
-
// Returns whether normal/subnormal/zero.
|
|
4479
|
-
template <typename T, size_t N, HWY_IF_FLOAT(T)>
|
|
4480
|
-
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
|
|
4481
|
-
const DFromV<decltype(v)> d;
|
|
4482
|
-
const RebindToUnsigned<decltype(d)> du;
|
|
4483
|
-
const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
|
|
4484
|
-
const VFromD<decltype(du)> vu = BitCast(du, v);
|
|
4485
|
-
// 'Shift left' to clear the sign bit, then right so we can compare with the
|
|
4486
|
-
// max exponent (cannot compare with MaxExponentTimes2 directly because it is
|
|
4487
|
-
// negative and non-negative floats would be greater).
|
|
4488
|
-
const VFromD<decltype(di)> exp =
|
|
4489
|
-
BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
|
|
4490
|
-
return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
|
|
4491
|
-
}
|
|
4492
|
-
|
|
4493
4933
|
// ================================================== SWIZZLE
|
|
4494
4934
|
|
|
4495
4935
|
// ------------------------------ LowerHalf
|
|
@@ -4749,7 +5189,7 @@ HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
|
|
|
4749
5189
|
}
|
|
4750
5190
|
#endif // HWY_HAVE_FLOAT64
|
|
4751
5191
|
|
|
4752
|
-
template <class D,
|
|
5192
|
+
template <class D, HWY_NEON_IF_EMULATED_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
|
|
4753
5193
|
HWY_API VFromD<D> UpperHalf(D dh, VFromD<Twice<D>> v) {
|
|
4754
5194
|
const RebindToUnsigned<Twice<decltype(dh)>> du;
|
|
4755
5195
|
const Half<decltype(du)> duh;
|
|
@@ -5393,6 +5833,16 @@ HWY_API Vec128<T> InterleaveLower(Vec128<T> a, Vec128<T> b) {
|
|
|
5393
5833
|
}
|
|
5394
5834
|
#endif
|
|
5395
5835
|
|
|
5836
|
+
#if !HWY_HAVE_FLOAT16
|
|
5837
|
+
template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 4)>
|
|
5838
|
+
HWY_API Vec128<float16_t, N> InterleaveLower(Vec128<float16_t, N> a,
|
|
5839
|
+
Vec128<float16_t, N> b) {
|
|
5840
|
+
const DFromV<decltype(a)> d;
|
|
5841
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
5842
|
+
return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
|
|
5843
|
+
}
|
|
5844
|
+
#endif // !HWY_HAVE_FLOAT16
|
|
5845
|
+
|
|
5396
5846
|
// < 64 bit parts
|
|
5397
5847
|
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
|
|
5398
5848
|
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
|
|
@@ -6266,6 +6716,23 @@ namespace detail {
|
|
|
6266
6716
|
// There is no vuzpq_u64.
|
|
6267
6717
|
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatEven, vuzp1, _, 2)
|
|
6268
6718
|
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatOdd, vuzp2, _, 2)
|
|
6719
|
+
|
|
6720
|
+
#if !HWY_HAVE_FLOAT16
|
|
6721
|
+
template <size_t N>
|
|
6722
|
+
HWY_INLINE Vec128<float16_t, N> ConcatEven(Vec128<float16_t, N> hi,
|
|
6723
|
+
Vec128<float16_t, N> lo) {
|
|
6724
|
+
const DFromV<decltype(hi)> d;
|
|
6725
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6726
|
+
return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo)));
|
|
6727
|
+
}
|
|
6728
|
+
template <size_t N>
|
|
6729
|
+
HWY_INLINE Vec128<float16_t, N> ConcatOdd(Vec128<float16_t, N> hi,
|
|
6730
|
+
Vec128<float16_t, N> lo) {
|
|
6731
|
+
const DFromV<decltype(hi)> d;
|
|
6732
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
6733
|
+
return BitCast(d, ConcatOdd(BitCast(du, hi), BitCast(du, lo)));
|
|
6734
|
+
}
|
|
6735
|
+
#endif // !HWY_HAVE_FLOAT16
|
|
6269
6736
|
} // namespace detail
|
|
6270
6737
|
|
|
6271
6738
|
// Full/half vector
|
|
@@ -7045,44 +7512,19 @@ HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
|
|
|
7045
7512
|
|
|
7046
7513
|
// ------------------------------ Reductions
|
|
7047
7514
|
|
|
7048
|
-
|
|
7049
|
-
|
|
7050
|
-
// N=1 for any T: no-op
|
|
7051
|
-
template <typename T>
|
|
7052
|
-
HWY_INLINE T ReduceMin(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
|
|
7053
|
-
return GetLane(v);
|
|
7054
|
-
}
|
|
7055
|
-
template <typename T>
|
|
7056
|
-
HWY_INLINE T ReduceMax(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
|
|
7057
|
-
return GetLane(v);
|
|
7058
|
-
}
|
|
7059
|
-
template <typename T>
|
|
7060
|
-
HWY_INLINE T ReduceSum(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
|
|
7061
|
-
return GetLane(v);
|
|
7062
|
-
}
|
|
7063
|
-
template <typename T>
|
|
7064
|
-
HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
7065
|
-
Vec128<T, 1> v) {
|
|
7066
|
-
return v;
|
|
7067
|
-
}
|
|
7068
|
-
template <typename T>
|
|
7069
|
-
HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
7070
|
-
Vec128<T, 1> v) {
|
|
7071
|
-
return v;
|
|
7072
|
-
}
|
|
7073
|
-
template <typename T>
|
|
7074
|
-
HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
|
7075
|
-
Vec128<T, 1> v) {
|
|
7076
|
-
return v;
|
|
7077
|
-
}
|
|
7078
|
-
|
|
7079
|
-
// full vectors
|
|
7515
|
+
// On Armv8 we define ReduceSum and generic_ops defines SumOfLanes via Set.
|
|
7080
7516
|
#if HWY_ARCH_ARM_A64
|
|
7081
7517
|
|
|
7518
|
+
#ifdef HWY_NATIVE_REDUCE_SCALAR
|
|
7519
|
+
#undef HWY_NATIVE_REDUCE_SCALAR
|
|
7520
|
+
#else
|
|
7521
|
+
#define HWY_NATIVE_REDUCE_SCALAR
|
|
7522
|
+
#endif
|
|
7523
|
+
|
|
7082
7524
|
// TODO(janwas): use normal HWY_NEON_DEF, then FULL type list.
|
|
7083
7525
|
#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \
|
|
7084
|
-
|
|
7085
|
-
|
|
7526
|
+
template <class D, HWY_IF_LANES_D(D, size)> \
|
|
7527
|
+
HWY_API type##_t name(D /* tag */, Vec128<type##_t, size> v) { \
|
|
7086
7528
|
return HWY_NEON_EVAL(prefix##infix##suffix, v.raw); \
|
|
7087
7529
|
}
|
|
7088
7530
|
|
|
@@ -7125,83 +7567,110 @@ HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv)
|
|
|
7125
7567
|
HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv)
|
|
7126
7568
|
HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv)
|
|
7127
7569
|
|
|
7570
|
+
// Emulate missing UI64 and partial N=2.
|
|
7571
|
+
template <class D, HWY_IF_LANES_D(D, 2),
|
|
7572
|
+
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
|
|
7573
|
+
HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v10) {
|
|
7574
|
+
return GetLane(v10) + ExtractLane(v10, 1);
|
|
7575
|
+
}
|
|
7576
|
+
|
|
7577
|
+
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
|
|
7578
|
+
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
|
|
7579
|
+
HWY_API TFromD<D> ReduceMin(D /* tag */, VFromD<D> v10) {
|
|
7580
|
+
return HWY_MIN(GetLane(v10), ExtractLane(v10, 1));
|
|
7581
|
+
}
|
|
7582
|
+
|
|
7583
|
+
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
|
|
7584
|
+
HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
|
|
7585
|
+
HWY_API TFromD<D> ReduceMax(D /* tag */, VFromD<D> v10) {
|
|
7586
|
+
return HWY_MAX(GetLane(v10), ExtractLane(v10, 1));
|
|
7587
|
+
}
|
|
7588
|
+
|
|
7128
7589
|
#if HWY_HAVE_FLOAT16
|
|
7129
|
-
|
|
7590
|
+
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
|
|
7591
|
+
HWY_API float16_t ReduceMin(D d, VFromD<D> v10) {
|
|
7592
|
+
return GetLane(Min(v10, Reverse2(d, v10)));
|
|
7593
|
+
}
|
|
7594
|
+
|
|
7595
|
+
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
|
|
7596
|
+
HWY_API float16_t ReduceMax(D d, VFromD<D> v10) {
|
|
7597
|
+
return GetLane(Max(v10, Reverse2(d, v10)));
|
|
7598
|
+
}
|
|
7599
|
+
|
|
7600
|
+
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 8)>
|
|
7601
|
+
HWY_API float16_t ReduceSum(D /* tag */, VFromD<D> v) {
|
|
7130
7602
|
const float16x4_t x2 = vpadd_f16(v.raw, v.raw);
|
|
7131
|
-
return GetLane(
|
|
7603
|
+
return GetLane(VFromD<D>(vpadd_f16(x2, x2)));
|
|
7132
7604
|
}
|
|
7133
|
-
|
|
7134
|
-
|
|
7605
|
+
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
|
|
7606
|
+
HWY_API float16_t ReduceSum(D d, VFromD<D> v) {
|
|
7607
|
+
const Half<decltype(d)> dh;
|
|
7608
|
+
return ReduceSum(dh, LowerHalf(dh, VFromD<D>(vpaddq_f16(v.raw, v.raw))));
|
|
7135
7609
|
}
|
|
7136
|
-
#endif
|
|
7610
|
+
#endif // HWY_HAVE_FLOAT16
|
|
7137
7611
|
|
|
7138
7612
|
#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
|
|
7139
7613
|
#undef HWY_NEON_DEF_REDUCTION_F16
|
|
7140
7614
|
#undef HWY_NEON_DEF_REDUCTION_UI64
|
|
7141
7615
|
#undef HWY_NEON_DEF_REDUCTION
|
|
7142
7616
|
|
|
7143
|
-
//
|
|
7144
|
-
#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2)
|
|
7145
|
-
#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, (1 << 8) | (1 << 2))
|
|
7617
|
+
// ------------------------------ SumOfLanes
|
|
7146
7618
|
|
|
7147
|
-
|
|
7148
|
-
|
|
7149
|
-
|
|
7150
|
-
return Set(DFromV<decltype(v)>(), ReduceMin(tag, v));
|
|
7619
|
+
template <class D, HWY_IF_LANES_GT_D(D, 1)>
|
|
7620
|
+
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
|
|
7621
|
+
return Set(d, ReduceSum(d, v));
|
|
7151
7622
|
}
|
|
7152
|
-
template <
|
|
7153
|
-
HWY_API
|
|
7154
|
-
return Set(
|
|
7623
|
+
template <class D, HWY_IF_LANES_GT_D(D, 1)>
|
|
7624
|
+
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
|
|
7625
|
+
return Set(d, ReduceMin(d, v));
|
|
7155
7626
|
}
|
|
7156
|
-
template <
|
|
7157
|
-
HWY_API
|
|
7158
|
-
return Set(
|
|
7627
|
+
template <class D, HWY_IF_LANES_GT_D(D, 1)>
|
|
7628
|
+
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
|
|
7629
|
+
return Set(d, ReduceMax(d, v));
|
|
7159
7630
|
}
|
|
7160
7631
|
|
|
7161
|
-
|
|
7632
|
+
// On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane.
|
|
7633
|
+
#else // !HWY_ARCH_ARM_A64
|
|
7634
|
+
|
|
7635
|
+
// Armv7 lacks N=2 and 8-bit x4, so enable generic versions of those.
|
|
7636
|
+
#undef HWY_IF_SUM_OF_LANES_D
|
|
7637
|
+
#define HWY_IF_SUM_OF_LANES_D(D) \
|
|
7638
|
+
hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
|
|
7639
|
+
(sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
|
|
7640
|
+
nullptr
|
|
7641
|
+
#undef HWY_IF_MINMAX_OF_LANES_D
|
|
7642
|
+
#define HWY_IF_MINMAX_OF_LANES_D(D) \
|
|
7643
|
+
hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
|
|
7644
|
+
(sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
|
|
7645
|
+
nullptr
|
|
7162
7646
|
|
|
7163
7647
|
// For arm7, we implement reductions using a series of pairwise operations. This
|
|
7164
7648
|
// produces the full vector result, so we express Reduce* in terms of *OfLanes.
|
|
7165
7649
|
#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
|
|
7166
|
-
#define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128<type##_t, size>
|
|
7167
7650
|
#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
|
|
7168
|
-
|
|
7169
|
-
|
|
7651
|
+
template <class D, HWY_IF_LANES_D(D, size)> \
|
|
7652
|
+
HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
|
|
7653
|
+
Vec128<type##_t, size> v) { \
|
|
7170
7654
|
HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
|
|
7171
7655
|
if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
|
|
7172
7656
|
if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
|
|
7173
|
-
return
|
|
7174
|
-
} \
|
|
7175
|
-
HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)> tag, \
|
|
7176
|
-
Vec128<type##_t, size> v) { \
|
|
7177
|
-
return GetLane(name##OfLanes(tag, v)); \
|
|
7657
|
+
return Vec128<type##_t, size>(tmp); \
|
|
7178
7658
|
}
|
|
7179
7659
|
|
|
7180
7660
|
// For the wide versions, the pairwise operations produce a half-length vector.
|
|
7181
|
-
// We produce that
|
|
7182
|
-
// and *OfLanes in terms of the helper.
|
|
7661
|
+
// We produce that `tmp` and then Combine.
|
|
7183
7662
|
#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
|
|
7184
7663
|
suffix) \
|
|
7185
|
-
|
|
7186
|
-
|
|
7664
|
+
template <class D, HWY_IF_LANES_D(D, size)> \
|
|
7665
|
+
HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
|
|
7666
|
+
Vec128<type##_t, size> v) { \
|
|
7187
7667
|
HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
|
|
7188
7668
|
tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
|
|
7189
7669
|
vget_low_##suffix(v.raw)); \
|
|
7190
7670
|
if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
|
|
7191
7671
|
if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
|
|
7192
7672
|
if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
|
|
7193
|
-
return tmp;
|
|
7194
|
-
} \
|
|
7195
|
-
HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)>, \
|
|
7196
|
-
Vec128<type##_t, size> v) { \
|
|
7197
|
-
const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \
|
|
7198
|
-
return HWY_NEON_EVAL(vget_lane_##suffix, tmp, 0); \
|
|
7199
|
-
} \
|
|
7200
|
-
HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes( \
|
|
7201
|
-
hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) { \
|
|
7202
|
-
const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \
|
|
7203
|
-
return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \
|
|
7204
|
-
type, size)(vcombine_##suffix(tmp, tmp)); \
|
|
7673
|
+
return Vec128<type##_t, size>(vcombine_##suffix(tmp, tmp)); \
|
|
7205
7674
|
}
|
|
7206
7675
|
|
|
7207
7676
|
#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
|
|
@@ -7227,56 +7696,22 @@ HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax)
|
|
|
7227
7696
|
#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
|
|
7228
7697
|
#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
|
|
7229
7698
|
#undef HWY_NEON_DEF_PAIRWISE_REDUCTION
|
|
7230
|
-
#undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION
|
|
7231
7699
|
#undef HWY_NEON_BUILD_TYPE_T
|
|
7232
7700
|
|
|
7233
|
-
//
|
|
7234
|
-
|
|
7235
|
-
#
|
|
7236
|
-
|
|
7701
|
+
// GetLane(SumsOf4(v)) is more efficient on ArmV7 NEON than the default
|
|
7702
|
+
// N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h
|
|
7703
|
+
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
7704
|
+
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
7705
|
+
#else
|
|
7706
|
+
#define HWY_NATIVE_REDUCE_SUM_4_UI8
|
|
7237
7707
|
#endif
|
|
7238
7708
|
|
|
7239
|
-
|
|
7240
|
-
|
|
7241
|
-
|
|
7242
|
-
template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
|
|
7243
|
-
HWY_API Vec128<T, 2> SumOfLanes(D /* tag */, Vec128<T, 2> v10) {
|
|
7244
|
-
return v10 + Reverse2(Simd<T, 2, 0>(), v10);
|
|
7245
|
-
}
|
|
7246
|
-
|
|
7247
|
-
template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
|
|
7248
|
-
HWY_API T ReduceSum(D d, Vec128<T, 2> v10) {
|
|
7249
|
-
return GetLane(SumOfLanes(d, v10));
|
|
7250
|
-
}
|
|
7251
|
-
|
|
7252
|
-
template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
|
|
7253
|
-
HWY_API Vec128<T, 2> MinOfLanes(D /* tag */, Vec128<T, 2> v10) {
|
|
7254
|
-
return Min(v10, Reverse2(Simd<T, 2, 0>(), v10));
|
|
7255
|
-
}
|
|
7256
|
-
template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
|
|
7257
|
-
HWY_API Vec128<T, 2> MaxOfLanes(D /* tag */, Vec128<T, 2> v10) {
|
|
7258
|
-
return Max(v10, Reverse2(Simd<T, 2, 0>(), v10));
|
|
7709
|
+
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
|
|
7710
|
+
HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
|
|
7711
|
+
return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
|
|
7259
7712
|
}
|
|
7260
7713
|
|
|
7261
|
-
#
|
|
7262
|
-
#undef HWY_IF_MINMAX_REDUCTION
|
|
7263
|
-
|
|
7264
|
-
template <class D>
|
|
7265
|
-
HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
|
|
7266
|
-
return detail::SumOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
|
|
7267
|
-
}
|
|
7268
|
-
template <class D>
|
|
7269
|
-
HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
|
|
7270
|
-
return detail::ReduceSum(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
|
|
7271
|
-
}
|
|
7272
|
-
template <class D>
|
|
7273
|
-
HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
|
|
7274
|
-
return detail::MinOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
|
|
7275
|
-
}
|
|
7276
|
-
template <class D>
|
|
7277
|
-
HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
|
|
7278
|
-
return detail::MaxOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
|
|
7279
|
-
}
|
|
7714
|
+
#endif // HWY_ARCH_ARM_A64
|
|
7280
7715
|
|
|
7281
7716
|
// ------------------------------ LoadMaskBits (TestBit)
|
|
7282
7717
|
|
|
@@ -7345,6 +7780,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
|
|
|
7345
7780
|
return detail::LoadMaskBits(d, mask_bits);
|
|
7346
7781
|
}
|
|
7347
7782
|
|
|
7783
|
+
// ------------------------------ Dup128MaskFromMaskBits
|
|
7784
|
+
|
|
7785
|
+
template <class D>
|
|
7786
|
+
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
|
|
7787
|
+
constexpr size_t kN = MaxLanes(d);
|
|
7788
|
+
if (kN < 8) mask_bits &= (1u << kN) - 1;
|
|
7789
|
+
return detail::LoadMaskBits(d, mask_bits);
|
|
7790
|
+
}
|
|
7791
|
+
|
|
7348
7792
|
// ------------------------------ Mask
|
|
7349
7793
|
|
|
7350
7794
|
namespace detail {
|
|
@@ -7674,7 +8118,7 @@ namespace detail {
|
|
|
7674
8118
|
template <class D, HWY_IF_V_SIZE_D(D, 16)>
|
|
7675
8119
|
HWY_INLINE Vec128<uint8_t> Load8Bytes(D /*tag*/, const uint8_t* bytes) {
|
|
7676
8120
|
return Vec128<uint8_t>(vreinterpretq_u8_u64(
|
|
7677
|
-
vld1q_dup_u64(
|
|
8121
|
+
vld1q_dup_u64(HWY_RCAST_ALIGNED(const uint64_t*, bytes))));
|
|
7678
8122
|
}
|
|
7679
8123
|
|
|
7680
8124
|
// Load 8 bytes and return half-reg with N <= 8 bytes.
|
|
@@ -8287,9 +8731,8 @@ HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
|
|
|
8287
8731
|
template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
|
|
8288
8732
|
HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
|
|
8289
8733
|
VFromD<D>& v0, VFromD<D>& v1) {
|
|
8290
|
-
auto raw = detail::LoadInterleaved2(
|
|
8291
|
-
|
|
8292
|
-
detail::Tuple2<T, d.MaxLanes()>());
|
|
8734
|
+
auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(unaligned),
|
|
8735
|
+
detail::Tuple2<T, d.MaxLanes()>());
|
|
8293
8736
|
v0 = VFromD<D>(raw.val[0]);
|
|
8294
8737
|
v1 = VFromD<D>(raw.val[1]);
|
|
8295
8738
|
}
|
|
@@ -8301,9 +8744,8 @@ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
|
|
|
8301
8744
|
// The smallest vector registers are 64-bits and we want space for two.
|
|
8302
8745
|
alignas(16) T buf[2 * 8 / sizeof(T)] = {};
|
|
8303
8746
|
CopyBytes<d.MaxBytes() * 2>(unaligned, buf);
|
|
8304
|
-
auto raw = detail::LoadInterleaved2(
|
|
8305
|
-
|
|
8306
|
-
detail::Tuple2<T, d.MaxLanes()>());
|
|
8747
|
+
auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(buf),
|
|
8748
|
+
detail::Tuple2<T, d.MaxLanes()>());
|
|
8307
8749
|
v0 = VFromD<D>(raw.val[0]);
|
|
8308
8750
|
v1 = VFromD<D>(raw.val[1]);
|
|
8309
8751
|
}
|
|
@@ -8315,12 +8757,8 @@ HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
|
|
|
8315
8757
|
Vec128<T>& v1) {
|
|
8316
8758
|
const Half<decltype(d)> dh;
|
|
8317
8759
|
VFromD<decltype(dh)> v00, v10, v01, v11;
|
|
8318
|
-
LoadInterleaved2(
|
|
8319
|
-
|
|
8320
|
-
v10);
|
|
8321
|
-
LoadInterleaved2(
|
|
8322
|
-
dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 2),
|
|
8323
|
-
v01, v11);
|
|
8760
|
+
LoadInterleaved2(dh, detail::NativeLanePointer(unaligned), v00, v10);
|
|
8761
|
+
LoadInterleaved2(dh, detail::NativeLanePointer(unaligned + 2), v01, v11);
|
|
8324
8762
|
v0 = Combine(d, v01, v00);
|
|
8325
8763
|
v1 = Combine(d, v11, v10);
|
|
8326
8764
|
}
|
|
@@ -8331,9 +8769,8 @@ HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
|
|
|
8331
8769
|
template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
|
|
8332
8770
|
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
|
|
8333
8771
|
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
|
|
8334
|
-
auto raw = detail::LoadInterleaved3(
|
|
8335
|
-
|
|
8336
|
-
detail::Tuple3<T, d.MaxLanes()>());
|
|
8772
|
+
auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(unaligned),
|
|
8773
|
+
detail::Tuple3<T, d.MaxLanes()>());
|
|
8337
8774
|
v0 = VFromD<D>(raw.val[0]);
|
|
8338
8775
|
v1 = VFromD<D>(raw.val[1]);
|
|
8339
8776
|
v2 = VFromD<D>(raw.val[2]);
|
|
@@ -8346,9 +8783,8 @@ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
|
|
|
8346
8783
|
// The smallest vector registers are 64-bits and we want space for three.
|
|
8347
8784
|
alignas(16) T buf[3 * 8 / sizeof(T)] = {};
|
|
8348
8785
|
CopyBytes<d.MaxBytes() * 3>(unaligned, buf);
|
|
8349
|
-
auto raw = detail::LoadInterleaved3(
|
|
8350
|
-
|
|
8351
|
-
detail::Tuple3<T, d.MaxLanes()>());
|
|
8786
|
+
auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(buf),
|
|
8787
|
+
detail::Tuple3<T, d.MaxLanes()>());
|
|
8352
8788
|
v0 = VFromD<D>(raw.val[0]);
|
|
8353
8789
|
v1 = VFromD<D>(raw.val[1]);
|
|
8354
8790
|
v2 = VFromD<D>(raw.val[2]);
|
|
@@ -8361,12 +8797,8 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
|
|
|
8361
8797
|
Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
|
|
8362
8798
|
const Half<decltype(d)> dh;
|
|
8363
8799
|
VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
|
|
8364
|
-
LoadInterleaved3(
|
|
8365
|
-
|
|
8366
|
-
v10, v20);
|
|
8367
|
-
LoadInterleaved3(
|
|
8368
|
-
dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 3),
|
|
8369
|
-
v01, v11, v21);
|
|
8800
|
+
LoadInterleaved3(dh, detail::NativeLanePointer(unaligned), v00, v10, v20);
|
|
8801
|
+
LoadInterleaved3(dh, detail::NativeLanePointer(unaligned + 3), v01, v11, v21);
|
|
8370
8802
|
v0 = Combine(d, v01, v00);
|
|
8371
8803
|
v1 = Combine(d, v11, v10);
|
|
8372
8804
|
v2 = Combine(d, v21, v20);
|
|
@@ -8379,9 +8811,8 @@ template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
|
|
|
8379
8811
|
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
|
|
8380
8812
|
VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
|
|
8381
8813
|
VFromD<D>& v3) {
|
|
8382
|
-
auto raw = detail::LoadInterleaved4(
|
|
8383
|
-
|
|
8384
|
-
detail::Tuple4<T, d.MaxLanes()>());
|
|
8814
|
+
auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(unaligned),
|
|
8815
|
+
detail::Tuple4<T, d.MaxLanes()>());
|
|
8385
8816
|
v0 = VFromD<D>(raw.val[0]);
|
|
8386
8817
|
v1 = VFromD<D>(raw.val[1]);
|
|
8387
8818
|
v2 = VFromD<D>(raw.val[2]);
|
|
@@ -8395,9 +8826,8 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
|
|
|
8395
8826
|
VFromD<D>& v3) {
|
|
8396
8827
|
alignas(16) T buf[4 * 8 / sizeof(T)] = {};
|
|
8397
8828
|
CopyBytes<d.MaxBytes() * 4>(unaligned, buf);
|
|
8398
|
-
auto raw = detail::LoadInterleaved4(
|
|
8399
|
-
|
|
8400
|
-
detail::Tuple4<T, d.MaxLanes()>());
|
|
8829
|
+
auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(buf),
|
|
8830
|
+
detail::Tuple4<T, d.MaxLanes()>());
|
|
8401
8831
|
v0 = VFromD<D>(raw.val[0]);
|
|
8402
8832
|
v1 = VFromD<D>(raw.val[1]);
|
|
8403
8833
|
v2 = VFromD<D>(raw.val[2]);
|
|
@@ -8412,12 +8842,10 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
|
|
|
8412
8842
|
Vec128<T>& v3) {
|
|
8413
8843
|
const Half<decltype(d)> dh;
|
|
8414
8844
|
VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
|
|
8415
|
-
LoadInterleaved4(
|
|
8416
|
-
|
|
8417
|
-
|
|
8418
|
-
|
|
8419
|
-
dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 4),
|
|
8420
|
-
v01, v11, v21, v31);
|
|
8845
|
+
LoadInterleaved4(dh, detail::NativeLanePointer(unaligned), v00, v10, v20,
|
|
8846
|
+
v30);
|
|
8847
|
+
LoadInterleaved4(dh, detail::NativeLanePointer(unaligned + 4), v01, v11, v21,
|
|
8848
|
+
v31);
|
|
8421
8849
|
v0 = Combine(d, v01, v00);
|
|
8422
8850
|
v1 = Combine(d, v11, v10);
|
|
8423
8851
|
v2 = Combine(d, v21, v20);
|
|
@@ -8476,8 +8904,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
|
|
|
8476
8904
|
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
|
|
8477
8905
|
T* HWY_RESTRICT unaligned) {
|
|
8478
8906
|
detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
|
|
8479
|
-
detail::StoreInterleaved2(
|
|
8480
|
-
tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
|
|
8907
|
+
detail::StoreInterleaved2(tup, detail::NativeLanePointer(unaligned));
|
|
8481
8908
|
}
|
|
8482
8909
|
|
|
8483
8910
|
// <= 32 bits: avoid writing more than N bytes by copying to buffer
|
|
@@ -8486,8 +8913,7 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
|
|
|
8486
8913
|
T* HWY_RESTRICT unaligned) {
|
|
8487
8914
|
alignas(16) T buf[2 * 8 / sizeof(T)];
|
|
8488
8915
|
detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
|
|
8489
|
-
detail::StoreInterleaved2(tup,
|
|
8490
|
-
reinterpret_cast<detail::NativeLaneType<T>*>(buf));
|
|
8916
|
+
detail::StoreInterleaved2(tup, detail::NativeLanePointer(buf));
|
|
8491
8917
|
CopyBytes<d.MaxBytes() * 2>(buf, unaligned);
|
|
8492
8918
|
}
|
|
8493
8919
|
|
|
@@ -8498,10 +8924,9 @@ HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d,
|
|
|
8498
8924
|
T* HWY_RESTRICT unaligned) {
|
|
8499
8925
|
const Half<decltype(d)> dh;
|
|
8500
8926
|
StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh,
|
|
8501
|
-
|
|
8502
|
-
StoreInterleaved2(
|
|
8503
|
-
|
|
8504
|
-
reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 2));
|
|
8927
|
+
detail::NativeLanePointer(unaligned));
|
|
8928
|
+
StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh,
|
|
8929
|
+
detail::NativeLanePointer(unaligned + 2));
|
|
8505
8930
|
}
|
|
8506
8931
|
#endif // HWY_ARCH_ARM_V7
|
|
8507
8932
|
|
|
@@ -8511,8 +8936,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
|
|
|
8511
8936
|
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
|
|
8512
8937
|
T* HWY_RESTRICT unaligned) {
|
|
8513
8938
|
detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
|
|
8514
|
-
detail::StoreInterleaved3(
|
|
8515
|
-
tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
|
|
8939
|
+
detail::StoreInterleaved3(tup, detail::NativeLanePointer(unaligned));
|
|
8516
8940
|
}
|
|
8517
8941
|
|
|
8518
8942
|
// <= 32 bits: avoid writing more than N bytes by copying to buffer
|
|
@@ -8521,8 +8945,7 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
|
|
|
8521
8945
|
T* HWY_RESTRICT unaligned) {
|
|
8522
8946
|
alignas(16) T buf[3 * 8 / sizeof(T)];
|
|
8523
8947
|
detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
|
|
8524
|
-
detail::StoreInterleaved3(tup,
|
|
8525
|
-
reinterpret_cast<detail::NativeLaneType<T>*>(buf));
|
|
8948
|
+
detail::StoreInterleaved3(tup, detail::NativeLanePointer(buf));
|
|
8526
8949
|
CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
|
|
8527
8950
|
}
|
|
8528
8951
|
|
|
@@ -8533,10 +8956,9 @@ HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
|
|
|
8533
8956
|
T* HWY_RESTRICT unaligned) {
|
|
8534
8957
|
const Half<decltype(d)> dh;
|
|
8535
8958
|
StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
|
|
8536
|
-
|
|
8537
|
-
StoreInterleaved3(
|
|
8538
|
-
|
|
8539
|
-
reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 3));
|
|
8959
|
+
detail::NativeLanePointer(unaligned));
|
|
8960
|
+
StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
|
|
8961
|
+
detail::NativeLanePointer(unaligned + 3));
|
|
8540
8962
|
}
|
|
8541
8963
|
#endif // HWY_ARCH_ARM_V7
|
|
8542
8964
|
|
|
@@ -8546,8 +8968,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
|
|
|
8546
8968
|
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
|
|
8547
8969
|
VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
|
|
8548
8970
|
detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
|
|
8549
|
-
detail::StoreInterleaved4(
|
|
8550
|
-
tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
|
|
8971
|
+
detail::StoreInterleaved4(tup, detail::NativeLanePointer(unaligned));
|
|
8551
8972
|
}
|
|
8552
8973
|
|
|
8553
8974
|
// <= 32 bits: avoid writing more than N bytes by copying to buffer
|
|
@@ -8556,8 +8977,7 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
|
|
|
8556
8977
|
VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
|
|
8557
8978
|
alignas(16) T buf[4 * 8 / sizeof(T)];
|
|
8558
8979
|
detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
|
|
8559
|
-
detail::StoreInterleaved4(tup,
|
|
8560
|
-
reinterpret_cast<detail::NativeLaneType<T>*>(buf));
|
|
8980
|
+
detail::StoreInterleaved4(tup, detail::NativeLanePointer(buf));
|
|
8561
8981
|
CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
|
|
8562
8982
|
}
|
|
8563
8983
|
|
|
@@ -8569,11 +8989,10 @@ HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
|
|
|
8569
8989
|
const Half<decltype(d)> dh;
|
|
8570
8990
|
StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
|
|
8571
8991
|
LowerHalf(dh, v3), dh,
|
|
8572
|
-
|
|
8573
|
-
StoreInterleaved4(
|
|
8574
|
-
|
|
8575
|
-
|
|
8576
|
-
reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 4));
|
|
8992
|
+
detail::NativeLanePointer(unaligned));
|
|
8993
|
+
StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
|
|
8994
|
+
UpperHalf(dh, v3), dh,
|
|
8995
|
+
detail::NativeLanePointer(unaligned + 4));
|
|
8577
8996
|
}
|
|
8578
8997
|
#endif // HWY_ARCH_ARM_V7
|
|
8579
8998
|
|
|
@@ -8904,7 +9323,7 @@ namespace detail { // for code folding
|
|
|
8904
9323
|
#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
|
|
8905
9324
|
#undef HWY_NEON_DEF_FUNCTION_UINTS
|
|
8906
9325
|
#undef HWY_NEON_EVAL
|
|
8907
|
-
|
|
9326
|
+
#undef HWY_NEON_IF_EMULATED_D
|
|
8908
9327
|
} // namespace detail
|
|
8909
9328
|
|
|
8910
9329
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|