@img/sharp-libvips-dev 1.0.1 → 1.0.3
This diff shows the publicly available contents of the two package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +2 -2
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -30
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +218 -6
- package/include/hwy/base.h +1935 -512
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +105 -10
- package/include/hwy/detect_targets.h +146 -37
- package/include/hwy/foreach_target.h +36 -1
- package/include/hwy/highway.h +222 -50
- package/include/hwy/ops/arm_neon-inl.h +2055 -894
- package/include/hwy/ops/arm_sve-inl.h +1476 -348
- package/include/hwy/ops/emu128-inl.h +711 -623
- package/include/hwy/ops/generic_ops-inl.h +4431 -2157
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
- package/include/hwy/ops/rvv-inl.h +1556 -536
- package/include/hwy/ops/scalar-inl.h +353 -233
- package/include/hwy/ops/set_macros-inl.h +171 -23
- package/include/hwy/ops/shared-inl.h +198 -56
- package/include/hwy/ops/wasm_128-inl.h +283 -244
- package/include/hwy/ops/x86_128-inl.h +3673 -1357
- package/include/hwy/ops/x86_256-inl.h +1737 -663
- package/include/hwy/ops/x86_512-inl.h +1697 -500
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +40 -32
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +16 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +8 -3
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +73 -58
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -5
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +2 -2
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +8 -3
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +1 -11
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +18 -19
package/include/hwy/ops/x86_512-inl.h

@@ -152,6 +152,9 @@ class Vec512 {
   HWY_INLINE Vec512& operator-=(const Vec512 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec512& operator%=(const Vec512 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec512& operator&=(const Vec512 other) {
     return *this = (*this & other);
   }
@@ -190,6 +193,25 @@ HWY_INLINE __m512i BitCastToInteger(__m512d v) {
   return _mm512_castpd_si512(v);
 }
 
+#if HWY_AVX3_HAVE_F32_TO_BF16C
+HWY_INLINE __m512i BitCastToInteger(__m512bh v) {
+  // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
+  // bit cast a __m512bh to a __m512i as there is currently no intrinsic
+  // available (as of GCC 13 and Clang 17) that can bit cast a __m512bh vector
+  // to a __m512i vector
+
+#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+  // On GCC or Clang, use reinterpret_cast to bit cast a __m512bh to a __m512i
+  return reinterpret_cast<__m512i>(v);
+#else
+  // On MSVC, use BitCastScalar to bit cast a __m512bh to a __m512i as MSVC does
+  // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
+  // bit cast from one AVX vector type to a different AVX vector type
+  return BitCastScalar<__m512i>(v);
+#endif  // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+}
+#endif  // HWY_AVX3_HAVE_F32_TO_BF16C
+
 template <typename T>
 HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
   return Vec512<uint8_t>{BitCastToInteger(v.raw)};
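
Note on the new __m512bh overload above: a minimal standalone sketch of the same bit-cast technique, assuming a toolchain with AVX-512 BF16 support (e.g. -mavx512bf16). The BitCastBF16ToInteger name is ours, and memcpy stands in for Highway's BitCastScalar on compilers that reject a vector reinterpret_cast:

#include <cstring>
#include <immintrin.h>

// GCC/Clang permit reinterpret_cast between same-size AVX vector types;
// MSVC does not, so the fallback copies the 64 bytes instead.
static inline __m512i BitCastBF16ToInteger(__m512bh v) {
#if defined(__GNUC__) || defined(__clang__)
  return reinterpret_cast<__m512i>(v);
#else
  __m512i out;
  static_assert(sizeof(out) == sizeof(v), "bit cast requires equal sizes");
  std::memcpy(&out, &v, sizeof(out));  // byte copy == bit cast
  return out;
#endif
}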
@@ -373,6 +395,132 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
                         BitCast(Full256<uint8_t>(), v).raw)});
 }
 
+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
+  // Missing set_epi8/16.
+  return BroadcastBlock<0>(ResizeBitCast(
+      d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2, t3, t4, t5, t6,
+                             t7, t8, t9, t10, t11, t12, t13, t14, t15)));
+#else
+  (void)d;
+  // Need to use _mm512_set_epi8 as there is no _mm512_setr_epi8 intrinsic
+  // available
+  return VFromD<D>{_mm512_set_epi8(
+      static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
+      static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
+      static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
+      static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
+      static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
+      static_cast<char>(t0), static_cast<char>(t15), static_cast<char>(t14),
+      static_cast<char>(t13), static_cast<char>(t12), static_cast<char>(t11),
+      static_cast<char>(t10), static_cast<char>(t9), static_cast<char>(t8),
+      static_cast<char>(t7), static_cast<char>(t6), static_cast<char>(t5),
+      static_cast<char>(t4), static_cast<char>(t3), static_cast<char>(t2),
+      static_cast<char>(t1), static_cast<char>(t0), static_cast<char>(t15),
+      static_cast<char>(t14), static_cast<char>(t13), static_cast<char>(t12),
+      static_cast<char>(t11), static_cast<char>(t10), static_cast<char>(t9),
+      static_cast<char>(t8), static_cast<char>(t7), static_cast<char>(t6),
+      static_cast<char>(t5), static_cast<char>(t4), static_cast<char>(t3),
+      static_cast<char>(t2), static_cast<char>(t1), static_cast<char>(t0),
+      static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
+      static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
+      static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
+      static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
+      static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
+      static_cast<char>(t0))};
+#endif
+}
+
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
+  // Missing set_epi8/16.
+  return BroadcastBlock<0>(
+      ResizeBitCast(d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2, t3,
+                                           t4, t5, t6, t7)));
+#else
+  (void)d;
+  // Need to use _mm512_set_epi16 as there is no _mm512_setr_epi16 intrinsic
+  // available
+  return VFromD<D>{
+      _mm512_set_epi16(static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+                       static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+                       static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+                       static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+                       static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+                       static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+                       static_cast<int16_t>(t1), static_cast<int16_t>(t0))};
+#endif
+}
+
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return VFromD<D>{_mm512_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
+                                  t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5,
+                                  t6, t7, t0, t1, t2, t3, t4, t5, t6, t7)};
+}
+#endif
+
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{
+      _mm512_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{_mm512_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2,
+                                  t3, t0, t1, t2, t3)};
+}
+
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{
+      _mm512_setr_epi64(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                        static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                        static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                        static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
+}
+
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{_mm512_setr_pd(t0, t1, t0, t1, t0, t1, t0, t1)};
+}
+
 // ----------------------------- Iota
 
 namespace detail {
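
The Dup128VecFromValues overloads added above all share one contract: every 128-bit block of the 512-bit vector receives the same list of lanes. A sketch of the u32 case with raw intrinsics, assuming an AVX-512 build (e.g. -mavx512f) and AVX-512-capable hardware:

#include <cstdint>
#include <cstdio>
#include <immintrin.h>

int main() {
  // Equivalent to Dup128VecFromValues(d_u32, 1, 2, 3, 4): the 4-lane pattern
  // is repeated once per 128-bit block, i.e. four times in 512 bits.
  const __m512i v = _mm512_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4,
                                      1, 2, 3, 4, 1, 2, 3, 4);
  alignas(64) int32_t lanes[16];
  _mm512_store_si512(lanes, v);
  for (int i = 0; i < 16; ++i) printf("%d ", lanes[i]);  // 1 2 3 4 1 2 3 4 ...
  printf("\n");
  return 0;
}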
@@ -480,7 +628,7 @@ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
 
 template <class D, typename T2, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API VFromD<D> Iota(D d, const T2 first) {
-  return detail::Iota0(d) + Set(d,
+  return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
 }
 
 // ================================================== LOGICAL
@@ -502,7 +650,8 @@ template <typename T>
 HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm512_and_si512(a.raw,
+  return BitCast(d, VFromD<decltype(du)>{_mm512_and_si512(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
@@ -519,8 +668,8 @@ template <typename T>
 HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
   const DFromV<decltype(mask)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(
-
+  return BitCast(d, VFromD<decltype(du)>{_mm512_andnot_si512(
+                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
 }
 HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
                              const Vec512<float> mask) {
@@ -537,7 +686,8 @@ template <typename T>
 HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm512_or_si512(a.raw,
+  return BitCast(d, VFromD<decltype(du)>{_mm512_or_si512(BitCast(du, a).raw,
+                                                         BitCast(du, b).raw)});
 }
 
 HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
@@ -553,7 +703,8 @@ template <typename T>
 HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm512_xor_si512(a.raw,
+  return BitCast(d, VFromD<decltype(du)>{_mm512_xor_si512(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
@@ -566,45 +717,61 @@ HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
 // ------------------------------ Xor3
 template <typename T>
 HWY_API Vec512<T> Xor3(Vec512<T> x1, Vec512<T> x2, Vec512<T> x3) {
+#if !HWY_IS_MSAN
   const DFromV<decltype(x1)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
   const __m512i ret = _mm512_ternarylogic_epi64(
       BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
   return BitCast(d, VU{ret});
+#else
+  return Xor(x1, Xor(x2, x3));
+#endif
 }
 
 // ------------------------------ Or3
 template <typename T>
 HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) {
+#if !HWY_IS_MSAN
   const DFromV<decltype(o1)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
   const __m512i ret = _mm512_ternarylogic_epi64(
       BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
   return BitCast(d, VU{ret});
+#else
+  return Or(o1, Or(o2, o3));
+#endif
 }
 
 // ------------------------------ OrAnd
 template <typename T>
 HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
+#if !HWY_IS_MSAN
   const DFromV<decltype(o)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
   const __m512i ret = _mm512_ternarylogic_epi64(
       BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
   return BitCast(d, VU{ret});
+#else
+  return Or(o, And(a1, a2));
+#endif
 }
 
 // ------------------------------ IfVecThenElse
 template <typename T>
 HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
+#if !HWY_IS_MSAN
   const DFromV<decltype(yes)> d;
   const RebindToUnsigned<decltype(d)> du;
   using VU = VFromD<decltype(du)>;
   return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
                                                  BitCast(du, yes).raw,
                                                  BitCast(du, no).raw, 0xCA)});
+#else
+  return IfThenElse(MaskFromVec(mask), yes, no);
+#endif
 }
 
 // ------------------------------ Operator overloads (internal-only if float)
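
The 0x96, 0xFE, 0xF8 and 0xCA immediates passed to _mm512_ternarylogic_epi64 above are 8-bit truth tables: evaluating the desired Boolean expression on the constants 0xF0, 0xCC and 0xAA (the three input "index" patterns) yields the immediate. A self-contained check of all four:

#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t a = 0xF0, b = 0xCC, c = 0xAA;
  printf("Xor3          -> 0x%02X\n", (uint8_t)(a ^ b ^ c));          // 0x96
  printf("Or3           -> 0x%02X\n", (uint8_t)(a | b | c));          // 0xFE
  printf("OrAnd         -> 0x%02X\n", (uint8_t)(a | (b & c)));        // 0xF8
  printf("IfVecThenElse -> 0x%02X\n", (uint8_t)((a & b) | (~a & c))); // 0xCA
  return 0;
}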
@@ -752,7 +919,7 @@ HWY_API MFromD<D> FirstN(D d, size_t n) {
   m.raw = static_cast<decltype(m.raw)>(_bzhi_u64(all, n));
   return m;
 #else
-  return detail::FirstN<
+  return detail::FirstN<TFromD<D>>(n);
 #endif  // HWY_ARCH_X86_64
 }
 
@@ -790,7 +957,7 @@ HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T,
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec512<T> IfThenElse(const Mask512<T> mask, const Vec512<T> yes,
                              const Vec512<T> no) {
   return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
@@ -840,7 +1007,7 @@ HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T,
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec512<T> IfThenElseZero(const Mask512<T> mask, const Vec512<T> yes) {
   return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
 }
@@ -878,7 +1045,7 @@ HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T,
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec512<T> IfThenZeroElse(const Mask512<T> mask, const Vec512<T> no) {
   return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
 }
@@ -896,10 +1063,12 @@ HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
   return IfThenElse(MaskFromVec(v), yes, no);
 }
 
-template <typename T,
-
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
+          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API Vec512<T> IfNegativeThenNegOrUndefIfZero(Vec512<T> mask, Vec512<T> v) {
   // AVX3 MaskFromVec only looks at the MSB
-
+  const DFromV<decltype(v)> d;
+  return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
 }
 
 // ================================================== ARITHMETIC
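
Lane-wise, the new IfNegativeThenNegOrUndefIfZero uses MaskedSubOr to compute 0 - v where the mask lane is negative and to pass v through elsewhere; for v == 0 both branches coincide, so the "undefined if zero" latitude costs nothing here. A scalar model (function name ours):

#include <cstdint>
#include <cstdio>

int32_t NegIfMaskNegativeLane(int32_t mask, int32_t v) {
  return (mask < 0) ? (0 - v) : v;  // MaskedSubOr(v, mask < 0, 0, v)
}

int main() {
  printf("%d %d\n", NegIfMaskNegativeLane(-1, 5),  // -5
         NegIfMaskNegativeLane(+1, 5));            // 5
  return 0;
}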
@@ -1000,6 +1169,59 @@ HWY_API Vec512<uint64_t> SumsOf8AbsDiff(Vec512<uint8_t> a, Vec512<uint8_t> b) {
   return Vec512<uint64_t>{_mm512_sad_epu8(a.raw, b.raw)};
 }
 
+// ------------------------------ SumsOf4
+namespace detail {
+
+HWY_INLINE Vec512<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
+                                    hwy::SizeTag<1> /*lane_size_tag*/,
+                                    Vec512<uint8_t> v) {
+  const DFromV<decltype(v)> d;
+
+  // _mm512_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
+  // zeroed out and the sums of the 4 consecutive lanes are already in the
+  // even uint16_t lanes of the _mm512_maskz_dbsad_epu8 result.
+  return Vec512<uint32_t>{_mm512_maskz_dbsad_epu8(
+      static_cast<__mmask32>(0x55555555), v.raw, Zero(d).raw, 0)};
+}
+
+// I8->I32 SumsOf4
+// Generic for all vector lengths
+template <class V>
+HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
+    hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const RepartitionToWideX2<decltype(d)> di32;
+
+  // Adjust the values of v to be in the 0..255 range by adding 128 to each lane
+  // of v (which is the same as an bitwise XOR of each i8 lane by 128) and then
+  // bitcasting the Xor result to an u8 vector.
+  const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
+
+  // Need to add -512 to each i32 lane of the result of the
+  // SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj) operation to account
+  // for the adjustment made above.
+  return BitCast(di32, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj)) +
+         Set(di32, int32_t{-512});
+}
+
+}  // namespace detail
+
+// ------------------------------ SumsOfShuffledQuadAbsDiff
+
+#if HWY_TARGET <= HWY_AVX3
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
+static Vec512<uint16_t> SumsOfShuffledQuadAbsDiff(Vec512<uint8_t> a,
+                                                  Vec512<uint8_t> b) {
+  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+  return Vec512<uint16_t>{
+      _mm512_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
+}
+#endif
+
 // ------------------------------ SaturatedAdd
 
 // Returns a + b clamped to the destination range.
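
The signed SumsOf4 path above works by biasing: XOR-ing each int8_t lane with the sign bit adds 128, moving values into 0..255 so the unsigned machinery applies; summing four biased lanes then overshoots by 4 * 128 = 512, hence the final Set(di32, int32_t{-512}). A scalar model of one 4-lane group:

#include <cstdint>
#include <cstdio>

int32_t SumsOf4SignedGroup(const int8_t lanes[4]) {
  uint32_t sum = 0;
  for (int i = 0; i < 4; ++i) {
    sum += (uint8_t)(lanes[i] ^ (int8_t)0x80);  // lane + 128, now in 0..255
  }
  return (int32_t)sum - 512;  // remove the four +128 biases
}

int main() {
  const int8_t lanes[4] = {-128, -1, 1, 127};
  printf("%d\n", SumsOf4SignedGroup(lanes));  // -128 - 1 + 1 + 127 = -1
  return 0;
}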
@@ -1075,27 +1297,6 @@ HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) {
   return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
 }
 
-// These aren't native instructions, they also involve AND with constant.
-#if HWY_HAVE_FLOAT16
-HWY_API Vec512<float16_t> Abs(const Vec512<float16_t> v) {
-  return Vec512<float16_t>{_mm512_abs_ph(v.raw)};
-}
-#endif  // HWY_HAVE_FLOAT16
-
-HWY_API Vec512<float> Abs(const Vec512<float> v) {
-  return Vec512<float>{_mm512_abs_ps(v.raw)};
-}
-HWY_API Vec512<double> Abs(const Vec512<double> v) {
-  // Workaround: _mm512_abs_pd expects __m512, so implement it ourselves.
-#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 803
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return And(v, BitCast(d, Set(du, 0x7FFFFFFFFFFFFFFFULL)));
-#else
-  return Vec512<double>{_mm512_abs_pd(v.raw)};
-#endif
-}
-
 // ------------------------------ ShiftLeft
 
 #if HWY_TARGET <= HWY_AVX3_DL
@@ -1245,14 +1446,45 @@ HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
 
 // ------------------------------ RotateRight
 
-
-
-
-
+#if HWY_TARGET <= HWY_AVX3_DL
+// U8 RotateRight is generic for all vector lengths on AVX3_DL
+template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
+HWY_API V RotateRight(V v) {
+  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+
+  const Repartition<uint64_t, DFromV<V>> du64;
   if (kBits == 0) return v;
-
-
-
+
+  constexpr uint64_t kShrMatrix =
+      (0x0102040810204080ULL << kBits) &
+      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
+  constexpr int kShlBits = (-kBits) & 7;
+  constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
+                                  (0x0101010101010101ULL * (0xFF >> kShlBits));
+  constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
+
+  return detail::GaloisAffine(v, Set(du64, kMatrix));
+}
+#else  // HWY_TARGET > HWY_AVX3_DL
+template <int kBits>
+HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
+  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+  if (kBits == 0) return v;
+  // AVX3 does not support 8-bit.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
+}
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
+template <int kBits>
+HWY_API Vec512<uint16_t> RotateRight(const Vec512<uint16_t> v) {
+  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
+  if (kBits == 0) return v;
+#if HWY_TARGET <= HWY_AVX3_DL
+  return Vec512<uint16_t>{_mm512_shrdi_epi16(v.raw, v.raw, kBits)};
+#else
+  // AVX3 does not support 16-bit.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
+#endif
 }
 
 template <int kBits>
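
Both RotateRight paths above compute the same per-byte rotation; the AVX3_DL path merely encodes it as an 8x8 GF(2) bit matrix (kShrMatrix | kShlMatrix) consumed by detail::GaloisAffine, while the fallback spells it as two shifts and an Or. A scalar reference for what either path does to one byte:

#include <cstdint>
#include <cstdio>

uint8_t RotateRight8(uint8_t x, int k) {  // k in [0, 8)
  return (uint8_t)((x >> k) | (x << (8 - k)));
}

int main() {
  printf("0x%02X\n", RotateRight8(0x01, 3));  // 0x20: bit 0 lands in bit 5
  return 0;
}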
@@ -1269,6 +1501,34 @@ HWY_API Vec512<uint64_t> RotateRight(const Vec512<uint64_t> v) {
   return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
 }
 
+// ------------------------------ Rol/Ror
+#if HWY_TARGET <= HWY_AVX3_DL
+template <class T, HWY_IF_UI16(T)>
+HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
+  return Vec512<T>{_mm512_shrdv_epi16(a.raw, a.raw, b.raw)};
+}
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
+  return Vec512<T>{_mm512_rolv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
+  return Vec512<T>{_mm512_rorv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
+  return Vec512<T>{_mm512_rolv_epi64(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
+  return Vec512<T>{_mm512_rorv_epi64(a.raw, b.raw)};
+}
+
 // ------------------------------ ShiftLeftSame
 
 // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512
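
The new variable-count Rol/Ror map directly onto _mm512_rolv_* / _mm512_rorv_*, which rotate each lane by the count in the corresponding lane of b, taken modulo the lane width. A scalar model of one u32 lane:

#include <cstdint>
#include <cstdio>

uint32_t Rol32Lane(uint32_t a, uint32_t b) {
  const unsigned r = b & 31u;  // counts are reduced mod 32
  return (a << r) | (a >> ((32u - r) & 31u));
}

int main() {
  printf("0x%08X\n", Rol32Lane(0x80000001u, 4));  // 0x00000018
  return 0;
}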
@@ -1643,6 +1903,322 @@ HWY_API Vec512<double> ApproximateReciprocal(Vec512<double> v) {
   return Vec512<double>{_mm512_rcp14_pd(v.raw)};
 }
 
+// ------------------------------ MaskedMinOr
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMaxOr
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedAddOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSubOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                              Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMulOr
+
+HWY_API Vec512<float> MaskedMulOr(Vec512<float> no, Mask512<float> m,
+                                  Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec512<double> MaskedMulOr(Vec512<double> no, Mask512<double> m,
+                                   Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MaskedMulOr(Vec512<float16_t> no,
+                                      Mask512<float16_t> m, Vec512<float16_t> a,
+                                      Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedDivOr
+
+HWY_API Vec512<float> MaskedDivOr(Vec512<float> no, Mask512<float> m,
+                                  Vec512<float> a, Vec512<float> b) {
+  return Vec512<float>{_mm512_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec512<double> MaskedDivOr(Vec512<double> no, Mask512<double> m,
+                                   Vec512<double> a, Vec512<double> b) {
+  return Vec512<double>{_mm512_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MaskedDivOr(Vec512<float16_t> no,
+                                      Mask512<float16_t> m, Vec512<float16_t> a,
+                                      Vec512<float16_t> b) {
+  return Vec512<float16_t>{_mm512_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSatAddOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+// ------------------------------ MaskedSatSubOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+                                 Vec512<T> b) {
+  return Vec512<T>{_mm512_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
 // ------------------------------ Floating-point multiply-add variants
 
 #if HWY_HAVE_FLOAT16
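
All of the Masked*Or functions added above share one shape: lanes selected by m receive the result of the operation on a and b, every other lane receives the corresponding lane of no (this is what the AVX-512 _mm512_mask_* forms do with their first operand). A scalar model of one MaskedMinOr lane:

#include <algorithm>
#include <cstdio>

int MaskedMinOrLane(int no, bool m, int a, int b) {
  return m ? std::min(a, b) : no;  // masked-off lanes fall back to `no`
}

int main() {
  printf("%d %d\n", MaskedMinOrLane(9, true, 3, 7),   // 3
         MaskedMinOrLane(9, false, 3, 7));            // 9
  return 0;
}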
@@ -1709,6 +2285,23 @@ HWY_API Vec512<double> NegMulSub(Vec512<double> mul, Vec512<double> x,
   return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
 }
 
+#if HWY_HAVE_FLOAT16
+HWY_API Vec512<float16_t> MulAddSub(Vec512<float16_t> mul, Vec512<float16_t> x,
+                                    Vec512<float16_t> sub_or_add) {
+  return Vec512<float16_t>{_mm512_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+HWY_API Vec512<float> MulAddSub(Vec512<float> mul, Vec512<float> x,
+                                Vec512<float> sub_or_add) {
+  return Vec512<float>{_mm512_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
+}
+
+HWY_API Vec512<double> MulAddSub(Vec512<double> mul, Vec512<double> x,
+                                 Vec512<double> sub_or_add) {
+  return Vec512<double>{_mm512_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
+}
+
 // ------------------------------ Floating-point square root
 
 // Full precision square root
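
MulAddSub above wraps _mm512_fmaddsub_*: even-indexed lanes compute mul * x - sub_or_add, odd-indexed lanes compute mul * x + sub_or_add (the usual complex-arithmetic building block). A scalar model of one lane:

#include <cstdio>

float MulAddSubLane(int lane, float mul, float x, float sub_or_add) {
  const float prod = mul * x;
  return (lane % 2 == 0) ? (prod - sub_or_add) : (prod + sub_or_add);
}

int main() {
  printf("%g %g\n", MulAddSubLane(0, 2.0f, 3.0f, 1.0f),  // 5
         MulAddSubLane(1, 2.0f, 3.0f, 1.0f));            // 7
  return 0;
}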
@@ -1873,7 +2466,11 @@ HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator==(Vec512<float16_t> a,
                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1907,7 +2504,11 @@ HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator!=(Vec512<float16_t> a,
                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1949,7 +2550,11 @@ HWY_API Mask512<int64_t> operator>(Vec512<int64_t> a, Vec512<int64_t> b) {
 
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator>(Vec512<float16_t> a, Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1965,7 +2570,11 @@ HWY_API Mask512<double> operator>(Vec512<double> a, Vec512<double> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask512<float16_t> operator>=(Vec512<float16_t> a,
                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -2328,11 +2937,63 @@ HWY_API Mask512<T> ExclusiveNeither(Mask512<T> a, Mask512<T> b) {
   return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
 }
 
+template <class D, HWY_IF_LANES_D(D, 64)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+                               MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask64 combined_mask = _mm512_kunpackd(
+      static_cast<__mmask64>(hi.raw), static_cast<__mmask64>(lo.raw));
+#else
+  const __mmask64 combined_mask = static_cast<__mmask64>(
+      ((static_cast<uint64_t>(hi.raw) << 32) | (lo.raw & 0xFFFFFFFFULL)));
+#endif
+
+  return MFromD<D>{combined_mask};
+}
+
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask64(static_cast<__mmask64>(m.raw), 32);
+#else
+  const auto shifted_mask = static_cast<uint64_t>(m.raw) >> 32;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 64)>
+HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftli_mask64(static_cast<__mmask64>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) << 1)};
+#endif
+}
+
+template <class D, HWY_IF_LANES_D(D, 64)>
+HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftri_mask64(static_cast<__mmask64>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) >> 1)};
+#endif
+}
+
 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
 
 HWY_API Vec512<int8_t> BroadcastSignBit(Vec512<int8_t> v) {
+#if HWY_TARGET <= HWY_AVX3_DL
+  const Repartition<uint64_t, DFromV<decltype(v)>> du64;
+  return detail::GaloisAffine(v, Set(du64, 0x8080808080808080ull));
+#else
   const DFromV<decltype(v)> d;
   return VecFromMask(v < Zero(d));
+#endif
 }
 
 HWY_API Vec512<int16_t> BroadcastSignBit(Vec512<int16_t> v) {
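
The mask helpers added above are plain word operations once the mask is viewed as one bit per lane (lane 0 = bit 0); the _mm512_kunpackd / _kshift*_mask64 intrinsics and the integer fallbacks compute the same thing. A scalar model on a 64-lane mask:

#include <cstdint>
#include <cstdio>

uint64_t CombineMasks64(uint32_t hi, uint32_t lo) {
  return ((uint64_t)hi << 32) | lo;  // hi half occupies the upper 32 lanes
}
uint32_t UpperHalfOfMask64(uint64_t m) { return (uint32_t)(m >> 32); }
uint64_t SlideMask1Up64(uint64_t m) { return m << 1; }    // toward higher lanes
uint64_t SlideMask1Down64(uint64_t m) { return m >> 1; }  // toward lane 0

int main() {
  const uint64_t m = CombineMasks64(0x80000000u, 0x1u);
  printf("%016llX %08X\n", (unsigned long long)SlideMask1Up64(m),
         UpperHalfOfMask64(m));  // 0000000000000002 80000000
  return 0;
}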
@@ -2344,7 +3005,7 @@ HWY_API Vec512<int32_t> BroadcastSignBit(Vec512<int32_t> v) {
 }
 
 HWY_API Vec512<int64_t> BroadcastSignBit(Vec512<int64_t> v) {
-  return
+  return ShiftRight<63>(v);
 }
 
 // ------------------------------ Floating-point classification (Not)
@@ -2356,6 +3017,15 @@ HWY_API Mask512<float16_t> IsNaN(Vec512<float16_t> v) {
       v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
 }
 
+HWY_API Mask512<float16_t> IsEitherNaN(Vec512<float16_t> a,
+                                       Vec512<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+  return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+  HWY_DIAGNOSTICS(pop)
+}
+
 HWY_API Mask512<float16_t> IsInf(Vec512<float16_t> v) {
   return Mask512<float16_t>{_mm512_fpclass_ph_mask(v.raw, 0x18)};
 }
@@ -2379,6 +3049,14 @@ HWY_API Mask512<double> IsNaN(Vec512<double> v) {
|
|
|
2379
3049
|
v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
|
|
2380
3050
|
}
|
|
2381
3051
|
|
|
3052
|
+
HWY_API Mask512<float> IsEitherNaN(Vec512<float> a, Vec512<float> b) {
|
|
3053
|
+
return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3054
|
+
}
|
|
3055
|
+
|
|
3056
|
+
HWY_API Mask512<double> IsEitherNaN(Vec512<double> a, Vec512<double> b) {
|
|
3057
|
+
return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
|
|
3058
|
+
}
|
|
3059
|
+
|
|
2382
3060
|
HWY_API Mask512<float> IsInf(Vec512<float> v) {
|
|
2383
3061
|
return Mask512<float>{_mm512_fpclass_ps_mask(
|
|
2384
3062
|
v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
|
|
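Note: _CMP_UNORD_Q is the "unordered, quiet" comparison predicate, which is
true per lane exactly when at least one operand is NaN. A scalar sketch of the
per-lane semantics of the IsEitherNaN additions above (illustrative, not part
of the diff):

    #include <cmath>

    // Per-lane meaning of IsEitherNaN(a, b) with _CMP_UNORD_Q:
    bool IsEitherNaNModel(float a, float b) {
      return std::isnan(a) || std::isnan(b);
    }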
@@ -2410,16 +3088,13 @@ HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
   return VFromD<D>{_mm512_load_si512(aligned)};
 }
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
-HWY_API Vec512<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
+HWY_API Vec512<float16_t> Load(D /* tag */,
+                               const float16_t* HWY_RESTRICT aligned) {
   return Vec512<float16_t>{_mm512_load_ph(aligned)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API Vec512<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
   return Vec512<float>{_mm512_load_ps(aligned)};
@@ -2435,16 +3110,12 @@ HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
 }
 
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API Vec512<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API Vec512<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
   return Vec512<float16_t>{_mm512_loadu_ph(p)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64)>
 HWY_API Vec512<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
   return Vec512<float>{_mm512_loadu_ps(p)};
@@ -2506,8 +3177,9 @@ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                                const TFromD<D>* HWY_RESTRICT p) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  return
-
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_mask_loadu_epi16(
+             BitCast(du, v).raw, m.raw, reinterpret_cast<const uint16_t*>(p))});
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
@@ -2539,10 +3211,12 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, Mask512<double> m, D /* tag */,
 // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> LoadDup128(D
-
+HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
   const Full128<TFromD<D>> d128;
-
+  const RebindToUnsigned<decltype(d128)> du128;
+  return BitCast(d, VFromD<decltype(du)>{_mm512_broadcast_i32x4(
+                        BitCast(du128, LoadU(d128, p)).raw)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> LoadDup128(D /* tag */, const float* HWY_RESTRICT p) {
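Note: LoadDup128 reads one 128-bit block and repeats it across all four blocks
of the 512-bit vector (here via _mm512_broadcast_i32x4). A scalar model of the
effect (illustrative, not part of the diff):

    #include <cstdint>
    #include <cstring>

    // The same 16 loaded bytes become every 128-bit block of the output.
    void LoadDup128Model(const uint8_t* p, uint8_t out[64]) {
      for (int block = 0; block < 4; ++block) {
        std::memcpy(out + 16 * block, p, 16);
      }
    }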
@@ -2563,15 +3237,13 @@ HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
   _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
 }
 // bfloat16_t is handled by x86_128-inl.h.
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
 HWY_API void Store(Vec512<float16_t> v, D /* tag */,
                    float16_t* HWY_RESTRICT aligned) {
-#if HWY_HAVE_FLOAT16
   _mm512_store_ph(aligned, v.raw);
-#else
-  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
-#endif
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API void Store(Vec512<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
   _mm512_store_ps(aligned, v.raw);
@@ -2586,15 +3258,13 @@ HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
   _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
 }
 // bfloat16_t is handled by x86_128-inl.h.
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
 HWY_API void StoreU(Vec512<float16_t> v, D /* tag */,
                     float16_t* HWY_RESTRICT p) {
-#if HWY_HAVE_FLOAT16
   _mm512_storeu_ph(p, v.raw);
-#else
-  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API void StoreU(Vec512<float> v, D /* tag */, float* HWY_RESTRICT p) {
@@ -2756,84 +3426,81 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
 namespace detail {
 
 template <int kScale, typename T, HWY_IF_UI32(T)>
-HWY_INLINE Vec512<T>
-
-  return Vec512<T>{_mm512_i32gather_epi32(
+HWY_INLINE Vec512<T> NativeGather512(const T* HWY_RESTRICT base,
+                                     Vec512<int32_t> indices) {
+  return Vec512<T>{_mm512_i32gather_epi32(indices.raw, base, kScale)};
 }
 
 template <int kScale, typename T, HWY_IF_UI64(T)>
-HWY_INLINE Vec512<T>
-
-  return Vec512<T>{_mm512_i64gather_epi64(
+HWY_INLINE Vec512<T> NativeGather512(const T* HWY_RESTRICT base,
+                                     Vec512<int64_t> indices) {
+  return Vec512<T>{_mm512_i64gather_epi64(indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<float>
-
-  return Vec512<float>{_mm512_i32gather_ps(
+HWY_INLINE Vec512<float> NativeGather512(const float* HWY_RESTRICT base,
+                                         Vec512<int32_t> indices) {
+  return Vec512<float>{_mm512_i32gather_ps(indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<double>
-
-  return Vec512<double>{_mm512_i64gather_pd(
+HWY_INLINE Vec512<double> NativeGather512(const double* HWY_RESTRICT base,
+                                          Vec512<int64_t> indices) {
+  return Vec512<double>{_mm512_i64gather_pd(indices.raw, base, kScale)};
 }
 
 template <int kScale, typename T, HWY_IF_UI32(T)>
-HWY_INLINE Vec512<T>
-
-
-  const Full512<T> d;
+HWY_INLINE Vec512<T> NativeMaskedGatherOr512(Vec512<T> no, Mask512<T> m,
+                                             const T* HWY_RESTRICT base,
+                                             Vec512<int32_t> indices) {
   return Vec512<T>{
-      _mm512_mask_i32gather_epi32(
+      _mm512_mask_i32gather_epi32(no.raw, m.raw, indices.raw, base, kScale)};
 }
 
 template <int kScale, typename T, HWY_IF_UI64(T)>
-HWY_INLINE Vec512<T>
-
-
-  const Full512<T> d;
+HWY_INLINE Vec512<T> NativeMaskedGatherOr512(Vec512<T> no, Mask512<T> m,
+                                             const T* HWY_RESTRICT base,
+                                             Vec512<int64_t> indices) {
   return Vec512<T>{
-      _mm512_mask_i64gather_epi64(
+      _mm512_mask_i64gather_epi64(no.raw, m.raw, indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<float>
-
-
-
+HWY_INLINE Vec512<float> NativeMaskedGatherOr512(Vec512<float> no,
+                                                 Mask512<float> m,
+                                                 const float* HWY_RESTRICT base,
+                                                 Vec512<int32_t> indices) {
   return Vec512<float>{
-      _mm512_mask_i32gather_ps(
+      _mm512_mask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
 }
 
 template <int kScale>
-HWY_INLINE Vec512<double>
-
-
-  const Full512<double> d;
+HWY_INLINE Vec512<double> NativeMaskedGatherOr512(
+    Vec512<double> no, Mask512<double> m, const double* HWY_RESTRICT base,
+    Vec512<int64_t> indices) {
   return Vec512<double>{
-      _mm512_mask_i64gather_pd(
+      _mm512_mask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
 }
 }  // namespace detail
 
-template <class D, HWY_IF_V_SIZE_D(D, 64)
-HWY_API VFromD<D> GatherOffset(D /*
-
-
-  return detail::NativeGather<1>(base, offset);
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
+                               VFromD<RebindToSigned<D>> offsets) {
+  return detail::NativeGather512<1>(base, offsets);
 }
-
-
-
-
-  return detail::
+
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
+                              VFromD<RebindToSigned<D>> indices) {
+  return detail::NativeGather512<sizeof(TFromD<D>)>(base, indices);
 }
-
-
-
-
-
-  return detail::
+
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
+                                      const TFromD<D>* HWY_RESTRICT base,
+                                      VFromD<RebindToSigned<D>> indices) {
+  return detail::NativeMaskedGatherOr512<sizeof(TFromD<D>)>(no, m, base,
+                                                            indices);
 }
 
 HWY_DIAGNOSTICS(pop)
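Note: the renamed NativeGather512 helpers pass the element size as the
hardware scale factor, so GatherIndex(base, indices) loads base[indices[i]]
per lane, and the masked variant keeps the "no" value in inactive lanes. A
scalar sketch for 32-bit lanes (illustrative names, not part of the diff):

    #include <cstddef>
    #include <cstdint>

    // out[i] = base[indices[i]]; the intrinsic scales indices by sizeof(T).
    void GatherIndexModel(const int32_t* base, const int32_t indices[16],
                          int32_t out[16]) {
      for (size_t i = 0; i < 16; ++i) out[i] = base[indices[i]];
    }

    // Masked variant: lanes whose mask bit is clear keep the "no" value.
    void MaskedGatherIndexOrModel(const int32_t* base, uint16_t mask,
                                  const int32_t no[16],
                                  const int32_t indices[16], int32_t out[16]) {
      for (size_t i = 0; i < 16; ++i) {
        out[i] = ((mask >> i) & 1) ? base[indices[i]] : no[i];
      }
    }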
@@ -2878,7 +3545,7 @@ HWY_API Vec256<T> LowerHalf(Vec512<T> v) {
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  const Twice<decltype(
+  const Twice<decltype(du)> dut;
   return BitCast(d, VFromD<decltype(du)>{
                         _mm512_extracti32x8_epi32(BitCast(dut, v).raw, 1)});
 }
@@ -2920,7 +3587,11 @@ HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
 template <int kBlockIdx, class T, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
 HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
   static_assert(kBlockIdx <= 3, "Invalid block index");
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(Full128<T>(),
+                 Vec128<MakeUnsigned<T>>{
+                     _mm512_extracti32x4_epi32(BitCast(du, v).raw, kBlockIdx)});
 }
 
 template <int kBlockIdx, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
@@ -2955,8 +3626,13 @@ HWY_INLINE Vec512<T> InsertBlock(hwy::SizeTag<0> /* blk_idx_tag */, Vec512<T> v,
 template <size_t kBlockIdx, typename T>
 HWY_INLINE Vec512<T> InsertBlock(hwy::SizeTag<kBlockIdx> /* blk_idx_tag */,
                                  Vec512<T> v, Vec128<T> blk_to_insert) {
-
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  const Full128<MakeUnsigned<T>> du_blk_to_insert;
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_inserti32x4(
+             BitCast(du, v).raw, BitCast(du_blk_to_insert, blk_to_insert).raw,
+             static_cast<int>(kBlockIdx & 3))});
 }
 
 template <size_t kBlockIdx, hwy::EnableIf<kBlockIdx != 0>* = nullptr>
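Note: ExtractBlock/InsertBlock view the 512-bit vector as four independent
128-bit blocks and copy one of them. A scalar model of both (illustrative, not
part of the diff):

    #include <cstdint>
    #include <cstring>

    // A 512-bit vector is four 16-byte blocks; block_idx selects one of them.
    void ExtractBlockModel(const uint8_t v[64], int block_idx,
                           uint8_t out[16]) {
      std::memcpy(out, v + 16 * block_idx, 16);
    }

    void InsertBlockModel(uint8_t v[64], int block_idx,
                          const uint8_t blk[16]) {
      std::memcpy(v + 16 * block_idx, blk, 16);
    }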
@@ -2992,7 +3668,7 @@ HWY_API T GetLane(const Vec512<T> v) {
 
 // ------------------------------ ZeroExtendVector
 
-template <class D, HWY_IF_V_SIZE_D(D, 64),
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
 #if HWY_HAVE_ZEXT  // See definition/comment in x86_256-inl.h.
   (void)d;
@@ -3042,11 +3718,13 @@ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
     DTo d_to, DFrom d_from, VFromD<DFrom> v) {
   const Repartition<uint8_t, decltype(d_from)> du8_from;
   const auto vu8 = BitCast(du8_from, v);
+  const RebindToUnsigned<decltype(d_to)> du_to;
 #if HWY_HAVE_ZEXT
-  (
-
+  return BitCast(d_to,
+                 VFromD<decltype(du_to)>{_mm512_zextsi128_si512(vu8.raw)});
 #else
-  return
+  return BitCast(d_to, VFromD<decltype(du_to)>{
+                           _mm512_inserti32x4(Zero(du_to).raw, vu8.raw, 0)});
 #endif
 }
@@ -3096,7 +3774,8 @@ HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
   const Half<decltype(du)> duh;
   const __m512i lo512 = ZeroExtendVector(du, BitCast(duh, lo)).raw;
-  return
+  return BitCast(d, VFromD<decltype(du)>{
+                        _mm512_inserti32x8(lo512, BitCast(duh, hi).raw, 1)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
@@ -3181,7 +3860,11 @@ HWY_API Vec512<double> Broadcast(const Vec512<double> v) {
 template <int kBlockIdx, class T>
 HWY_API Vec512<T> BroadcastBlock(Vec512<T> v) {
   static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index");
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+             BitCast(du, v).raw, BitCast(du, v).raw, 0x55 * kBlockIdx)});
 }
 
 template <int kBlockIdx>
@@ -3209,7 +3892,10 @@ HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
 template <class T, HWY_IF_T_SIZE(T, 2)>
 HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                    Vec512<T> v) {
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm512_broadcastw_epi16(
+                        ResizeBitCast(Full128<uint16_t>(), v).raw)});
 }
 
 template <class T, HWY_IF_UI32(T)>
@@ -3671,8 +4357,11 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
 
 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatLowerLower(D
-
+HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BABA)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ConcatLowerLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3686,8 +4375,11 @@ HWY_API Vec512<double> ConcatLowerLower(D /* tag */, Vec512<double> hi,
 
 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperUpper(D
-
+HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_DCDC)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ConcatUpperUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3701,8 +4393,11 @@ HWY_API Vec512<double> ConcatUpperUpper(D /* tag */, Vec512<double> hi,
 
 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatLowerUpper(D
-
+HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BADC)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ConcatLowerUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3716,11 +4411,13 @@ HWY_API Vec512<double> ConcatLowerUpper(D /* tag */, Vec512<double> hi,
 
 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperLower(D
+HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
   // There are no imm8 blend in AVX512. Use blend16 because 32-bit masks
   // are efficiently loaded from 32-bit regs.
   const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
-
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm512_mask_blend_epi16(
+                        mask, BitCast(du, hi).raw, BitCast(du, lo).raw)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ConcatUpperLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
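Note: the Concat* rewrites above all lean on _mm512_shuffle_i32x4, which
selects 128-bit blocks by 2-bit fields of an immediate; the two low fields
index the first operand, the two high fields the second. A scalar model,
assuming the documented encoding (e.g. _MM_PERM_BABA == 0x44 picks the lower
halves of both inputs; illustrative, not part of the diff):

    #include <cstdint>
    #include <cstring>

    // Model of _mm512_shuffle_i32x4(lo, hi, imm): output block i takes block
    // ((imm >> 2*i) & 3) of lo (for i = 0, 1) or of hi (for i = 2, 3).
    void Shuffle128Model(const uint8_t lo[64], const uint8_t hi[64],
                         uint8_t imm, uint8_t out[64]) {
      for (int i = 0; i < 4; ++i) {
        const int sel = (imm >> (2 * i)) & 3;
        const uint8_t* src = (i < 2) ? lo : hi;
        std::memcpy(out + 16 * i, src + 16 * sel, 16);
      }
    }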
@@ -3814,71 +4511,195 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   const RebindToUnsigned<decltype(d)> du;
 #if HWY_TARGET <= HWY_AVX3_DL
   alignas(64) static constexpr uint8_t kIdx[64] = {
-      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
-      26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
-      52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
-      78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
-      104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
-  return BitCast(
-      d, Vec512<uint32_t>{_mm512_permutex2var_epi8(
-             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
+      26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
+      52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
+      78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
+      104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
+  return BitCast(
+      d, Vec512<uint32_t>{_mm512_permutex2var_epi8(
+             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+#else
+  const RepartitionToWide<decltype(du)> dw;
+  // Isolate lower 8 bits per u16 so we can pack.
+  const Vec512<uint16_t> mask = Set(dw, 0x00FF);
+  const Vec512<uint16_t> uH = And(BitCast(dw, hi), mask);
+  const Vec512<uint16_t> uL = And(BitCast(dw, lo), mask);
+  const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
+  // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
+  const Full512<uint64_t> du64;
+  alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+  return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint16_t kIdx[32] = {
+      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+      32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
+  return BitCast(
+      d, Vec512<uint32_t>{_mm512_permutex2var_epi16(
+             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint32_t kIdx[16] = {
+      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+  return BitCast(
+      d, Vec512<uint32_t>{_mm512_permutex2var_epi32(
+             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint32_t kIdx[16] = {
+      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+  return VFromD<D>{_mm512_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
+  return BitCast(
+      d, Vec512<uint64_t>{_mm512_permutex2var_epi64(
+             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
+  return VFromD<D>{_mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
+}
+
+// ------------------------------ InterleaveWholeLower
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_TARGET <= HWY_AVX3_DL
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint8_t kIdx[64] = {
+      0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71,
+      8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79,
+      16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87,
+      24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95};
+  return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+#else
+  alignas(64) static constexpr uint64_t kIdx2[8] = {0, 1, 8, 9, 2, 3, 10, 11};
+  const Repartition<uint64_t, decltype(d)> du64;
+  return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
+                                             Load(du64, kIdx2).raw,
+                                             InterleaveUpper(d, a, b).raw)};
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint16_t kIdx[32] = {
+      0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+      8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
+             BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
+                                                    4, 20, 5, 21, 6, 22, 7, 23};
+  return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
+                                                    4, 20, 5, 21, 6, 22, 7, 23};
+  return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+  return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+  return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+// ------------------------------ InterleaveWholeUpper
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_TARGET <= HWY_AVX3_DL
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(64) static constexpr uint8_t kIdx[64] = {
+      32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103,
+      40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111,
+      48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119,
+      56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127};
+  return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
 #else
-
-
-
-
-
-  const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
-  // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
-  const Full512<uint64_t> du64;
-  alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
+  alignas(64) static constexpr uint64_t kIdx2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
+  const Repartition<uint64_t, decltype(d)> du64;
+  return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
+                                             Load(du64, kIdx2).raw,
+                                             InterleaveUpper(d, a, b).raw)};
 #endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
-HWY_API VFromD<D>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
   const RebindToUnsigned<decltype(d)> du;
   alignas(64) static constexpr uint16_t kIdx[32] = {
-
-
+      16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+      24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
   return BitCast(
-      d,
-      BitCast(du,
+      d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
+             BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
-HWY_API VFromD<D>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
   const RebindToUnsigned<decltype(d)> du;
   alignas(64) static constexpr uint32_t kIdx[16] = {
-
-  return
-      d, Vec512<uint32_t>{_mm512_permutex2var_epi32(
-             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+      8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+  return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
-HWY_API VFromD<D>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
   const RebindToUnsigned<decltype(d)> du;
   alignas(64) static constexpr uint32_t kIdx[16] = {
-
-  return VFromD<D>{_mm512_permutex2var_ps(
+      8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+  return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
-HWY_API VFromD<D>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
   const RebindToUnsigned<decltype(d)> du;
-  alignas(64) static constexpr uint64_t kIdx[8] = {
-  return
-      d, Vec512<uint64_t>{_mm512_permutex2var_epi64(
-             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+  alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+  return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
-HWY_API VFromD<D>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
   const RebindToUnsigned<decltype(d)> du;
-  alignas(64) static constexpr uint64_t kIdx[8] = {
-  return VFromD<D>{_mm512_permutex2var_pd(
+  alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+  return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
 }
 
 // ------------------------------ DupEven (InterleaveLower)
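Note: all of the ConcatEven/InterleaveWhole* index tables above drive
_mm512_permutex2var_*, a two-vector table lookup: each index addresses the
concatenation {a, b}, with the high index bit selecting the second vector. A
scalar model for 32-bit lanes (illustrative, not part of the diff):

    #include <cstdint>

    // Model of _mm512_permutex2var_epi32: idx forms a 32-entry table over
    // {a, b}; bit 4 of each index selects b, bits 0-3 select the lane.
    void Permutex2VarModel(const uint32_t a[16], const uint32_t idx[16],
                           const uint32_t b[16], uint32_t out[16]) {
      for (int i = 0; i < 16; ++i) {
        const uint32_t j = idx[i] & 31;
        out[i] = (j < 16) ? a[j] : b[j - 16];
      }
    }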
@@ -3922,11 +4743,44 @@ HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {
   return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
 }
 
+// -------------------------- InterleaveEven
+
+template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm512_mask_shuffle_epi32(
+      a.raw, static_cast<__mmask16>(0xAAAA), b.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
+}
+template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm512_mask_shuffle_ps(a.raw, static_cast<__mmask16>(0xAAAA),
+                                          b.raw, b.raw,
+                                          _MM_SHUFFLE(2, 2, 0, 0))};
+}
+// -------------------------- InterleaveOdd
+
+template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm512_mask_shuffle_epi32(
+      b.raw, static_cast<__mmask16>(0x5555), a.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
+}
+template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm512_mask_shuffle_ps(b.raw, static_cast<__mmask16>(0x5555),
+                                          a.raw, a.raw,
+                                          _MM_SHUFFLE(3, 3, 1, 1))};
+}
+
 // ------------------------------ OddEvenBlocks
 
 template <typename T>
 HWY_API Vec512<T> OddEvenBlocks(Vec512<T> odd, Vec512<T> even) {
-
+  const DFromV<decltype(odd)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm512_mask_blend_epi64(
+             __mmask8{0x33u}, BitCast(du, odd).raw, BitCast(du, even).raw)});
 }
 
 HWY_API Vec512<float> OddEvenBlocks(Vec512<float> odd, Vec512<float> even) {
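Note: the masked shuffles above duplicate the even (or odd) source lanes into
the other parity, then blend under a 0xAAAA/0x5555 lane mask. The net per-lane
effect, sketched for 32-bit lanes (illustrative, not part of the diff):

    #include <cstdint>

    // InterleaveEven: even output lanes come from a, odd ones from the
    // corresponding even lane of b. (InterleaveOdd mirrors this with the
    // odd source lanes.)
    void InterleaveEvenModel(const uint32_t a[16], const uint32_t b[16],
                             uint32_t out[16]) {
      for (int i = 0; i < 16; i += 2) {
        out[i] = a[i];
        out[i + 1] = b[i];
      }
    }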
@@ -3943,7 +4797,11 @@ HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) {
 
 template <typename T>
 HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_CDAB)});
 }
 
 HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) {
@@ -3957,8 +4815,11 @@ HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
 // ------------------------------ ReverseBlocks
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ReverseBlocks(D
-
+HWY_API VFromD<D> ReverseBlocks(D d, VFromD<D> v) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm512_shuffle_i32x4(
+                     BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_ABCD)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
@@ -3974,7 +4835,10 @@ HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
 // Both full
 template <typename T, typename TI>
 HWY_API Vec512<TI> TableLookupBytes(Vec512<T> bytes, Vec512<TI> indices) {
-
+  const DFromV<decltype(indices)> d;
+  return BitCast(d, Vec512<uint8_t>{_mm512_shuffle_epi8(
+                        BitCast(Full512<uint8_t>(), bytes).raw,
+                        BitCast(Full512<uint8_t>(), indices).raw)});
 }
 
 // Partial index vector
@@ -4632,6 +5496,15 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<float16_t> v) {
 #endif  // HWY_HAVE_FLOAT16
 }
 
+#if HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec128<float16_t> v) {
+  return VFromD<D>{_mm512_cvtph_pd(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> PromoteTo(D df32, Vec256<bfloat16_t> v) {
   const Rebind<uint16_t, decltype(df32)> du16;
@@ -4655,19 +5528,76 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint32_t> v) {
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
-HWY_API VFromD<D>
-
-
-
+HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior with GCC if any values of v[i] are not
+  // within the range of an int64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+    typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
+    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+    return VFromD<D>{_mm512_setr_epi64(
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[4]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[5]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[6]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[7]))};
+  }
+#endif
 
-
-
-
+  __m512i raw_result;
+  __asm__("vcvttps2qq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else
+  return VFromD<D>{_mm512_cvttps_epi64(v.raw)};
+#endif
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
-HWY_API VFromD<D>
-
-
+HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior with GCC if any values of v[i] are not
+  // within the range of an uint64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+    typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
+    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+    return VFromD<D>{_mm512_setr_epi64(
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[4])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[5])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[6])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[7])))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvttps2uqq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else
+  return VFromD<D>{_mm512_cvttps_epu64(v.raw)};
+#endif
 }
 
 // ------------------------------ Demotions (full -> part w/ narrow lanes)
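Note: the "InRange" in PromoteInRangeTo reflects the scalar C++ rule the
workaround comments cite: converting a floating-point value to an integer type
is undefined behavior when the truncated value does not fit, which is why
GCC's constant folding of the plain intrinsic can misbehave and why the value
is routed through inline asm instead. A scalar analog (illustrative, not part
of the diff):

    #include <cstdint>

    // Caller must guarantee the value fits; outside [-2^63, 2^63) the
    // static_cast below would itself be UB, mirroring the intrinsic's
    // "in range" precondition.
    int64_t PromoteInRangeModel(float f) {
      return static_cast<int64_t>(f);  // truncates toward zero
    }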
@@ -4709,8 +5639,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
   const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
   const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
 
-
-  const auto idx32 = LoadDup128(du32, kLanes);
+  const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
   const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
   return LowerHalf(LowerHalf(fixed));
 }
@@ -4745,9 +5674,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
   const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
   const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
 
-
-      0, 4, 8, 12, 0, 4, 8, 12};
-  const auto idx32 = LoadDup128(du32, kLanes);
+  const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
   const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
   return LowerHalf(LowerHalf(fixed));
 }
@@ -4779,32 +5706,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
   return VFromD<D>{_mm512_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
  return VFromD<D>{_mm512_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
   return VFromD<D>{_mm512_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
 }
@@ -4822,32 +5734,55 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<uint64_t> v) {
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API VFromD<D> DemoteTo(D
+HWY_API VFromD<D> DemoteTo(D df16, Vec512<float> v) {
   // Work around warnings in the intrinsic definitions (passing -1 as a mask).
   HWY_DIAGNOSTICS(push)
   HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
-
+  const RebindToUnsigned<decltype(df16)> du16;
+  return BitCast(
+      df16, VFromD<decltype(du16)>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
   HWY_DIAGNOSTICS(pop)
 }
 
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec512<double> v) {
+  return VFromD<D>{_mm512_cvtpd_ph(v.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+#if HWY_AVX3_HAVE_F32_TO_BF16C
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
-HWY_API VFromD<D> DemoteTo(D dbf16
-
-
-
-
-
+HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec512<float> v) {
+#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
+  // Inline assembly workaround for LLVM codegen bug
+  __m256i raw_result;
+  __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
+  return VFromD<D>{raw_result};
+#else
+  // The _mm512_cvtneps_pbh intrinsic returns a __m256bh vector that needs to be
+  // bit casted to a __m256i vector
+  return VFromD<D>{detail::BitCastToInteger(_mm512_cvtneps_pbh(v.raw))};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
-HWY_API VFromD<D> ReorderDemote2To(D dbf16
-
-
-
-
-
+HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec512<float> a,
+                                   Vec512<float> b) {
+#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
+  // Inline assembly workaround for LLVM codegen bug
+  __m512i raw_result;
+  __asm__("vcvtne2ps2bf16 %2, %1, %0"
+          : "=v"(raw_result)
+          : "v"(b.raw), "v"(a.raw));
+  return VFromD<D>{raw_result};
+#else
+  // The _mm512_cvtne2ps_pbh intrinsic returns a __m512bh vector that needs to
+  // be bit casted to a __m512i vector
+  return VFromD<D>{detail::BitCastToInteger(_mm512_cvtne2ps_pbh(b.raw, a.raw))};
+#endif
 }
+#endif  // HWY_AVX3_HAVE_F32_TO_BF16C
 
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec512<int32_t> a,
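Note: the "ne" in vcvtneps2bf16 is round-to-nearest-even; bfloat16 is simply
the top 16 bits of a binary32 after rounding. A scalar model of the conversion
(ignoring NaN handling for brevity; illustrative, not part of the diff):

    #include <cstdint>
    #include <cstring>

    // f32 -> bf16 with round-to-nearest-even: add a bias that rounds ties
    // toward an even low bit, then keep the upper 16 bits.
    uint16_t F32ToBF16Model(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      const uint32_t rounding = 0x7FFFu + ((bits >> 16) & 1u);
      return static_cast<uint16_t>((bits + rounding) >> 16);
    }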
@@ -4935,16 +5870,77 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
-HWY_API VFromD<D>
-
-
+HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvttpd_epi32 with GCC if any
+  // values of v[i] are not within the range of an int32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return VFromD<D>{_mm256_setr_epi32(
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
+  }
+#endif
+
+  __m256i raw_result;
+  __asm__("vcvttpd2dq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else
+  return VFromD<D>{_mm512_cvttpd_epi32(v.raw)};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
-HWY_API VFromD<D>
-
-
+HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvttpd_epu32 with GCC if any
+  // values of v[i] are not within the range of an uint32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return VFromD<D>{_mm256_setr_epi32(
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])))};
+  }
+#endif
+
+  __m256i raw_result;
+  __asm__("vcvttpd2udq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else
+  return VFromD<D>{_mm512_cvttpd_epu32(v.raw)};
+#endif
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
@@ -4962,13 +5958,12 @@ HWY_API Vec128<uint8_t> U8FromU32(const Vec512<uint32_t> v) {
|
|
|
4962
5958
|
const DFromV<decltype(v)> d32;
|
|
4963
5959
|
// In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
|
|
4964
5960
|
// lowest 4 bytes.
|
|
4965
|
-
|
|
4966
|
-
|
|
4967
|
-
const auto quads = TableLookupBytes(v,
|
|
5961
|
+
const VFromD<decltype(d32)> v8From32 =
|
|
5962
|
+
Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
|
|
5963
|
+
const auto quads = TableLookupBytes(v, v8From32);
|
|
4968
5964
|
// Gather the lowest 4 bytes of 4 128-bit blocks.
|
|
4969
|
-
|
|
4970
|
-
const Vec512<uint8_t> bytes{
|
|
4971
|
-
_mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
|
|
5965
|
+
const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
|
|
5966
|
+
const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
|
|
4972
5967
|
return LowerHalf(LowerHalf(bytes));
|
|
4973
5968
|
}
|
|
4974
5969
|
|
|
@@ -4979,10 +5974,9 @@ HWY_API VFromD<D> TruncateTo(D d, const Vec512<uint64_t> v) {
|
|
|
4979
5974
|
#if HWY_TARGET <= HWY_AVX3_DL
|
|
4980
5975
|
(void)d;
|
|
4981
5976
|
const Full512<uint8_t> d8;
|
|
4982
|
-
|
|
4983
|
-
0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56
|
|
4984
|
-
const Vec512<uint8_t> bytes{
|
|
4985
|
-
_mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)};
|
|
5977
|
+
const VFromD<decltype(d8)> v8From64 = Dup128VecFromValues(
|
|
5978
|
+
d8, 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56);
|
|
5979
|
+
const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From64.raw, v.raw)};
|
|
4986
5980
|
return LowerHalf(LowerHalf(LowerHalf(bytes)));
|
|
4987
5981
|
#else
|
|
4988
5982
|
const Full512<uint32_t> d32;
|
|
@@ -5018,21 +6012,19 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
|
|
|
5018
6012
|
HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint32_t> v) {
|
|
5019
6013
|
#if HWY_TARGET <= HWY_AVX3_DL
|
|
5020
6014
|
const Full512<uint8_t> d8;
|
|
5021
|
-
|
|
5022
|
-
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
|
|
5023
|
-
const Vec512<uint8_t> bytes{
|
|
5024
|
-
_mm512_permutexvar_epi8(LoadDup128(d8, k8From32).raw, v.raw)};
|
|
6015
|
+
const VFromD<decltype(d8)> v8From32 = Dup128VecFromValues(
|
|
6016
|
+
d8, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
|
|
6017
|
+
const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From32.raw, v.raw)};
|
|
5025
6018
|
#else
|
|
5026
6019
|
const Full512<uint32_t> d32;
|
|
5027
6020
|
// In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
|
|
5028
6021
|
// lowest 4 bytes.
|
|
5029
|
-
|
|
5030
|
-
|
|
5031
|
-
const auto quads = TableLookupBytes(v,
|
|
6022
|
+
const VFromD<decltype(d32)> v8From32 =
|
|
6023
|
+
Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
|
|
6024
|
+
const auto quads = TableLookupBytes(v, v8From32);
|
|
5032
6025
|
// Gather the lowest 4 bytes of 4 128-bit blocks.
|
|
5033
|
-
|
|
5034
|
-
const Vec512<uint8_t> bytes{
|
|
5035
|
-
_mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
|
|
6026
|
+
const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
|
|
6027
|
+
const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
|
|
5036
6028
|
#endif
|
|
5037
6029
|
return LowerHalf(LowerHalf(bytes));
|
|
5038
6030
|
}
|
|
@@ -5061,9 +6053,9 @@ HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint16_t> v) {
       _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
 #else
   const Full512<uint32_t> d32;
-  alignas(16) static constexpr uint32_t k16From32[4] = {
-      0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u};
-  const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32));
+  const VFromD<decltype(d32)> v16From32 = Dup128VecFromValues(
+      d32, 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u);
+  const auto quads = TableLookupBytes(v, v16From32);
   alignas(64) static constexpr uint32_t kIndex32[16] = {
       0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
   const Vec512<uint8_t> bytes{
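The recurring change in the TruncateTo hunks above (and in the AESKeyGenAssist and Shl hunks further down) is mechanical: a `static constexpr` lane table loaded via `LoadDup128` becomes a single `Dup128VecFromValues` call, which builds the same per-128-bit-block constant directly from immediates, with no named array. A minimal sketch of the before/after shape using highway's public API; the function names are invented for illustration, and the usual per-target namespace boilerplate is omitted:

```cpp
#include <cstdint>

#include <hwy/highway.h>

namespace hn = hwy::HWY_NAMESPACE;

// Old pattern: materialize the 128-bit constant in static storage, then
// broadcast-load it into every 128-bit block of the vector.
template <class D>
hn::VFromD<D> BlockIndicesOld(D d32) {
  alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
  return hn::LoadDup128(d32, kIndex32);
}

// New pattern: the same value, built directly from the four lane values.
template <class D>
hn::VFromD<D> BlockIndicesNew(D d32) {
  return hn::Dup128VecFromValues(d32, 0, 4, 8, 12);
}
```

Besides dropping the table, this can let the compiler materialize the constant however is cheapest for the target instead of always loading from memory.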
@@ -5108,36 +6100,362 @@ HWY_API VFromD<D> ConvertTo(D /* tag*/, Vec512<uint64_t> v) {
 // Truncates (rounds toward zero).
 #if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
-HWY_API VFromD<D> ConvertTo(D /*d*/, Vec512<float16_t> v) {
-  return VFromD<D>{_mm512_cvttph_epi16(v.raw)};
+HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float16_t> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvttph_epi16 with GCC if any
+  // values of v[i] are not within the range of an int16_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+    HWY_HAVE_SCALAR_F16_TYPE
+  if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
+    typedef hwy::float16_t::Native GccF16RawVectType
+        __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+    return VFromD<D>{
+        _mm512_set_epi16(detail::X86ConvertScalarFromFloat<int16_t>(raw_v[31]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[30]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[29]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[28]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[27]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[26]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[25]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[24]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[23]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[22]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[21]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[20]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[19]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[18]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[17]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[16]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[15]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[14]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[13]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[12]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[11]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[10]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[9]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[8]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
+                         detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvttph2w {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else
+  return VFromD<D>{_mm512_cvttph_epi16(v.raw)};
+#endif
+}
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
+HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvttph_epu16 with GCC if any
+  // values of v[i] are not within the range of an uint16_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+    HWY_HAVE_SCALAR_F16_TYPE
+  if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
+    typedef hwy::float16_t::Native GccF16RawVectType
+        __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+    return VFromD<D>{_mm512_set_epi16(
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[31])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[30])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[29])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[28])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[27])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[26])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[25])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[24])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[23])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[22])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[21])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[20])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[19])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[18])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[17])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[16])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[15])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[14])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[13])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[12])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[11])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[10])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[9])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[8])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1])),
+        static_cast<int16_t>(
+            detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0])))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvttph2uw {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else
+  return VFromD<D>{_mm512_cvttph_epu16(v.raw)};
+#endif
 }
 #endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
-HWY_API VFromD<D> ConvertTo(D /*d*/, Vec512<float> v) {
-  return VFromD<D>{_mm512_cvttps_epi32(v.raw)};
+HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvttps_epi32 with GCC if any
+  // values of v[i] are not within the range of an int32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+    typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+    return VFromD<D>{_mm512_setr_epi32(
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[8]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[9]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[10]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[11]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[12]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[13]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[14]),
+        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[15]))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvttps2dq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else
+  return VFromD<D>{_mm512_cvttps_epi32(v.raw)};
+#endif
 }
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
-HWY_API VFromD<D> ConvertTo(D /*di*/, Vec512<double> v) {
-  return VFromD<D>{_mm512_cvttpd_epi64(v.raw)};
+HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec512<double> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvttpd_epi64 with GCC if any
+  // values of v[i] are not within the range of an int64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return VFromD<D>{_mm512_setr_epi64(
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[4]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[5]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[6]),
+        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[7]))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvttpd2qq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<D>{raw_result};
+#else
+  return VFromD<D>{_mm512_cvttpd_epi64(v.raw)};
+#endif
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
-HWY_API VFromD<DU> ConvertTo(DU /*du*/, Vec512<float> v) {
-  return VFromD<DU>{_mm512_cvttps_epu32(v.raw)};
+HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvttps_epu32 with GCC if any
+  // values of v[i] are not within the range of an uint32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
+    typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+    return VFromD<DU>{_mm512_setr_epi32(
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[8])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[9])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[10])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[11])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[12])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[13])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[14])),
+        static_cast<int32_t>(
+            detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[15])))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvttps2udq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DU>{raw_result};
+#else
+  return VFromD<DU>{_mm512_cvttps_epu32(v.raw)};
+#endif
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
-HWY_API VFromD<DU> ConvertTo(DU /*du*/, Vec512<double> v) {
-  return VFromD<DU>{_mm512_cvttpd_epu64(v.raw)};
+HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvttpd_epu64 with GCC if any
+  // values of v[i] are not within the range of an uint64_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+    typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+    return VFromD<DU>{_mm512_setr_epi64(
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[4])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[5])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[6])),
+        static_cast<int64_t>(
+            detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[7])))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvttpd2uqq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DU>{raw_result};
+#else
+  return VFromD<DU>{_mm512_cvttpd_epu64(v.raw)};
+#endif
 }
 
-HWY_API Vec512<int32_t> NearestInt(const Vec512<float> v) {
-  return Vec512<int32_t>{_mm512_cvtps_epi32(v.raw)};
+template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I32_D(DI)>
+HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for undefined behavior in _mm512_cvtps_epi32 with GCC if any
+  // values of v[i] are not within the range of an int32_t
+
+#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+    typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
+    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+    return VFromD<DI>{
+        _mm512_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[7]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[8]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[9]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[10]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[11]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[12]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[13]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[14]),
+                          detail::X86ScalarNearestInt<int32_t>(raw_v[15]))};
+  }
+#endif
+
+  __m512i raw_result;
+  __asm__("vcvtps2dq {%1, %0|%0, %1}"
+          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+          :);
+  return VFromD<DI>{raw_result};
+#else
+  return VFromD<DI>{_mm512_cvtps_epi32(v.raw)};
+#endif
 }
 
 // ================================================== CRYPTO
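For context on the hunk above: every truncating float-to-integer `ConvertTo` becomes `ConvertInRangeTo`, and under GCC the conversion is issued through inline asm. The added comments state the reason: GCC may constant-fold intrinsics such as `_mm512_cvttps_epi32`, and that fold is undefined when a lane is outside the destination type's range, whereas the `vcvttps2dq` instruction itself is fully defined (out-of-range and NaN lanes produce the "integer indefinite" value). The `IsConstantX86VecForF2IConv` branch preserves compile-time evaluation by converting lane by lane with a scalar helper. A scalar model of one lane of `vcvttps2dq` under that documented instruction behavior; `CvttModel` is an invented name, not highway's actual helper:

```cpp
#include <cstdint>
#include <limits>

// Models one lane of vcvttps2dq: in-range values truncate toward zero,
// while out-of-range values and NaN produce 0x80000000 ("integer
// indefinite") instead of C++'s undefined behavior for the plain cast.
int32_t CvttModel(float f) {
  // 2^31 and -2^31 are exactly representable as float; NaN fails the test.
  if (!(f >= -2147483648.0f && f < 2147483648.0f)) {
    return std::numeric_limits<int32_t>::min();  // 0x80000000
  }
  return static_cast<int32_t>(f);  // defined: f is in range
}
```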
@@ -5198,14 +6516,14 @@ template <uint8_t kRcon>
 HWY_API Vec512<uint8_t> AESKeyGenAssist(Vec512<uint8_t> v) {
   const Full512<uint8_t> d;
 #if HWY_TARGET <= HWY_AVX3_DL
-  alignas(16) static constexpr uint8_t kRconXorMask[16] = {
-      0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
-  alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
-      0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
+  const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
+      d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
+  const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
+      d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
   const Repartition<uint32_t, decltype(d)> du32;
   const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
-  const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask));
-  return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle));
+  const auto sub_word_result = AESLastRound(w13, rconXorMask);
+  return TableLookupBytes(sub_word_result, rotWordShuffle);
 #else
   const Half<decltype(d)> d2;
   return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
@@ -5253,6 +6571,29 @@ HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
 
 // ================================================== MISC
 
+// ------------------------------ SumsOfAdjQuadAbsDiff (Broadcast,
+// SumsOfAdjShufQuadAbsDiff)
+
+template <int kAOffset, int kBOffset>
+static Vec512<uint16_t> SumsOfAdjQuadAbsDiff(Vec512<uint8_t> a,
+                                             Vec512<uint8_t> b) {
+  static_assert(0 <= kAOffset && kAOffset <= 1,
+                "kAOffset must be between 0 and 1");
+  static_assert(0 <= kBOffset && kBOffset <= 3,
+                "kBOffset must be between 0 and 3");
+
+  const DFromV<decltype(a)> d;
+  const RepartitionToWideX2<decltype(d)> du32;
+
+  // While AVX3 does not have a _mm512_mpsadbw_epu8 intrinsic, the
+  // SumsOfAdjQuadAbsDiff operation is implementable for 512-bit vectors on
+  // AVX3 using SumsOfShuffledQuadAbsDiff and U32 Broadcast.
+  return SumsOfShuffledQuadAbsDiff<kAOffset + 2, kAOffset + 1, kAOffset + 1,
+                                   kAOffset>(
+      a, BitCast(d, Broadcast<kBOffset>(BitCast(du32, b))));
+}
+
+#if !HWY_IS_MSAN
 // ------------------------------ I32/I64 SaturatedAdd (MaskFromVec)
 
 HWY_API Vec512<int32_t> SaturatedAdd(Vec512<int32_t> a, Vec512<int32_t> b) {
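The `SumsOfAdjQuadAbsDiff` added above fills in `mpsadbw`-style semantics on AVX-512, which lacks a `_mm512_mpsadbw_epu8` intrinsic: `Broadcast<kBOffset>` replicates the selected 32-bit group of `b` across each block so that `SumsOfShuffledQuadAbsDiff` compares every sliding window of `a` against the same quad. A scalar model of one 128-bit block's output, following the usual `mpsadbw` definition (helper name invented):

```cpp
#include <cstdint>
#include <cstdlib>

// One of the 8 u16 results per 128-bit block: the sum of absolute
// differences between a 4-byte window of a (starting at kAOffset*4 + i)
// and the fixed 4-byte group of b selected by kBOffset.
uint16_t QuadAbsDiffModel(const uint8_t a[16], const uint8_t b[16],
                          int kAOffset, int kBOffset, int i) {  // 0 <= i < 8
  uint16_t sum = 0;
  for (int j = 0; j < 4; ++j) {
    sum += static_cast<uint16_t>(
        std::abs(int{a[kAOffset * 4 + i + j]} - int{b[kBOffset * 4 + j]}));
  }
  return sum;
}
```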
@@ -5300,6 +6641,7 @@ HWY_API Vec512<int64_t> SaturatedSub(Vec512<int64_t> a, Vec512<int64_t> b) {
       i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
   return IfThenElse(overflow_mask, overflow_result, diff);
 }
+#endif  // !HWY_IS_MSAN
 
 // ------------------------------ Mask testing
 
@@ -6165,7 +7507,10 @@ namespace detail {
 // Type-safe wrapper.
 template <_MM_PERM_ENUM kPerm, typename T>
 Vec512<T> Shuffle128(const Vec512<T> lo, const Vec512<T> hi) {
-  return Vec512<T>{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)};
+  const DFromV<decltype(lo)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, VFromD<decltype(du)>{_mm512_shuffle_i64x2(
+                        BitCast(du, lo).raw, BitCast(du, hi).raw, kPerm)});
 }
 template <_MM_PERM_ENUM kPerm>
 Vec512<float> Shuffle128(const Vec512<float> lo, const Vec512<float> hi) {
@@ -6345,7 +7690,7 @@ HWY_API Mask512<T> SetOnlyFirst(Mask512<T> mask) {
       static_cast<typename Mask512<T>::Raw>(detail::AVX3Blsi(mask.raw))};
 }
 
-// ------------------------------ Shl (LoadDup128)
+// ------------------------------ Shl (Dup128VecFromValues)
 
 HWY_API Vec512<uint16_t> operator<<(Vec512<uint16_t> v, Vec512<uint16_t> bits) {
   return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
@@ -6356,13 +7701,15 @@ HWY_API Vec512<uint8_t> operator<<(Vec512<uint8_t> v, Vec512<uint8_t> bits) {
   const DFromV<decltype(v)> d;
 #if HWY_TARGET <= HWY_AVX3_DL
   // kMask[i] = 0xFF >> i
-  alignas(16) static constexpr uint8_t kMasks[16] = {
-      0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0, 0, 0, 0, 0, 0, 0, 0};
+  const VFromD<decltype(d)> masks =
+      Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
+                          0, 0, 0, 0, 0, 0, 0);
   // kShl[i] = 1 << i
-  alignas(16) static constexpr uint8_t kShl[16] = {
-      0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0, 0, 0, 0, 0};
-  v = And(v, TableLookupBytes(LoadDup128(d, kMasks), bits));
-  const VFromD<decltype(d)> mul = TableLookupBytes(LoadDup128(d, kShl), bits);
+  const VFromD<decltype(d)> shl =
+      Dup128VecFromValues(d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0,
+                          0, 0, 0, 0, 0, 0, 0);
+  v = And(v, TableLookupBytes(masks, bits));
+  const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
   return VFromD<decltype(d)>{_mm512_gf2p8mul_epi8(v.raw, mul.raw)};
 #else
   const Repartition<uint16_t, decltype(d)> dw;
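The AVX3_DL branch of the 8-bit left shift above leans on a GF(2^8) trick: carry-less multiplication by `1 << s` is exactly a left shift by `s`, and the preceding `And` with `0xFF >> s` clears the bits that would overflow the byte, so `_mm512_gf2p8mul_epi8` never reaches its reduction polynomial and behaves as a plain per-byte shift. A scalar check of that identity over all inputs; `ClMul8` is an invented helper:

```cpp
#include <cassert>
#include <cstdint>

// Carry-less (GF(2)) multiply of two bytes; 16-bit product, no reduction.
uint16_t ClMul8(uint8_t a, uint8_t b) {
  uint16_t p = 0;
  for (int i = 0; i < 8; ++i) {
    if (b & (1 << i)) p ^= static_cast<uint16_t>(a) << i;
  }
  return p;
}

int main() {
  for (int v = 0; v < 256; ++v) {
    for (int s = 0; s < 8; ++s) {
      const uint8_t masked = static_cast<uint8_t>(v & (0xFF >> s));
      // The masked product stays below 256, so the GF(2^8) reduction in
      // gf2p8mul would be a no-op and the multiply equals the byte shift.
      assert(ClMul8(masked, static_cast<uint8_t>(1u << s)) ==
             static_cast<uint8_t>(v << s));
    }
  }
  return 0;
}
```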
@@ -6457,64 +7804,6 @@ HWY_API Vec512<int64_t> operator>>(const Vec512<int64_t> v,
   return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
 }
 
-// ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
-
-HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
-                                    const Vec512<uint64_t> b) {
-  const DFromV<decltype(a)> du64;
-  const RepartitionToNarrow<decltype(du64)> du32;
-  const auto maskL = Set(du64, 0xFFFFFFFFULL);
-  const auto a32 = BitCast(du32, a);
-  const auto b32 = BitCast(du32, b);
-  // Inputs for MulEven: we only need the lower 32 bits
-  const auto aH = Shuffle2301(a32);
-  const auto bH = Shuffle2301(b32);
-
-  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
-  // the even (lower 64 bits of every 128-bit block) results. See
-  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
-  const auto aLbL = MulEven(a32, b32);
-  const auto w3 = aLbL & maskL;
-
-  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
-  const auto w2 = t2 & maskL;
-  const auto w1 = ShiftRight<32>(t2);
-
-  const auto t = MulEven(a32, bH) + w2;
-  const auto k = ShiftRight<32>(t);
-
-  const auto mulH = MulEven(aH, bH) + w1 + k;
-  const auto mulL = ShiftLeft<32>(t) + w3;
-  return InterleaveLower(mulL, mulH);
-}
-
-HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
-                                   const Vec512<uint64_t> b) {
-  const DFromV<decltype(a)> du64;
-  const RepartitionToNarrow<decltype(du64)> du32;
-  const auto maskL = Set(du64, 0xFFFFFFFFULL);
-  const auto a32 = BitCast(du32, a);
-  const auto b32 = BitCast(du32, b);
-  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
-  const auto aH = Shuffle2301(a32);
-  const auto bH = Shuffle2301(b32);
-
-  // Same as above, but we're using the odd results (upper 64 bits per block).
-  const auto aLbL = MulEven(a32, b32);
-  const auto w3 = aLbL & maskL;
-
-  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
-  const auto w2 = t2 & maskL;
-  const auto w1 = ShiftRight<32>(t2);
-
-  const auto t = MulEven(a32, bH) + w2;
-  const auto k = ShiftRight<32>(t);
-
-  const auto mulH = MulEven(aH, bH) + w1 + k;
-  const auto mulL = ShiftLeft<32>(t) + w3;
-  return InterleaveUpper(du64, mulL, mulH);
-}
-
 // ------------------------------ WidenMulPairwiseAdd
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
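The removed `MulEven`/`MulOdd` bodies above built a 64x64 -> 128-bit product out of 32x32 -> 64-bit `MulEven` pieces, per the Knuth `muldwu` scheme cited in the removed comment; the diff shows only the removal, so the 512-bit case is presumably now served by a shared implementation elsewhere in the library. The same decomposition in scalar form may make the vector version easier to follow:

```cpp
#include <cstdint>

// Knuth double-word multiply: full 64x64 -> 128-bit product from four
// 32x32 -> 64-bit partial products, mirroring the removed vector code.
void Mul64To128(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;

  const uint64_t aLbL = aL * bL;               // w3 = low 32 bits of this
  const uint64_t t2 = aH * bL + (aLbL >> 32);  // cannot overflow 64 bits
  const uint64_t w2 = t2 & 0xFFFFFFFFu;
  const uint64_t w1 = t2 >> 32;

  const uint64_t t = aL * bH + w2;             // cannot overflow 64 bits
  *hi = aH * bH + w1 + (t >> 32);
  *lo = (t << 32) + (aLbL & 0xFFFFFFFFu);
}
```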
@@ -6523,7 +7812,6 @@ HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
 }
 
 // ------------------------------ SatWidenMulPairwiseAdd
-
 template <class DI16, HWY_IF_V_SIZE_D(DI16, 64), HWY_IF_I16_D(DI16)>
 HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
     DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
@@ -6531,7 +7819,30 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
   return VFromD<DI16>{_mm512_maddubs_epi16(a.raw, b.raw)};
 }
 
+// ------------------------------ SatWidenMulPairwiseAccumulate
+#if HWY_TARGET <= HWY_AVX3_DL
+template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 64)>
+HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
+    DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
+    VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
+  return VFromD<DI32>{_mm512_dpwssds_epi32(sum.raw, a.raw, b.raw)};
+}
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
 // ------------------------------ ReorderWidenMulAccumulate
+
+#if HWY_NATIVE_DOT_BF16
+template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 64),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
+                                             const VFromD<DF> sum0,
+                                             VFromD<DF>& /*sum1*/) {
+  return VFromD<DF>{_mm512_dpbf16_ps(sum0.raw,
+                                     reinterpret_cast<__m512bh>(a.raw),
+                                     reinterpret_cast<__m512bh>(b.raw))};
+}
+#endif  // HWY_NATIVE_DOT_BF16
+
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec512<int16_t> a,
                                             Vec512<int16_t> b,
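`SatWidenMulPairwiseAccumulate`, added above, is a direct mapping to AVX-512 VNNI's `VPDPWSSDS` (`_mm512_dpwssds_epi32`): multiply adjacent pairs of signed 16-bit lanes, add the two products at full precision, then accumulate into the 32-bit lane with signed saturation. A scalar model of one i32 lane (function name invented):

```cpp
#include <cstdint>

// One i32 lane of _mm512_dpwssds_epi32: sum <- SatAdd(sum, a0*b0 + a1*b1).
int32_t DotPairSatModel(int32_t sum, int16_t a0, int16_t a1, int16_t b0,
                        int16_t b1) {
  // Each i16*i16 product fits in i32; their sum and the accumulator fit
  // comfortably in i64, so only the final result needs saturating.
  const int64_t acc = int64_t{sum} + int64_t{a0} * b0 + int64_t{a1} * b1;
  if (acc > INT32_MAX) return INT32_MAX;
  if (acc < INT32_MIN) return INT32_MIN;
  return static_cast<int32_t>(acc);
}
```

The bf16 `ReorderWidenMulAccumulate` overload gated on `HWY_NATIVE_DOT_BF16` works analogously for bfloat16 pairs, accumulating into f32 via `_mm512_dpbf16_ps`.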
@@ -6570,161 +7881,47 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
 
 // ------------------------------ Reductions
 
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_epi32(v.raw);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_epi64(v.raw);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw));
-}
-#if HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_ph(v.raw);
-}
-#endif  // HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_ps(v.raw);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
-HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
-  return _mm512_reduce_add_pd(v.raw);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
-HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto sum = ReduceSum(d32, even + odd);
-  return static_cast<uint16_t>(sum);
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
-HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto sum = ReduceSum(d32, even + odd);
-  return static_cast<int16_t>(sum);
-}
+namespace detail {
 
-// Returns the sum in each lane.
-template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
-  return Set(d, ReduceSum(d, v));
+// Used by generic_ops-inl
+template <class D, class Func, HWY_IF_V_SIZE_D(D, 64)>
+HWY_INLINE VFromD<D> ReduceAcrossBlocks(D d, Func f, VFromD<D> v) {
+  v = f(v, SwapAdjacentBlocks(v));
+  return f(v, ReverseBlocks(d, v));
 }
 
-// Returns the minimum in each lane.
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_epi32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_epi64(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_epu32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_epu64(v.raw));
-}
-#if HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_ph(v.raw));
-}
-#endif  // HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_ps(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_min_pd(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(d32, Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
-HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(d32, Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
+}  // namespace detail
 
-// Returns the maximum in each lane.
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_epi32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_epi64(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_epu32(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_epu64(v.raw));
-}
-#if HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_ph(v.raw));
-}
-#endif  // HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_ps(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  return Set(d, _mm512_reduce_max_pd(v.raw));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(d32, Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
-HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(d32, Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+// ------------------------------ BitShuffle
+#if HWY_TARGET <= HWY_AVX3_DL
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          HWY_IF_V_SIZE_V(V, 64), HWY_IF_V_SIZE_V(VI, 64)>
+HWY_API V BitShuffle(V v, VI idx) {
+  const DFromV<decltype(v)> d64;
+  const RebindToUnsigned<decltype(d64)> du64;
+  const Rebind<uint8_t, decltype(d64)> du8;
+
+  const __mmask64 mmask64_bit_shuf_result =
+      _mm512_bitshuffle_epi64_mask(v.raw, idx.raw);
+
+#if HWY_ARCH_X86_64
+  const VFromD<decltype(du8)> vu8_bit_shuf_result{
+      _mm_cvtsi64_si128(static_cast<int64_t>(mmask64_bit_shuf_result))};
+#else
+  const int32_t i32_lo_bit_shuf_result =
+      static_cast<int32_t>(mmask64_bit_shuf_result);
+  const int32_t i32_hi_bit_shuf_result =
+      static_cast<int32_t>(_kshiftri_mask64(mmask64_bit_shuf_result, 32));
+
+  const VFromD<decltype(du8)> vu8_bit_shuf_result = ResizeBitCast(
+      du8, InterleaveLower(
+               Vec128<uint32_t>{_mm_cvtsi32_si128(i32_lo_bit_shuf_result)},
+               Vec128<uint32_t>{_mm_cvtsi32_si128(i32_hi_bit_shuf_result)}));
+#endif
+
+  return BitCast(d64, PromoteTo(du64, vu8_bit_shuf_result));
 }
+#endif  // HWY_TARGET <= HWY_AVX3_DL
 
 // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
 
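The `BitShuffle` added in the last hunk is built on `_mm512_bitshuffle_epi64_mask` (`VPSHUFBITQMB`): for each of the eight index bytes belonging to a 64-bit lane, the low 6 bits select one bit of that lane, and all 64 selected bits come back as a `__mmask64`. The code above then moves that mask into a vector and widens each mask byte back to a u64 lane with `PromoteTo`. A scalar model of one lane, assuming those per-byte bit-select semantics (helper name invented):

```cpp
#include <cstdint>

// One u64 lane of BitShuffle: bit j of the 8-bit result is the bit of v
// selected by the low 6 bits of index byte j; the byte is zero-extended.
uint64_t BitShuffleLaneModel(uint64_t v, const uint8_t idx[8]) {
  uint64_t r = 0;
  for (int j = 0; j < 8; ++j) {
    r |= ((v >> (idx[j] & 63)) & 1u) << j;
  }
  return r;  // 0..255, matching PromoteTo(du64, vu8) in the vector code
}
```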