@img/sharp-libvips-dev 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +2 -2
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -30
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +218 -6
- package/include/hwy/base.h +1935 -512
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +105 -10
- package/include/hwy/detect_targets.h +146 -37
- package/include/hwy/foreach_target.h +36 -1
- package/include/hwy/highway.h +222 -50
- package/include/hwy/ops/arm_neon-inl.h +2055 -894
- package/include/hwy/ops/arm_sve-inl.h +1476 -348
- package/include/hwy/ops/emu128-inl.h +711 -623
- package/include/hwy/ops/generic_ops-inl.h +4431 -2157
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
- package/include/hwy/ops/rvv-inl.h +1556 -536
- package/include/hwy/ops/scalar-inl.h +353 -233
- package/include/hwy/ops/set_macros-inl.h +171 -23
- package/include/hwy/ops/shared-inl.h +198 -56
- package/include/hwy/ops/wasm_128-inl.h +283 -244
- package/include/hwy/ops/x86_128-inl.h +3673 -1357
- package/include/hwy/ops/x86_256-inl.h +1737 -663
- package/include/hwy/ops/x86_512-inl.h +1697 -500
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +40 -32
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +16 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +8 -3
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +73 -58
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -5
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +2 -2
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +8 -3
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +1 -11
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +18 -19
@@ -101,6 +101,9 @@ class Vec256 {
 HWY_INLINE Vec256& operator-=(const Vec256 other) {
 return *this = (*this - other);
 }
+HWY_INLINE Vec256& operator%=(const Vec256 other) {
+return *this = (*this % other);
+}
 HWY_INLINE Vec256& operator&=(const Vec256 other) {
 return *this = (*this & other);
 }
@@ -191,6 +194,25 @@ HWY_INLINE __m256i BitCastToInteger(__m256d v) {
 return _mm256_castpd_si256(v);
 }

+#if HWY_AVX3_HAVE_F32_TO_BF16C
+HWY_INLINE __m256i BitCastToInteger(__m256bh v) {
+// Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
+// bit cast a __m256bh to a __m256i as there is currently no intrinsic
+// available (as of GCC 13 and Clang 17) that can bit cast a __m256bh vector
+// to a __m256i vector
+
+#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+// On GCC or Clang, use reinterpret_cast to bit cast a __m256bh to a __m256i
+return reinterpret_cast<__m256i>(v);
+#else
+// On MSVC, use BitCastScalar to bit cast a __m256bh to a __m256i as MSVC does
+// not allow reinterpret_cast, static_cast, or a C-style cast to be used to
+// bit cast from one AVX vector type to a different AVX vector type
+return BitCastScalar<__m256i>(v);
+#endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+}
+#endif // HWY_AVX3_HAVE_F32_TO_BF16C
+
 template <typename T>
 HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
 return Vec256<uint8_t>{BitCastToInteger(v.raw)};
@@ -359,6 +381,85 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
 ResizeBitCast(Full128<uint8_t>(), v).raw)});
 }

+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+TFromD<D> t11, TFromD<D> t12,
+TFromD<D> t13, TFromD<D> t14,
+TFromD<D> t15) {
+return VFromD<D>{_mm256_setr_epi8(
+static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
+static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
+static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
+static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
+static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
+static_cast<char>(t15), static_cast<char>(t0), static_cast<char>(t1),
+static_cast<char>(t2), static_cast<char>(t3), static_cast<char>(t4),
+static_cast<char>(t5), static_cast<char>(t6), static_cast<char>(t7),
+static_cast<char>(t8), static_cast<char>(t9), static_cast<char>(t10),
+static_cast<char>(t11), static_cast<char>(t12), static_cast<char>(t13),
+static_cast<char>(t14), static_cast<char>(t15))};
+}
+
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+TFromD<D> t5, TFromD<D> t6,
+TFromD<D> t7) {
+return VFromD<D>{
+_mm256_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+static_cast<int16_t>(t6), static_cast<int16_t>(t7),
+static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
+}
+
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+TFromD<D> t5, TFromD<D> t6,
+TFromD<D> t7) {
+return VFromD<D>{_mm256_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
+t3, t4, t5, t6, t7)};
+}
+#endif
+
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+TFromD<D> t2, TFromD<D> t3) {
+return VFromD<D>{
+_mm256_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+TFromD<D> t2, TFromD<D> t3) {
+return VFromD<D>{_mm256_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3)};
+}
+
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+return VFromD<D>{
+_mm256_setr_epi64x(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
+}
+
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+return VFromD<D>{_mm256_setr_pd(t0, t1, t0, t1)};
+}
+
 // ================================================== LOGICAL

 // ------------------------------ And
@@ -367,7 +468,8 @@ template <typename T>
 HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
 const DFromV<decltype(a)> d; // for float16_t
 const RebindToUnsigned<decltype(d)> du;
-return BitCast(d, VFromD<decltype(du)>{_mm256_and_si256(a.raw,
+return BitCast(d, VFromD<decltype(du)>{_mm256_and_si256(BitCast(du, a).raw,
+BitCast(du, b).raw)});
 }

 HWY_API Vec256<float> And(Vec256<float> a, Vec256<float> b) {
@@ -384,8 +486,8 @@ template <typename T>
 HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
 const DFromV<decltype(mask)> d; // for float16_t
 const RebindToUnsigned<decltype(d)> du;
-return BitCast(
-
+return BitCast(d, VFromD<decltype(du)>{_mm256_andnot_si256(
+BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
 }
 HWY_API Vec256<float> AndNot(Vec256<float> not_mask, Vec256<float> mask) {
 return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
@@ -400,7 +502,8 @@ template <typename T>
 HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
 const DFromV<decltype(a)> d; // for float16_t
 const RebindToUnsigned<decltype(d)> du;
-return BitCast(d, VFromD<decltype(du)>{_mm256_or_si256(a.raw,
+return BitCast(d, VFromD<decltype(du)>{_mm256_or_si256(BitCast(du, a).raw,
+BitCast(du, b).raw)});
 }

 HWY_API Vec256<float> Or(Vec256<float> a, Vec256<float> b) {
@@ -416,7 +519,8 @@ template <typename T>
 HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
 const DFromV<decltype(a)> d; // for float16_t
 const RebindToUnsigned<decltype(d)> du;
-return BitCast(d, VFromD<decltype(du)>{_mm256_xor_si256(a.raw,
+return BitCast(d, VFromD<decltype(du)>{_mm256_xor_si256(BitCast(du, a).raw,
+BitCast(du, b).raw)});
 }

 HWY_API Vec256<float> Xor(Vec256<float> a, Vec256<float> b) {
@@ -431,7 +535,7 @@ template <typename T>
 HWY_API Vec256<T> Not(const Vec256<T> v) {
 const DFromV<decltype(v)> d;
 using TU = MakeUnsigned<T>;
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 const __m256i vu = BitCast(RebindToUnsigned<decltype(d)>(), v).raw;
 return BitCast(d, Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
 #else
@@ -442,7 +546,7 @@ HWY_API Vec256<T> Not(const Vec256<T> v) {
 // ------------------------------ Xor3
 template <typename T>
 HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 const DFromV<decltype(x1)> d;
 const RebindToUnsigned<decltype(d)> du;
 using VU = VFromD<decltype(du)>;
@@ -457,7 +561,7 @@ HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
 // ------------------------------ Or3
 template <typename T>
 HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 const DFromV<decltype(o1)> d;
 const RebindToUnsigned<decltype(d)> du;
 using VU = VFromD<decltype(du)>;
@@ -472,7 +576,7 @@ HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
 // ------------------------------ OrAnd
 template <typename T>
 HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 const DFromV<decltype(o)> d;
 const RebindToUnsigned<decltype(d)> du;
 using VU = VFromD<decltype(du)>;
@@ -487,7 +591,7 @@ HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
 // ------------------------------ IfVecThenElse
 template <typename T>
 HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 const DFromV<decltype(yes)> d;
 const RebindToUnsigned<decltype(d)> du;
 using VU = VFromD<decltype(du)>;
@@ -589,7 +693,7 @@ HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,

 } // namespace detail

-template <typename T>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
 }
@@ -634,7 +738,7 @@ HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256<T> mask,

 } // namespace detail

-template <typename T,
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
 }
@@ -672,7 +776,7 @@ HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,

 } // namespace detail

-template <typename T,
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
 }
@@ -683,13 +787,6 @@ HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
 return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
 }

-template <typename T>
-HWY_API Vec256<T> ZeroIfNegative(const Vec256<T> v) {
-static_assert(IsSigned<T>(), "Only for float");
-// AVX3 MaskFromVec only looks at the MSB
-return IfThenZeroElse(MaskFromVec(v), v);
-}
-
 // ------------------------------ Mask logical

 namespace detail {
@@ -879,6 +976,58 @@ HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
 #endif
 }

+// UnmaskedNot returns ~m.raw without zeroing out any invalid bits
+template <typename T, HWY_IF_T_SIZE(T, 1)>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+return Mask256<T>{static_cast<__mmask32>(_knot_mask32(m.raw))};
+#else
+return Mask256<T>{static_cast<__mmask32>(~m.raw)};
+#endif
+}
+
+template <typename T, HWY_IF_T_SIZE(T, 2)>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+return Mask256<T>{static_cast<__mmask16>(_knot_mask16(m.raw))};
+#else
+return Mask256<T>{static_cast<__mmask16>(~m.raw)};
+#endif
+}
+
+template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+return Mask256<T>{static_cast<__mmask8>(_knot_mask8(m.raw))};
+#else
+return Mask256<T>{static_cast<__mmask8>(~m.raw)};
+#endif
+}
+
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<1> /*tag*/, const Mask256<T> m) {
+// sizeof(T) == 1: simply return ~m as all 32 bits of m are valid
+return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<2> /*tag*/, const Mask256<T> m) {
+// sizeof(T) == 2: simply return ~m as all 16 bits of m are valid
+return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<4> /*tag*/, const Mask256<T> m) {
+// sizeof(T) == 4: simply return ~m as all 8 bits of m are valid
+return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<8> /*tag*/, const Mask256<T> m) {
+// sizeof(T) == 8: need to zero out the upper 4 bits of ~m as only the lower
+// 4 bits of m are valid
+
+// Return (~m) & 0x0F
+return AndNot(hwy::SizeTag<8>(), m, Mask256<T>::FromBits(uint64_t{0x0F}));
+}
+
 } // namespace detail

 template <typename T>
@@ -904,8 +1053,7 @@ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
 template <typename T>
 HWY_API Mask256<T> Not(const Mask256<T> m) {
 // Flip only the valid bits.
-
-return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
+return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
 }

 template <typename T>
@@ -913,6 +1061,53 @@ HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
 return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
 }

+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+const __mmask32 combined_mask = _mm512_kunpackw(
+static_cast<__mmask32>(hi.raw), static_cast<__mmask32>(lo.raw));
+#else
+const auto combined_mask =
+((static_cast<uint32_t>(hi.raw) << 16) | (lo.raw & 0xFFFFu));
+#endif
+
+return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 16)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+const auto shifted_mask = _kshiftri_mask32(static_cast<__mmask32>(m.raw), 16);
+#else
+const auto shifted_mask = static_cast<uint32_t>(m.raw) >> 16;
+#endif
+
+return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
+using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+return MFromD<D>{
+static_cast<RawM>(_kshiftli_mask32(static_cast<__mmask32>(m.raw), 1))};
+#else
+return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) << 1)};
+#endif
+}
+
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
+using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+return MFromD<D>{
+static_cast<RawM>(_kshiftri_mask32(static_cast<__mmask32>(m.raw), 1))};
+#else
+return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) >> 1)};
+#endif
+}
+
 #else // AVX2

 // ------------------------------ Mask
@@ -1072,7 +1267,11 @@ HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator==(Vec256<float16_t> a,
 Vec256<float16_t> b) {
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
 return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+HWY_DIAGNOSTICS(pop)
 }
 #endif // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
@@ -1105,7 +1304,11 @@ HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator!=(Vec256<float16_t> a,
 Vec256<float16_t> b) {
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
 return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+HWY_DIAGNOSTICS(pop)
 }
 #endif // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
@@ -1146,7 +1349,11 @@ HWY_API Mask256<uint64_t> operator>(Vec256<uint64_t> a, Vec256<uint64_t> b) {

 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator>(Vec256<float16_t> a, Vec256<float16_t> b) {
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
 return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+HWY_DIAGNOSTICS(pop)
 }
 #endif // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
@@ -1161,7 +1368,11 @@ HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator>=(Vec256<float16_t> a,
 Vec256<float16_t> b) {
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
 return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+HWY_DIAGNOSTICS(pop)
 }
 #endif // HWY_HAVE_FLOAT16

@@ -1617,7 +1828,7 @@ HWY_INLINE VFromD<D> Iota0(D /*d*/) {

 template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
 HWY_API VFromD<D> Iota(D d, const T2 first) {
-return detail::Iota0(d) + Set(d,
+return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
 }

 // ------------------------------ FirstN (Iota, Lt)
@@ -1732,6 +1943,15 @@ HWY_API Vec256<double> operator-(Vec256<double> a, Vec256<double> b) {
 return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
 }

+// ------------------------------ AddSub
+
+HWY_API Vec256<float> AddSub(Vec256<float> a, Vec256<float> b) {
+return Vec256<float>{_mm256_addsub_ps(a.raw, b.raw)};
+}
+HWY_API Vec256<double> AddSub(Vec256<double> a, Vec256<double> b) {
+return Vec256<double>{_mm256_addsub_pd(a.raw, b.raw)};
+}
+
 // ------------------------------ SumsOf8
 HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
 return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
@@ -1741,6 +1961,56 @@ HWY_API Vec256<uint64_t> SumsOf8AbsDiff(Vec256<uint8_t> a, Vec256<uint8_t> b) {
 return Vec256<uint64_t>{_mm256_sad_epu8(a.raw, b.raw)};
 }

+// ------------------------------ SumsOf4
+#if HWY_TARGET <= HWY_AVX3
+namespace detail {
+
+HWY_INLINE Vec256<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
+hwy::SizeTag<1> /*lane_size_tag*/,
+Vec256<uint8_t> v) {
+const DFromV<decltype(v)> d;
+
+// _mm256_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
+// zeroed out and the sums of the 4 consecutive lanes are already in the
+// even uint16_t lanes of the _mm256_maskz_dbsad_epu8 result.
+return Vec256<uint32_t>{_mm256_maskz_dbsad_epu8(
+static_cast<__mmask16>(0x5555), v.raw, Zero(d).raw, 0)};
+}
+
+// detail::SumsOf4 for Vec256<int8_t> on AVX3 is implemented in x86_512-inl.h
+
+} // namespace detail
+#endif // HWY_TARGET <= HWY_AVX3
+
+// ------------------------------ SumsOfAdjQuadAbsDiff
+
+template <int kAOffset, int kBOffset>
+static Vec256<uint16_t> SumsOfAdjQuadAbsDiff(Vec256<uint8_t> a,
+Vec256<uint8_t> b) {
+static_assert(0 <= kAOffset && kAOffset <= 1,
+"kAOffset must be between 0 and 1");
+static_assert(0 <= kBOffset && kBOffset <= 3,
+"kBOffset must be between 0 and 3");
+return Vec256<uint16_t>{_mm256_mpsadbw_epu8(
+a.raw, b.raw,
+(kAOffset << 5) | (kBOffset << 3) | (kAOffset << 2) | kBOffset)};
+}
+
+// ------------------------------ SumsOfShuffledQuadAbsDiff
+
+#if HWY_TARGET <= HWY_AVX3
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
+static Vec256<uint16_t> SumsOfShuffledQuadAbsDiff(Vec256<uint8_t> a,
+Vec256<uint8_t> b) {
+static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+return Vec256<uint16_t>{
+_mm256_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
+}
+#endif
+
 // ------------------------------ SaturatedAdd

 // Returns a + b clamped to the destination range.
@@ -1761,7 +2031,7 @@ HWY_API Vec256<int16_t> SaturatedAdd(Vec256<int16_t> a, Vec256<int16_t> b) {
 return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
 }

-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 HWY_API Vec256<int32_t> SaturatedAdd(Vec256<int32_t> a, Vec256<int32_t> b) {
 const DFromV<decltype(a)> d;
 const auto sum = a + b;
@@ -1783,7 +2053,7 @@ HWY_API Vec256<int64_t> SaturatedAdd(Vec256<int64_t> a, Vec256<int64_t> b) {
 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
 return IfThenElse(overflow_mask, overflow_result, sum);
 }
-#endif // HWY_TARGET <= HWY_AVX3
+#endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN

 // ------------------------------ SaturatedSub

@@ -1805,7 +2075,7 @@ HWY_API Vec256<int16_t> SaturatedSub(Vec256<int16_t> a, Vec256<int16_t> b) {
 return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
 }

-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 HWY_API Vec256<int32_t> SaturatedSub(Vec256<int32_t> a, Vec256<int32_t> b) {
 const DFromV<decltype(a)> d;
 const auto diff = a - b;
@@ -1827,7 +2097,7 @@ HWY_API Vec256<int64_t> SaturatedSub(Vec256<int64_t> a, Vec256<int64_t> b) {
 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
 return IfThenElse(overflow_mask, overflow_result, diff);
 }
-#endif // HWY_TARGET <= HWY_AVX3
+#endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN

 // ------------------------------ Average

@@ -1860,15 +2130,12 @@ HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
 HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
 return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
 }
-// i64 is implemented after BroadcastSignBit.

-
-HWY_API Vec256<
-
-const RebindToSigned<decltype(d)> di;
-using TI = TFromD<decltype(di)>;
-return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
+#if HWY_TARGET <= HWY_AVX3
+HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
+return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
 }
+#endif

 // ------------------------------ Integer multiplication

@@ -2016,14 +2283,29 @@ HWY_API Vec256<int8_t> ShiftRight(Vec256<int8_t> v) {

 // ------------------------------ RotateRight

-
-
-
-
+// U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8
+// RotateRight uses detail::GaloisAffine on AVX3_DL
+
+#if HWY_TARGET > HWY_AVX3_DL
+template <int kBits>
+HWY_API Vec256<uint8_t> RotateRight(const Vec256<uint8_t> v) {
+static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+if (kBits == 0) return v;
+// AVX3 does not support 8-bit.
+return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
+}
+#endif
+
+template <int kBits>
+HWY_API Vec256<uint16_t> RotateRight(const Vec256<uint16_t> v) {
+static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
 if (kBits == 0) return v;
-
-return
-
+#if HWY_TARGET <= HWY_AVX3_DL
+return Vec256<uint16_t>{_mm256_shrdi_epi16(v.raw, v.raw, kBits)};
+#else
+// AVX3 does not support 16-bit.
+return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
+#endif
 }

 template <int kBits>
@@ -2048,6 +2330,38 @@ HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) {
 #endif
 }

+// ------------------------------ Rol/Ror
+#if HWY_TARGET <= HWY_AVX3_DL
+template <class T, HWY_IF_UI16(T)>
+HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
+return Vec256<T>{_mm256_shrdv_epi16(a.raw, a.raw, b.raw)};
+}
+#endif // HWY_TARGET <= HWY_AVX3_DL
+
+#if HWY_TARGET <= HWY_AVX3
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
+return Vec256<T>{_mm256_rolv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
+return Vec256<T>{_mm256_rorv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
+return Vec256<T>{_mm256_rolv_epi64(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
+return Vec256<T>{_mm256_rorv_epi64(a.raw, b.raw)};
+}
+
+#endif
+
 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)

 HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
@@ -2086,16 +2400,6 @@ HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
 #endif
 }

-HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
-#else
-const DFromV<decltype(v)> d;
-const auto zero = Zero(d);
-return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
-#endif
-}
-
 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
 HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
 Vec256<int8_t> no) {
@@ -2136,6 +2440,23 @@ HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
 #endif
 }

+// ------------------------------ IfNegativeThenNegOrUndefIfZero
+
+HWY_API Vec256<int8_t> IfNegativeThenNegOrUndefIfZero(Vec256<int8_t> mask,
+Vec256<int8_t> v) {
+return Vec256<int8_t>{_mm256_sign_epi8(v.raw, mask.raw)};
+}
+
+HWY_API Vec256<int16_t> IfNegativeThenNegOrUndefIfZero(Vec256<int16_t> mask,
+Vec256<int16_t> v) {
+return Vec256<int16_t>{_mm256_sign_epi16(v.raw, mask.raw)};
+}
+
+HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
+Vec256<int32_t> v) {
+return Vec256<int32_t>{_mm256_sign_epi32(v.raw, mask.raw)};
+}
+
 // ------------------------------ ShiftLeftSame

 HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
@@ -2359,103 +2680,448 @@ HWY_API Vec256<double> ApproximateReciprocal(Vec256<double> v) {
 }
 #endif

-// ------------------------------
+// ------------------------------ MaskedMinOr

-#if
+#if HWY_TARGET <= HWY_AVX3

-
-
-
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
 }

-
-
-
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
 }

-
-
-
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
 }

-
-
-
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
 }

+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
+}
 #endif // HWY_HAVE_FLOAT16

-
-
-
-
-
-return Vec256<
-#endif
+// ------------------------------ MaskedMaxOr
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
 }
-
-
-
-return
-#else
-return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
-#endif
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
 }

-
-
-
-return
-#else
-return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
-#endif
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
 }
-
-
-
-return
-#else
-return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
-#endif
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
 }

-
-
-
-return
-#else
-return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
-#endif
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
 }
-
-
-
-return
-#else
-return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
-#endif
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
 }

-
-
-
-return
-#else
-return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
-#endif
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
 }
-
-
-
-return
-#else
-return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
-#endif
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
 }

-
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
+}

-
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedAddOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSubOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMulOr
+
+HWY_API Vec256<float> MaskedMulOr(Vec256<float> no, Mask256<float> m,
+Vec256<float> a, Vec256<float> b) {
+return Vec256<float>{_mm256_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec256<double> MaskedMulOr(Vec256<double> no, Mask256<double> m,
+Vec256<double> a, Vec256<double> b) {
+return Vec256<double>{_mm256_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MaskedMulOr(Vec256<float16_t> no,
+Mask256<float16_t> m, Vec256<float16_t> a,
+Vec256<float16_t> b) {
+return Vec256<float16_t>{_mm256_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedDivOr
+
+HWY_API Vec256<float> MaskedDivOr(Vec256<float> no, Mask256<float> m,
+Vec256<float> a, Vec256<float> b) {
+return Vec256<float>{_mm256_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec256<double> MaskedDivOr(Vec256<double> no, Mask256<double> m,
+Vec256<double> a, Vec256<double> b) {
+return Vec256<double>{_mm256_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MaskedDivOr(Vec256<float16_t> no,
+Mask256<float16_t> m, Vec256<float16_t> a,
+Vec256<float16_t> b) {
+return Vec256<float16_t>{_mm256_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSatAddOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+// ------------------------------ MaskedSatSubOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+Vec256<T> b) {
+return Vec256<T>{_mm256_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#endif // HWY_TARGET <= HWY_AVX3
+
+// ------------------------------ Floating-point multiply-add variants
+
+#if HWY_HAVE_FLOAT16
+
+HWY_API Vec256<float16_t> MulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
+Vec256<float16_t> add) {
+return Vec256<float16_t>{_mm256_fmadd_ph(mul.raw, x.raw, add.raw)};
+}
+
+HWY_API Vec256<float16_t> NegMulAdd(Vec256<float16_t> mul, Vec256<float16_t> x,
+Vec256<float16_t> add) {
+return Vec256<float16_t>{_mm256_fnmadd_ph(mul.raw, x.raw, add.raw)};
+}
+
+HWY_API Vec256<float16_t> MulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
+Vec256<float16_t> sub) {
+return Vec256<float16_t>{_mm256_fmsub_ph(mul.raw, x.raw, sub.raw)};
+}
+
+HWY_API Vec256<float16_t> NegMulSub(Vec256<float16_t> mul, Vec256<float16_t> x,
+Vec256<float16_t> sub) {
+return Vec256<float16_t>{_mm256_fnmsub_ph(mul.raw, x.raw, sub.raw)};
+}
+
+#endif // HWY_HAVE_FLOAT16
+
+HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x,
+Vec256<float> add) {
+#ifdef HWY_DISABLE_BMI2_FMA
+return mul * x + add;
+#else
+return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
+#endif
+}
+HWY_API Vec256<double> MulAdd(Vec256<double> mul, Vec256<double> x,
+Vec256<double> add) {
+#ifdef HWY_DISABLE_BMI2_FMA
+return mul * x + add;
+#else
+return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
+#endif
+}
+
+HWY_API Vec256<float> NegMulAdd(Vec256<float> mul, Vec256<float> x,
+Vec256<float> add) {
+#ifdef HWY_DISABLE_BMI2_FMA
+return add - mul * x;
+#else
+return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
|
|
3052
|
+
#endif
|
|
3053
|
+
}
|
|
3054
|
+
HWY_API Vec256<double> NegMulAdd(Vec256<double> mul, Vec256<double> x,
|
|
3055
|
+
Vec256<double> add) {
|
|
3056
|
+
#ifdef HWY_DISABLE_BMI2_FMA
|
|
3057
|
+
return add - mul * x;
|
|
3058
|
+
#else
|
|
3059
|
+
return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
|
|
3060
|
+
#endif
|
|
3061
|
+
}
|
|
3062
|
+
|
|
3063
|
+
HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x,
|
|
3064
|
+
Vec256<float> sub) {
|
|
3065
|
+
#ifdef HWY_DISABLE_BMI2_FMA
|
|
3066
|
+
return mul * x - sub;
|
|
3067
|
+
#else
|
|
3068
|
+
return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
|
|
3069
|
+
#endif
|
|
3070
|
+
}
|
|
3071
|
+
HWY_API Vec256<double> MulSub(Vec256<double> mul, Vec256<double> x,
|
|
3072
|
+
Vec256<double> sub) {
|
|
3073
|
+
#ifdef HWY_DISABLE_BMI2_FMA
|
|
3074
|
+
return mul * x - sub;
|
|
3075
|
+
#else
|
|
3076
|
+
return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
|
|
3077
|
+
#endif
|
|
3078
|
+
}
|
|
3079
|
+
|
|
3080
|
+
HWY_API Vec256<float> NegMulSub(Vec256<float> mul, Vec256<float> x,
|
|
3081
|
+
Vec256<float> sub) {
|
|
3082
|
+
#ifdef HWY_DISABLE_BMI2_FMA
|
|
3083
|
+
return Neg(mul * x) - sub;
|
|
3084
|
+
#else
|
|
3085
|
+
return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
|
|
3086
|
+
#endif
|
|
3087
|
+
}
|
|
3088
|
+
HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x,
|
|
3089
|
+
Vec256<double> sub) {
|
|
3090
|
+
#ifdef HWY_DISABLE_BMI2_FMA
|
|
3091
|
+
return Neg(mul * x) - sub;
|
|
3092
|
+
#else
|
|
3093
|
+
return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
|
|
3094
|
+
#endif
|
|
3095
|
+
}
|
|
3096
|
+
|
|
3097
|
+
#if HWY_HAVE_FLOAT16
|
|
3098
|
+
HWY_API Vec256<float16_t> MulAddSub(Vec256<float16_t> mul, Vec256<float16_t> x,
|
|
3099
|
+
Vec256<float16_t> sub_or_add) {
|
|
3100
|
+
return Vec256<float16_t>{_mm256_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
|
|
3101
|
+
}
|
|
3102
|
+
#endif // HWY_HAVE_FLOAT16
|
|
3103
|
+
|
|
3104
|
+
HWY_API Vec256<float> MulAddSub(Vec256<float> mul, Vec256<float> x,
|
|
3105
|
+
Vec256<float> sub_or_add) {
|
|
3106
|
+
#ifdef HWY_DISABLE_BMI2_FMA
|
|
3107
|
+
return AddSub(mul * x, sub_or_add);
|
|
3108
|
+
#else
|
|
3109
|
+
return Vec256<float>{_mm256_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
|
|
3110
|
+
#endif
|
|
3111
|
+
}
|
|
3112
|
+
|
|
3113
|
+
HWY_API Vec256<double> MulAddSub(Vec256<double> mul, Vec256<double> x,
|
|
3114
|
+
Vec256<double> sub_or_add) {
|
|
3115
|
+
#ifdef HWY_DISABLE_BMI2_FMA
|
|
3116
|
+
return AddSub(mul * x, sub_or_add);
|
|
3117
|
+
#else
|
|
3118
|
+
return Vec256<double>{_mm256_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
|
|
3119
|
+
#endif
|
|
3120
|
+
}
|
|
3121
|
+
|
|
3122
|
+
// ------------------------------ Floating-point square root
|
|
3123
|
+
|
|
3124
|
+
// Full precision square root
|
|
2459
3125
|
#if HWY_HAVE_FLOAT16
|
|
2460
3126
|
HWY_API Vec256<float16_t> Sqrt(Vec256<float16_t> v) {
|
|
2461
3127
|
return Vec256<float16_t>{_mm256_sqrt_ph(v.raw)};
|
|
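The hunk above adds masked arithmetic (MaskedSubOr, MaskedMulOr, MaskedDivOr, MaskedSatAddOr/MaskedSatSubOr) plus the FMA variants. As a rough illustration of how the masked forms behave when called through Highway's portable layer, the sketch below uses MaskedSubOr; it is not part of the packaged header, and it assumes the usual hwy/highway.h entry points (ScalableTag, FirstN, LoadU, StoreU).

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Subtracts b from a in the first `n` lanes; all remaining lanes keep a.
// Assumes in_a, in_b and out each hold at least one full vector of floats.
void SubFirstN(const float* in_a, const float* in_b, float* out, size_t n) {
  const hn::ScalableTag<float> d;
  const auto a = hn::LoadU(d, in_a);
  const auto b = hn::LoadU(d, in_b);
  // MaskedSubOr(no, m, a, b): a - b where m is set, otherwise `no` (here: a).
  hn::StoreU(hn::MaskedSubOr(a, hn::FirstN(d, n), a, b), d, out);
}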
@@ -2565,6 +3231,15 @@ HWY_API Mask256<float16_t> IsNaN(Vec256<float16_t> v) {
 v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
 }

+HWY_API Mask256<float16_t> IsEitherNaN(Vec256<float16_t> a,
+Vec256<float16_t> b) {
+// Work around warnings in the intrinsic definitions (passing -1 as a mask).
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+HWY_DIAGNOSTICS(pop)
+}
+
 HWY_API Mask256<float16_t> IsInf(Vec256<float16_t> v) {
 return Mask256<float16_t>{_mm256_fpclass_ph_mask(
 v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
@@ -2597,6 +3272,22 @@ HWY_API Mask256<double> IsNaN(Vec256<double> v) {
 #endif
 }

+HWY_API Mask256<float> IsEitherNaN(Vec256<float> a, Vec256<float> b) {
+#if HWY_TARGET <= HWY_AVX3
+return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+#else
+return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_UNORD_Q)};
+#endif
+}
+
+HWY_API Mask256<double> IsEitherNaN(Vec256<double> a, Vec256<double> b) {
+#if HWY_TARGET <= HWY_AVX3
+return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+#else
+return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_UNORD_Q)};
+#endif
+}
+
 #if HWY_TARGET <= HWY_AVX3

 HWY_API Mask256<float> IsInf(Vec256<float> v) {
@@ -2621,35 +3312,6 @@ HWY_API Mask256<double> IsFinite(Vec256<double> v) {
 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
 }

-#else
-
-template <typename T>
-HWY_API Mask256<T> IsInf(const Vec256<T> v) {
-static_assert(IsFloat<T>(), "Only for float");
-const DFromV<decltype(v)> d;
-const RebindToSigned<decltype(d)> di;
-const VFromD<decltype(di)> vi = BitCast(di, v);
-// 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T>
-HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
-static_assert(IsFloat<T>(), "Only for float");
-const DFromV<decltype(v)> d;
-const RebindToUnsigned<decltype(d)> du;
-const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-const VFromD<decltype(du)> vu = BitCast(du, v);
-// Shift left to clear the sign bit, then right so we can compare with the
-// max exponent (cannot compare with MaxExponentTimes2 directly because it is
-// negative and non-negative floats would be greater). MSVC seems to generate
-// incorrect code if we instead add vu + vu.
-const VFromD<decltype(di)> exp =
-BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
-return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
 #endif // HWY_TARGET <= HWY_AVX3

 // ================================================== MEMORY
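IsEitherNaN, added above, returns a mask of lanes where either operand is NaN, with an AVX3 mask-register path and an AVX2 fallback. A minimal sketch of the kind of guard it enables (illustrative only, not shipped in this header; Zero, Add and IfThenElse are the standard hwy/highway.h ops):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Adds two vectors and zeroes every lane where either input was NaN.
template <class D>
hn::VFromD<D> AddIgnoringNaN(D d, hn::VFromD<D> a, hn::VFromD<D> b) {
  return hn::IfThenElse(hn::IsEitherNaN(a, b), hn::Zero(d), hn::Add(a, b));
}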
@@ -2662,16 +3324,13 @@ HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
 _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
 }
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API Vec256<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API Vec256<float16_t> Load(D /* tag */,
+const float16_t* HWY_RESTRICT aligned) {
 return Vec256<float16_t>{_mm256_load_ph(aligned)};
-#else
-const RebindToUnsigned<decltype(d)> du;
-return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
-#endif // HWY_HAVE_FLOAT16
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
 return Vec256<float>{_mm256_load_ps(aligned)};
@@ -2686,16 +3345,12 @@ HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
 return VFromD<D>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
 }
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API Vec256<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API Vec256<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
 return Vec256<float16_t>{_mm256_loadu_ph(p)};
-#else
-const RebindToUnsigned<decltype(d)> du;
-return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
-#endif // HWY_HAVE_FLOAT16
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
 return Vec256<float>{_mm256_loadu_ps(p)};
@@ -2756,8 +3411,8 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
 const TFromD<D>* HWY_RESTRICT p) {
 const RebindToUnsigned<decltype(d)> du;  // for float16_t
-return BitCast(
-
+return BitCast(d, VFromD<decltype(du)>{
+_mm256_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
@@ -2831,22 +3486,24 @@ HWY_API Vec256<double> MaskedLoad(Mask256<double> m, D d,
 // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> LoadDup128(D
+HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
+const RebindToUnsigned<decltype(d)> du;
 const Full128<TFromD<D>> d128;
+const RebindToUnsigned<decltype(d128)> du128;
+const __m128i v128 = BitCast(du128, LoadU(d128, p)).raw;
 #if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
 // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
 // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
 // upper half undefined) is fine because we're overwriting that anyway.
 // This workaround seems in turn to generate incorrect code in MSVC 2022
 // (19.31), so use broadcastsi128 there.
-
-
-_mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
+return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+_mm256_castsi128_si256(v128), v128, 1)});
 #else
 // The preferred path. This is perhaps surprising, because vbroadcasti128
 // with xmm input has 7 cycle latency on Intel, but Clang >= 7 is able to
 // pattern-match this to vbroadcastf128 with a memory operand as desired.
-return VFromD<
+return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastsi128_si256(v128)});
 #endif
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
@@ -2879,16 +3536,13 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
 _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
 }
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API void Store(Vec256<float16_t> v, D d, float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API void Store(Vec256<float16_t> v, D /* tag */,
+float16_t* HWY_RESTRICT aligned) {
 _mm256_store_ph(aligned, v.raw);
-#else
-const RebindToUnsigned<decltype(d)> du;
-Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
-#endif // HWY_HAVE_FLOAT16
 }
+#endif // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API void Store(Vec256<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
 _mm256_store_ps(aligned, v.raw);
@@ -2903,16 +3557,13 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
 _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
 }
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API void StoreU(Vec256<float16_t> v, D d, float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API void StoreU(Vec256<float16_t> v, D /* tag */,
+float16_t* HWY_RESTRICT p) {
 _mm256_storeu_ph(p, v.raw);
-#else
-const RebindToUnsigned<decltype(d)> du;
-StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
-#endif // HWY_HAVE_FLOAT16
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API void StoreU(Vec256<float> v, D /* tag */, float* HWY_RESTRICT p) {
 _mm256_storeu_ps(p, v.raw);
@@ -3140,118 +3791,124 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,

 // ------------------------------ Gather

-
-HWY_INLINE VFromD<D> GatherOffset(D /* tag */,
-const TFromD<D>* HWY_RESTRICT base,
-Vec256<int32_t> offset) {
-return VFromD<D>{_mm256_i32gather_epi32(
-reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
-HWY_INLINE VFromD<D> GatherIndex(D /* tag */,
-const TFromD<D>* HWY_RESTRICT base,
-Vec256<int32_t> index) {
-return VFromD<D>{_mm256_i32gather_epi32(
-reinterpret_cast<const int32_t*>(base), index.raw, 4)};
-}
+namespace detail {

-template <
-HWY_INLINE
-
-
-
-reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
+template <int kScale, typename T, HWY_IF_UI32(T)>
+HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
+Vec256<int32_t> indices) {
+return Vec256<T>{_mm256_i32gather_epi32(
+reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
 }
-
-
-
-
-return
-reinterpret_cast<const GatherIndex64*>(base),
+
+template <int kScale, typename T, HWY_IF_UI64(T)>
+HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
+Vec256<int64_t> indices) {
+return Vec256<T>{_mm256_i64gather_epi64(
+reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
 }

-template <
-HWY_API Vec256<float>
-
-return Vec256<float>{_mm256_i32gather_ps(base,
+template <int kScale>
+HWY_API Vec256<float> NativeGather256(const float* HWY_RESTRICT base,
+Vec256<int32_t> indices) {
+return Vec256<float>{_mm256_i32gather_ps(base, indices.raw, kScale)};
 }
-
-
-
-
+
+template <int kScale>
+HWY_API Vec256<double> NativeGather256(const double* HWY_RESTRICT base,
+Vec256<int64_t> indices) {
+return Vec256<double>{_mm256_i64gather_pd(base, indices.raw, kScale)};
 }
-
-
-
-
-
+
+}  // namespace detail
+
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
+VFromD<RebindToSigned<D>> offsets) {
+return detail::NativeGather256<1>(base, offsets);
 }
-
-
-
-
+
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
+VFromD<RebindToSigned<D>> indices) {
+return detail::NativeGather256<sizeof(TFromD<D>)>(base, indices);
 }

-// ------------------------------
+// ------------------------------ MaskedGatherIndexOr

-
-
-
-
+namespace detail {
+
+template <int kScale, typename T, HWY_IF_UI32(T)>
+HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
+const T* HWY_RESTRICT base,
+Vec256<int32_t> indices) {
 #if HWY_TARGET <= HWY_AVX3
-return
-
-
+return Vec256<T>{_mm256_mmask_i32gather_epi32(
+no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
+kScale)};
 #else
-return
-
-
+return Vec256<T>{_mm256_mask_i32gather_epi32(
+no.raw, reinterpret_cast<const int32_t*>(base), indices.raw, m.raw,
+kScale)};
 #endif
 }

-template <
-HWY_INLINE
-
-
+template <int kScale, typename T, HWY_IF_UI64(T)>
+HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
+const T* HWY_RESTRICT base,
+Vec256<int64_t> indices) {
 #if HWY_TARGET <= HWY_AVX3
-return
-
-
+return Vec256<T>{_mm256_mmask_i64gather_epi64(
+no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
+kScale)};
 #else
 // For reasons unknown, _mm256_mask_i64gather_epi64 returns all-zeros.
-const
-
-
-
+const Full256<T> d;
+const Full256<double> dd;
+return BitCast(d,
+Vec256<double>{_mm256_mask_i64gather_pd(
+BitCast(dd, no).raw, reinterpret_cast<const double*>(base),
+indices.raw, RebindMask(dd, m).raw, kScale)});
 #endif
 }

-template <
-HWY_API Vec256<float>
-
-
+template <int kScale>
+HWY_API Vec256<float> NativeMaskedGatherOr256(Vec256<float> no,
+Mask256<float> m,
+const float* HWY_RESTRICT base,
+Vec256<int32_t> indices) {
 #if HWY_TARGET <= HWY_AVX3
 return Vec256<float>{
-_mm256_mmask_i32gather_ps(
+_mm256_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
 #else
 return Vec256<float>{
-_mm256_mask_i32gather_ps(
+_mm256_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
 #endif
 }

-template <
-HWY_API Vec256<double>
-
-
+template <int kScale>
+HWY_API Vec256<double> NativeMaskedGatherOr256(Vec256<double> no,
+Mask256<double> m,
+const double* HWY_RESTRICT base,
+Vec256<int64_t> indices) {
 #if HWY_TARGET <= HWY_AVX3
 return Vec256<double>{
-_mm256_mmask_i64gather_pd(
+_mm256_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
 #else
 return Vec256<double>{
-_mm256_mask_i64gather_pd(
+_mm256_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
 #endif
 }

+}  // namespace detail
+
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
+const TFromD<D>* HWY_RESTRICT base,
+VFromD<RebindToSigned<D>> indices) {
+return detail::NativeMaskedGatherOr256<sizeof(TFromD<D>)>(no, m, base,
+indices);
+}
+
 HWY_DIAGNOSTICS(pop)

 // ================================================== SWIZZLE
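The rewritten gather section above routes GatherOffset/GatherIndex and the new MaskedGatherIndexOr through detail::NativeGather256/NativeMaskedGatherOr256 with a compile-time scale. A small usage sketch of GatherIndex through the portable layer (illustrative only; ScalableTag, RebindToSigned, LoadU and StoreU are the standard hwy/highway.h helpers):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Gathers base[idx[i]] into one vector of floats.
// Assumes idx holds one full vector of int32 indices into `base`.
void GatherFloats(const float* base, const int32_t* idx, float* out) {
  const hn::ScalableTag<float> d;
  const hn::RebindToSigned<decltype(d)> di;  // lane-matched index type
  const auto indices = hn::LoadU(di, idx);
  hn::StoreU(hn::GatherIndex(d, base, indices), d, out);
}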
@@ -3294,7 +3951,7 @@ HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
 const RebindToUnsigned<decltype(d)> du;  // for float16_t
-const Twice<decltype(
+const Twice<decltype(du)> dut;
 return BitCast(d, VFromD<decltype(du)>{
 _mm256_extracti128_si256(BitCast(dut, v).raw, 1)});
 }
@@ -3375,22 +4032,16 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API VFromD<D> ZeroExtendVector(D /* tag */, VFromD<Half<D>> lo) {
 #if HWY_HAVE_ZEXT
 return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
+#elif HWY_COMPILER_MSVC
+// Workaround: _mm256_inserti128_si256 does not actually zero the hi part.
+return VFromD<D>{_mm256_set_m128i(_mm_setzero_si128(), lo.raw)};
 #else
 return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
 #endif
 }
-
-HWY_API Vec256<bfloat16_t> ZeroExtendVector(D d, Vec128<bfloat16_t> lo) {
-(void)d;
-#if HWY_HAVE_ZEXT
-return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
-#else
-return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
-#endif // HWY_HAVE_ZEXT
-}
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
 HWY_API Vec256<float16_t> ZeroExtendVector(D d, Vec128<float16_t> lo) {
-#if HWY_HAVE_FLOAT16
 #if HWY_HAVE_ZEXT
 (void)d;
 return Vec256<float16_t>{_mm256_zextph128_ph256(lo.raw)};
@@ -3398,15 +4049,8 @@ HWY_API Vec256<float16_t> ZeroExtendVector(D d, Vec128<float16_t> lo) {
 const RebindToUnsigned<D> du;
 return BitCast(d, ZeroExtendVector(du, BitCast(du, lo)));
 #endif // HWY_HAVE_ZEXT
-#else
-(void)d;
-#if HWY_HAVE_ZEXT
-return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
-#else
-return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
-#endif // HWY_HAVE_ZEXT
-#endif // HWY_HAVE_FLOAT16
 }
+#endif // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ZeroExtendVector(D /* tag */, Vec128<float> lo) {
 #if HWY_HAVE_ZEXT
@@ -3443,8 +4087,11 @@ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
 HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
-const
-
+const RebindToUnsigned<decltype(d)> du;  // for float16_t
+const Half<decltype(du)> dh_u;
+const auto lo256 = ZeroExtendVector(du, BitCast(dh_u, lo));
+return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+lo256.raw, BitCast(dh_u, hi).raw, 1)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> Combine(D d, Vec128<float> hi, Vec128<float> lo) {
@@ -3547,8 +4194,12 @@ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
 template <class T, HWY_IF_T_SIZE(T, 2)>
 HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
 Vec256<T> v) {
-const
-
+const DFromV<decltype(v)> d;
+const RebindToUnsigned<decltype(d)> du;  // for float16_t
+const Half<decltype(d)> dh;
+const RebindToUnsigned<decltype(dh)> dh_u;
+return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastw_epi16(
+BitCast(dh_u, LowerHalf(dh, v)).raw)});
 }

 template <class T, HWY_IF_UI32(T)>
@@ -3983,7 +4634,10 @@ HWY_API Vec256<double> TwoTablesLookupLanes(Vec256<double> a, Vec256<double> b,

 template <typename T>
 HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
-
+const DFromV<decltype(v)> d;
+const RebindToUnsigned<decltype(d)> du;  // for float16_t
+return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
+BitCast(du, v).raw, _MM_SHUFFLE(1, 0, 3, 2))});
 }

 HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
@@ -4022,9 +4676,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
 _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
 #else
 const RebindToSigned<decltype(d)> di;
-
-0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100
-const auto rev128 = TableLookupBytes(v,
+const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
+di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+const auto rev128 = TableLookupBytes(v, shuffle);
 return VFromD<D>{
 _mm256_permute4x64_epi64(rev128.raw, _MM_SHUFFLE(1, 0, 3, 2))};
 #endif
@@ -4053,9 +4707,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
 const RebindToSigned<decltype(d)> di;
-
-0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908
-return BitCast(d, TableLookupBytes(v,
+const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
+di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
+return BitCast(d, TableLookupBytes(v, shuffle));
 }

 // 32 bit Reverse4 defined in x86_128.
@@ -4071,9 +4725,9 @@ HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
 const RebindToSigned<decltype(d)> di;
-
-0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100
-return BitCast(d, TableLookupBytes(v,
+const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
+di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+return BitCast(d, TableLookupBytes(v, shuffle));
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
@@ -4162,8 +4816,12 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
+const RebindToUnsigned<decltype(d)> du;  // for float16_t
 const Half<decltype(d)> d2;
-
+const RebindToUnsigned<decltype(d2)> du2;  // for float16_t
+return BitCast(
+d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi,
@@ -4180,8 +4838,10 @@ HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi,

 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatLowerUpper(D
-
+HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+const RebindToUnsigned<decltype(d)> du;
+return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi,
@@ -4196,8 +4856,10 @@ HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi,

 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperLower(D
-
+HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
+const RebindToUnsigned<decltype(d)> du;  // for float16_t
+return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
+BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi,
@@ -4212,8 +4874,10 @@ HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi,

 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperUpper(D
-
+HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+const RebindToUnsigned<decltype(d)> du;  // for float16_t
+return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi,
@@ -4274,7 +4938,8 @@ HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
 const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
 const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
 const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
-return VFromD<
+return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
+u16, _MM_SHUFFLE(3, 1, 2, 0))});
 #endif
 }

@@ -4365,90 +5030,211 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
-HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+const RebindToUnsigned<decltype(d)> du;
+#if HWY_TARGET <= HWY_AVX3
+alignas(64) static constexpr uint16_t kIdx[16] = {
+0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+return BitCast(
+d, Vec256<uint32_t>{_mm256_permutex2var_epi16(
+BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+#else
+const RepartitionToWide<decltype(du)> dw;
+// Isolate lower 16 bits per u32 so we can pack.
+const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
+const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
+const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
+const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
+return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
+u16, _MM_SHUFFLE(3, 1, 2, 0))});
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+const RebindToUnsigned<decltype(d)> du;
+#if HWY_TARGET <= HWY_AVX3
+alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
+return BitCast(
+d, Vec256<uint32_t>{_mm256_permutex2var_epi32(
+BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+#else
+const RebindToFloat<decltype(d)> df;
+const Vec256<float> v2020{_mm256_shuffle_ps(
+BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
+return VFromD<D>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
+_MM_SHUFFLE(3, 1, 2, 0))};
+
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+const RebindToUnsigned<decltype(d)> du;
+#if HWY_TARGET <= HWY_AVX3
+alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
+return VFromD<D>{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
+#else
+const VFromD<D> v2020{
+_mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
+return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
+BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
+
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
+HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
+const RebindToUnsigned<decltype(d)> du;
+#if HWY_TARGET <= HWY_AVX3
+alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
+return BitCast(
+d, Vec256<uint64_t>{_mm256_permutex2var_epi64(
+BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
+#else
+const RebindToFloat<decltype(d)> df;
+const Vec256<double> v20{
+_mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
+return VFromD<D>{
+_mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
+
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API Vec256<double> ConcatEven(D d, Vec256<double> hi, Vec256<double> lo) {
+#if HWY_TARGET <= HWY_AVX3
+const RebindToUnsigned<decltype(d)> du;
+alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
+return Vec256<double>{
+_mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
+#else
+(void)d;
+const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
+return Vec256<double>{
+_mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
+#endif
+}
+
+// ------------------------------ InterleaveWholeLower
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_TARGET <= HWY_AVX3_DL
+const RebindToUnsigned<decltype(d)> du;
+alignas(32) static constexpr uint8_t kIdx[32] = {
+0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
+return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+#else
+return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+const RebindToUnsigned<decltype(d)> du;
+alignas(32) static constexpr uint16_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
+4, 20, 5, 21, 6, 22, 7, 23};
+return BitCast(
+d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
+BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+const RebindToUnsigned<decltype(d)> du;
+alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+const RebindToUnsigned<decltype(d)> du;
+alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+const RebindToUnsigned<decltype(d)> du;
+alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
+return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+const RebindToUnsigned<decltype(d)> du;
+alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
+return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+#else  // AVX2
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+}
+#endif
+
+// ------------------------------ InterleaveWholeUpper
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_TARGET <= HWY_AVX3_DL
+const RebindToUnsigned<decltype(d)> du;
+alignas(32) static constexpr uint8_t kIdx[32] = {
+16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
+return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+#else
+return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
 const RebindToUnsigned<decltype(d)> du;
-
-
-0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+alignas(32) static constexpr uint16_t kIdx[16] = {
+8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
 return BitCast(
-d,
-BitCast(du,
-#else
-const RepartitionToWide<decltype(du)> dw;
-// Isolate lower 16 bits per u32 so we can pack.
-const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
-const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
-const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
-const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
-return VFromD<D>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
+d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
+BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
-HWY_API VFromD<D>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
 const RebindToUnsigned<decltype(d)> du;
-
-
-return BitCast(
-d, Vec256<uint32_t>{_mm256_permutex2var_epi32(
-BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
-#else
-const RebindToFloat<decltype(d)> df;
-const Vec256<float> v2020{_mm256_shuffle_ps(
-BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
-return VFromD<D>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
-_MM_SHUFFLE(3, 1, 2, 0))};
-
-#endif
+alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
-HWY_API VFromD<D>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
 const RebindToUnsigned<decltype(d)> du;
-
-
-return VFromD<D>{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
-#else
-const VFromD<D> v2020{
-_mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
-return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
-BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
-
-#endif
+alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
-HWY_API VFromD<D>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
 const RebindToUnsigned<decltype(d)> du;
-
-
-return BitCast(
-d, Vec256<uint64_t>{_mm256_permutex2var_epi64(
-BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
-#else
-const RebindToFloat<decltype(d)> df;
-const Vec256<double> v20{
-_mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
-return VFromD<D>{
-_mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
-
-#endif
+alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
+return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
-HWY_API
-#if HWY_TARGET <= HWY_AVX3
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
 const RebindToUnsigned<decltype(d)> du;
-alignas(
-return
-
-#else
-
-
-return
-_mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
+alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
+return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+#else  // AVX2
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
 }
+#endif

 // ------------------------------ DupEven (InterleaveLower)

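InterleaveWholeLower/Upper, added above, interleave lanes across the full 256-bit vector rather than within each 128-bit block (the AVX3 path uses permutex2var index tables; AVX2 falls back to ConcatLowerLower/ConcatUpperUpper of the per-block interleaves). A sketch of the lower-half variant through the portable layer (illustrative only, not part of the packaged header):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// With 8 x int32 lanes, the result is {a0,b0,a1,b1,a2,b2,a3,b3}.
void ZipLowerHalves(const int32_t* pa, const int32_t* pb, int32_t* out) {
  const hn::ScalableTag<int32_t> d;
  const auto a = hn::LoadU(d, pa);
  const auto b = hn::LoadU(d, pb);
  hn::StoreU(hn::InterleaveWholeLower(d, a, b), d, out);
}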
@@ -4490,9 +5276,10 @@ template <typename T, HWY_IF_T_SIZE(T, 1)>
 HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
 const DFromV<decltype(a)> d;
 const Full256<uint8_t> d8;
-
-
-
+const VFromD<decltype(d8)> mask =
+Dup128VecFromValues(d8, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF,
+0, 0xFF, 0, 0xFF, 0);
+return IfThenElse(MaskFromVec(BitCast(d, mask)), b, a);
 }

 template <typename T, HWY_IF_UI16(T)>
@@ -4505,7 +5292,8 @@ HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {

 #if HWY_HAVE_FLOAT16
 HWY_INLINE Vec256<float16_t> OddEven(Vec256<float16_t> a, Vec256<float16_t> b) {
-return Vec256<float16_t>{
+return Vec256<float16_t>{
+_mm256_mask_blend_ph(static_cast<__mmask16>(0x5555), a.raw, b.raw)};
 }
 #endif  // HWY_HAVE_FLOAT16

@@ -4527,11 +5315,80 @@ HWY_API Vec256<double> OddEven(Vec256<double> a, Vec256<double> b) {
 return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
 }

+// -------------------------- InterleaveEven
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+return VFromD<D>{_mm256_mask_shuffle_epi32(
+a.raw, static_cast<__mmask8>(0xAA), b.raw,
+static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
+}
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+return VFromD<D>{_mm256_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0xAA),
+b.raw, b.raw,
+_MM_SHUFFLE(2, 2, 0, 0))};
+}
+#else
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+const RebindToFloat<decltype(d)> df;
+const VFromD<decltype(df)> b2_b0_a2_a0{_mm256_shuffle_ps(
+BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(2, 0, 2, 0))};
+return BitCast(
+d, VFromD<decltype(df)>{_mm256_shuffle_ps(
+b2_b0_a2_a0.raw, b2_b0_a2_a0.raw, _MM_SHUFFLE(3, 1, 2, 0))});
+}
+#endif
+
+// I64/U64/F64 InterleaveEven is generic for vector lengths >= 32 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+return InterleaveLower(a, b);
+}
+
+// -------------------------- InterleaveOdd
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+return VFromD<D>{_mm256_mask_shuffle_epi32(
+b.raw, static_cast<__mmask8>(0x55), a.raw,
+static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
+}
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+return VFromD<D>{_mm256_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x55),
+a.raw, a.raw,
+_MM_SHUFFLE(3, 3, 1, 1))};
+}
+#else
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+const RebindToFloat<decltype(d)> df;
+const VFromD<decltype(df)> b3_b1_a3_a3{_mm256_shuffle_ps(
+BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(3, 1, 3, 1))};
+return BitCast(
+d, VFromD<decltype(df)>{_mm256_shuffle_ps(
+b3_b1_a3_a3.raw, b3_b1_a3_a3.raw, _MM_SHUFFLE(3, 1, 2, 0))});
+}
+#endif
+
+// I64/U64/F64 InterleaveOdd is generic for vector lengths >= 32 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+return InterleaveUpper(d, a, b);
+}
+
 // ------------------------------ OddEvenBlocks

 template <typename T, HWY_IF_NOT_FLOAT3264(T)>
 Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
-
+const DFromV<decltype(odd)> d;
+const RebindToUnsigned<decltype(d)> du;
+return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
+BitCast(du, odd).raw, BitCast(du, even).raw, 0xFu)});
 }

 HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) {
@@ -4554,7 +5411,10 @@ HWY_API VFromD<D> ReverseBlocks(D /*d*/, VFromD<D> v) {
|
|
|
4554
5411
|
// Both full
|
|
4555
5412
|
template <typename T, typename TI>
|
|
4556
5413
|
HWY_API Vec256<TI> TableLookupBytes(Vec256<T> bytes, Vec256<TI> from) {
|
|
4557
|
-
|
|
5414
|
+
const DFromV<decltype(from)> d;
|
|
5415
|
+
return BitCast(d, Vec256<uint8_t>{_mm256_shuffle_epi8(
|
|
5416
|
+
BitCast(Full256<uint8_t>(), bytes).raw,
|
|
5417
|
+
BitCast(Full256<uint8_t>(), from).raw)});
|
|
4558
5418
|
}
|
|
4559
5419
|
|
|
4560
5420
|
// Partial index vector
|
|
@@ -5114,14 +5974,15 @@ HWY_API Vec256<uint8_t> Shl(hwy::UnsignedTag tag, Vec256<uint8_t> v,
   const DFromV<decltype(v)> d;
 #if HWY_TARGET <= HWY_AVX3_DL
   (void)tag;
-  //
-
-  0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01,
+  // masks[i] = 0xFF >> i
+  const VFromD<decltype(d)> masks =
+      Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
+                          0, 0, 0, 0, 0, 0, 0);
   // kShl[i] = 1 << i
-
-
-  v = And(v, TableLookupBytes(
-  const VFromD<decltype(d)> mul = TableLookupBytes(
+  const VFromD<decltype(d)> shl = Dup128VecFromValues(
+      d, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
+  v = And(v, TableLookupBytes(masks, bits));
+  const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
   return VFromD<decltype(d)>{_mm256_gf2p8mul_epi8(v.raw, mul.raw)};
 #else
   const Repartition<uint16_t, decltype(d)> dw;
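The AVX3_DL path above implements per-byte variable left shifts without 8-bit shift instructions: it masks off the bits that would be shifted out, then multiplies by `1 << shift` using the carry-less GF(2^8) multiply `_mm256_gf2p8mul_epi8`. After masking, the product never needs reduction, so it equals the ordinary shift. A scalar sketch of the same identity (helper name is mine):

```cpp
#include <cassert>
#include <cstdint>

// For 0 <= n < 8: (v & (0xFF >> n)) has at most 8 - n significant bits, so
// multiplying it by 1 << n stays within 8 bits and equals (v << n) & 0xFF.
// This is why the GF(2^8) multiply above can stand in for a byte shift.
uint8_t ShlViaMulRef(uint8_t v, unsigned n) {
  const uint8_t mask = static_cast<uint8_t>(0xFF >> n);  // masks[n]
  const uint8_t mul = static_cast<uint8_t>(1u << n);     // kShl[n]
  return static_cast<uint8_t>((v & mask) * mul);
}

int main() {
  for (unsigned v = 0; v < 256; ++v) {
    for (unsigned n = 0; n < 8; ++n) {
      assert(ShlViaMulRef(static_cast<uint8_t>(v), n) ==
             static_cast<uint8_t>(v << n));
    }
  }
}
```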
@@ -5271,62 +6132,6 @@ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
 #endif
 }
 
-HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
-                                    const Vec256<uint64_t> b) {
-  const Full256<uint64_t> du64;
-  const RepartitionToNarrow<decltype(du64)> du32;
-  const auto maskL = Set(du64, 0xFFFFFFFFULL);
-  const auto a32 = BitCast(du32, a);
-  const auto b32 = BitCast(du32, b);
-  // Inputs for MulEven: we only need the lower 32 bits
-  const auto aH = Shuffle2301(a32);
-  const auto bH = Shuffle2301(b32);
-
-  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
-  // the even (lower 64 bits of every 128-bit block) results. See
-  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
-  const auto aLbL = MulEven(a32, b32);
-  const auto w3 = aLbL & maskL;
-
-  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
-  const auto w2 = t2 & maskL;
-  const auto w1 = ShiftRight<32>(t2);
-
-  const auto t = MulEven(a32, bH) + w2;
-  const auto k = ShiftRight<32>(t);
-
-  const auto mulH = MulEven(aH, bH) + w1 + k;
-  const auto mulL = ShiftLeft<32>(t) + w3;
-  return InterleaveLower(mulL, mulH);
-}
-
-HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
-                                   const Vec256<uint64_t> b) {
-  const Full256<uint64_t> du64;
-  const RepartitionToNarrow<decltype(du64)> du32;
-  const auto maskL = Set(du64, 0xFFFFFFFFULL);
-  const auto a32 = BitCast(du32, a);
-  const auto b32 = BitCast(du32, b);
-  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
-  const auto aH = Shuffle2301(a32);
-  const auto bH = Shuffle2301(b32);
-
-  // Same as above, but we're using the odd results (upper 64 bits per block).
-  const auto aLbL = MulEven(a32, b32);
-  const auto w3 = aLbL & maskL;
-
-  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
-  const auto w2 = t2 & maskL;
-  const auto w1 = ShiftRight<32>(t2);
-
-  const auto t = MulEven(a32, bH) + w2;
-  const auto k = ShiftRight<32>(t);
-
-  const auto mulH = MulEven(aH, bH) + w1 + k;
-  const auto mulL = ShiftLeft<32>(t) + w3;
-  return InterleaveUpper(du64, mulL, mulH);
-}
-
 // ------------------------------ WidenMulPairwiseAdd
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
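The removed `MulEven`/`MulOdd` overloads built a 64x64 -> 128-bit product out of 32x32 -> 64-bit `MulEven` pieces (Knuth's double-word algorithm, per the comment). For reference, a scalar sketch of that same decomposition (function name is mine, not the library's):

```cpp
#include <cstdint>

// 64x64 -> 128-bit unsigned multiply from 32x32 -> 64-bit products, mirroring
// the vector code removed above (Hacker's Delight "muldwu").
void MulU64WideRef(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;

  const uint64_t aLbL = aL * bL;
  const uint64_t w3 = aLbL & 0xFFFFFFFFu;

  const uint64_t t2 = aH * bL + (aLbL >> 32);
  const uint64_t w2 = t2 & 0xFFFFFFFFu;
  const uint64_t w1 = t2 >> 32;

  const uint64_t t = aL * bH + w2;
  const uint64_t k = t >> 32;

  hi = aH * bH + w1 + k;  // upper 64 bits of the product
  lo = (t << 32) + w3;    // lower 64 bits of the product
}
```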
@@ -5343,7 +6148,31 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
   return VFromD<DI16>{_mm256_maddubs_epi16(a.raw, b.raw)};
 }
 
+// ------------------------------ SatWidenMulPairwiseAccumulate
+
+#if HWY_TARGET <= HWY_AVX3_DL
+template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 32)>
+HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
+    DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
+    VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
+  return VFromD<DI32>{_mm256_dpwssds_epi32(sum.raw, a.raw, b.raw)};
+}
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
 // ------------------------------ ReorderWidenMulAccumulate
+
+#if HWY_NATIVE_DOT_BF16
+template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 32),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
+                                             const VFromD<DF> sum0,
+                                             VFromD<DF>& /*sum1*/) {
+  return VFromD<DF>{_mm256_dpbf16_ps(sum0.raw,
+                                     reinterpret_cast<__m256bh>(a.raw),
+                                     reinterpret_cast<__m256bh>(b.raw))};
+}
+#endif  // HWY_NATIVE_DOT_BF16
+
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
 HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec256<int16_t> a,
                                             Vec256<int16_t> b,
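Both additions are pairwise widen-multiply-accumulate operations: `_mm256_dpwssds_epi32` multiplies adjacent i16 pairs, sums each pair, and adds into the i32 accumulator with signed saturation, while `_mm256_dpbf16_ps` does the analogous bf16 -> f32 accumulation. A scalar model of one accumulator lane of the i16 path (name and structure are illustrative):

```cpp
#include <cstdint>
#include <limits>

// One i32 lane of SatWidenMulPairwiseAccumulate:
// result = SaturateToI32(sum + a0*b0 + a1*b1).
int32_t SatWidenMulPairAccRef(int16_t a0, int16_t a1, int16_t b0, int16_t b1,
                              int32_t sum) {
  // The pair of products fits in 33 bits; do the whole sum in 64 bits, then
  // clamp once, matching the final saturating add.
  const int64_t total =
      int64_t{sum} + int64_t{a0} * b0 + int64_t{a1} * b1;
  if (total > std::numeric_limits<int32_t>::max()) {
    return std::numeric_limits<int32_t>::max();
  }
  if (total < std::numeric_limits<int32_t>::min()) {
    return std::numeric_limits<int32_t>::min();
  }
  return static_cast<int32_t>(total);
}
```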
@@ -5461,22 +6290,91 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<int8_t> v) {
|
|
|
5461
6290
|
|
|
5462
6291
|
#if HWY_TARGET <= HWY_AVX3
|
|
5463
6292
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
|
|
5464
|
-
HWY_API VFromD<D>
|
|
5465
|
-
|
|
5466
|
-
|
|
5467
|
-
|
|
6293
|
+
HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
|
|
6294
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6295
|
+
// Workaround for undefined behavior with GCC if any values of v[i] are not
|
|
6296
|
+
// within the range of an int64_t
|
|
6297
|
+
|
|
6298
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6299
|
+
if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
|
|
6300
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
|
|
6301
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
6302
|
+
return VFromD<D>{_mm256_setr_epi64x(
|
|
6303
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
|
|
6304
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
|
|
6305
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
|
|
6306
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]))};
|
|
6307
|
+
}
|
|
6308
|
+
#endif
|
|
5468
6309
|
|
|
5469
|
-
|
|
5470
|
-
|
|
5471
|
-
|
|
6310
|
+
__m256i raw_result;
|
|
6311
|
+
__asm__("vcvttps2qq {%1, %0|%0, %1}"
|
|
6312
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6313
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6314
|
+
:);
|
|
6315
|
+
return VFromD<D>{raw_result};
|
|
6316
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
6317
|
+
return VFromD<D>{_mm256_cvttps_epi64(v.raw)};
|
|
6318
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
5472
6319
|
}
|
|
5473
6320
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
|
|
5474
|
-
HWY_API VFromD<D>
|
|
5475
|
-
|
|
5476
|
-
|
|
6321
|
+
HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
|
|
6322
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6323
|
+
// Workaround for undefined behavior with GCC if any values of v[i] are not
|
|
6324
|
+
// within the range of an uint64_t
|
|
6325
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6326
|
+
if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
|
|
6327
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
|
|
6328
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
6329
|
+
return VFromD<D>{_mm256_setr_epi64x(
|
|
6330
|
+
static_cast<int64_t>(
|
|
6331
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
|
|
6332
|
+
static_cast<int64_t>(
|
|
6333
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
|
|
6334
|
+
static_cast<int64_t>(
|
|
6335
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
|
|
6336
|
+
static_cast<int64_t>(
|
|
6337
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])))};
|
|
6338
|
+
}
|
|
6339
|
+
#endif
|
|
6340
|
+
|
|
6341
|
+
__m256i raw_result;
|
|
6342
|
+
__asm__("vcvttps2uqq {%1, %0|%0, %1}"
|
|
6343
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6344
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6345
|
+
:);
|
|
6346
|
+
return VFromD<D>{raw_result};
|
|
6347
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
6348
|
+
return VFromD<D>{_mm256_cvttps_epu64(v.raw)};
|
|
6349
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
5477
6350
|
}
|
|
5478
6351
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
5479
6352
|
|
|
6353
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
6354
|
+
#if HWY_TARGET > HWY_AVX3
|
|
6355
|
+
namespace detail {
|
|
6356
|
+
|
|
6357
|
+
// I32->I64 PromoteEvenTo/PromoteOddTo
|
|
6358
|
+
|
|
6359
|
+
template <class D, HWY_IF_LANES_D(D, 4)>
|
|
6360
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
|
|
6361
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
6362
|
+
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
6363
|
+
Vec256<int32_t> v) {
|
|
6364
|
+
return BitCast(d_to, OddEven(DupEven(BroadcastSignBit(v)), v));
|
|
6365
|
+
}
|
|
6366
|
+
|
|
6367
|
+
template <class D, HWY_IF_LANES_D(D, 4)>
|
|
6368
|
+
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
|
|
6369
|
+
hwy::SizeTag<8> /*to_lane_size_tag*/,
|
|
6370
|
+
hwy::SignedTag /*from_type_tag*/, D d_to,
|
|
6371
|
+
Vec256<int32_t> v) {
|
|
6372
|
+
return BitCast(d_to, OddEven(BroadcastSignBit(v), DupOdd(v)));
|
|
6373
|
+
}
|
|
6374
|
+
|
|
6375
|
+
} // namespace detail
|
|
6376
|
+
#endif
|
|
6377
|
+
|
|
5480
6378
|
// ------------------------------ Demotions (full -> part w/ narrow lanes)
|
|
5481
6379
|
|
|
5482
6380
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
|
|
@@ -5565,32 +6463,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
   return VFromD<D>{_mm256_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
   return VFromD<D>{_mm256_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
   return VFromD<D>{_mm256_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
 }
 
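These demotions combine a zeroing mask of non-negative lanes with `_mm256_maskz_cvtusepi64_*`: negative inputs are zeroed by the mask, the rest are narrowed with unsigned saturation. A scalar model of one lane of the i64 -> u32 case (function name is mine):

```cpp
#include <cstdint>

// Scalar model of the signed-to-unsigned saturating demotion above:
// negative values clamp to 0, values above 0xFFFFFFFF clamp to 0xFFFFFFFF.
uint32_t DemoteI64ToU32Ref(int64_t v) {
  if (v < 0) return 0;
  if (v > int64_t{0xFFFFFFFF}) return 0xFFFFFFFFu;
  return static_cast<uint32_t>(v);
}
```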
@@ -5617,32 +6500,54 @@ HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
|
|
|
5617
6500
|
|
|
5618
6501
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
|
|
5619
6502
|
HWY_API VFromD<D> DemoteTo(D df16, Vec256<float> v) {
|
|
5620
|
-
(
|
|
5621
|
-
return
|
|
6503
|
+
const RebindToUnsigned<decltype(df16)> du16;
|
|
6504
|
+
return BitCast(
|
|
6505
|
+
df16, VFromD<decltype(du16)>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
|
|
5622
6506
|
}
|
|
5623
6507
|
|
|
5624
6508
|
HWY_DIAGNOSTICS(pop)
|
|
5625
6509
|
|
|
5626
6510
|
#endif // HWY_DISABLE_F16C
|
|
5627
6511
|
|
|
6512
|
+
#if HWY_HAVE_FLOAT16
|
|
6513
|
+
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
|
|
6514
|
+
HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec256<double> v) {
|
|
6515
|
+
return VFromD<D>{_mm256_cvtpd_ph(v.raw)};
|
|
6516
|
+
}
|
|
6517
|
+
#endif // HWY_HAVE_FLOAT16
|
|
6518
|
+
|
|
6519
|
+
#if HWY_AVX3_HAVE_F32_TO_BF16C
|
|
5628
6520
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
|
|
5629
|
-
HWY_API VFromD<D> DemoteTo(D dbf16
|
|
5630
|
-
|
|
5631
|
-
|
|
5632
|
-
|
|
5633
|
-
|
|
5634
|
-
|
|
5635
|
-
|
|
6521
|
+
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec256<float> v) {
|
|
6522
|
+
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
|
|
6523
|
+
// Inline assembly workaround for LLVM codegen bug
|
|
6524
|
+
__m128i raw_result;
|
|
6525
|
+
__asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
|
|
6526
|
+
return VFromD<D>{raw_result};
|
|
6527
|
+
#else
|
|
6528
|
+
// The _mm256_cvtneps_pbh intrinsic returns a __m128bh vector that needs to be
|
|
6529
|
+
// bit casted to a __m128i vector
|
|
6530
|
+
return VFromD<D>{detail::BitCastToInteger(_mm256_cvtneps_pbh(v.raw))};
|
|
6531
|
+
#endif
|
|
5636
6532
|
}
|
|
5637
6533
|
|
|
5638
6534
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
|
|
5639
|
-
HWY_API VFromD<D> ReorderDemote2To(D dbf16
|
|
5640
|
-
|
|
5641
|
-
|
|
5642
|
-
|
|
5643
|
-
|
|
5644
|
-
|
|
6535
|
+
HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec256<float> a,
|
|
6536
|
+
Vec256<float> b) {
|
|
6537
|
+
#if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
|
|
6538
|
+
// Inline assembly workaround for LLVM codegen bug
|
|
6539
|
+
__m256i raw_result;
|
|
6540
|
+
__asm__("vcvtne2ps2bf16 %2, %1, %0"
|
|
6541
|
+
: "=v"(raw_result)
|
|
6542
|
+
: "v"(b.raw), "v"(a.raw));
|
|
6543
|
+
return VFromD<D>{raw_result};
|
|
6544
|
+
#else
|
|
6545
|
+
// The _mm256_cvtne2ps_pbh intrinsic returns a __m256bh vector that needs to
|
|
6546
|
+
// be bit casted to a __m256i vector
|
|
6547
|
+
return VFromD<D>{detail::BitCastToInteger(_mm256_cvtne2ps_pbh(b.raw, a.raw))};
|
|
6548
|
+
#endif
|
|
5645
6549
|
}
|
|
6550
|
+
#endif // HWY_AVX3_HAVE_F32_TO_BF16C
|
|
5646
6551
|
|
|
5647
6552
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
|
|
5648
6553
|
HWY_API VFromD<D> ReorderDemote2To(D /*d16*/, Vec256<int32_t> a,
|
|
@@ -5733,9 +6638,9 @@ HWY_API Vec256<uint32_t> ReorderDemote2To(D dn, Vec256<int64_t> a,
                                       _MM_SHUFFLE(2, 0, 2, 0))});
 }
 
-template <class D, HWY_IF_V_SIZE_D(D, 32),
-HWY_API
-
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> ReorderDemote2To(D dn, Vec256<uint64_t> a,
+                                   Vec256<uint64_t> b) {
   const Half<decltype(dn)> dnh;
   const Repartition<float, decltype(dn)> dn_f;
 
@@ -5767,37 +6672,64 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
|
|
|
5767
6672
|
}
|
|
5768
6673
|
|
|
5769
6674
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
|
|
5770
|
-
HWY_API VFromD<D>
|
|
5771
|
-
|
|
5772
|
-
|
|
5773
|
-
|
|
6675
|
+
HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
|
|
6676
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6677
|
+
// Workaround for undefined behavior in _mm256_cvttpd_epi32 with GCC if any
|
|
6678
|
+
// values of v[i] are not within the range of an int32_t
|
|
6679
|
+
|
|
6680
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6681
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
6682
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
|
|
6683
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
6684
|
+
return Dup128VecFromValues(
|
|
6685
|
+
D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
|
|
6686
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
|
|
6687
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
|
|
6688
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]));
|
|
6689
|
+
}
|
|
6690
|
+
#endif
|
|
6691
|
+
|
|
6692
|
+
__m128i raw_result;
|
|
6693
|
+
__asm__("vcvttpd2dq {%1, %0|%0, %1}"
|
|
6694
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6695
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6696
|
+
:);
|
|
6697
|
+
return VFromD<D>{raw_result};
|
|
6698
|
+
#else
|
|
6699
|
+
return VFromD<D>{_mm256_cvttpd_epi32(v.raw)};
|
|
6700
|
+
#endif
|
|
5774
6701
|
}
|
|
5775
6702
|
|
|
5776
|
-
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
|
|
5777
|
-
HWY_API VFromD<D> DemoteTo(D du32, Vec256<double> v) {
|
|
5778
6703
|
#if HWY_TARGET <= HWY_AVX3
|
|
5779
|
-
|
|
5780
|
-
|
|
5781
|
-
|
|
5782
|
-
|
|
5783
|
-
|
|
5784
|
-
|
|
5785
|
-
|
|
5786
|
-
|
|
5787
|
-
|
|
6704
|
+
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
|
|
6705
|
+
HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
|
|
6706
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6707
|
+
// Workaround for undefined behavior in _mm256_cvttpd_epu32 with GCC if any
|
|
6708
|
+
// values of v[i] are not within the range of an uint32_t
|
|
6709
|
+
|
|
6710
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
6711
|
+
if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
|
|
6712
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
|
|
6713
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
6714
|
+
return Dup128VecFromValues(
|
|
6715
|
+
D(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
|
|
6716
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]),
|
|
6717
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2]),
|
|
6718
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3]));
|
|
6719
|
+
}
|
|
6720
|
+
#endif
|
|
5788
6721
|
|
|
5789
|
-
|
|
5790
|
-
|
|
5791
|
-
|
|
5792
|
-
|
|
5793
|
-
|
|
5794
|
-
|
|
5795
|
-
|
|
5796
|
-
return
|
|
6722
|
+
__m128i raw_result;
|
|
6723
|
+
__asm__("vcvttpd2udq {%1, %0|%0, %1}"
|
|
6724
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6725
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6726
|
+
:);
|
|
6727
|
+
return VFromD<D>{raw_result};
|
|
6728
|
+
#else
|
|
6729
|
+
return VFromD<D>{_mm256_cvttpd_epu32(v.raw)};
|
|
5797
6730
|
#endif
|
|
5798
6731
|
}
|
|
5799
6732
|
|
|
5800
|
-
#if HWY_TARGET <= HWY_AVX3
|
|
5801
6733
|
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
|
|
5802
6734
|
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
|
|
5803
6735
|
return VFromD<D>{_mm256_cvtepi64_ps(v.raw)};
|
|
@@ -5963,61 +6895,274 @@ HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<uint64_t> v) {
|
|
|
5963
6895
|
|
|
5964
6896
|
#if HWY_HAVE_FLOAT16
|
|
5965
6897
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
|
|
5966
|
-
HWY_API VFromD<D>
|
|
5967
|
-
|
|
5968
|
-
|
|
6898
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float16_t> v) {
|
|
6899
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6900
|
+
// Workaround for undefined behavior in _mm256_cvttph_epi16 with GCC if any
|
|
6901
|
+
// values of v[i] are not within the range of an int16_t
|
|
6902
|
+
|
|
6903
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
|
|
6904
|
+
HWY_HAVE_SCALAR_F16_TYPE
|
|
6905
|
+
if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
|
|
6906
|
+
typedef hwy::float16_t::Native GccF16RawVectType
|
|
6907
|
+
__attribute__((__vector_size__(32)));
|
|
6908
|
+
const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
|
|
6909
|
+
return VFromD<D>{_mm256_setr_epi16(
|
|
6910
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]),
|
|
6911
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
|
|
6912
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
|
|
6913
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
|
|
6914
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
|
|
6915
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
|
|
6916
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
|
|
6917
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]),
|
|
6918
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[8]),
|
|
6919
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[9]),
|
|
6920
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[10]),
|
|
6921
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[11]),
|
|
6922
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[12]),
|
|
6923
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[13]),
|
|
6924
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[14]),
|
|
6925
|
+
detail::X86ConvertScalarFromFloat<int16_t>(raw_v[15]))};
|
|
6926
|
+
}
|
|
6927
|
+
#endif
|
|
6928
|
+
|
|
6929
|
+
__m256i raw_result;
|
|
6930
|
+
__asm__("vcvttph2w {%1, %0|%0, %1}"
|
|
6931
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6932
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6933
|
+
:);
|
|
6934
|
+
return VFromD<D>{raw_result};
|
|
6935
|
+
#else // HWY_COMPILER_GCC_ACTUAL < 1200
|
|
6936
|
+
return VFromD<D>{_mm256_cvttph_epi16(v.raw)};
|
|
6937
|
+
#endif
|
|
6938
|
+
}
|
|
6939
|
+
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
|
|
6940
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
|
|
6941
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
6942
|
+
// Workaround for undefined behavior in _mm256_cvttph_epu16 with GCC if any
|
|
6943
|
+
// values of v[i] are not within the range of an uint16_t
|
|
6944
|
+
|
|
6945
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
|
|
6946
|
+
HWY_HAVE_SCALAR_F16_TYPE
|
|
6947
|
+
if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
|
|
6948
|
+
typedef hwy::float16_t::Native GccF16RawVectType
|
|
6949
|
+
__attribute__((__vector_size__(32)));
|
|
6950
|
+
const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
|
|
6951
|
+
return VFromD<D>{_mm256_setr_epi16(
|
|
6952
|
+
static_cast<int16_t>(
|
|
6953
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0])),
|
|
6954
|
+
static_cast<int16_t>(
|
|
6955
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1])),
|
|
6956
|
+
static_cast<int16_t>(
|
|
6957
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2])),
|
|
6958
|
+
static_cast<int16_t>(
|
|
6959
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3])),
|
|
6960
|
+
static_cast<int16_t>(
|
|
6961
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4])),
|
|
6962
|
+
static_cast<int16_t>(
|
|
6963
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5])),
|
|
6964
|
+
static_cast<int16_t>(
|
|
6965
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6])),
|
|
6966
|
+
static_cast<int16_t>(
|
|
6967
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])),
|
|
6968
|
+
static_cast<int16_t>(
|
|
6969
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[8])),
|
|
6970
|
+
static_cast<int16_t>(
|
|
6971
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[9])),
|
|
6972
|
+
static_cast<int16_t>(
|
|
6973
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[10])),
|
|
6974
|
+
static_cast<int16_t>(
|
|
6975
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[11])),
|
|
6976
|
+
static_cast<int16_t>(
|
|
6977
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[12])),
|
|
6978
|
+
static_cast<int16_t>(
|
|
6979
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[13])),
|
|
6980
|
+
static_cast<int16_t>(
|
|
6981
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[14])),
|
|
6982
|
+
static_cast<int16_t>(
|
|
6983
|
+
detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[15])))};
|
|
6984
|
+
}
|
|
6985
|
+
#endif
|
|
6986
|
+
|
|
6987
|
+
__m256i raw_result;
|
|
6988
|
+
__asm__("vcvttph2uw {%1, %0|%0, %1}"
|
|
6989
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
6990
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
6991
|
+
:);
|
|
6992
|
+
return VFromD<D>{raw_result};
|
|
6993
|
+
#else // HWY_COMPILER_GCC_ACTUAL < 1200
|
|
6994
|
+
return VFromD<D>{_mm256_cvttph_epu16(v.raw)};
|
|
6995
|
+
#endif
|
|
5969
6996
|
}
|
|
5970
6997
|
#endif // HWY_HAVE_FLOAT16
|
|
5971
6998
|
|
|
5972
6999
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
|
|
5973
|
-
HWY_API VFromD<D>
|
|
5974
|
-
|
|
5975
|
-
|
|
7000
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float> v) {
|
|
7001
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7002
|
+
// Workaround for undefined behavior in _mm256_cvttps_epi32 with GCC if any
|
|
7003
|
+
// values of v[i] are not within the range of an int32_t
|
|
7004
|
+
|
|
7005
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7006
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
7007
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
|
|
7008
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
7009
|
+
return VFromD<D>{_mm256_setr_epi32(
|
|
7010
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
|
|
7011
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
|
|
7012
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
|
|
7013
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
|
|
7014
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
|
|
7015
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
|
|
7016
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
|
|
7017
|
+
detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
|
|
7018
|
+
}
|
|
7019
|
+
#endif
|
|
7020
|
+
|
|
7021
|
+
__m256i raw_result;
|
|
7022
|
+
__asm__("vcvttps2dq {%1, %0|%0, %1}"
|
|
7023
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7024
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7025
|
+
:);
|
|
7026
|
+
return VFromD<D>{raw_result};
|
|
7027
|
+
#else
|
|
7028
|
+
return VFromD<D>{_mm256_cvttps_epi32(v.raw)};
|
|
7029
|
+
#endif
|
|
5976
7030
|
}
|
|
5977
7031
|
|
|
5978
7032
|
#if HWY_TARGET <= HWY_AVX3
|
|
5979
7033
|
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
|
|
5980
|
-
HWY_API VFromD<D>
|
|
5981
|
-
|
|
5982
|
-
|
|
7034
|
+
HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec256<double> v) {
|
|
7035
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7036
|
+
// Workaround for undefined behavior in _mm256_cvttpd_epi64 with GCC if any
|
|
7037
|
+
// values of v[i] are not within the range of an int64_t
|
|
7038
|
+
|
|
7039
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7040
|
+
if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
|
|
7041
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
|
|
7042
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
7043
|
+
return VFromD<D>{_mm256_setr_epi64x(
|
|
7044
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
|
|
7045
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
|
|
7046
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
|
|
7047
|
+
detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]))};
|
|
7048
|
+
}
|
|
7049
|
+
#endif
|
|
7050
|
+
|
|
7051
|
+
__m256i raw_result;
|
|
7052
|
+
__asm__("vcvttpd2qq {%1, %0|%0, %1}"
|
|
7053
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7054
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7055
|
+
:);
|
|
7056
|
+
return VFromD<D>{raw_result};
|
|
7057
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
7058
|
+
return VFromD<D>{_mm256_cvttpd_epi64(v.raw)};
|
|
7059
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
5983
7060
|
}
|
|
5984
7061
|
template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
|
|
5985
|
-
HWY_API VFromD<DU>
|
|
5986
|
-
|
|
5987
|
-
|
|
7062
|
+
HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
|
|
7063
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7064
|
+
// Workaround for undefined behavior in _mm256_cvttps_epu32 with GCC if any
|
|
7065
|
+
// values of v[i] are not within the range of an uint32_t
|
|
7066
|
+
|
|
7067
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7068
|
+
if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
|
|
7069
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
|
|
7070
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
7071
|
+
return VFromD<DU>{_mm256_setr_epi32(
|
|
7072
|
+
static_cast<int32_t>(
|
|
7073
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
|
|
7074
|
+
static_cast<int32_t>(
|
|
7075
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
|
|
7076
|
+
static_cast<int32_t>(
|
|
7077
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
|
|
7078
|
+
static_cast<int32_t>(
|
|
7079
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
|
|
7080
|
+
static_cast<int32_t>(
|
|
7081
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
|
|
7082
|
+
static_cast<int32_t>(
|
|
7083
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
|
|
7084
|
+
static_cast<int32_t>(
|
|
7085
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
|
|
7086
|
+
static_cast<int32_t>(
|
|
7087
|
+
detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])))};
|
|
7088
|
+
}
|
|
7089
|
+
#endif
|
|
7090
|
+
|
|
7091
|
+
__m256i raw_result;
|
|
7092
|
+
__asm__("vcvttps2udq {%1, %0|%0, %1}"
|
|
7093
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7094
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7095
|
+
:);
|
|
7096
|
+
return VFromD<DU>{raw_result};
|
|
7097
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
7098
|
+
return VFromD<DU>{_mm256_cvttps_epu32(v.raw)};
|
|
7099
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
5988
7100
|
}
|
|
5989
7101
|
template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
|
|
5990
|
-
HWY_API VFromD<DU>
|
|
5991
|
-
|
|
5992
|
-
|
|
5993
|
-
|
|
5994
|
-
|
|
5995
|
-
|
|
5996
|
-
|
|
5997
|
-
|
|
5998
|
-
|
|
5999
|
-
|
|
6000
|
-
|
|
6001
|
-
|
|
6002
|
-
|
|
6003
|
-
|
|
6004
|
-
|
|
6005
|
-
|
|
6006
|
-
|
|
6007
|
-
|
|
6008
|
-
|
|
6009
|
-
|
|
6010
|
-
|
|
6011
|
-
|
|
6012
|
-
|
|
6013
|
-
|
|
7102
|
+
HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
|
|
7103
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7104
|
+
// Workaround for undefined behavior in _mm256_cvttpd_epu64 with GCC if any
|
|
7105
|
+
// values of v[i] are not within the range of an uint64_t
|
|
7106
|
+
|
|
7107
|
+
#if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7108
|
+
if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
|
|
7109
|
+
typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
|
|
7110
|
+
const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
|
|
7111
|
+
return VFromD<DU>{_mm256_setr_epi64x(
|
|
7112
|
+
static_cast<int64_t>(
|
|
7113
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
|
|
7114
|
+
static_cast<int64_t>(
|
|
7115
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
|
|
7116
|
+
static_cast<int64_t>(
|
|
7117
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
|
|
7118
|
+
static_cast<int64_t>(
|
|
7119
|
+
detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])))};
|
|
7120
|
+
}
|
|
7121
|
+
#endif
|
|
7122
|
+
|
|
7123
|
+
__m256i raw_result;
|
|
7124
|
+
__asm__("vcvttpd2uqq {%1, %0|%0, %1}"
|
|
7125
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7126
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7127
|
+
:);
|
|
7128
|
+
return VFromD<DU>{raw_result};
|
|
7129
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
7130
|
+
return VFromD<DU>{_mm256_cvttpd_epu64(v.raw)};
|
|
7131
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
6014
7132
|
}
|
|
6015
7133
|
#endif // HWY_TARGET <= HWY_AVX3
|
|
6016
7134
|
|
|
6017
|
-
|
|
6018
|
-
|
|
6019
|
-
|
|
6020
|
-
|
|
7135
|
+
template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I32_D(DI)>
|
|
7136
|
+
HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
|
|
7137
|
+
#if HWY_COMPILER_GCC_ACTUAL
|
|
7138
|
+
// Workaround for undefined behavior in _mm256_cvtps_epi32 if any values of
|
|
7139
|
+
// v[i] are not within the range of an int32_t
|
|
7140
|
+
|
|
7141
|
+
#if HWY_COMPILER_GCC >= 700 && !HWY_IS_DEBUG_BUILD
|
|
7142
|
+
if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
|
|
7143
|
+
typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
|
|
7144
|
+
const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
|
|
7145
|
+
return VFromD<DI>{
|
|
7146
|
+
_mm256_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
|
|
7147
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
|
|
7148
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
|
|
7149
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
|
|
7150
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
|
|
7151
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
|
|
7152
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
|
|
7153
|
+
detail::X86ScalarNearestInt<int32_t>(raw_v[7]))};
|
|
7154
|
+
}
|
|
7155
|
+
#endif
|
|
7156
|
+
|
|
7157
|
+
__m256i raw_result;
|
|
7158
|
+
__asm__("vcvtps2dq {%1, %0|%0, %1}"
|
|
7159
|
+
: "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
|
|
7160
|
+
: HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
|
|
7161
|
+
:);
|
|
7162
|
+
return VFromD<DI>{raw_result};
|
|
7163
|
+
#else // !HWY_COMPILER_GCC_ACTUAL
|
|
7164
|
+
return VFromD<DI>{_mm256_cvtps_epi32(v.raw)};
|
|
7165
|
+
#endif // HWY_COMPILER_GCC_ACTUAL
|
|
6021
7166
|
}
|
|
6022
7167
|
|
|
6023
7168
|
#ifndef HWY_DISABLE_F16C
|
|
@@ -6035,6 +7180,15 @@ HWY_API VFromD<D> PromoteTo(D df32, Vec128<float16_t> v) {
 
 #endif  // HWY_DISABLE_F16C
 
+#if HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec64<float16_t> v) {
+  return VFromD<D>{_mm256_cvtph_pd(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> PromoteTo(D df32, Vec128<bfloat16_t> v) {
   const Rebind<uint16_t, decltype(df32)> du16;
@@ -6120,14 +7274,14 @@ template <uint8_t kRcon>
 HWY_API Vec256<uint8_t> AESKeyGenAssist(Vec256<uint8_t> v) {
   const Full256<uint8_t> d;
 #if HWY_TARGET <= HWY_AVX3_DL
-
-  0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0
-
-  0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12
+  const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
+      d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
+  const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
+      d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
   const Repartition<uint32_t, decltype(d)> du32;
   const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
-  const auto sub_word_result = AESLastRound(w13,
-  return TableLookupBytes(sub_word_result,
+  const auto sub_word_result = AESLastRound(w13, rconXorMask);
+  return TableLookupBytes(sub_word_result, rotWordShuffle);
 #else
   const Half<decltype(d)> d2;
   return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
@@ -6387,9 +7541,9 @@ HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
       0x0303030303030303ull};
   const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
 
-
-
-  return RebindMask(d, TestBit(rep8,
+  const VFromD<decltype(du)> bit = Dup128VecFromValues(
+      du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+  return RebindMask(d, TestBit(rep8, bit));
 }
 
 template <typename T, HWY_IF_T_SIZE(T, 2)>
@@ -6923,6 +8077,16 @@ HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
 
 #endif  // HWY_TARGET <= HWY_AVX3
 
+// ------------------------------ Dup128MaskFromMaskBits
+
+// Generic for all vector lengths >= 32 bytes
+template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  const Half<decltype(d)> dh;
+  const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
+  return CombineMasks(d, mh, mh);
+}
+
 // ------------------------------ Expand
 
 // Always define Expand/LoadExpand because generic_ops only does so for Vec128.
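The added `Dup128MaskFromMaskBits` replicates the same per-block mask into every 128-bit block of the vector: bit i of `mask_bits` controls lane i of each block. A scalar sketch of that semantic (parameter names and the array-of-bools form are illustrative):

```cpp
#include <cstddef>

// Scalar model: for a vector with `lanes_per_block` lanes per 128-bit block,
// lane j is active iff bit (j % lanes_per_block) of mask_bits is set.
void Dup128MaskFromMaskBitsRef(unsigned mask_bits, size_t lanes_per_block,
                               size_t num_lanes, bool* active) {
  for (size_t j = 0; j < num_lanes; ++j) {
    active[j] = ((mask_bits >> (j % lanes_per_block)) & 1u) != 0;
  }
}
```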
@@ -7396,116 +8560,26 @@ HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
 }
 #endif  // HWY_TARGET <= HWY_AVX3
 
-// ------------------------------ Reductions
-
-namespace detail {
-
-// These functions start with each lane per 128-bit block being reduced with the
-// corresponding lane in the other block, so we use the same logic as x86_128
-// but running on both blocks at the same time. There are two (64-bit) to eight
-// (16-bit) lanes per block.
-template <typename T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v10) {
-  const DFromV<decltype(v10)> d;
-  return Add(v10, Reverse2(d, v10));
-}
-template <typename T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v10) {
-  const DFromV<decltype(v10)> d;
-  return Min(v10, Reverse2(d, v10));
-}
-template <typename T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v10) {
-  const DFromV<decltype(v10)> d;
-  return Max(v10, Reverse2(d, v10));
-}
-
-template <typename T, HWY_IF_T_SIZE(T, 4)>
-HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v3210) {
-  using V = decltype(v3210);
-  const DFromV<V> d;
-  const V v0123 = Reverse4(d, v3210);
-  const V v03_12_12_03 = Add(v3210, v0123);
-  const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-  return Add(v03_12_12_03, v12_03_03_12);
-}
-template <typename T, HWY_IF_T_SIZE(T, 4)>
-HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v3210) {
-  using V = decltype(v3210);
-  const DFromV<V> d;
-  const V v0123 = Reverse4(d, v3210);
-  const V v03_12_12_03 = Min(v3210, v0123);
-  const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-  return Min(v03_12_12_03, v12_03_03_12);
-}
-template <typename T, HWY_IF_T_SIZE(T, 4)>
-HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v3210) {
-  using V = decltype(v3210);
-  const DFromV<V> d;
-  const V v0123 = Reverse4(d, v3210);
-  const V v03_12_12_03 = Max(v3210, v0123);
-  const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-  return Max(v03_12_12_03, v12_03_03_12);
-}
+// ------------------------------ Reductions in generic_ops
 
-
-
-
-
-
-  const
-  const
-
-}
-template <typename T, HWY_IF_T_SIZE(T, 2)>
-HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v76543210) {
-  using V = decltype(v76543210);
-  const DFromV<V> d;
-  // The upper half is reversed from the lower half; omit for brevity.
-  const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210));
-  const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07));
-  return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
-}
-template <typename T, HWY_IF_T_SIZE(T, 2)>
-HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v76543210) {
-  using V = decltype(v76543210);
-  const DFromV<V> d;
-  // The upper half is reversed from the lower half; omit for brevity.
-  const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210));
-  const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07));
-  return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
-}
+// ------------------------------ BitShuffle
+#if HWY_TARGET <= HWY_AVX3_DL
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          HWY_IF_V_SIZE_V(V, 32), HWY_IF_V_SIZE_V(VI, 32)>
+HWY_API V BitShuffle(V v, VI idx) {
+  const DFromV<decltype(v)> d64;
+  const RebindToUnsigned<decltype(d64)> du64;
+  const Rebind<uint8_t, decltype(d64)> du8;
 
-
+  int32_t i32_bit_shuf_result =
+      static_cast<int32_t>(_mm256_bitshuffle_epi64_mask(v.raw, idx.raw));
 
-
-
-HWY_API VFromD<D> SumOfLanes(D /*d*/, VFromD<D> vHL) {
-  const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
-  return detail::SumOfLanes(Add(vLH, vHL));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
-  return GetLane(SumOfLanes(d, v));
-}
-#if HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API float16_t ReduceSum(D, VFromD<D> v) {
-  return _mm256_reduce_add_ph(v.raw);
-}
-#endif  // HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> MinOfLanes(D /*d*/, VFromD<D> vHL) {
-  const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
-  return detail::MinOfLanes(Min(vLH, vHL));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> MaxOfLanes(D /*d*/, VFromD<D> vHL) {
-  const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
-  return detail::MaxOfLanes(Max(vLH, vHL));
+  return BitCast(d64, PromoteTo(du64, VFromD<decltype(du8)>{_mm_cvtsi32_si128(
+                          i32_bit_shuf_result)}));
 }
+#endif  // HWY_TARGET <= HWY_AVX3_DL
 
-//
+// ------------------------------ LeadingZeroCount
 
 #if HWY_TARGET <= HWY_AVX3
 template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>