@img/sharp-libvips-dev 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/aom/aom_encoder.h +3 -3
- package/include/aom/aomcx.h +17 -8
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/fontconfig/fontconfig.h +5 -3
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
- package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
- package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
- package/include/glib-2.0/gio/gappinfo.h +0 -7
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
- package/include/glib-2.0/gio/gasyncinitable.h +0 -7
- package/include/glib-2.0/gio/gasyncresult.h +0 -6
- package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
- package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
- package/include/glib-2.0/gio/gbytesicon.h +0 -5
- package/include/glib-2.0/gio/gcancellable.h +0 -5
- package/include/glib-2.0/gio/gconverter.h +0 -7
- package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
- package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
- package/include/glib-2.0/gio/gdatagrambased.h +0 -7
- package/include/glib-2.0/gio/gdatainputstream.h +0 -6
- package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
- package/include/glib-2.0/gio/gdbusinterface.h +0 -8
- package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusmessage.h +2 -1
- package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusproxy.h +0 -8
- package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
- package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
- package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gemblem.h +0 -5
- package/include/glib-2.0/gio/gemblemedicon.h +0 -5
- package/include/glib-2.0/gio/gfile.h +0 -10
- package/include/glib-2.0/gio/gfileenumerator.h +0 -5
- package/include/glib-2.0/gio/gfileicon.h +0 -5
- package/include/glib-2.0/gio/gfileinfo.h +0 -5
- package/include/glib-2.0/gio/gfileinputstream.h +0 -8
- package/include/glib-2.0/gio/gfileiostream.h +0 -8
- package/include/glib-2.0/gio/gfilemonitor.h +0 -5
- package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
- package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
- package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
- package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
- package/include/glib-2.0/gio/gicon.h +0 -5
- package/include/glib-2.0/gio/ginitable.h +0 -7
- package/include/glib-2.0/gio/ginputstream.h +0 -5
- package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gioenums.h +6 -1
- package/include/glib-2.0/gio/giomodule.h +0 -5
- package/include/glib-2.0/gio/giostream.h +0 -5
- package/include/glib-2.0/gio/giotypes.h +5 -108
- package/include/glib-2.0/gio/gloadableicon.h +0 -6
- package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
- package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
- package/include/glib-2.0/gio/gmountoperation.h +0 -6
- package/include/glib-2.0/gio/gnetworking.h +4 -0
- package/include/glib-2.0/gio/goutputstream.h +0 -9
- package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
- package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
- package/include/glib-2.0/gio/gproxy.h +0 -7
- package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
- package/include/glib-2.0/gio/gseekable.h +0 -5
- package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
- package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
- package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
- package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
- package/include/glib-2.0/gio/gsocket.h +13 -0
- package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
- package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
- package/include/glib-2.0/gio/gtask.h +12 -0
- package/include/glib-2.0/gio/gthemedicon.h +0 -5
- package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
- package/include/glib-2.0/gio/gvfs.h +0 -5
- package/include/glib-2.0/gio/gvolume.h +2 -2
- package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
- package/include/glib-2.0/girepository/gi-visibility.h +986 -0
- package/include/glib-2.0/girepository/giarginfo.h +100 -0
- package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
- package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
- package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
- package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
- package/include/glib-2.0/girepository/gienuminfo.h +82 -0
- package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
- package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
- package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
- package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +247 -0
- package/include/glib-2.0/girepository/girffi.h +129 -0
- package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
- package/include/glib-2.0/girepository/gistructinfo.h +102 -0
- package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
- package/include/glib-2.0/girepository/gitypelib.h +61 -0
- package/include/glib-2.0/girepository/gitypes.h +421 -0
- package/include/glib-2.0/girepository/giunioninfo.h +105 -0
- package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
- package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
- package/include/glib-2.0/glib/deprecated/grel.h +0 -23
- package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
- package/include/glib-2.0/glib/gatomic.h +20 -20
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
- package/include/glib-2.0/glib/gchecksum.h +0 -10
- package/include/glib-2.0/glib/gdate.h +0 -9
- package/include/glib-2.0/glib/gdatetime.h +33 -1
- package/include/glib-2.0/glib/gdir.h +5 -0
- package/include/glib-2.0/glib/ghmac.h +0 -9
- package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +1 -0
- package/include/glib-2.0/glib/gmessages.h +11 -0
- package/include/glib-2.0/glib/gpathbuf.h +0 -7
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstdio.h +1 -1
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
- package/include/glib-2.0/glib/gtestutils.h +5 -0
- package/include/glib-2.0/glib/gthread.h +216 -3
- package/include/glib-2.0/glib/gunicode.h +12 -2
- package/include/glib-2.0/glib/gvarianttype.h +1 -10
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib/gwin32.h +4 -4
- package/include/glib-2.0/glib-unix.h +214 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gbinding.h +0 -8
- package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
- package/include/glib-2.0/gobject/gclosure.h +1 -9
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +44 -0
- package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject.h +1 -16
- package/include/glib-2.0/gobject/gparam.h +3 -12
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
- package/include/glib-2.0/gobject/gtype.h +53 -20
- package/include/glib-2.0/gobject/gtypemodule.h +0 -7
- package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
- package/include/glib-2.0/gobject/gvaluearray.h +0 -7
- package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
- package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/lcms2.h +46 -7
- package/include/lcms2_plugin.h +4 -4
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/HTMLparser.h +23 -0
- package/include/libxml2/libxml/SAX.h +0 -2
- package/include/libxml2/libxml/SAX2.h +0 -2
- package/include/libxml2/libxml/c14n.h +0 -2
- package/include/libxml2/libxml/dict.h +1 -0
- package/include/libxml2/libxml/encoding.h +16 -14
- package/include/libxml2/libxml/entities.h +4 -0
- package/include/libxml2/libxml/globals.h +15 -503
- package/include/libxml2/libxml/hash.h +57 -61
- package/include/libxml2/libxml/nanoftp.h +2 -2
- package/include/libxml2/libxml/parser.h +137 -18
- package/include/libxml2/libxml/parserInternals.h +1 -0
- package/include/libxml2/libxml/relaxng.h +2 -1
- package/include/libxml2/libxml/schemasInternals.h +1 -0
- package/include/libxml2/libxml/schematron.h +1 -0
- package/include/libxml2/libxml/threads.h +4 -11
- package/include/libxml2/libxml/tree.h +68 -20
- package/include/libxml2/libxml/uri.h +2 -1
- package/include/libxml2/libxml/valid.h +2 -0
- package/include/libxml2/libxml/xmlIO.h +65 -13
- package/include/libxml2/libxml/xmlerror.h +37 -8
- package/include/libxml2/libxml/xmlmemory.h +37 -40
- package/include/libxml2/libxml/xmlreader.h +6 -0
- package/include/libxml2/libxml/xmlregexp.h +2 -9
- package/include/libxml2/libxml/xmlsave.h +9 -0
- package/include/libxml2/libxml/xmlschemas.h +3 -0
- package/include/libxml2/libxml/xmlversion.h +28 -43
- package/include/libxml2/libxml/xpath.h +1 -1
- package/include/libxml2/libxml/xpathInternals.h +2 -1
- package/include/libxml2/libxml/xpointer.h +5 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +3 -3
- package/include/pixman-1/pixman.h +9 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/include/zconf.h +3 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +15 -15
--- package/include/hwy/ops/x86_256-inl.h (1.0.0)
+++ package/include/hwy/ops/x86_256-inl.h (1.0.2)
@@ -101,6 +101,9 @@ class Vec256 {
   HWY_INLINE Vec256& operator-=(const Vec256 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec256& operator%=(const Vec256 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec256& operator&=(const Vec256 other) {
     return *this = (*this & other);
  }
@@ -359,6 +362,85 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
       ResizeBitCast(Full128<uint8_t>(), v).raw)});
 }
 
+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
+  return VFromD<D>{_mm256_setr_epi8(
+      static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
+      static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
+      static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
+      static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
+      static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
+      static_cast<char>(t15), static_cast<char>(t0), static_cast<char>(t1),
+      static_cast<char>(t2), static_cast<char>(t3), static_cast<char>(t4),
+      static_cast<char>(t5), static_cast<char>(t6), static_cast<char>(t7),
+      static_cast<char>(t8), static_cast<char>(t9), static_cast<char>(t10),
+      static_cast<char>(t11), static_cast<char>(t12), static_cast<char>(t13),
+      static_cast<char>(t14), static_cast<char>(t15))};
+}
+
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return VFromD<D>{
+      _mm256_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+                        static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+                        static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+                        static_cast<int16_t>(t6), static_cast<int16_t>(t7),
+                        static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+                        static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+                        static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+                        static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
+}
+
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  return VFromD<D>{_mm256_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
+                                  t3, t4, t5, t6, t7)};
+}
+#endif
+
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{
+      _mm256_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+                        static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+                        static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{_mm256_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3)};
+}
+
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{
+      _mm256_setr_epi64x(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+                         static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
+}
+
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{_mm256_setr_pd(t0, t1, t0, t1)};
+}
+
 // ================================================== LOGICAL
 
 // ------------------------------ And
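The new `Dup128VecFromValues` overloads fill one 128-bit block from individual lane values and duplicate it into the upper 128-bit half. A minimal usage sketch follows; the op and the tag types are from the diff above, while the function name and values are illustrative assumptions:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Builds {1, 2, 3, 4} in every 128-bit block of the vector; on the 256-bit
// path added above this lowers to one _mm256_setr_epi32 with repeated values.
hn::VFromD<hn::ScalableTag<int32_t>> RepeatedPattern() {
  const hn::ScalableTag<int32_t> d;
  return hn::Dup128VecFromValues(d, 1, 2, 3, 4);
}
```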
@@ -367,7 +449,8 @@ template <typename T>
 HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_and_si256(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_and_si256(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec256<float> And(Vec256<float> a, Vec256<float> b) {
@@ -384,8 +467,8 @@ template <typename T>
 HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
   const DFromV<decltype(mask)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(
-      d, VFromD<decltype(du)>{_mm256_andnot_si256(not_mask.raw, mask.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_andnot_si256(
+                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
 }
 HWY_API Vec256<float> AndNot(Vec256<float> not_mask, Vec256<float> mask) {
   return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
@@ -400,7 +483,8 @@ template <typename T>
 HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_or_si256(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_or_si256(BitCast(du, a).raw,
+                                                         BitCast(du, b).raw)});
 }
 
 HWY_API Vec256<float> Or(Vec256<float> a, Vec256<float> b) {
@@ -416,7 +500,8 @@ template <typename T>
 HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{_mm256_xor_si256(a.raw, b.raw)});
+  return BitCast(d, VFromD<decltype(du)>{_mm256_xor_si256(BitCast(du, a).raw,
+                                                          BitCast(du, b).raw)});
 }
 
 HWY_API Vec256<float> Xor(Vec256<float> a, Vec256<float> b) {
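The And/AndNot/Or/Xor changes all follow one pattern: both operands are now routed through `BitCast(du, ...)` before the `_mm256_*_si256` intrinsic. This matters once `T` can be `float16_t`, whose raw register type is not `__m256i`; casting to the unsigned rebind first keeps the generic template well-formed for every lane type.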
@@ -589,7 +674,7 @@ HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
 
 }  // namespace detail
 
-template <typename T>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
   return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
 }
@@ -634,7 +719,7 @@ HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
 
 }  // namespace detail
 
-template <typename T>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
   return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
 }
@@ -672,7 +757,7 @@ HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
 
 }  // namespace detail
 
-template <typename T>
+template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
   return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
 }
@@ -879,6 +964,58 @@ HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
 #endif
 }
 
+// UnmaskedNot returns ~m.raw without zeroing out any invalid bits
+template <typename T, HWY_IF_T_SIZE(T, 1)>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask256<T>{static_cast<__mmask32>(_knot_mask32(m.raw))};
+#else
+  return Mask256<T>{static_cast<__mmask32>(~m.raw)};
+#endif
+}
+
+template <typename T, HWY_IF_T_SIZE(T, 2)>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask256<T>{static_cast<__mmask16>(_knot_mask16(m.raw))};
+#else
+  return Mask256<T>{static_cast<__mmask16>(~m.raw)};
+#endif
+}
+
+template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
+HWY_INLINE Mask256<T> UnmaskedNot(const Mask256<T> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask256<T>{static_cast<__mmask8>(_knot_mask8(m.raw))};
+#else
+  return Mask256<T>{static_cast<__mmask8>(~m.raw)};
+#endif
+}
+
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<1> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 1: simply return ~m as all 32 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<2> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 2: simply return ~m as all 16 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<4> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 4: simply return ~m as all 8 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T>
+HWY_INLINE Mask256<T> Not(hwy::SizeTag<8> /*tag*/, const Mask256<T> m) {
+  // sizeof(T) == 8: need to zero out the upper 4 bits of ~m as only the lower
+  // 4 bits of m are valid
+
+  // Return (~m) & 0x0F
+  return AndNot(hwy::SizeTag<8>(), m, Mask256<T>::FromBits(uint64_t{0x0F}));
+}
+
 }  // namespace detail
 
 template <typename T>
@@ -904,8 +1041,7 @@ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
 template <typename T>
 HWY_API Mask256<T> Not(const Mask256<T> m) {
   // Flip only the valid bits.
-  constexpr size_t N = 32 / sizeof(T);
-  return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
+  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
 }
 
 template <typename T>
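The rewritten mask `Not` replaces the single `Xor` against `(1ull << N) - 1` with a size-tag dispatch: for 1-, 2-, and 4-byte lanes every bit of the `__mmask` register is valid, so a plain complement (or `_knot_*` intrinsic) suffices, while 8-byte lanes occupy only the low 4 bits of an `__mmask8`, so the complement is additionally masked with `0x0F`.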
@@ -913,6 +1049,31 @@ HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
   return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
 }
 
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+                               MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask32 combined_mask = _mm512_kunpackw(
+      static_cast<__mmask32>(hi.raw), static_cast<__mmask32>(lo.raw));
+#else
+  const auto combined_mask =
+      ((static_cast<uint32_t>(hi.raw) << 16) | (lo.raw & 0xFFFFu));
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 16)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask32(static_cast<__mmask32>(m.raw), 16);
+#else
+  const auto shifted_mask = static_cast<uint32_t>(m.raw) >> 16;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
 #else  // AVX2
 
 // ------------------------------ Mask
@@ -1072,7 +1233,11 @@ HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator==(Vec256<float16_t> a,
                                       Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
@@ -1105,7 +1270,11 @@ HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator!=(Vec256<float16_t> a,
                                       Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
@@ -1146,7 +1315,11 @@ HWY_API Mask256<uint64_t> operator>(Vec256<uint64_t> a, Vec256<uint64_t> b) {
 
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator>(Vec256<float16_t> a, Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
@@ -1161,7 +1334,11 @@ HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
 #if HWY_HAVE_FLOAT16
 HWY_API Mask256<float16_t> operator>=(Vec256<float16_t> a,
                                       Vec256<float16_t> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask256<float16_t>{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 
@@ -1617,7 +1794,7 @@ HWY_INLINE VFromD<D> Iota0(D /*d*/) {
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
 HWY_API VFromD<D> Iota(D d, const T2 first) {
-  return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
+  return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
 }
 
 // ------------------------------ FirstN (Iota, Lt)
@@ -1732,6 +1909,15 @@ HWY_API Vec256<double> operator-(Vec256<double> a, Vec256<double> b) {
   return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
 }
 
+// ------------------------------ AddSub
+
+HWY_API Vec256<float> AddSub(Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_addsub_ps(a.raw, b.raw)};
+}
+HWY_API Vec256<double> AddSub(Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_addsub_pd(a.raw, b.raw)};
+}
+
 // ------------------------------ SumsOf8
 HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
   return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
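`AddSub` exposes `_mm256_addsub_ps/pd`, which subtracts in even lanes and adds in odd lanes. A small sketch of the semantics; the example values and function name are mine, not part of the diff:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void AddSubDemo() {
  const hn::ScalableTag<float> d;
  const auto a = hn::Set(d, 8.0f);
  const auto b = hn::Set(d, 1.0f);
  // Even lanes: a - b = 7; odd lanes: a + b = 9.
  const auto r = hn::AddSub(a, b);
  (void)r;
}
```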
@@ -1741,6 +1927,56 @@ HWY_API Vec256<uint64_t> SumsOf8AbsDiff(Vec256<uint8_t> a, Vec256<uint8_t> b) {
   return Vec256<uint64_t>{_mm256_sad_epu8(a.raw, b.raw)};
 }
 
+// ------------------------------ SumsOf4
+#if HWY_TARGET <= HWY_AVX3
+namespace detail {
+
+HWY_INLINE Vec256<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
+                                    hwy::SizeTag<1> /*lane_size_tag*/,
+                                    Vec256<uint8_t> v) {
+  const DFromV<decltype(v)> d;
+
+  // _mm256_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
+  // zeroed out and the sums of the 4 consecutive lanes are already in the
+  // even uint16_t lanes of the _mm256_maskz_dbsad_epu8 result.
+  return Vec256<uint32_t>{_mm256_maskz_dbsad_epu8(
+      static_cast<__mmask16>(0x5555), v.raw, Zero(d).raw, 0)};
+}
+
+// detail::SumsOf4 for Vec256<int8_t> on AVX3 is implemented in x86_512-inl.h
+
+}  // namespace detail
+#endif  // HWY_TARGET <= HWY_AVX3
+
+// ------------------------------ SumsOfAdjQuadAbsDiff
+
+template <int kAOffset, int kBOffset>
+static Vec256<uint16_t> SumsOfAdjQuadAbsDiff(Vec256<uint8_t> a,
+                                             Vec256<uint8_t> b) {
+  static_assert(0 <= kAOffset && kAOffset <= 1,
+                "kAOffset must be between 0 and 1");
+  static_assert(0 <= kBOffset && kBOffset <= 3,
+                "kBOffset must be between 0 and 3");
+  return Vec256<uint16_t>{_mm256_mpsadbw_epu8(
+      a.raw, b.raw,
+      (kAOffset << 5) | (kBOffset << 3) | (kAOffset << 2) | kBOffset)};
+}
+
+// ------------------------------ SumsOfShuffledQuadAbsDiff
+
+#if HWY_TARGET <= HWY_AVX3
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
+static Vec256<uint16_t> SumsOfShuffledQuadAbsDiff(Vec256<uint8_t> a,
+                                                  Vec256<uint8_t> b) {
+  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+  return Vec256<uint16_t>{
+      _mm256_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
+}
+#endif
+
 // ------------------------------ SaturatedAdd
 
 // Returns a + b clamped to the destination range.
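In `SumsOfAdjQuadAbsDiff`, the immediate passed to `_mm256_mpsadbw_epu8` packs the same block offsets twice, once per 128-bit half: bits 0-1 and bit 2 control the low half (`kBOffset` selects the 32-bit reference block of `b`, `kAOffset` the starting window in `a`), and bits 3-4 and bit 5 repeat them for the high half, hence the `(kAOffset << 5) | (kBOffset << 3) | (kAOffset << 2) | kBOffset` expression.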
@@ -1860,15 +2096,12 @@ HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
 HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
   return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
 }
-// i64 is implemented after BroadcastSignBit.
 
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec256<T> Abs(const Vec256<T> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  using TI = TFromD<decltype(di)>;
-  return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
+#if HWY_TARGET <= HWY_AVX3
+HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
+  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
 }
+#endif
 
 // ------------------------------ Integer multiplication
 
@@ -2086,16 +2319,6 @@ HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
 #endif
 }
 
-HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
-#else
-  const DFromV<decltype(v)> d;
-  const auto zero = Zero(d);
-  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
-#endif
-}
-
 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
 HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
                                           Vec256<int8_t> no) {
@@ -2136,6 +2359,23 @@ HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
 #endif
 }
 
+// ------------------------------ IfNegativeThenNegOrUndefIfZero
+
+HWY_API Vec256<int8_t> IfNegativeThenNegOrUndefIfZero(Vec256<int8_t> mask,
+                                                      Vec256<int8_t> v) {
+  return Vec256<int8_t>{_mm256_sign_epi8(v.raw, mask.raw)};
+}
+
+HWY_API Vec256<int16_t> IfNegativeThenNegOrUndefIfZero(Vec256<int16_t> mask,
+                                                       Vec256<int16_t> v) {
+  return Vec256<int16_t>{_mm256_sign_epi16(v.raw, mask.raw)};
+}
+
+HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
+                                                       Vec256<int32_t> v) {
+  return Vec256<int32_t>{_mm256_sign_epi32(v.raw, mask.raw)};
+}
+
 // ------------------------------ ShiftLeftSame
 
 HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
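`IfNegativeThenNegOrUndefIfZero` maps directly onto `_mm256_sign_epi8/16/32`: lanes of `v` are negated where the corresponding `mask` lane is negative and passed through where it is positive. Where the mask lane is zero the x86 instruction happens to produce zero, but the op's name reserves the right to return anything there, which is what lets a single instruction implement it.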
@@ -2359,6 +2599,326 @@ HWY_API Vec256<double> ApproximateReciprocal(Vec256<double> v) {
 }
 #endif
 
+// ------------------------------ MaskedMinOr
+
+#if HWY_TARGET <= HWY_AVX3
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedMinOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMaxOr
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U32(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I32(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U64(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, HWY_IF_I64(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedMaxOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedAddOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSubOr
+
+template <typename T, HWY_IF_UI8(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI16(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F32(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_F64(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, HWY_IF_F16(T)>
+HWY_API Vec256<T> MaskedSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                              Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMulOr
+
+HWY_API Vec256<float> MaskedMulOr(Vec256<float> no, Mask256<float> m,
+                                  Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec256<double> MaskedMulOr(Vec256<double> no, Mask256<double> m,
+                                   Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MaskedMulOr(Vec256<float16_t> no,
+                                      Mask256<float16_t> m, Vec256<float16_t> a,
+                                      Vec256<float16_t> b) {
+  return Vec256<float16_t>{_mm256_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedDivOr
+
+HWY_API Vec256<float> MaskedDivOr(Vec256<float> no, Mask256<float> m,
+                                  Vec256<float> a, Vec256<float> b) {
+  return Vec256<float>{_mm256_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+HWY_API Vec256<double> MaskedDivOr(Vec256<double> no, Mask256<double> m,
+                                   Vec256<double> a, Vec256<double> b) {
+  return Vec256<double>{_mm256_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MaskedDivOr(Vec256<float16_t> no,
+                                      Mask256<float16_t> m, Vec256<float16_t> a,
+                                      Vec256<float16_t> b) {
+  return Vec256<float16_t>{_mm256_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSatAddOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedSatAddOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+// ------------------------------ MaskedSatSubOr
+
+template <typename T, HWY_IF_I8(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U8(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_I16(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, HWY_IF_U16(T)>
+HWY_API Vec256<T> MaskedSatSubOr(Vec256<T> no, Mask256<T> m, Vec256<T> a,
+                                 Vec256<T> b) {
+  return Vec256<T>{_mm256_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#endif  // HWY_TARGET <= HWY_AVX3
+
 // ------------------------------ Floating-point multiply-add variants
 
 #if HWY_HAVE_FLOAT16
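These `Masked*Or` ops wrap the AVX-512 masked intrinsics: lanes selected by `m` receive the arithmetic result, all other lanes receive `no`. A usage sketch, under the assumption of an AVX3 target; the function name and sentinel value are illustrative:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Lanes 0..3 get Min(a, b); the remaining lanes keep the sentinel 0xFF.
hn::VFromD<hn::ScalableTag<uint8_t>> MaskedMinDemo(
    hn::VFromD<hn::ScalableTag<uint8_t>> a,
    hn::VFromD<hn::ScalableTag<uint8_t>> b) {
  const hn::ScalableTag<uint8_t> d;
  const auto sentinel = hn::Set(d, uint8_t{0xFF});
  const auto m = hn::FirstN(d, 4);  // mask covering the first four lanes
  return hn::MaskedMinOr(sentinel, m, a, b);
}
```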
@@ -2453,6 +3013,31 @@ HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x,
 #endif
 }
 
+#if HWY_HAVE_FLOAT16
+HWY_API Vec256<float16_t> MulAddSub(Vec256<float16_t> mul, Vec256<float16_t> x,
+                                    Vec256<float16_t> sub_or_add) {
+  return Vec256<float16_t>{_mm256_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+HWY_API Vec256<float> MulAddSub(Vec256<float> mul, Vec256<float> x,
+                                Vec256<float> sub_or_add) {
+#ifdef HWY_DISABLE_BMI2_FMA
+  return AddSub(mul * x, sub_or_add);
+#else
+  return Vec256<float>{_mm256_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
+#endif
+}
+
+HWY_API Vec256<double> MulAddSub(Vec256<double> mul, Vec256<double> x,
+                                 Vec256<double> sub_or_add) {
+#ifdef HWY_DISABLE_BMI2_FMA
+  return AddSub(mul * x, sub_or_add);
+#else
+  return Vec256<double>{_mm256_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
+#endif
+}
+
 // ------------------------------ Floating-point square root
 
 // Full precision square root
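`MulAddSub(mul, x, sub_or_add)` computes `mul * x - sub_or_add` in even lanes and `mul * x + sub_or_add` in odd lanes. With FMA available it is a single `_mm256_fmaddsub_*`; the `HWY_DISABLE_BMI2_FMA` fallback composes the same result from a separate multiply and the `AddSub` op introduced earlier in this diff.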
@@ -2621,35 +3206,6 @@ HWY_API Mask256<double> IsFinite(Vec256<double> v) {
                                  HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
 }
 
-#else
-
-template <typename T>
-HWY_API Mask256<T> IsInf(const Vec256<T> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T>
-HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // Shift left to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater). MSVC seems to generate
-  // incorrect code if we instead add vu + vu.
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
 #endif  // HWY_TARGET <= HWY_AVX3
 
 // ================================================== MEMORY
@@ -2662,16 +3218,13 @@ HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
       _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
 }
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API Vec256<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API Vec256<float16_t> Load(D /* tag */,
+                               const float16_t* HWY_RESTRICT aligned) {
   return Vec256<float16_t>{_mm256_load_ph(aligned)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
   return Vec256<float>{_mm256_load_ps(aligned)};
@@ -2686,16 +3239,12 @@ HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
   return VFromD<D>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
 }
 // bfloat16_t is handled by x86_128-inl.h.
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API Vec256<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API Vec256<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
   return Vec256<float16_t>{_mm256_loadu_ph(p)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
   return Vec256<float>{_mm256_loadu_ps(p)};
@@ -2756,8 +3305,8 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                                const TFromD<D>* HWY_RESTRICT p) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  return BitCast(
-      d, VFromD<decltype(du)>{_mm256_mask_loadu_epi16(v.raw, m.raw, p)});
+  return BitCast(d, VFromD<decltype(du)>{
+                        _mm256_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
@@ -2831,22 +3380,24 @@ HWY_API Vec256<double> MaskedLoad(Mask256<double> m, D d,
 // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> LoadDup128(D
+HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
   const Full128<TFromD<D>> d128;
+  const RebindToUnsigned<decltype(d128)> du128;
+  const __m128i v128 = BitCast(du128, LoadU(d128, p)).raw;
 #if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
   // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
   // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
   // upper half undefined) is fine because we're overwriting that anyway.
   // This workaround seems in turn to generate incorrect code in MSVC 2022
   // (19.31), so use broadcastsi128 there.
-
-
-      _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
+  return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+                        _mm256_castsi128_si256(v128), v128, 1)});
 #else
   // The preferred path. This is perhaps surprising, because vbroadcasti128
   // with xmm input has 7 cycle latency on Intel, but Clang >= 7 is able to
   // pattern-match this to vbroadcastf128 with a memory operand as desired.
-  return VFromD<
+  return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastsi128_si256(v128)});
 #endif
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
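
A minimal usage sketch of the LoadDup128 op changed above, written against Highway's public API (the helper Lookup and table kLut are hypothetical, not part of this package):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;
    using D8 = hn::ScalableTag<uint8_t>;

    alignas(16) static constexpr uint8_t kLut[16] = {0, 1, 2,  3,  4,  5,  6,  7,
                                                     8, 9, 10, 11, 12, 13, 14, 15};

    hn::VFromD<D8> Lookup(hn::VFromD<D8> idx) {
      const D8 d;
      // LoadDup128 broadcasts the same 16-byte table into every 128-bit block,
      // which is exactly the layout TableLookupBytes expects.
      return hn::TableLookupBytes(hn::LoadDup128(d, kLut), idx);
    }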
@@ -2879,16 +3430,13 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
   _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
 }
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API void Store(Vec256<float16_t> v, D d, float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API void Store(Vec256<float16_t> v, D /* tag */,
+                   float16_t* HWY_RESTRICT aligned) {
   _mm256_store_ph(aligned, v.raw);
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API void Store(Vec256<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
   _mm256_store_ps(aligned, v.raw);
@@ -2903,16 +3451,13 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
   _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
 }
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
-HWY_API void StoreU(Vec256<float16_t> v, D d, float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+HWY_API void StoreU(Vec256<float16_t> v, D /* tag */,
+                    float16_t* HWY_RESTRICT p) {
   _mm256_storeu_ph(p, v.raw);
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API void StoreU(Vec256<float> v, D /* tag */, float* HWY_RESTRICT p) {
   _mm256_storeu_ps(p, v.raw);
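
A minimal sketch of the Load/Store family whose float16 overloads are regrouped above, using Highway's public API (AddOne is a hypothetical helper; the scalar tail loop is omitted):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    void AddOne(const float* HWY_RESTRICT in, float* HWY_RESTRICT out, size_t n) {
      const hn::ScalableTag<float> d;
      const auto k1 = hn::Set(d, 1.0f);
      // LoadU/StoreU accept unaligned pointers; Load/Store require alignment.
      for (size_t i = 0; i + hn::Lanes(d) <= n; i += hn::Lanes(d)) {
        hn::StoreU(hn::Add(hn::LoadU(d, in + i), k1), d, out + i);
      }
    }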
@@ -3140,118 +3685,133 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,

 // ------------------------------ Gather

-
-HWY_INLINE VFromD<D> GatherOffset(D /* tag */,
-                                  const TFromD<D>* HWY_RESTRICT base,
-                                  Vec256<int32_t> offset) {
-  return VFromD<D>{_mm256_i32gather_epi32(
-      reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
-HWY_INLINE VFromD<D> GatherIndex(D /* tag */,
-                                 const TFromD<D>* HWY_RESTRICT base,
-                                 Vec256<int32_t> index) {
-  return VFromD<D>{_mm256_i32gather_epi32(
-      reinterpret_cast<const int32_t*>(base), index.raw, 4)};
-}
+namespace detail {

-template <
-HWY_INLINE
-
-
-
-      reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
+template <int kScale, typename T, HWY_IF_UI32(T)>
+HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
+                                     Vec256<int32_t> indices) {
+  return Vec256<T>{_mm256_i32gather_epi32(
+      reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
 }
-
-
-
-
-  return
-      reinterpret_cast<const GatherIndex64*>(base),
+
+template <int kScale, typename T, HWY_IF_UI64(T)>
+HWY_INLINE Vec256<T> NativeGather256(const T* HWY_RESTRICT base,
+                                     Vec256<int64_t> indices) {
+  return Vec256<T>{_mm256_i64gather_epi64(
+      reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
 }

-template <
-HWY_API Vec256<float>
-
-  return Vec256<float>{_mm256_i32gather_ps(base,
+template <int kScale>
+HWY_API Vec256<float> NativeGather256(const float* HWY_RESTRICT base,
+                                      Vec256<int32_t> indices) {
+  return Vec256<float>{_mm256_i32gather_ps(base, indices.raw, kScale)};
 }
-
-
-
-
+
+template <int kScale>
+HWY_API Vec256<double> NativeGather256(const double* HWY_RESTRICT base,
+                                       Vec256<int64_t> indices) {
+  return Vec256<double>{_mm256_i64gather_pd(base, indices.raw, kScale)};
 }
-
-
-
-
-
+
+}  // namespace detail
+
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+                               VFromD<RebindToSigned<D>> offsets) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
+  return detail::NativeGather256<1>(base, offsets);
 }
-
-
-
-
+
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+                              VFromD<RebindToSigned<D>> indices) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+  return detail::NativeGather256<sizeof(TFromD<D>)>(base, indices);
 }

-// ------------------------------
+// ------------------------------ MaskedGatherIndexOr

-
-
-
-
+namespace detail {
+
+template <int kScale, typename T, HWY_IF_UI32(T)>
+HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
+                                             const T* HWY_RESTRICT base,
+                                             Vec256<int32_t> indices) {
 #if HWY_TARGET <= HWY_AVX3
-  return
-
-
+  return Vec256<T>{_mm256_mmask_i32gather_epi32(
+      no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
+      kScale)};
 #else
-  return
-
-
+  return Vec256<T>{_mm256_mask_i32gather_epi32(
+      no.raw, reinterpret_cast<const int32_t*>(base), indices.raw, m.raw,
+      kScale)};
 #endif
 }

-template <
-HWY_INLINE
-
-
+template <int kScale, typename T, HWY_IF_UI64(T)>
+HWY_INLINE Vec256<T> NativeMaskedGatherOr256(Vec256<T> no, Mask256<T> m,
+                                             const T* HWY_RESTRICT base,
+                                             Vec256<int64_t> indices) {
 #if HWY_TARGET <= HWY_AVX3
-  return
-
-
+  return Vec256<T>{_mm256_mmask_i64gather_epi64(
+      no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
+      kScale)};
 #else
   // For reasons unknown, _mm256_mask_i64gather_epi64 returns all-zeros.
-  const
-
-
-
+  const Full256<T> d;
+  const Full256<double> dd;
+  return BitCast(d,
+                 Vec256<double>{_mm256_mask_i64gather_pd(
+                     BitCast(dd, no).raw, reinterpret_cast<const double*>(base),
+                     indices.raw, RebindMask(dd, m).raw, kScale)});
 #endif
 }

-template <
-HWY_API Vec256<float>
-
-
+template <int kScale>
+HWY_API Vec256<float> NativeMaskedGatherOr256(Vec256<float> no,
+                                              Mask256<float> m,
+                                              const float* HWY_RESTRICT base,
+                                              Vec256<int32_t> indices) {
 #if HWY_TARGET <= HWY_AVX3
   return Vec256<float>{
-      _mm256_mmask_i32gather_ps(
+      _mm256_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
 #else
   return Vec256<float>{
-      _mm256_mask_i32gather_ps(
+      _mm256_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
 #endif
 }

-template <
-HWY_API Vec256<double>
-
-
+template <int kScale>
+HWY_API Vec256<double> NativeMaskedGatherOr256(Vec256<double> no,
+                                               Mask256<double> m,
+                                               const double* HWY_RESTRICT base,
+                                               Vec256<int64_t> indices) {
 #if HWY_TARGET <= HWY_AVX3
   return Vec256<double>{
-      _mm256_mmask_i64gather_pd(
+      _mm256_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
 #else
   return Vec256<double>{
-      _mm256_mask_i64gather_pd(
+      _mm256_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
 #endif
 }

+}  // namespace detail
+
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
+                                      const TFromD<D>* HWY_RESTRICT base,
+                                      VFromD<RebindToSigned<D>> indices) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+  return detail::NativeMaskedGatherOr256<sizeof(TFromD<D>)>(no, m, base,
+                                                            indices);
+}
+
 HWY_DIAGNOSTICS(pop)

 // ================================================== SWIZZLE
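
A sketch of calling the GatherIndex wrapper defined in this hunk (SumEveryOther is hypothetical; base must hold at least 2 * Lanes(d) floats, and indices must be non-negative, as the HWY_DASSERT above enforces):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    float SumEveryOther(const float* HWY_RESTRICT base) {
      const hn::ScalableTag<float> d;
      const hn::RebindToSigned<decltype(d)> di;
      // indices = 0, 2, 4, ...: gathers base[0], base[2], base[4], ...
      const auto indices = hn::ShiftLeft<1>(hn::Iota(di, 0));
      return hn::ReduceSum(d, hn::GatherIndex(d, base, indices));
    }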
@@ -3294,7 +3854,7 @@ HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  const Twice<decltype(
+  const Twice<decltype(du)> dut;
   return BitCast(d, VFromD<decltype(du)>{
                         _mm256_extracti128_si256(BitCast(dut, v).raw, 1)});
 }
@@ -3375,22 +3935,16 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API VFromD<D> ZeroExtendVector(D /* tag */, VFromD<Half<D>> lo) {
 #if HWY_HAVE_ZEXT
   return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
+#elif HWY_COMPILER_MSVC
+  // Workaround: _mm256_inserti128_si256 does not actually zero the hi part.
+  return VFromD<D>{_mm256_set_m128i(_mm_setzero_si128(), lo.raw)};
 #else
   return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
 #endif
 }
-
-HWY_API Vec256<bfloat16_t> ZeroExtendVector(D d, Vec128<bfloat16_t> lo) {
-  (void)d;
-#if HWY_HAVE_ZEXT
-  return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
-#else
-  return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
-#endif  // HWY_HAVE_ZEXT
-}
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
 HWY_API Vec256<float16_t> ZeroExtendVector(D d, Vec128<float16_t> lo) {
-#if HWY_HAVE_FLOAT16
 #if HWY_HAVE_ZEXT
   (void)d;
   return Vec256<float16_t>{_mm256_zextph128_ph256(lo.raw)};
@@ -3398,15 +3952,8 @@ HWY_API Vec256<float16_t> ZeroExtendVector(D d, Vec128<float16_t> lo) {
   const RebindToUnsigned<D> du;
   return BitCast(d, ZeroExtendVector(du, BitCast(du, lo)));
 #endif  // HWY_HAVE_ZEXT
-#else
-  (void)d;
-#if HWY_HAVE_ZEXT
-  return VFromD<D>{_mm256_zextsi128_si256(lo.raw)};
-#else
-  return VFromD<D>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
-#endif  // HWY_HAVE_ZEXT
-#endif  // HWY_HAVE_FLOAT16
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ZeroExtendVector(D /* tag */, Vec128<float> lo) {
 #if HWY_HAVE_ZEXT
@@ -3443,8 +3990,11 @@ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
 HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
-  const
-
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  const Half<decltype(du)> dh_u;
+  const auto lo256 = ZeroExtendVector(du, BitCast(dh_u, lo));
+  return BitCast(d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+                        lo256.raw, BitCast(dh_u, hi).raw, 1)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> Combine(D d, Vec128<float> hi, Vec128<float> lo) {
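
A sketch of Combine/ZeroExtendVector, the ops reworked above (Widen and WidenZero are hypothetical helpers): Combine concatenates two half-width vectors, and ZeroExtendVector is the special case whose upper half is guaranteed zero.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    template <class D>
    hn::VFromD<D> Widen(D d, hn::VFromD<hn::Half<D>> hi,
                        hn::VFromD<hn::Half<D>> lo) {
      return hn::Combine(d, hi, lo);  // result = [hi | lo]
    }

    template <class D>
    hn::VFromD<D> WidenZero(D d, hn::VFromD<hn::Half<D>> lo) {
      return hn::ZeroExtendVector(d, lo);  // result = [0 | lo]
    }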
@@ -3547,8 +4097,12 @@ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
 template <class T, HWY_IF_T_SIZE(T, 2)>
 HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                    Vec256<T> v) {
-  const
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  const Half<decltype(d)> dh;
+  const RebindToUnsigned<decltype(dh)> dh_u;
+  return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastw_epi16(
+                        BitCast(dh_u, LowerHalf(dh, v)).raw)});
 }

 template <class T, HWY_IF_UI32(T)>
@@ -3983,7 +4537,10 @@ HWY_API Vec256<double> TwoTablesLookupLanes(Vec256<double> a, Vec256<double> b,

 template <typename T>
 HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
+                        BitCast(du, v).raw, _MM_SHUFFLE(1, 0, 3, 2))});
 }

 HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
@@ -4022,9 +4579,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
       _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
 #else
   const RebindToSigned<decltype(d)> di;
-
-  0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100
-  const auto rev128 = TableLookupBytes(v,
+  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
+      di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+  const auto rev128 = TableLookupBytes(v, shuffle);
   return VFromD<D>{
       _mm256_permute4x64_epi64(rev128.raw, _MM_SHUFFLE(1, 0, 3, 2))};
 #endif
@@ -4053,9 +4610,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
   const RebindToSigned<decltype(d)> di;
-
-  0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908
-  return BitCast(d, TableLookupBytes(v,
+  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
+      di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
+  return BitCast(d, TableLookupBytes(v, shuffle));
 }

 // 32 bit Reverse4 defined in x86_128.
@@ -4071,9 +4628,9 @@ HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
   const RebindToSigned<decltype(d)> di;
-
-  0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100
-  return BitCast(d, TableLookupBytes(v,
+  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
+      di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+  return BitCast(d, TableLookupBytes(v, shuffle));
 }

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
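
These Reverse hunks only change how the 16-bit shuffle constants are materialized (Dup128VecFromValues instead of a static array); the observable op is unchanged. A sketch, with a hypothetical helper name:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    template <class D>
    hn::VFromD<D> ReverseLanes(D d, hn::VFromD<D> v) {
      return hn::Reverse(d, v);  // lane i <- lane Lanes(d) - 1 - i
    }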
@@ -4162,8 +4719,12 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   const Half<decltype(d)> d2;
-
+  const RebindToUnsigned<decltype(d2)> du2;  // for float16_t
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+             BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi,
@@ -4180,8 +4741,10 @@ HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi,

 // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatLowerUpper(D
-
+HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+                        BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi,
@@ -4196,8 +4759,10 @@ HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi,

 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperLower(D
-
+HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
+                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi,
@@ -4212,8 +4777,10 @@ HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi,

 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
-HWY_API VFromD<D> ConcatUpperUpper(D
-
+HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+                        BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)});
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi,
@@ -4274,7 +4841,8 @@ HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
   const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
   const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
-  return VFromD<
+  return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
+                        u16, _MM_SHUFFLE(3, 1, 2, 0))});
 #endif
 }

@@ -4380,7 +4948,8 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
   const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
   const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
-  return VFromD<
+  return BitCast(d, VFromD<decltype(du)>{_mm256_permute4x64_epi64(
+                        u16, _MM_SHUFFLE(3, 1, 2, 0))});
 #endif
 }

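
A sketch of ConcatOdd/ConcatEven, whose 16-bit AVX2 tail is rewritten above (SplitPairs is a hypothetical helper); together they de-interleave pairs:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    template <class D>
    void SplitPairs(D d, hn::VFromD<D> hi, hn::VFromD<D> lo,
                    hn::VFromD<D>* even, hn::VFromD<D>* odd) {
      *even = hn::ConcatEven(d, hi, lo);  // lanes 0,2,4,... of lo, then of hi
      *odd = hn::ConcatOdd(d, hi, lo);    // lanes 1,3,5,... of lo, then of hi
    }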
@@ -4450,6 +5019,126 @@ HWY_API Vec256<double> ConcatEven(D d, Vec256<double> hi, Vec256<double> lo) {
 #endif
 }

+// ------------------------------ InterleaveWholeLower
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_TARGET <= HWY_AVX3_DL
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint8_t kIdx[32] = {
+      0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+      8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
+  return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+#else
+  return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint16_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
+                                                    4, 20, 5, 21, 6, 22, 7, 23};
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
+             BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+  return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
+  return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
+  return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
+  return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+#else  // AVX2
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+}
+#endif
+
+// ------------------------------ InterleaveWholeUpper
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_TARGET <= HWY_AVX3_DL
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint8_t kIdx[32] = {
+      16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+      24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
+  return VFromD<D>{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
+#else
+  return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint16_t kIdx[16] = {
+      8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm256_permutex2var_epi16(
+             BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+  return VFromD<D>{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
+  return VFromD<D>{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
+  return VFromD<D>{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
+  return VFromD<D>{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
+}
+#else  // AVX2
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
+}
+#endif
+
 // ------------------------------ DupEven (InterleaveLower)

 template <typename T, HWY_IF_UI32(T)>
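
The newly added InterleaveWholeLower/Upper differ from the per-block InterleaveLower/Upper: they interleave across the entire vector, so a = {a0,a1,...} and b = {b0,b1,...} yield {a0,b0,a1,b1,...}. A sketch with a hypothetical helper:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    template <class D>
    hn::VFromD<D> ZipWholeLower(D d, hn::VFromD<D> a, hn::VFromD<D> b) {
      return hn::InterleaveWholeLower(d, a, b);
    }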
@@ -4490,9 +5179,10 @@ template <typename T, HWY_IF_T_SIZE(T, 1)>
 HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
   const DFromV<decltype(a)> d;
   const Full256<uint8_t> d8;
-
-
-
+  const VFromD<decltype(d8)> mask =
+      Dup128VecFromValues(d8, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF,
+                          0, 0xFF, 0, 0xFF, 0);
+  return IfThenElse(MaskFromVec(BitCast(d, mask)), b, a);
 }

 template <typename T, HWY_IF_UI16(T)>
@@ -4505,7 +5195,8 @@ HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {

 #if HWY_HAVE_FLOAT16
 HWY_INLINE Vec256<float16_t> OddEven(Vec256<float16_t> a, Vec256<float16_t> b) {
-  return Vec256<float16_t>{
+  return Vec256<float16_t>{
+      _mm256_mask_blend_ph(static_cast<__mmask16>(0x5555), a.raw, b.raw)};
 }
 #endif  // HWY_HAVE_FLOAT16

@@ -4531,7 +5222,10 @@ HWY_API Vec256<double> OddEven(Vec256<double> a, Vec256<double> b) {

 template <typename T, HWY_IF_NOT_FLOAT3264(T)>
 Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
-
+  const DFromV<decltype(odd)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
+                        BitCast(du, odd).raw, BitCast(du, even).raw, 0xFu)});
 }

 HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) {
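
A sketch of OddEven, whose byte and float16 paths are rewritten above: lane i of the result comes from the first argument when i is odd, else from the second (BlendOddEven is a hypothetical helper):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    template <class V>
    V BlendOddEven(V odd, V even) {
      return hn::OddEven(odd, even);
    }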
@@ -4554,7 +5248,10 @@ HWY_API VFromD<D> ReverseBlocks(D /*d*/, VFromD<D> v) {
 // Both full
 template <typename T, typename TI>
 HWY_API Vec256<TI> TableLookupBytes(Vec256<T> bytes, Vec256<TI> from) {
-
+  const DFromV<decltype(from)> d;
+  return BitCast(d, Vec256<uint8_t>{_mm256_shuffle_epi8(
+                        BitCast(Full256<uint8_t>(), bytes).raw,
+                        BitCast(Full256<uint8_t>(), from).raw)});
 }

 // Partial index vector
@@ -5114,14 +5811,15 @@ HWY_API Vec256<uint8_t> Shl(hwy::UnsignedTag tag, Vec256<uint8_t> v,
   const DFromV<decltype(v)> d;
 #if HWY_TARGET <= HWY_AVX3_DL
   (void)tag;
-  //
-
-  0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01,
+  // masks[i] = 0xFF >> i
+  const VFromD<decltype(d)> masks =
+      Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
+                          0, 0, 0, 0, 0, 0, 0);
   // kShl[i] = 1 << i
-
-
-  v = And(v, TableLookupBytes(
-  const VFromD<decltype(d)> mul = TableLookupBytes(
+  const VFromD<decltype(d)> shl = Dup128VecFromValues(
+      d, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
+  v = And(v, TableLookupBytes(masks, bits));
+  const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
   return VFromD<decltype(d)>{_mm256_gf2p8mul_epi8(v.raw, mul.raw)};
 #else
   const Repartition<uint16_t, decltype(d)> dw;
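
A sketch of the per-lane u8 shift that the AVX3_DL path above implements via a GF(2^8) multiply (ShiftEach is hypothetical; each count must be below 8):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;
    using DU8 = hn::ScalableTag<uint8_t>;

    hn::VFromD<DU8> ShiftEach(hn::VFromD<DU8> v, hn::VFromD<DU8> bits) {
      return hn::Shl(v, bits);  // result[i] = v[i] << bits[i]
    }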
@@ -5472,11 +6170,36 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
-  return VFromD<D>{
-
+  return VFromD<D>{_mm256_maskz_cvttps_epu64(
+      detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
 }
 #endif  // HWY_TARGET <= HWY_AVX3

+// ------------------------------ PromoteEvenTo/PromoteOddTo
+#if HWY_TARGET > HWY_AVX3
+namespace detail {
+
+// I32->I64 PromoteEvenTo/PromoteOddTo
+
+template <class D, HWY_IF_LANES_D(D, 4)>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D d_to,
+                                   Vec256<int32_t> v) {
+  return BitCast(d_to, OddEven(DupEven(BroadcastSignBit(v)), v));
+}
+
+template <class D, HWY_IF_LANES_D(D, 4)>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                  hwy::SignedTag /*from_type_tag*/, D d_to,
+                                  Vec256<int32_t> v) {
+  return BitCast(d_to, OddEven(BroadcastSignBit(v), DupOdd(v)));
+}
+
+}  // namespace detail
+#endif
+
 // ------------------------------ Demotions (full -> part w/ narrow lanes)

 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
@@ -5565,32 +6288,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {

 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
   return VFromD<D>{_mm256_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
   return VFromD<D>{_mm256_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<int64_t> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
   return VFromD<D>{_mm256_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
 }

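
A sketch of the signed-to-unsigned DemoteTo paths above, which saturate and clamp negative lanes to zero (DemoteI64ToU32 is hypothetical and handles exactly one vector):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    void DemoteI64ToU32(const int64_t* HWY_RESTRICT in,
                        uint32_t* HWY_RESTRICT out) {
      const hn::ScalableTag<int64_t> d64;
      const hn::Rebind<uint32_t, decltype(d64)> du32;  // half-width result
      hn::StoreU(hn::DemoteTo(du32, hn::LoadU(d64, in)), du32, out);
    }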
@@ -5617,14 +6325,22 @@ HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")

 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
 HWY_API VFromD<D> DemoteTo(D df16, Vec256<float> v) {
-  (
-  return
+  const RebindToUnsigned<decltype(df16)> du16;
+  return BitCast(
+      df16, VFromD<decltype(du16)>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
 }

 HWY_DIAGNOSTICS(pop)

 #endif  // HWY_DISABLE_F16C

+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec256<double> v) {
+  return VFromD<D>{_mm256_cvtpd_ph(v.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
 HWY_API VFromD<D> DemoteTo(D dbf16, Vec256<float> v) {
   // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
@@ -5777,8 +6493,8 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteTo(D du32, Vec256<double> v) {
 #if HWY_TARGET <= HWY_AVX3
   (void)du32;
-  return VFromD<D>{
-
+  return VFromD<D>{_mm256_maskz_cvttpd_epu32(
+      detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
 #else  // AVX2
   const Rebind<double, decltype(du32)> df64;
   const RebindToUnsigned<decltype(df64)> du64;
@@ -5967,6 +6683,11 @@ HWY_API VFromD<D> ConvertTo(D d, Vec256<float16_t> v) {
   return detail::FixConversionOverflow(d, v,
                                        VFromD<D>{_mm256_cvttph_epi16(v.raw)});
 }
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
+HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
+  return VFromD<D>{_mm256_maskz_cvttph_epu16(
+      detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
+}
 #endif  // HWY_HAVE_FLOAT16

 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
@@ -5983,13 +6704,13 @@ HWY_API VFromD<D> ConvertTo(D di, Vec256<double> v) {
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
 HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-  return VFromD<DU>{
-
+  return VFromD<DU>{_mm256_maskz_cvttps_epu32(
+      detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
 }
 template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
 HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
-  return VFromD<DU>{
-
+  return VFromD<DU>{_mm256_maskz_cvttpd_epu64(
+      detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
 }
 #else  // AVX2
 template <class DU32, HWY_IF_V_SIZE_D(DU32, 32), HWY_IF_U32_D(DU32)>
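
A sketch of float-to-unsigned ConvertTo, which the hunk above implements with _mm256_maskz_cvttps_epu32 so that negative lanes become zero (ToU32 is hypothetical):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;
    using DF = hn::ScalableTag<float>;

    hn::VFromD<hn::RebindToUnsigned<DF>> ToU32(hn::VFromD<DF> v) {
      const hn::RebindToUnsigned<DF> du;
      return hn::ConvertTo(du, v);  // truncates toward zero; negatives -> 0
    }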
@@ -6035,6 +6756,15 @@ HWY_API VFromD<D> PromoteTo(D df32, Vec128<float16_t> v) {

 #endif  // HWY_DISABLE_F16C

+#if HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec64<float16_t> v) {
+  return VFromD<D>{_mm256_cvtph_pd(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> PromoteTo(D df32, Vec128<bfloat16_t> v) {
   const Rebind<uint16_t, decltype(df32)> du16;
@@ -6120,14 +6850,14 @@ template <uint8_t kRcon>
 HWY_API Vec256<uint8_t> AESKeyGenAssist(Vec256<uint8_t> v) {
   const Full256<uint8_t> d;
 #if HWY_TARGET <= HWY_AVX3_DL
-
-  0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0
-
-  0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12
+  const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
+      d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
+  const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
+      d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
   const Repartition<uint32_t, decltype(d)> du32;
   const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
-  const auto sub_word_result = AESLastRound(w13,
-  return TableLookupBytes(sub_word_result,
+  const auto sub_word_result = AESLastRound(w13, rconXorMask);
+  return TableLookupBytes(sub_word_result, rotWordShuffle);
 #else
   const Half<decltype(d)> d2;
   return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
@@ -6387,9 +7117,9 @@ HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
       0x0303030303030303ull};
   const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));

-
-
-  return RebindMask(d, TestBit(rep8,
+  const VFromD<decltype(du)> bit = Dup128VecFromValues(
+      du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+  return RebindMask(d, TestBit(rep8, bit));
 }

 template <typename T, HWY_IF_T_SIZE(T, 2)>
@@ -6923,6 +7653,16 @@ HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,

 #endif  // HWY_TARGET <= HWY_AVX3

+// ------------------------------ Dup128MaskFromMaskBits
+
+// Generic for all vector lengths >= 32 bytes
+template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  const Half<decltype(d)> dh;
+  const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
+  return CombineMasks(d, mh, mh);
+}
+
 // ------------------------------ Expand

 // Always define Expand/LoadExpand because generic_ops only does so for Vec128.
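
A sketch of the new Dup128MaskFromMaskBits: bit i of mask_bits selects lane i within each 128-bit block, and the pattern repeats across blocks (KeepFirstTwoPerBlock is hypothetical):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    template <class D>
    hn::VFromD<D> KeepFirstTwoPerBlock(D d, hn::VFromD<D> v) {
      const auto m = hn::Dup128MaskFromMaskBits(d, 0x3u);  // lanes 0 and 1
      return hn::IfThenElseZero(m, v);
    }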
@@ -7396,116 +8136,9 @@ HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
 }
 #endif  // HWY_TARGET <= HWY_AVX3

-// ------------------------------ Reductions
-
-namespace detail {
-
-// These functions start with each lane per 128-bit block being reduced with the
-// corresponding lane in the other block, so we use the same logic as x86_128
-// but running on both blocks at the same time. There are two (64-bit) to eight
-// (16-bit) lanes per block.
-template <typename T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v10) {
-  const DFromV<decltype(v10)> d;
-  return Add(v10, Reverse2(d, v10));
-}
-template <typename T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v10) {
-  const DFromV<decltype(v10)> d;
-  return Min(v10, Reverse2(d, v10));
-}
-template <typename T, HWY_IF_T_SIZE(T, 8)>
-HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v10) {
-  const DFromV<decltype(v10)> d;
-  return Max(v10, Reverse2(d, v10));
-}
-
-template <typename T, HWY_IF_T_SIZE(T, 4)>
-HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v3210) {
-  using V = decltype(v3210);
-  const DFromV<V> d;
-  const V v0123 = Reverse4(d, v3210);
-  const V v03_12_12_03 = Add(v3210, v0123);
-  const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-  return Add(v03_12_12_03, v12_03_03_12);
-}
-template <typename T, HWY_IF_T_SIZE(T, 4)>
-HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v3210) {
-  using V = decltype(v3210);
-  const DFromV<V> d;
-  const V v0123 = Reverse4(d, v3210);
-  const V v03_12_12_03 = Min(v3210, v0123);
-  const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-  return Min(v03_12_12_03, v12_03_03_12);
-}
-template <typename T, HWY_IF_T_SIZE(T, 4)>
-HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v3210) {
-  using V = decltype(v3210);
-  const DFromV<V> d;
-  const V v0123 = Reverse4(d, v3210);
-  const V v03_12_12_03 = Max(v3210, v0123);
-  const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-  return Max(v03_12_12_03, v12_03_03_12);
-}
-
-template <typename T, HWY_IF_T_SIZE(T, 2)>
-HWY_INLINE Vec256<T> SumOfLanes(Vec256<T> v76543210) {
-  using V = decltype(v76543210);
-  const DFromV<V> d;
-  // The upper half is reversed from the lower half; omit for brevity.
-  const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210));
-  const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07));
-  return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
-}
-template <typename T, HWY_IF_T_SIZE(T, 2)>
-HWY_INLINE Vec256<T> MinOfLanes(Vec256<T> v76543210) {
-  using V = decltype(v76543210);
-  const DFromV<V> d;
-  // The upper half is reversed from the lower half; omit for brevity.
-  const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210));
-  const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07));
-  return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
-}
-template <typename T, HWY_IF_T_SIZE(T, 2)>
-HWY_INLINE Vec256<T> MaxOfLanes(Vec256<T> v76543210) {
-  using V = decltype(v76543210);
-  const DFromV<V> d;
-  // The upper half is reversed from the lower half; omit for brevity.
-  const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210));
-  const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07));
-  return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
-}
-
-}  // namespace detail
-
-// Supported for >8-bit types. Returns the broadcasted result.
-template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> SumOfLanes(D /*d*/, VFromD<D> vHL) {
-  const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
-  return detail::SumOfLanes(Add(vLH, vHL));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
-  return GetLane(SumOfLanes(d, v));
-}
-#if HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API float16_t ReduceSum(D, VFromD<D> v) {
-  return _mm256_reduce_add_ph(v.raw);
-}
-#endif  // HWY_HAVE_FLOAT16
-template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> MinOfLanes(D /*d*/, VFromD<D> vHL) {
-  const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
-  return detail::MinOfLanes(Min(vLH, vHL));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> MaxOfLanes(D /*d*/, VFromD<D> vHL) {
-  const VFromD<D> vLH = SwapAdjacentBlocks(vHL);
-  return detail::MaxOfLanes(Max(vLH, vHL));
-}
+// ------------------------------ Reductions in generic_ops

-//
+// ------------------------------ LeadingZeroCount

 #if HWY_TARGET <= HWY_AVX3
 template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>