@img/sharp-libvips-dev 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/aom/aom_encoder.h +3 -3
- package/include/aom/aomcx.h +17 -8
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/fontconfig/fontconfig.h +5 -3
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
- package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
- package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
- package/include/glib-2.0/gio/gappinfo.h +0 -7
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
- package/include/glib-2.0/gio/gasyncinitable.h +0 -7
- package/include/glib-2.0/gio/gasyncresult.h +0 -6
- package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
- package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
- package/include/glib-2.0/gio/gbytesicon.h +0 -5
- package/include/glib-2.0/gio/gcancellable.h +0 -5
- package/include/glib-2.0/gio/gconverter.h +0 -7
- package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
- package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
- package/include/glib-2.0/gio/gdatagrambased.h +0 -7
- package/include/glib-2.0/gio/gdatainputstream.h +0 -6
- package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
- package/include/glib-2.0/gio/gdbusinterface.h +0 -8
- package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusmessage.h +2 -1
- package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusproxy.h +0 -8
- package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
- package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
- package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gemblem.h +0 -5
- package/include/glib-2.0/gio/gemblemedicon.h +0 -5
- package/include/glib-2.0/gio/gfile.h +0 -10
- package/include/glib-2.0/gio/gfileenumerator.h +0 -5
- package/include/glib-2.0/gio/gfileicon.h +0 -5
- package/include/glib-2.0/gio/gfileinfo.h +0 -5
- package/include/glib-2.0/gio/gfileinputstream.h +0 -8
- package/include/glib-2.0/gio/gfileiostream.h +0 -8
- package/include/glib-2.0/gio/gfilemonitor.h +0 -5
- package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
- package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
- package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
- package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
- package/include/glib-2.0/gio/gicon.h +0 -5
- package/include/glib-2.0/gio/ginitable.h +0 -7
- package/include/glib-2.0/gio/ginputstream.h +0 -5
- package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gioenums.h +6 -1
- package/include/glib-2.0/gio/giomodule.h +0 -5
- package/include/glib-2.0/gio/giostream.h +0 -5
- package/include/glib-2.0/gio/giotypes.h +5 -108
- package/include/glib-2.0/gio/gloadableicon.h +0 -6
- package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
- package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
- package/include/glib-2.0/gio/gmountoperation.h +0 -6
- package/include/glib-2.0/gio/gnetworking.h +4 -0
- package/include/glib-2.0/gio/goutputstream.h +0 -9
- package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
- package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
- package/include/glib-2.0/gio/gproxy.h +0 -7
- package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
- package/include/glib-2.0/gio/gseekable.h +0 -5
- package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
- package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
- package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
- package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
- package/include/glib-2.0/gio/gsocket.h +13 -0
- package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
- package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
- package/include/glib-2.0/gio/gtask.h +12 -0
- package/include/glib-2.0/gio/gthemedicon.h +0 -5
- package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
- package/include/glib-2.0/gio/gvfs.h +0 -5
- package/include/glib-2.0/gio/gvolume.h +2 -2
- package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
- package/include/glib-2.0/girepository/gi-visibility.h +986 -0
- package/include/glib-2.0/girepository/giarginfo.h +100 -0
- package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
- package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
- package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
- package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
- package/include/glib-2.0/girepository/gienuminfo.h +82 -0
- package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
- package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
- package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
- package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +247 -0
- package/include/glib-2.0/girepository/girffi.h +129 -0
- package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
- package/include/glib-2.0/girepository/gistructinfo.h +102 -0
- package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
- package/include/glib-2.0/girepository/gitypelib.h +61 -0
- package/include/glib-2.0/girepository/gitypes.h +421 -0
- package/include/glib-2.0/girepository/giunioninfo.h +105 -0
- package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
- package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
- package/include/glib-2.0/glib/deprecated/grel.h +0 -23
- package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
- package/include/glib-2.0/glib/gatomic.h +20 -20
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
- package/include/glib-2.0/glib/gchecksum.h +0 -10
- package/include/glib-2.0/glib/gdate.h +0 -9
- package/include/glib-2.0/glib/gdatetime.h +33 -1
- package/include/glib-2.0/glib/gdir.h +5 -0
- package/include/glib-2.0/glib/ghmac.h +0 -9
- package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +1 -0
- package/include/glib-2.0/glib/gmessages.h +11 -0
- package/include/glib-2.0/glib/gpathbuf.h +0 -7
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstdio.h +1 -1
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
- package/include/glib-2.0/glib/gtestutils.h +5 -0
- package/include/glib-2.0/glib/gthread.h +216 -3
- package/include/glib-2.0/glib/gunicode.h +12 -2
- package/include/glib-2.0/glib/gvarianttype.h +1 -10
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib/gwin32.h +4 -4
- package/include/glib-2.0/glib-unix.h +214 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gbinding.h +0 -8
- package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
- package/include/glib-2.0/gobject/gclosure.h +1 -9
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +44 -0
- package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject.h +1 -16
- package/include/glib-2.0/gobject/gparam.h +3 -12
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
- package/include/glib-2.0/gobject/gtype.h +53 -20
- package/include/glib-2.0/gobject/gtypemodule.h +0 -7
- package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
- package/include/glib-2.0/gobject/gvaluearray.h +0 -7
- package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
- package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/lcms2.h +46 -7
- package/include/lcms2_plugin.h +4 -4
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/HTMLparser.h +23 -0
- package/include/libxml2/libxml/SAX.h +0 -2
- package/include/libxml2/libxml/SAX2.h +0 -2
- package/include/libxml2/libxml/c14n.h +0 -2
- package/include/libxml2/libxml/dict.h +1 -0
- package/include/libxml2/libxml/encoding.h +16 -14
- package/include/libxml2/libxml/entities.h +4 -0
- package/include/libxml2/libxml/globals.h +15 -503
- package/include/libxml2/libxml/hash.h +57 -61
- package/include/libxml2/libxml/nanoftp.h +2 -2
- package/include/libxml2/libxml/parser.h +137 -18
- package/include/libxml2/libxml/parserInternals.h +1 -0
- package/include/libxml2/libxml/relaxng.h +2 -1
- package/include/libxml2/libxml/schemasInternals.h +1 -0
- package/include/libxml2/libxml/schematron.h +1 -0
- package/include/libxml2/libxml/threads.h +4 -11
- package/include/libxml2/libxml/tree.h +68 -20
- package/include/libxml2/libxml/uri.h +2 -1
- package/include/libxml2/libxml/valid.h +2 -0
- package/include/libxml2/libxml/xmlIO.h +65 -13
- package/include/libxml2/libxml/xmlerror.h +37 -8
- package/include/libxml2/libxml/xmlmemory.h +37 -40
- package/include/libxml2/libxml/xmlreader.h +6 -0
- package/include/libxml2/libxml/xmlregexp.h +2 -9
- package/include/libxml2/libxml/xmlsave.h +9 -0
- package/include/libxml2/libxml/xmlschemas.h +3 -0
- package/include/libxml2/libxml/xmlversion.h +28 -43
- package/include/libxml2/libxml/xpath.h +1 -1
- package/include/libxml2/libxml/xpathInternals.h +2 -1
- package/include/libxml2/libxml/xpointer.h +5 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +3 -3
- package/include/pixman-1/pixman.h +9 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/include/zconf.h +3 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +15 -15
package/include/hwy/ops/generic_ops-inl.h (excerpt):

```diff
@@ -17,6 +17,9 @@
 
 // Target-independent types/functions defined after target-specific ops.
 
+// The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
+// the generic implementation here if native ops are already defined.
+
 #include "hwy/base.h"
 
 // Define detail::Shuffle1230 etc, but only when viewing the current header;
@@ -194,6 +197,21 @@ HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
 #endif
 }
 
+// ------------------------------ MaskFalse
+#if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MASK_FALSE
+#undef HWY_NATIVE_MASK_FALSE
+#else
+#define HWY_NATIVE_MASK_FALSE
+#endif
+
+template <class D>
+HWY_API Mask<D> MaskFalse(D d) {
+  return MaskFromVec(Zero(d));
+}
+
+#endif  // HWY_NATIVE_MASK_FALSE
+
 // ------------------------------ BitwiseIfThenElse
 #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
 #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
@@ -209,9 +227,634 @@ HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
 
 #endif  // HWY_NATIVE_BITWISE_IF_THEN_ELSE
 
-//
-
-
+// ------------------------------ PromoteMaskTo
+
+#if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_PROMOTE_MASK_TO
+#undef HWY_NATIVE_PROMOTE_MASK_TO
+#else
+#define HWY_NATIVE_PROMOTE_MASK_TO
+#endif
+
+template <class DTo, class DFrom>
+HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
+  static_assert(
+      sizeof(TFromD<DTo>) > sizeof(TFromD<DFrom>),
+      "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
+  static_assert(
+      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
+      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
+
+  const RebindToSigned<decltype(d_to)> di_to;
+  const RebindToSigned<decltype(d_from)> di_from;
+
+  return MaskFromVec(BitCast(
+      d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
+}
+
+#endif  // HWY_NATIVE_PROMOTE_MASK_TO
+
+// ------------------------------ DemoteMaskTo
+
+#if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_DEMOTE_MASK_TO
+#undef HWY_NATIVE_DEMOTE_MASK_TO
+#else
+#define HWY_NATIVE_DEMOTE_MASK_TO
+#endif
+
+template <class DTo, class DFrom>
+HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
+  static_assert(sizeof(TFromD<DTo>) < sizeof(TFromD<DFrom>),
+                "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
+  static_assert(
+      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
+      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
+
+  const RebindToSigned<decltype(d_to)> di_to;
+  const RebindToSigned<decltype(d_from)> di_from;
+
+  return MaskFromVec(
+      BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
+}
+
+#endif  // HWY_NATIVE_DEMOTE_MASK_TO
+
+// ------------------------------ CombineMasks
+
+#if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_COMBINE_MASKS
+#undef HWY_NATIVE_COMBINE_MASKS
+#else
+#define HWY_NATIVE_COMBINE_MASKS
+#endif
+
+#if HWY_TARGET != HWY_SCALAR
+template <class D>
+HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
+  const Half<decltype(d)> dh;
+  return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo)));
+}
+#endif
+
+#endif  // HWY_NATIVE_COMBINE_MASKS
+
+// ------------------------------ LowerHalfOfMask
+
+#if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
+#undef HWY_NATIVE_LOWER_HALF_OF_MASK
+#else
+#define HWY_NATIVE_LOWER_HALF_OF_MASK
+#endif
+
+template <class D>
+HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
+  const Twice<decltype(d)> dt;
+  return MaskFromVec(LowerHalf(d, VecFromMask(dt, m)));
+}
+
+#endif  // HWY_NATIVE_LOWER_HALF_OF_MASK
+
+// ------------------------------ UpperHalfOfMask
+
+#if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
+#undef HWY_NATIVE_UPPER_HALF_OF_MASK
+#else
+#define HWY_NATIVE_UPPER_HALF_OF_MASK
+#endif
+
+#if HWY_TARGET != HWY_SCALAR
+template <class D>
+HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
+  const Twice<decltype(d)> dt;
+  return MaskFromVec(UpperHalf(d, VecFromMask(dt, m)));
+}
+#endif
+
+#endif  // HWY_NATIVE_UPPER_HALF_OF_MASK
+
+// ------------------------------ OrderedDemote2MasksTo
+
+#if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
+     defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#else
+#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#endif
+
+#if HWY_TARGET != HWY_SCALAR
+template <class DTo, class DFrom>
+HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
+                                        Mask<DFrom> b) {
+  static_assert(
+      sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2,
+      "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
+  static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
+                "Mask<DTo> must be the same type as "
+                "Mask<Repartition<TFromD<DTo>, DFrom>>>()");
+
+  const RebindToSigned<decltype(d_from)> di_from;
+  const RebindToSigned<decltype(d_to)> di_to;
+
+  const auto va = BitCast(di_from, VecFromMask(d_from, a));
+  const auto vb = BitCast(di_from, VecFromMask(d_from, b));
+  return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)));
+}
+#endif
+
+#endif  // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+
+// ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
+#if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
+#undef HWY_NATIVE_INTERLEAVE_WHOLE
+#else
+#define HWY_NATIVE_INTERLEAVE_WHOLE
+#endif
+
+#if HWY_TARGET != HWY_SCALAR
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
+  // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
+  // D().MaxBytes() <= 16 is true
+  return InterleaveLower(d, a, b);
+}
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
+  // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if
+  // D().MaxBytes() <= 16 is true
+  return InterleaveUpper(d, a, b);
+}
+
+// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3
+// is implemented in x86_256-inl.h.
+
+// InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is
+// implemented in x86_512-inl.h.
+
+// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256
+// is implemented in wasm_256-inl.h.
+#endif  // HWY_TARGET != HWY_SCALAR
+
+#endif  // HWY_NATIVE_INTERLEAVE_WHOLE
+
+#if HWY_TARGET != HWY_SCALAR
+// The InterleaveWholeLower without the optional D parameter is generic for all
+// vector lengths.
+template <class V>
+HWY_API V InterleaveWholeLower(V a, V b) {
+  return InterleaveWholeLower(DFromV<V>(), a, b);
+}
+#endif  // HWY_TARGET != HWY_SCALAR
+
+// ------------------------------ AddSub
+
+template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
+HWY_API V AddSub(V a, V b) {
+  // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b)
+  return Sub(a, b);
+}
+
+// AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on
+// SSSE3/SSE4/AVX2/AVX3
+
+// AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
+// AVX2/AVX3
+template <class V, HWY_IF_V_SIZE_GT_V(V, ((HWY_TARGET <= HWY_SSSE3 &&
+                                           hwy::IsFloat3264<TFromV<V>>())
+                                              ? 32
+                                              : sizeof(TFromV<V>)))>
+HWY_API V AddSub(V a, V b) {
+  using D = DFromV<decltype(a)>;
+  using T = TFromD<D>;
+  using TNegate = If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;
+
+  const D d;
+  const Rebind<TNegate, D> d_negate;
+
+  // Negate the even lanes of b
+  const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));
+
+  return Add(a, negated_even_b);
+}
+
+// ------------------------------ MaskedAddOr etc.
+#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MASKED_ARITH
+#undef HWY_NATIVE_MASKED_ARITH
+#else
+#define HWY_NATIVE_MASKED_ARITH
+#endif
+
+template <class V, class M>
+HWY_API V MaskedMinOr(V no, M m, V a, V b) {
+  return IfThenElse(m, Min(a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
+  return IfThenElse(m, Max(a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedAddOr(V no, M m, V a, V b) {
+  return IfThenElse(m, Add(a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedSubOr(V no, M m, V a, V b) {
+  return IfThenElse(m, Sub(a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedMulOr(V no, M m, V a, V b) {
+  return IfThenElse(m, Mul(a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedDivOr(V no, M m, V a, V b) {
+  return IfThenElse(m, Div(a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedModOr(V no, M m, V a, V b) {
+  return IfThenElse(m, Mod(a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
+  return IfThenElse(m, SaturatedAdd(a, b), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
+  return IfThenElse(m, SaturatedSub(a, b), no);
+}
+#endif  // HWY_NATIVE_MASKED_ARITH
+
+// ------------------------------ IfNegativeThenNegOrUndefIfZero
+
+#if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
+     defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#else
+#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#endif
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
+#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+  // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
+  const auto zero = Zero(DFromV<V>());
+  return MaskedSubOr(v, Lt(mask, zero), zero, v);
+#else
+  return IfNegativeThenElse(mask, Neg(v), v);
+#endif
+}
+
+#endif  // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
+  return CopySign(v, Xor(mask, v));
+}
+
+// ------------------------------ SaturatedNeg
+
+#if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
+#undef HWY_NATIVE_SATURATED_NEG_8_16_32
+#else
+#define HWY_NATIVE_SATURATED_NEG_8_16_32
+#endif
+
+template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
+          HWY_IF_SIGNED_V(V)>
+HWY_API V SaturatedNeg(V v) {
+  const DFromV<decltype(v)> d;
+  return SaturatedSub(Zero(d), v);
+}
+
+template <class V, HWY_IF_I32(TFromV<V>)>
+HWY_API V SaturatedNeg(V v) {
+  const DFromV<decltype(v)> d;
+
+#if HWY_TARGET == HWY_RVV || \
+    (HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \
+    (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
+  // RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions
+  return SaturatedSub(Zero(d), v);
+#else
+  // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
+  // (v[i] > LimitsMin<int32_t>) ? (-v[i]) : LimitsMax<int32_t>() since
+  // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and
+  // ~LimitsMin<int32_t>() == LimitsMax<int32_t>().
+  return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin<int32_t>()))));
+#endif
+}
+#endif  // HWY_NATIVE_SATURATED_NEG_8_16_32
+
+#if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_SATURATED_NEG_64
+#undef HWY_NATIVE_SATURATED_NEG_64
+#else
+#define HWY_NATIVE_SATURATED_NEG_64
+#endif
+
+template <class V, HWY_IF_I64(TFromV<V>)>
+HWY_API V SaturatedNeg(V v) {
+#if HWY_TARGET == HWY_RVV || \
+    (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
+  // RVV/NEON/SVE have native I64 SaturatedSub instructions
+  const DFromV<decltype(v)> d;
+  return SaturatedSub(Zero(d), v);
+#else
+  const auto neg_v = Neg(v);
+  return Add(neg_v, BroadcastSignBit(And(v, neg_v)));
+#endif
+}
+#endif  // HWY_NATIVE_SATURATED_NEG_64
+
+// ------------------------------ SaturatedAbs
+
+#if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_SATURATED_ABS
+#undef HWY_NATIVE_SATURATED_ABS
+#else
+#define HWY_NATIVE_SATURATED_ABS
+#endif
+
+template <class V, HWY_IF_SIGNED_V(V)>
+HWY_API V SaturatedAbs(V v) {
+  return Max(v, SaturatedNeg(v));
+}
+
+#endif
+
+// ------------------------------ Reductions
+
+// Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled,
+// they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set.
+// Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the
+// SumOfLanes overloads. For the latter group, we here define the remaining
+// overloads, plus ReduceSum which uses them plus GetLane.
+#if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+
+namespace detail {
+
+// Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
+struct AddFunc {
+  template <class V>
+  V operator()(V a, V b) const {
+    return Add(a, b);
+  }
+};
+
+struct MinFunc {
+  template <class V>
+  V operator()(V a, V b) const {
+    return Min(a, b);
+  }
+};
+
+struct MaxFunc {
+  template <class V>
+  V operator()(V a, V b) const {
+    return Max(a, b);
+  }
+};
+
+// No-op for vectors of at most one block.
+template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) {
+  return v;
+}
+
+// Reduces a lane with its counterpart in other block(s). Shared by AVX2 and
+// WASM_EMU256. AVX3 has its own overload.
+template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
+HWY_INLINE VFromD<D> ReduceAcrossBlocks(D /*d*/, Func f, VFromD<D> v) {
+  return f(v, SwapAdjacentBlocks(v));
+}
+
+// These return the reduction result broadcasted across all lanes. They assume
+// the caller has already reduced across blocks.
+
+template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
+HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) {
+  return f(v10, Reverse2(d, v10));
+}
+
+template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
+HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) {
+  const VFromD<D> v0123 = Reverse4(d, v3210);
+  const VFromD<D> v03_12_12_03 = f(v3210, v0123);
+  const VFromD<D> v12_03_03_12 = Reverse2(d, v03_12_12_03);
+  return f(v03_12_12_03, v12_03_03_12);
+}
+
+template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
+HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) {
+  // The upper half is reversed from the lower half; omit for brevity.
+  const VFromD<D> v34_25_16_07 = f(v76543210, Reverse8(d, v76543210));
+  const VFromD<D> v0347_1625_1625_0347 =
+      f(v34_25_16_07, Reverse4(d, v34_25_16_07));
+  return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
+}
+
+template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
+HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
+  const RepartitionToWide<decltype(d)> dw;
+  using VW = VFromD<decltype(dw)>;
+  const VW vw = BitCast(dw, v);
+  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
+  const VW even = And(vw, Set(dw, 0xFF));
+  const VW odd = ShiftRight<8>(vw);
+  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
+#if HWY_IS_LITTLE_ENDIAN
+  return DupEven(BitCast(d, reduced));
+#else
+  return DupOdd(BitCast(d, reduced));
+#endif
+}
+
+template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
+HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
+  const RepartitionToWide<decltype(d)> dw;
+  using VW = VFromD<decltype(dw)>;
+  const VW vw = BitCast(dw, v);
+  // Sign-extend
+  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
+  const VW even = ShiftRight<8>(ShiftLeft<8>(vw));
+  const VW odd = ShiftRight<8>(vw);
+  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
+#if HWY_IS_LITTLE_ENDIAN
+  return DupEven(BitCast(d, reduced));
+#else
+  return DupOdd(BitCast(d, reduced));
+#endif
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_SUM_OF_LANES_D(D)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  const detail::AddFunc f;
+  v = detail::ReduceAcrossBlocks(d, f, v);
+  return detail::ReduceWithinBlocks(d, f, v);
+}
+template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+  const detail::MinFunc f;
+  v = detail::ReduceAcrossBlocks(d, f, v);
+  return detail::ReduceWithinBlocks(d, f, v);
+}
+template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+  const detail::MaxFunc f;
+  v = detail::ReduceAcrossBlocks(d, f, v);
+  return detail::ReduceWithinBlocks(d, f, v);
+}
+
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
+  return GetLane(SumOfLanes(d, v));
+}
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
+  return GetLane(MinOfLanes(d, v));
+}
+template <class D, HWY_IF_REDUCE_D(D)>
+HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
+  return GetLane(MaxOfLanes(d, v));
+}
+
+#endif  // HWY_NATIVE_REDUCE_SCALAR
+
+// Corner cases for both generic and native implementations:
+// N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm)
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
+  return GetLane(v);
+}
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_API TFromD<D> ReduceMin(D /*d*/, VFromD<D> v) {
+  return GetLane(v);
+}
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_API TFromD<D> ReduceMax(D /*d*/, VFromD<D> v) {
+  return GetLane(v);
+}
+
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
+  return v;
+}
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
+  return v;
+}
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
+  return v;
+}
+
+// N=4 for 8-bit is still less than the minimum native size.
+
+// ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8
+// ReduceSum operations
+#if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+#undef HWY_NATIVE_REDUCE_SUM_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_SUM_4_UI8
+#endif
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
+HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
+  const Twice<RepartitionToWide<decltype(d)>> dw;
+  return static_cast<TFromD<D>>(ReduceSum(dw, PromoteTo(dw, v)));
+}
+#endif  // HWY_NATIVE_REDUCE_SUM_4_UI8
+
+// RVV/SVE have target-specific implementations of the N=4 I8/U8
+// ReduceMin/ReduceMax operations
+#if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
+#endif
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
+HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
+  const Twice<RepartitionToWide<decltype(d)>> dw;
+  return static_cast<TFromD<D>>(ReduceMin(dw, PromoteTo(dw, v)));
+}
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
+HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
+  const Twice<RepartitionToWide<decltype(d)>> dw;
+  return static_cast<TFromD<D>>(ReduceMax(dw, PromoteTo(dw, v)));
+}
+#endif  // HWY_NATIVE_REDUCE_MINMAX_4_UI8
+
+// ------------------------------ IsInf, IsFinite
+
+// AVX3 has target-specific implementations of these.
+#if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ISINF
+#undef HWY_NATIVE_ISINF
+#else
+#define HWY_NATIVE_ISINF
+#endif
+
+template <class V, class D = DFromV<V>>
+HWY_API MFromD<D> IsInf(const V v) {
+  using T = TFromD<D>;
+  const D d;
+  const RebindToUnsigned<decltype(d)> du;
+  const VFromD<decltype(du)> vu = BitCast(du, v);
+  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
+  return RebindMask(
+      d,
+      Eq(Add(vu, vu),
+         Set(du, static_cast<MakeUnsigned<T>>(hwy::MaxExponentTimes2<T>()))));
+}
+
+// Returns whether normal/subnormal/zero.
+template <class V, class D = DFromV<V>>
+HWY_API MFromD<D> IsFinite(const V v) {
+  using T = TFromD<D>;
+  const D d;
+  const RebindToUnsigned<decltype(d)> du;
+  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
+  const VFromD<decltype(du)> vu = BitCast(du, v);
+  // 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code
+  // for AVX2 if we instead add vu + vu.
+#if HWY_COMPILER_MSVC
+  const VFromD<decltype(du)> shl = ShiftLeft<1>(vu);
+#else
+  const VFromD<decltype(du)> shl = Add(vu, vu);
+#endif
+
+  // Then shift right so we can compare with the max exponent (cannot compare
+  // with MaxExponentTimes2 directly because it is negative and non-negative
+  // floats would be greater).
+  const VFromD<decltype(di)> exp =
+      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl));
+  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
+}
+
+#endif  // HWY_NATIVE_ISINF
+
+// ------------------------------ LoadInterleaved2
 
 #if HWY_IDE || \
     (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
```
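As an illustration of how the mask, masked-arithmetic, and reduction ops added in the hunk above compose (a minimal sketch, not from the package: it assumes Highway's static-dispatch quick-start, and `SumBelowLimit` is our own name):

```cpp
#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Sums only the elements of a[] strictly below `limit`, using the generic
// MaskedAddOr and ReduceSum defined in the hunk above.
float SumBelowLimit(const float* a, size_t n, float limit) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  const auto vlimit = hn::Set(d, limit);
  auto sum = hn::Zero(d);
  size_t i = 0;
  for (; i + N <= n; i += N) {
    const auto v = hn::LoadU(d, a + i);
    // Where v < limit, accumulate v into the lane; elsewhere keep `sum`.
    sum = hn::MaskedAddOr(sum, hn::Lt(v, vlimit), sum, v);
  }
  float total = hn::ReduceSum(d, sum);  // horizontal reduction to a scalar
  for (; i < n; ++i) {
    if (a[i] < limit) total += a[i];  // scalar tail
  }
  return total;
}
```

Note that `MaskedAddOr(no, m, a, b)` is just `IfThenElse(m, Add(a, b), no)` generically, but targets with native predication (SVE, RVV, AVX-512) can override it, which is the point of the `HWY_NATIVE_MASKED_ARITH` toggle.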
```diff
@@ -221,8 +864,6 @@ HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
 #endif
 
-// ------------------------------ LoadInterleaved2
-
 template <class D, HWY_IF_LANES_GT_D(D, 1)>
 HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1) {
@@ -277,6 +918,7 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
   const RebindToUnsigned<decltype(d)> du;
   using V = VFromD<D>;
+  using VU = VFromD<decltype(du)>;
   // Compact notation so these fit on one line: 12 := v1[2].
   V A;  // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
   V B;  // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
@@ -284,33 +926,33 @@
   detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
   // Compress all lanes belonging to v0 into consecutive lanes.
   constexpr uint8_t Z = 0x80;
-
-      0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z
-
-      Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z
-
-      Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13
-
-      1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z
-
-      Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z
-
-      Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14
-
-      2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z
-
-      Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z
-
-      Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15
-  const V v0L = BitCast(d, TableLookupBytesOr0(A,
-  const V v0M = BitCast(d, TableLookupBytesOr0(B,
-  const V v0U = BitCast(d, TableLookupBytesOr0(C,
-  const V v1L = BitCast(d, TableLookupBytesOr0(A,
-  const V v1M = BitCast(d, TableLookupBytesOr0(B,
-  const V v1U = BitCast(d, TableLookupBytesOr0(C,
-  const V v2L = BitCast(d, TableLookupBytesOr0(A,
-  const V v2M = BitCast(d, TableLookupBytesOr0(B,
-  const V v2U = BitCast(d, TableLookupBytesOr0(C,
+  const VU idx_v0A =
+      Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
+  const VU idx_v0B =
+      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
+  const VU idx_v0C =
+      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
+  const VU idx_v1A =
+      Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
+  const VU idx_v1B =
+      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
+  const VU idx_v1C =
+      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
+  const VU idx_v2A =
+      Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
+  const VU idx_v2B =
+      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
+  const VU idx_v2C =
+      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
+  const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
+  const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
+  const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
+  const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
+  const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
+  const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
+  const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
+  const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
+  const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
   v0 = Xor3(v0L, v0M, v0U);
   v1 = Xor3(v1L, v1M, v1U);
   v2 = Xor3(v2L, v2M, v2U);
@@ -322,30 +964,40 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
   const RebindToUnsigned<decltype(d)> du;
   using V = VFromD<D>;
+  using VU = VFromD<decltype(du)>;
   V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
   V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
   V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
   detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
   // Compress all lanes belonging to v0 into consecutive lanes.
   constexpr uint8_t Z = 0x80;
-
-
-
-
-
-
-
-
-
-
-  const
-
-  const
-
-  const
-
-  const
-
+  const VU idx_v0A =
+      Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
+  const VU idx_v0B =
+      Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
+  const VU idx_v0C =
+      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0);
+  const VU idx_v1A =
+      Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
+  const VU idx_v1B =
+      Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
+  const VU idx_v1C =
+      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0);
+  const VU idx_v2A =
+      Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
+  const VU idx_v2B =
+      Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
+  const VU idx_v2C =
+      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+  const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
+  const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
+  const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
+  const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
+  const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
+  const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
+  const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
+  const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
+  const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
   v0 = Xor3(v0L, v0M, v0U);
   v1 = Xor3(v1L, v1M, v1U);
   v2 = Xor3(v2L, v2M, v2U);
@@ -358,6 +1010,7 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   const RebindToUnsigned<decltype(d)> du;
   const Repartition<uint8_t, decltype(du)> du8;
   using V = VFromD<D>;
+  using VU8 = VFromD<decltype(du8)>;
   V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
   V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
   V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
@@ -365,33 +1018,33 @@
   // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
   // but each element of the array contains a byte index for a byte of a lane.
   constexpr uint8_t Z = 0x80;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  const V v0L = TableLookupBytesOr0(A, BitCast(d,
-  const V v0M = TableLookupBytesOr0(B, BitCast(d,
-  const V v0U = TableLookupBytesOr0(C, BitCast(d,
-  const V v1L = TableLookupBytesOr0(A, BitCast(d,
-  const V v1M = TableLookupBytesOr0(B, BitCast(d,
-  const V v1U = TableLookupBytesOr0(C, BitCast(d,
-  const V v2L = TableLookupBytesOr0(A, BitCast(d,
-  const V v2M = TableLookupBytesOr0(B, BitCast(d,
-  const V v2U = TableLookupBytesOr0(C, BitCast(d,
+  const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C,
+                                          0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
+  const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03,
+                                          0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
+  const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+                                          Z, 0x04, 0x05, 0x0A, 0x0B);
+  const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E,
+                                          0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
+  const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05,
+                                          0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
+  const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+                                          0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
+  const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z,
+                                          Z, Z, Z, Z, Z, Z, Z, Z, Z);
+  const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06,
+                                          0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
+  const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
+                                          0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
+  const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A));
+  const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B));
+  const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C));
+  const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A));
+  const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B));
+  const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C));
+  const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A));
+  const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B));
+  const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C));
   v0 = Xor3(v0L, v0M, v0U);
   v1 = Xor3(v1L, v1M, v1U);
   v2 = Xor3(v2L, v2M, v2U);
@@ -644,16 +1297,16 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
   // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
   // to their place, with 0x80 so lanes to be filled from other vectors are 0
   // to enable blending by ORing together.
-
-      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
-
-
-
-      0x80,
+  const VFromD<decltype(du)> shuf_A0 =
+      Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
+                          0x80, 0x80, 4, 0x80, 0x80, 5);
+  // Cannot reuse shuf_A0 because it contains 5.
+  const VFromD<decltype(du)> shuf_A1 =
+      Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
+                          3, 0x80, 0x80, 4, 0x80, 0x80);
   // The interleaved vectors will be named A, B, C; temporaries with suffix
   // 0..2 indicate which input vector's lanes they hold.
-
-  const auto shuf_A1 = LoadDup128(du, tbl_v1);  // cannot reuse shuf_A0 (has 5)
+  // cannot reuse shuf_A0 (has 5)
   const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
   const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
   const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
@@ -693,19 +1346,16 @@
   // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
   // filled from other vectors are 0 for blending. Note that these are byte
   // indices for 16-bit lanes.
-
-      0x80, 0x80, 0,
-
-
-      0x80, 0x80, 0x80, 0x80, 0,
-
+  const VFromD<decltype(du8)> shuf_A1 =
+      Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
+                          0x80, 0x80, 0x80, 0x80, 4, 5);
+  const VFromD<decltype(du8)> shuf_A2 =
+      Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
+                          0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);
 
   // The interleaved vectors will be named A, B, C; temporaries with suffix
   // 0..2 indicate which input vector's lanes they hold.
-  const auto shuf_A1 = LoadDup128(du8, tbl_v1);  // 2..1..0.
-  // .2..1..0
   const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
-  const auto shuf_A2 = LoadDup128(du8, tbl_v2);  // ..1..0..
 
   const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
   const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
```
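The interleave hunks above all make the same substitution: shuffle constants previously stored in `alignas(16)` byte tables and loaded with `LoadDup128` are now built directly with `Dup128VecFromValues`. A hedged before/after sketch of that pattern (identifiers such as `ShuffleConstOld` are illustrative, not the package's; it assumes the vendored Highway version, which defines `Dup128VecFromValues`):

```cpp
#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Old style (1.0.0): the shuffle constant lives in a static byte table that
// is loaded via LoadDup128 at each use site.
template <class DU>  // DU: an 8-bit unsigned tag, e.g. hn::ScalableTag<uint8_t>
hn::VFromD<DU> ShuffleConstOld(DU du) {
  constexpr uint8_t Z = 0x80;  // TableLookupBytesOr0 zeroes lanes marked Z
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z};
  return hn::LoadDup128(du, tbl_v0);  // reads the table from memory
}

// New style (1.0.2): Dup128VecFromValues materializes the same constant
// directly; each 128-bit block of the result holds these 16 byte values.
template <class DU>
hn::VFromD<DU> ShuffleConstNew(DU du) {
  constexpr uint8_t Z = 0x80;
  return hn::Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z,
                                 Z, Z, Z);
}
```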
```diff
@@ -1104,19 +1754,22 @@ HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
 
 }  // namespace detail
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                         size_t num_lanes) {
   return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                           size_t num_lanes) {
   return (num_lanes > 0) ? LoadU(d, p) : no;
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                         size_t num_lanes) {
   const FixedTag<TFromD<D>, 1> d1;
@@ -1126,7 +1779,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                           size_t num_lanes) {
   const FixedTag<TFromD<D>, 1> d1;
@@ -1136,7 +1790,8 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                         size_t num_lanes) {
   const FixedTag<TFromD<D>, 2> d2;
@@ -1151,7 +1806,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                           size_t num_lanes) {
   const FixedTag<TFromD<D>, 2> d2;
@@ -1166,7 +1822,8 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                         size_t num_lanes) {
   const FixedTag<TFromD<D>, 4> d4;
@@ -1201,7 +1858,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   }
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                           size_t num_lanes) {
   const FixedTag<TFromD<D>, 4> d4;
@@ -1238,7 +1896,8 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   }
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                         size_t num_lanes) {
   const FixedTag<TFromD<D>, 8> d8;
@@ -1283,7 +1942,8 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   }
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
+          HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                           size_t num_lanes) {
   const FixedTag<TFromD<D>, 8> d8;
@@ -1338,7 +1998,7 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
 
 #if HWY_MAX_BYTES >= 32
 
-template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
+template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                         size_t num_lanes) {
   if (num_lanes >= Lanes(d)) return LoadU(d, p);
@@ -1354,7 +2014,7 @@ HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   }
 }
 
-template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
+template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                           size_t num_lanes) {
   if (num_lanes >= Lanes(d)) return LoadU(d, p);
```
|
|
@@ -1374,7 +2034,23 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
|
|
|
1374
2034
|
}
|
|
1375
2035
|
|
|
1376
2036
|
#endif // HWY_MAX_BYTES >= 32
|
|
1377
|
-
|
|
2037
|
+
|
|
2038
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
2039
|
+
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
|
|
2040
|
+
size_t num_lanes) {
|
|
2041
|
+
const RebindToUnsigned<D> du;
|
|
2042
|
+
return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
|
|
2043
|
+
}
|
|
2044
|
+
|
|
2045
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
2046
|
+
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
|
|
2047
|
+
size_t num_lanes) {
|
|
2048
|
+
const RebindToUnsigned<D> du;
|
|
2049
|
+
return BitCast(
|
|
2050
|
+
d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes));
|
|
2051
|
+
}
|
|
2052
|
+
|
|
2053
|
+
#else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
|
|
1378
2054
|
|
|
1379
2055
|
// For SVE and non-sanitizer AVX-512; RVV has its own specialization.
|
|
1380
2056
|
template <class D>
|
|
@@ -1549,9 +2225,7 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
|
|
|
1549
2225
|
|
|
1550
2226
|
BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p);
|
|
1551
2227
|
|
|
1552
|
-
#if HWY_MEM_OPS_MIGHT_FAULT
|
|
1553
2228
|
detail::MaybeUnpoison(p, clamped_max_lanes_to_store);
|
|
1554
|
-
#endif
|
|
1555
2229
|
}
|
|
1556
2230
|
#endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
|
|
1557
2231
|
|
|
@@ -1649,6 +2323,7 @@ HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base,
|
|
|
1649
2323
|
HWY_ALIGN T lanes[MaxLanes(d)];
|
|
1650
2324
|
const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
|
|
1651
2325
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
2326
|
+
HWY_DASSERT(offset_lanes[i] >= 0);
|
|
1652
2327
|
CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
|
|
1653
2328
|
}
|
|
1654
2329
|
return Load(d, lanes);
|
|
@@ -1666,6 +2341,7 @@ HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
|
|
|
1666
2341
|
|
|
1667
2342
|
HWY_ALIGN T lanes[MaxLanes(d)];
|
|
1668
2343
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
2344
|
+
HWY_DASSERT(index_lanes[i] >= 0);
|
|
1669
2345
|
lanes[i] = base[index_lanes[i]];
|
|
1670
2346
|
}
|
|
1671
2347
|
return Load(d, lanes);
|
|
@@ -1687,11 +2363,37 @@ HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
|
|
|
1687
2363
|
|
|
1688
2364
|
HWY_ALIGN T lanes[MaxLanes(d)];
|
|
1689
2365
|
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
2366
|
+
HWY_DASSERT(index_lanes[i] >= 0);
|
|
1690
2367
|
lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0};
|
|
1691
2368
|
}
|
|
1692
2369
|
return Load(d, lanes);
|
|
1693
2370
|
}
|
|
1694
2371
|
|
|
2372
|
+
template <class D, typename T = TFromD<D>>
|
|
2373
|
+
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
|
|
2374
|
+
const T* HWY_RESTRICT base,
|
|
2375
|
+
VFromD<RebindToSigned<D>> index) {
|
|
2376
|
+
const RebindToSigned<D> di;
|
|
2377
|
+
using TI = TFromD<decltype(di)>;
|
|
2378
|
+
static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
|
|
2379
|
+
|
|
2380
|
+
HWY_ALIGN TI index_lanes[MaxLanes(di)];
|
|
2381
|
+
Store(index, di, index_lanes);
|
|
2382
|
+
|
|
2383
|
+
HWY_ALIGN TI mask_lanes[MaxLanes(di)];
|
|
2384
|
+
Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);
|
|
2385
|
+
|
|
2386
|
+
HWY_ALIGN T no_lanes[MaxLanes(d)];
|
|
2387
|
+
Store(no, d, no_lanes);
|
|
2388
|
+
|
|
2389
|
+
HWY_ALIGN T lanes[MaxLanes(d)];
|
|
2390
|
+
for (size_t i = 0; i < MaxLanes(d); ++i) {
|
|
2391
|
+
HWY_DASSERT(index_lanes[i] >= 0);
|
|
2392
|
+
lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i];
|
|
2393
|
+
}
|
|
2394
|
+
return Load(d, lanes);
|
|
2395
|
+
}
|
|
2396
|
+
|
|
1695
2397
|
#endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
|
|
1696
2398
|
|
|
1697
2399
|
// ------------------------------ ScatterN/GatherN
|
|
@@ -1733,10 +2435,14 @@ HWY_API V AbsDiff(V a, V b) {
|
|
|
1733
2435
|
#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
|
|
1734
2436
|
#endif
|
|
1735
2437
|
|
|
1736
|
-
template <class V,
|
|
2438
|
+
template <class V, HWY_IF_UI8_D(DFromV<V>),
|
|
1737
2439
|
HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
|
|
1738
|
-
HWY_API Vec<
|
|
1739
|
-
|
|
2440
|
+
HWY_API Vec<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
|
|
2441
|
+
const DFromV<decltype(a)> d;
|
|
2442
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
2443
|
+
const RepartitionToWideX3<decltype(d)> dw;
|
|
2444
|
+
|
|
2445
|
+
return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b))));
|
|
1740
2446
|
}
|
|
1741
2447
|
|
|
1742
2448
|
#endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF
|
|
@@ -1923,6 +2629,248 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
|
|
|
1923
2629
|
#endif // HWY_TARGET != HWY_SCALAR
|
|
1924
2630
|
#endif // HWY_NATIVE_PROMOTE_UPPER_TO
|
|
1925
2631
|
|
|
2632
|
+
// ------------------------------ PromoteEvenTo/PromoteOddTo
|
|
2633
|
+
|
|
2634
|
+
#if HWY_TARGET != HWY_SCALAR
|
|
2635
|
+
namespace detail {
|
|
2636
|
+
|
|
2637
|
+
// Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as
|
|
2638
|
+
// there are target-specific specializations for some of the
|
|
2639
|
+
// detail::PromoteEvenTo and detail::PromoteOddTo cases on
|
|
2640
|
+
// SVE/PPC/SSE2/SSSE3/SSE4/AVX2.
|
|
2641
|
+
|
|
2642
|
+
// All targets except HWY_SCALAR use the implementations of
|
|
2643
|
+
// detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at
|
|
2644
|
+
// least some of the PromoteEvenTo and PromoteOddTo cases.
|
|
2645
|
+
|
|
2646
|
+
// Signed to signed PromoteEvenTo/PromoteOddTo
|
|
2647
|
+
template <size_t kToLaneSize, class D, class V>
|
|
2648
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(
|
|
2649
|
+
hwy::SignedTag /*to_type_tag*/,
|
|
2650
|
+
hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
|
|
2651
|
+
hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
|
|
2652
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
2653
|
+
// On little-endian targets, need to shift each lane of the bitcasted vector
|
|
2654
|
+
// left by kToLaneSize * 4 bits to get the bits of the even source lanes into
|
|
2655
|
+
// the upper kToLaneSize * 4 bits of even_in_hi.
|
|
2656
|
+
const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
|
|
2657
|
+
#else
|
|
2658
|
+
// On big-endian targets, the bits of the even source lanes are already in
|
|
2659
|
+
// the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
|
|
2660
|
+
const auto even_in_hi = BitCast(d_to, v);
|
|
2661
|
+
#endif
|
|
2662
|
+
|
|
2663
|
+
// Right-shift even_in_hi by kToLaneSize * 4 bits
|
|
2664
|
+
return ShiftRight<kToLaneSize * 4>(even_in_hi);
|
|
2665
|
+
}
|
|
2666
|
+
|
|
2667
|
+
template <size_t kToLaneSize, class D, class V>
|
|
2668
|
+
HWY_INLINE VFromD<D> PromoteOddTo(
|
|
2669
|
+
hwy::SignedTag /*to_type_tag*/,
|
|
2670
|
+
hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
|
|
2671
|
+
hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
|
|
2672
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
2673
|
+
// On little-endian targets, the bits of the odd source lanes are already in
|
|
2674
|
+
// the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
|
|
2675
|
+
const auto odd_in_hi = BitCast(d_to, v);
|
|
2676
|
+
#else
|
|
2677
|
+
// On big-endian targets, need to shift each lane of the bitcasted vector left
|
|
2678
|
+
// by kToLaneSize * 4 bits to get the bits of the odd source lanes into the
|
|
2679
|
+
// upper kToLaneSize * 4 bits of odd_in_hi.
|
|
2680
|
+
const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
|
|
2681
|
+
#endif
|
|
2682
|
+
|
|
2683
|
+
// Right-shift odd_in_hi by kToLaneSize * 4 bits
|
|
2684
|
+
return ShiftRight<kToLaneSize * 4>(odd_in_hi);
|
|
2685
|
+
}
|
|
2686
|
+
|
|
2687
|
+
// Unsigned to unsigned PromoteEvenTo/PromoteOddTo
|
|
2688
|
+
template <size_t kToLaneSize, class D, class V>
|
|
2689
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(
|
|
2690
|
+
hwy::UnsignedTag /*to_type_tag*/,
|
|
2691
|
+
hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
|
|
2692
|
+
hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
|
|
2693
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
2694
|
+
// On little-endian targets, the bits of the even source lanes are already
|
|
2695
|
+
// in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
|
|
2696
|
+
|
|
2697
|
+
// Simply need to zero out the upper bits of each lane of the bitcasted
|
|
2698
|
+
// vector.
|
|
2699
|
+
return And(BitCast(d_to, v),
|
|
2700
|
+
Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
|
|
2701
|
+
#else
|
|
2702
|
+
// On big-endian targets, need to shift each lane of the bitcasted vector
|
|
2703
|
+
// right by kToLaneSize * 4 bits to get the bits of the even source lanes into
|
|
2704
|
+
// the lower kToLaneSize * 4 bits of the result.
|
|
2705
|
+
|
|
2706
|
+
// The right shift below will zero out the upper kToLaneSize * 4 bits of the
|
|
2707
|
+
// result.
|
|
2708
|
+
return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
|
|
2709
|
+
#endif
|
|
2710
|
+
}
|
|
2711
|
+
|
|
2712
|
+
template <size_t kToLaneSize, class D, class V>
|
|
2713
|
+
HWY_INLINE VFromD<D> PromoteOddTo(
|
|
2714
|
+
hwy::UnsignedTag /*to_type_tag*/,
|
|
2715
|
+
hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
|
|
2716
|
+
hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
|
|
2717
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
2718
|
+
// On little-endian targets, need to shift each lane of the bitcasted vector
|
|
2719
|
+
// right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
|
|
2720
|
+
// the lower kToLaneSize * 4 bits of the result.
|
|
2721
|
+
|
|
2722
|
+
// The right shift below will zero out the upper kToLaneSize * 4 bits of the
|
|
2723
|
+
// result.
|
|
2724
|
+
return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
|
|
2725
|
+
#else
|
|
2726
|
+
// On big-endian targets, the bits of the even source lanes are already
|
|
2727
|
+
// in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
|
|
2728
|
+
|
|
2729
|
+
// Simply need to zero out the upper bits of each lane of the bitcasted
|
|
2730
|
+
// vector.
|
|
2731
|
+
return And(BitCast(d_to, v),
|
|
2732
|
+
Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
|
|
2733
|
+
#endif
|
|
2734
|
+
}
|
|
2735
|
+
|
|
2736
|
+
// Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
|
|
2737
|
+
// followed by BitCast to signed
|
|
2738
|
+
template <size_t kToLaneSize, class D, class V>
|
|
2739
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(
|
|
2740
|
+
hwy::SignedTag /*to_type_tag*/,
|
|
2741
|
+
hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
|
|
2742
|
+
hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
|
|
2743
|
+
const RebindToUnsigned<decltype(d_to)> du_to;
|
|
2744
|
+
return BitCast(d_to,
|
|
2745
|
+
PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
|
|
2746
|
+
hwy::UnsignedTag(), du_to, v));
|
|
2747
|
+
}
|
|
2748
|
+
|
|
2749
|
+
template <size_t kToLaneSize, class D, class V>
|
|
2750
|
+
HWY_INLINE VFromD<D> PromoteOddTo(
|
|
2751
|
+
hwy::SignedTag /*to_type_tag*/,
|
|
2752
|
+
hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
|
|
2753
|
+
hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
|
|
2754
|
+
const RebindToUnsigned<decltype(d_to)> du_to;
|
|
2755
|
+
return BitCast(d_to,
|
|
2756
|
+
PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
|
|
2757
|
+
hwy::UnsignedTag(), du_to, v));
|
|
2758
|
+
}
|
|
2759
|
+
|
|
2760
|
+
// BF16->F32 PromoteEvenTo
|
|
2761
|
+
|
|
2762
|
+
// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
|
|
2763
|
+
// instead of hwy::FloatTag on targets that use scalable vectors.
|
|
2764
|
+
|
|
2765
|
+
// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
|
|
2766
|
+
// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
|
|
2767
|
+
|
|
2768
|
+
// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
|
|
2769
|
+
// to be a bfloat16_t vector.
|
|
2770
|
+
template <class FromTypeTag, class DF32, class VBF16,
|
|
2771
|
+
class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
|
|
2772
|
+
hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
|
|
2773
|
+
HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
|
|
2774
|
+
hwy::SizeTag<4> /*to_lane_size_tag*/,
|
|
2775
|
+
FromTypeTag /*from_type_tag*/, DF32 d_to,
|
|
2776
|
+
VBF16 v) {
|
|
2777
|
+
const RebindToUnsigned<decltype(d_to)> du_to;
|
|
2778
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
2779
|
+
// On little-endian platforms, need to shift left each lane of the bitcasted
|
|
2780
|
+
// vector by 16 bits.
|
|
2781
|
+
return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
|
|
2782
|
+
#else
|
|
2783
|
+
// On big-endian platforms, the even lanes of the source vector are already
|
|
2784
|
+
// in the upper 16 bits of the lanes of the bitcasted vector.
|
|
2785
|
+
|
|
2786
|
+
// Need to simply zero out the lower 16 bits of each lane of the bitcasted
|
|
2787
|
+
// vector.
|
|
2788
|
+
return BitCast(d_to,
|
|
2789
|
+
And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
|
|
2790
|
+
#endif
|
|
2791
|
+
}
|
|
2792
|
+
|
|
2793
|
+
// BF16->F32 PromoteOddTo
|
|
2794
|
+
|
|
2795
|
+
// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
|
|
2796
|
+
// instead of hwy::FloatTag on targets that use scalable vectors.
|
|
2797
|
+
|
|
2798
|
+
// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
|
|
2799
|
+
// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
|
|
2800
|
+
|
|
2801
|
+
// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
|
|
2802
|
+
// to be a bfloat16_t vector.
|
|
2803
|
+
template <class FromTypeTag, class DF32, class VBF16,
|
|
2804
|
+
class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
|
|
2805
|
+
hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
|
|
2806
|
+
HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
|
|
2807
|
+
hwy::SizeTag<4> /*to_lane_size_tag*/,
|
|
2808
|
+
FromTypeTag /*from_type_tag*/, DF32 d_to,
|
|
2809
|
+
VBF16 v) {
|
|
2810
|
+
const RebindToUnsigned<decltype(d_to)> du_to;
|
|
2811
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
2812
|
+
// On little-endian platforms, the odd lanes of the source vector are already
|
|
2813
|
+
// in the upper 16 bits of the lanes of the bitcasted vector.
|
|
2814
|
+
|
|
2815
|
+
// Need to simply zero out the lower 16 bits of each lane of the bitcasted
|
|
2816
|
+
// vector.
|
|
2817
|
+
return BitCast(d_to,
|
|
2818
|
+
And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
|
|
2819
|
+
#else
|
|
2820
|
+
// On big-endian platforms, need to shift left each lane of the bitcasted
|
|
2821
|
+
// vector by 16 bits.
|
|
2822
|
+
return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
|
|
2823
|
+
#endif
|
|
2824
|
+
}
|
|
2825
|
+
|
|
2826
|
+
// Default PromoteEvenTo/PromoteOddTo implementations
|
|
2827
|
+
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
|
|
2828
|
+
class V, HWY_IF_LANES_D(D, 1)>
|
|
2829
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(
|
|
2830
|
+
ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
|
|
2831
|
+
FromTypeTag /*from_type_tag*/, D d_to, V v) {
|
|
2832
|
+
return PromoteLowerTo(d_to, v);
|
|
2833
|
+
}
|
|
2834
|
+
|
|
2835
|
+
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
|
|
2836
|
+
class V, HWY_IF_LANES_GT_D(D, 1)>
|
|
2837
|
+
HWY_INLINE VFromD<D> PromoteEvenTo(
|
|
2838
|
+
ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
|
|
2839
|
+
FromTypeTag /*from_type_tag*/, D d_to, V v) {
|
|
2840
|
+
const DFromV<decltype(v)> d;
|
|
2841
|
+
return PromoteLowerTo(d_to, ConcatEven(d, v, v));
|
|
2842
|
+
}
|
|
2843
|
+
|
|
2844
|
+
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
|
|
2845
|
+
class V>
|
|
2846
|
+
HWY_INLINE VFromD<D> PromoteOddTo(
|
|
2847
|
+
ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
|
|
2848
|
+
FromTypeTag /*from_type_tag*/, D d_to, V v) {
|
|
2849
|
+
const DFromV<decltype(v)> d;
|
|
2850
|
+
return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
|
|
2851
|
+
}
|
|
2852
|
+
|
|
2853
|
+
} // namespace detail
|
|
2854
|
+
|
|
2855
|
+
template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
|
|
2856
|
+
class V2 = VFromD<Repartition<TFromV<V>, D>>,
|
|
2857
|
+
HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
|
|
2858
|
+
HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
|
|
2859
|
+
return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
|
|
2860
|
+
hwy::SizeTag<sizeof(TFromD<D>)>(),
|
|
2861
|
+
hwy::TypeTag<TFromV<V>>(), d, v);
|
|
2862
|
+
}
|
|
2863
|
+
|
|
2864
|
+
template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
|
|
2865
|
+
class V2 = VFromD<Repartition<TFromV<V>, D>>,
|
|
2866
|
+
HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
|
|
2867
|
+
HWY_API VFromD<D> PromoteOddTo(D d, V v) {
|
|
2868
|
+
return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
|
|
2869
|
+
hwy::SizeTag<sizeof(TFromD<D>)>(),
|
|
2870
|
+
hwy::TypeTag<TFromV<V>>(), d, v);
|
|
2871
|
+
}
|
|
2872
|
+
#endif // HWY_TARGET != HWY_SCALAR
|
|
2873
|
+
|
|
1926
2874
|
// ------------------------------ float16_t <-> float
|
|
1927
2875
|
|
|
1928
2876
|
#if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
|
|
@@ -1956,41 +2904,237 @@ HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
|
|
|
1956
2904
|
|
|
1957
2905
|
template <class D, HWY_IF_F16_D(D)>
|
|
1958
2906
|
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
|
|
1959
|
-
const
|
|
1960
|
-
const Rebind<
|
|
1961
|
-
const
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
const
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
|
|
1987
|
-
|
|
1988
|
-
const
|
|
1989
|
-
|
|
2907
|
+
const RebindToSigned<decltype(df16)> di16;
|
|
2908
|
+
const Rebind<int32_t, decltype(df16)> di32;
|
|
2909
|
+
const RebindToFloat<decltype(di32)> df32;
|
|
2910
|
+
const RebindToUnsigned<decltype(df32)> du32;
|
|
2911
|
+
|
|
2912
|
+
// There are 23 fractional bits (plus the implied 1 bit) in the mantissa of
|
|
2913
|
+
// a F32, and there are 10 fractional bits (plus the implied 1 bit) in the
|
|
2914
|
+
// mantissa of a F16
|
|
2915
|
+
|
|
2916
|
+
// We want the unbiased exponent of round_incr[i] to be at least (-14) + 13 as
|
|
2917
|
+
// 2^(-14) is the smallest positive normal F16 value and as we want 13
|
|
2918
|
+
// mantissa bits (including the implicit 1 bit) to the left of the
|
|
2919
|
+
// F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13
|
|
2920
|
+
|
|
2921
|
+
// The biased exponent of round_incr[i] needs to be at least 126 as
|
|
2922
|
+
// (-14) + 13 + 127 is equal to 126
|
|
2923
|
+
|
|
2924
|
+
// We also want to biased exponent of round_incr[i] to be less than or equal
|
|
2925
|
+
// to 255 (which is equal to MaxExponentField<float>())
|
|
2926
|
+
|
|
2927
|
+
// The biased F64 exponent of round_incr is equal to
|
|
2928
|
+
// HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126)
|
|
2929
|
+
|
|
2930
|
+
// hi9_bits[i] is equal to the upper 9 bits of v[i]
|
|
2931
|
+
const auto hi9_bits = ShiftRight<23>(BitCast(du32, v));
|
|
2932
|
+
|
|
2933
|
+
const auto k13 = Set(du32, uint32_t{13u});
|
|
2934
|
+
|
|
2935
|
+
// Minimum biased F32 exponent of round_incr
|
|
2936
|
+
const auto k126 = Set(du32, uint32_t{126u});
|
|
2937
|
+
|
|
2938
|
+
// round_incr_hi9_bits[i] is equivalent to
|
|
2939
|
+
// (hi9_bits[i] & 0x100) |
|
|
2940
|
+
// HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)
|
|
2941
|
+
|
|
2942
|
+
#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
|
|
2943
|
+
const auto k255 = Set(du32, uint32_t{255u});
|
|
2944
|
+
const auto round_incr_hi9_bits = BitwiseIfThenElse(
|
|
2945
|
+
k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits);
|
|
2946
|
+
#else
|
|
2947
|
+
// On targets other than SCALAR and EMU128, the exponent bits of hi9_bits can
|
|
2948
|
+
// be incremented by 13 and clamped to the [13, 255] range without overflowing
|
|
2949
|
+
// into the sign bit of hi9_bits by using U8 SaturatedAdd as there are 8
|
|
2950
|
+
// exponent bits in an F32
|
|
2951
|
+
|
|
2952
|
+
// U8 Max can be used on targets other than SCALAR and EMU128 to clamp
|
|
2953
|
+
// ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the sign
|
|
2954
|
+
// bit
|
|
2955
|
+
|
|
2956
|
+
const Repartition<uint8_t, decltype(du32)> du32_as_u8;
|
|
2957
|
+
const auto round_incr_hi9_bits = BitCast(
|
|
2958
|
+
du32,
|
|
2959
|
+
Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)),
|
|
2960
|
+
BitCast(du32_as_u8, k126)));
|
|
2961
|
+
#endif
|
|
2962
|
+
|
|
2963
|
+
// (round_incr_hi9_bits >> 8) is equal to (hi9_bits >> 8), and
|
|
2964
|
+
// (round_incr_hi9_bits & 0xFF) is equal to
|
|
2965
|
+
// HWY_MAX(HWY_MIN((round_incr_hi9_bits & 0xFF) + 13, 255), 126)
|
|
2966
|
+
|
|
2967
|
+
const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits));
|
|
2968
|
+
|
|
2969
|
+
// Add round_incr[i] to v[i] to round the mantissa to the nearest F16 mantissa
|
|
2970
|
+
// and to move the fractional bits of the resulting non-NaN mantissa down to
|
|
2971
|
+
// the lower 10 bits of rounded_val if (v[i] + round_incr[i]) is a non-NaN
|
|
2972
|
+
// value
|
|
2973
|
+
const auto rounded_val = Add(v, round_incr);
|
|
2974
|
+
|
|
2975
|
+
// rounded_val_bits is the bits of rounded_val as a U32
|
|
2976
|
+
const auto rounded_val_bits = BitCast(du32, rounded_val);
|
|
2977
|
+
|
|
2978
|
+
// rounded_val[i] is known to have the same biased exponent as round_incr[i]
|
|
2979
|
+
// as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite
|
|
2980
|
+
// value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]|
|
|
2981
|
+
// is either a power of 2 that is greater than or equal to 2^-1 or infinity.
|
|
2982
|
+
|
|
2983
|
+
// If rounded_val[i] is a finite F32 value, then
|
|
2984
|
+
// (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the
|
|
2985
|
+
// rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is
|
|
2986
|
+
// in the range [0, 2].
|
|
2987
|
+
|
|
2988
|
+
// In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and 0x0800,
|
|
2989
|
+
// with (rounded_val_bits[i] & 0x000003FF) being the fractional bits of the
|
|
2990
|
+
// resulting F16 mantissa, if rounded_v[i] is a finite F32 value.
|
|
2991
|
+
|
|
2992
|
+
// (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if
|
|
2993
|
+
// rounded_val[i] is a non-NaN value
|
|
2994
|
+
|
|
2995
|
+
// The biased exponent of rounded_val[i] is guaranteed to be at least 126 as
|
|
2996
|
+
// the biased exponent of round_incr[i] is at least 126 and as both v[i] and
|
|
2997
|
+
// round_incr[i] have the same sign bit
|
|
2998
|
+
|
|
2999
|
+
// The ULP of a F32 value with a biased exponent of 126 is equal to
|
|
3000
|
+
// 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a
|
|
3001
|
+
// F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to
|
|
3002
|
+
// -24)
|
|
3003
|
+
|
|
3004
|
+
// The biased exponent (before subtracting by 126) needs to be clamped to the
|
|
3005
|
+
// [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest
|
|
3006
|
+
// biased exponent of a F16.
|
|
3007
|
+
|
|
3008
|
+
// The biased exponent of the resulting F16 value is equal to
|
|
3009
|
+
// HWY_MIN((round_incr_hi9_bits[i] & 0xFF) +
|
|
3010
|
+
// ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126
|
|
3011
|
+
|
|
3012
|
+
#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
|
|
3013
|
+
auto f16_exp_bits =
|
|
3014
|
+
Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)),
|
|
3015
|
+
And(rounded_val_bits,
|
|
3016
|
+
Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))),
|
|
3017
|
+
Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10)));
|
|
3018
|
+
#else
|
|
3019
|
+
auto f16_exp_bits = ShiftLeft<10>(BitCast(
|
|
3020
|
+
du32,
|
|
3021
|
+
Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits),
|
|
3022
|
+
BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
|
|
3023
|
+
BitCast(du32_as_u8, Set(du32, uint32_t{157})))));
|
|
3024
|
+
#endif
|
|
3025
|
+
|
|
3026
|
+
f16_exp_bits =
|
|
3027
|
+
Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10)));
|
|
3028
|
+
|
|
3029
|
+
const auto f16_unmasked_mant_bits =
|
|
3030
|
+
BitCast(di32, Or(rounded_val, VecFromMask(df32, IsNaN(rounded_val))));
|
|
3031
|
+
|
|
3032
|
+
const auto f16_exp_mant_bits =
|
|
3033
|
+
OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
|
|
3034
|
+
Set(di32, int32_t{0x03FF}));
|
|
3035
|
+
|
|
3036
|
+
// f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17
|
|
3037
|
+
// bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow
|
|
3038
|
+
// efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo
|
|
3039
|
+
// operation
|
|
3040
|
+
const auto f16_bits_as_i32 =
|
|
3041
|
+
OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)),
|
|
3042
|
+
Set(di32, static_cast<int32_t>(0xFFFF8000u)));
|
|
3043
|
+
return BitCast(df16, DemoteTo(di16, f16_bits_as_i32));
|
|
1990
3044
|
}
|
|
1991
3045
|
|
|
1992
3046
|
#endif // HWY_NATIVE_F16C
|
|
1993
3047
|
|
|
3048
|
+
// ------------------------------ F64->F16 DemoteTo
|
|
3049
|
+
#if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
|
|
3050
|
+
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
3051
|
+
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
3052
|
+
#else
|
|
3053
|
+
#define HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
3054
|
+
#endif
|
|
3055
|
+
|
|
3056
|
+
#if HWY_HAVE_FLOAT64
|
|
3057
|
+
template <class D, HWY_IF_F16_D(D)>
|
|
3058
|
+
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
|
|
3059
|
+
const Rebind<double, D> df64;
|
|
3060
|
+
const Rebind<uint64_t, D> du64;
|
|
3061
|
+
const Rebind<float, D> df32;
|
|
3062
|
+
|
|
3063
|
+
// The mantissa bits of v[i] are first rounded using round-to-odd rounding to
|
|
3064
|
+
// the nearest F64 value that has the lower 29 bits zeroed out to ensure that
|
|
3065
|
+
// the result is correctly rounded to a F16.
|
|
3066
|
+
|
|
3067
|
+
const auto vf64_rounded = OrAnd(
|
|
3068
|
+
And(v,
|
|
3069
|
+
BitCast(df64, Set(du64, static_cast<uint64_t>(0xFFFFFFFFE0000000u)))),
|
|
3070
|
+
BitCast(df64, Add(BitCast(du64, v),
|
|
3071
|
+
Set(du64, static_cast<uint64_t>(0x000000001FFFFFFFu)))),
|
|
3072
|
+
BitCast(df64, Set(du64, static_cast<uint64_t>(0x0000000020000000ULL))));
|
|
3073
|
+
|
|
3074
|
+
return DemoteTo(df16, DemoteTo(df32, vf64_rounded));
|
|
3075
|
+
}
|
|
3076
|
+
#endif // HWY_HAVE_FLOAT64
|
|
3077
|
+
|
|
3078
|
+
#endif // HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
3079
|
+
|
|
3080
|
+
// ------------------------------ F16->F64 PromoteTo
|
|
3081
|
+
#if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
|
|
3082
|
+
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
3083
|
+
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
3084
|
+
#else
|
|
3085
|
+
#define HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
3086
|
+
#endif
|
|
3087
|
+
|
|
3088
|
+
#if HWY_HAVE_FLOAT64
|
|
3089
|
+
template <class D, HWY_IF_F64_D(D)>
|
|
3090
|
+
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
|
|
3091
|
+
return PromoteTo(df64, PromoteTo(Rebind<float, D>(), v));
|
|
3092
|
+
}
|
|
3093
|
+
#endif // HWY_HAVE_FLOAT64
|
|
3094
|
+
|
|
3095
|
+
#endif // HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
3096
|
+
|
|
3097
|
+
// ------------------------------ SumsOf2
|
|
3098
|
+
|
|
3099
|
+
#if HWY_TARGET != HWY_SCALAR
|
|
3100
|
+
namespace detail {
|
|
3101
|
+
|
|
3102
|
+
template <class TypeTag, size_t kLaneSize, class V>
|
|
3103
|
+
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
|
|
3104
|
+
TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
|
|
3105
|
+
const DFromV<decltype(v)> d;
|
|
3106
|
+
const RepartitionToWide<decltype(d)> dw;
|
|
3107
|
+
return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v));
|
|
3108
|
+
}
|
|
3109
|
+
|
|
3110
|
+
} // namespace detail
|
|
3111
|
+
|
|
3112
|
+
template <class V>
|
|
3113
|
+
HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(V v) {
|
|
3114
|
+
return detail::SumsOf2(hwy::TypeTag<TFromV<V>>(),
|
|
3115
|
+
hwy::SizeTag<sizeof(TFromV<V>)>(), v);
|
|
3116
|
+
}
|
|
3117
|
+
#endif // HWY_TARGET != HWY_SCALAR
|
|
3118
|
+
|
|
3119
|
+
// ------------------------------ SumsOf4
|
|
3120
|
+
|
|
3121
|
+
namespace detail {
|
|
3122
|
+
|
|
3123
|
+
template <class TypeTag, size_t kLaneSize, class V>
|
|
3124
|
+
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
|
|
3125
|
+
TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
|
|
3126
|
+
using hwy::HWY_NAMESPACE::SumsOf2;
|
|
3127
|
+
return SumsOf2(SumsOf2(v));
|
|
3128
|
+
}
|
|
3129
|
+
|
|
3130
|
+
} // namespace detail
|
|
3131
|
+
|
|
3132
|
+
template <class V>
|
|
3133
|
+
HWY_API VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(V v) {
|
|
3134
|
+
return detail::SumsOf4(hwy::TypeTag<TFromV<V>>(),
|
|
3135
|
+
hwy::SizeTag<sizeof(TFromV<V>)>(), v);
|
|
3136
|
+
}
|
|
3137
|
+
|
|
1994
3138
|
// ------------------------------ OrderedTruncate2To
|
|
1995
3139
|
|
|
1996
3140
|
#if HWY_IDE || \
|
|
@@ -2206,8 +3350,7 @@ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
|
|
|
2206
3350
|
#if HWY_TARGET == HWY_SCALAR
|
|
2207
3351
|
const uint64_t u64_val = GetLane(v);
|
|
2208
3352
|
const float f32_val = static_cast<float>(u64_val);
|
|
2209
|
-
uint32_t f32_bits;
|
|
2210
|
-
CopySameSize(&f32_val, &f32_bits);
|
|
3353
|
+
const uint32_t f32_bits = BitCastScalar<uint32_t>(f32_val);
|
|
2211
3354
|
return Set(d, static_cast<uint64_t>(f32_bits >> 23));
|
|
2212
3355
|
#else
|
|
2213
3356
|
const Repartition<uint32_t, decltype(d)> du32;
|
|
@@ -2320,30 +3463,29 @@ HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
|
|
|
2320
3463
|
|
|
2321
3464
|
// Change polynomial basis to GF(2^4)
|
|
2322
3465
|
{
|
|
2323
|
-
|
|
2324
|
-
0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
|
|
2328
|
-
|
|
3466
|
+
const VFromD<decltype(du)> basisL =
|
|
3467
|
+
Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
|
|
3468
|
+
0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA);
|
|
3469
|
+
const VFromD<decltype(du)> basisU =
|
|
3470
|
+
Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
|
|
3471
|
+
0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD);
|
|
2329
3472
|
const auto sL = And(state, mask);
|
|
2330
3473
|
const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
|
|
2331
|
-
const auto gf4L = TableLookupBytes(
|
|
2332
|
-
const auto gf4U = TableLookupBytes(
|
|
3474
|
+
const auto gf4L = TableLookupBytes(basisL, sL);
|
|
3475
|
+
const auto gf4U = TableLookupBytes(basisU, sU);
|
|
2333
3476
|
state = Xor(gf4L, gf4U);
|
|
2334
3477
|
}
|
|
2335
3478
|
|
|
2336
3479
|
// Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
|
|
2337
3480
|
// cause TableLookupBytesOr0 to return 0.
|
|
2338
|
-
|
|
2339
|
-
0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3
|
|
2340
|
-
|
|
2341
|
-
0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4
|
|
2342
|
-
const auto tbl = LoadDup128(du, kInv);
|
|
3481
|
+
const VFromD<decltype(du)> zetaInv = Dup128VecFromValues(
|
|
3482
|
+
du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3);
|
|
3483
|
+
const VFromD<decltype(du)> tbl = Dup128VecFromValues(
|
|
3484
|
+
du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4);
|
|
2343
3485
|
const auto sL = And(state, mask); // L=low nibble, U=upper
|
|
2344
3486
|
const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
|
|
2345
3487
|
const auto sX = Xor(sU, sL);
|
|
2346
|
-
const auto invL = TableLookupBytes(
|
|
3488
|
+
const auto invL = TableLookupBytes(zetaInv, sL);
|
|
2347
3489
|
const auto invU = TableLookupBytes(tbl, sU);
|
|
2348
3490
|
const auto invX = TableLookupBytes(tbl, sX);
|
|
2349
3491
|
const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
|
|
@@ -2359,26 +3501,25 @@ HWY_INLINE V SubBytes(V state) {
|
|
|
2359
3501
|
const DFromV<V> du;
|
|
2360
3502
|
// Linear skew (cannot bake 0x63 bias into the table because out* indices
|
|
2361
3503
|
// may have the infinity flag set).
|
|
2362
|
-
|
|
2363
|
-
0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
|
|
2364
|
-
|
|
2365
|
-
|
|
2366
|
-
0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
|
|
2367
|
-
|
|
2368
|
-
return Xor(SubBytesMulInverseAndAffineLookup(state,
|
|
2369
|
-
LoadDup128(du, kAffineU)),
|
|
3504
|
+
const VFromD<decltype(du)> affineL =
|
|
3505
|
+
Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
|
|
3506
|
+
0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15);
|
|
3507
|
+
const VFromD<decltype(du)> affineU =
|
|
3508
|
+
Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
|
|
3509
|
+
0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E);
|
|
3510
|
+
return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU),
|
|
2370
3511
|
Set(du, uint8_t{0x63}));
|
|
2371
3512
|
}
|
|
2372
3513
|
|
|
2373
3514
|
template <class V> // u8
|
|
2374
3515
|
HWY_INLINE V InvSubBytes(V state) {
|
|
2375
3516
|
const DFromV<V> du;
|
|
2376
|
-
|
|
2377
|
-
0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
|
|
2378
|
-
|
|
2379
|
-
|
|
2380
|
-
0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
|
|
2381
|
-
|
|
3517
|
+
const VFromD<decltype(du)> gF2P4InvToGF2P8InvL =
|
|
3518
|
+
Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
|
|
3519
|
+
0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7);
|
|
3520
|
+
const VFromD<decltype(du)> gF2P4InvToGF2P8InvU =
|
|
3521
|
+
Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
|
|
3522
|
+
0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA);
|
|
2382
3523
|
|
|
2383
3524
|
// Apply the inverse affine transformation
|
|
2384
3525
|
const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
|
|
@@ -2392,16 +3533,14 @@ HWY_INLINE V InvSubBytes(V state) {
|
|
|
2392
3533
|
// - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
|
|
2393
3534
|
// multiplicative inverse through table lookups using the
|
|
2394
3535
|
// kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables
|
|
2395
|
-
return SubBytesMulInverseAndAffineLookup(
|
|
2396
|
-
|
|
2397
|
-
LoadDup128(du, kGF2P4InvToGF2P8InvU));
|
|
3536
|
+
return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
|
|
3537
|
+
gF2P4InvToGF2P8InvU);
|
|
2398
3538
|
}
|
|
2399
3539
|
|
|
2400
3540
|
} // namespace detail
|
|
2401
3541
|
|
|
2402
3542
|
#endif // HWY_TARGET != HWY_SCALAR
|
|
2403
3543
|
|
|
2404
|
-
// "Include guard": skip if native AES instructions are available.
|
|
2405
3544
|
#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
|
|
2406
3545
|
#ifdef HWY_NATIVE_AES
|
|
2407
3546
|
#undef HWY_NATIVE_AES
|
|
@@ -2417,24 +3556,18 @@ namespace detail {
|
|
|
2417
3556
|
template <class V> // u8
|
|
2418
3557
|
HWY_INLINE V ShiftRows(const V state) {
|
|
2419
3558
|
const DFromV<V> du;
|
|
2420
|
-
|
|
2421
|
-
|
|
2422
|
-
4,
|
|
2423
|
-
8, 13, 2, 7, //
|
|
2424
|
-
12, 1, 6, 11};
|
|
2425
|
-
const auto shift_row = LoadDup128(du, kShiftRow);
|
|
3559
|
+
// transposed: state is column major
|
|
3560
|
+
const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
|
|
3561
|
+
du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11);
|
|
2426
3562
|
return TableLookupBytes(state, shift_row);
|
|
2427
3563
|
}
|
|
2428
3564
|
|
|
2429
3565
|
template <class V> // u8
|
|
2430
3566
|
HWY_INLINE V InvShiftRows(const V state) {
|
|
2431
3567
|
const DFromV<V> du;
|
|
2432
|
-
|
|
2433
|
-
|
|
2434
|
-
4,
|
|
2435
|
-
8, 5, 2, 15, //
|
|
2436
|
-
12, 9, 6, 3};
|
|
2437
|
-
const auto shift_row = LoadDup128(du, kShiftRow);
|
|
3568
|
+
// transposed: state is column major
|
|
3569
|
+
const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
|
|
3570
|
+
du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3);
|
|
2438
3571
|
return TableLookupBytes(state, shift_row);
|
|
2439
3572
|
}
|
|
2440
3573
|
|
|
@@ -2455,15 +3588,15 @@ HWY_INLINE V MixColumns(const V state) {
|
|
|
2455
3588
|
// 1 2 3 1 // d are on diagonal, no permutation needed.
|
|
2456
3589
|
// 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
|
|
2457
3590
|
// 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
|
|
2458
|
-
|
|
2459
|
-
2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
|
|
2460
|
-
|
|
2461
|
-
1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
|
|
3591
|
+
const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
|
|
3592
|
+
du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
|
|
3593
|
+
const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
|
|
3594
|
+
du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
|
|
2462
3595
|
const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8).
|
|
2463
|
-
const auto s2301 = TableLookupBytes(state,
|
|
3596
|
+
const auto s2301 = TableLookupBytes(state, v2301);
|
|
2464
3597
|
const auto d_s2301 = Xor(d, s2301);
|
|
2465
3598
|
const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
|
|
2466
|
-
const auto t1230_s3012 = TableLookupBytes(t_s2301,
|
|
3599
|
+
const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230);
|
|
2467
3600
|
return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
|
|
2468
3601
|
}
|
|
2469
3602
|
|
|
@@ -2475,11 +3608,10 @@ HWY_INLINE V InvMixColumns(const V state) {
|
|
|
2475
3608
|
// 9 14 11 13
|
|
2476
3609
|
// 13 9 14 11
|
|
2477
3610
|
// 11 13 9 14
|
|
2478
|
-
|
|
2479
|
-
2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
|
|
2480
|
-
|
|
2481
|
-
1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
|
|
2482
|
-
const auto v1230 = LoadDup128(du, k1230);
|
|
3611
|
+
const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
|
|
3612
|
+
du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
|
|
3613
|
+
const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
|
|
3614
|
+
du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
|
|
2483
3615
|
|
|
2484
3616
|
const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */
|
|
2485
3617
|
const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */
|
|
@@ -2491,8 +3623,7 @@ HWY_INLINE V InvMixColumns(const V state) {
|
|
|
2491
3623
|
|
|
2492
3624
|
const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230));
|
|
2493
3625
|
const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230));
|
|
2494
|
-
const auto sx13_2301_sx9_3012 =
|
|
2495
|
-
TableLookupBytes(sx13_0123_sx9_1230, LoadDup128(du, k2301));
|
|
3626
|
+
const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301);
|
|
2496
3627
|
return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012);
|
|
2497
3628
|
}
|
|
2498
3629
|
|
|
@@ -2543,15 +3674,15 @@ HWY_API V AESLastRoundInv(V state, const V round_key) {
|
|
|
2543
3674
|
|
|
2544
3675
|
template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
|
|
2545
3676
|
HWY_API V AESKeyGenAssist(V v) {
|
|
2546
|
-
alignas(16) static constexpr uint8_t kRconXorMask[16] = {
|
|
2547
|
-
0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0};
|
|
2548
|
-
alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
|
|
2549
|
-
4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
|
|
2550
3677
|
const DFromV<decltype(v)> d;
|
|
3678
|
+
const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0,
|
|
3679
|
+
0, 0, kRcon, 0, 0, 0);
|
|
3680
|
+
const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12,
|
|
3681
|
+
13, 14, 15, 13, 14, 15, 12);
|
|
2551
3682
|
const auto sub_word_result = detail::SubBytes(v);
|
|
2552
3683
|
const auto rot_word_result =
|
|
2553
|
-
TableLookupBytes(sub_word_result,
|
|
2554
|
-
return Xor(rot_word_result,
|
|
3684
|
+
TableLookupBytes(sub_word_result, rotWordShuffle);
|
|
3685
|
+
return Xor(rot_word_result, rconXorMask);
|
|
2555
3686
|
}
|
|
2556
3687
|
|
|
2557
3688
|
// Constant-time implementation inspired by
|
|
@@ -2602,203 +3733,751 @@ HWY_API V CLMulUpper(V a, V b) {
|
|
|
2602
3733
|
const auto b2 = And(b, k4);
|
|
2603
3734
|
const auto b3 = And(b, k8);
|
|
2604
3735
|
|
|
2605
|
-
auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
|
|
2606
|
-
auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
|
|
2607
|
-
auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
|
|
2608
|
-
auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
|
|
2609
|
-
m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
|
|
2610
|
-
m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
|
|
2611
|
-
m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
|
|
2612
|
-
m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
|
|
2613
|
-
return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
|
|
3736
|
+
auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
|
|
3737
|
+
auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
|
|
3738
|
+
auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
|
|
3739
|
+
auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
|
|
3740
|
+
m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
|
|
3741
|
+
m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
|
|
3742
|
+
m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
|
|
3743
|
+
m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
|
|
3744
|
+
return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
|
|
3745
|
+
}
|
|
3746
|
+
|
|
3747
|
+
#endif // HWY_NATIVE_AES
|
|
3748
|
+
#endif // HWY_TARGET != HWY_SCALAR
|
|
3749
|
+
|
|
3750
|
+
// ------------------------------ PopulationCount
|
|
3751
|
+
|
|
3752
|
+
#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
|
|
3753
|
+
#ifdef HWY_NATIVE_POPCNT
|
|
3754
|
+
#undef HWY_NATIVE_POPCNT
|
|
3755
|
+
#else
|
|
3756
|
+
#define HWY_NATIVE_POPCNT
|
|
3757
|
+
#endif
|
|
3758
|
+
|
|
3759
|
+
// This overload requires vectors to be at least 16 bytes, which is the case
|
|
3760
|
+
// for LMUL >= 2.
|
|
3761
|
+
#undef HWY_IF_POPCNT
|
|
3762
|
+
#if HWY_TARGET == HWY_RVV
|
|
3763
|
+
#define HWY_IF_POPCNT(D) \
|
|
3764
|
+
hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
|
|
3765
|
+
#else
|
|
3766
|
+
// Other targets only have these two overloads which are mutually exclusive, so
|
|
3767
|
+
// no further conditions are required.
|
|
3768
|
+
#define HWY_IF_POPCNT(D) void* = nullptr
|
|
3769
|
+
#endif // HWY_TARGET == HWY_RVV
|
|
3770
|
+
|
|
3771
|
+
template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
|
|
3772
|
+
HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
|
|
3773
|
+
HWY_API V PopulationCount(V v) {
|
|
3774
|
+
const D d;
|
|
3775
|
+
const V lookup =
|
|
3776
|
+
Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
|
|
3777
|
+
const auto lo = And(v, Set(d, uint8_t{0xF}));
|
|
3778
|
+
const auto hi = ShiftRight<4>(v);
|
|
3779
|
+
return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
|
|
3780
|
+
}
|
|
3781
|
+
|
|
3782
|
+
// RVV has a specialization that avoids the Set().
|
|
3783
|
+
#if HWY_TARGET != HWY_RVV
|
|
3784
|
+
// Slower fallback for capped vectors.
|
|
3785
|
+
template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
|
|
3786
|
+
HWY_IF_V_SIZE_LE_D(D, 8)>
|
|
3787
|
+
HWY_API V PopulationCount(V v) {
|
|
3788
|
+
const D d;
|
|
3789
|
+
// See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
|
|
3790
|
+
const V k33 = Set(d, uint8_t{0x33});
|
|
3791
|
+
v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
|
|
3792
|
+
v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
|
|
3793
|
+
return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
|
|
3794
|
+
}
|
|
3795
|
+
#endif // HWY_TARGET != HWY_RVV
|
|
3796
|
+
|
|
3797
|
+
template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
|
|
3798
|
+
HWY_API V PopulationCount(V v) {
|
|
3799
|
+
const D d;
|
|
3800
|
+
const Repartition<uint8_t, decltype(d)> d8;
|
|
3801
|
+
const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
|
|
3802
|
+
return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
|
|
3803
|
+
}
|
|
3804
|
+
|
|
3805
|
+
template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
|
|
3806
|
+
HWY_API V PopulationCount(V v) {
|
|
3807
|
+
const D d;
|
|
3808
|
+
Repartition<uint16_t, decltype(d)> d16;
|
|
3809
|
+
auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
|
|
3810
|
+
return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
|
|
3811
|
+
}
|
|
3812
|
+
|
|
3813
|
+
#if HWY_HAVE_INTEGER64
|
|
3814
|
+
template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
|
|
3815
|
+
HWY_API V PopulationCount(V v) {
|
|
3816
|
+
const D d;
|
|
3817
|
+
Repartition<uint32_t, decltype(d)> d32;
|
|
3818
|
+
auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
|
|
3819
|
+
return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
|
|
3820
|
+
}
|
|
3821
|
+
#endif
|
|
3822
|
+
|
|
3823
|
+
#endif // HWY_NATIVE_POPCNT
|
|
3824
|
+
|
|
3825
|
+
// ------------------------------ 8-bit multiplication
|
|
3826
|
+
|
|
3827
|
+
#if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
|
|
3828
|
+
#ifdef HWY_NATIVE_MUL_8
|
|
3829
|
+
#undef HWY_NATIVE_MUL_8
|
|
3830
|
+
#else
|
|
3831
|
+
#define HWY_NATIVE_MUL_8
|
|
3832
|
+
#endif
|
|
3833
|
+
|
|
3834
|
+
// 8 bit and fits in wider reg: promote
|
|
3835
|
+
template <class V, HWY_IF_T_SIZE_V(V, 1),
|
|
3836
|
+
HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
|
|
3837
|
+
HWY_API V operator*(const V a, const V b) {
|
|
3838
|
+
const DFromV<decltype(a)> d;
|
|
3839
|
+
const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
|
|
3840
|
+
const RebindToUnsigned<decltype(d)> du; // TruncateTo result
|
|
3841
|
+
const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input
|
|
3842
|
+
const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
|
|
3843
|
+
// TruncateTo is cheaper than ConcatEven.
|
|
3844
|
+
return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
|
|
3845
|
+
}
|
|
3846
|
+
|
|
3847
|
+
// 8 bit full reg: promote halves
|
|
3848
|
+
template <class V, HWY_IF_T_SIZE_V(V, 1),
|
|
3849
|
+
HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
|
|
3850
|
+
HWY_API V operator*(const V a, const V b) {
|
|
3851
|
+
const DFromV<decltype(a)> d;
|
|
3852
|
+
const Half<decltype(d)> dh;
|
|
3853
|
+
const Twice<RepartitionToWide<decltype(dh)>> dw;
|
|
3854
|
+
const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
|
|
3855
|
+
const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
|
|
3856
|
+
const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
|
|
3857
|
+
const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
|
|
3858
|
+
const VFromD<decltype(dw)> m0 = a0 * b0;
|
|
3859
|
+
const VFromD<decltype(dw)> m1 = a1 * b1;
|
|
3860
|
+
return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
|
|
3861
|
+
}
|
|
3862
|
+
|
|
3863
|
+
#endif // HWY_NATIVE_MUL_8
|
|
3864
|
+
|
|
3865
|
+
// ------------------------------ 64-bit multiplication
|
|
3866
|
+
|
|
3867
|
+
#if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
|
|
3868
|
+
#ifdef HWY_NATIVE_MUL_64
|
|
3869
|
+
#undef HWY_NATIVE_MUL_64
|
|
3870
|
+
#else
|
|
3871
|
+
#define HWY_NATIVE_MUL_64
|
|
3872
|
+
#endif
|
|
3873
|
+
|
|
3874
|
+
// Single-lane i64 or u64
|
|
3875
|
+
template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
|
|
3876
|
+
HWY_IF_NOT_FLOAT_V(V)>
|
|
3877
|
+
HWY_API V operator*(V x, V y) {
|
|
3878
|
+
const DFromV<V> d;
|
|
3879
|
+
using T = TFromD<decltype(d)>;
|
|
3880
|
+
using TU = MakeUnsigned<T>;
|
|
3881
|
+
const TU xu = static_cast<TU>(GetLane(x));
|
|
3882
|
+
const TU yu = static_cast<TU>(GetLane(y));
|
|
3883
|
+
return Set(d, static_cast<T>(xu * yu));
|
|
3884
|
+
}
|
|
3885
|
+
|
|
3886
|
+
template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
|
|
3887
|
+
HWY_IF_V_SIZE_GT_D(D64, 8)>
|
|
3888
|
+
HWY_API V operator*(V x, V y) {
|
|
3889
|
+
RepartitionToNarrow<D64> d32;
|
|
3890
|
+
auto x32 = BitCast(d32, x);
|
|
3891
|
+
auto y32 = BitCast(d32, y);
|
|
3892
|
+
auto lolo = BitCast(d32, MulEven(x32, y32));
|
|
3893
|
+
auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
|
|
3894
|
+
auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
|
|
3895
|
+
auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
|
|
3896
|
+
return BitCast(D64{}, lolo + hi);
|
|
3897
|
+
}
|
|
3898
|
+
template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
|
|
3899
|
+
HWY_IF_V_SIZE_GT_D(DI64, 8)>
|
|
3900
|
+
HWY_API V operator*(V x, V y) {
|
|
3901
|
+
RebindToUnsigned<DI64> du64;
|
|
3902
|
+
return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
|
|
3903
|
+
}
|
|
3904
|
+
|
|
3905
|
+
#endif // HWY_NATIVE_MUL_64
|
|
3906
|
+
|
|
3907
|
+
// ------------------------------ MulAdd / NegMulAdd
|
|
3908
|
+
|
|
3909
|
+
#if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
|
|
3910
|
+
#ifdef HWY_NATIVE_INT_FMA
|
|
3911
|
+
#undef HWY_NATIVE_INT_FMA
|
|
3912
|
+
#else
|
|
3913
|
+
#define HWY_NATIVE_INT_FMA
|
|
3914
|
+
#endif
|
|
3915
|
+
|
|
3916
|
+
#ifdef HWY_NATIVE_INT_FMSUB
|
|
3917
|
+
#undef HWY_NATIVE_INT_FMSUB
|
|
3918
|
+
#else
|
|
3919
|
+
#define HWY_NATIVE_INT_FMSUB
|
|
3920
|
+
#endif
|
|
3921
|
+
|
|
3922
|
+
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
3923
|
+
HWY_API V MulAdd(V mul, V x, V add) {
|
|
3924
|
+
return Add(Mul(mul, x), add);
|
|
3925
|
+
}
|
|
3926
|
+
|
|
3927
|
+
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
3928
|
+
HWY_API V NegMulAdd(V mul, V x, V add) {
|
|
3929
|
+
return Sub(add, Mul(mul, x));
|
|
3930
|
+
}
|
|
3931
|
+
|
|
3932
|
+
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
3933
|
+
HWY_API V MulSub(V mul, V x, V sub) {
|
|
3934
|
+
return Sub(Mul(mul, x), sub);
|
|
3935
|
+
}
|
|
3936
|
+
#endif // HWY_NATIVE_INT_FMA
|
|
3937
|
+
|
|
3938
|
+
// ------------------------------ Integer MulSub / NegMulSub
|
|
3939
|
+
#if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
|
|
3940
|
+
#ifdef HWY_NATIVE_INT_FMSUB
|
|
3941
|
+
#undef HWY_NATIVE_INT_FMSUB
|
|
3942
|
+
#else
|
|
3943
|
+
#define HWY_NATIVE_INT_FMSUB
|
|
3944
|
+
#endif
|
|
3945
|
+
|
|
3946
|
+
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
3947
|
+
HWY_API V MulSub(V mul, V x, V sub) {
|
|
3948
|
+
const DFromV<decltype(mul)> d;
|
|
3949
|
+
const RebindToSigned<decltype(d)> di;
|
|
3950
|
+
return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub))));
|
|
3951
|
+
}
|
|
3952
|
+
|
|
3953
|
+
#endif // HWY_NATIVE_INT_FMSUB
|
|
3954
|
+
|
|
3955
|
+
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
|
|
3956
|
+
HWY_API V NegMulSub(V mul, V x, V sub) {
|
|
3957
|
+
const DFromV<decltype(mul)> d;
|
|
3958
|
+
const RebindToSigned<decltype(d)> di;
|
|
3959
|
+
|
|
3960
|
+
return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub))));
|
|
3961
|
+
}
|
|
3962
|
+
|
|
3963
|
+
// ------------------------------ MulAddSub
|
|
3964
|
+
|
|
3965
|
+
// MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to
|
|
3966
|
+
// MulSub(mul, x, sub_or_add)
|
|
3967
|
+
template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
|
|
3968
|
+
HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
|
|
3969
|
+
return MulSub(mul, x, sub_or_add);
|
|
3970
|
+
}
|
|
3971
|
+
|
|
3972
|
+
// MulAddSub for F16/F32/F64 vectors with 2 or more lanes on
|
|
3973
|
+
// SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and
|
|
3974
|
+
// x86_512-inl.h
|
|
3975
|
+
template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
|
|
3976
|
+
HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | ((HWY_TARGET <= HWY_SSSE3 &&
|
|
3977
|
+
hwy::IsFloat<TFromV<V>>())
|
|
3978
|
+
? 0
|
|
3979
|
+
: ((1 << 2) | (1 << 4) |
|
|
3980
|
+
(1 << 8))))>
|
|
3981
|
+
HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
|
|
3982
|
+
using D = DFromV<V>;
|
|
3983
|
+
using T = TFromD<D>;
|
|
3984
|
+
using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
|
|
3985
|
+
|
|
3986
|
+
const D d;
|
|
3987
|
+
const Rebind<TNegate, D> d_negate;
|
|
3988
|
+
|
|
3989
|
+
const auto add =
|
|
3990
|
+
OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
|
|
3991
|
+
return MulAdd(mul, x, add);
|
|
3992
|
+
}
|
|
3993
|
+
|
|
3994
|
+
+// ------------------------------ Integer division
+#if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_INT_DIV
+#undef HWY_NATIVE_INT_DIV
+#else
+#define HWY_NATIVE_INT_DIV
+#endif
+
+namespace detail {
+
+template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
+HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) {
+  return ConvertTo(di, vf);
+}
+
+template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
+HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) {
+  return ConvertTo(df, vi);
+}
+
+#if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
+template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
+HWY_INLINE Vec<D> IntDivConvFloatToInt(D df, V vi) {
+  return PromoteTo(df, vi);
+}
+
+// If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32
+// IntDivConvIntToFloat(df, vi) returns an approximation of
+// static_cast<float>(v[i]) that is within 4 ULP of static_cast<float>(v[i])
+template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
+HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) {
+  const Twice<decltype(df32)> dt_f32;
+
+  auto vf32 =
+      ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi));
+
+#if HWY_IS_LITTLE_ENDIAN
+  const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
+  auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
+#else
+  const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
+  auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
+#endif
+
+  const RebindToSigned<decltype(df32)> di32;
+
+  hi_f32 =
+      Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))),
+                      Set(df32, 1.0f)));
+  return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
+}
+
+template <class D, class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)>
+HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vu) {
+  const Twice<decltype(df32)> dt_f32;
+
+  auto vf32 =
+      ConvertTo(dt_f32, BitCast(RebindToUnsigned<decltype(dt_f32)>(), vu));
+
+#if HWY_IS_LITTLE_ENDIAN
+  const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
+  const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
+#else
+  const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
+  const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
+#endif
+
+  return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
+}
+#endif  // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
+
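Note: with no F64 type available, the UI64->F32 helpers above split each 64-bit lane into 32-bit halves, convert each half, and recombine as hi * 2^32 + lo (the final MulAdd). A scalar sketch of the unsigned case (illustrative; it ignores the signed path's sign-bit correction):

    #include <cstdint>

    // Approximate uint64 -> float via two uint32 halves.
    float RefU64ToF32(uint64_t v) {
      const float lo = static_cast<float>(static_cast<uint32_t>(v));
      const float hi = static_cast<float>(static_cast<uint32_t>(v >> 32));
      return hi * 4294967296.0f + lo;  // 2^32
    }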
+template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)>
+HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToFloat<decltype(d)> df;
+
+  // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the
+  // [LimitsMin<SignedFromSize<kOrigLaneSize>>(),
+  // LimitsMax<UnsignedFromSize<kOrigLaneSize>>()] range.
+
+  // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also
+  // guaranteed to be true if MakeFloat<T> has at least kOrigLaneSize*8 + 1
+  // mantissa bits (including the implied one bit), where flt_q is equal to
+  // static_cast<MakeFloat<T>>(a[i]) / static_cast<MakeFloat<T>>(b[i]),
+  // even in the case where the magnitude of an inexact floating point division
+  // result is rounded up.
+
+  // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be
+  // true if (a[i] % b[i]) != 0 is true and MakeFloat<T> has at least
+  // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in
+  // the case where the magnitude of an inexact floating point division result
+  // is rounded up.
+
+#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
+    !HWY_HAVE_FLOAT64
+  // On Armv7, do division by multiplying by the ApproximateReciprocal
+  // to avoid unnecessary overhead as F32 Div refines the approximate
+  // reciprocal using 4 Newton-Raphson iterations
+
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto flt_b = ConvertTo(df, b);
+  auto flt_recip_b = ApproximateReciprocal(flt_b);
+  if (kOrigLaneSize > 1) {
+    flt_recip_b =
+        Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
+  }
+
+  auto q0 = ConvertTo(d, Mul(ConvertTo(df, a), flt_recip_b));
+  const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
+
+  auto r1 = r0;
+
+  // Need to negate r1[i] if a[i] < 0 is true
+  if (IsSigned<TFromV<V>>()) {
+    r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1);
+  }
+
+  // r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i]
+
+  auto abs_b = BitCast(du, b);
+  if (IsSigned<TFromV<V>>()) {
+    abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
+  }
+
+  // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1.
+  // Otherwise, set q1[i] to 0.
+
+  // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single
+  // unsigned comparison as
+  // static_cast<TU>(r1[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
+  // will be true if r1[i] < 0 is true.
+  auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b)));
+
+  // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0
+
+  // Need to negate q1[i] if r0[i] and b[i] do not have the same sign
+  auto q1_negate_mask = r0;
+  if (IsSigned<TFromV<V>>()) {
+    q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b));
+  }
+  q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1);
+
+  // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ?
+  // (((r0[i] ^ b[i]) < 0) ? 1 : -1) : 0
+
+  // Need to subtract q1[i] from q0[i] to get the final result
+  return Sub(q0, BitCast(d, q1));
+#else
+  // On targets other than Armv7 NEON, use F16 or F32 division as most targets
+  // other than Armv7 NEON have native F32 divide instructions
+  return ConvertTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
+#endif
+}
+
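Note: the function above is estimate-then-correct: q0 is a float-based quotient guess, r0 = a - q0*b its remainder, and the tail adds or subtracts 1 whenever the sign-adjusted remainder falls outside [0, |b|). A scalar sketch of that correction step, assuming q0 is within 1 of trunc(a/b) and ignoring INT_MIN corner cases (which the vector code handles via unsigned bit tricks):

    #include <cstdint>

    // One correction step for a quotient guess q0 within 1 of trunc(a/b).
    int32_t RefCorrectQuotient(int32_t a, int32_t b, int32_t q0) {
      const int32_t r0 = a - q0 * b;
      const int32_t r1 = (a < 0) ? -r0 : r0;  // remainder as if a >= 0
      const uint32_t abs_b = (b < 0) ? (0u - static_cast<uint32_t>(b))
                                     : static_cast<uint32_t>(b);
      // r1 < 0 or r1 >= |b|, folded into one unsigned comparison.
      if (static_cast<uint32_t>(r1) < abs_b) return q0;  // q0 already exact
      // Step toward the true quotient; direction is sign(r0) ^ sign(b).
      return ((r0 ^ b) < 0) ? q0 - 1 : q0 + 1;
    }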
+template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
+HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
+  // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal
+  // multiplication steps are needed as the mantissa of MakeFloat<T> has fewer
+  // than kOrigLaneSize*8 + 1 bits
+
+  using T = TFromV<V>;
+
+#if HWY_HAVE_FLOAT64
+  using TF = MakeFloat<T>;
+#else
+  using TF = float;
+#endif
+
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+  const Rebind<TF, decltype(d)> df;
+
+  if (!IsSigned<T>()) {
+    // If T is unsigned, set a[i] to (a[i] >= b[i] ? 1 : 0) and set b[i] to 1
+    // if b[i] > LimitsMax<MakeSigned<T>>() is true
+
+    const auto one = Set(di, MakeSigned<T>{1});
+    a = BitCast(
+        d, IfNegativeThenElse(BitCast(di, b),
+                              IfThenElseZero(RebindMask(di, Ge(a, b)), one),
+                              BitCast(di, a)));
+    b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b)));
+  }
+
+  // LimitsMin<T>() <= b[i] <= LimitsMax<MakeSigned<T>>() is now true
+
+  const auto flt_b = IntDivConvIntToFloat(df, b);
+
+#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
+    !HWY_HAVE_FLOAT64
+  auto flt_recip_b = ApproximateReciprocal(flt_b);
+  flt_recip_b =
+      Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
+#else
+  const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b);
+#endif
+
+  auto q0 =
+      IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
+  const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
+
+  auto q1 =
+      IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
+  const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);
+
+  auto r3 = r1;
+
+#if !HWY_HAVE_FLOAT64
+  // Need two additional reciprocal multiplication steps for I64/U64 vectors
+  // if HWY_HAVE_FLOAT64 is 0
+  if (sizeof(T) == 8) {
+    const auto q2 = IntDivConvFloatToInt(
+        di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b));
+    const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1);
+
+    const auto q3 = IntDivConvFloatToInt(
+        di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b));
+    r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2);
+
+    q0 = Add(q0, BitCast(d, q2));
+    q1 = Add(q1, q3);
+  }
+#endif  // !HWY_HAVE_FLOAT64
+
+  auto r4 = r3;
+
+  // Need to negate r4[i] if a[i] < 0 is true
+  if (IsSigned<TFromV<V>>()) {
+    r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4);
+  }
+
+  // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i]
+
+  auto abs_b = BitCast(du, b);
+  if (IsSigned<TFromV<V>>()) {
+    abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
+  }
+
+  // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1.
+  // Otherwise, set q4[i] to 0.
+
+  // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single
+  // unsigned comparison as
+  // static_cast<TU>(r4[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
+  // will be true if r4[i] < 0 is true.
+  auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b)));
+
+  // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0
+
+  // Need to negate q4[i] if r3[i] and b[i] do not have the same sign
+  auto q4_negate_mask = r3;
+  if (IsSigned<TFromV<V>>()) {
+    q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b));
+  }
+  q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4);
+
+  // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ?
+  // (((r3[i] ^ b[i]) < 0) ? 1 : -1) : 0
+
+  // The final result is equal to q0[i] + q1[i] - q4[i]
+  return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4));
 }
 
-
-
+template <size_t kOrigLaneSize, class V,
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
+          HWY_IF_V_SIZE_LE_V(
+              V, HWY_MAX_BYTES /
+                     ((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))>
+HWY_INLINE V IntDiv(V a, V b) {
+  using T = TFromV<V>;
 
-//
+  // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32
+  using TW = MakeWide<
+      If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1), MakeWide<T>, T>>;
 
-
-
-
-#
+  const DFromV<decltype(a)> d;
+  const Rebind<TW, decltype(d)> dw;
+
+#if HWY_TARGET <= HWY_SSE2
+  // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
+  // unnecessary overhead
+  const RebindToSigned<decltype(dw)> dw_i;
+
+  // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<T> if
+  // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead
+  const If<(kOrigLaneSize < sizeof(T)), RebindToSigned<decltype(d)>,
+           decltype(d)>
+      d_demote_to;
 #else
-
+  // On other targets, promote to TW and demote to T
+  const decltype(dw) dw_i;
+  const decltype(d) d_demote_to;
 #endif
 
-
-
-
-
-
-
+  return BitCast(
+      d, DemoteTo(d_demote_to, IntDivUsingFloatDiv<kOrigLaneSize>(
+                                   PromoteTo(dw_i, a), PromoteTo(dw_i, b))));
+}
+
+template <size_t kOrigLaneSize, class V,
+          HWY_IF_T_SIZE_ONE_OF_V(V,
+                                 (HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)),
+          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
+HWY_INLINE V IntDiv(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RepartitionToWide<decltype(d)> dw;
+
+#if HWY_TARGET <= HWY_SSE2
+  // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
+  // unnecessary overhead
+  const RebindToSigned<decltype(dw)> dw_i;
+
+  // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<TFromV<V>> if
+  // kOrigLaneSize < sizeof(TFromV<V>) to avoid unnecessary overhead
+  const If<(kOrigLaneSize < sizeof(TFromV<V>)), RebindToSigned<decltype(d)>,
+           decltype(d)>
+      d_demote_to;
 #else
-//
-
-
-#endif
+  // On other targets, promote to MakeWide<TFromV<V>> and demote to TFromV<V>
+  const decltype(dw) dw_i;
+  const decltype(d) d_demote_to;
+#endif
 
-
-
-
-
-
-
-};
-const auto lo = And(v, Set(d, uint8_t{0xF}));
-const auto hi = ShiftRight<4>(v);
-const auto lookup = LoadDup128(d, kLookup);
-return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
+  return BitCast(d, OrderedDemote2To(
+                        d_demote_to,
+                        IntDivUsingFloatDiv<kOrigLaneSize>(
+                            PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)),
+                        IntDivUsingFloatDiv<kOrigLaneSize>(
+                            PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b))));
 }
 
-
-
-
-
-
-
-const D d;
-// See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
-const V k33 = Set(d, uint8_t{0x33});
-v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
-v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
-return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
-}
-#endif  // HWY_TARGET != HWY_RVV
+#if !HWY_HAVE_FLOAT16
+template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
+          HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
+HWY_INLINE V IntDiv(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
 
-
-
-
-const
-
-
+#if HWY_TARGET <= HWY_SSE2
+  // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
+  // overhead
+  const RebindToSigned<decltype(dw)> dw_i;
+#else
+  // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
+  const decltype(dw) dw_i;
+#endif
+
+  return DemoteTo(d,
+                  BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b))));
 }
+template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
+          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
+HWY_INLINE V IntDiv(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RepartitionToWide<decltype(d)> dw;
 
-
-
-
-
-
-
+#if HWY_TARGET <= HWY_SSE2
+  // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
+  // overhead
+  const RebindToSigned<decltype(dw)> dw_i;
+#else
+  // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
+  const decltype(dw) dw_i;
+#endif
+
+  return OrderedDemote2To(
+      d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))),
+      BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b))));
 }
+#endif  // !HWY_HAVE_FLOAT16
 
-
-
-
-
-
-auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
-return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
+template <size_t kOrigLaneSize, class V,
+          HWY_IF_T_SIZE_ONE_OF_V(V,
+                                 (HWY_HAVE_FLOAT64 ? 0 : (1 << 4)) | (1 << 8))>
+HWY_INLINE V IntDiv(V a, V b) {
+  return IntDivUsingFloatDiv<kOrigLaneSize>(a, b);
 }
-#endif
 
-#
+#if HWY_HAVE_FLOAT64
+template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
+          HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
+HWY_INLINE V IntDiv(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const Rebind<double, decltype(d)> df64;
 
-
+  return DemoteTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
+}
+template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
+          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
+HWY_INLINE V IntDiv(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const Half<decltype(d)> dh;
+  const Repartition<double, decltype(d)> df64;
 
-
-
-
-
-#
-#define HWY_NATIVE_MUL_8
-#endif
+  return Combine(
+      d, DemoteTo(dh, Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b))),
+      DemoteTo(dh, Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b))));
+}
+#endif  // HWY_HAVE_FLOAT64
 
-
-
+template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
+                                      HWY_TARGET == HWY_WASM ||
+                                      HWY_TARGET == HWY_WASM_EMU256)
+                                         ? 0
+                                         : (1 << 1)) |
+                                     (1 << 2) | (1 << 4) | (1 << 8))>
+HWY_INLINE V IntMod(V a, V b) {
+  return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
+}
+
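Note: IntMod derives the remainder from IntDiv via the identity a % b == a - (a / b) * b, which is exactly what the NegMulAdd above computes. A scalar check (illustrative):

    #include <cstdint>

    // Remainder from truncating division, as NegMulAdd(q, b, a) computes it.
    int32_t RefIntMod(int32_t a, int32_t b) {
      const int32_t q = a / b;  // truncates toward zero, like IntDiv
      return a - q * b;         // e.g. RefIntMod(-7, 3) == -1
    }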
+#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
+    HWY_TARGET == HWY_WASM_EMU256
+template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
           HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
-
+HWY_INLINE V IntMod(V a, V b) {
   const DFromV<decltype(a)> d;
   const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
-
-  const RebindToUnsigned<decltype(dw)> dwu;  // TruncateTo input
-  const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
-  // TruncateTo is cheaper than ConcatEven.
-  return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
+  return DemoteTo(d, IntMod<kOrigLaneSize>(PromoteTo(dw, a), PromoteTo(dw, b)));
 }
 
-
-template <class V, HWY_IF_T_SIZE_V(V, 1),
+template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
           HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
-
+HWY_INLINE V IntMod(V a, V b) {
   const DFromV<decltype(a)> d;
-  const
-
-
-
-  const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
-  const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
-  const VFromD<decltype(dw)> m0 = a0 * b0;
-  const VFromD<decltype(dw)> m1 = a1 * b1;
-  return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
+  const RepartitionToWide<decltype(d)> dw;
+  return OrderedDemote2To(
+      d, IntMod<kOrigLaneSize>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)),
+      IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
 }
+#endif  // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
+        // HWY_WASM_EMU256
 
-
-
-// ------------------------------ 64-bit multiplication
-
-// "Include guard": skip if native 64-bit mul instructions are available.
-#if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
-#ifdef HWY_NATIVE_MUL_64
-#undef HWY_NATIVE_MUL_64
-#else
-#define HWY_NATIVE_MUL_64
-#endif
+}  // namespace detail
 
-
-template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
-          HWY_IF_NOT_FLOAT_V(V)>
-HWY_API V operator*(V x, V y) {
-  const DFromV<V> d;
-  using T = TFromD<decltype(d)>;
-  using TU = MakeUnsigned<T>;
-  const TU xu = static_cast<TU>(GetLane(x));
-  const TU yu = static_cast<TU>(GetLane(y));
-  return Set(d, static_cast<T>(xu * yu));
-}
+#if HWY_TARGET == HWY_SCALAR
 
-template <class
-
-
-  RepartitionToNarrow<D64> d32;
-  auto x32 = BitCast(d32, x);
-  auto y32 = BitCast(d32, y);
-  auto lolo = BitCast(d32, MulEven(x32, y32));
-  auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
-  auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
-  auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
-  return BitCast(D64{}, lolo + hi);
+template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec1<T> operator/(Vec1<T> a, Vec1<T> b) {
+  return detail::IntDiv<sizeof(T)>(a, b);
 }
-template <class
-
-
-  RebindToUnsigned<DI64> du64;
-  return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
+template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec1<T> operator%(Vec1<T> a, Vec1<T> b) {
+  return detail::IntMod<sizeof(T)>(a, b);
 }
 
-#
+#else  // HWY_TARGET != HWY_SCALAR
 
-
+template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
+  return detail::IntDiv<sizeof(T)>(a, b);
+}
 
-
-
-
-
-#else
-#define HWY_NATIVE_INT_FMA
-#endif
+template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
+  return detail::IntMod<sizeof(T)>(a, b);
+}
 
-
-
-
+#if HWY_CAP_GE256
+template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec256<T> operator/(Vec256<T> a, Vec256<T> b) {
+  return detail::IntDiv<sizeof(T)>(a, b);
 }
+template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec256<T> operator%(Vec256<T> a, Vec256<T> b) {
+  return detail::IntMod<sizeof(T)>(a, b);
+}
+#endif
 
-
-
-
+#if HWY_CAP_GE512
+template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec512<T> operator/(Vec512<T> a, Vec512<T> b) {
+  return detail::IntDiv<sizeof(T)>(a, b);
 }
+template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
+  return detail::IntMod<sizeof(T)>(a, b);
+}
+#endif
 
-#endif //
+#endif  // HWY_TARGET == HWY_SCALAR
+
+#endif  // HWY_NATIVE_INT_DIV
 
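Note: with this block, integer vectors gain operator/ and operator% on all non-scalable targets (SVE and RVV instead go through the Div/Mod wrappers added at the end of this diff). A usage sketch, assuming the usual HWY_NAMESPACE setup of a Highway translation unit (QuotRem is a hypothetical caller, and n is assumed to be a multiple of the lane count):

    #include <cstddef>
    #include <cstdint>
    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    // Element-wise integer quotient and remainder.
    void QuotRem(const int32_t* a, const int32_t* b, int32_t* q, int32_t* r,
                 size_t n) {
      const hn::ScalableTag<int32_t> d;
      for (size_t i = 0; i < n; i += hn::Lanes(d)) {
        const auto va = hn::Load(d, a + i);
        const auto vb = hn::Load(d, b + i);
        hn::Store(hn::Div(va, vb), d, q + i);  // va / vb where operators exist
        hn::Store(hn::Mod(va, vb), d, r + i);  // va % vb
      }
    }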
 // ------------------------------ SatWidenMulPairwiseAdd
 
@@ -2819,11 +4498,11 @@ template <class DI16, class VU8, class VI8,
 HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
   const RebindToUnsigned<decltype(di16)> du16;
 
-  const auto a0 =
-  const auto b0 =
+  const auto a0 = BitCast(di16, PromoteEvenTo(du16, a));
+  const auto b0 = PromoteEvenTo(di16, b);
 
-  const auto a1 = BitCast(di16,
-  const auto b1 =
+  const auto a1 = BitCast(di16, PromoteOddTo(du16, a));
+  const auto b1 = PromoteOddTo(di16, b);
 
   return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
 }
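Note: after this fix, each i16 output lane is the saturated sum of two adjacent u8*i8 products (PromoteEvenTo/PromoteOddTo split the pairs). Since each product fits in int16_t (|255 * -128| = 32640), only the final add saturates. A scalar model of output lane i, under that reading (illustrative names):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    int16_t RefSatWidenMulPairwiseAdd(const uint8_t* a, const int8_t* b,
                                      size_t i) {
      const int32_t sum = static_cast<int32_t>(a[2 * i]) * b[2 * i] +
                          static_cast<int32_t>(a[2 * i + 1]) * b[2 * i + 1];
      // Each addend fits in int16_t, so saturating the exact sum matches
      // SaturatedAdd of the two products.
      return static_cast<int16_t>(
          std::min<int32_t>(32767, std::max<int32_t>(-32768, sum)));
    }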
@@ -2848,11 +4527,11 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
                                             VFromD<DI32> sum) {
   const Repartition<int16_t, decltype(di32)> di16;
 
-  const auto a0 =
-  const auto b0 =
+  const auto a0 = PromoteEvenTo(di16, a);
+  const auto b0 = PromoteEvenTo(di16, b);
 
-  const auto a1 =
-  const auto b1 =
+  const auto a1 = PromoteOddTo(di16, a);
+  const auto b1 = PromoteOddTo(di16, b);
 
   return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
                       WidenMulPairwiseAdd(di32, a1, b1)));
@@ -2985,12 +4664,10 @@ HWY_API VFromD<DU64> SumOfMulQuadAccumulate(
   const auto u32_even_prod = MulEven(a, b);
   const auto u32_odd_prod = MulOdd(a, b);
 
-  const auto
-
-  const auto
-
-  const auto p1 = Add(ShiftRight<32>(BitCast(du64, u32_even_prod)),
-                      ShiftRight<32>(BitCast(du64, u32_odd_prod)));
+  const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod),
+                      PromoteEvenTo(du64, u32_odd_prod));
+  const auto p1 =
+      Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod));
 
   return Add(sum, Add(p0, p1));
 }
@@ -3043,7 +4720,6 @@ HWY_API V ApproximateReciprocalSqrt(V v) {
 
 // ------------------------------ Compress*
 
-// "Include guard": skip if native 8-bit compress instructions are available.
 #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
 #ifdef HWY_NATIVE_COMPRESS8
 #undef HWY_NATIVE_COMPRESS8
@@ -3244,7 +4920,6 @@ HWY_API V CompressNot(V v, M mask) {
 
 // ------------------------------ Expand
 
-// "Include guard": skip if native 8/16-bit Expand/LoadExpand are available.
 // Note that this generic implementation assumes <= 128 bit fixed vectors;
 // the SVE and RVV targets provide their own native implementations.
 #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
@@ -3853,7 +5528,9 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
       BitCast(du, InterleaveLower(du8x2, indices8, indices8));
   // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte
   // indices, add 0 to even and 1 to odd byte lanes.
-  const Vec128<uint16_t, N> byte_indices = Add(
+  const Vec128<uint16_t, N> byte_indices = Add(
+      indices16,
+      Set(du, static_cast<uint16_t>(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001)));
   return BitCast(d, TableLookupBytesOr0(v, byte_indices));
 }
 
@@ -3947,9 +5624,9 @@ HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
   const Repartition<uint16_t, decltype(d)> du16;
   return BitCast(d, RotateRight<8>(BitCast(du16, v)));
 #else
-
-
-  return TableLookupBytes(v,
+  const VFromD<D> shuffle = Dup128VecFromValues(d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
+                                                11, 10, 13, 12, 15, 14);
+  return TableLookupBytes(v, shuffle);
 #endif
 }
@@ -3959,10 +5636,10 @@ HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
   const Repartition<uint16_t, decltype(d)> du16;
   return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v))));
 #else
-  alignas(16) static constexpr uint8_t kShuffle[16] = {
-      3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
   const Repartition<uint8_t, decltype(d)> du8;
-
+  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
+      du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  return TableLookupBytes(v, BitCast(d, shuffle));
 #endif
 }
@@ -3972,10 +5649,10 @@ HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
   const Repartition<uint32_t, D> du32;
   return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v))));
 #else
-  alignas(16) static constexpr uint8_t kShuffle[16] = {
-      7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
   const Repartition<uint8_t, decltype(d)> du8;
-
+  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
+      du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  return TableLookupBytes(v, BitCast(d, shuffle));
 #endif
 }
 
@@ -4111,8 +5788,6 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                              const uint32_t x2,
                                              const uint32_t x1,
                                              const uint32_t x0) {
-  alignas(16) const uint32_t lanes[4] = {x0, x1, x2, x3};
-
 #if HWY_TARGET == HWY_RVV
   constexpr int kPow2 = d.Pow2();
   constexpr int kLoadPow2 = HWY_MAX(kPow2, -1);
@@ -4128,8 +5803,7 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
       HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad);
   const CappedTag<uint32_t, kNumToLoad> d_load;
 #endif
-
-  return ResizeBitCast(d, LoadDup128(d_load, lanes));
+  return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3));
 }
 
 }  // namespace detail
@@ -4291,10 +5965,6 @@ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
   const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
   const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
   const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
-  alignas(16)
-  const uint16_t indices[8] = {u16_idx0, u16_idx1, u16_idx2, u16_idx3,
-                               u16_idx0, u16_idx1, u16_idx2, u16_idx3};
-
 #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
   constexpr size_t kMinLanesToLoad = 4;
 #else
@@ -4302,8 +5972,9 @@ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
 #endif
   constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad);
   const CappedTag<uint16_t, kNumToLoad> d_load;
-
-
+  return ResizeBitCast(
+      d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3,
+                             u16_idx0, u16_idx1, u16_idx2, u16_idx3));
 }
 
 template <class D, HWY_IF_T_SIZE_D(D, 4)>
@@ -4672,6 +6343,202 @@ HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
 }
 #endif
 
+// ------------------------------ SumsOfAdjQuadAbsDiff
+
+#if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
+     defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
+#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
+#else
+#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
+#endif
+
+#if HWY_TARGET != HWY_SCALAR
+template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
+HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
+  static_assert(0 <= kAOffset && kAOffset <= 1,
+                "kAOffset must be between 0 and 1");
+  static_assert(0 <= kBOffset && kBOffset <= 3,
+                "kBOffset must be between 0 and 3");
+  using D8 = DFromV<V8>;
+  const D8 d8;
+  const RebindToUnsigned<decltype(d8)> du8;
+  const RepartitionToWide<decltype(d8)> d16;
+  const RepartitionToWide<decltype(du8)> du16;
+
+  // Ensure that a is resized to a vector that has at least
+  // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
+  // CombineShiftRightBytes operations below.
+#if HWY_TARGET == HWY_RVV
+  // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true
+  // to ensure that Lanes(d8_interleave) >= 16 is true.
+
+  // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
+  // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
+  constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
+  const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
+#elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
+    HWY_TARGET == HWY_SVE2_128
+  // On SVE targets, Lanes(d8_interleave) >= 16 and
+  // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
+  // tag for a full u8/i8 vector on SVE.
+  const D8 d8_interleave;
+#else
+  // On targets that use non-scalable vector types, Lanes(d8_interleave) is
+  // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
+  constexpr size_t kInterleaveLanes =
+      HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
+  const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
+#endif
+
+  // The ResizeBitCast operation below will resize a to a vector that has
+  // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
+  // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
+  // below.
+  const auto a_to_interleave = ResizeBitCast(d8_interleave, a);
+
+  const auto a_interleaved_lo =
+      InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
+  const auto a_interleaved_hi =
+      InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);
+
+  /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
+            a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
+            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
+            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
+   */
+  /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
+            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
+            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
+            a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10]
+   } */
+
+  // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
+  // the CombineShiftRightBytes are needed for the subsequent AbsDiff operations
+  // and as a01 and a23 need to be the same vector type as b01 and b23 for the
+  // AbsDiff operations below.
+  const V8 a01 =
+      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
+                            d8_interleave, a_interleaved_hi, a_interleaved_lo));
+  const V8 a23 =
+      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
+                            d8_interleave, a_interleaved_hi, a_interleaved_lo));
+
+  /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
+            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
+            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
+            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
+   */
+  /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
+            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
+            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
+            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
+   */
+  const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
+  const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));
+
+  const VFromD<decltype(du16)> absdiff_sum_01 =
+      SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
+  const VFromD<decltype(du16)> absdiff_sum_23 =
+      SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
+  return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
+}
+#endif  // HWY_TARGET != HWY_SCALAR
+
+#endif  // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
+
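Note: SumsOfAdjQuadAbsDiff is the generic counterpart of x86 MPSADBW: each u16 output lane sums four absolute byte differences, sliding a 4-byte window of a (starting at kAOffset*4 + lane) against a fixed 4-byte block of b (selected by kBOffset). A scalar reference for one 16-byte block, assuming that reading (illustrative, not part of the diff):

    #include <cstdint>
    #include <cstdlib>

    // out has 8 u16 lanes per 16-byte block.
    void RefSumsOfAdjQuadAbsDiff(const uint8_t a[16], const uint8_t b[16],
                                 int kAOffset, int kBOffset, uint16_t out[8]) {
      for (int i = 0; i < 8; ++i) {
        uint16_t sum = 0;
        for (int j = 0; j < 4; ++j) {
          const int av = a[kAOffset * 4 + i + j];
          const int bv = b[kBOffset * 4 + j];
          sum = static_cast<uint16_t>(sum + std::abs(av - bv));
        }
        out[i] = sum;
      }
    }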
+// ------------------------------ SumsOfShuffledQuadAbsDiff
+
+#if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
+     defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
+#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
+#else
+#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
+#endif
+
+#if HWY_TARGET != HWY_SCALAR
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
+          HWY_IF_UI8_D(DFromV<V8>)>
+HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
+                                                                     V8 b) {
+  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+
+#if HWY_TARGET == HWY_RVV
+  // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that
+  // both vA and vB can be bitcasted to a u32 vector.
+  const detail::AdjustSimdTagToMinVecPow2<
+      RepartitionToWideX2<DFromV<decltype(a)>>>
+      d32;
+  const RepartitionToNarrow<decltype(d32)> d16;
+  const RepartitionToNarrow<decltype(d16)> d8;
+
+  const auto vA = ResizeBitCast(d8, a);
+  const auto vB = ResizeBitCast(d8, b);
+#else
+  const DFromV<decltype(a)> d8;
+  const RepartitionToWide<decltype(d8)> d16;
+  const RepartitionToWide<decltype(d16)> d32;
+
+  const auto vA = a;
+  const auto vB = b;
+#endif
+
+  const RebindToUnsigned<decltype(d8)> du8;
+
+  const auto a_shuf =
+      Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));
+  /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
+                   a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
+                   a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
+                   a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
+  /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
+                   a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
+                   a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
+                   a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
+#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+  // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
+  // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
+  // lanes that are shifted into an adjacent 16-byte block as any lanes that are
+  // shifted into an adjacent 16-byte block by Slide1Up/Slide1Down will be
+  // replaced by the OddEven operation.
+  const auto a_0123_2345 = BitCast(
+      d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
+  const auto a_1234_3456 =
+      BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
+                          BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
+#else
+  const auto a_0123_2345 =
+      BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
+  const auto a_1234_3456 = BitCast(
+      d8,
+      OddEven(ShiftLeftBytes<1>(d32, a_shuf), ShiftRightBytes<1>(d32, a_shuf)));
+#endif
+
+  auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
+  auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));
+
+#if HWY_IS_LITTLE_ENDIAN
+  odd_sums = ShiftLeft<16>(odd_sums);
+#else
+  even_sums = ShiftLeft<16>(even_sums);
+#endif
+
+  const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));
+
+#if HWY_TARGET == HWY_RVV
+  return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
+#else
+  return sums;
+#endif
+}
+#endif  // HWY_TARGET != HWY_SCALAR
+
+#endif  // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
+
 // ================================================== Operator wrapper
 
 // SVE* and RVV currently cannot define operators and have already defined
@@ -4700,6 +6567,10 @@ template <class V>
 HWY_API V Div(V a, V b) {
   return a / b;
 }
+template <class V>
+HWY_API V Mod(V a, V b) {
+  return a % b;
+}
 
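Note: Mod completes the wrapper set so generic code can stay operator-free on targets (SVE*, RVV) whose vector types cannot overload operators; elsewhere it simply forwards to the operator% defined earlier in this diff. A hypothetical caller, assuming the usual Highway includes:

    // Target-agnostic remainder via the wrapper instead of operator%.
    template <class V>
    V RemainderDemo(V a, V b) {
      return hwy::HWY_NAMESPACE::Mod(a, b);  // works even where a % b
                                             // does not compile
    }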
 template <class V>
 V Shl(V a, V b) {