@img/sharp-libvips-dev 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
@@ -1,5 +1,6 @@
 // Copyright 2021 Google LLC
-// Copyright 2023 Arm Limited and/or
+// Copyright 2023,2024 Arm Limited and/or
+// its affiliates <open-source-office@arm.com>
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-License-Identifier: BSD-3-Clause
 //
@@ -59,7 +60,7 @@ HWY_API V Clamp(const V v, const V lo, const V hi) {
 
 // CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
 // and RVV has its own implementation of -Lanes.
-#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
+#if (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV) || HWY_IDE
 
 template <size_t kLanes, class D>
 HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
@@ -197,6 +198,23 @@ HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
 #endif
 }
 
+// ------------------------------ IsNegative
+#if (defined(HWY_NATIVE_IS_NEGATIVE) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_IS_NEGATIVE
+#undef HWY_NATIVE_IS_NEGATIVE
+#else
+#define HWY_NATIVE_IS_NEGATIVE
+#endif
+
+template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+HWY_API Mask<DFromV<V>> IsNegative(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  return RebindMask(d, MaskFromVec(BroadcastSignBit(BitCast(di, v))));
+}
+
+#endif  // HWY_NATIVE_IS_NEGATIVE
+
 // ------------------------------ MaskFalse
 #if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
 #ifdef HWY_NATIVE_MASK_FALSE
@@ -212,6 +230,44 @@ HWY_API Mask<D> MaskFalse(D d) {
 
 #endif  // HWY_NATIVE_MASK_FALSE
 
+// ------------------------------ IfNegativeThenElseZero
+#if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+#else
+#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+#endif
+
+template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+HWY_API V IfNegativeThenElseZero(V v, V yes) {
+  return IfThenElseZero(IsNegative(v), yes);
+}
+
+#endif  // HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+
+// ------------------------------ IfNegativeThenZeroElse
+#if (defined(HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+#else
+#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+#endif
+
+template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+HWY_API V IfNegativeThenZeroElse(V v, V no) {
+  return IfThenZeroElse(IsNegative(v), no);
+}
+
+#endif  // HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+
+// ------------------------------ ZeroIfNegative (IfNegativeThenZeroElse)
+
+// ZeroIfNegative is generic for all vector lengths
+template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+HWY_API V ZeroIfNegative(V v) {
+  return IfNegativeThenZeroElse(v, v);
+}
+
 // ------------------------------ BitwiseIfThenElse
 #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
 #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
@@ -289,7 +345,7 @@ HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
 #define HWY_NATIVE_COMBINE_MASKS
 #endif
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <class D>
 HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
   const Half<decltype(d)> dh;
@@ -325,7 +381,7 @@ HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
 #define HWY_NATIVE_UPPER_HALF_OF_MASK
 #endif
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <class D>
 HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
   const Twice<decltype(d)> dt;
@@ -345,7 +401,7 @@ HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
 #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
 #endif
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <class DTo, class DFrom>
 HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
                                         Mask<DFrom> b) {
@@ -367,6 +423,17 @@ HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
 
 #endif  // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
 
+// ------------------------------ RotateLeft
+template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V RotateLeft(V v) {
+  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
+  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+
+  constexpr int kRotateRightAmt =
+      (kBits == 0) ? 0 : static_cast<int>(kSizeInBits) - kBits;
+  return RotateRight<kRotateRightAmt>(v);
+}
+
 // ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
 #if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
 #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
@@ -375,7 +442,7 @@ HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
 #define HWY_NATIVE_INTERLEAVE_WHOLE
 #endif
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
 HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
   // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
@@ -401,7 +468,7 @@ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
 
 #endif  // HWY_NATIVE_INTERLEAVE_WHOLE
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 // The InterleaveWholeLower without the optional D parameter is generic for all
 // vector lengths.
 template <class V>
@@ -410,6 +477,17 @@ HWY_API V InterleaveWholeLower(V a, V b) {
 }
 #endif  // HWY_TARGET != HWY_SCALAR
 
+// ------------------------------ InterleaveEven
+
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+// InterleaveEven without the optional D parameter is generic for all vector
+// lengths
+template <class V>
+HWY_API V InterleaveEven(V a, V b) {
+  return InterleaveEven(DFromV<V>(), a, b);
+}
+#endif
+
 // ------------------------------ AddSub
 
 template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
@@ -423,10 +501,11 @@ HWY_API V AddSub(V a, V b) {
 
 // AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
 // AVX2/AVX3
-
-
-
-
+
+// AddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h
+
+// AddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
+template <class V, HWY_IF_ADDSUB_V(V)>
 HWY_API V AddSub(V a, V b) {
   using D = DFromV<decltype(a)>;
   using T = TFromD<D>;
@@ -507,7 +586,7 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
 
 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
 HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
-#if HWY_HAVE_SCALABLE ||
+#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
   // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
   const auto zero = Zero(DFromV<V>());
   return MaskedSubOr(v, Lt(mask, zero), zero, v);
@@ -543,10 +622,9 @@ template <class V, HWY_IF_I32(TFromV<V>)>
 HWY_API V SaturatedNeg(V v) {
   const DFromV<decltype(v)> d;
 
-#if HWY_TARGET == HWY_RVV ||
-
-
-  // RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions
+#if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_PPC || HWY_TARGET_IS_SVE || \
+    HWY_TARGET_IS_NEON
+  // RVV/PPC/SVE/NEON have native I32 SaturatedSub instructions
   return SaturatedSub(Zero(d), v);
 #else
   // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
@@ -567,9 +645,8 @@ HWY_API V SaturatedNeg(V v) {
 
 template <class V, HWY_IF_I64(TFromV<V>)>
 HWY_API V SaturatedNeg(V v) {
-#if HWY_TARGET == HWY_RVV ||
-
-  // RVV/NEON/SVE have native I64 SaturatedSub instructions
+#if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_SVE || HWY_TARGET_IS_NEON
+  // RVV/SVE/NEON have native I64 SaturatedSub instructions
   const DFromV<decltype(v)> d;
   return SaturatedSub(Zero(d), v);
 #else
@@ -805,6 +882,21 @@ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
 }
 #endif  // HWY_NATIVE_REDUCE_MINMAX_4_UI8
 
+// ------------------------------ IsEitherNaN
+#if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_IS_EITHER_NAN
+#undef HWY_NATIVE_IS_EITHER_NAN
+#else
+#define HWY_NATIVE_IS_EITHER_NAN
+#endif
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API MFromD<DFromV<V>> IsEitherNaN(V a, V b) {
+  return Or(IsNaN(a), IsNaN(b));
+}
+
+#endif  // HWY_NATIVE_IS_EITHER_NAN
+
 // ------------------------------ IsInf, IsFinite
 
 // AVX3 has target-specific implementations of these.
@@ -1290,8 +1382,9 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                                TFromD<D>* HWY_RESTRICT unaligned) {
   const RebindToUnsigned<decltype(d)> du;
   using TU = TFromD<decltype(du)>;
-
-  const
+  using VU = VFromD<decltype(du)>;
+  const VU k5 = Set(du, TU{5});
+  const VU k6 = Set(du, TU{6});
 
   // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
   // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
@@ -1307,29 +1400,29 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
   // The interleaved vectors will be named A, B, C; temporaries with suffix
   // 0..2 indicate which input vector's lanes they hold.
   // cannot reuse shuf_A0 (has 5)
-  const
-  const
-  const
-  const
-  const VFromD<D> A = BitCast(d,
+  const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
+  const VU vA0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
+  const VU vA1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
+  const VU vA2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
+  const VFromD<D> A = BitCast(d, vA0 | vA1 | vA2);
 
   // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
-  const
-  const
-  const
-  const
-  const
-  const
-  const VFromD<D> B = BitCast(d,
+  const VU shuf_B0 = shuf_A2 + k6;  // .A..9..8..7..6..
+  const VU shuf_B1 = shuf_A0 + k5;  // A..9..8..7..6..5
+  const VU shuf_B2 = shuf_A1 + k5;  // ..9..8..7..6..5.
+  const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
+  const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
+  const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
+  const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);
 
   // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
-  const
-  const
-  const
-  const
-  const
-  const
-  const VFromD<D> C = BitCast(d,
+  const VU shuf_C0 = shuf_B2 + k6;  // ..F..E..D..C..B.
+  const VU shuf_C1 = shuf_B0 + k5;  // .F..E..D..C..B..
+  const VU shuf_C2 = shuf_B1 + k5;  // F..E..D..C..B..A
+  const VU vC0 = TableLookupBytesOr0(v0, shuf_C0);
+  const VU vC1 = TableLookupBytesOr0(v1, shuf_C1);
+  const VU vC2 = TableLookupBytesOr0(v2, shuf_C2);
+  const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);
 
   detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
 }
@@ -1339,8 +1432,9 @@ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                                TFromD<D>* HWY_RESTRICT unaligned) {
   const Repartition<uint8_t, decltype(d)> du8;
-
-  const
+  using VU8 = VFromD<decltype(du8)>;
+  const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
+  const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
 
   // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
   // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
@@ -1355,30 +1449,30 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
 
   // The interleaved vectors will be named A, B, C; temporaries with suffix
   // 0..2 indicate which input vector's lanes they hold.
-  const
+  const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
 
-  const
-  const
-  const
+  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
+  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
+  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
   const VFromD<D> A = BitCast(d, A0 | A1 | A2);
 
   // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
-  const
-  const
-  const
-  const
-  const
-  const
-  const VFromD<D> B = BitCast(d,
+  const VU8 shuf_B0 = shuf_A1 + k3;  // 5..4..3.
+  const VU8 shuf_B1 = shuf_A2 + k3;  // ..4..3..
+  const VU8 shuf_B2 = shuf_A0 + k2;  // .4..3..2
+  const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
+  const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
+  const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
+  const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);
 
   // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
-  const
-  const
-  const
-  const
-  const
-  const
-  const VFromD<D> C = BitCast(d,
+  const VU8 shuf_C0 = shuf_B1 + k3;  // ..7..6..
+  const VU8 shuf_C1 = shuf_B2 + k3;  // .7..6..5
+  const VU8 shuf_C2 = shuf_B0 + k2;  // 7..6..5.
+  const VU8 vC0 = TableLookupBytesOr0(v0, shuf_C0);
+  const VU8 vC1 = TableLookupBytesOr0(v1, shuf_C1);
+  const VU8 vC2 = TableLookupBytesOr0(v2, shuf_C2);
+  const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);
 
   detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
 }
@@ -1431,9 +1525,10 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
   // Use full vectors for the shuffles and first result.
   constexpr size_t kFullN = 16 / sizeof(TFromD<D>);
   const Full128<uint8_t> du;
+  using VU = VFromD<decltype(du)>;
   const Full128<TFromD<D>> d_full;
-  const
-  const
+  const VU k5 = Set(du, uint8_t{5});
+  const VU k6 = Set(du, uint8_t{6});
 
   const VFromD<decltype(d_full)> v0{part0.raw};
   const VFromD<decltype(d_full)> v1{part1.raw};
@@ -1450,23 +1545,23 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
       0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
   // The interleaved vectors will be named A, B, C; temporaries with suffix
   // 0..2 indicate which input vector's lanes they hold.
-  const
-  const
-  const
-  const
-  const
-  const
+  const VU shuf_A0 = Load(du, tbl_v0);
+  const VU shuf_A1 = Load(du, tbl_v1);  // cannot reuse shuf_A0 (5 in MSB)
+  const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
+  const VU A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
+  const VU A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
+  const VU A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
   const auto A = BitCast(d_full, A0 | A1 | A2);
   StoreU(A, d_full, unaligned + 0 * kFullN);
 
   // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
-  const
-  const
-  const
-  const
-  const
-  const
-  const VFromD<D> B{BitCast(d_full,
+  const VU shuf_B0 = shuf_A2 + k6;  // ..7..6..
+  const VU shuf_B1 = shuf_A0 + k5;  // .7..6..5
+  const VU shuf_B2 = shuf_A1 + k5;  // 7..6..5.
+  const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
+  const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
+  const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
+  const VFromD<D> B{BitCast(d_full, vB0 | vB1 | vB2).raw};
   StoreU(B, d, unaligned + 1 * kFullN);
 }
 
@@ -1477,8 +1572,9 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               TFromD<D>* HWY_RESTRICT unaligned) {
   const Twice<D> d_full;
   const Full128<uint8_t> du8;
-
-  const
+  using VU8 = VFromD<decltype(du8)>;
+  const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
+  const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
 
   const VFromD<decltype(d_full)> v0{part0.raw};
   const VFromD<decltype(d_full)> v1{part1.raw};
@@ -1497,25 +1593,25 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
 
   // The interleaved vectors will be named A, B; temporaries with suffix
   // 0..2 indicate which input vector's lanes they hold.
-  const
-
-  const
-  const
-
-  const
-  const
-  const
+  const VU8 shuf_A1 = Load(du8, tbl_v1);  // 2..1..0.
+  // .2..1..0
+  const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
+  const VU8 shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
+
+  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
+  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
+  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
   const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
   StoreU(A, d_full, unaligned);
 
   // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
-  const
-  const
-  const
-  const
-  const
-  const
-  const VFromD<decltype(d_full)> B = BitCast(d_full,
+  const VU8 shuf_B0 = shuf_A1 + k3;  // ..3.
+  const VU8 shuf_B1 = shuf_A2 + k3;  // .3..
+  const VU8 shuf_B2 = shuf_A0 + k2;  // 3..2
+  const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
+  const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
+  const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
+  const VFromD<decltype(d_full)> B = BitCast(d_full, vB0 | vB1 | vB2);
   StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full));
 }
 
@@ -1543,6 +1639,7 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               TFromD<D>* HWY_RESTRICT unaligned) {
   // Use full vectors for the shuffles and result.
   const Full128<uint8_t> du;
+  using VU = VFromD<decltype(du)>;
   const Full128<TFromD<D>> d_full;
 
   const VFromD<decltype(d_full)> v0{part0.raw};
@@ -1557,12 +1654,12 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
       0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
   // The interleaved vector will be named A; temporaries with suffix
   // 0..2 indicate which input vector's lanes they hold.
-  const
-  const
-  const
-  const
-  const
-  const
+  const VU shuf_A0 = Load(du, tbl_v0);
+  const VU shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
+  const VU shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
+  const VU A0 = TableLookupBytesOr0(v0, shuf_A0);  // ......3..2..1..0
+  const VU A1 = TableLookupBytesOr0(v1, shuf_A1);  // .....3..2..1..0.
+  const VU A2 = TableLookupBytesOr0(v2, shuf_A2);  // ....3..2..1..0..
   const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
   alignas(16) TFromD<D> buf[MaxLanes(d_full)];
   StoreU(A, d_full, buf);
@@ -1576,6 +1673,7 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               TFromD<D>* HWY_RESTRICT unaligned) {
   // Use full vectors for the shuffles and result.
   const Full128<uint8_t> du8;
+  using VU8 = VFromD<decltype(du8)>;
   const Full128<TFromD<D>> d_full;
 
   const VFromD<decltype(d_full)> v0{part0.raw};
@@ -1590,15 +1688,14 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
       0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
   // The interleaved vector will be named A; temporaries with suffix
   // 0..2 indicate which input vector's lanes they hold.
-  const
-
-
-
-
-
-  const
-  const
-  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // 1..0..
+  const VU8 shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
+  const VU8 shuf_A1 =
+      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);  // ...1..0.
+  const VU8 shuf_A0 =
+      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);  // ....1..0
+  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);  // ..1..0
+  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);  // .1..0.
+  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);  // 1..0..
   const auto A = BitCast(d_full, A0 | A1 | A2);
   alignas(16) TFromD<D> buf[MaxLanes(d_full)];
   StoreU(A, d_full, buf);
@@ -2089,8 +2186,7 @@ namespace detail {
 
 template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
 HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
-  constexpr size_t kMinShrVectBytes =
-      (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) ? 8 : 16;
+  constexpr size_t kMinShrVectBytes = HWY_TARGET_IS_NEON ? 8 : 16;
   const FixedTag<uint8_t, kMinShrVectBytes> d_shift;
   return ResizeBitCast(
       dh, ShiftRightBytes<dh.MaxBytes()>(d_shift, ResizeBitCast(d_shift, v)));
@@ -2299,6 +2395,25 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
   }
 }
 
+template <class D, typename T = TFromD<D>>
+HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
+                           VFromD<RebindToSigned<D>> index,
+                           const size_t max_lanes_to_store) {
+  const RebindToSigned<decltype(d)> di;
+  using TI = TFromD<decltype(di)>;
+  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
+
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    if (i < max_lanes_to_store) base[ExtractLane(index, i)] = ExtractLane(v, i);
+  }
+}
+#else
+template <class D, typename T = TFromD<D>>
+HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
+                           VFromD<RebindToSigned<D>> index,
+                           const size_t max_lanes_to_store) {
+  MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index);
+}
 #endif  // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
 
 // ------------------------------ Gather
@@ -2394,23 +2509,49 @@ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
   return Load(d, lanes);
 }
 
-
+template <class D, typename T = TFromD<D>>
+HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
+                               VFromD<RebindToSigned<D>> index,
+                               const size_t max_lanes_to_load) {
+  const RebindToSigned<D> di;
+  using TI = TFromD<decltype(di)>;
+  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
 
-
+  VFromD<D> v = Zero(d);
+  for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
+    v = InsertLane(v, i, base[ExtractLane(index, i)]);
+  }
+  return v;
+}
 
 template <class D, typename T = TFromD<D>>
-HWY_API
-
-
-
-
+HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
+                                 VFromD<RebindToSigned<D>> index,
+                                 const size_t max_lanes_to_load) {
+  const RebindToSigned<D> di;
+  using TI = TFromD<decltype(di)>;
+  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
 
+  VFromD<D> v = no;
+  for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
+    v = InsertLane(v, i, base[ExtractLane(index, i)]);
+  }
+  return v;
+}
+#else
 template <class D, typename T = TFromD<D>>
 HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
                                VFromD<RebindToSigned<D>> index,
                                const size_t max_lanes_to_load) {
   return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index);
 }
+template <class D, typename T = TFromD<D>>
+HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
+                                 VFromD<RebindToSigned<D>> index,
+                                 const size_t max_lanes_to_load) {
+  return MaskedGatherIndexOr(no, FirstN(d, max_lanes_to_load), d, base, index);
+}
+#endif  // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
 
 // ------------------------------ Integer AbsDiff and SumsOf8AbsDiff
 
@@ -2548,6 +2689,7 @@ HWY_API V SaturatedSub(V a, V b) {
 // ------------------------------ Unsigned to signed demotions
 
 template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
+          HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
           class V2 = VFromD<Rebind<TFromV<V>, DN>>,
           hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr,
           HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
@@ -2571,6 +2713,7 @@ HWY_API VFromD<DN> DemoteTo(DN dn, V v) {
 
 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
+          HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
           class V2 = VFromD<Repartition<TFromV<V>, DN>>,
           HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
           HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
@@ -2629,248 +2772,6 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
 #endif  // HWY_TARGET != HWY_SCALAR
 #endif  // HWY_NATIVE_PROMOTE_UPPER_TO
 
-// ------------------------------ PromoteEvenTo/PromoteOddTo
-
-#if HWY_TARGET != HWY_SCALAR
-namespace detail {
-
-// Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as
-// there are target-specific specializations for some of the
-// detail::PromoteEvenTo and detail::PromoteOddTo cases on
-// SVE/PPC/SSE2/SSSE3/SSE4/AVX2.
-
-// All targets except HWY_SCALAR use the implementations of
-// detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at
-// least some of the PromoteEvenTo and PromoteOddTo cases.
-
-// Signed to signed PromoteEvenTo/PromoteOddTo
-template <size_t kToLaneSize, class D, class V>
-HWY_INLINE VFromD<D> PromoteEvenTo(
-    hwy::SignedTag /*to_type_tag*/,
-    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
-    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
-#if HWY_IS_LITTLE_ENDIAN
-  // On little-endian targets, need to shift each lane of the bitcasted vector
-  // left by kToLaneSize * 4 bits to get the bits of the even source lanes into
-  // the upper kToLaneSize * 4 bits of even_in_hi.
-  const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
-#else
-  // On big-endian targets, the bits of the even source lanes are already in
-  // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
-  const auto even_in_hi = BitCast(d_to, v);
-#endif
-
-  // Right-shift even_in_hi by kToLaneSize * 4 bits
-  return ShiftRight<kToLaneSize * 4>(even_in_hi);
-}
-
-template <size_t kToLaneSize, class D, class V>
-HWY_INLINE VFromD<D> PromoteOddTo(
-    hwy::SignedTag /*to_type_tag*/,
-    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
-    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
-#if HWY_IS_LITTLE_ENDIAN
-  // On little-endian targets, the bits of the odd source lanes are already in
-  // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
-  const auto odd_in_hi = BitCast(d_to, v);
-#else
-  // On big-endian targets, need to shift each lane of the bitcasted vector left
-  // by kToLaneSize * 4 bits to get the bits of the odd source lanes into the
-  // upper kToLaneSize * 4 bits of odd_in_hi.
-  const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
-#endif
-
-  // Right-shift odd_in_hi by kToLaneSize * 4 bits
-  return ShiftRight<kToLaneSize * 4>(odd_in_hi);
-}
-
-// Unsigned to unsigned PromoteEvenTo/PromoteOddTo
-template <size_t kToLaneSize, class D, class V>
-HWY_INLINE VFromD<D> PromoteEvenTo(
-    hwy::UnsignedTag /*to_type_tag*/,
-    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
-    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
-#if HWY_IS_LITTLE_ENDIAN
-  // On little-endian targets, the bits of the even source lanes are already
-  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
-
-  // Simply need to zero out the upper bits of each lane of the bitcasted
-  // vector.
-  return And(BitCast(d_to, v),
-             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
-#else
-  // On big-endian targets, need to shift each lane of the bitcasted vector
-  // right by kToLaneSize * 4 bits to get the bits of the even source lanes into
-  // the lower kToLaneSize * 4 bits of the result.
-
-  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
-  // result.
-  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
-#endif
-}
-
-template <size_t kToLaneSize, class D, class V>
-HWY_INLINE VFromD<D> PromoteOddTo(
-    hwy::UnsignedTag /*to_type_tag*/,
-    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
-    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
-#if HWY_IS_LITTLE_ENDIAN
-  // On little-endian targets, need to shift each lane of the bitcasted vector
-  // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
-  // the lower kToLaneSize * 4 bits of the result.
-
-  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
-  // result.
-  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
-#else
-  // On big-endian targets, the bits of the even source lanes are already
-  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
-
-  // Simply need to zero out the upper bits of each lane of the bitcasted
-  // vector.
-  return And(BitCast(d_to, v),
-             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
-#endif
-}
-
-// Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
-// followed by BitCast to signed
-template <size_t kToLaneSize, class D, class V>
-HWY_INLINE VFromD<D> PromoteEvenTo(
-    hwy::SignedTag /*to_type_tag*/,
-    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
-    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
-  const RebindToUnsigned<decltype(d_to)> du_to;
-  return BitCast(d_to,
-                 PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
-                               hwy::UnsignedTag(), du_to, v));
-}
-
-template <size_t kToLaneSize, class D, class V>
-HWY_INLINE VFromD<D> PromoteOddTo(
-    hwy::SignedTag /*to_type_tag*/,
-    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
-    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
-  const RebindToUnsigned<decltype(d_to)> du_to;
-  return BitCast(d_to,
-                 PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
-                              hwy::UnsignedTag(), du_to, v));
-}
-
-// BF16->F32 PromoteEvenTo
-
-// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
-// instead of hwy::FloatTag on targets that use scalable vectors.
-
-// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
-// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
-
-// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
-// to be a bfloat16_t vector.
-template <class FromTypeTag, class DF32, class VBF16,
-          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
-          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
-HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
-                                      hwy::SizeTag<4> /*to_lane_size_tag*/,
-                                      FromTypeTag /*from_type_tag*/, DF32 d_to,
-                                      VBF16 v) {
-  const RebindToUnsigned<decltype(d_to)> du_to;
-#if HWY_IS_LITTLE_ENDIAN
-  // On little-endian platforms, need to shift left each lane of the bitcasted
-  // vector by 16 bits.
-  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
-#else
-  // On big-endian platforms, the even lanes of the source vector are already
-  // in the upper 16 bits of the lanes of the bitcasted vector.
-
-  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
-  // vector.
-  return BitCast(d_to,
-                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
-#endif
-}
-
-// BF16->F32 PromoteOddTo
-
-// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
-// instead of hwy::FloatTag on targets that use scalable vectors.
-
-// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
-// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
-
-// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
-// to be a bfloat16_t vector.
-template <class FromTypeTag, class DF32, class VBF16,
-          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
-          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
-HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
-                                     hwy::SizeTag<4> /*to_lane_size_tag*/,
-                                     FromTypeTag /*from_type_tag*/, DF32 d_to,
-                                     VBF16 v) {
-  const RebindToUnsigned<decltype(d_to)> du_to;
-#if HWY_IS_LITTLE_ENDIAN
-  // On little-endian platforms, the odd lanes of the source vector are already
-  // in the upper 16 bits of the lanes of the bitcasted vector.
-
-  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
-  // vector.
-  return BitCast(d_to,
-                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
-#else
-  // On big-endian platforms, need to shift left each lane of the bitcasted
-  // vector by 16 bits.
-  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
-#endif
-}
-
-// Default PromoteEvenTo/PromoteOddTo implementations
-template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
-          class V, HWY_IF_LANES_D(D, 1)>
-HWY_INLINE VFromD<D> PromoteEvenTo(
-    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
-    FromTypeTag /*from_type_tag*/, D d_to, V v) {
-  return PromoteLowerTo(d_to, v);
-}
-
-template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
-          class V, HWY_IF_LANES_GT_D(D, 1)>
-HWY_INLINE VFromD<D> PromoteEvenTo(
-    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
-    FromTypeTag /*from_type_tag*/, D d_to, V v) {
-  const DFromV<decltype(v)> d;
-  return PromoteLowerTo(d_to, ConcatEven(d, v, v));
-}
-
-template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
-          class V>
-HWY_INLINE VFromD<D> PromoteOddTo(
-    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
-    FromTypeTag /*from_type_tag*/, D d_to, V v) {
-  const DFromV<decltype(v)> d;
-  return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
-}
-
-}  // namespace detail
-
-template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
-          class V2 = VFromD<Repartition<TFromV<V>, D>>,
-          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
-HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
-  return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
-                               hwy::SizeTag<sizeof(TFromD<D>)>(),
-                               hwy::TypeTag<TFromV<V>>(), d, v);
-}
-
-template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
-          class V2 = VFromD<Repartition<TFromV<V>, D>>,
-          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
-HWY_API VFromD<D> PromoteOddTo(D d, V v) {
-  return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
-                              hwy::SizeTag<sizeof(TFromD<D>)>(),
-                              hwy::TypeTag<TFromV<V>>(), d, v);
-}
-#endif  // HWY_TARGET != HWY_SCALAR
-
 // ------------------------------ float16_t <-> float
 
 #if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
@@ -2924,7 +2825,7 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
   // We also want to biased exponent of round_incr[i] to be less than or equal
   // to 255 (which is equal to MaxExponentField<float>())
 
-  // The biased
+  // The biased F32 exponent of round_incr is equal to
   // HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126)
 
   // hi9_bits[i] is equal to the upper 9 bits of v[i]
@@ -3010,24 +2911,31 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
   // ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126
 
 #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
+  const auto k157Shl10 = Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10));
   auto f16_exp_bits =
       Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)),
               And(rounded_val_bits,
                   Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))),
-
+          k157Shl10);
+  const auto f16_result_is_inf_mask =
+      RebindMask(df32, Eq(f16_exp_bits, k157Shl10));
 #else
-  auto
+  const auto k157 = Set(du32, uint32_t{157});
+  auto f16_exp_bits = BitCast(
       du32,
       Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits),
                        BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
-          BitCast(du32_as_u8,
+          BitCast(du32_as_u8, k157)));
+  const auto f16_result_is_inf_mask = RebindMask(df32, Eq(f16_exp_bits, k157));
+  f16_exp_bits = ShiftLeft<10>(f16_exp_bits);
 #endif
 
   f16_exp_bits =
       Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10)));
 
   const auto f16_unmasked_mant_bits =
-      BitCast(di32, Or(
+      BitCast(di32, Or(IfThenZeroElse(f16_result_is_inf_mask, rounded_val),
+                       VecFromMask(df32, IsNaN(rounded_val))));
 
   const auto f16_exp_mant_bits =
       OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
@@ -3094,9 +3002,224 @@ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
|
|
|
3094
3002
|
|
|
3095
3003
|
#endif // HWY_NATIVE_PROMOTE_F16_TO_F64
|
|
3096
3004
|
|
|
3005
|
+
// ------------------------------ F32 to BF16 DemoteTo
|
|
3006
|
+
#if (defined(HWY_NATIVE_DEMOTE_F32_TO_BF16) == defined(HWY_TARGET_TOGGLE))
|
|
3007
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
3008
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
3009
|
+
#else
|
|
3010
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
3011
|
+
#endif
|
|
3012
|
+
|
|
3013
|
+
namespace detail {
|
|
3014
|
+
|
|
3015
|
+
// Round a F32 value to the nearest BF16 value, with the result returned as the
|
|
3016
|
+
// rounded F32 value bitcasted to an U32
|
|
3017
|
+
|
|
3018
|
+
// RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
|
|
3019
|
+
// NaN F32 values from being converted to an infinity
|
|
3020
|
+
template <class V, HWY_IF_F32(TFromV<V>)>
|
|
3021
|
+
HWY_INLINE VFromD<RebindToUnsigned<DFromV<V>>> RoundF32ForDemoteToBF16(V v) {
|
|
3022
|
+
const DFromV<decltype(v)> d;
|
|
3023
|
+
const RebindToUnsigned<decltype(d)> du32;
|
|
3024
|
+
|
|
3025
|
+
const auto is_non_nan = Not(IsNaN(v));
|
|
3026
|
+
const auto bits32 = BitCast(du32, v);
|
|
3027
|
+
|
|
3028
|
+
const auto round_incr =
|
|
3029
|
+
Add(And(ShiftRight<16>(bits32), Set(du32, uint32_t{1})),
|
|
3030
|
+
Set(du32, uint32_t{0x7FFFu}));
|
|
3031
|
+
return MaskedAddOr(Or(bits32, Set(du32, uint32_t{0x00400000u})),
|
|
3032
|
+
RebindMask(du32, is_non_nan), bits32, round_incr);
|
|
3033
|
+
}
|
|
3034
|
+
|
|
3035
|
+
} // namespace detail
|
|
3036
|
+
|
|
3037
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
3038
|
+
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
|
|
3039
|
+
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
3040
|
+
const Twice<decltype(du16)> dt_u16;
|
|
3041
|
+
|
|
3042
|
+
const auto rounded_bits = BitCast(dt_u16, detail::RoundF32ForDemoteToBF16(v));
|
|
3043
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
3044
|
+
return BitCast(
|
|
3045
|
+
dbf16, LowerHalf(du16, ConcatOdd(dt_u16, rounded_bits, rounded_bits)));
|
|
3046
|
+
#else
|
|
3047
|
+
return BitCast(
|
|
3048
|
+
dbf16, LowerHalf(du16, ConcatEven(dt_u16, rounded_bits, rounded_bits)));
|
|
3049
|
+
#endif
|
|
3050
|
+
}
|
|
3051
|
+
|
|
3052
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
3053
|
+
HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
|
|
3054
|
+
VFromD<Repartition<float, D>> b) {
|
|
3055
|
+
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
3056
|
+
|
|
3057
|
+
const auto rounded_a_bits32 =
|
|
3058
|
+
BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
|
|
3059
|
+
const auto rounded_b_bits32 =
|
|
3060
|
+
BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
|
|
3061
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
3062
|
+
return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, rounded_b_bits32),
|
|
3063
|
+
BitCast(du16, rounded_a_bits32)));
|
|
3064
|
+
#else
|
|
3065
|
+
return BitCast(dbf16, ConcatEven(du16, BitCast(du16, rounded_b_bits32),
|
|
3066
|
+
BitCast(du16, rounded_a_bits32)));
|
|
3067
|
+
#endif
|
|
3068
|
+
}
|
|
3069
|
+
|
|
3070
|
+
template <class D, HWY_IF_BF16_D(D)>
|
|
3071
|
+
HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
|
|
3072
|
+
VFromD<Repartition<float, D>> b) {
|
|
3073
|
+
const RebindToUnsigned<decltype(dbf16)> du16;
|
|
3074
|
+
|
|
3075
|
+
#if HWY_IS_LITTLE_ENDIAN
|
|
3076
|
+
const auto a_in_odd = detail::RoundF32ForDemoteToBF16(a);
|
|
3077
|
+
const auto b_in_even = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(b));
|
|
3078
|
+
#else
|
|
3079
|
+
const auto a_in_odd = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(a));
|
|
3080
|
+
const auto b_in_even = detail::RoundF32ForDemoteToBF16(b);
|
|
3081
|
+
#endif
|
|
3082
|
+
|
|
3083
|
+
return BitCast(dbf16,
|
|
3084
|
+
OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even)));
|
|
3085
|
+
}
|
|
3086
|
+
|
|
3087
|
+
#endif // HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
3088
|
+
+// ------------------------------ PromoteInRangeTo
+#if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \
+     defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+#endif
+
+#if HWY_HAVE_INTEGER64
+template <class D64, HWY_IF_UI64_D(D64)>
+HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
+  return PromoteTo(d64, v);
+}
+#endif
+
+#endif  // HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+
+// ------------------------------ ConvertInRangeTo
+#if (defined(HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+#endif
+
+template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
+          HWY_IF_T_SIZE_ONE_OF_D(DI, (HWY_HAVE_FLOAT16 ? (1 << 2) : 0) |
+                                         (1 << 4) |
+                                         (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
+HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
+  return ConvertTo(di, v);
+}
+
+#endif  // HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
+
+// ------------------------------ DemoteInRangeTo
+#if (defined(HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO) == \
+     defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+#else
+#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+#endif
+
+#if HWY_HAVE_FLOAT64
+template <class D32, HWY_IF_UI32_D(D32)>
+HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
+  return DemoteTo(d32, v);
+}
+#endif
+
+#endif  // HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
+
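Note: unlike DemoteTo/ConvertTo, the InRange variants only guarantee results for inputs that fit the destination type; out-of-range lanes are handled in an implementation-defined way (saturated, truncated, or wrapped, per the comments later in this file). A minimal sketch of safe use, clamping before the demotion; the function name and constants are ours, and a target with f64 support is assumed:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // Demotes f64 lanes to i32 after clamping so every lane is in range.
    hn::Vec<hn::Rebind<int32_t, hn::ScalableTag<double>>> ToI32Clamped(
        hn::Vec<hn::ScalableTag<double>> v) {
      const hn::ScalableTag<double> df64;
      const hn::Rebind<int32_t, decltype(df64)> di32;
      const auto clamped = hn::Clamp(v, hn::Set(df64, -2147483648.0),
                                     hn::Set(df64, 2147483647.0));
      return hn::DemoteInRangeTo(di32, clamped);
    }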
+// ------------------------------ PromoteInRangeLowerTo/PromoteInRangeUpperTo
+
+template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
+HWY_API VFromD<D> PromoteInRangeLowerTo(D d, V v) {
+  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
+  // because it cannot be deduced from D (could be either bf16 or f16).
+  const Rebind<TFromV<V>, decltype(d)> dh;
+  return PromoteInRangeTo(d, LowerHalf(dh, v));
+}
+
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
+HWY_API VFromD<D> PromoteInRangeUpperTo(D d, V v) {
+#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
+     (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
+  // On targets that provide target-specific implementations of F32->UI64
+  // PromoteInRangeTo, promote the upper half of v using PromoteInRangeTo
+
+  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
+  // because it cannot be deduced from D (could be either bf16 or f16).
+  const Rebind<TFromV<V>, decltype(d)> dh;
+  return PromoteInRangeTo(d, UpperHalf(dh, v));
+#else
+  // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
+  // around F32->UI64 PromoteTo, promote the upper half of v to TFromD<D> using
+  // PromoteUpperTo
+  return PromoteUpperTo(d, v);
+#endif
+}
+#endif  // HWY_TARGET != HWY_SCALAR
+
+// ------------------------------ PromoteInRangeEvenTo/PromoteInRangeOddTo
+
+template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
+HWY_API VFromD<D> PromoteInRangeEvenTo(D d, V v) {
+#if HWY_TARGET == HWY_SCALAR
+  return PromoteInRangeTo(d, v);
+#elif (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
+       (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
+  // On targets that provide target-specific implementations of F32->UI64
+  // PromoteInRangeTo, promote the even lanes of v using PromoteInRangeTo
+
+  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
+  // because it cannot be deduced from D (could be either bf16 or f16).
+  const DFromV<decltype(v)> d_from;
+  const Rebind<TFromV<V>, decltype(d)> dh;
+  return PromoteInRangeTo(d, LowerHalf(dh, ConcatEven(d_from, v, v)));
+#else
+  // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
+  // around F32->UI64 PromoteTo, promote the even lanes of v to TFromD<D> using
+  // PromoteEvenTo
+  return PromoteEvenTo(d, v);
+#endif  // HWY_TARGET == HWY_SCALAR
+}
+
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
+HWY_API VFromD<D> PromoteInRangeOddTo(D d, V v) {
+#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
+     (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
+  // On targets that provide target-specific implementations of F32->UI64
+  // PromoteInRangeTo, promote the odd lanes of v using PromoteInRangeTo
+
+  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
+  // because it cannot be deduced from D (could be either bf16 or f16).
+  const DFromV<decltype(v)> d_from;
+  const Rebind<TFromV<V>, decltype(d)> dh;
+  return PromoteInRangeTo(d, LowerHalf(dh, ConcatOdd(d_from, v, v)));
+#else
+  // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
+  // around F32->UI64 PromoteTo, promote the odd lanes of v to TFromD<D> using
+  // PromoteOddTo
+  return PromoteOddTo(d, v);
+#endif
+}
+#endif  // HWY_TARGET != HWY_SCALAR
+
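Note: a sketch of how the Lower/Upper variants can widen a full f32 vector into two i64 vectors. The function name is ours, and a non-scalar target with 64-bit integer lanes is assumed:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    #if HWY_TARGET != HWY_SCALAR
    // Promotes all f32 lanes of `v` to i64 and stores them to `out`, which
    // must hold Lanes(ScalableTag<float>()) elements; values must be in range.
    void PromoteAllToI64(hn::Vec<hn::ScalableTag<float>> v, int64_t* out) {
      const hn::ScalableTag<float> df32;
      const hn::Rebind<int64_t, hn::Half<decltype(df32)>> di64;
      hn::StoreU(hn::PromoteInRangeLowerTo(di64, v), di64, out);
      hn::StoreU(hn::PromoteInRangeUpperTo(di64, v), di64, out + hn::Lanes(di64));
    }
    #endif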
 // ------------------------------ SumsOf2
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 namespace detail {
 
 template <class TypeTag, size_t kLaneSize, class V>
@@ -3220,7 +3343,7 @@ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
   return TruncateTo(d, f32_biased_exp_as_u32);
 }
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
 HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
   const Half<decltype(d)> dh;
@@ -3252,7 +3375,7 @@ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
   return U8FromU32(f32_biased_exp_as_u32);
 }
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4),
           HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 2)>
 HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
@@ -3549,7 +3672,7 @@ HWY_INLINE V InvSubBytes(V state) {
 #endif
 
 // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 
 namespace detail {
 
@@ -3972,12 +4095,11 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
 // MulAddSub for F16/F32/F64 vectors with 2 or more lanes on
 // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and
 // x86_512-inl.h
-
-
-
-
-
-                                              (1 << 8))))>
+
+// MulAddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h
+
+// MulAddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
+template <class V, HWY_IF_MULADDSUB_V(V)>
 HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
   using D = DFromV<V>;
   using T = TFromD<D>;
@@ -4001,9 +4123,17 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
 
 namespace detail {
 
+// DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo are okay to use in
+// the implementation of detail::IntDiv in generic_ops-inl.h as the current
+// implementations of DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo
+// will convert values that are outside of the range of TFromD<DI> by either
+// saturation, truncation, or converting values that are outside of the
+// destination range to LimitsMin<TFromD<DI>>() (which is equal to
+// static_cast<TFromD<DI>>(LimitsMax<TFromD<DI>>() + 1))
+
 template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
 HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) {
-  return
+  return ConvertInRangeTo(di, vf);
 }
 
 template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
@@ -4014,7 +4144,7 @@ HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) {
 #if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
 template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
 HWY_INLINE Vec<D> IntDivConvFloatToInt(D df, V vi) {
-  return
+  return PromoteInRangeTo(df, vi);
 }
 
 // If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32
@@ -4085,8 +4215,13 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
   // the case where the magnitude of an inexact floating point division result
   // is rounded up.
 
-
-
+  // It is okay to do conversions from MakeFloat<TFromV<V>> to TFromV<V> using
+  // ConvertInRangeTo if sizeof(TFromV<V>) > kOrigLaneSize as the result of the
+  // floating point division is always greater than LimitsMin<TFromV<V>>() and
+  // less than LimitsMax<TFromV<V>>() if sizeof(TFromV<V>) > kOrigLaneSize and
+  // b[i] != 0.
+
+#if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64
   // On Armv7, do division by multiplying by the ApproximateReciprocal
   // to avoid unnecessary overhead as F32 Div refines the approximate
   // reciprocal using 4 Newton-Raphson iterations
@@ -4101,7 +4236,7 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
       Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
   }
 
-  auto q0 =
+  auto q0 = ConvertInRangeTo(d, Mul(ConvertTo(df, a), flt_recip_b));
   const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
 
   auto r1 = r0;
@@ -4143,7 +4278,7 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
 #else
   // On targets other than Armv7 NEON, use F16 or F32 division as most targets
   // other than Armv7 NEON have native F32 divide instructions
-  return
+  return ConvertInRangeTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
 #endif
 }
 
@@ -4184,8 +4319,7 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
 
   const auto flt_b = IntDivConvIntToFloat(df, b);
 
-#if
-    !HWY_HAVE_FLOAT64
+#if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64
   auto flt_recip_b = ApproximateReciprocal(flt_b);
   flt_recip_b =
       Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
@@ -4193,10 +4327,40 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
   const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b);
 #endif
 
+  // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
+  // IntDivConvFloatToInt returns incorrect results in any lanes where b[i] == 0
+  // as the result of IntDivUsingFloatDiv(a, b) is implementation-defined in any
+  // lanes where b[i] == 0.
+
+  // If ScalarAbs(b[i]) == 1 is true, then it is possible for
+  // a[i] * flt_recip_b[i] to be rounded up to a value that is outside of the
+  // range of T. If a[i] * flt_recip_b[i] is outside of the range of T,
+  // IntDivConvFloatToInt will convert any values that are out of the range of T
+  // by either saturation, truncation, or wrapping around to LimitsMin<T>().
+
+  // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
+  // IntDivConvFloatToInt wraps around if ScalarAbs(b[i]) == 1 as r0 will have
+  // the correct sign if ScalarAbs(b[i]) == 1, even in the cases where the
+  // conversion of a[i] * flt_recip_b[i] to T using IntDivConvFloatToInt is
+  // truncated or wraps around.
+
+  // If ScalarAbs(b[i]) >= 2 is true, a[i] * flt_recip_b[i] will be within the
+  // range of T, even in the cases where the conversion of a[i] to TF is
+  // rounded up or the result of multiplying a[i] by flt_recip_b[i] is rounded
+  // up.
+
+  // ScalarAbs(r0[i]) will also always be less than (LimitsMax<T>() / 2) if
+  // b[i] != 0, even in the cases where the conversion of a[i] * flt_recip_b[i]
+  // to T using IntDivConvFloatToInt is truncated or is wrapped around.
+
   auto q0 =
       IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
   const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
 
+  // If b[i] != 0 is true, r0[i] * flt_recip_b[i] is always within the range of
+  // T, even in the cases where the conversion of r0[i] to TF is rounded up or
+  // the multiplication of r0[i] by flt_recip_b[i] is rounded up.
+
   auto q1 =
       IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
   const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);
@@ -4380,7 +4544,12 @@ HWY_INLINE V IntDiv(V a, V b) {
   const DFromV<decltype(a)> d;
   const Rebind<double, decltype(d)> df64;
 
-
+  // It is okay to demote the F64 Div result to int32_t or uint32_t using
+  // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
+  // will always be within the range of TFromV<V> if b[i] != 0 and
+  // sizeof(TFromV<V>) <= 4.
+
+  return DemoteInRangeTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
 }
 template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
           HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
@@ -4389,9 +4558,16 @@ HWY_INLINE V IntDiv(V a, V b) {
   const Half<decltype(d)> dh;
   const Repartition<double, decltype(d)> df64;
 
-
-
-
+  // It is okay to demote the F64 Div result to int32_t or uint32_t using
+  // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
+  // will always be within the range of TFromV<V> if b[i] != 0 and
+  // sizeof(TFromV<V>) <= 4.
+
+  const VFromD<decltype(df64)> div1 =
+      Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b));
+  const VFromD<decltype(df64)> div0 =
+      Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b));
+  return Combine(d, DemoteInRangeTo(dh, div1), DemoteInRangeTo(dh, div0));
 }
 #endif  // HWY_HAVE_FLOAT64
 
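Note: the IntDiv helpers above back the generic integer division ops added in this release (Div/Mod and the operator wrappers such as operator%). A hedged usage sketch; lanes with a zero divisor are implementation-defined, and the function name, array layout, and the assumption that `n` is a multiple of the vector length are ours:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // Element-wise quotient and remainder of int32 arrays.
    void DivMod(const int32_t* a, const int32_t* b, int32_t* q, int32_t* r,
                size_t n) {
      const hn::ScalableTag<int32_t> d;
      for (size_t i = 0; i < n; i += hn::Lanes(d)) {
        const auto va = hn::LoadU(d, a + i);
        const auto vb = hn::LoadU(d, b + i);
        hn::StoreU(hn::Div(va, vb), d, q + i);
        hn::StoreU(hn::Mod(va, vb), d, r + i);
      }
    }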
@@ -4479,6 +4655,96 @@ HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
 
 #endif  // HWY_NATIVE_INT_DIV
 
+// ------------------------------ MulEvenAdd (PromoteEvenTo)
+
+// SVE with bf16 and NEON with bf16 override this.
+#if (defined(HWY_NATIVE_MUL_EVEN_BF16) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
+#else
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+template <class DF, HWY_IF_F32_D(DF),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> MulEvenAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
+  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), c);
+}
+
+template <class DF, HWY_IF_F32_D(DF),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> MulOddAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
+  return MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), c);
+}
+
+#endif  // HWY_NATIVE_MUL_EVEN_BF16
+
+// ------------------------------ ReorderWidenMulAccumulate (MulEvenAdd)
+
+// AVX3_SPR/ZEN4, and NEON with bf16 but not(!) SVE override this.
+#if (defined(HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16) == \
+     defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
+template <class DF, HWY_IF_F32_D(DF),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
+                                             VFromD<DF> sum0,
+                                             VFromD<DF>& sum1) {
+  // Lane order within sum0/1 is undefined, hence we can avoid the
+  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
+  sum1 = MulOddAdd(df, a, b, sum1);
+  return MulEvenAdd(df, a, b, sum0);
+}
+
+#endif  // HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+
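Note: a common use of the bf16 path above is a dot product accumulated in f32. Minimal sketch; the function name and the assumption that `n` is a multiple of the bf16 vector length are ours:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    float DotBF16(const hwy::bfloat16_t* a, const hwy::bfloat16_t* b, size_t n) {
      const hn::ScalableTag<float> df;
      const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf;
      auto sum0 = hn::Zero(df);
      auto sum1 = hn::Zero(df);  // second accumulator; lane order is unspecified
      for (size_t i = 0; i < n; i += hn::Lanes(dbf)) {
        sum0 = hn::ReorderWidenMulAccumulate(df, hn::LoadU(dbf, a + i),
                                             hn::LoadU(dbf, b + i), sum0, sum1);
      }
      return hn::GetLane(hn::SumOfLanes(df, hn::Add(sum0, sum1)));
    }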
+// ------------------------------ WidenMulAccumulate
+
+#if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
+#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
+#else
+#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
+#endif
+
+template <class D, HWY_IF_INTEGER(TFromD<D>),
+          class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
+                                     VFromD<D> low, VFromD<D>& high) {
+  high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high);
+  return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
+}
+
+#endif  // HWY_NATIVE_WIDEN_MUL_ACCUMULATE
+
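Note: WidenMulAccumulate returns the updated lower-half accumulator and updates `high` in place. A brief sketch for int16 -> int32 accumulation; the wrapper name is ours:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // Adds the widened products of `mul` and `x` into the low/high accumulators.
    void AccumulateWidenedI16(hn::Vec<hn::ScalableTag<int16_t>> mul,
                              hn::Vec<hn::ScalableTag<int16_t>> x,
                              hn::Vec<hn::ScalableTag<int32_t>>& low,
                              hn::Vec<hn::ScalableTag<int32_t>>& high) {
      const hn::ScalableTag<int32_t> d32;
      low = hn::WidenMulAccumulate(d32, mul, x, low, high);
    }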
+#if 0
+#if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16) == defined(HWY_TARGET_TOGGLE))
+
+#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
+#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
+#else
+#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
+#endif
+
+#if HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
+                                     VFromD<D> low, VFromD<D>& high) {
+  high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high);
+  return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
+#endif  // HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
+#endif  // #if 0
+
 // ------------------------------ SatWidenMulPairwiseAdd
 
 #if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \
@@ -4509,6 +4775,66 @@ HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
 
 #endif
 
+// ------------------------------ SatWidenMulPairwiseAccumulate
+
+#if (defined(HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM) == \
+     defined(HWY_TARGET_TOGGLE))
+
+#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
+#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
+#else
+#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
+#endif
+
+template <class DI32, HWY_IF_I32_D(DI32)>
+HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
+    DI32 di32, VFromD<Repartition<int16_t, DI32>> a,
+    VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
+  // WidenMulPairwiseAdd(di32, a, b) is okay here as
+  // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as
+  // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if
+  // a[0], b[0], a[1], and b[1] are all equal to -32768.
+
+  const auto product = WidenMulPairwiseAdd(di32, a, b);
+
+  const auto mul_overflow =
+      VecFromMask(di32, Eq(product, Set(di32, LimitsMin<int32_t>())));
+
+  return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
+                      Add(product, mul_overflow));
+}
+
+#endif  // HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
+
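Note: the overflow bound cited in the comment above can be checked with scalar arithmetic; the helper below is ours and only documents the bound:

    #include <cstdint>

    // For int16 inputs, a0*b0 + a1*b1 lies in [-2147418112, 2147483648].
    // Only a0 == a1 == b0 == b1 == -32768 reaches 2147483648 (INT32_MAX + 1),
    // which is why the vector code detects that single case via LimitsMin.
    int64_t PairwiseDot(int16_t a0, int16_t a1, int16_t b0, int16_t b1) {
      return int64_t{a0} * b0 + int64_t{a1} * b1;
    }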
+// ------------------------------ SatWidenMulAccumFixedPoint
+
+#if (defined(HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT) == \
+     defined(HWY_TARGET_TOGGLE))
+
+#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+#else
+#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+#endif
+
+template <class DI32, HWY_IF_I32_D(DI32)>
+HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
+                                                VFromD<Rebind<int16_t, DI32>> a,
+                                                VFromD<Rebind<int16_t, DI32>> b,
+                                                VFromD<DI32> sum) {
+  const Repartition<int16_t, DI32> dt_i16;
+
+  const auto vt_a = ResizeBitCast(dt_i16, a);
+  const auto vt_b = ResizeBitCast(dt_i16, b);
+
+  const auto dup_a = InterleaveWholeLower(dt_i16, vt_a, vt_a);
+  const auto dup_b = InterleaveWholeLower(dt_i16, vt_b, vt_b);
+
+  return SatWidenMulPairwiseAccumulate(di32, dup_a, dup_b, sum);
+}
+
+#endif  // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+
 // ------------------------------ SumOfMulQuadAccumulate
 
 #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
@@ -5588,9 +5914,7 @@ using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned<D>())));
 
 // RVV/SVE have their own implementations of
 // TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, IndicesFromD<D> idx)
-#if HWY_TARGET != HWY_RVV &&
-    HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
-    HWY_TARGET != HWY_SVE2_128
+#if HWY_TARGET != HWY_RVV && !HWY_TARGET_IS_SVE
 template <class D>
 HWY_API VFromD<D> TwoTablesLookupLanes(D /*d*/, VFromD<D> a, VFromD<D> b,
                                        IndicesFromD<D> idx) {
@@ -5780,7 +6104,7 @@ HWY_API V ReverseBits(V v) {
 #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
 #endif
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 namespace detail {
 
 template <class D>
@@ -5794,7 +6118,7 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
   const ScalableTag<uint32_t, kLoadPow2> d_load;
 #else
   constexpr size_t kMaxBytes = d.MaxBytes();
-#if
+#if HWY_TARGET_IS_NEON
   constexpr size_t kMinLanesToLoad = 2;
 #else
   constexpr size_t kMinLanesToLoad = 4;
@@ -5811,7 +6135,7 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
 
 #endif  // HWY_NATIVE_PER4LANEBLKSHUF_DUP32
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 namespace detail {
 
 template <class V>
@@ -5863,8 +6187,7 @@ HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3,
       d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0)));
 }
 
-#if HWY_HAVE_SCALABLE || HWY_TARGET ==
-    HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_EMU128
+#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_EMU128
 #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr
 #else
 #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8)
@@ -5965,7 +6288,7 @@ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
   const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
   const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
   const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
-#if
+#if HWY_TARGET_IS_NEON
  constexpr size_t kMinLanesToLoad = 4;
 #else
  constexpr size_t kMinLanesToLoad = 8;
@@ -6195,7 +6518,7 @@ HWY_API V Per4LaneBlockShuffle(V v) {
   return v;
 }
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
           HWY_IF_LANES_D(DFromV<V>, 2)>
 HWY_API V Per4LaneBlockShuffle(V v) {
@@ -6294,7 +6617,7 @@ HWY_API VFromD<D> Slide1Down(D d, VFromD<D> /*v*/) {
   return Zero(d);
 }
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
 HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
   return ShiftLeftLanes<1>(d, v);
@@ -6343,6 +6666,37 @@ HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
 }
 #endif
 
+// ------------------------------ Slide mask up/down
+#if (defined(HWY_NATIVE_SLIDE_MASK) == defined(HWY_TARGET_TOGGLE))
+
+#ifdef HWY_NATIVE_SLIDE_MASK
+#undef HWY_NATIVE_SLIDE_MASK
+#else
+#define HWY_NATIVE_SLIDE_MASK
+#endif
+
+template <class D>
+HWY_API Mask<D> SlideMask1Up(D d, Mask<D> m) {
+  return MaskFromVec(Slide1Up(d, VecFromMask(d, m)));
+}
+
+template <class D>
+HWY_API Mask<D> SlideMask1Down(D d, Mask<D> m) {
+  return MaskFromVec(Slide1Down(d, VecFromMask(d, m)));
+}
+
+template <class D>
+HWY_API Mask<D> SlideMaskUpLanes(D d, Mask<D> m, size_t amt) {
+  return MaskFromVec(SlideUpLanes(d, VecFromMask(d, m), amt));
+}
+
+template <class D>
+HWY_API Mask<D> SlideMaskDownLanes(D d, Mask<D> m, size_t amt) {
+  return MaskFromVec(SlideDownLanes(d, VecFromMask(d, m), amt));
+}
+
+#endif  // HWY_NATIVE_SLIDE_MASK
+
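Note: the mask-slide wrappers make it straightforward to shift a predicate by whole lanes. Minimal sketch; the predicate and the function name are ours:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // Returns a mask that is set in lane i iff lane i-1 of `v` equals zero.
    hn::Mask<hn::ScalableTag<int32_t>> MaskAfterZero(
        hn::Vec<hn::ScalableTag<int32_t>> v) {
      const hn::ScalableTag<int32_t> d;
      return hn::SlideMask1Up(d, hn::Eq(v, hn::Zero(d)));
    }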
 // ------------------------------ SumsOfAdjQuadAbsDiff
 
 #if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
@@ -6353,7 +6707,7 @@ HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
 #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
 #endif
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
 HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
   static_assert(0 <= kAOffset && kAOffset <= 1,
@@ -6377,8 +6731,7 @@ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
   // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
   constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
   const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
-#elif HWY_HAVE_SCALABLE ||
-    HWY_TARGET == HWY_SVE2_128
+#elif HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
   // On SVE targets, Lanes(d8_interleave) >= 16 and
   // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
   // tag for a full u8/i8 vector on SVE.
@@ -6457,7 +6810,7 @@ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
 #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
 #endif
 
-#if HWY_TARGET != HWY_SCALAR
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
 template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
           HWY_IF_UI8_D(DFromV<V8>)>
 HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
@@ -6499,7 +6852,7 @@ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
      a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
      a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
      a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
-#if HWY_HAVE_SCALABLE ||
+#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
   // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
   // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
   // lanes that are shifted into an adjacent 16-byte block as any lanes that are
@@ -6539,6 +6892,56 @@ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
 
 #endif  // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
 
+// ------------------------------ BitShuffle (Rol)
+#if (defined(HWY_NATIVE_BITSHUFFLE) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_BITSHUFFLE
+#undef HWY_NATIVE_BITSHUFFLE
+#else
+#define HWY_NATIVE_BITSHUFFLE
+#endif
+
+#if HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>)>
+HWY_API V BitShuffle(V v, VI idx) {
+  const DFromV<decltype(v)> d64;
+  const RebindToUnsigned<decltype(d64)> du64;
+  const Repartition<uint8_t, decltype(d64)> du8;
+
+#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
+    HWY_TARGET == HWY_WASM_EMU256
+  const Repartition<uint16_t, decltype(d64)> d_idx_shr;
+#else
+  const Repartition<uint8_t, decltype(d64)> d_idx_shr;
+#endif
+
+#if HWY_IS_LITTLE_ENDIAN
+  constexpr uint64_t kExtractedBitsMask =
+      static_cast<uint64_t>(0x8040201008040201u);
+#else
+  constexpr uint64_t kExtractedBitsMask =
+      static_cast<uint64_t>(0x0102040810204080u);
+#endif
+
+  const auto byte_idx = BitwiseIfThenElse(
+      Set(du8, uint8_t{0x07}),
+      BitCast(du8, ShiftRight<3>(BitCast(d_idx_shr, idx))),
+      BitCast(du8, Dup128VecFromValues(du64, uint64_t{0},
+                                       uint64_t{0x0808080808080808u})));
+  // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
+  // and left by iota & 7 to put it in the correct output bit. To correctly
+  // handle shift counts from -7 to 7, we rotate.
+  const auto rotate_left_bits = Sub(Iota(du8, uint8_t{0}), BitCast(du8, idx));
+
+  const auto extracted_bits =
+      And(Rol(TableLookupBytes(v, byte_idx), rotate_left_bits),
+          BitCast(du8, Set(du64, kExtractedBitsMask)));
+  // Combine bit-sliced (one bit per byte) into one 64-bit sum.
+  return BitCast(d64, SumsOf8(extracted_bits));
+}
+#endif  // HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR
+
+#endif  // HWY_NATIVE_BITSHUFFLE
+
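Note: per our reading of the generic path above, BitShuffle gathers, for each 64-bit lane, the eight bits selected by that lane's index bytes into the low byte of the result. A scalar model, ours and for illustration only:

    #include <cstdint>

    // Bit i (i < 8) of the result is bit idx[i] of the 64-bit input lane.
    uint64_t BitShuffleLane(uint64_t v, const uint8_t idx[8]) {
      uint64_t out = 0;
      for (int i = 0; i < 8; ++i) {
        out |= ((v >> (idx[i] & 63)) & uint64_t{1}) << i;
      }
      return out;
    }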
 // ================================================== Operator wrapper
 
 // SVE* and RVV currently cannot define operators and have already defined