@img/sharp-libvips-dev 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +7 -1
- package/include/aom/aom_image.h +24 -12
- package/include/aom/aom_integer.h +3 -3
- package/include/aom/aomcx.h +15 -0
- package/include/aom/aomdx.h +5 -2
- package/include/archive.h +7 -5
- package/include/archive_entry.h +5 -3
- package/include/cgif.h +3 -0
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/fribidi/fribidi-config.h +2 -2
- package/include/fribidi/fribidi-unicode-version.h +3 -3
- package/include/glib-2.0/gio/gappinfo.h +40 -25
- package/include/glib-2.0/gio/gasyncresult.h +1 -1
- package/include/glib-2.0/gio/gconverter.h +5 -0
- package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
- package/include/glib-2.0/gio/gfile.h +16 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gsettings.h +8 -0
- package/include/glib-2.0/gio/gvfs.h +2 -2
- package/include/glib-2.0/girepository/gi-visibility.h +34 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
- package/include/glib-2.0/glib/giochannel.h +2 -2
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +12 -5
- package/include/glib-2.0/glib/gmain.h +93 -7
- package/include/glib-2.0/glib/gqsort.h +8 -1
- package/include/glib-2.0/glib/gstrfuncs.h +0 -12
- package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
- package/include/glib-2.0/glib/gunicode.h +1 -1
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gtype.h +6 -6
- package/include/harfbuzz/hb-buffer.h +6 -0
- package/include/harfbuzz/hb-common.h +6 -9
- package/include/harfbuzz/hb-cplusplus.hh +8 -11
- package/include/harfbuzz/hb-subset.h +17 -4
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +28 -0
- package/include/hwy/aligned_allocator.h +48 -1
- package/include/hwy/base.h +235 -34
- package/include/hwy/detect_compiler_arch.h +84 -10
- package/include/hwy/detect_targets.h +95 -29
- package/include/hwy/foreach_target.h +12 -1
- package/include/hwy/highway.h +205 -50
- package/include/hwy/ops/arm_neon-inl.h +841 -99
- package/include/hwy/ops/arm_sve-inl.h +413 -141
- package/include/hwy/ops/emu128-inl.h +373 -360
- package/include/hwy/ops/generic_ops-inl.h +804 -401
- package/include/hwy/ops/inside-inl.h +691 -0
- package/include/hwy/ops/ppc_vsx-inl.h +456 -166
- package/include/hwy/ops/rvv-inl.h +537 -249
- package/include/hwy/ops/scalar-inl.h +169 -79
- package/include/hwy/ops/set_macros-inl.h +106 -18
- package/include/hwy/ops/shared-inl.h +23 -0
- package/include/hwy/ops/wasm_128-inl.h +130 -108
- package/include/hwy/ops/x86_128-inl.h +1892 -577
- package/include/hwy/ops/x86_256-inl.h +625 -184
- package/include/hwy/ops/x86_512-inl.h +733 -131
- package/include/hwy/targets.h +22 -21
- package/include/hwy/timer-inl.h +3 -3
- package/include/hwy/timer.h +5 -1
- package/include/libheif/heif.h +170 -15
- package/include/libheif/heif_items.h +237 -0
- package/include/libheif/heif_properties.h +38 -2
- package/include/libheif/heif_regions.h +1 -1
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/pnglibconf.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
- package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
- package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
- package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
- package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
- package/include/libxml2/libxml/HTMLparser.h +12 -19
- package/include/libxml2/libxml/c14n.h +1 -12
- package/include/libxml2/libxml/debugXML.h +1 -1
- package/include/libxml2/libxml/encoding.h +9 -0
- package/include/libxml2/libxml/entities.h +12 -1
- package/include/libxml2/libxml/hash.h +19 -0
- package/include/libxml2/libxml/list.h +2 -2
- package/include/libxml2/libxml/nanohttp.h +17 -0
- package/include/libxml2/libxml/parser.h +61 -55
- package/include/libxml2/libxml/parserInternals.h +9 -1
- package/include/libxml2/libxml/pattern.h +6 -0
- package/include/libxml2/libxml/tree.h +32 -12
- package/include/libxml2/libxml/uri.h +11 -0
- package/include/libxml2/libxml/valid.h +29 -2
- package/include/libxml2/libxml/xinclude.h +7 -0
- package/include/libxml2/libxml/xmlIO.h +21 -4
- package/include/libxml2/libxml/xmlerror.h +14 -0
- package/include/libxml2/libxml/xmlexports.h +111 -15
- package/include/libxml2/libxml/xmlmemory.h +8 -45
- package/include/libxml2/libxml/xmlreader.h +2 -0
- package/include/libxml2/libxml/xmlsave.h +5 -0
- package/include/libxml2/libxml/xmlunicode.h +165 -1
- package/include/libxml2/libxml/xmlversion.h +15 -179
- package/include/libxml2/libxml/xmlwriter.h +1 -0
- package/include/libxml2/libxml/xpath.h +4 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-item.h +4 -2
- package/include/pango-1.0/pango/pango-version-macros.h +25 -0
- package/include/pango-1.0/pango/pangofc-font.h +2 -1
- package/include/pnglibconf.h +1 -1
- package/include/vips/util.h +1 -2
- package/include/vips/version.h +4 -4
- package/include/webp/decode.h +58 -56
- package/include/webp/demux.h +25 -21
- package/include/webp/encode.h +44 -39
- package/include/webp/mux.h +76 -15
- package/include/webp/mux_types.h +2 -1
- package/include/webp/sharpyuv/sharpyuv.h +77 -8
- package/include/webp/types.h +29 -8
- package/include/zconf.h +1 -1
- package/include/zlib.h +12 -12
- package/package.json +1 -1
- package/versions.json +14 -15
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
// External include guard in highway.h - see comment there.
|
|
18
18
|
|
|
19
19
|
#include "hwy/base.h"
|
|
20
|
+
|
|
20
21
|
#ifndef HWY_NO_LIBCXX
|
|
21
22
|
#include <math.h> // sqrtf
|
|
22
23
|
#endif
|
|
@@ -103,9 +104,6 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
|
|
|
103
104
|
template <class D>
|
|
104
105
|
using VFromD = decltype(Zero(D()));
|
|
105
106
|
|
|
106
|
-
// ------------------------------ Tuple (VFromD)
|
|
107
|
-
#include "hwy/ops/tuple-inl.h"
|
|
108
|
-
|
|
109
107
|
// ------------------------------ BitCast
|
|
110
108
|
|
|
111
109
|
template <class D, class VFrom>
|
|
@@ -355,9 +353,8 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
|
|
|
355
353
|
// ------------------------------ BroadcastSignBit
|
|
356
354
|
template <typename T, size_t N>
|
|
357
355
|
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
|
|
358
|
-
// This is used inside ShiftRight, so we cannot implement in terms of it.
|
|
359
356
|
for (size_t i = 0; i < N; ++i) {
|
|
360
|
-
v.raw[i] =
|
|
357
|
+
v.raw[i] = ScalarShr(v.raw[i], sizeof(T) * 8 - 1);
|
|
361
358
|
}
|
|
362
359
|
return v;
|
|
363
360
|
}
|
|
@@ -431,12 +428,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
|
|
|
431
428
|
return v;
|
|
432
429
|
}
|
|
433
430
|
|
|
434
|
-
template <typename T, size_t N>
|
|
435
|
-
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
|
|
436
|
-
const DFromV<decltype(v)> d;
|
|
437
|
-
return IfNegativeThenElse(v, Zero(d), v);
|
|
438
|
-
}
|
|
439
|
-
|
|
440
431
|
// ------------------------------ Mask logical
|
|
441
432
|
|
|
442
433
|
template <typename T, size_t N>
|
|
@@ -494,41 +485,26 @@ HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
|
|
|
494
485
|
template <int kBits, typename T, size_t N>
|
|
495
486
|
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
|
|
496
487
|
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
|
|
497
|
-
#if __cplusplus >= 202002L
|
|
498
488
|
// Signed right shift is now guaranteed to be arithmetic (rounding toward
|
|
499
489
|
// negative infinity, i.e. shifting in the sign bit).
|
|
500
490
|
for (size_t i = 0; i < N; ++i) {
|
|
501
|
-
v.raw[i] =
|
|
491
|
+
v.raw[i] = ScalarShr(v.raw[i], kBits);
|
|
502
492
|
}
|
|
503
|
-
|
|
504
|
-
if (IsSigned<T>()) {
|
|
505
|
-
// Emulate arithmetic shift using only logical (unsigned) shifts, because
|
|
506
|
-
// signed shifts are still implementation-defined.
|
|
507
|
-
using TU = hwy::MakeUnsigned<T>;
|
|
508
|
-
for (size_t i = 0; i < N; ++i) {
|
|
509
|
-
const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
|
|
510
|
-
const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
|
|
511
|
-
const size_t sign_shift =
|
|
512
|
-
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
|
|
513
|
-
const TU upper = static_cast<TU>(sign << sign_shift);
|
|
514
|
-
v.raw[i] = static_cast<T>(shifted | upper);
|
|
515
|
-
}
|
|
516
|
-
} else { // T is unsigned
|
|
517
|
-
for (size_t i = 0; i < N; ++i) {
|
|
518
|
-
v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
|
|
519
|
-
}
|
|
520
|
-
}
|
|
521
|
-
#endif
|
|
493
|
+
|
|
522
494
|
return v;
|
|
523
495
|
}
|
|
524
496
|
|
|
525
497
|
// ------------------------------ RotateRight (ShiftRight)
|
|
526
|
-
template <int kBits, typename T, size_t N>
|
|
498
|
+
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
527
499
|
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
|
|
500
|
+
const DFromV<decltype(v)> d;
|
|
501
|
+
const RebindToUnsigned<decltype(d)> du;
|
|
502
|
+
|
|
528
503
|
constexpr size_t kSizeInBits = sizeof(T) * 8;
|
|
529
504
|
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
|
|
530
505
|
if (kBits == 0) return v;
|
|
531
|
-
|
|
506
|
+
|
|
507
|
+
return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
|
|
532
508
|
ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
|
|
533
509
|
}
|
|
534
510
|
|
|
@@ -545,31 +521,10 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
|
|
|
545
521
|
|
|
546
522
|
template <typename T, size_t N>
|
|
547
523
|
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
|
|
548
|
-
#if __cplusplus >= 202002L
|
|
549
|
-
// Signed right shift is now guaranteed to be arithmetic (rounding toward
|
|
550
|
-
// negative infinity, i.e. shifting in the sign bit).
|
|
551
524
|
for (size_t i = 0; i < N; ++i) {
|
|
552
|
-
v.raw[i] =
|
|
553
|
-
}
|
|
554
|
-
#else
|
|
555
|
-
if (IsSigned<T>()) {
|
|
556
|
-
// Emulate arithmetic shift using only logical (unsigned) shifts, because
|
|
557
|
-
// signed shifts are still implementation-defined.
|
|
558
|
-
using TU = hwy::MakeUnsigned<T>;
|
|
559
|
-
for (size_t i = 0; i < N; ++i) {
|
|
560
|
-
const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
|
|
561
|
-
const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
|
|
562
|
-
const size_t sign_shift =
|
|
563
|
-
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
|
|
564
|
-
const TU upper = static_cast<TU>(sign << sign_shift);
|
|
565
|
-
v.raw[i] = static_cast<T>(shifted | upper);
|
|
566
|
-
}
|
|
567
|
-
} else {
|
|
568
|
-
for (size_t i = 0; i < N; ++i) {
|
|
569
|
-
v.raw[i] = static_cast<T>(v.raw[i] >> bits); // unsigned, logical shift
|
|
570
|
-
}
|
|
525
|
+
v.raw[i] = ScalarShr(v.raw[i], bits);
|
|
571
526
|
}
|
|
572
|
-
|
|
527
|
+
|
|
573
528
|
return v;
|
|
574
529
|
}
|
|
575
530
|
|
|
@@ -587,32 +542,10 @@ HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
|
|
|
587
542
|
|
|
588
543
|
template <typename T, size_t N>
|
|
589
544
|
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
|
|
590
|
-
#if __cplusplus >= 202002L
|
|
591
|
-
// Signed right shift is now guaranteed to be arithmetic (rounding toward
|
|
592
|
-
// negative infinity, i.e. shifting in the sign bit).
|
|
593
545
|
for (size_t i = 0; i < N; ++i) {
|
|
594
|
-
v.raw[i] =
|
|
595
|
-
}
|
|
596
|
-
#else
|
|
597
|
-
if (IsSigned<T>()) {
|
|
598
|
-
// Emulate arithmetic shift using only logical (unsigned) shifts, because
|
|
599
|
-
// signed shifts are still implementation-defined.
|
|
600
|
-
using TU = hwy::MakeUnsigned<T>;
|
|
601
|
-
for (size_t i = 0; i < N; ++i) {
|
|
602
|
-
const TU shifted =
|
|
603
|
-
static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
|
|
604
|
-
const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
|
|
605
|
-
const size_t sign_shift = static_cast<size_t>(
|
|
606
|
-
static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
|
|
607
|
-
const TU upper = static_cast<TU>(sign << sign_shift);
|
|
608
|
-
v.raw[i] = static_cast<T>(shifted | upper);
|
|
609
|
-
}
|
|
610
|
-
} else { // T is unsigned
|
|
611
|
-
for (size_t i = 0; i < N; ++i) {
|
|
612
|
-
v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
|
|
613
|
-
}
|
|
546
|
+
v.raw[i] = ScalarShr(v.raw[i], static_cast<int>(bits.raw[i]));
|
|
614
547
|
}
|
|
615
|
-
|
|
548
|
+
|
|
616
549
|
return v;
|
|
617
550
|
}
|
|
618
551
|
|
|
@@ -890,26 +823,36 @@ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
|
|
|
890
823
|
return a;
|
|
891
824
|
}
|
|
892
825
|
|
|
893
|
-
// Returns the upper
|
|
894
|
-
template <size_t N
|
|
895
|
-
|
|
826
|
+
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
|
|
827
|
+
template <class T, size_t N,
|
|
828
|
+
HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
|
|
829
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
|
|
830
|
+
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
|
|
831
|
+
using TW = MakeWide<T>;
|
|
896
832
|
for (size_t i = 0; i < N; ++i) {
|
|
897
|
-
a.raw[i] = static_cast<
|
|
833
|
+
a.raw[i] = static_cast<T>(
|
|
834
|
+
(static_cast<TW>(a.raw[i]) * static_cast<TW>(b.raw[i])) >>
|
|
835
|
+
(sizeof(T) * 8));
|
|
898
836
|
}
|
|
899
837
|
return a;
|
|
900
838
|
}
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
839
|
+
|
|
840
|
+
template <class T, HWY_IF_UI64(T)>
|
|
841
|
+
HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
|
|
842
|
+
T hi;
|
|
843
|
+
Mul128(GetLane(a), GetLane(b), &hi);
|
|
844
|
+
return Set(Full64<T>(), hi);
|
|
845
|
+
}
|
|
846
|
+
|
|
847
|
+
template <class T, HWY_IF_UI64(T)>
|
|
848
|
+
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
|
|
849
|
+
T hi_0;
|
|
850
|
+
T hi_1;
|
|
851
|
+
|
|
852
|
+
Mul128(GetLane(a), GetLane(b), &hi_0);
|
|
853
|
+
Mul128(ExtractLane(a, 1), ExtractLane(b, 1), &hi_1);
|
|
854
|
+
|
|
855
|
+
return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
|
|
913
856
|
}
|
|
914
857
|
|
|
915
858
|
template <size_t N>
|
|
@@ -1457,6 +1400,183 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
|
|
|
1457
1400
|
CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
|
|
1458
1401
|
}
|
|
1459
1402
|
|
|
1403
|
+
// ================================================== COMBINE
|
|
1404
|
+
|
|
1405
|
+
template <typename T, size_t N>
|
|
1406
|
+
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
|
|
1407
|
+
Vec128<T, N / 2> ret;
|
|
1408
|
+
CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
|
|
1409
|
+
return ret;
|
|
1410
|
+
}
|
|
1411
|
+
|
|
1412
|
+
template <class D>
|
|
1413
|
+
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
|
|
1414
|
+
return LowerHalf(v);
|
|
1415
|
+
}
|
|
1416
|
+
|
|
1417
|
+
template <class D>
|
|
1418
|
+
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
|
|
1419
|
+
VFromD<D> ret;
|
|
1420
|
+
CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
|
|
1421
|
+
return ret;
|
|
1422
|
+
}
|
|
1423
|
+
|
|
1424
|
+
template <class D>
|
|
1425
|
+
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
|
|
1426
|
+
const Half<decltype(d)> dh;
|
|
1427
|
+
VFromD<D> ret; // zero-initialized
|
|
1428
|
+
CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
|
|
1429
|
+
return ret;
|
|
1430
|
+
}
|
|
1431
|
+
|
|
1432
|
+
template <class D, class VH = VFromD<Half<D>>>
|
|
1433
|
+
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
|
|
1434
|
+
const Half<decltype(d)> dh;
|
|
1435
|
+
VFromD<D> ret;
|
|
1436
|
+
CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
|
|
1437
|
+
CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
|
|
1438
|
+
return ret;
|
|
1439
|
+
}
|
|
1440
|
+
|
|
1441
|
+
template <class D>
|
|
1442
|
+
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1443
|
+
const Half<decltype(d)> dh;
|
|
1444
|
+
VFromD<D> ret;
|
|
1445
|
+
CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
|
|
1446
|
+
CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
|
|
1447
|
+
return ret;
|
|
1448
|
+
}
|
|
1449
|
+
|
|
1450
|
+
template <class D>
|
|
1451
|
+
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1452
|
+
const Half<decltype(d)> dh;
|
|
1453
|
+
VFromD<D> ret;
|
|
1454
|
+
CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
|
|
1455
|
+
CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
|
|
1456
|
+
return ret;
|
|
1457
|
+
}
|
|
1458
|
+
|
|
1459
|
+
template <class D>
|
|
1460
|
+
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1461
|
+
const Half<decltype(d)> dh;
|
|
1462
|
+
VFromD<D> ret;
|
|
1463
|
+
CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
|
|
1464
|
+
CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
|
|
1465
|
+
return ret;
|
|
1466
|
+
}
|
|
1467
|
+
|
|
1468
|
+
template <class D>
|
|
1469
|
+
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1470
|
+
const Half<decltype(d)> dh;
|
|
1471
|
+
VFromD<D> ret;
|
|
1472
|
+
CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
|
|
1473
|
+
CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
|
|
1474
|
+
return ret;
|
|
1475
|
+
}
|
|
1476
|
+
|
|
1477
|
+
template <class D>
|
|
1478
|
+
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1479
|
+
const Half<decltype(d)> dh;
|
|
1480
|
+
VFromD<D> ret;
|
|
1481
|
+
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1482
|
+
ret.raw[i] = lo.raw[2 * i];
|
|
1483
|
+
}
|
|
1484
|
+
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1485
|
+
ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
|
|
1486
|
+
}
|
|
1487
|
+
return ret;
|
|
1488
|
+
}
|
|
1489
|
+
|
|
1490
|
+
// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
|
|
1491
|
+
// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
|
|
1492
|
+
#if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
|
|
1493
|
+
#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
|
|
1494
|
+
#else
|
|
1495
|
+
#define HWY_EMU128_CONCAT_INLINE HWY_API
|
|
1496
|
+
#endif
|
|
1497
|
+
|
|
1498
|
+
template <class D>
|
|
1499
|
+
HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1500
|
+
const Half<decltype(d)> dh;
|
|
1501
|
+
VFromD<D> ret;
|
|
1502
|
+
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1503
|
+
ret.raw[i] = lo.raw[2 * i + 1];
|
|
1504
|
+
}
|
|
1505
|
+
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
1506
|
+
ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
|
|
1507
|
+
}
|
|
1508
|
+
return ret;
|
|
1509
|
+
}
|
|
1510
|
+
|
|
1511
|
+
// ------------------------------ CombineShiftRightBytes
|
|
1512
|
+
template <int kBytes, class D>
|
|
1513
|
+
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1514
|
+
VFromD<D> ret;
|
|
1515
|
+
const uint8_t* HWY_RESTRICT lo8 =
|
|
1516
|
+
reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
|
|
1517
|
+
uint8_t* HWY_RESTRICT ret8 =
|
|
1518
|
+
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
1519
|
+
CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
|
|
1520
|
+
CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
|
|
1521
|
+
return ret;
|
|
1522
|
+
}
|
|
1523
|
+
|
|
1524
|
+
// ------------------------------ ShiftLeftBytes
|
|
1525
|
+
|
|
1526
|
+
template <int kBytes, class D>
|
|
1527
|
+
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
|
|
1528
|
+
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
1529
|
+
VFromD<D> ret;
|
|
1530
|
+
uint8_t* HWY_RESTRICT ret8 =
|
|
1531
|
+
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
1532
|
+
ZeroBytes<kBytes>(ret8);
|
|
1533
|
+
CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
|
|
1534
|
+
return ret;
|
|
1535
|
+
}
|
|
1536
|
+
|
|
1537
|
+
template <int kBytes, typename T, size_t N>
|
|
1538
|
+
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
|
|
1539
|
+
return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
|
|
1540
|
+
}
|
|
1541
|
+
|
|
1542
|
+
// ------------------------------ ShiftLeftLanes
|
|
1543
|
+
|
|
1544
|
+
template <int kLanes, class D, typename T = TFromD<D>>
|
|
1545
|
+
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
|
|
1546
|
+
const Repartition<uint8_t, decltype(d)> d8;
|
|
1547
|
+
return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
|
|
1548
|
+
}
|
|
1549
|
+
|
|
1550
|
+
template <int kLanes, typename T, size_t N>
|
|
1551
|
+
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
|
|
1552
|
+
return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
|
|
1553
|
+
}
|
|
1554
|
+
|
|
1555
|
+
// ------------------------------ ShiftRightBytes
|
|
1556
|
+
template <int kBytes, class D>
|
|
1557
|
+
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
|
|
1558
|
+
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
1559
|
+
VFromD<D> ret;
|
|
1560
|
+
const uint8_t* HWY_RESTRICT v8 =
|
|
1561
|
+
reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
|
|
1562
|
+
uint8_t* HWY_RESTRICT ret8 =
|
|
1563
|
+
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
1564
|
+
CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
|
|
1565
|
+
ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
|
|
1566
|
+
return ret;
|
|
1567
|
+
}
|
|
1568
|
+
|
|
1569
|
+
// ------------------------------ ShiftRightLanes
|
|
1570
|
+
template <int kLanes, class D>
|
|
1571
|
+
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
|
|
1572
|
+
const Repartition<uint8_t, decltype(d)> d8;
|
|
1573
|
+
constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
|
|
1574
|
+
return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
|
|
1575
|
+
}
|
|
1576
|
+
|
|
1577
|
+
// ------------------------------ Tuples, PromoteEvenTo/PromoteOddTo
|
|
1578
|
+
#include "hwy/ops/inside-inl.h"
|
|
1579
|
+
|
|
1460
1580
|
// ------------------------------ LoadInterleaved2/3/4
|
|
1461
1581
|
|
|
1462
1582
|
// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
|
|
@@ -1621,6 +1741,47 @@ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
|
|
|
1621
1741
|
float val) {
|
|
1622
1742
|
return CastValueForF2IConv<ToT>(val);
|
|
1623
1743
|
}
|
|
1744
|
+
// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
|
|
1745
|
+
// returns static_cast<ToT>(val)
|
|
1746
|
+
//
|
|
1747
|
+
// Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
|
|
1748
|
+
// implementation-defined result if val is not within the range of ToT.
|
|
1749
|
+
template <class ToT, class FromT>
|
|
1750
|
+
HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
|
|
1751
|
+
// Prevent ubsan errors when converting float to narrower integer
|
|
1752
|
+
|
|
1753
|
+
using FromTU = MakeUnsigned<FromT>;
|
|
1754
|
+
|
|
1755
|
+
constexpr unsigned kMaxExpField =
|
|
1756
|
+
static_cast<unsigned>(MaxExponentField<FromT>());
|
|
1757
|
+
constexpr unsigned kExpBias = kMaxExpField >> 1;
|
|
1758
|
+
constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
|
|
1759
|
+
kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
|
|
1760
|
+
kMaxExpField));
|
|
1761
|
+
|
|
1762
|
+
// If ToT is signed, compare only the exponent bits of val against
|
|
1763
|
+
// kMinOutOfRangeExpField.
|
|
1764
|
+
//
|
|
1765
|
+
// Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
|
|
1766
|
+
// val against kMinOutOfRangeExpField as a negative value is outside of the
|
|
1767
|
+
// range of an unsigned integer type.
|
|
1768
|
+
const FromT val_to_compare =
|
|
1769
|
+
static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
|
|
1770
|
+
|
|
1771
|
+
// val is within the range of ToT if
|
|
1772
|
+
// (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
|
|
1773
|
+
// than kMinOutOfRangeExpField
|
|
1774
|
+
//
|
|
1775
|
+
// Otherwise, val is either outside of the range of ToT or equal to
|
|
1776
|
+
// LimitsMin<ToT>() if
|
|
1777
|
+
// (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
|
|
1778
|
+
// than or equal to kMinOutOfRangeExpField.
|
|
1779
|
+
|
|
1780
|
+
return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
|
|
1781
|
+
MantissaBits<FromT>()) < kMinOutOfRangeExpField)
|
|
1782
|
+
? static_cast<ToT>(val)
|
|
1783
|
+
: static_cast<ToT>(LimitsMin<ToT>());
|
|
1784
|
+
}
|
|
1624
1785
|
|
|
1625
1786
|
} // namespace detail
|
|
1626
1787
|
|
|
@@ -1636,6 +1797,21 @@ HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
|
|
|
1636
1797
|
return ret;
|
|
1637
1798
|
}
|
|
1638
1799
|
|
|
1800
|
+
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
1801
|
+
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
1802
|
+
#else
|
|
1803
|
+
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
|
|
1804
|
+
#endif
|
|
1805
|
+
|
|
1806
|
+
template <class D64, HWY_IF_UI64_D(D64)>
|
|
1807
|
+
HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
|
|
1808
|
+
VFromD<D64> ret;
|
|
1809
|
+
for (size_t i = 0; i < MaxLanes(d64); ++i) {
|
|
1810
|
+
ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
|
|
1811
|
+
}
|
|
1812
|
+
return ret;
|
|
1813
|
+
}
|
|
1814
|
+
|
|
1639
1815
|
// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
|
|
1640
1816
|
// so we overload for TFrom=double and ToT={float,int32_t}.
|
|
1641
1817
|
template <class D, HWY_IF_F32_D(D)>
|
|
@@ -1679,17 +1855,32 @@ HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
|
|
|
1679
1855
|
return ret;
|
|
1680
1856
|
}
|
|
1681
1857
|
|
|
1858
|
+
// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
|
|
1859
|
+
// implementations in generic_ops-inl.h on EMU128 as the EMU128 target has
|
|
1860
|
+
// target-specific implementations of the unsigned to signed DemoteTo and
|
|
1861
|
+
// ReorderDemote2To ops
|
|
1862
|
+
|
|
1863
|
+
// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
|
|
1864
|
+
// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
|
|
1865
|
+
// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
|
|
1866
|
+
// SFINAE to occur instead of a hard error due to a dependency on the V template
|
|
1867
|
+
// argument
|
|
1868
|
+
#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
|
|
1869
|
+
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
|
|
1870
|
+
hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
|
|
1871
|
+
|
|
1682
1872
|
template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
|
|
1683
|
-
|
|
1873
|
+
HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
|
|
1684
1874
|
HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
|
|
1685
1875
|
using TTo = TFromD<DTo>;
|
|
1686
1876
|
static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
|
|
1687
1877
|
|
|
1878
|
+
const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
|
|
1879
|
+
|
|
1688
1880
|
VFromD<DTo> ret;
|
|
1689
1881
|
for (size_t i = 0; i < N; ++i) {
|
|
1690
1882
|
// Int to int: choose closest value in ToT to `from` (avoids UB)
|
|
1691
|
-
|
|
1692
|
-
ret.raw[i] = static_cast<TTo>(from.raw[i]);
|
|
1883
|
+
ret.raw[i] = static_cast<TTo>(HWY_MIN(from.raw[i], max));
|
|
1693
1884
|
}
|
|
1694
1885
|
return ret;
|
|
1695
1886
|
}
|
|
@@ -1737,14 +1928,15 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
|
|
|
1737
1928
|
return ret;
|
|
1738
1929
|
}
|
|
1739
1930
|
|
|
1740
|
-
template <class DN,
|
|
1741
|
-
HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
|
|
1931
|
+
template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), class V,
|
|
1932
|
+
HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
|
|
1742
1933
|
HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
|
|
1743
1934
|
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
|
|
1744
1935
|
const RepartitionToWide<decltype(dn)> dw;
|
|
1745
1936
|
const size_t NW = Lanes(dw);
|
|
1746
1937
|
using TN = TFromD<DN>;
|
|
1747
|
-
|
|
1938
|
+
using TN_U = MakeUnsigned<TN>;
|
|
1939
|
+
const TN_U max = static_cast<TN_U>(LimitsMax<TN>());
|
|
1748
1940
|
VFromD<DN> ret;
|
|
1749
1941
|
for (size_t i = 0; i < NW; ++i) {
|
|
1750
1942
|
ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
|
|
@@ -1803,6 +1995,12 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
|
|
|
1803
1995
|
return ret;
|
|
1804
1996
|
}
|
|
1805
1997
|
|
|
1998
|
+
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
1999
|
+
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2000
|
+
#else
|
|
2001
|
+
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
|
|
2002
|
+
#endif
|
|
2003
|
+
|
|
1806
2004
|
template <class D, HWY_IF_BF16_D(D), size_t N>
|
|
1807
2005
|
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
|
|
1808
2006
|
VFromD<D> ret;
|
|
@@ -1812,6 +2010,21 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
|
|
|
1812
2010
|
return ret;
|
|
1813
2011
|
}
|
|
1814
2012
|
|
|
2013
|
+
#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
2014
|
+
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
2015
|
+
#else
|
|
2016
|
+
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
|
|
2017
|
+
#endif
|
|
2018
|
+
|
|
2019
|
+
template <class D32, HWY_IF_UI32_D(D32)>
|
|
2020
|
+
HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
|
|
2021
|
+
VFromD<D32> ret;
|
|
2022
|
+
for (size_t i = 0; i < MaxLanes(d32); ++i) {
|
|
2023
|
+
ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
|
|
2024
|
+
}
|
|
2025
|
+
return ret;
|
|
2026
|
+
}
|
|
2027
|
+
|
|
1815
2028
|
// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
|
|
1816
2029
|
namespace detail {
|
|
1817
2030
|
|
|
@@ -1851,6 +2064,22 @@ HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
|
|
|
1851
2064
|
return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
|
|
1852
2065
|
}
|
|
1853
2066
|
|
|
2067
|
+
#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
2068
|
+
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
2069
|
+
#else
|
|
2070
|
+
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
|
|
2071
|
+
#endif
|
|
2072
|
+
|
|
2073
|
+
template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
|
|
2074
|
+
HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
|
|
2075
|
+
HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
|
|
2076
|
+
VFromD<DI> ret;
|
|
2077
|
+
for (size_t i = 0; i < MaxLanes(di); i++) {
|
|
2078
|
+
ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
|
|
2079
|
+
}
|
|
2080
|
+
return ret;
|
|
2081
|
+
}
|
|
2082
|
+
|
|
1854
2083
|
template <size_t N>
|
|
1855
2084
|
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
|
|
1856
2085
|
return DemoteTo(Simd<uint8_t, N, 0>(), v);
|
|
@@ -1938,180 +2167,6 @@ HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
|
|
|
1938
2167
|
return ret;
|
|
1939
2168
|
}
|
|
1940
2169
|
|
|
1941
|
-
// ================================================== COMBINE
|
|
1942
|
-
|
|
1943
|
-
template <typename T, size_t N>
|
|
1944
|
-
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
|
|
1945
|
-
Vec128<T, N / 2> ret;
|
|
1946
|
-
CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
|
|
1947
|
-
return ret;
|
|
1948
|
-
}
|
|
1949
|
-
|
|
1950
|
-
template <class D>
|
|
1951
|
-
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
|
|
1952
|
-
return LowerHalf(v);
|
|
1953
|
-
}
|
|
1954
|
-
|
|
1955
|
-
template <class D>
|
|
1956
|
-
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
|
|
1957
|
-
VFromD<D> ret;
|
|
1958
|
-
CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
|
|
1959
|
-
return ret;
|
|
1960
|
-
}
|
|
1961
|
-
|
|
1962
|
-
template <class D>
|
|
1963
|
-
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
|
|
1964
|
-
const Half<decltype(d)> dh;
|
|
1965
|
-
VFromD<D> ret; // zero-initialized
|
|
1966
|
-
CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
|
|
1967
|
-
return ret;
|
|
1968
|
-
}
|
|
1969
|
-
|
|
1970
|
-
template <class D, class VH = VFromD<Half<D>>>
|
|
1971
|
-
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
|
|
1972
|
-
const Half<decltype(d)> dh;
|
|
1973
|
-
VFromD<D> ret;
|
|
1974
|
-
CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
|
|
1975
|
-
CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
|
|
1976
|
-
return ret;
|
|
1977
|
-
}
|
|
1978
|
-
|
|
1979
|
-
template <class D>
|
|
1980
|
-
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1981
|
-
const Half<decltype(d)> dh;
|
|
1982
|
-
VFromD<D> ret;
|
|
1983
|
-
CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
|
|
1984
|
-
CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
|
|
1985
|
-
return ret;
|
|
1986
|
-
}
|
|
1987
|
-
|
|
1988
|
-
template <class D>
|
|
1989
|
-
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1990
|
-
const Half<decltype(d)> dh;
|
|
1991
|
-
VFromD<D> ret;
|
|
1992
|
-
CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
|
|
1993
|
-
CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
|
|
1994
|
-
return ret;
|
|
1995
|
-
}
|
|
1996
|
-
|
|
1997
|
-
template <class D>
|
|
1998
|
-
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
1999
|
-
const Half<decltype(d)> dh;
|
|
2000
|
-
VFromD<D> ret;
|
|
2001
|
-
CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
|
|
2002
|
-
CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
|
|
2003
|
-
return ret;
|
|
2004
|
-
}
|
|
2005
|
-
|
|
2006
|
-
template <class D>
|
|
2007
|
-
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
2008
|
-
const Half<decltype(d)> dh;
|
|
2009
|
-
VFromD<D> ret;
|
|
2010
|
-
CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
|
|
2011
|
-
CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
|
|
2012
|
-
return ret;
|
|
2013
|
-
}
|
|
2014
|
-
|
|
2015
|
-
template <class D>
|
|
2016
|
-
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
2017
|
-
const Half<decltype(d)> dh;
|
|
2018
|
-
VFromD<D> ret;
|
|
2019
|
-
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
2020
|
-
ret.raw[i] = lo.raw[2 * i];
|
|
2021
|
-
}
|
|
2022
|
-
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
2023
|
-
ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
|
|
2024
|
-
}
|
|
2025
|
-
return ret;
|
|
2026
|
-
}
|
|
2027
|
-
|
|
2028
|
-
// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
|
|
2029
|
-
// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
|
|
2030
|
-
#if HWY_ARCH_RVV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
|
|
2031
|
-
#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
|
|
2032
|
-
#else
|
|
2033
|
-
#define HWY_EMU128_CONCAT_INLINE HWY_API
|
|
2034
|
-
#endif
|
|
2035
|
-
|
|
2036
|
-
template <class D>
|
|
2037
|
-
HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
2038
|
-
const Half<decltype(d)> dh;
|
|
2039
|
-
VFromD<D> ret;
|
|
2040
|
-
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
2041
|
-
ret.raw[i] = lo.raw[2 * i + 1];
|
|
2042
|
-
}
|
|
2043
|
-
for (size_t i = 0; i < MaxLanes(dh); ++i) {
|
|
2044
|
-
ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
|
|
2045
|
-
}
|
|
2046
|
-
return ret;
|
|
2047
|
-
}
|
|
2048
|
-
|
|
2049
|
-
// ------------------------------ CombineShiftRightBytes
|
|
2050
|
-
template <int kBytes, class D>
|
|
2051
|
-
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
|
|
2052
|
-
VFromD<D> ret;
|
|
2053
|
-
const uint8_t* HWY_RESTRICT lo8 =
|
|
2054
|
-
reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
|
|
2055
|
-
uint8_t* HWY_RESTRICT ret8 =
|
|
2056
|
-
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
2057
|
-
CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
|
|
2058
|
-
CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
|
|
2059
|
-
return ret;
|
|
2060
|
-
}
|
|
2061
|
-
|
|
2062
|
-
// ------------------------------ ShiftLeftBytes
|
|
2063
|
-
|
|
2064
|
-
template <int kBytes, class D>
|
|
2065
|
-
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
|
|
2066
|
-
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
2067
|
-
VFromD<D> ret;
|
|
2068
|
-
uint8_t* HWY_RESTRICT ret8 =
|
|
2069
|
-
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
2070
|
-
ZeroBytes<kBytes>(ret8);
|
|
2071
|
-
CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
|
|
2072
|
-
return ret;
|
|
2073
|
-
}
|
|
2074
|
-
|
|
2075
|
-
template <int kBytes, typename T, size_t N>
|
|
2076
|
-
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
|
|
2077
|
-
return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
|
|
2078
|
-
}
|
|
2079
|
-
|
|
2080
|
-
// ------------------------------ ShiftLeftLanes
|
|
2081
|
-
|
|
2082
|
-
template <int kLanes, class D, typename T = TFromD<D>>
|
|
2083
|
-
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
|
|
2084
|
-
const Repartition<uint8_t, decltype(d)> d8;
|
|
2085
|
-
return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
|
|
2086
|
-
}
|
|
2087
|
-
|
|
2088
|
-
template <int kLanes, typename T, size_t N>
|
|
2089
|
-
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
|
|
2090
|
-
return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
|
|
2091
|
-
}
|
|
2092
|
-
|
|
2093
|
-
// ------------------------------ ShiftRightBytes
|
|
2094
|
-
template <int kBytes, class D>
|
|
2095
|
-
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
|
|
2096
|
-
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
|
|
2097
|
-
VFromD<D> ret;
|
|
2098
|
-
const uint8_t* HWY_RESTRICT v8 =
|
|
2099
|
-
reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
|
|
2100
|
-
uint8_t* HWY_RESTRICT ret8 =
|
|
2101
|
-
reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
|
|
2102
|
-
CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
|
|
2103
|
-
ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
|
|
2104
|
-
return ret;
|
|
2105
|
-
}
|
|
2106
|
-
|
|
2107
|
-
// ------------------------------ ShiftRightLanes
|
|
2108
|
-
template <int kLanes, class D>
|
|
2109
|
-
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
|
|
2110
|
-
const Repartition<uint8_t, decltype(d)> d8;
|
|
2111
|
-
constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
|
|
2112
|
-
return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
|
|
2113
|
-
}
|
|
2114
|
-
|
|
2115
2170
|
// ================================================== SWIZZLE
|
|
2116
2171
|
|
|
2117
2172
|
template <typename T, size_t N>
|
|
@@ -2154,6 +2209,24 @@ HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
|
|
|
2154
2209
|
return odd;
|
|
2155
2210
|
}
|
|
2156
2211
|
|
|
2212
|
+
template <class D>
|
|
2213
|
+
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
2214
|
+
constexpr size_t N = HWY_MAX_LANES_D(D);
|
|
2215
|
+
for (size_t i = 1; i < N; i += 2) {
|
|
2216
|
+
a.raw[i] = b.raw[i - 1];
|
|
2217
|
+
}
|
|
2218
|
+
return a;
|
|
2219
|
+
}
|
|
2220
|
+
|
|
2221
|
+
template <class D>
|
|
2222
|
+
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
|
|
2223
|
+
constexpr size_t N = HWY_MAX_LANES_D(D);
|
|
2224
|
+
for (size_t i = 1; i < N; i += 2) {
|
|
2225
|
+
b.raw[i - 1] = a.raw[i];
|
|
2226
|
+
}
|
|
2227
|
+
return b;
|
|
2228
|
+
}
|
|
2229
|
+
|
|
2157
2230
|
template <typename T, size_t N>
|
|
2158
2231
|
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
|
|
2159
2232
|
return even;
|
|
@@ -2724,88 +2797,26 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
|
|
|
2724
2797
|
|
|
2725
2798
|
// ------------------------------ WidenMulPairwiseAdd
|
|
2726
2799
|
|
|
2727
|
-
template <class
|
|
2728
|
-
HWY_API VFromD<
|
|
2729
|
-
|
|
2730
|
-
|
|
2731
|
-
const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
|
|
2732
|
-
// Avoid ZipLower/Upper so this also works on big-endian systems.
|
|
2733
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
2734
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
2735
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
2736
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
2737
|
-
return Mul(BitCast(df32, ae), BitCast(df32, be)) +
|
|
2738
|
-
Mul(BitCast(df32, ao), BitCast(df32, bo));
|
|
2739
|
-
}
|
|
2740
|
-
|
|
2741
|
-
template <class D, HWY_IF_I32_D(D), class VI16>
|
|
2742
|
-
HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
|
|
2743
|
-
using VI32 = VFromD<decltype(d32)>;
|
|
2744
|
-
// Manual sign extension requires two shifts for even lanes.
|
|
2745
|
-
const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
|
|
2746
|
-
const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
|
|
2747
|
-
const VI32 ao = ShiftRight<16>(BitCast(d32, a));
|
|
2748
|
-
const VI32 bo = ShiftRight<16>(BitCast(d32, b));
|
|
2749
|
-
return Add(Mul(ae, be), Mul(ao, bo));
|
|
2800
|
+
template <class DF, HWY_IF_F32_D(DF), class VBF>
|
|
2801
|
+
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
|
|
2802
|
+
return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
|
|
2803
|
+
Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
|
|
2750
2804
|
}
|
|
2751
2805
|
|
|
2752
|
-
template <class D,
|
|
2753
|
-
HWY_API VFromD<D> WidenMulPairwiseAdd(D
|
|
2754
|
-
|
|
2755
|
-
|
|
2756
|
-
const auto a0 = And(BitCast(du32, a), lo16_mask);
|
|
2757
|
-
const auto b0 = And(BitCast(du32, b), lo16_mask);
|
|
2758
|
-
|
|
2759
|
-
const auto a1 = ShiftRight<16>(BitCast(du32, a));
|
|
2760
|
-
const auto b1 = ShiftRight<16>(BitCast(du32, b));
|
|
2761
|
-
|
|
2762
|
-
return Add(Mul(a0, b0), Mul(a1, b1));
|
|
2806
|
+
template <class D, HWY_IF_UI32_D(D), class V16>
|
|
2807
|
+
HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
|
|
2808
|
+
return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
|
|
2809
|
+
Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
|
|
2763
2810
|
}
|
|
2764
2811
|
|
|
2765
2812
|
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
|
2766
2813
|
|
|
2767
|
-
template <class D,
|
|
2768
|
-
HWY_API VFromD<D> ReorderWidenMulAccumulate(D
|
|
2769
|
-
const
|
|
2770
|
-
|
|
2771
|
-
|
|
2772
|
-
|
|
2773
|
-
const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
|
|
2774
|
-
// Avoid ZipLower/Upper so this also works on big-endian systems.
|
|
2775
|
-
const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
|
|
2776
|
-
const VU32 ao = And(BitCast(du32, a), odd);
|
|
2777
|
-
const VU32 be = ShiftLeft<16>(BitCast(du32, b));
|
|
2778
|
-
const VU32 bo = And(BitCast(du32, b), odd);
|
|
2779
|
-
sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
|
|
2780
|
-
return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
|
|
2781
|
-
}
|
|
2782
|
-
|
|
2783
|
-
template <class D, HWY_IF_I32_D(D), size_t N, class VI16>
|
|
2784
|
-
HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, VI16 a, VI16 b,
|
|
2785
|
-
const Vec128<int32_t, N> sum0,
|
|
2786
|
-
Vec128<int32_t, N>& sum1) {
|
|
2787
|
-
using VI32 = VFromD<decltype(d32)>;
|
|
2788
|
-
// Manual sign extension requires two shifts for even lanes.
|
|
2789
|
-
const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
|
|
2790
|
-
const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
|
|
2791
|
-
const VI32 ao = ShiftRight<16>(BitCast(d32, a));
|
|
2792
|
-
const VI32 bo = ShiftRight<16>(BitCast(d32, b));
|
|
2793
|
-
sum1 = Add(Mul(ao, bo), sum1);
|
|
2794
|
-
return Add(Mul(ae, be), sum0);
|
|
2795
|
-
}
|
|
2796
|
-
|
|
2797
|
-
template <class D, HWY_IF_U32_D(D), size_t N, class VU16>
|
|
2798
|
-
HWY_API VFromD<D> ReorderWidenMulAccumulate(D du32, VU16 a, VU16 b,
|
|
2799
|
-
const Vec128<uint32_t, N> sum0,
|
|
2800
|
-
Vec128<uint32_t, N>& sum1) {
|
|
2801
|
-
using VU32 = VFromD<decltype(du32)>;
|
|
2802
|
-
const VU32 lo16_mask = Set(du32, uint32_t{0x0000FFFFu});
|
|
2803
|
-
const VU32 ae = And(BitCast(du32, a), lo16_mask);
|
|
2804
|
-
const VU32 be = And(BitCast(du32, b), lo16_mask);
|
|
2805
|
-
const VU32 ao = ShiftRight<16>(BitCast(du32, a));
|
|
2806
|
-
const VU32 bo = ShiftRight<16>(BitCast(du32, b));
|
|
2807
|
-
sum1 = Add(Mul(ao, bo), sum1);
|
|
2808
|
-
return Add(Mul(ae, be), sum0);
|
|
2814
|
+
template <class D, HWY_IF_UI32_D(D), class V16>
|
|
2815
|
+
HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
|
|
2816
|
+
const VFromD<D> sum0,
|
|
2817
|
+
VFromD<D>& sum1) {
|
|
2818
|
+
sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
|
|
2819
|
+
return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
|
|
2809
2820
|
}
|
|
2810
2821
|
|
|
2811
2822
|
// ------------------------------ RearrangeToOddPlusEven
|
|
@@ -2866,18 +2877,20 @@ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
|
|
|
2866
2877
|
|
|
2867
2878
|
// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
|
|
2868
2879
|
|
|
2869
|
-
|
|
2870
|
-
|
|
2880
|
+
template <class T, HWY_IF_UI64(T)>
|
|
2881
|
+
HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
|
|
2882
|
+
alignas(16) T mul[2];
|
|
2871
2883
|
mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
|
|
2872
|
-
return Load(Full128<
|
|
2884
|
+
return Load(Full128<T>(), mul);
|
|
2873
2885
|
}
|
|
2874
2886
|
|
|
2875
|
-
|
|
2876
|
-
|
|
2877
|
-
|
|
2887
|
+
template <class T, HWY_IF_UI64(T)>
|
|
2888
|
+
HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
|
|
2889
|
+
alignas(16) T mul[2];
|
|
2890
|
+
const Half<Full128<T>> d2;
|
|
2878
2891
|
mul[0] =
|
|
2879
2892
|
Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
|
|
2880
|
-
return Load(Full128<
|
|
2893
|
+
return Load(Full128<T>(), mul);
|
|
2881
2894
|
}
|
|
2882
2895
|
|
|
2883
2896
|
// NOLINTNEXTLINE(google-readability-namespace-comments)
|