@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-deprecated.h +4 -4
- package/include/harfbuzz/hb-font.h +120 -9
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/webp/decode.h +11 -2
- package/include/webp/demux.h +2 -0
- package/include/webp/encode.h +2 -0
- package/include/webp/mux_types.h +1 -0
- package/include/webp/sharpyuv/sharpyuv.h +1 -1
- package/include/webp/types.h +2 -2
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +11 -11
@@ -97,6 +97,21 @@ HWY_API Vec<D> Inf(D d) {
   return BitCast(d, Set(du, max_x2 >> 1));
 }
 
+// ------------------------------ MaskedSetOr/MaskedSet
+
+template <class V, typename T = TFromV<V>, typename D = DFromV<V>,
+          typename M = MFromD<D>>
+HWY_API V MaskedSetOr(V no, M m, T a) {
+  D d;
+  return IfThenElse(m, Set(d, a), no);
+}
+
+template <class D, typename V = VFromD<D>, typename M = MFromD<D>,
+          typename T = TFromD<D>>
+HWY_API V MaskedSet(D d, M m, T a) {
+  return IfThenElseZero(m, Set(d, a));
+}
+
 // ------------------------------ ZeroExtendResizeBitCast
 
 // The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128
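Note: the MaskedSetOr/MaskedSet ops added above broadcast a scalar only into mask-selected lanes. A minimal usage sketch, assuming static dispatch against the bundled Highway headers; the function name and parameters below are illustrative, not part of the package:

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Replace lanes below `threshold` with `fill`; other lanes keep their value.
hn::Vec<hn::ScalableTag<float>> FillSmallLanes(
    hn::Vec<hn::ScalableTag<float>> v, float threshold, float fill) {
  const hn::ScalableTag<float> d;
  const auto m = hn::Lt(v, hn::Set(d, threshold));
  // MaskedSetOr(no, m, a): Set(d, a) where m is true, otherwise `no` (here v).
  return hn::MaskedSetOr(v, m, fill);
}
```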
@@ -336,6 +351,22 @@ HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
 
 #endif  // HWY_NATIVE_DEMOTE_MASK_TO
 
+// ------------------------------ InsertIntoUpper
+#if (defined(HWY_NATIVE_LOAD_HIGHER) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_LOAD_HIGHER
+#undef HWY_NATIVE_LOAD_HIGHER
+#else
+#define HWY_NATIVE_LOAD_HIGHER
+#endif
+template <class D, typename T, class V = VFromD<D>(), HWY_IF_LANES_GT_D(D, 1),
+          HWY_IF_POW2_GT_D(D, -3)>
+HWY_API V InsertIntoUpper(D d, T* p, V a) {
+  Half<D> dh;
+  const VFromD<decltype(dh)> b = LoadU(dh, p);
+  return Combine(d, b, LowerHalf(a));
+}
+#endif  // HWY_NATIVE_LOAD_HIGHER
+
 // ------------------------------ CombineMasks
 
 #if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
@@ -488,6 +519,95 @@ HWY_API V InterleaveEven(V a, V b) {
 }
 #endif
 
+// ------------------------------ MinNumber/MaxNumber
+
+#if (defined(HWY_NATIVE_FLOAT_MIN_MAX_NUMBER) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
+#undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
+#else
+#define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
+#endif
+
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_API V MinNumber(V a, V b) {
+  return Min(a, b);
+}
+
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_API V MaxNumber(V a, V b) {
+  return Max(a, b);
+}
+
+#endif
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V MinNumber(V a, V b) {
+  return Min(a, b);
+}
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V MaxNumber(V a, V b) {
+  return Max(a, b);
+}
+
+// ------------------------------ MinMagnitude/MaxMagnitude
+
+#if (defined(HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
+#undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
+#else
+#define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
+#endif
+
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_API V MinMagnitude(V a, V b) {
+  const V abs_a = Abs(a);
+  const V abs_b = Abs(b);
+  const V min = Min(IfThenElse(Eq(abs_a, abs_b), a, b), b);
+  return IfThenElse(Lt(abs_a, abs_b), a, min);
+}
+
+template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
+HWY_API V MaxMagnitude(V a, V b) {
+  const V abs_a = Abs(a);
+  const V abs_b = Abs(b);
+  // This lvalue appears to be necessary to avoid a clang bug on SVE.
+  const V max = Max(IfThenElse(Eq(abs_a, abs_b), b, a), a);
+  return IfThenElse(Lt(abs_a, abs_b), b, max);
+}
+
+#endif  // HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
+
+template <class V, HWY_IF_SIGNED_V(V)>
+HWY_API V MinMagnitude(V a, V b) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const auto abs_a = BitCast(du, Abs(a));
+  const auto abs_b = BitCast(du, Abs(b));
+  return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), a,
+                    Min(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), a, b), b));
+}
+
+template <class V, HWY_IF_SIGNED_V(V)>
+HWY_API V MaxMagnitude(V a, V b) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const auto abs_a = BitCast(du, Abs(a));
+  const auto abs_b = BitCast(du, Abs(b));
+  return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), b,
+                    Max(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), b, a), a));
+}
+
+template <class V, HWY_IF_UNSIGNED_V(V)>
+HWY_API V MinMagnitude(V a, V b) {
+  return Min(a, b);
+}
+
+template <class V, HWY_IF_UNSIGNED_V(V)>
+HWY_API V MaxMagnitude(V a, V b) {
+  return Max(a, b);
+}
+
 // ------------------------------ AddSub
 
 template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
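Note: MinMagnitude/MaxMagnitude select, per lane, the input with the smaller/larger absolute value, falling back to Min/Max on equal magnitudes. A rough usage sketch (illustrative names; assumes static dispatch and that n is a multiple of the vector length):

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void KeepLargerMagnitude(const float* HWY_RESTRICT a,
                         const float* HWY_RESTRICT b,
                         float* HWY_RESTRICT out, size_t n) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto va = hn::LoadU(d, a + i);
    const auto vb = hn::LoadU(d, b + i);
    // Per lane: the input with the larger |value|, e.g. MaxMagnitude(-3, 2) == -3.
    hn::StoreU(hn::MaxMagnitude(va, vb), d, out + i);
  }
}
```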
@@ -555,12 +675,18 @@ HWY_API V MaskedMulOr(V no, M m, V a, V b) {
 
 template <class V, class M>
 HWY_API V MaskedDivOr(V no, M m, V a, V b) {
-  return IfThenElse(m, Div(a, b), no);
+  const DFromV<V> d;
+  // Avoid division by zero for masked-out lanes.
+  const V nonzero = Set(d, TFromD<decltype(d)>{1});
+  return IfThenElse(m, Div(a, IfThenElse(m, b, nonzero)), no);
 }
 
 template <class V, class M>
 HWY_API V MaskedModOr(V no, M m, V a, V b) {
-  return IfThenElse(m, Mod(a, b), no);
+  const DFromV<V> d;
+  // Avoid division by zero for masked-out lanes.
+  const V nonzero = Set(d, TFromD<decltype(d)>{1});
+  return IfThenElse(m, Mod(a, IfThenElse(m, b, nonzero)), no);
 }
 
 template <class V, class M>
@@ -574,6 +700,140 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
 }
 #endif  // HWY_NATIVE_MASKED_ARITH
 
+#if (defined(HWY_NATIVE_ZERO_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ZERO_MASKED_ARITH
+#undef HWY_NATIVE_ZERO_MASKED_ARITH
+#else
+#define HWY_NATIVE_ZERO_MASKED_ARITH
+#endif
+
+template <class V, class M>
+HWY_API V MaskedMax(M m, V a, V b) {
+  return IfThenElseZero(m, (Max(a, b)));
+}
+
+template <class V, class M>
+HWY_API V MaskedAdd(M m, V a, V b) {
+  return IfThenElseZero(m, Add(a, b));
+}
+
+template <class V, class M>
+HWY_API V MaskedSub(M m, V a, V b) {
+  return IfThenElseZero(m, Sub(a, b));
+}
+
+template <class V, class M>
+HWY_API V MaskedMul(M m, V a, V b) {
+  return IfThenElseZero(m, Mul(a, b));
+}
+
+template <class V, class M>
+HWY_API V MaskedDiv(M m, V a, V b) {
+  return IfThenElseZero(m, Div(a, b));
+}
+
+template <class V, class M>
+HWY_API V MaskedSaturatedAdd(M m, V a, V b) {
+  return IfThenElseZero(m, SaturatedAdd(a, b));
+}
+
+template <class V, class M>
+HWY_API V MaskedSaturatedSub(M m, V a, V b) {
+  return IfThenElseZero(m, SaturatedSub(a, b));
+}
+
+template <class V, class M, typename D = DFromV<V>, HWY_IF_I16_D(D)>
+HWY_API V MaskedMulFixedPoint15(M m, V a, V b) {
+  return IfThenElseZero(m, MulFixedPoint15(a, b));
+}
+
+template <class V, class M>
+HWY_API V MaskedMulAdd(M m, V mul, V x, V add) {
+  return IfThenElseZero(m, MulAdd(mul, x, add));
+}
+
+template <class V, class M>
+HWY_API V MaskedNegMulAdd(M m, V mul, V x, V add) {
+  return IfThenElseZero(m, NegMulAdd(mul, x, add));
+}
+
+template <class D, class M, HWY_IF_UI32_D(D),
+          class V16 = VFromD<RepartitionToNarrow<D>>>
+HWY_API VFromD<D> MaskedWidenMulPairwiseAdd(D d32, M m, V16 a, V16 b) {
+  return IfThenElseZero(m, WidenMulPairwiseAdd(d32, a, b));
+}
+
+template <class DF, class M, HWY_IF_F32_D(DF), class VBF>
+HWY_API VFromD<DF> MaskedWidenMulPairwiseAdd(DF df, M m, VBF a, VBF b) {
+  return IfThenElseZero(m, WidenMulPairwiseAdd(df, a, b));
+}
+#endif  // HWY_NATIVE_ZERO_MASKED_ARITH
+
+// ------------------------------ MaskedShift
+template <int kShift, class V, class M>
+HWY_API V MaskedShiftLeft(M m, V a) {
+  return IfThenElseZero(m, ShiftLeft<kShift>(a));
+}
+
+template <int kShift, class V, class M>
+HWY_API V MaskedShiftRight(M m, V a) {
+  return IfThenElseZero(m, ShiftRight<kShift>(a));
+}
+
+template <int kShift, class V, class M>
+HWY_API V MaskedShiftRightOr(V no, M m, V a) {
+  return IfThenElse(m, ShiftRight<kShift>(a), no);
+}
+
+template <class V, class M>
+HWY_API V MaskedShrOr(V no, M m, V a, V shifts) {
+  return IfThenElse(m, Shr(a, shifts), no);
+}
+
+// ------------------------------ MaskedEq etc.
+#if (defined(HWY_NATIVE_MASKED_COMP) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MASKED_COMP
+#undef HWY_NATIVE_MASKED_COMP
+#else
+#define HWY_NATIVE_MASKED_COMP
+#endif
+
+template <class V, class M>
+HWY_API auto MaskedEq(M m, V a, V b) -> decltype(a == b) {
+  return And(m, Eq(a, b));
+}
+
+template <class V, class M>
+HWY_API auto MaskedNe(M m, V a, V b) -> decltype(a == b) {
+  return And(m, Ne(a, b));
+}
+
+template <class V, class M>
+HWY_API auto MaskedLt(M m, V a, V b) -> decltype(a == b) {
+  return And(m, Lt(a, b));
+}
+
+template <class V, class M>
+HWY_API auto MaskedGt(M m, V a, V b) -> decltype(a == b) {
+  return And(m, Gt(a, b));
+}
+
+template <class V, class M>
+HWY_API auto MaskedLe(M m, V a, V b) -> decltype(a == b) {
+  return And(m, Le(a, b));
+}
+
+template <class V, class M>
+HWY_API auto MaskedGe(M m, V a, V b) -> decltype(a == b) {
+  return And(m, Ge(a, b));
+}
+
+template <class V, class M, class D = DFromV<V>>
+HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
+  return And(m, IsNaN(v));
+}
+#endif  // HWY_NATIVE_MASKED_COMP
+
 // ------------------------------ IfNegativeThenNegOrUndefIfZero
 
 #if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
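Note: the HWY_NATIVE_ZERO_MASKED_ARITH block defines masked ops that compute their result in active lanes and return zero elsewhere (IfThenElseZero semantics), and the masked comparisons simply AND the comparison mask with m. A hypothetical sketch (invented helper name, static dispatch assumed):

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Add `delta` only where v is positive; lanes that fail the test become zero.
template <class D>
hn::Vec<D> AddWherePositive(D d, hn::Vec<D> v, hn::Vec<D> delta) {
  const auto m = hn::Gt(v, hn::Zero(d));
  return hn::MaskedAdd(m, v, delta);  // zero in lanes where m is false
}
```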
@@ -672,6 +932,18 @@ HWY_API V SaturatedAbs(V v) {
 
 #endif
 
+// ------------------------------ MaskedAbsOr
+template <class V, HWY_IF_SIGNED_V(V), class M>
+HWY_API V MaskedAbsOr(V no, M m, V v) {
+  return IfThenElse(m, Abs(v), no);
+}
+
+// ------------------------------ MaskedAbs
+template <class V, HWY_IF_SIGNED_V(V), class M>
+HWY_API V MaskedAbs(M m, V v) {
+  return IfThenElseZero(m, Abs(v));
+}
+
 // ------------------------------ Reductions
 
 // Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled,
@@ -855,6 +1127,7 @@ HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
 #else
 #define HWY_NATIVE_REDUCE_SUM_4_UI8
 #endif
+
 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
 HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
   const Twice<RepartitionToWide<decltype(d)>> dw;
@@ -882,6 +1155,30 @@ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
 }
 #endif  // HWY_NATIVE_REDUCE_MINMAX_4_UI8
 
+#if (defined(HWY_NATIVE_MASKED_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR
+#undef HWY_NATIVE_MASKED_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_MASKED_REDUCE_SCALAR
+#endif
+
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceSum(D d, M m, VFromD<D> v) {
+  return ReduceSum(d, IfThenElseZero(m, v));
+}
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceMin(D d, M m, VFromD<D> v) {
+  return ReduceMin(
+      d, IfThenElse(m, v, Set(d, hwy::PositiveInfOrHighestValue<TFromD<D>>())));
+}
+template <class D, class M>
+HWY_API TFromD<D> MaskedReduceMax(D d, M m, VFromD<D> v) {
+  return ReduceMax(
+      d, IfThenElse(m, v, Set(d, hwy::NegativeInfOrLowestValue<TFromD<D>>())));
+}
+
+#endif  // HWY_NATIVE_MASKED_REDUCE_SCALAR
+
 // ------------------------------ IsEitherNaN
 #if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
 #ifdef HWY_NATIVE_IS_EITHER_NAN
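Note: MaskedReduceSum/Min/Max reduce only the active lanes (inactive lanes contribute zero, or the type's highest/lowest value respectively). A small sketch, assuming static dispatch; `SumOfFiniteLanes` is an invented helper:

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Sum of the finite lanes of v; NaN/inf lanes are excluded by the mask.
float SumOfFiniteLanes(hn::Vec<hn::ScalableTag<float>> v) {
  const hn::ScalableTag<float> d;
  return hn::MaskedReduceSum(d, hn::IsFinite(v), v);
}
```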
@@ -946,6 +1243,225 @@ HWY_API MFromD<D> IsFinite(const V v) {
 
 #endif  // HWY_NATIVE_ISINF
 
+// ------------------------------ CeilInt/FloorInt
+#if (defined(HWY_NATIVE_CEIL_FLOOR_INT) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_CEIL_FLOOR_INT
+#undef HWY_NATIVE_CEIL_FLOOR_INT
+#else
+#define HWY_NATIVE_CEIL_FLOOR_INT
+#endif
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  return ConvertTo(di, Ceil(v));
+}
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  return ConvertTo(di, Floor(v));
+}
+
+#endif  // HWY_NATIVE_CEIL_FLOOR_INT
+
+// ------------------------------ MulByPow2/MulByFloorPow2
+
+#if (defined(HWY_NATIVE_MUL_BY_POW2) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MUL_BY_POW2
+#undef HWY_NATIVE_MUL_BY_POW2
+#else
+#define HWY_NATIVE_MUL_BY_POW2
+#endif
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulByPow2(V v, VFromD<RebindToSigned<DFromV<V>>> exp) {
+  const DFromV<decltype(v)> df;
+  const RebindToUnsigned<decltype(df)> du;
+  const RebindToSigned<decltype(df)> di;
+
+  using TF = TFromD<decltype(df)>;
+  using TI = TFromD<decltype(di)>;
+  using TU = TFromD<decltype(du)>;
+
+  using VF = VFromD<decltype(df)>;
+  using VI = VFromD<decltype(di)>;
+
+  constexpr TI kMaxBiasedExp = MaxExponentField<TF>();
+  static_assert(kMaxBiasedExp > 0, "kMaxBiasedExp > 0 must be true");
+
+  constexpr TI kExpBias = static_cast<TI>(kMaxBiasedExp >> 1);
+  static_assert(kExpBias > 0, "kExpBias > 0 must be true");
+  static_assert(kExpBias <= LimitsMax<TI>() / 3,
+                "kExpBias <= LimitsMax<TI>() / 3 must be true");
+
+#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
+  using TExpMinMax = If<(sizeof(TI) <= 4), TI, int32_t>;
+#elif (HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2) || \
+    HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256
+  using TExpMinMax = int16_t;
+#else
+  using TExpMinMax = TI;
+#endif
+
+#if HWY_TARGET == HWY_EMU128 || HWY_TARGET == HWY_SCALAR
+  using TExpSatSub = TU;
+#elif HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
+    HWY_TARGET == HWY_WASM_EMU256
+  using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, uint16_t>;
+#elif HWY_TARGET_IS_PPC
+  using TExpSatSub = If<(sizeof(TF) >= 4), uint32_t, TU>;
+#else
+  using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, TU>;
+#endif
+
+  static_assert(kExpBias <= static_cast<TI>(LimitsMax<TExpMinMax>() / 3),
+                "kExpBias <= LimitsMax<TExpMinMax>() / 3 must be true");
+
+  const Repartition<TExpMinMax, decltype(df)> d_exp_min_max;
+  const Repartition<TExpSatSub, decltype(df)> d_sat_exp_sub;
+
+  constexpr int kNumOfExpBits = ExponentBits<TF>();
+  constexpr int kNumOfMantBits = MantissaBits<TF>();
+
+  // The sign bit of BitCastScalar<TU>(a[i]) >> kNumOfMantBits can be zeroed out
+  // using SaturatedSub if kZeroOutSignUsingSatSub is true.
+
+  // If kZeroOutSignUsingSatSub is true, then val_for_exp_sub will be bitcasted
+  // to a vector that has a smaller lane size than TU for the SaturatedSub
+  // operation below.
+  constexpr bool kZeroOutSignUsingSatSub =
+      ((sizeof(TExpSatSub) * 8) == static_cast<size_t>(kNumOfExpBits));
+
+  // If kZeroOutSignUsingSatSub is true, then the upper
+  // (sizeof(TU) - sizeof(TExpSatSub)) * 8 bits of kExpDecrBy1Bits will be all
+  // ones and the lower sizeof(TExpSatSub) * 8 bits of kExpDecrBy1Bits will be
+  // equal to 1.
+
+  // Otherwise, if kZeroOutSignUsingSatSub is false, kExpDecrBy1Bits will be
+  // equal to 1.
+  constexpr TU kExpDecrBy1Bits = static_cast<TU>(
+      TU{1} - (static_cast<TU>(kZeroOutSignUsingSatSub) << kNumOfExpBits));
+
+  VF val_for_exp_sub = v;
+  HWY_IF_CONSTEXPR(!kZeroOutSignUsingSatSub) {
+    // If kZeroOutSignUsingSatSub is not true, zero out the sign bit of
+    // val_for_exp_sub[i] using Abs
+    val_for_exp_sub = Abs(val_for_exp_sub);
+  }
+
+  // min_exp1_plus_min_exp2[i] is the smallest exponent such that
+  // min_exp1_plus_min_exp2[i] >= 2 - kExpBias * 2 and
+  // std::ldexp(v[i], min_exp1_plus_min_exp2[i]) is a normal floating-point
+  // number if v[i] is a normal number
+  const VI min_exp1_plus_min_exp2 = BitCast(
+      di,
+      Max(BitCast(
+              d_exp_min_max,
+              Neg(BitCast(
+                  di,
+                  SaturatedSub(
+                      BitCast(d_sat_exp_sub, ShiftRight<kNumOfMantBits>(
+                                                 BitCast(du, val_for_exp_sub))),
+                      BitCast(d_sat_exp_sub, Set(du, kExpDecrBy1Bits)))))),
+          BitCast(d_exp_min_max,
+                  Set(di, static_cast<TI>(2 - kExpBias - kExpBias)))));
+
+  const VI clamped_exp =
+      Max(Min(exp, Set(di, static_cast<TI>(kExpBias * 3))),
+          Add(min_exp1_plus_min_exp2, Set(di, static_cast<TI>(1 - kExpBias))));
+
+  const VI exp1_plus_exp2 = BitCast(
+      di, Max(Min(BitCast(d_exp_min_max,
+                          Sub(clamped_exp, ShiftRight<2>(clamped_exp))),
+                  BitCast(d_exp_min_max,
+                          Set(di, static_cast<TI>(kExpBias + kExpBias)))),
+              BitCast(d_exp_min_max, min_exp1_plus_min_exp2)));
+
+  const VI exp1 = ShiftRight<1>(exp1_plus_exp2);
+  const VI exp2 = Sub(exp1_plus_exp2, exp1);
+  const VI exp3 = Sub(clamped_exp, exp1_plus_exp2);
+
+  const VI exp_bias = Set(di, kExpBias);
+
+  const VF factor1 =
+      BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp1, exp_bias)));
+  const VF factor2 =
+      BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp2, exp_bias)));
+  const VF factor3 =
+      BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp3, exp_bias)));
+
+  return Mul(Mul(Mul(v, factor1), factor2), factor3);
+}
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulByFloorPow2(V v, V exp) {
+  const DFromV<decltype(v)> df;
+
+  // MulByFloorPow2 special cases:
+  // MulByFloorPow2(v, NaN) => NaN
+  // MulByFloorPow2(0, inf) => NaN
+  // MulByFloorPow2(inf, -inf) => NaN
+  // MulByFloorPow2(-inf, -inf) => NaN
+  const auto is_special_case_with_nan_result =
+      Or(IsNaN(exp),
+         And(Eq(Abs(v), IfNegativeThenElseZero(exp, Inf(df))), IsInf(exp)));
+
+  return IfThenElse(is_special_case_with_nan_result, NaN(df),
+                    MulByPow2(v, FloorInt(exp)));
+}
+
+#endif  // HWY_NATIVE_MUL_BY_POW2
+
+// ------------------------------ GetBiasedExponent
+#if (defined(HWY_NATIVE_GET_BIASED_EXPONENT) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_GET_BIASED_EXPONENT
+#undef HWY_NATIVE_GET_BIASED_EXPONENT
+#else
+#define HWY_NATIVE_GET_BIASED_EXPONENT
+#endif
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API VFromD<RebindToUnsigned<DFromV<V>>> GetBiasedExponent(V v) {
+  using T = TFromV<V>;
+
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  constexpr int kNumOfMantBits = MantissaBits<T>();
+  return ShiftRight<kNumOfMantBits>(BitCast(du, Abs(v)));
+}
+
+#endif
+
+// ------------------------------ GetExponent
+
+#if (defined(HWY_NATIVE_GET_EXPONENT) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_GET_EXPONENT
+#undef HWY_NATIVE_GET_EXPONENT
+#else
+#define HWY_NATIVE_GET_EXPONENT
+#endif
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V GetExponent(V v) {
+  const DFromV<V> d;
+  using T = TFromV<V>;
+  const RebindToSigned<decltype(d)> di;
+
+  const auto exponent_offset = Set(di, MaxExponentField<T>() >> 1);
+
+  // extract exponent bits as integer
+  const auto encoded_exponent = GetBiasedExponent(v);
+  const auto exponent_int = Sub(BitCast(di, encoded_exponent), exponent_offset);
+
+  // convert integer to original type
+  return ConvertTo(d, exponent_int);
+}
+
+#endif  // HWY_NATIVE_GET_EXPONENT
 // ------------------------------ LoadInterleaved2
 
 #if HWY_IDE || \
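Note: CeilInt/FloorInt convert to the signed integer type with rounding toward +inf/-inf in one call, and GetExponent returns the unbiased exponent as a float. Illustrative sketch (invented names; n assumed to be a multiple of the vector length):

```cpp
#include <stdint.h>

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void ExponentAndFloor(const float* HWY_RESTRICT in,
                      float* HWY_RESTRICT exp_out,
                      int32_t* HWY_RESTRICT floor_out, size_t n) {
  const hn::ScalableTag<float> df;
  const hn::RebindToSigned<decltype(df)> di;
  for (size_t i = 0; i < n; i += hn::Lanes(df)) {
    const auto v = hn::LoadU(df, in + i);
    hn::StoreU(hn::GetExponent(v), df, exp_out + i);  // e.g. 8.0f -> 3.0f
    hn::StoreU(hn::FloorInt(v), di, floor_out + i);   // round toward -inf
  }
}
```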
@@ -1819,6 +2335,110 @@ HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
 
 #endif  // HWY_NATIVE_LOAD_STORE_INTERLEAVED
 
+// ------------------------------ PairwiseAdd/PairwiseSub
+#if (defined(HWY_NATIVE_PAIRWISE_ADD) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_PAIRWISE_ADD
+#undef HWY_NATIVE_PAIRWISE_ADD
+#else
+#define HWY_NATIVE_PAIRWISE_ADD
+#endif
+
+template <class D, class V = VFromD<D>(), HWY_IF_LANES_GT_D(D, 1)>
+HWY_API V PairwiseAdd(D d, V a, V b) {
+  return Add(InterleaveEven(d, a, b), InterleaveOdd(d, a, b));
+}
+
+#endif
+
+#if (defined(HWY_NATIVE_PAIRWISE_SUB) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_PAIRWISE_SUB
+#undef HWY_NATIVE_PAIRWISE_SUB
+#else
+#define HWY_NATIVE_PAIRWISE_SUB
+#endif
+
+template <class D, class V = VFromD<D>(), HWY_IF_LANES_GT_D(D, 1)>
+HWY_API V PairwiseSub(D d, V a, V b) {
+  return Sub(InterleaveOdd(d, a, b), InterleaveEven(d, a, b));
+}
+
+#endif
+
+// Load/StoreInterleaved for special floats. Requires HWY_GENERIC_IF_EMULATED_D
+// is defined such that it is true only for types that actually require these
+// generic implementations.
+#if HWY_IDE || (defined(HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED) == \
+                defined(HWY_TARGET_TOGGLE) && \
+                defined(HWY_GENERIC_IF_EMULATED_D))
+#ifdef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
+#undef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
+#else
+#define HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
+#endif
+#if HWY_IDE
+#define HWY_GENERIC_IF_EMULATED_D(D) int
+#endif
+
+template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
+HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
+                              VFromD<D>& v0, VFromD<D>& v1) {
+  const RebindToUnsigned<decltype(d)> du;
+  VFromD<decltype(du)> vu0, vu1;
+  LoadInterleaved2(du, detail::U16LanePointer(unaligned), vu0, vu1);
+  v0 = BitCast(d, vu0);
+  v1 = BitCast(d, vu1);
+}
+
+template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
+HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
+                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
+  const RebindToUnsigned<decltype(d)> du;
+  VFromD<decltype(du)> vu0, vu1, vu2;
+  LoadInterleaved3(du, detail::U16LanePointer(unaligned), vu0, vu1, vu2);
+  v0 = BitCast(d, vu0);
+  v1 = BitCast(d, vu1);
+  v2 = BitCast(d, vu2);
+}
+
+template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
+HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
+                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
+                              VFromD<D>& v3) {
+  const RebindToUnsigned<decltype(d)> du;
+  VFromD<decltype(du)> vu0, vu1, vu2, vu3;
+  LoadInterleaved4(du, detail::U16LanePointer(unaligned), vu0, vu1, vu2, vu3);
+  v0 = BitCast(d, vu0);
+  v1 = BitCast(d, vu1);
+  v2 = BitCast(d, vu2);
+  v3 = BitCast(d, vu3);
+}
+
+template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
+HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
+                               T* HWY_RESTRICT unaligned) {
+  const RebindToUnsigned<decltype(d)> du;
+  StoreInterleaved2(BitCast(du, v0), BitCast(du, v1), du,
+                    detail::U16LanePointer(unaligned));
+}
+
+template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
+HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
+                               T* HWY_RESTRICT unaligned) {
+  const RebindToUnsigned<decltype(d)> du;
+  StoreInterleaved3(BitCast(du, v0), BitCast(du, v1), BitCast(du, v2), du,
+                    detail::U16LanePointer(unaligned));
+}
+
+template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
+HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
+                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
+  const RebindToUnsigned<decltype(d)> du;
+  StoreInterleaved4(BitCast(du, v0), BitCast(du, v1), BitCast(du, v2),
+                    BitCast(du, v3), du, detail::U16LanePointer(unaligned));
+}
+
+#endif  // HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
+
 // ------------------------------ LoadN
 
 #if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))
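Note: per the interleave-based generic implementation above, PairwiseAdd(d, a, b) produces interleaved pairwise sums: lane 2i holds a[2i] + a[2i+1] and lane 2i+1 holds b[2i] + b[2i+1] (PairwiseSub gives the corresponding differences). A sketch of what that means for a single call (illustrative helper name only):

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Horizontal pair sums of a and b, interleaved as described above.
template <class D>
hn::Vec<D> HorizontalPairSums(D d, hn::Vec<D> a, hn::Vec<D> b) {
  // For 4 lanes: {a0+a1, b0+b1, a2+a3, b2+b3}.
  return hn::PairwiseAdd(d, a, b);
}
```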
@@ -2327,6 +2947,24 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
 
 #endif  // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
 
+// ------------------------------ TruncateStore
+#if (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_STORE_TRUNCATED
+#undef HWY_NATIVE_STORE_TRUNCATED
+#else
+#define HWY_NATIVE_STORE_TRUNCATED
+#endif
+
+template <class D, class T, HWY_IF_T_SIZE_GT_D(D, sizeof(T)),
+          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
+HWY_API void TruncateStore(VFromD<D> v, const D /*d*/, T* HWY_RESTRICT p) {
+  using DTo = Rebind<T, D>;
+  DTo dsmall;
+  StoreU(TruncateTo(dsmall, v), dsmall, p);
+}
+
+#endif  // (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE))
+
 // ------------------------------ Scatter
 
 #if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
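Note: TruncateStore stores each lane to a narrower integer element type in one call. Sketch (invented names; u32 to u8, n a multiple of the vector length):

```cpp
#include <stdint.h>

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Store the low byte of every u32 lane.
void StoreLowBytes(const uint32_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT out,
                   size_t n) {
  const hn::ScalableTag<uint32_t> d32;
  for (size_t i = 0; i < n; i += hn::Lanes(d32)) {
    hn::TruncateStore(hn::LoadU(d32, in + i), d32, out + i);
  }
}
```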
@@ -2511,17 +3149,9 @@ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
 
 template <class D, typename T = TFromD<D>>
 HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
-                               VFromD<RebindToSigned<D>> index,
-                               const size_t max_lanes_to_load) {
-  const RebindToSigned<decltype(d)> di;
-  using TI = TFromD<decltype(di)>;
-  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
-
-  VFromD<D> v = Zero(d);
-  for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
-    v = InsertLane(v, i, base[ExtractLane(index, i)]);
-  }
-  return v;
+                               VFromD<RebindToSigned<D>> index,
+                               const size_t max_lanes_to_load) {
+  return GatherIndexNOr(Zero(d), d, base, index, max_lanes_to_load);
 }
 
 template <class D, typename T = TFromD<D>>
@@ -2533,8 +3163,9 @@ HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
   static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
 
   VFromD<D> v = no;
-  for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
-    v = InsertLane(v, i, base[ExtractLane(index, i)]);
+  for (size_t i = 0; i < MaxLanes(d); ++i) {
+    if (i < max_lanes_to_load)
+      v = InsertLane(v, i, base[ExtractLane(index, i)]);
   }
   return v;
 }
@@ -3561,6 +4192,21 @@ HWY_API V TrailingZeroCount(V v) {
 }
 #endif  // HWY_NATIVE_LEADING_ZERO_COUNT
 
+// ------------------------------ MaskedLeadingZeroCount
+#if (defined(HWY_NATIVE_MASKED_LEADING_ZERO_COUNT) == \
+     defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+#undef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+#else
+#define HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+#endif
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), class M>
+HWY_API V MaskedLeadingZeroCount(M m, V v) {
+  return IfThenElseZero(m, LeadingZeroCount(v));
+}
+#endif  // HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
+
 // ------------------------------ AESRound
 
 // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
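Note: MaskedLeadingZeroCount zeroes the count in inactive lanes. A brief illustrative sketch:

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Leading-zero count per lane, forced to 0 for lanes that are themselves zero.
template <class D>
hn::Vec<D> LzcntOfNonZeroLanes(D d, hn::Vec<D> v) {
  return hn::MaskedLeadingZeroCount(hn::Ne(v, hn::Zero(d)), v);
}
```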
@@ -4027,6 +4673,12 @@ HWY_API V operator*(V x, V y) {
 
 #endif  // HWY_NATIVE_MUL_64
 
+// ------------------------------ MulRound
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V MulRound(V a, V b) {
+  return Round(Mul(a, b));
+}
+
 // ------------------------------ MulAdd / NegMulAdd
 
 #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
@@ -4057,6 +4709,86 @@ HWY_API V MulSub(V mul, V x, V sub) {
   return Sub(Mul(mul, x), sub);
 }
 #endif  // HWY_NATIVE_INT_FMA
+// ------------------------------ MulComplex* / MaskedMulComplex*
+
+#if (defined(HWY_NATIVE_CPLX) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_CPLX
+#undef HWY_NATIVE_CPLX
+#else
+#define HWY_NATIVE_CPLX
+#endif
+
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+
+template <class V, HWY_IF_NOT_UNSIGNED(TFromV<V>)>
+HWY_API V ComplexConj(V a) {
+  return OddEven(Neg(a), a);
+}
+
+template <class V>
+HWY_API V MulComplex(V a, V b) {
+  // a = u + iv, b = x + iy
+  const auto u = DupEven(a);
+  const auto v = DupOdd(a);
+  const auto x = DupEven(b);
+  const auto y = DupOdd(b);
+
+  return OddEven(MulAdd(u, y, Mul(v, x)), Sub(Mul(u, x), Mul(v, y)));
+}
+
+template <class V>
+HWY_API V MulComplexConj(V a, V b) {
+  // a = u + iv, b = x + iy
+  const auto u = DupEven(a);
+  const auto v = DupOdd(a);
+  const auto x = DupEven(b);
+  const auto y = DupOdd(b);
+
+  return OddEven(Sub(Mul(v, x), Mul(u, y)), MulAdd(u, x, Mul(v, y)));
+}
+
+template <class V>
+HWY_API V MulComplexAdd(V a, V b, V c) {
+  return Add(MulComplex(a, b), c);
+}
+
+template <class V>
+HWY_API V MulComplexConjAdd(V a, V b, V c) {
+  return Add(MulComplexConj(a, b), c);
+}
+
+template <class V, class M>
+HWY_API V MaskedMulComplexConjAdd(M mask, V a, V b, V c) {
+  return IfThenElseZero(mask, MulComplexConjAdd(a, b, c));
+}
+
+template <class V, class M>
+HWY_API V MaskedMulComplexConj(M mask, V a, V b) {
+  return IfThenElseZero(mask, MulComplexConj(a, b));
+}
+
+template <class V, class M>
+HWY_API V MaskedMulComplexOr(V no, M mask, V a, V b) {
+  return IfThenElse(mask, MulComplex(a, b), no);
+}
+#endif  // HWY_TARGET != HWY_SCALAR
+
+#endif  // HWY_NATIVE_CPLX
+
+// ------------------------------ MaskedMulAddOr
+#if (defined(HWY_NATIVE_MASKED_INT_FMA) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MASKED_INT_FMA
+#undef HWY_NATIVE_MASKED_INT_FMA
+#else
+#define HWY_NATIVE_MASKED_INT_FMA
+#endif
+
+template <class V, class M>
+HWY_API V MaskedMulAddOr(V no, M m, V mul, V x, V add) {
+  return IfThenElse(m, MulAdd(mul, x, add), no);
+}
+
+#endif  // HWY_NATIVE_MASKED_INT_FMA
 
 // ------------------------------ Integer MulSub / NegMulSub
 #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
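Note: the MulComplex* family treats even/odd lane pairs as (real, imaginary) components; per the guard above it is unavailable on the scalar target. Sketch, assuming interleaved complex floats and static dispatch (names invented, num_floats a multiple of the vector length):

```cpp
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// out[k] = a[k] * b[k] for complex numbers stored as interleaved (re, im).
void ComplexMultiply(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
                     float* HWY_RESTRICT out, size_t num_floats) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < num_floats; i += hn::Lanes(d)) {
    hn::StoreU(hn::MulComplex(hn::LoadU(d, a + i), hn::LoadU(d, b + i)), d,
               out + i);
  }
}
```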
@@ -4112,6 +4844,25 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
       OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
   return MulAdd(mul, x, add);
 }
+// ------------------------------ MulSubAdd
+
+template <class V>
+HWY_API V MulSubAdd(V mul, V x, V sub_or_add) {
+  using D = DFromV<V>;
+  using T = TFromD<D>;
+  using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
+
+  const D d;
+  const Rebind<TNegate, D> d_negate;
+
+  return MulAddSub(mul, x, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
+}
+
+// ------------------------------ MaskedConvertTo
+template <class D, class V, class M>
+HWY_API VFromD<D> MaskedConvertTo(M m, D d, V v) {
+  return IfThenElseZero(m, ConvertTo(d, v));
+}
 
 // ------------------------------ Integer division
 #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
@@ -4574,7 +5325,9 @@ HWY_INLINE V IntDiv(V a, V b) {
 template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
           HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
                                       HWY_TARGET == HWY_WASM ||
-                                      HWY_TARGET == HWY_WASM_EMU256)
+                                      HWY_TARGET == HWY_WASM_EMU256 ||
+                                      HWY_TARGET == HWY_LSX ||
+                                      HWY_TARGET == HWY_LASX)
                                          ? 0
                                          : (1 << 1)) |
                                         (1 << 2) | (1 << 4) | (1 << 8))>
@@ -4582,8 +5335,9 @@ HWY_INLINE V IntMod(V a, V b) {
   return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
 }
 
-#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
-    HWY_TARGET == HWY_WASM_EMU256
+#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
+    HWY_TARGET == HWY_WASM_EMU256 || HWY_TARGET == HWY_LSX || \
+    HWY_TARGET == HWY_LASX
 template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
           HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
 HWY_INLINE V IntMod(V a, V b) {
@@ -4602,7 +5356,7 @@ HWY_INLINE V IntMod(V a, V b) {
       IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
 }
 #endif  // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
-        // HWY_WASM_EMU256
+        // HWY_WASM_EMU256 || HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
 
 }  // namespace detail
 
@@ -4655,6 +5409,102 @@ HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
 
 #endif  // HWY_NATIVE_INT_DIV
 
+// ------------------------------ AverageRound
+
+#if (defined(HWY_NATIVE_AVERAGE_ROUND_UI32) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
+#undef HWY_NATIVE_AVERAGE_ROUND_UI32
+#else
+#define HWY_NATIVE_AVERAGE_ROUND_UI32
+#endif
+
+template <class V, HWY_IF_UI32(TFromV<V>)>
+HWY_API V AverageRound(V a, V b) {
+  return Sub(Or(a, b), ShiftRight<1>(Xor(a, b)));
+}
+
+#endif  // HWY_NATIVE_AVERAGE_ROUND_UI64
+
+#if (defined(HWY_NATIVE_AVERAGE_ROUND_UI64) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
+#undef HWY_NATIVE_AVERAGE_ROUND_UI64
+#else
+#define HWY_NATIVE_AVERAGE_ROUND_UI64
+#endif
+
+#if HWY_HAVE_INTEGER64
+template <class V, HWY_IF_UI64(TFromV<V>)>
+HWY_API V AverageRound(V a, V b) {
+  return Sub(Or(a, b), ShiftRight<1>(Xor(a, b)));
+}
+#endif
+
+#endif  // HWY_NATIVE_AVERAGE_ROUND_UI64
+
+// ------------------------------ RoundingShiftRight (AverageRound)
+
+#if (defined(HWY_NATIVE_ROUNDING_SHR) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ROUNDING_SHR
+#undef HWY_NATIVE_ROUNDING_SHR
+#else
+#define HWY_NATIVE_ROUNDING_SHR
+#endif
+
+template <int kShiftAmt, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V RoundingShiftRight(V v) {
+  const DFromV<V> d;
+  using T = TFromD<decltype(d)>;
+
+  static_assert(
+      0 <= kShiftAmt && kShiftAmt <= static_cast<int>(sizeof(T) * 8 - 1),
+      "kShiftAmt is out of range");
+
+  constexpr int kScaleDownShrAmt = HWY_MAX(kShiftAmt - 1, 0);
+
+  auto scaled_down_v = v;
+  HWY_IF_CONSTEXPR(kScaleDownShrAmt > 0) {
+    scaled_down_v = ShiftRight<kScaleDownShrAmt>(v);
+  }
+
+  HWY_IF_CONSTEXPR(kShiftAmt == 0) { return scaled_down_v; }
+
+  return AverageRound(scaled_down_v, Zero(d));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V RoundingShiftRightSame(V v, int shift_amt) {
+  const DFromV<V> d;
+  using T = TFromD<decltype(d)>;
+
+  const int shift_amt_is_zero_mask = -static_cast<int>(shift_amt == 0);
+
+  const auto scaled_down_v = ShiftRightSame(
+      v, static_cast<int>(static_cast<unsigned>(shift_amt) +
+                          static_cast<unsigned>(~shift_amt_is_zero_mask)));
+
+  return AverageRound(
+      scaled_down_v,
+      And(scaled_down_v, Set(d, static_cast<T>(shift_amt_is_zero_mask))));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V RoundingShr(V v, V amt) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  using T = TFromD<decltype(d)>;
+  using TU = MakeUnsigned<T>;
+
+  const auto unsigned_amt = BitCast(du, amt);
+  const auto scale_down_shr_amt =
+      BitCast(d, SaturatedSub(unsigned_amt, Set(du, TU{1})));
+
+  const auto scaled_down_v = Shr(v, scale_down_shr_amt);
+  return AverageRound(scaled_down_v,
+                      IfThenElseZero(Eq(amt, Zero(d)), scaled_down_v));
+}
+
+#endif  // HWY_NATIVE_ROUNDING_SHR
+
 // ------------------------------ MulEvenAdd (PromoteEvenTo)
 
 // SVE with bf16 and NEON with bf16 override this.
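Note: RoundingShiftRight<k> shifts right by k but rounds to nearest instead of truncating; for unsigned lanes the generic definition above behaves like (x + (1 << (k-1))) >> k. Sketch (invented names; n a multiple of the vector length):

```cpp
#include <stdint.h>

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Scale 16-bit samples down by 2^4 with rounding, i.e. (x + 8) >> 4.
void ScaleDown16(const uint16_t* HWY_RESTRICT in, uint16_t* HWY_RESTRICT out,
                 size_t n) {
  const hn::ScalableTag<uint16_t> d;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    hn::StoreU(hn::RoundingShiftRight<4>(hn::LoadU(d, in + i)), d, out + i);
  }
}
```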
@@ -4835,6 +5685,26 @@ HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
 
 #endif  // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
 
+// ------------------------------ MaskedSqrt
+
+#if (defined(HWY_NATIVE_MASKED_SQRT) == defined(HWY_TARGET_TOGGLE))
+
+#ifdef HWY_NATIVE_MASKED_SQRT
+#undef HWY_NATIVE_MASKED_SQRT
+#else
+#define HWY_NATIVE_MASKED_SQRT
+#endif
+template <class V, HWY_IF_FLOAT_V(V), class M>
+HWY_API V MaskedSqrt(M m, V v) {
+  return IfThenElseZero(m, Sqrt(v));
+}
+
+template <class V, HWY_IF_FLOAT_V(V), class M>
+HWY_API V MaskedSqrtOr(V no, M m, V v) {
+  return IfThenElse(m, Sqrt(v), no);
+}
+#endif
+
 // ------------------------------ SumOfMulQuadAccumulate
 
 #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
@@ -5019,6 +5889,12 @@ HWY_API V ApproximateReciprocal(V v) {
 
 #endif  // HWY_NATIVE_F64_APPROX_RECIP
 
+// ------------------------------ MaskedApproximateReciprocal
+template <class V, HWY_IF_FLOAT_V(V), class M>
+HWY_API V MaskedApproximateReciprocal(M m, V v) {
+  return IfThenElseZero(m, ApproximateReciprocal(v));
+}
+
 // ------------------------------ F64 ApproximateReciprocalSqrt
 
 #if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE))
@@ -5044,6 +5920,12 @@ HWY_API V ApproximateReciprocalSqrt(V v) {
 
 #endif  // HWY_NATIVE_F64_APPROX_RSQRT
 
+// ------------------------------ MaskedApproximateReciprocalSqrt
+template <class V, HWY_IF_FLOAT_V(V), class M>
+HWY_API V MaskedApproximateReciprocalSqrt(M m, V v) {
+  return IfThenElseZero(m, ApproximateReciprocalSqrt(v));
+}
+
 // ------------------------------ Compress*
 
 #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
@@ -5257,13 +6139,6 @@ HWY_API V CompressNot(V v, M mask) {
 
 namespace detail {
 
-#if HWY_IDE
-template <class M>
-HWY_INLINE uint64_t BitsFromMask(M /* mask */) {
-  return 0;
-}
-#endif  // HWY_IDE
-
 template <size_t N>
 HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) {
   static_assert(N <= 8, "Should only be called for half-vectors");
@@ -5537,7 +6412,7 @@ template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)>
 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
   const DFromV<decltype(v)> d;
 
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);
   const Vec128<uint8_t, N> indices =
       detail::IndicesForExpandFromBits<N>(mask_bits);
   return BitCast(d, TableLookupBytesOr0(v, indices));
@@ -5551,15 +6426,16 @@ HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
   const Half<decltype(du)> duh;
   const Vec128<uint8_t> vu = BitCast(du, v);
 
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);
   const uint64_t maskL = mask_bits & 0xFF;
   const uint64_t maskH = mask_bits >> 8;
 
   // We want to skip past the v bytes already consumed by idxL. There is no
   // instruction for shift-reg by variable bytes. Storing v itself would work
   // but would involve a store-load forwarding stall. We instead shuffle using
-  // loaded indices.
-  //
+  // loaded indices.
+  // TODO: MultiRotateRight would also help, but if we have that, we probably
+  // also have native 8-bit Expand?
   alignas(16) static constexpr uint8_t iota[32] = {
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
       11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128,
@@ -5583,7 +6459,7 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
   const RebindToUnsigned<decltype(d)> du;
 
   const Rebind<uint8_t, decltype(d)> du8;
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);
 
   // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply
   // the nibble trick used below because not all indices fit within one lane.
@@ -5865,7 +6741,7 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
   const DFromV<decltype(v)> d;
   const RebindToUnsigned<decltype(d)> du;
 
-  const uint64_t mask_bits =
+  const uint64_t mask_bits = BitsFromMask(d, mask);
 
   alignas(16) static constexpr uint32_t packed_array[16] = {
       // PrintExpand64x4Nibble - same for 32x4.
@@ -6550,6 +7426,107 @@ HWY_API V Per4LaneBlockShuffle(V v) {
|
|
|
6550
7426
|
}
|
|
6551
7427
|
#endif
|
|
6552
7428
|
|
|
7429
|
+
// ------------------------------ PairwiseAdd128/PairwiseSub128
|
|
7430
|
+
// (Per4LaneBlockShuffle)
|
|
7431
|
+
#if (defined(HWY_NATIVE_PAIRWISE_ADD_128) == defined(HWY_TARGET_TOGGLE))
|
|
7432
|
+
#ifdef HWY_NATIVE_PAIRWISE_ADD_128
|
|
7433
|
+
#undef HWY_NATIVE_PAIRWISE_ADD_128
|
|
7434
|
+
#else
|
|
7435
|
+
#define HWY_NATIVE_PAIRWISE_ADD_128
|
|
7436
|
+
#endif
|
|
7437
|
+
|
|
7438
|
+
namespace detail {
|
|
7439
|
+
|
|
7440
|
+
// detail::BlockwiseConcatOddEven(d, v) returns the even lanes of each block of
|
|
7441
|
+
// v followed by the odd lanes of v
|
|
7442
|
+
+#if HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV || \
+    HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
+          HWY_IF_V_SIZE_GT_D(D, 8)>
+static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d,
+                                                                 Vec<D> v) {
+#if HWY_TARGET == HWY_RVV
+  const ScalableTag<uint64_t, HWY_MAX(HWY_POW2_D(D), 0)> du64;
+#else
+  const Repartition<uint64_t, DFromV<decltype(v)>> du64;
+#endif
+
+  const Repartition<TFromD<decltype(d)>, decltype(du64)> d_concat;
+  const auto v_to_concat = ResizeBitCast(d_concat, v);
+
+  const auto evens = ConcatEven(d, v_to_concat, v_to_concat);
+  const auto odds = ConcatOdd(d, v_to_concat, v_to_concat);
+  return ResizeBitCast(
+      d, InterleaveWholeLower(BitCast(du64, evens), BitCast(du64, odds)));
+}
+
+#else  // !(HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV)
+
+template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
+static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d,
+                                                                 Vec<D> v) {
+#if HWY_TARGET == HWY_SSE2
+  const RebindToUnsigned<decltype(d)> du;
+  const RebindToSigned<RepartitionToWide<decltype(du)>> dw;
+
+  const auto vu = BitCast(du, v);
+  return BitCast(
+      d, OrderedDemote2To(du, PromoteEvenTo(dw, vu), PromoteOddTo(dw, vu)));
+#else
+  const Repartition<uint8_t, decltype(d)> du8;
+  const auto idx =
+      BitCast(d, Dup128VecFromValues(du8, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7,
+                                     9, 11, 13, 15));
+  return TableLookupBytes(v, idx);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
+static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d,
+                                                                 Vec<D> v) {
+#if HWY_TARGET == HWY_SSE2
+  const RebindToSigned<decltype(d)> di;
+  const RepartitionToWide<decltype(di)> dw;
+  const auto vi = BitCast(di, v);
+  return BitCast(
+      d, OrderedDemote2To(di, PromoteEvenTo(dw, vi), PromoteOddTo(dw, vi)));
+#else
+  const Repartition<uint8_t, decltype(d)> du8;
+  const auto idx = BitCast(d, Dup128VecFromValues(du8, 0, 1, 4, 5, 8, 9, 12, 13,
+                                                  2, 3, 6, 7, 10, 11, 14, 15));
+  return TableLookupBytes(v, idx);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
+static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D /*d*/,
+                                                                 Vec<D> v) {
+  return Per4LaneBlockShuffle<3, 1, 2, 0>(v);
+}
+#endif  // HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV
+
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
+static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D /*d*/,
+                                                                 Vec<D> v) {
+  return v;
+}
+
+}  // namespace detail
+
+// Pairwise add with output in 128 bit blocks of a and b.
+template <class D, HWY_IF_PAIRWISE_ADD_128_D(D)>
+HWY_API Vec<D> PairwiseAdd128(D d, Vec<D> a, Vec<D> b) {
+  return detail::BlockwiseConcatOddEven(d, PairwiseAdd(d, a, b));
+}
+
+// Pairwise sub with output in 128 bit blocks of a and b.
+template <class D, HWY_IF_PAIRWISE_SUB_128_D(D)>
+HWY_API Vec<D> PairwiseSub128(D d, Vec<D> a, Vec<D> b) {
+  return detail::BlockwiseConcatOddEven(d, PairwiseSub(d, a, b));
+}
+
+#endif
+
 // ------------------------------ Blocks
 
 template <class D>
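Note on the new PairwiseAdd128/PairwiseSub128 ops above: as the comments state, the pairwise results stay within each 128-bit block, with the results from `a` in the lower half of the block and those from `b` in the upper half. A scalar sketch of one float32 block under that reading (illustrative only, not part of the header):

    #include <array>

    // Hypothetical scalar model of PairwiseAdd128 for a single 128-bit block
    // of float32 lanes: {a0+a1, a2+a3, b0+b1, b2+b3}.
    std::array<float, 4> PairwiseAdd128Block(const std::array<float, 4>& a,
                                             const std::array<float, 4>& b) {
      return {a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]};
    }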
@@ -6922,9 +7899,17 @@ HWY_API V BitShuffle(V v, VI idx) {
       static_cast<uint64_t>(0x0102040810204080u);
 #endif
 
+  const auto k7 = Set(du8, uint8_t{0x07});
+
+  auto unmasked_byte_idx = BitCast(du8, ShiftRight<3>(BitCast(d_idx_shr, idx)));
+#if HWY_IS_BIG_ENDIAN
+  // Need to invert the lower 3 bits of unmasked_byte_idx[i] on big-endian
+  // targets
+  unmasked_byte_idx = Xor(unmasked_byte_idx, k7);
+#endif  // HWY_IS_BIG_ENDIAN
+
   const auto byte_idx = BitwiseIfThenElse(
-      Set(du8, uint8_t{0x07}),
-      BitCast(du8, ShiftRight<3>(BitCast(d_idx_shr, idx))),
+      k7, unmasked_byte_idx,
       BitCast(du8, Dup128VecFromValues(du64, uint64_t{0},
                                        uint64_t{0x0808080808080808u})));
   // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
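The big-endian branch added above XORs the byte index with 7 because the byte-table lookup that follows indexes bytes by their position in storage: byte k of a uint64_t (counting from the least-significant byte) sits at offset 7 - k on a big-endian target, and for k in 0..7 that is exactly k ^ 7. A standalone scalar illustration of that identity (not part of the header):

    #include <cstdint>
    #include <cstring>

    // Reads byte k (0 = least significant) of x via its stored representation,
    // mirroring what a byte shuffle on the register contents observes.
    inline uint8_t ByteFromStorage(uint64_t x, unsigned k) {
      uint8_t bytes[8];
      std::memcpy(bytes, &x, 8);
    #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
      return bytes[k ^ 7];  // big-endian stores the LSB last, so invert the index
    #else
      return bytes[k];      // little-endian stores the LSB first
    #endif
    }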
@@ -6942,6 +7927,195 @@ HWY_API V BitShuffle(V v, VI idx) {
 
 #endif  // HWY_NATIVE_BITSHUFFLE
 
+template <class V, class M>
+HWY_API V MaskedOr(M m, V a, V b) {
+  return IfThenElseZero(m, Or(a, b));
+}
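MaskedOr follows the existing Masked* pattern: lanes where `m` is false become zero, the remaining lanes receive Or(a, b). A minimal usage sketch (assuming the usual `hn` alias for hwy::HWY_NAMESPACE; the values are made up):

    namespace hn = hwy::HWY_NAMESPACE;
    const hn::ScalableTag<uint32_t> d;
    const auto a = hn::Set(d, 0x5u);
    const auto b = hn::Set(d, 0x2u);
    const auto m = hn::FirstN(d, 2);       // select only the first two lanes
    const auto r = hn::MaskedOr(m, a, b);  // lanes 0..1 = 0x7, all other lanes = 0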
+// ------------------------------ AllBits1/AllBits0
+#if (defined(HWY_NATIVE_ALLONES) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ALLONES
+#undef HWY_NATIVE_ALLONES
+#else
+#define HWY_NATIVE_ALLONES
+#endif
+
+template <class D, class V = VFromD<D>>
+HWY_API bool AllBits1(D d, V v) {
+  const RebindToUnsigned<decltype(d)> du;
+  using TU = TFromD<decltype(du)>;
+  return AllTrue(du, Eq(BitCast(du, v), Set(du, hwy::HighestValue<TU>())));
+}
+#endif  // HWY_NATIVE_ALLONES
+
+#if (defined(HWY_NATIVE_ALLZEROS) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ALLZEROS
+#undef HWY_NATIVE_ALLZEROS
+#else
+#define HWY_NATIVE_ALLZEROS
+#endif
+
+template <class D, class V = VFromD<D>>
+HWY_API bool AllBits0(D d, V v) {
+  return AllTrue(d, Eq(v, Zero(d)));
+}
+#endif  // HWY_NATIVE_ALLZEROS
+
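AllBits1 and AllBits0 report whether every bit of the vector is set or clear, independent of lane type (AllBits1 compares against the all-ones value of the unsigned lane type). A small usage sketch (again with the hypothetical `hn` alias):

    namespace hn = hwy::HWY_NAMESPACE;
    const hn::ScalableTag<uint8_t> d;
    const bool all_set = hn::AllBits1(d, hn::Set(d, uint8_t{0xFF}));  // true
    const bool all_clear = hn::AllBits0(d, hn::Zero(d));              // true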
+// ------------------------------ MultiRotateRight
+#if (defined(HWY_NATIVE_MULTIROTATERIGHT) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MULTIROTATERIGHT
+#undef HWY_NATIVE_MULTIROTATERIGHT
+#else
+#define HWY_NATIVE_MULTIROTATERIGHT
+#endif
+
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
+          HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2)),
+          HWY_IF_V_SIZE_V(V, 8)>
+HWY_API V MultiRotateRight(V v, VI idx) {
+  const DFromV<V> d64;
+  const Twice<decltype(d64)> dt64;
+  const Repartition<uint8_t, decltype(d64)> du8;
+  const Repartition<uint8_t, decltype(dt64)> dt_u8;
+  const Repartition<uint16_t, decltype(dt64)> dt_u16;
+  const auto k7 = Set(du8, uint8_t{0x07});
+  const auto k63 = Set(du8, uint8_t{0x3F});
+
+  const auto masked_idx = And(k63, BitCast(du8, idx));
+
+  auto byte_idx = ShiftRight<3>(masked_idx);
+#if HWY_IS_LITTLE_ENDIAN
+  const auto hi_byte_idx = Add(byte_idx, Set(du8, uint8_t{1}));
+#else
+  byte_idx = Xor(byte_idx, k7);
+  const auto hi_byte_idx = Add(byte_idx, k7);
+#endif
+
+  const auto idx_shift = And(k7, masked_idx);
+
+  // Calculate even lanes
+  const auto even_src = DupEven(ResizeBitCast(dt64, v));
+  // Expand indexes to pull out 16 bit segments of idx and idx + 1
+#if HWY_IS_LITTLE_ENDIAN
+  const auto even_idx = InterleaveLower(ResizeBitCast(dt_u8, byte_idx),
+                                        ResizeBitCast(dt_u8, hi_byte_idx));
+#else
+  const auto even_idx = InterleaveLower(ResizeBitCast(dt_u8, hi_byte_idx),
+                                        ResizeBitCast(dt_u8, byte_idx));
+#endif
+  // TableLookupBytes indexes select from within a 16 byte block
+  const auto even_segments = TableLookupBytes(even_src, even_idx);
+  // Extract unaligned bytes from 16 bit segments
+  const auto even_idx_shift = PromoteTo(dt_u16, idx_shift);
+  const auto extracted_even_bytes =
+      Shr(BitCast(dt_u16, even_segments), even_idx_shift);
+
+  // Extract the even bytes of each 128 bit block and pack into lower 64 bits
+#if HWY_IS_LITTLE_ENDIAN
+  const auto even_lanes = BitCast(
+      dt64,
+      ConcatEven(dt_u8, Zero(dt_u8), BitCast(dt_u8, extracted_even_bytes)));
+#else
+  const auto even_lanes = BitCast(
+      dt64,
+      ConcatOdd(dt_u8, Zero(dt_u8), BitCast(dt_u8, extracted_even_bytes)));
+#endif
+
+  return LowerHalf(d64, even_lanes);
+}
+
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
+          HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2)),
+          HWY_IF_V_SIZE_GT_V(V, 8)>
+HWY_API V MultiRotateRight(V v, VI idx) {
+  const DFromV<V> d64;
+  const Repartition<uint8_t, decltype(d64)> du8;
+  const Repartition<uint16_t, decltype(d64)> du16;
+  const auto k7 = Set(du8, uint8_t{0x07});
+  const auto k63 = Set(du8, uint8_t{0x3F});
+
+  const auto masked_idx = And(k63, BitCast(du8, idx));
+
+  auto byte_idx = ShiftRight<3>(masked_idx);
+#if HWY_IS_LITTLE_ENDIAN
+  const auto hi_byte_idx = Add(byte_idx, Set(du8, uint8_t{1}));
+#else
+  byte_idx = Xor(byte_idx, k7);
+  const auto hi_byte_idx = Add(byte_idx, k7);
+#endif
+
+  const auto idx_shift = And(k7, masked_idx);
+
+  // Calculate even lanes
+  const auto even_src = DupEven(v);
+  // Expand indexes to pull out 16 bit segments of idx and idx + 1
+#if HWY_IS_LITTLE_ENDIAN
+  const auto even_idx = InterleaveLower(byte_idx, hi_byte_idx);
+#else
+  const auto even_idx = InterleaveLower(hi_byte_idx, byte_idx);
+#endif
+  // TableLookupBytes indexes select from within a 16 byte block
+  const auto even_segments = TableLookupBytes(even_src, even_idx);
+  // Extract unaligned bytes from 16 bit segments
+#if HWY_IS_LITTLE_ENDIAN
+  const auto even_idx_shift = ZipLower(idx_shift, Zero(du8));
+#else
+  const auto even_idx_shift = ZipLower(Zero(du8), idx_shift);
+#endif
+  const auto extracted_even_bytes =
+      Shr(BitCast(du16, even_segments), even_idx_shift);
+
+  // Calculate odd lanes
+  const auto odd_src = DupOdd(v);
+  // Expand indexes to pull out 16 bit segments of idx and idx + 1
+#if HWY_IS_LITTLE_ENDIAN
+  const auto odd_idx = InterleaveUpper(du8, byte_idx, hi_byte_idx);
+#else
+  const auto odd_idx = InterleaveUpper(du8, hi_byte_idx, byte_idx);
+#endif
+  // TableLookupBytes indexes select from within a 16 byte block
+  const auto odd_segments = TableLookupBytes(odd_src, odd_idx);
+  // Extract unaligned bytes from 16 bit segments
+#if HWY_IS_LITTLE_ENDIAN
+  const auto odd_idx_shift = ZipUpper(du16, idx_shift, Zero(du8));
+#else
+  const auto odd_idx_shift = ZipUpper(du16, Zero(du8), idx_shift);
+#endif
+  const auto extracted_odd_bytes =
+      Shr(BitCast(du16, odd_segments), odd_idx_shift);
+
+  // Extract the even bytes of each 128 bit block and pack into lower 64 bits
+#if HWY_IS_LITTLE_ENDIAN
+  const auto even_lanes = BitCast(
+      d64, ConcatEven(du8, Zero(du8), BitCast(du8, extracted_even_bytes)));
+  const auto odd_lanes = BitCast(
+      d64, ConcatEven(du8, Zero(du8), BitCast(du8, extracted_odd_bytes)));
+#else
+  const auto even_lanes = BitCast(
+      d64, ConcatOdd(du8, Zero(du8), BitCast(du8, extracted_even_bytes)));
+  const auto odd_lanes = BitCast(
+      d64, ConcatOdd(du8, Zero(du8), BitCast(du8, extracted_odd_bytes)));
+#endif
+  // Interleave at 64 bit level
+  return InterleaveWholeLower(even_lanes, odd_lanes);
+}
+
+#if HWY_TARGET == HWY_RVV
+
+// MultiRotateRight for LMUL=1/2 case on RVV
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
+          HWY_IF_POW2_LE_D(DFromV<V>, 0),
+          HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2) / 2)>
+HWY_API V MultiRotateRight(V v, VI idx) {
+  return MultiRotateRight(v, ResizeBitCast(Twice<DFromV<VI>>(), idx));
+}
+
+#endif
+
+#endif
+
 // ================================================== Operator wrapper
 
 // SVE* and RVV currently cannot define operators and have already defined
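As we read the generic implementation above, MultiRotateRight(v, idx) fills byte i of each 64-bit lane with the eight bits of that lane starting at bit offset idx[8*lane + i] & 63, wrapping around within the lane (the DupEven/DupOdd copies are what let a byte index of 8 wrap back to byte 0 of the same value). A scalar sketch of one lane under that reading, not taken from the header:

    #include <cstdint>

    // Scalar model for one 64-bit lane: output byte i is the low byte of the
    // lane rotated right by idx[i] & 63 (an unaligned byte extract with wrap).
    inline uint64_t MultiRotateRightLane(uint64_t lane, const uint8_t idx[8]) {
      uint64_t out = 0;
      for (int i = 0; i < 8; ++i) {
        const unsigned r = idx[i] & 63u;
        const uint64_t rotated = (lane >> r) | (lane << ((64u - r) & 63u));
        out |= (rotated & 0xFFu) << (8 * i);
      }
      return out;
    }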
@@ -7013,6 +8187,20 @@ HWY_API auto Le(V a, V b) -> decltype(a == b) {
 
 #endif  // HWY_NATIVE_OPERATOR_REPLACEMENTS
 
+#undef HWY_GENERIC_IF_EMULATED_D
+
+// TODO: remove once callers are updated.
+// SVE and RVV do not support DFromM because their masks are loosely typed.
+#if HWY_MAX_BYTES <= 64 && !HWY_TARGET_IS_SVE && HWY_TARGET != HWY_RVV
+namespace detail {
+template <class M>
+uint64_t BitsFromMask(M m) {
+  const DFromM<M> d;
+  return ::hwy::HWY_NAMESPACE::BitsFromMask(d, m);
+}
+}  // namespace detail
+#endif  // !HWY_HAVE_SCALABLE && HWY_MAX_BYTES <= 64
+
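The detail::BitsFromMask shim above only recovers the tag via DFromM and forwards to the public BitsFromMask(d, m), so older detail:: callers keep compiling; SVE and RVV are excluded because DFromM is unavailable for their loosely typed masks. Spelled out, a call through it is equivalent to (illustrative, with the hypothetical `hn` alias):

    namespace hn = hwy::HWY_NAMESPACE;
    const hn::ScalableTag<uint32_t> d;
    const auto m = hn::FirstN(d, 3);
    const uint64_t bits = hn::BitsFromMask(d, m);  // what detail::BitsFromMask(m) forwards to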
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy