@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/include/ffi.h +3 -3
  2. package/include/harfbuzz/hb-deprecated.h +4 -4
  3. package/include/harfbuzz/hb-font.h +120 -9
  4. package/include/harfbuzz/hb-version.h +3 -3
  5. package/include/hwy/abort.h +2 -19
  6. package/include/hwy/aligned_allocator.h +11 -7
  7. package/include/hwy/auto_tune.h +504 -0
  8. package/include/hwy/base.h +425 -104
  9. package/include/hwy/cache_control.h +16 -0
  10. package/include/hwy/detect_compiler_arch.h +32 -1
  11. package/include/hwy/detect_targets.h +251 -67
  12. package/include/hwy/foreach_target.h +35 -0
  13. package/include/hwy/highway.h +185 -76
  14. package/include/hwy/nanobenchmark.h +1 -19
  15. package/include/hwy/ops/arm_neon-inl.h +969 -458
  16. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  17. package/include/hwy/ops/emu128-inl.h +97 -11
  18. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  19. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  20. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  21. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  22. package/include/hwy/ops/rvv-inl.h +546 -51
  23. package/include/hwy/ops/scalar-inl.h +77 -22
  24. package/include/hwy/ops/set_macros-inl.h +138 -17
  25. package/include/hwy/ops/shared-inl.h +50 -10
  26. package/include/hwy/ops/wasm_128-inl.h +137 -92
  27. package/include/hwy/ops/x86_128-inl.h +773 -214
  28. package/include/hwy/ops/x86_256-inl.h +712 -255
  29. package/include/hwy/ops/x86_512-inl.h +429 -753
  30. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  31. package/include/hwy/per_target.h +2 -1
  32. package/include/hwy/profiler.h +622 -486
  33. package/include/hwy/targets.h +62 -20
  34. package/include/hwy/timer-inl.h +8 -160
  35. package/include/hwy/timer.h +170 -3
  36. package/include/hwy/x86_cpuid.h +81 -0
  37. package/include/libheif/heif_cxx.h +25 -5
  38. package/include/libheif/heif_regions.h +5 -5
  39. package/include/libheif/heif_version.h +2 -2
  40. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  41. package/include/libxml2/libxml/xmlversion.h +4 -4
  42. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  43. package/include/pango-1.0/pango/pango-features.h +3 -3
  44. package/include/pango-1.0/pango/pango-font.h +30 -0
  45. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  46. package/include/pixman-1/pixman-version.h +2 -2
  47. package/include/webp/decode.h +11 -2
  48. package/include/webp/demux.h +2 -0
  49. package/include/webp/encode.h +2 -0
  50. package/include/webp/mux_types.h +1 -0
  51. package/include/webp/sharpyuv/sharpyuv.h +1 -1
  52. package/include/webp/types.h +2 -2
  53. package/include/zlib.h +3 -3
  54. package/package.json +1 -1
  55. package/versions.json +11 -11
@@ -97,6 +97,21 @@ HWY_API Vec<D> Inf(D d) {
97
97
  return BitCast(d, Set(du, max_x2 >> 1));
98
98
  }
99
99
 
100
+ // ------------------------------ MaskedSetOr/MaskedSet
101
+
102
+ template <class V, typename T = TFromV<V>, typename D = DFromV<V>,
103
+ typename M = MFromD<D>>
104
+ HWY_API V MaskedSetOr(V no, M m, T a) {
105
+ D d;
106
+ return IfThenElse(m, Set(d, a), no);
107
+ }
108
+
109
+ template <class D, typename V = VFromD<D>, typename M = MFromD<D>,
110
+ typename T = TFromD<D>>
111
+ HWY_API V MaskedSet(D d, M m, T a) {
112
+ return IfThenElseZero(m, Set(d, a));
113
+ }
114
+
100
115
  // ------------------------------ ZeroExtendResizeBitCast
101
116
 
102
117
  // The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128
@@ -336,6 +351,22 @@ HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
336
351
 
337
352
  #endif // HWY_NATIVE_DEMOTE_MASK_TO
338
353
 
354
+ // ------------------------------ InsertIntoUpper
355
+ #if (defined(HWY_NATIVE_LOAD_HIGHER) == defined(HWY_TARGET_TOGGLE))
356
+ #ifdef HWY_NATIVE_LOAD_HIGHER
357
+ #undef HWY_NATIVE_LOAD_HIGHER
358
+ #else
359
+ #define HWY_NATIVE_LOAD_HIGHER
360
+ #endif
361
+ template <class D, typename T, class V = VFromD<D>(), HWY_IF_LANES_GT_D(D, 1),
362
+ HWY_IF_POW2_GT_D(D, -3)>
363
+ HWY_API V InsertIntoUpper(D d, T* p, V a) {
364
+ Half<D> dh;
365
+ const VFromD<decltype(dh)> b = LoadU(dh, p);
366
+ return Combine(d, b, LowerHalf(a));
367
+ }
368
+ #endif // HWY_NATIVE_LOAD_HIGHER
369
+
339
370
  // ------------------------------ CombineMasks
340
371
 
341
372
  #if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
@@ -488,6 +519,95 @@ HWY_API V InterleaveEven(V a, V b) {
488
519
  }
489
520
  #endif
490
521
 
522
+ // ------------------------------ MinNumber/MaxNumber
523
+
524
+ #if (defined(HWY_NATIVE_FLOAT_MIN_MAX_NUMBER) == defined(HWY_TARGET_TOGGLE))
525
+ #ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
526
+ #undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
527
+ #else
528
+ #define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
529
+ #endif
530
+
531
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
532
+ HWY_API V MinNumber(V a, V b) {
533
+ return Min(a, b);
534
+ }
535
+
536
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
537
+ HWY_API V MaxNumber(V a, V b) {
538
+ return Max(a, b);
539
+ }
540
+
541
+ #endif
542
+
543
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
544
+ HWY_API V MinNumber(V a, V b) {
545
+ return Min(a, b);
546
+ }
547
+
548
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
549
+ HWY_API V MaxNumber(V a, V b) {
550
+ return Max(a, b);
551
+ }
552
+
553
+ // ------------------------------ MinMagnitude/MaxMagnitude
554
+
555
+ #if (defined(HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE) == defined(HWY_TARGET_TOGGLE))
556
+ #ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
557
+ #undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
558
+ #else
559
+ #define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
560
+ #endif
561
+
562
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
563
+ HWY_API V MinMagnitude(V a, V b) {
564
+ const V abs_a = Abs(a);
565
+ const V abs_b = Abs(b);
566
+ const V min = Min(IfThenElse(Eq(abs_a, abs_b), a, b), b);
567
+ return IfThenElse(Lt(abs_a, abs_b), a, min);
568
+ }
569
+
570
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
571
+ HWY_API V MaxMagnitude(V a, V b) {
572
+ const V abs_a = Abs(a);
573
+ const V abs_b = Abs(b);
574
+ // This lvalue appears to be necessary to avoid a clang bug on SVE.
575
+ const V max = Max(IfThenElse(Eq(abs_a, abs_b), b, a), a);
576
+ return IfThenElse(Lt(abs_a, abs_b), b, max);
577
+ }
578
+
579
+ #endif // HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
580
+
581
+ template <class V, HWY_IF_SIGNED_V(V)>
582
+ HWY_API V MinMagnitude(V a, V b) {
583
+ const DFromV<V> d;
584
+ const RebindToUnsigned<decltype(d)> du;
585
+ const auto abs_a = BitCast(du, Abs(a));
586
+ const auto abs_b = BitCast(du, Abs(b));
587
+ return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), a,
588
+ Min(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), a, b), b));
589
+ }
590
+
591
+ template <class V, HWY_IF_SIGNED_V(V)>
592
+ HWY_API V MaxMagnitude(V a, V b) {
593
+ const DFromV<V> d;
594
+ const RebindToUnsigned<decltype(d)> du;
595
+ const auto abs_a = BitCast(du, Abs(a));
596
+ const auto abs_b = BitCast(du, Abs(b));
597
+ return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), b,
598
+ Max(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), b, a), a));
599
+ }
600
+
601
+ template <class V, HWY_IF_UNSIGNED_V(V)>
602
+ HWY_API V MinMagnitude(V a, V b) {
603
+ return Min(a, b);
604
+ }
605
+
606
+ template <class V, HWY_IF_UNSIGNED_V(V)>
607
+ HWY_API V MaxMagnitude(V a, V b) {
608
+ return Max(a, b);
609
+ }
610
+
491
611
  // ------------------------------ AddSub
492
612
 
493
613
  template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
@@ -555,12 +675,18 @@ HWY_API V MaskedMulOr(V no, M m, V a, V b) {
555
675
 
556
676
  template <class V, class M>
557
677
  HWY_API V MaskedDivOr(V no, M m, V a, V b) {
558
- return IfThenElse(m, Div(a, b), no);
678
+ const DFromV<V> d;
679
+ // Avoid division by zero for masked-out lanes.
680
+ const V nonzero = Set(d, TFromD<decltype(d)>{1});
681
+ return IfThenElse(m, Div(a, IfThenElse(m, b, nonzero)), no);
559
682
  }
560
683
 
561
684
  template <class V, class M>
562
685
  HWY_API V MaskedModOr(V no, M m, V a, V b) {
563
- return IfThenElse(m, Mod(a, b), no);
686
+ const DFromV<V> d;
687
+ // Avoid division by zero for masked-out lanes.
688
+ const V nonzero = Set(d, TFromD<decltype(d)>{1});
689
+ return IfThenElse(m, Mod(a, IfThenElse(m, b, nonzero)), no);
564
690
  }
565
691
 
566
692
  template <class V, class M>
@@ -574,6 +700,140 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
574
700
  }
575
701
  #endif // HWY_NATIVE_MASKED_ARITH
576
702
 
703
+ #if (defined(HWY_NATIVE_ZERO_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
704
+ #ifdef HWY_NATIVE_ZERO_MASKED_ARITH
705
+ #undef HWY_NATIVE_ZERO_MASKED_ARITH
706
+ #else
707
+ #define HWY_NATIVE_ZERO_MASKED_ARITH
708
+ #endif
709
+
710
+ template <class V, class M>
711
+ HWY_API V MaskedMax(M m, V a, V b) {
712
+ return IfThenElseZero(m, (Max(a, b)));
713
+ }
714
+
715
+ template <class V, class M>
716
+ HWY_API V MaskedAdd(M m, V a, V b) {
717
+ return IfThenElseZero(m, Add(a, b));
718
+ }
719
+
720
+ template <class V, class M>
721
+ HWY_API V MaskedSub(M m, V a, V b) {
722
+ return IfThenElseZero(m, Sub(a, b));
723
+ }
724
+
725
+ template <class V, class M>
726
+ HWY_API V MaskedMul(M m, V a, V b) {
727
+ return IfThenElseZero(m, Mul(a, b));
728
+ }
729
+
730
+ template <class V, class M>
731
+ HWY_API V MaskedDiv(M m, V a, V b) {
732
+ return IfThenElseZero(m, Div(a, b));
733
+ }
734
+
735
+ template <class V, class M>
736
+ HWY_API V MaskedSaturatedAdd(M m, V a, V b) {
737
+ return IfThenElseZero(m, SaturatedAdd(a, b));
738
+ }
739
+
740
+ template <class V, class M>
741
+ HWY_API V MaskedSaturatedSub(M m, V a, V b) {
742
+ return IfThenElseZero(m, SaturatedSub(a, b));
743
+ }
744
+
745
+ template <class V, class M, typename D = DFromV<V>, HWY_IF_I16_D(D)>
746
+ HWY_API V MaskedMulFixedPoint15(M m, V a, V b) {
747
+ return IfThenElseZero(m, MulFixedPoint15(a, b));
748
+ }
749
+
750
+ template <class V, class M>
751
+ HWY_API V MaskedMulAdd(M m, V mul, V x, V add) {
752
+ return IfThenElseZero(m, MulAdd(mul, x, add));
753
+ }
754
+
755
+ template <class V, class M>
756
+ HWY_API V MaskedNegMulAdd(M m, V mul, V x, V add) {
757
+ return IfThenElseZero(m, NegMulAdd(mul, x, add));
758
+ }
759
+
760
+ template <class D, class M, HWY_IF_UI32_D(D),
761
+ class V16 = VFromD<RepartitionToNarrow<D>>>
762
+ HWY_API VFromD<D> MaskedWidenMulPairwiseAdd(D d32, M m, V16 a, V16 b) {
763
+ return IfThenElseZero(m, WidenMulPairwiseAdd(d32, a, b));
764
+ }
765
+
766
+ template <class DF, class M, HWY_IF_F32_D(DF), class VBF>
767
+ HWY_API VFromD<DF> MaskedWidenMulPairwiseAdd(DF df, M m, VBF a, VBF b) {
768
+ return IfThenElseZero(m, WidenMulPairwiseAdd(df, a, b));
769
+ }
770
+ #endif // HWY_NATIVE_ZERO_MASKED_ARITH
771
+
772
+ // ------------------------------ MaskedShift
773
+ template <int kShift, class V, class M>
774
+ HWY_API V MaskedShiftLeft(M m, V a) {
775
+ return IfThenElseZero(m, ShiftLeft<kShift>(a));
776
+ }
777
+
778
+ template <int kShift, class V, class M>
779
+ HWY_API V MaskedShiftRight(M m, V a) {
780
+ return IfThenElseZero(m, ShiftRight<kShift>(a));
781
+ }
782
+
783
+ template <int kShift, class V, class M>
784
+ HWY_API V MaskedShiftRightOr(V no, M m, V a) {
785
+ return IfThenElse(m, ShiftRight<kShift>(a), no);
786
+ }
787
+
788
+ template <class V, class M>
789
+ HWY_API V MaskedShrOr(V no, M m, V a, V shifts) {
790
+ return IfThenElse(m, Shr(a, shifts), no);
791
+ }
792
+
793
+ // ------------------------------ MaskedEq etc.
794
+ #if (defined(HWY_NATIVE_MASKED_COMP) == defined(HWY_TARGET_TOGGLE))
795
+ #ifdef HWY_NATIVE_MASKED_COMP
796
+ #undef HWY_NATIVE_MASKED_COMP
797
+ #else
798
+ #define HWY_NATIVE_MASKED_COMP
799
+ #endif
800
+
801
+ template <class V, class M>
802
+ HWY_API auto MaskedEq(M m, V a, V b) -> decltype(a == b) {
803
+ return And(m, Eq(a, b));
804
+ }
805
+
806
+ template <class V, class M>
807
+ HWY_API auto MaskedNe(M m, V a, V b) -> decltype(a == b) {
808
+ return And(m, Ne(a, b));
809
+ }
810
+
811
+ template <class V, class M>
812
+ HWY_API auto MaskedLt(M m, V a, V b) -> decltype(a == b) {
813
+ return And(m, Lt(a, b));
814
+ }
815
+
816
+ template <class V, class M>
817
+ HWY_API auto MaskedGt(M m, V a, V b) -> decltype(a == b) {
818
+ return And(m, Gt(a, b));
819
+ }
820
+
821
+ template <class V, class M>
822
+ HWY_API auto MaskedLe(M m, V a, V b) -> decltype(a == b) {
823
+ return And(m, Le(a, b));
824
+ }
825
+
826
+ template <class V, class M>
827
+ HWY_API auto MaskedGe(M m, V a, V b) -> decltype(a == b) {
828
+ return And(m, Ge(a, b));
829
+ }
830
+
831
+ template <class V, class M, class D = DFromV<V>>
832
+ HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
833
+ return And(m, IsNaN(v));
834
+ }
835
+ #endif // HWY_NATIVE_MASKED_COMP
836
+
577
837
  // ------------------------------ IfNegativeThenNegOrUndefIfZero
578
838
 
579
839
  #if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
@@ -672,6 +932,18 @@ HWY_API V SaturatedAbs(V v) {
672
932
 
673
933
  #endif
674
934
 
935
+ // ------------------------------ MaskedAbsOr
936
+ template <class V, HWY_IF_SIGNED_V(V), class M>
937
+ HWY_API V MaskedAbsOr(V no, M m, V v) {
938
+ return IfThenElse(m, Abs(v), no);
939
+ }
940
+
941
+ // ------------------------------ MaskedAbs
942
+ template <class V, HWY_IF_SIGNED_V(V), class M>
943
+ HWY_API V MaskedAbs(M m, V v) {
944
+ return IfThenElseZero(m, Abs(v));
945
+ }
946
+
675
947
  // ------------------------------ Reductions
676
948
 
677
949
  // Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled,
@@ -855,6 +1127,7 @@ HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
855
1127
  #else
856
1128
  #define HWY_NATIVE_REDUCE_SUM_4_UI8
857
1129
  #endif
1130
+
858
1131
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
859
1132
  HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
860
1133
  const Twice<RepartitionToWide<decltype(d)>> dw;
@@ -882,6 +1155,30 @@ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
882
1155
  }
883
1156
  #endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8
884
1157
 
1158
+ #if (defined(HWY_NATIVE_MASKED_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
1159
+ #ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR
1160
+ #undef HWY_NATIVE_MASKED_REDUCE_SCALAR
1161
+ #else
1162
+ #define HWY_NATIVE_MASKED_REDUCE_SCALAR
1163
+ #endif
1164
+
1165
+ template <class D, class M>
1166
+ HWY_API TFromD<D> MaskedReduceSum(D d, M m, VFromD<D> v) {
1167
+ return ReduceSum(d, IfThenElseZero(m, v));
1168
+ }
1169
+ template <class D, class M>
1170
+ HWY_API TFromD<D> MaskedReduceMin(D d, M m, VFromD<D> v) {
1171
+ return ReduceMin(
1172
+ d, IfThenElse(m, v, Set(d, hwy::PositiveInfOrHighestValue<TFromD<D>>())));
1173
+ }
1174
+ template <class D, class M>
1175
+ HWY_API TFromD<D> MaskedReduceMax(D d, M m, VFromD<D> v) {
1176
+ return ReduceMax(
1177
+ d, IfThenElse(m, v, Set(d, hwy::NegativeInfOrLowestValue<TFromD<D>>())));
1178
+ }
1179
+
1180
+ #endif // HWY_NATIVE_MASKED_REDUCE_SCALAR
1181
+
885
1182
  // ------------------------------ IsEitherNaN
886
1183
  #if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
887
1184
  #ifdef HWY_NATIVE_IS_EITHER_NAN
@@ -946,6 +1243,225 @@ HWY_API MFromD<D> IsFinite(const V v) {
946
1243
 
947
1244
  #endif // HWY_NATIVE_ISINF
948
1245
 
1246
+ // ------------------------------ CeilInt/FloorInt
1247
+ #if (defined(HWY_NATIVE_CEIL_FLOOR_INT) == defined(HWY_TARGET_TOGGLE))
1248
+ #ifdef HWY_NATIVE_CEIL_FLOOR_INT
1249
+ #undef HWY_NATIVE_CEIL_FLOOR_INT
1250
+ #else
1251
+ #define HWY_NATIVE_CEIL_FLOOR_INT
1252
+ #endif
1253
+
1254
+ template <class V, HWY_IF_FLOAT_V(V)>
1255
+ HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
1256
+ const DFromV<decltype(v)> d;
1257
+ const RebindToSigned<decltype(d)> di;
1258
+ return ConvertTo(di, Ceil(v));
1259
+ }
1260
+
1261
+ template <class V, HWY_IF_FLOAT_V(V)>
1262
+ HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
1263
+ const DFromV<decltype(v)> d;
1264
+ const RebindToSigned<decltype(d)> di;
1265
+ return ConvertTo(di, Floor(v));
1266
+ }
1267
+
1268
+ #endif // HWY_NATIVE_CEIL_FLOOR_INT
1269
+
1270
+ // ------------------------------ MulByPow2/MulByFloorPow2
1271
+
1272
+ #if (defined(HWY_NATIVE_MUL_BY_POW2) == defined(HWY_TARGET_TOGGLE))
1273
+ #ifdef HWY_NATIVE_MUL_BY_POW2
1274
+ #undef HWY_NATIVE_MUL_BY_POW2
1275
+ #else
1276
+ #define HWY_NATIVE_MUL_BY_POW2
1277
+ #endif
1278
+
1279
+ template <class V, HWY_IF_FLOAT_V(V)>
1280
+ HWY_API V MulByPow2(V v, VFromD<RebindToSigned<DFromV<V>>> exp) {
1281
+ const DFromV<decltype(v)> df;
1282
+ const RebindToUnsigned<decltype(df)> du;
1283
+ const RebindToSigned<decltype(df)> di;
1284
+
1285
+ using TF = TFromD<decltype(df)>;
1286
+ using TI = TFromD<decltype(di)>;
1287
+ using TU = TFromD<decltype(du)>;
1288
+
1289
+ using VF = VFromD<decltype(df)>;
1290
+ using VI = VFromD<decltype(di)>;
1291
+
1292
+ constexpr TI kMaxBiasedExp = MaxExponentField<TF>();
1293
+ static_assert(kMaxBiasedExp > 0, "kMaxBiasedExp > 0 must be true");
1294
+
1295
+ constexpr TI kExpBias = static_cast<TI>(kMaxBiasedExp >> 1);
1296
+ static_assert(kExpBias > 0, "kExpBias > 0 must be true");
1297
+ static_assert(kExpBias <= LimitsMax<TI>() / 3,
1298
+ "kExpBias <= LimitsMax<TI>() / 3 must be true");
1299
+
1300
+ #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
1301
+ using TExpMinMax = If<(sizeof(TI) <= 4), TI, int32_t>;
1302
+ #elif (HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2) || \
1303
+ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256
1304
+ using TExpMinMax = int16_t;
1305
+ #else
1306
+ using TExpMinMax = TI;
1307
+ #endif
1308
+
1309
+ #if HWY_TARGET == HWY_EMU128 || HWY_TARGET == HWY_SCALAR
1310
+ using TExpSatSub = TU;
1311
+ #elif HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
1312
+ HWY_TARGET == HWY_WASM_EMU256
1313
+ using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, uint16_t>;
1314
+ #elif HWY_TARGET_IS_PPC
1315
+ using TExpSatSub = If<(sizeof(TF) >= 4), uint32_t, TU>;
1316
+ #else
1317
+ using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, TU>;
1318
+ #endif
1319
+
1320
+ static_assert(kExpBias <= static_cast<TI>(LimitsMax<TExpMinMax>() / 3),
1321
+ "kExpBias <= LimitsMax<TExpMinMax>() / 3 must be true");
1322
+
1323
+ const Repartition<TExpMinMax, decltype(df)> d_exp_min_max;
1324
+ const Repartition<TExpSatSub, decltype(df)> d_sat_exp_sub;
1325
+
1326
+ constexpr int kNumOfExpBits = ExponentBits<TF>();
1327
+ constexpr int kNumOfMantBits = MantissaBits<TF>();
1328
+
1329
+ // The sign bit of BitCastScalar<TU>(a[i]) >> kNumOfMantBits can be zeroed out
1330
+ // using SaturatedSub if kZeroOutSignUsingSatSub is true.
1331
+
1332
+ // If kZeroOutSignUsingSatSub is true, then val_for_exp_sub will be bitcasted
1333
+ // to a vector that has a smaller lane size than TU for the SaturatedSub
1334
+ // operation below.
1335
+ constexpr bool kZeroOutSignUsingSatSub =
1336
+ ((sizeof(TExpSatSub) * 8) == static_cast<size_t>(kNumOfExpBits));
1337
+
1338
+ // If kZeroOutSignUsingSatSub is true, then the upper
1339
+ // (sizeof(TU) - sizeof(TExpSatSub)) * 8 bits of kExpDecrBy1Bits will be all
1340
+ // ones and the lower sizeof(TExpSatSub) * 8 bits of kExpDecrBy1Bits will be
1341
+ // equal to 1.
1342
+
1343
+ // Otherwise, if kZeroOutSignUsingSatSub is false, kExpDecrBy1Bits will be
1344
+ // equal to 1.
1345
+ constexpr TU kExpDecrBy1Bits = static_cast<TU>(
1346
+ TU{1} - (static_cast<TU>(kZeroOutSignUsingSatSub) << kNumOfExpBits));
1347
+
1348
+ VF val_for_exp_sub = v;
1349
+ HWY_IF_CONSTEXPR(!kZeroOutSignUsingSatSub) {
1350
+ // If kZeroOutSignUsingSatSub is not true, zero out the sign bit of
1351
+ // val_for_exp_sub[i] using Abs
1352
+ val_for_exp_sub = Abs(val_for_exp_sub);
1353
+ }
1354
+
1355
+ // min_exp1_plus_min_exp2[i] is the smallest exponent such that
1356
+ // min_exp1_plus_min_exp2[i] >= 2 - kExpBias * 2 and
1357
+ // std::ldexp(v[i], min_exp1_plus_min_exp2[i]) is a normal floating-point
1358
+ // number if v[i] is a normal number
1359
+ const VI min_exp1_plus_min_exp2 = BitCast(
1360
+ di,
1361
+ Max(BitCast(
1362
+ d_exp_min_max,
1363
+ Neg(BitCast(
1364
+ di,
1365
+ SaturatedSub(
1366
+ BitCast(d_sat_exp_sub, ShiftRight<kNumOfMantBits>(
1367
+ BitCast(du, val_for_exp_sub))),
1368
+ BitCast(d_sat_exp_sub, Set(du, kExpDecrBy1Bits)))))),
1369
+ BitCast(d_exp_min_max,
1370
+ Set(di, static_cast<TI>(2 - kExpBias - kExpBias)))));
1371
+
1372
+ const VI clamped_exp =
1373
+ Max(Min(exp, Set(di, static_cast<TI>(kExpBias * 3))),
1374
+ Add(min_exp1_plus_min_exp2, Set(di, static_cast<TI>(1 - kExpBias))));
1375
+
1376
+ const VI exp1_plus_exp2 = BitCast(
1377
+ di, Max(Min(BitCast(d_exp_min_max,
1378
+ Sub(clamped_exp, ShiftRight<2>(clamped_exp))),
1379
+ BitCast(d_exp_min_max,
1380
+ Set(di, static_cast<TI>(kExpBias + kExpBias)))),
1381
+ BitCast(d_exp_min_max, min_exp1_plus_min_exp2)));
1382
+
1383
+ const VI exp1 = ShiftRight<1>(exp1_plus_exp2);
1384
+ const VI exp2 = Sub(exp1_plus_exp2, exp1);
1385
+ const VI exp3 = Sub(clamped_exp, exp1_plus_exp2);
1386
+
1387
+ const VI exp_bias = Set(di, kExpBias);
1388
+
1389
+ const VF factor1 =
1390
+ BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp1, exp_bias)));
1391
+ const VF factor2 =
1392
+ BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp2, exp_bias)));
1393
+ const VF factor3 =
1394
+ BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp3, exp_bias)));
1395
+
1396
+ return Mul(Mul(Mul(v, factor1), factor2), factor3);
1397
+ }
1398
+
1399
+ template <class V, HWY_IF_FLOAT_V(V)>
1400
+ HWY_API V MulByFloorPow2(V v, V exp) {
1401
+ const DFromV<decltype(v)> df;
1402
+
1403
+ // MulByFloorPow2 special cases:
1404
+ // MulByFloorPow2(v, NaN) => NaN
1405
+ // MulByFloorPow2(0, inf) => NaN
1406
+ // MulByFloorPow2(inf, -inf) => NaN
1407
+ // MulByFloorPow2(-inf, -inf) => NaN
1408
+ const auto is_special_case_with_nan_result =
1409
+ Or(IsNaN(exp),
1410
+ And(Eq(Abs(v), IfNegativeThenElseZero(exp, Inf(df))), IsInf(exp)));
1411
+
1412
+ return IfThenElse(is_special_case_with_nan_result, NaN(df),
1413
+ MulByPow2(v, FloorInt(exp)));
1414
+ }
1415
+
1416
+ #endif // HWY_NATIVE_MUL_BY_POW2
1417
+
1418
+ // ------------------------------ GetBiasedExponent
1419
+ #if (defined(HWY_NATIVE_GET_BIASED_EXPONENT) == defined(HWY_TARGET_TOGGLE))
1420
+ #ifdef HWY_NATIVE_GET_BIASED_EXPONENT
1421
+ #undef HWY_NATIVE_GET_BIASED_EXPONENT
1422
+ #else
1423
+ #define HWY_NATIVE_GET_BIASED_EXPONENT
1424
+ #endif
1425
+
1426
+ template <class V, HWY_IF_FLOAT_V(V)>
1427
+ HWY_API VFromD<RebindToUnsigned<DFromV<V>>> GetBiasedExponent(V v) {
1428
+ using T = TFromV<V>;
1429
+
1430
+ const DFromV<V> d;
1431
+ const RebindToUnsigned<decltype(d)> du;
1432
+
1433
+ constexpr int kNumOfMantBits = MantissaBits<T>();
1434
+ return ShiftRight<kNumOfMantBits>(BitCast(du, Abs(v)));
1435
+ }
1436
+
1437
+ #endif
1438
+
1439
+ // ------------------------------ GetExponent
1440
+
1441
+ #if (defined(HWY_NATIVE_GET_EXPONENT) == defined(HWY_TARGET_TOGGLE))
1442
+ #ifdef HWY_NATIVE_GET_EXPONENT
1443
+ #undef HWY_NATIVE_GET_EXPONENT
1444
+ #else
1445
+ #define HWY_NATIVE_GET_EXPONENT
1446
+ #endif
1447
+
1448
+ template <class V, HWY_IF_FLOAT_V(V)>
1449
+ HWY_API V GetExponent(V v) {
1450
+ const DFromV<V> d;
1451
+ using T = TFromV<V>;
1452
+ const RebindToSigned<decltype(d)> di;
1453
+
1454
+ const auto exponent_offset = Set(di, MaxExponentField<T>() >> 1);
1455
+
1456
+ // extract exponent bits as integer
1457
+ const auto encoded_exponent = GetBiasedExponent(v);
1458
+ const auto exponent_int = Sub(BitCast(di, encoded_exponent), exponent_offset);
1459
+
1460
+ // convert integer to original type
1461
+ return ConvertTo(d, exponent_int);
1462
+ }
1463
+
1464
+ #endif // HWY_NATIVE_GET_EXPONENT
949
1465
  // ------------------------------ LoadInterleaved2
950
1466
 
951
1467
  #if HWY_IDE || \
@@ -1819,6 +2335,110 @@ HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
1819
2335
 
1820
2336
  #endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED
1821
2337
 
2338
+ // ------------------------------ PairwiseAdd/PairwiseSub
2339
+ #if (defined(HWY_NATIVE_PAIRWISE_ADD) == defined(HWY_TARGET_TOGGLE))
2340
+ #ifdef HWY_NATIVE_PAIRWISE_ADD
2341
+ #undef HWY_NATIVE_PAIRWISE_ADD
2342
+ #else
2343
+ #define HWY_NATIVE_PAIRWISE_ADD
2344
+ #endif
2345
+
2346
+ template <class D, class V = VFromD<D>(), HWY_IF_LANES_GT_D(D, 1)>
2347
+ HWY_API V PairwiseAdd(D d, V a, V b) {
2348
+ return Add(InterleaveEven(d, a, b), InterleaveOdd(d, a, b));
2349
+ }
2350
+
2351
+ #endif
2352
+
2353
+ #if (defined(HWY_NATIVE_PAIRWISE_SUB) == defined(HWY_TARGET_TOGGLE))
2354
+ #ifdef HWY_NATIVE_PAIRWISE_SUB
2355
+ #undef HWY_NATIVE_PAIRWISE_SUB
2356
+ #else
2357
+ #define HWY_NATIVE_PAIRWISE_SUB
2358
+ #endif
2359
+
2360
+ template <class D, class V = VFromD<D>(), HWY_IF_LANES_GT_D(D, 1)>
2361
+ HWY_API V PairwiseSub(D d, V a, V b) {
2362
+ return Sub(InterleaveOdd(d, a, b), InterleaveEven(d, a, b));
2363
+ }
2364
+
2365
+ #endif
2366
+
2367
+ // Load/StoreInterleaved for special floats. Requires HWY_GENERIC_IF_EMULATED_D
2368
+ // is defined such that it is true only for types that actually require these
2369
+ // generic implementations.
2370
+ #if HWY_IDE || (defined(HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED) == \
2371
+ defined(HWY_TARGET_TOGGLE) && \
2372
+ defined(HWY_GENERIC_IF_EMULATED_D))
2373
+ #ifdef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
2374
+ #undef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
2375
+ #else
2376
+ #define HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
2377
+ #endif
2378
+ #if HWY_IDE
2379
+ #define HWY_GENERIC_IF_EMULATED_D(D) int
2380
+ #endif
2381
+
2382
+ template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
2383
+ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
2384
+ VFromD<D>& v0, VFromD<D>& v1) {
2385
+ const RebindToUnsigned<decltype(d)> du;
2386
+ VFromD<decltype(du)> vu0, vu1;
2387
+ LoadInterleaved2(du, detail::U16LanePointer(unaligned), vu0, vu1);
2388
+ v0 = BitCast(d, vu0);
2389
+ v1 = BitCast(d, vu1);
2390
+ }
2391
+
2392
+ template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
2393
+ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
2394
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
2395
+ const RebindToUnsigned<decltype(d)> du;
2396
+ VFromD<decltype(du)> vu0, vu1, vu2;
2397
+ LoadInterleaved3(du, detail::U16LanePointer(unaligned), vu0, vu1, vu2);
2398
+ v0 = BitCast(d, vu0);
2399
+ v1 = BitCast(d, vu1);
2400
+ v2 = BitCast(d, vu2);
2401
+ }
2402
+
2403
+ template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
2404
+ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
2405
+ VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
2406
+ VFromD<D>& v3) {
2407
+ const RebindToUnsigned<decltype(d)> du;
2408
+ VFromD<decltype(du)> vu0, vu1, vu2, vu3;
2409
+ LoadInterleaved4(du, detail::U16LanePointer(unaligned), vu0, vu1, vu2, vu3);
2410
+ v0 = BitCast(d, vu0);
2411
+ v1 = BitCast(d, vu1);
2412
+ v2 = BitCast(d, vu2);
2413
+ v3 = BitCast(d, vu3);
2414
+ }
2415
+
2416
+ template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
2417
+ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
2418
+ T* HWY_RESTRICT unaligned) {
2419
+ const RebindToUnsigned<decltype(d)> du;
2420
+ StoreInterleaved2(BitCast(du, v0), BitCast(du, v1), du,
2421
+ detail::U16LanePointer(unaligned));
2422
+ }
2423
+
2424
+ template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
2425
+ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
2426
+ T* HWY_RESTRICT unaligned) {
2427
+ const RebindToUnsigned<decltype(d)> du;
2428
+ StoreInterleaved3(BitCast(du, v0), BitCast(du, v1), BitCast(du, v2), du,
2429
+ detail::U16LanePointer(unaligned));
2430
+ }
2431
+
2432
+ template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
2433
+ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
2434
+ VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
2435
+ const RebindToUnsigned<decltype(d)> du;
2436
+ StoreInterleaved4(BitCast(du, v0), BitCast(du, v1), BitCast(du, v2),
2437
+ BitCast(du, v3), du, detail::U16LanePointer(unaligned));
2438
+ }
2439
+
2440
+ #endif // HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
2441
+
1822
2442
  // ------------------------------ LoadN
1823
2443
 
1824
2444
  #if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))
@@ -2327,6 +2947,24 @@ HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
2327
2947
 
2328
2948
  #endif // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
2329
2949
 
2950
+ // ------------------------------ TruncateStore
2951
+ #if (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE))
2952
+ #ifdef HWY_NATIVE_STORE_TRUNCATED
2953
+ #undef HWY_NATIVE_STORE_TRUNCATED
2954
+ #else
2955
+ #define HWY_NATIVE_STORE_TRUNCATED
2956
+ #endif
2957
+
2958
+ template <class D, class T, HWY_IF_T_SIZE_GT_D(D, sizeof(T)),
2959
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
2960
+ HWY_API void TruncateStore(VFromD<D> v, const D /*d*/, T* HWY_RESTRICT p) {
2961
+ using DTo = Rebind<T, D>;
2962
+ DTo dsmall;
2963
+ StoreU(TruncateTo(dsmall, v), dsmall, p);
2964
+ }
2965
+
2966
+ #endif // (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE))
2967
+
2330
2968
  // ------------------------------ Scatter
2331
2969
 
2332
2970
  #if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
@@ -2511,17 +3149,9 @@ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
2511
3149
 
2512
3150
  template <class D, typename T = TFromD<D>>
2513
3151
  HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
2514
- VFromD<RebindToSigned<D>> index,
2515
- const size_t max_lanes_to_load) {
2516
- const RebindToSigned<D> di;
2517
- using TI = TFromD<decltype(di)>;
2518
- static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2519
-
2520
- VFromD<D> v = Zero(d);
2521
- for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
2522
- v = InsertLane(v, i, base[ExtractLane(index, i)]);
2523
- }
2524
- return v;
3152
+ VFromD<RebindToSigned<D>> index,
3153
+ const size_t max_lanes_to_load) {
3154
+ return GatherIndexNOr(Zero(d), d, base, index, max_lanes_to_load);
2525
3155
  }
2526
3156
 
2527
3157
  template <class D, typename T = TFromD<D>>
@@ -2533,8 +3163,9 @@ HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
2533
3163
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2534
3164
 
2535
3165
  VFromD<D> v = no;
2536
- for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
2537
- v = InsertLane(v, i, base[ExtractLane(index, i)]);
3166
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
3167
+ if (i < max_lanes_to_load)
3168
+ v = InsertLane(v, i, base[ExtractLane(index, i)]);
2538
3169
  }
2539
3170
  return v;
2540
3171
  }
@@ -3561,6 +4192,21 @@ HWY_API V TrailingZeroCount(V v) {
3561
4192
  }
3562
4193
  #endif // HWY_NATIVE_LEADING_ZERO_COUNT
3563
4194
 
4195
+ // ------------------------------ MaskedLeadingZeroCount
4196
+ #if (defined(HWY_NATIVE_MASKED_LEADING_ZERO_COUNT) == \
4197
+ defined(HWY_TARGET_TOGGLE))
4198
+ #ifdef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
4199
+ #undef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
4200
+ #else
4201
+ #define HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
4202
+ #endif
4203
+
4204
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), class M>
4205
+ HWY_API V MaskedLeadingZeroCount(M m, V v) {
4206
+ return IfThenElseZero(m, LeadingZeroCount(v));
4207
+ }
4208
+ #endif // HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
4209
+
3564
4210
  // ------------------------------ AESRound
3565
4211
 
3566
4212
  // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
@@ -4027,6 +4673,12 @@ HWY_API V operator*(V x, V y) {
4027
4673
 
4028
4674
  #endif // HWY_NATIVE_MUL_64
4029
4675
 
4676
+ // ------------------------------ MulRound
4677
+ template <class V, HWY_IF_FLOAT_V(V)>
4678
+ HWY_API V MulRound(V a, V b) {
4679
+ return Round(Mul(a, b));
4680
+ }
4681
+
4030
4682
  // ------------------------------ MulAdd / NegMulAdd
4031
4683
 
4032
4684
  #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
@@ -4057,6 +4709,86 @@ HWY_API V MulSub(V mul, V x, V sub) {
4057
4709
  return Sub(Mul(mul, x), sub);
4058
4710
  }
4059
4711
  #endif // HWY_NATIVE_INT_FMA
4712
+ // ------------------------------ MulComplex* / MaskedMulComplex*
4713
+
4714
+ #if (defined(HWY_NATIVE_CPLX) == defined(HWY_TARGET_TOGGLE))
4715
+ #ifdef HWY_NATIVE_CPLX
4716
+ #undef HWY_NATIVE_CPLX
4717
+ #else
4718
+ #define HWY_NATIVE_CPLX
4719
+ #endif
4720
+
4721
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
4722
+
4723
+ template <class V, HWY_IF_NOT_UNSIGNED(TFromV<V>)>
4724
+ HWY_API V ComplexConj(V a) {
4725
+ return OddEven(Neg(a), a);
4726
+ }
4727
+
4728
+ template <class V>
4729
+ HWY_API V MulComplex(V a, V b) {
4730
+ // a = u + iv, b = x + iy
4731
+ const auto u = DupEven(a);
4732
+ const auto v = DupOdd(a);
4733
+ const auto x = DupEven(b);
4734
+ const auto y = DupOdd(b);
4735
+
4736
+ return OddEven(MulAdd(u, y, Mul(v, x)), Sub(Mul(u, x), Mul(v, y)));
4737
+ }
4738
+
4739
+ template <class V>
4740
+ HWY_API V MulComplexConj(V a, V b) {
4741
+ // a = u + iv, b = x + iy
4742
+ const auto u = DupEven(a);
4743
+ const auto v = DupOdd(a);
4744
+ const auto x = DupEven(b);
4745
+ const auto y = DupOdd(b);
4746
+
4747
+ return OddEven(Sub(Mul(v, x), Mul(u, y)), MulAdd(u, x, Mul(v, y)));
4748
+ }
4749
+
4750
+ template <class V>
4751
+ HWY_API V MulComplexAdd(V a, V b, V c) {
4752
+ return Add(MulComplex(a, b), c);
4753
+ }
4754
+
4755
+ template <class V>
4756
+ HWY_API V MulComplexConjAdd(V a, V b, V c) {
4757
+ return Add(MulComplexConj(a, b), c);
4758
+ }
4759
+
4760
+ template <class V, class M>
4761
+ HWY_API V MaskedMulComplexConjAdd(M mask, V a, V b, V c) {
4762
+ return IfThenElseZero(mask, MulComplexConjAdd(a, b, c));
4763
+ }
4764
+
4765
+ template <class V, class M>
4766
+ HWY_API V MaskedMulComplexConj(M mask, V a, V b) {
4767
+ return IfThenElseZero(mask, MulComplexConj(a, b));
4768
+ }
4769
+
4770
+ template <class V, class M>
4771
+ HWY_API V MaskedMulComplexOr(V no, M mask, V a, V b) {
4772
+ return IfThenElse(mask, MulComplex(a, b), no);
4773
+ }
4774
+ #endif // HWY_TARGET != HWY_SCALAR
4775
+
4776
+ #endif // HWY_NATIVE_CPLX
4777
+
4778
+ // ------------------------------ MaskedMulAddOr
4779
+ #if (defined(HWY_NATIVE_MASKED_INT_FMA) == defined(HWY_TARGET_TOGGLE))
4780
+ #ifdef HWY_NATIVE_MASKED_INT_FMA
4781
+ #undef HWY_NATIVE_MASKED_INT_FMA
4782
+ #else
4783
+ #define HWY_NATIVE_MASKED_INT_FMA
4784
+ #endif
4785
+
4786
+ template <class V, class M>
4787
+ HWY_API V MaskedMulAddOr(V no, M m, V mul, V x, V add) {
4788
+ return IfThenElse(m, MulAdd(mul, x, add), no);
4789
+ }
4790
+
4791
+ #endif // HWY_NATIVE_MASKED_INT_FMA
4060
4792
 
4061
4793
  // ------------------------------ Integer MulSub / NegMulSub
4062
4794
  #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
@@ -4112,6 +4844,25 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4112
4844
  OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
4113
4845
  return MulAdd(mul, x, add);
4114
4846
  }
4847
+ // ------------------------------ MulSubAdd
4848
+
4849
+ template <class V>
4850
+ HWY_API V MulSubAdd(V mul, V x, V sub_or_add) {
4851
+ using D = DFromV<V>;
4852
+ using T = TFromD<D>;
4853
+ using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
4854
+
4855
+ const D d;
4856
+ const Rebind<TNegate, D> d_negate;
4857
+
4858
+ return MulAddSub(mul, x, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
4859
+ }
4860
+
4861
+ // ------------------------------ MaskedConvertTo
4862
+ template <class D, class V, class M>
4863
+ HWY_API VFromD<D> MaskedConvertTo(M m, D d, V v) {
4864
+ return IfThenElseZero(m, ConvertTo(d, v));
4865
+ }
4115
4866
 
4116
4867
  // ------------------------------ Integer division
4117
4868
  #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
@@ -4574,7 +5325,9 @@ HWY_INLINE V IntDiv(V a, V b) {
4574
5325
  template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
4575
5326
  HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
4576
5327
  HWY_TARGET == HWY_WASM ||
4577
- HWY_TARGET == HWY_WASM_EMU256)
5328
+ HWY_TARGET == HWY_WASM_EMU256 ||
5329
+ HWY_TARGET == HWY_LSX ||
5330
+ HWY_TARGET == HWY_LASX)
4578
5331
  ? 0
4579
5332
  : (1 << 1)) |
4580
5333
  (1 << 2) | (1 << 4) | (1 << 8))>
@@ -4582,8 +5335,9 @@ HWY_INLINE V IntMod(V a, V b) {
4582
5335
  return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
4583
5336
  }
4584
5337
 
4585
- #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
4586
- HWY_TARGET == HWY_WASM_EMU256
5338
+ #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
5339
+ HWY_TARGET == HWY_WASM_EMU256 || HWY_TARGET == HWY_LSX || \
5340
+ HWY_TARGET == HWY_LASX
4587
5341
  template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
4588
5342
  HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
4589
5343
  HWY_INLINE V IntMod(V a, V b) {
@@ -4602,7 +5356,7 @@ HWY_INLINE V IntMod(V a, V b) {
4602
5356
  IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
4603
5357
  }
4604
5358
  #endif // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
4605
- // HWY_WASM_EMU256
5359
+ // HWY_WASM_EMU256 || HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
4606
5360
 
4607
5361
  } // namespace detail
4608
5362
 
@@ -4655,6 +5409,102 @@ HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
4655
5409
 
4656
5410
  #endif // HWY_NATIVE_INT_DIV
4657
5411
 
5412
+ // ------------------------------ AverageRound
5413
+
5414
+ #if (defined(HWY_NATIVE_AVERAGE_ROUND_UI32) == defined(HWY_TARGET_TOGGLE))
5415
+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
5416
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI32
5417
+ #else
5418
+ #define HWY_NATIVE_AVERAGE_ROUND_UI32
5419
+ #endif
5420
+
5421
+ template <class V, HWY_IF_UI32(TFromV<V>)>
5422
+ HWY_API V AverageRound(V a, V b) {
5423
+ return Sub(Or(a, b), ShiftRight<1>(Xor(a, b)));
5424
+ }
5425
+
5426
+ #endif // HWY_NATIVE_AVERAGE_ROUND_UI32
5427
+
5428
+ #if (defined(HWY_NATIVE_AVERAGE_ROUND_UI64) == defined(HWY_TARGET_TOGGLE))
5429
+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
5430
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI64
5431
+ #else
5432
+ #define HWY_NATIVE_AVERAGE_ROUND_UI64
5433
+ #endif
5434
+
5435
+ #if HWY_HAVE_INTEGER64
5436
+ template <class V, HWY_IF_UI64(TFromV<V>)>
5437
+ HWY_API V AverageRound(V a, V b) {
5438
+ return Sub(Or(a, b), ShiftRight<1>(Xor(a, b)));
5439
+ }
5440
+ #endif
5441
+
5442
+ #endif // HWY_NATIVE_AVERAGE_ROUND_UI64
5443
+
5444
+ // ------------------------------ RoundingShiftRight (AverageRound)
5445
+
5446
+ #if (defined(HWY_NATIVE_ROUNDING_SHR) == defined(HWY_TARGET_TOGGLE))
5447
+ #ifdef HWY_NATIVE_ROUNDING_SHR
5448
+ #undef HWY_NATIVE_ROUNDING_SHR
5449
+ #else
5450
+ #define HWY_NATIVE_ROUNDING_SHR
5451
+ #endif
5452
+
5453
+ template <int kShiftAmt, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5454
+ HWY_API V RoundingShiftRight(V v) {
5455
+ const DFromV<V> d;
5456
+ using T = TFromD<decltype(d)>;
5457
+
5458
+ static_assert(
5459
+ 0 <= kShiftAmt && kShiftAmt <= static_cast<int>(sizeof(T) * 8 - 1),
5460
+ "kShiftAmt is out of range");
5461
+
5462
+ constexpr int kScaleDownShrAmt = HWY_MAX(kShiftAmt - 1, 0);
5463
+
5464
+ auto scaled_down_v = v;
5465
+ HWY_IF_CONSTEXPR(kScaleDownShrAmt > 0) {
5466
+ scaled_down_v = ShiftRight<kScaleDownShrAmt>(v);
5467
+ }
5468
+
5469
+ HWY_IF_CONSTEXPR(kShiftAmt == 0) { return scaled_down_v; }
5470
+
5471
+ return AverageRound(scaled_down_v, Zero(d));
5472
+ }
5473
+
5474
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5475
+ HWY_API V RoundingShiftRightSame(V v, int shift_amt) {
5476
+ const DFromV<V> d;
5477
+ using T = TFromD<decltype(d)>;
5478
+
5479
+ const int shift_amt_is_zero_mask = -static_cast<int>(shift_amt == 0);
5480
+
5481
+ const auto scaled_down_v = ShiftRightSame(
5482
+ v, static_cast<int>(static_cast<unsigned>(shift_amt) +
5483
+ static_cast<unsigned>(~shift_amt_is_zero_mask)));
5484
+
5485
+ return AverageRound(
5486
+ scaled_down_v,
5487
+ And(scaled_down_v, Set(d, static_cast<T>(shift_amt_is_zero_mask))));
5488
+ }
5489
+
5490
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
5491
+ HWY_API V RoundingShr(V v, V amt) {
5492
+ const DFromV<V> d;
5493
+ const RebindToUnsigned<decltype(d)> du;
5494
+ using T = TFromD<decltype(d)>;
5495
+ using TU = MakeUnsigned<T>;
5496
+
5497
+ const auto unsigned_amt = BitCast(du, amt);
5498
+ const auto scale_down_shr_amt =
5499
+ BitCast(d, SaturatedSub(unsigned_amt, Set(du, TU{1})));
5500
+
5501
+ const auto scaled_down_v = Shr(v, scale_down_shr_amt);
5502
+ return AverageRound(scaled_down_v,
5503
+ IfThenElseZero(Eq(amt, Zero(d)), scaled_down_v));
5504
+ }
5505
+
5506
+ #endif // HWY_NATIVE_ROUNDING_SHR
5507
+
4658
5508
  // ------------------------------ MulEvenAdd (PromoteEvenTo)
4659
5509
 
4660
5510
  // SVE with bf16 and NEON with bf16 override this.
@@ -4835,6 +5685,26 @@ HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
4835
5685
 
4836
5686
  #endif // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
4837
5687
 
5688
+ // ------------------------------ MaskedSqrt
5689
+
5690
+ #if (defined(HWY_NATIVE_MASKED_SQRT) == defined(HWY_TARGET_TOGGLE))
5691
+
5692
+ #ifdef HWY_NATIVE_MASKED_SQRT
5693
+ #undef HWY_NATIVE_MASKED_SQRT
5694
+ #else
5695
+ #define HWY_NATIVE_MASKED_SQRT
5696
+ #endif
5697
+ template <class V, HWY_IF_FLOAT_V(V), class M>
5698
+ HWY_API V MaskedSqrt(M m, V v) {
5699
+ return IfThenElseZero(m, Sqrt(v));
5700
+ }
5701
+
5702
+ template <class V, HWY_IF_FLOAT_V(V), class M>
5703
+ HWY_API V MaskedSqrtOr(V no, M m, V v) {
5704
+ return IfThenElse(m, Sqrt(v), no);
5705
+ }
5706
+ #endif // HWY_NATIVE_MASKED_SQRT
5707
+
4838
5708
  // ------------------------------ SumOfMulQuadAccumulate
4839
5709
 
4840
5710
  #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
@@ -5019,6 +5889,12 @@ HWY_API V ApproximateReciprocal(V v) {
5019
5889
 
5020
5890
  #endif // HWY_NATIVE_F64_APPROX_RECIP
5021
5891
 
5892
+ // ------------------------------ MaskedApproximateReciprocal
5893
+ template <class V, HWY_IF_FLOAT_V(V), class M>
5894
+ HWY_API V MaskedApproximateReciprocal(M m, V v) {
5895
+ return IfThenElseZero(m, ApproximateReciprocal(v));
5896
+ }
5897
+
5022
5898
  // ------------------------------ F64 ApproximateReciprocalSqrt
5023
5899
 
5024
5900
  #if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE))
@@ -5044,6 +5920,12 @@ HWY_API V ApproximateReciprocalSqrt(V v) {
5044
5920
 
5045
5921
  #endif // HWY_NATIVE_F64_APPROX_RSQRT
5046
5922
 
5923
+ // ------------------------------ MaskedApproximateReciprocalSqrt
5924
+ template <class V, HWY_IF_FLOAT_V(V), class M>
5925
+ HWY_API V MaskedApproximateReciprocalSqrt(M m, V v) {
5926
+ return IfThenElseZero(m, ApproximateReciprocalSqrt(v));
5927
+ }
5928
+
5047
5929
  // ------------------------------ Compress*
5048
5930
 
5049
5931
  #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
@@ -5257,13 +6139,6 @@ HWY_API V CompressNot(V v, M mask) {
5257
6139
 
5258
6140
  namespace detail {
5259
6141
 
5260
- #if HWY_IDE
5261
- template <class M>
5262
- HWY_INLINE uint64_t BitsFromMask(M /* mask */) {
5263
- return 0;
5264
- }
5265
- #endif // HWY_IDE
5266
-
5267
6142
  template <size_t N>
5268
6143
  HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) {
5269
6144
  static_assert(N <= 8, "Should only be called for half-vectors");
@@ -5537,7 +6412,7 @@ template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)>
5537
6412
  HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
5538
6413
  const DFromV<decltype(v)> d;
5539
6414
 
5540
- const uint64_t mask_bits = detail::BitsFromMask(mask);
6415
+ const uint64_t mask_bits = BitsFromMask(d, mask);
5541
6416
  const Vec128<uint8_t, N> indices =
5542
6417
  detail::IndicesForExpandFromBits<N>(mask_bits);
5543
6418
  return BitCast(d, TableLookupBytesOr0(v, indices));
@@ -5551,15 +6426,16 @@ HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
5551
6426
  const Half<decltype(du)> duh;
5552
6427
  const Vec128<uint8_t> vu = BitCast(du, v);
5553
6428
 
5554
- const uint64_t mask_bits = detail::BitsFromMask(mask);
6429
+ const uint64_t mask_bits = BitsFromMask(d, mask);
5555
6430
  const uint64_t maskL = mask_bits & 0xFF;
5556
6431
  const uint64_t maskH = mask_bits >> 8;
5557
6432
 
5558
6433
  // We want to skip past the v bytes already consumed by idxL. There is no
5559
6434
  // instruction for shift-reg by variable bytes. Storing v itself would work
5560
6435
  // but would involve a store-load forwarding stall. We instead shuffle using
5561
- // loaded indices. multishift_epi64_epi8 would also help, but if we have that,
5562
- // we probably also have native 8-bit Expand.
6436
+ // loaded indices.
6437
+ // TODO: MultiRotateRight would also help, but if we have that, we probably
6438
+ // also have native 8-bit Expand?
5563
6439
  alignas(16) static constexpr uint8_t iota[32] = {
5564
6440
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
5565
6441
  11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128,
@@ -5583,7 +6459,7 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
5583
6459
  const RebindToUnsigned<decltype(d)> du;
5584
6460
 
5585
6461
  const Rebind<uint8_t, decltype(d)> du8;
5586
- const uint64_t mask_bits = detail::BitsFromMask(mask);
6462
+ const uint64_t mask_bits = BitsFromMask(d, mask);
5587
6463
 
5588
6464
  // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply
5589
6465
  // the nibble trick used below because not all indices fit within one lane.
@@ -5865,7 +6741,7 @@ HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
5865
6741
  const DFromV<decltype(v)> d;
5866
6742
  const RebindToUnsigned<decltype(d)> du;
5867
6743
 
5868
- const uint64_t mask_bits = detail::BitsFromMask(mask);
6744
+ const uint64_t mask_bits = BitsFromMask(d, mask);
5869
6745
 
5870
6746
  alignas(16) static constexpr uint32_t packed_array[16] = {
5871
6747
  // PrintExpand64x4Nibble - same for 32x4.
@@ -6550,6 +7426,107 @@ HWY_API V Per4LaneBlockShuffle(V v) {
6550
7426
  }
6551
7427
  #endif
6552
7428
 
7429
+ // ------------------------------ PairwiseAdd128/PairwiseSub128
7430
+ // (Per4LaneBlockShuffle)
7431
+ #if (defined(HWY_NATIVE_PAIRWISE_ADD_128) == defined(HWY_TARGET_TOGGLE))
7432
+ #ifdef HWY_NATIVE_PAIRWISE_ADD_128
7433
+ #undef HWY_NATIVE_PAIRWISE_ADD_128
7434
+ #else
7435
+ #define HWY_NATIVE_PAIRWISE_ADD_128
7436
+ #endif
7437
+
7438
+ namespace detail {
7439
+
7440
+ // detail::BlockwiseConcatOddEven(d, v) returns the even lanes of each block of
7441
+ // v followed by the odd lanes of that same block
7442
+ #if HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV || \
7443
+ HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
7444
+ template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
7445
+ HWY_IF_V_SIZE_GT_D(D, 8)>
7446
+ static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d,
7447
+ Vec<D> v) {
7448
+ #if HWY_TARGET == HWY_RVV
7449
+ const ScalableTag<uint64_t, HWY_MAX(HWY_POW2_D(D), 0)> du64;
7450
+ #else
7451
+ const Repartition<uint64_t, DFromV<decltype(v)>> du64;
7452
+ #endif
7453
+
7454
+ const Repartition<TFromD<decltype(d)>, decltype(du64)> d_concat;
7455
+ const auto v_to_concat = ResizeBitCast(d_concat, v);
7456
+
7457
+ const auto evens = ConcatEven(d, v_to_concat, v_to_concat);
7458
+ const auto odds = ConcatOdd(d, v_to_concat, v_to_concat);
7459
+ return ResizeBitCast(
7460
+ d, InterleaveWholeLower(BitCast(du64, evens), BitCast(du64, odds)));
7461
+ }
7462
+
7463
+ #else // !(HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV || HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX)
7464
+
7465
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
7466
+ static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d,
7467
+ Vec<D> v) {
7468
+ #if HWY_TARGET == HWY_SSE2
7469
+ const RebindToUnsigned<decltype(d)> du;
7470
+ const RebindToSigned<RepartitionToWide<decltype(du)>> dw;
7471
+
7472
+ const auto vu = BitCast(du, v);
7473
+ return BitCast(
7474
+ d, OrderedDemote2To(du, PromoteEvenTo(dw, vu), PromoteOddTo(dw, vu)));
7475
+ #else
7476
+ const Repartition<uint8_t, decltype(d)> du8;
7477
+ const auto idx =
7478
+ BitCast(d, Dup128VecFromValues(du8, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7,
7479
+ 9, 11, 13, 15));
7480
+ return TableLookupBytes(v, idx);
7481
+ #endif
7482
+ }
7483
+
7484
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
7485
+ static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d,
7486
+ Vec<D> v) {
7487
+ #if HWY_TARGET == HWY_SSE2
7488
+ const RebindToSigned<decltype(d)> di;
7489
+ const RepartitionToWide<decltype(di)> dw;
7490
+ const auto vi = BitCast(di, v);
7491
+ return BitCast(
7492
+ d, OrderedDemote2To(di, PromoteEvenTo(dw, vi), PromoteOddTo(dw, vi)));
7493
+ #else
7494
+ const Repartition<uint8_t, decltype(d)> du8;
7495
+ const auto idx = BitCast(d, Dup128VecFromValues(du8, 0, 1, 4, 5, 8, 9, 12, 13,
7496
+ 2, 3, 6, 7, 10, 11, 14, 15));
7497
+ return TableLookupBytes(v, idx);
7498
+ #endif
7499
+ }
7500
+
7501
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
7502
+ static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D /*d*/,
7503
+ Vec<D> v) {
7504
+ return Per4LaneBlockShuffle<3, 1, 2, 0>(v);
7505
+ }
7506
+ #endif // HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV || HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
7507
+
7508
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
7509
+ static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D /*d*/,
7510
+ Vec<D> v) {
7511
+ return v;
7512
+ }
7513
+
7514
+ } // namespace detail
7515
+
7516
+ // Pairwise add with output in 128 bit blocks of a and b.
7517
+ template <class D, HWY_IF_PAIRWISE_ADD_128_D(D)>
7518
+ HWY_API Vec<D> PairwiseAdd128(D d, Vec<D> a, Vec<D> b) {
7519
+ return detail::BlockwiseConcatOddEven(d, PairwiseAdd(d, a, b));
7520
+ }
7521
+
7522
+ // Pairwise sub with output in 128 bit blocks of a and b.
7523
+ template <class D, HWY_IF_PAIRWISE_SUB_128_D(D)>
7524
+ HWY_API Vec<D> PairwiseSub128(D d, Vec<D> a, Vec<D> b) {
7525
+ return detail::BlockwiseConcatOddEven(d, PairwiseSub(d, a, b));
7526
+ }
7527
+
7528
+ #endif // HWY_NATIVE_PAIRWISE_ADD_128
7529
+
6553
7530
  // ------------------------------ Blocks
6554
7531
 
6555
7532
  template <class D>
@@ -6922,9 +7899,17 @@ HWY_API V BitShuffle(V v, VI idx) {
6922
7899
  static_cast<uint64_t>(0x0102040810204080u);
6923
7900
  #endif
6924
7901
 
7902
+ const auto k7 = Set(du8, uint8_t{0x07});
7903
+
7904
+ auto unmasked_byte_idx = BitCast(du8, ShiftRight<3>(BitCast(d_idx_shr, idx)));
7905
+ #if HWY_IS_BIG_ENDIAN
7906
+ // Need to invert the lower 3 bits of unmasked_byte_idx[i] on big-endian
7907
+ // targets
7908
+ unmasked_byte_idx = Xor(unmasked_byte_idx, k7);
7909
+ #endif // HWY_IS_BIG_ENDIAN
7910
+
6925
7911
  const auto byte_idx = BitwiseIfThenElse(
6926
- Set(du8, uint8_t{0x07}),
6927
- BitCast(du8, ShiftRight<3>(BitCast(d_idx_shr, idx))),
7912
+ k7, unmasked_byte_idx,
6928
7913
  BitCast(du8, Dup128VecFromValues(du64, uint64_t{0},
6929
7914
  uint64_t{0x0808080808080808u})));
6930
7915
  // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
@@ -6942,6 +7927,195 @@ HWY_API V BitShuffle(V v, VI idx) {
6942
7927
 
6943
7928
  #endif // HWY_NATIVE_BITSHUFFLE
6944
7929
 
7930
+ template <class V, class M>
7931
+ HWY_API V MaskedOr(M m, V a, V b) {
7932
+ return IfThenElseZero(m, Or(a, b));
7933
+ }
7934
+ // ------------------------------ AllBits1/AllBits0
7935
+ #if (defined(HWY_NATIVE_ALLONES) == defined(HWY_TARGET_TOGGLE))
7936
+ #ifdef HWY_NATIVE_ALLONES
7937
+ #undef HWY_NATIVE_ALLONES
7938
+ #else
7939
+ #define HWY_NATIVE_ALLONES
7940
+ #endif
7941
+
7942
+ template <class D, class V = VFromD<D>>
7943
+ HWY_API bool AllBits1(D d, V v) {
7944
+ const RebindToUnsigned<decltype(d)> du;
7945
+ using TU = TFromD<decltype(du)>;
7946
+ return AllTrue(du, Eq(BitCast(du, v), Set(du, hwy::HighestValue<TU>())));
7947
+ }
7948
+ #endif // HWY_NATIVE_ALLONES
7949
+
7950
+ #if (defined(HWY_NATIVE_ALLZEROS) == defined(HWY_TARGET_TOGGLE))
7951
+ #ifdef HWY_NATIVE_ALLZEROS
7952
+ #undef HWY_NATIVE_ALLZEROS
7953
+ #else
7954
+ #define HWY_NATIVE_ALLZEROS
7955
+ #endif
7956
+
7957
+ template <class D, class V = VFromD<D>>
7958
+ HWY_API bool AllBits0(D d, V v) {
7959
+ return AllTrue(d, Eq(v, Zero(d)));
7960
+ }
7961
+ #endif // HWY_NATIVE_ALLZEROS
7962
+
7963
+ // ------------------------------ MultiRotateRight
7964
+ #if (defined(HWY_NATIVE_MULTIROTATERIGHT) == defined(HWY_TARGET_TOGGLE))
7965
+ #ifdef HWY_NATIVE_MULTIROTATERIGHT
7966
+ #undef HWY_NATIVE_MULTIROTATERIGHT
7967
+ #else
7968
+ #define HWY_NATIVE_MULTIROTATERIGHT
7969
+ #endif
7970
+
7971
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
7972
+ class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
7973
+ HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2)),
7974
+ HWY_IF_V_SIZE_V(V, 8)>
7975
+ HWY_API V MultiRotateRight(V v, VI idx) {
7976
+ const DFromV<V> d64;
7977
+ const Twice<decltype(d64)> dt64;
7978
+ const Repartition<uint8_t, decltype(d64)> du8;
7979
+ const Repartition<uint8_t, decltype(dt64)> dt_u8;
7980
+ const Repartition<uint16_t, decltype(dt64)> dt_u16;
7981
+ const auto k7 = Set(du8, uint8_t{0x07});
7982
+ const auto k63 = Set(du8, uint8_t{0x3F});
7983
+
7984
+ const auto masked_idx = And(k63, BitCast(du8, idx));
7985
+
7986
+ auto byte_idx = ShiftRight<3>(masked_idx);
7987
+ #if HWY_IS_LITTLE_ENDIAN
7988
+ const auto hi_byte_idx = Add(byte_idx, Set(du8, uint8_t{1}));
7989
+ #else
7990
+ byte_idx = Xor(byte_idx, k7);
7991
+ const auto hi_byte_idx = Add(byte_idx, k7);
7992
+ #endif
7993
+
7994
+ const auto idx_shift = And(k7, masked_idx);
7995
+
7996
+ // Calculate even lanes
7997
+ const auto even_src = DupEven(ResizeBitCast(dt64, v));
7998
+ // Expand indexes to pull out 16 bit segments of idx and idx + 1
7999
+ #if HWY_IS_LITTLE_ENDIAN
8000
+ const auto even_idx = InterleaveLower(ResizeBitCast(dt_u8, byte_idx),
8001
+ ResizeBitCast(dt_u8, hi_byte_idx));
8002
+ #else
8003
+ const auto even_idx = InterleaveLower(ResizeBitCast(dt_u8, hi_byte_idx),
8004
+ ResizeBitCast(dt_u8, byte_idx));
8005
+ #endif
8006
+ // TableLookupBytes indexes select from within a 16 byte block
8007
+ const auto even_segments = TableLookupBytes(even_src, even_idx);
8008
+ // Extract unaligned bytes from 16 bit segments
8009
+ const auto even_idx_shift = PromoteTo(dt_u16, idx_shift);
8010
+ const auto extracted_even_bytes =
8011
+ Shr(BitCast(dt_u16, even_segments), even_idx_shift);
8012
+
8013
+ // Extract the even bytes of each 128 bit block and pack into lower 64 bits
8014
+ #if HWY_IS_LITTLE_ENDIAN
8015
+ const auto even_lanes = BitCast(
8016
+ dt64,
8017
+ ConcatEven(dt_u8, Zero(dt_u8), BitCast(dt_u8, extracted_even_bytes)));
8018
+ #else
8019
+ const auto even_lanes = BitCast(
8020
+ dt64,
8021
+ ConcatOdd(dt_u8, Zero(dt_u8), BitCast(dt_u8, extracted_even_bytes)));
8022
+ #endif
8023
+
8024
+ return LowerHalf(d64, even_lanes);
8025
+ }
8026
+
8027
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
8028
+ class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
8029
+ HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2)),
8030
+ HWY_IF_V_SIZE_GT_V(V, 8)>
8031
+ HWY_API V MultiRotateRight(V v, VI idx) {
8032
+ const DFromV<V> d64;
8033
+ const Repartition<uint8_t, decltype(d64)> du8;
8034
+ const Repartition<uint16_t, decltype(d64)> du16;
8035
+ const auto k7 = Set(du8, uint8_t{0x07});
8036
+ const auto k63 = Set(du8, uint8_t{0x3F});
8037
+
8038
+ const auto masked_idx = And(k63, BitCast(du8, idx));
8039
+
8040
+ auto byte_idx = ShiftRight<3>(masked_idx);
8041
+ #if HWY_IS_LITTLE_ENDIAN
8042
+ const auto hi_byte_idx = Add(byte_idx, Set(du8, uint8_t{1}));
8043
+ #else
8044
+ byte_idx = Xor(byte_idx, k7);
8045
+ const auto hi_byte_idx = Add(byte_idx, k7);
8046
+ #endif
8047
+
8048
+ const auto idx_shift = And(k7, masked_idx);
8049
+
8050
+ // Calculate even lanes
8051
+ const auto even_src = DupEven(v);
8052
+ // Expand indexes to pull out 16 bit segments of idx and idx + 1
8053
+ #if HWY_IS_LITTLE_ENDIAN
8054
+ const auto even_idx = InterleaveLower(byte_idx, hi_byte_idx);
8055
+ #else
8056
+ const auto even_idx = InterleaveLower(hi_byte_idx, byte_idx);
8057
+ #endif
8058
+ // TableLookupBytes indexes select from within a 16 byte block
8059
+ const auto even_segments = TableLookupBytes(even_src, even_idx);
8060
+ // Extract unaligned bytes from 16 bit segments
8061
+ #if HWY_IS_LITTLE_ENDIAN
8062
+ const auto even_idx_shift = ZipLower(idx_shift, Zero(du8));
8063
+ #else
8064
+ const auto even_idx_shift = ZipLower(Zero(du8), idx_shift);
8065
+ #endif
8066
+ const auto extracted_even_bytes =
8067
+ Shr(BitCast(du16, even_segments), even_idx_shift);
8068
+
8069
+ // Calculate odd lanes
8070
+ const auto odd_src = DupOdd(v);
8071
+ // Expand indexes to pull out 16 bit segments of idx and idx + 1
8072
+ #if HWY_IS_LITTLE_ENDIAN
8073
+ const auto odd_idx = InterleaveUpper(du8, byte_idx, hi_byte_idx);
8074
+ #else
8075
+ const auto odd_idx = InterleaveUpper(du8, hi_byte_idx, byte_idx);
8076
+ #endif
8077
+ // TableLookupBytes indexes select from within a 16 byte block
8078
+ const auto odd_segments = TableLookupBytes(odd_src, odd_idx);
8079
+ // Extract unaligned bytes from 16 bit segments
8080
+ #if HWY_IS_LITTLE_ENDIAN
8081
+ const auto odd_idx_shift = ZipUpper(du16, idx_shift, Zero(du8));
8082
+ #else
8083
+ const auto odd_idx_shift = ZipUpper(du16, Zero(du8), idx_shift);
8084
+ #endif
8085
+ const auto extracted_odd_bytes =
8086
+ Shr(BitCast(du16, odd_segments), odd_idx_shift);
8087
+
8088
+ // Extract the even bytes of each 128 bit block and pack into lower 64 bits
8089
+ #if HWY_IS_LITTLE_ENDIAN
8090
+ const auto even_lanes = BitCast(
8091
+ d64, ConcatEven(du8, Zero(du8), BitCast(du8, extracted_even_bytes)));
8092
+ const auto odd_lanes = BitCast(
8093
+ d64, ConcatEven(du8, Zero(du8), BitCast(du8, extracted_odd_bytes)));
8094
+ #else
8095
+ const auto even_lanes = BitCast(
8096
+ d64, ConcatOdd(du8, Zero(du8), BitCast(du8, extracted_even_bytes)));
8097
+ const auto odd_lanes = BitCast(
8098
+ d64, ConcatOdd(du8, Zero(du8), BitCast(du8, extracted_odd_bytes)));
8099
+ #endif
8100
+ // Interleave at 64 bit level
8101
+ return InterleaveWholeLower(even_lanes, odd_lanes);
8102
+ }
8103
+
8104
+ #if HWY_TARGET == HWY_RVV
8105
+
8106
+ // MultiRotateRight for LMUL=1/2 case on RVV
8107
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
8108
+ class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
8109
+ HWY_IF_POW2_LE_D(DFromV<V>, 0),
8110
+ HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2) / 2)>
8111
+ HWY_API V MultiRotateRight(V v, VI idx) {
8112
+ return MultiRotateRight(v, ResizeBitCast(Twice<DFromV<VI>>(), idx));
8113
+ }
8114
+
8115
+ #endif
8116
+
8117
+ #endif // HWY_NATIVE_MULTIROTATERIGHT
8118
+
6945
8119
  // ================================================== Operator wrapper
6946
8120
 
6947
8121
  // SVE* and RVV currently cannot define operators and have already defined
@@ -7013,6 +8187,20 @@ HWY_API auto Le(V a, V b) -> decltype(a == b) {
7013
8187
 
7014
8188
  #endif // HWY_NATIVE_OPERATOR_REPLACEMENTS
7015
8189
 
8190
+ #undef HWY_GENERIC_IF_EMULATED_D
8191
+
8192
+ // TODO: remove once callers are updated.
8193
+ // SVE and RVV do not support DFromM because their masks are loosely typed.
8194
+ #if HWY_MAX_BYTES <= 64 && !HWY_TARGET_IS_SVE && HWY_TARGET != HWY_RVV
8195
+ namespace detail {
8196
+ template <class M>
8197
+ uint64_t BitsFromMask(M m) {
8198
+ const DFromM<M> d;
8199
+ return ::hwy::HWY_NAMESPACE::BitsFromMask(d, m);
8200
+ }
8201
+ } // namespace detail
8202
+ #endif // HWY_MAX_BYTES <= 64 && !HWY_TARGET_IS_SVE && HWY_TARGET != HWY_RVV
8203
+
7016
8204
  // NOLINTNEXTLINE(google-readability-namespace-comments)
7017
8205
  } // namespace HWY_NAMESPACE
7018
8206
  } // namespace hwy