@img/sharp-libvips-dev 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -17,6 +17,7 @@
17
17
  // External include guard in highway.h - see comment there.
18
18
 
19
19
  #include "hwy/base.h"
20
+
20
21
  #ifndef HWY_NO_LIBCXX
21
22
  #include <math.h> // sqrtf
22
23
  #endif
@@ -103,9 +104,6 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
103
104
  template <class D>
104
105
  using VFromD = decltype(Zero(D()));
105
106
 
106
- // ------------------------------ Tuple (VFromD)
107
- #include "hwy/ops/tuple-inl.h"
108
-
109
107
  // ------------------------------ BitCast
110
108
 
111
109
  template <class D, class VFrom>
@@ -355,9 +353,8 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
355
353
  // ------------------------------ BroadcastSignBit
356
354
  template <typename T, size_t N>
357
355
  HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
358
- // This is used inside ShiftRight, so we cannot implement in terms of it.
359
356
  for (size_t i = 0; i < N; ++i) {
360
- v.raw[i] = static_cast<T>(v.raw[i] < 0 ? -1 : 0);
357
+ v.raw[i] = ScalarShr(v.raw[i], sizeof(T) * 8 - 1);
361
358
  }
362
359
  return v;
363
360
  }
@@ -431,12 +428,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
431
428
  return v;
432
429
  }
433
430
 
434
- template <typename T, size_t N>
435
- HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
436
- const DFromV<decltype(v)> d;
437
- return IfNegativeThenElse(v, Zero(d), v);
438
- }
439
-
440
431
  // ------------------------------ Mask logical
441
432
 
442
433
  template <typename T, size_t N>
@@ -494,41 +485,26 @@ HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
494
485
  template <int kBits, typename T, size_t N>
495
486
  HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
496
487
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
497
- #if __cplusplus >= 202002L
498
488
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
499
489
  // negative infinity, i.e. shifting in the sign bit).
500
490
  for (size_t i = 0; i < N; ++i) {
501
- v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
491
+ v.raw[i] = ScalarShr(v.raw[i], kBits);
502
492
  }
503
- #else
504
- if (IsSigned<T>()) {
505
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
506
- // signed shifts are still implementation-defined.
507
- using TU = hwy::MakeUnsigned<T>;
508
- for (size_t i = 0; i < N; ++i) {
509
- const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
510
- const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
511
- const size_t sign_shift =
512
- static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
513
- const TU upper = static_cast<TU>(sign << sign_shift);
514
- v.raw[i] = static_cast<T>(shifted | upper);
515
- }
516
- } else { // T is unsigned
517
- for (size_t i = 0; i < N; ++i) {
518
- v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
519
- }
520
- }
521
- #endif
493
+
522
494
  return v;
523
495
  }
524
496
 
525
497
  // ------------------------------ RotateRight (ShiftRight)
526
- template <int kBits, typename T, size_t N>
498
+ template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
527
499
  HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
500
+ const DFromV<decltype(v)> d;
501
+ const RebindToUnsigned<decltype(d)> du;
502
+
528
503
  constexpr size_t kSizeInBits = sizeof(T) * 8;
529
504
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
530
505
  if (kBits == 0) return v;
531
- return Or(ShiftRight<kBits>(v),
506
+
507
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
532
508
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
533
509
  }
534
510
 
@@ -545,31 +521,10 @@ HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
545
521
 
546
522
  template <typename T, size_t N>
547
523
  HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
548
- #if __cplusplus >= 202002L
549
- // Signed right shift is now guaranteed to be arithmetic (rounding toward
550
- // negative infinity, i.e. shifting in the sign bit).
551
524
  for (size_t i = 0; i < N; ++i) {
552
- v.raw[i] = static_cast<T>(v.raw[i] >> bits);
553
- }
554
- #else
555
- if (IsSigned<T>()) {
556
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
557
- // signed shifts are still implementation-defined.
558
- using TU = hwy::MakeUnsigned<T>;
559
- for (size_t i = 0; i < N; ++i) {
560
- const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
561
- const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
562
- const size_t sign_shift =
563
- static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
564
- const TU upper = static_cast<TU>(sign << sign_shift);
565
- v.raw[i] = static_cast<T>(shifted | upper);
566
- }
567
- } else {
568
- for (size_t i = 0; i < N; ++i) {
569
- v.raw[i] = static_cast<T>(v.raw[i] >> bits); // unsigned, logical shift
570
- }
525
+ v.raw[i] = ScalarShr(v.raw[i], bits);
571
526
  }
572
- #endif
527
+
573
528
  return v;
574
529
  }
575
530
 
@@ -587,32 +542,10 @@ HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
587
542
 
588
543
  template <typename T, size_t N>
589
544
  HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
590
- #if __cplusplus >= 202002L
591
- // Signed right shift is now guaranteed to be arithmetic (rounding toward
592
- // negative infinity, i.e. shifting in the sign bit).
593
545
  for (size_t i = 0; i < N; ++i) {
594
- v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
595
- }
596
- #else
597
- if (IsSigned<T>()) {
598
- // Emulate arithmetic shift using only logical (unsigned) shifts, because
599
- // signed shifts are still implementation-defined.
600
- using TU = hwy::MakeUnsigned<T>;
601
- for (size_t i = 0; i < N; ++i) {
602
- const TU shifted =
603
- static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
604
- const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
605
- const size_t sign_shift = static_cast<size_t>(
606
- static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
607
- const TU upper = static_cast<TU>(sign << sign_shift);
608
- v.raw[i] = static_cast<T>(shifted | upper);
609
- }
610
- } else { // T is unsigned
611
- for (size_t i = 0; i < N; ++i) {
612
- v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
613
- }
546
+ v.raw[i] = ScalarShr(v.raw[i], static_cast<int>(bits.raw[i]));
614
547
  }
615
- #endif
548
+
616
549
  return v;
617
550
  }
618
551
 
@@ -890,26 +823,36 @@ HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
890
823
  return a;
891
824
  }
892
825
 
893
- // Returns the upper 16 bits of a * b in each lane.
894
- template <size_t N>
895
- HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
826
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
827
+ template <class T, size_t N,
828
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
829
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
830
+ HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
831
+ using TW = MakeWide<T>;
896
832
  for (size_t i = 0; i < N; ++i) {
897
- a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
833
+ a.raw[i] = static_cast<T>(
834
+ (static_cast<TW>(a.raw[i]) * static_cast<TW>(b.raw[i])) >>
835
+ (sizeof(T) * 8));
898
836
  }
899
837
  return a;
900
838
  }
901
- template <size_t N>
902
- HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
903
- Vec128<uint16_t, N> b) {
904
- for (size_t i = 0; i < N; ++i) {
905
- // Cast to uint32_t first to prevent overflow. Otherwise the result of
906
- // uint16_t * uint16_t is in "int" which may overflow. In practice the
907
- // result is the same but this way it is also defined.
908
- a.raw[i] = static_cast<uint16_t>(
909
- (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
910
- 16);
911
- }
912
- return a;
839
+
840
+ template <class T, HWY_IF_UI64(T)>
841
+ HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
842
+ T hi;
843
+ Mul128(GetLane(a), GetLane(b), &hi);
844
+ return Set(Full64<T>(), hi);
845
+ }
846
+
847
+ template <class T, HWY_IF_UI64(T)>
848
+ HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
849
+ T hi_0;
850
+ T hi_1;
851
+
852
+ Mul128(GetLane(a), GetLane(b), &hi_0);
853
+ Mul128(ExtractLane(a, 1), ExtractLane(b, 1), &hi_1);
854
+
855
+ return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
913
856
  }
914
857
 
915
858
  template <size_t N>
@@ -1457,6 +1400,183 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
1457
1400
  CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
1458
1401
  }
1459
1402
 
1403
+ // ================================================== COMBINE
1404
+
1405
+ template <typename T, size_t N>
1406
+ HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
1407
+ Vec128<T, N / 2> ret;
1408
+ CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
1409
+ return ret;
1410
+ }
1411
+
1412
+ template <class D>
1413
+ HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
1414
+ return LowerHalf(v);
1415
+ }
1416
+
1417
+ template <class D>
1418
+ HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
1419
+ VFromD<D> ret;
1420
+ CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
1421
+ return ret;
1422
+ }
1423
+
1424
+ template <class D>
1425
+ HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
1426
+ const Half<decltype(d)> dh;
1427
+ VFromD<D> ret; // zero-initialized
1428
+ CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
1429
+ return ret;
1430
+ }
1431
+
1432
+ template <class D, class VH = VFromD<Half<D>>>
1433
+ HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
1434
+ const Half<decltype(d)> dh;
1435
+ VFromD<D> ret;
1436
+ CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
1437
+ CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
1438
+ return ret;
1439
+ }
1440
+
1441
+ template <class D>
1442
+ HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
1443
+ const Half<decltype(d)> dh;
1444
+ VFromD<D> ret;
1445
+ CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
1446
+ CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
1447
+ return ret;
1448
+ }
1449
+
1450
+ template <class D>
1451
+ HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1452
+ const Half<decltype(d)> dh;
1453
+ VFromD<D> ret;
1454
+ CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
1455
+ CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
1456
+ return ret;
1457
+ }
1458
+
1459
+ template <class D>
1460
+ HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1461
+ const Half<decltype(d)> dh;
1462
+ VFromD<D> ret;
1463
+ CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
1464
+ CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
1465
+ return ret;
1466
+ }
1467
+
1468
+ template <class D>
1469
+ HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
1470
+ const Half<decltype(d)> dh;
1471
+ VFromD<D> ret;
1472
+ CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
1473
+ CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
1474
+ return ret;
1475
+ }
1476
+
1477
+ template <class D>
1478
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
1479
+ const Half<decltype(d)> dh;
1480
+ VFromD<D> ret;
1481
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1482
+ ret.raw[i] = lo.raw[2 * i];
1483
+ }
1484
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1485
+ ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
1486
+ }
1487
+ return ret;
1488
+ }
1489
+
1490
+ // 2023-11-23: workaround for incorrect codegen (reduction_test fails for
1491
+ // SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
1492
+ #if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
1493
+ #define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
1494
+ #else
1495
+ #define HWY_EMU128_CONCAT_INLINE HWY_API
1496
+ #endif
1497
+
1498
+ template <class D>
1499
+ HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
1500
+ const Half<decltype(d)> dh;
1501
+ VFromD<D> ret;
1502
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1503
+ ret.raw[i] = lo.raw[2 * i + 1];
1504
+ }
1505
+ for (size_t i = 0; i < MaxLanes(dh); ++i) {
1506
+ ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
1507
+ }
1508
+ return ret;
1509
+ }
1510
+
1511
+ // ------------------------------ CombineShiftRightBytes
1512
+ template <int kBytes, class D>
1513
+ HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
1514
+ VFromD<D> ret;
1515
+ const uint8_t* HWY_RESTRICT lo8 =
1516
+ reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
1517
+ uint8_t* HWY_RESTRICT ret8 =
1518
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
1519
+ CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
1520
+ CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
1521
+ return ret;
1522
+ }
1523
+
1524
+ // ------------------------------ ShiftLeftBytes
1525
+
1526
+ template <int kBytes, class D>
1527
+ HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
1528
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1529
+ VFromD<D> ret;
1530
+ uint8_t* HWY_RESTRICT ret8 =
1531
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
1532
+ ZeroBytes<kBytes>(ret8);
1533
+ CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
1534
+ return ret;
1535
+ }
1536
+
1537
+ template <int kBytes, typename T, size_t N>
1538
+ HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
1539
+ return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
1540
+ }
1541
+
1542
+ // ------------------------------ ShiftLeftLanes
1543
+
1544
+ template <int kLanes, class D, typename T = TFromD<D>>
1545
+ HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
1546
+ const Repartition<uint8_t, decltype(d)> d8;
1547
+ return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
1548
+ }
1549
+
1550
+ template <int kLanes, typename T, size_t N>
1551
+ HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
1552
+ return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
1553
+ }
1554
+
1555
+ // ------------------------------ ShiftRightBytes
1556
+ template <int kBytes, class D>
1557
+ HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
1558
+ static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
1559
+ VFromD<D> ret;
1560
+ const uint8_t* HWY_RESTRICT v8 =
1561
+ reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
1562
+ uint8_t* HWY_RESTRICT ret8 =
1563
+ reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
1564
+ CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
1565
+ ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
1566
+ return ret;
1567
+ }
1568
+
1569
+ // ------------------------------ ShiftRightLanes
1570
+ template <int kLanes, class D>
1571
+ HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
1572
+ const Repartition<uint8_t, decltype(d)> d8;
1573
+ constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
1574
+ return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
1575
+ }
1576
+
1577
+ // ------------------------------ Tuples, PromoteEvenTo/PromoteOddTo
1578
+ #include "hwy/ops/inside-inl.h"
1579
+
1460
1580
  // ------------------------------ LoadInterleaved2/3/4
1461
1581
 
1462
1582
  // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
@@ -1621,6 +1741,47 @@ HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
1621
1741
  float val) {
1622
1742
  return CastValueForF2IConv<ToT>(val);
1623
1743
  }
1744
+ // If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
1745
+ // returns static_cast<ToT>(val)
1746
+ //
1747
+ // Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
1748
+ // implementation-defined result if val is not within the range of ToT.
1749
+ template <class ToT, class FromT>
1750
+ HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
1751
+ // Prevent ubsan errors when converting float to narrower integer
1752
+
1753
+ using FromTU = MakeUnsigned<FromT>;
1754
+
1755
+ constexpr unsigned kMaxExpField =
1756
+ static_cast<unsigned>(MaxExponentField<FromT>());
1757
+ constexpr unsigned kExpBias = kMaxExpField >> 1;
1758
+ constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
1759
+ kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
1760
+ kMaxExpField));
1761
+
1762
+ // If ToT is signed, compare only the exponent bits of val against
1763
+ // kMinOutOfRangeExpField.
1764
+ //
1765
+ // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
1766
+ // val against kMinOutOfRangeExpField as a negative value is outside of the
1767
+ // range of an unsigned integer type.
1768
+ const FromT val_to_compare =
1769
+ static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);
1770
+
1771
+ // val is within the range of ToT if
1772
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
1773
+ // than kMinOutOfRangeExpField
1774
+ //
1775
+ // Otherwise, val is either outside of the range of ToT or equal to
1776
+ // LimitsMin<ToT>() if
1777
+ // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
1778
+ // than or equal to kMinOutOfRangeExpField.
1779
+
1780
+ return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
1781
+ MantissaBits<FromT>()) < kMinOutOfRangeExpField)
1782
+ ? static_cast<ToT>(val)
1783
+ : static_cast<ToT>(LimitsMin<ToT>());
1784
+ }
1624
1785
 
1625
1786
  } // namespace detail
1626
1787
 
@@ -1636,6 +1797,21 @@ HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1636
1797
  return ret;
1637
1798
  }
1638
1799
 
1800
+ #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1801
+ #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1802
+ #else
1803
+ #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
1804
+ #endif
1805
+
1806
+ template <class D64, HWY_IF_UI64_D(D64)>
1807
+ HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
1808
+ VFromD<D64> ret;
1809
+ for (size_t i = 0; i < MaxLanes(d64); ++i) {
1810
+ ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
1811
+ }
1812
+ return ret;
1813
+ }
1814
+
1639
1815
  // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
1640
1816
  // so we overload for TFrom=double and ToT={float,int32_t}.
1641
1817
  template <class D, HWY_IF_F32_D(D)>
@@ -1679,17 +1855,32 @@ HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
1679
1855
  return ret;
1680
1856
  }
1681
1857
 
1858
+ // Disable the default unsigned to signed DemoteTo/ReorderDemote2To
1859
+ // implementations in generic_ops-inl.h on EMU128 as the EMU128 target has
1860
+ // target-specific implementations of the unsigned to signed DemoteTo and
1861
+ // ReorderDemote2To ops
1862
+
1863
+ // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
1864
+ // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
1865
+ // !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
1866
+ // SFINAE to occur instead of a hard error due to a dependency on the V template
1867
+ // argument
1868
+ #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
1869
+ #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
1870
+ hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
1871
+
1682
1872
  template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
1683
- HWY_IF_UNSIGNED_D(DTo)>
1873
+ HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
1684
1874
  HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
1685
1875
  using TTo = TFromD<DTo>;
1686
1876
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
1687
1877
 
1878
+ const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
1879
+
1688
1880
  VFromD<DTo> ret;
1689
1881
  for (size_t i = 0; i < N; ++i) {
1690
1882
  // Int to int: choose closest value in ToT to `from` (avoids UB)
1691
- from.raw[i] = HWY_MIN(from.raw[i], LimitsMax<TTo>());
1692
- ret.raw[i] = static_cast<TTo>(from.raw[i]);
1883
+ ret.raw[i] = static_cast<TTo>(HWY_MIN(from.raw[i], max));
1693
1884
  }
1694
1885
  return ret;
1695
1886
  }
@@ -1737,14 +1928,15 @@ HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1737
1928
  return ret;
1738
1929
  }
1739
1930
 
1740
- template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
1741
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1931
+ template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), class V,
1932
+ HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
1742
1933
  HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
1743
1934
  HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
1744
1935
  const RepartitionToWide<decltype(dn)> dw;
1745
1936
  const size_t NW = Lanes(dw);
1746
1937
  using TN = TFromD<DN>;
1747
- const TN max = LimitsMax<TN>();
1938
+ using TN_U = MakeUnsigned<TN>;
1939
+ const TN_U max = static_cast<TN_U>(LimitsMax<TN>());
1748
1940
  VFromD<DN> ret;
1749
1941
  for (size_t i = 0; i < NW; ++i) {
1750
1942
  ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
@@ -1803,6 +1995,12 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
1803
1995
  return ret;
1804
1996
  }
1805
1997
 
1998
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
1999
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
2000
+ #else
2001
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
2002
+ #endif
2003
+
1806
2004
  template <class D, HWY_IF_BF16_D(D), size_t N>
1807
2005
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
1808
2006
  VFromD<D> ret;
@@ -1812,6 +2010,21 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
1812
2010
  return ret;
1813
2011
  }
1814
2012
 
2013
+ #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
2014
+ #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
2015
+ #else
2016
+ #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
2017
+ #endif
2018
+
2019
+ template <class D32, HWY_IF_UI32_D(D32)>
2020
+ HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
2021
+ VFromD<D32> ret;
2022
+ for (size_t i = 0; i < MaxLanes(d32); ++i) {
2023
+ ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
2024
+ }
2025
+ return ret;
2026
+ }
2027
+
1815
2028
  // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
1816
2029
  namespace detail {
1817
2030
 
@@ -1851,6 +2064,22 @@ HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
1851
2064
  return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
1852
2065
  }
1853
2066
 
2067
+ #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
2068
+ #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
2069
+ #else
2070
+ #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
2071
+ #endif
2072
+
2073
+ template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
2074
+ HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
2075
+ HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
2076
+ VFromD<DI> ret;
2077
+ for (size_t i = 0; i < MaxLanes(di); i++) {
2078
+ ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
2079
+ }
2080
+ return ret;
2081
+ }
2082
+
1854
2083
  template <size_t N>
1855
2084
  HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
1856
2085
  return DemoteTo(Simd<uint8_t, N, 0>(), v);
@@ -1938,180 +2167,6 @@ HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
1938
2167
  return ret;
1939
2168
  }
1940
2169
 
1941
- // ================================================== COMBINE
1942
-
1943
- template <typename T, size_t N>
1944
- HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
1945
- Vec128<T, N / 2> ret;
1946
- CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
1947
- return ret;
1948
- }
1949
-
1950
- template <class D>
1951
- HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
1952
- return LowerHalf(v);
1953
- }
1954
-
1955
- template <class D>
1956
- HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
1957
- VFromD<D> ret;
1958
- CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
1959
- return ret;
1960
- }
1961
-
1962
- template <class D>
1963
- HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
1964
- const Half<decltype(d)> dh;
1965
- VFromD<D> ret; // zero-initialized
1966
- CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
1967
- return ret;
1968
- }
1969
-
1970
- template <class D, class VH = VFromD<Half<D>>>
1971
- HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
1972
- const Half<decltype(d)> dh;
1973
- VFromD<D> ret;
1974
- CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
1975
- CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
1976
- return ret;
1977
- }
1978
-
1979
- template <class D>
1980
- HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
1981
- const Half<decltype(d)> dh;
1982
- VFromD<D> ret;
1983
- CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
1984
- CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
1985
- return ret;
1986
- }
1987
-
1988
- template <class D>
1989
- HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1990
- const Half<decltype(d)> dh;
1991
- VFromD<D> ret;
1992
- CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
1993
- CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
1994
- return ret;
1995
- }
1996
-
1997
- template <class D>
1998
- HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
1999
- const Half<decltype(d)> dh;
2000
- VFromD<D> ret;
2001
- CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
2002
- CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
2003
- return ret;
2004
- }
2005
-
2006
- template <class D>
2007
- HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
2008
- const Half<decltype(d)> dh;
2009
- VFromD<D> ret;
2010
- CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
2011
- CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
2012
- return ret;
2013
- }
2014
-
2015
- template <class D>
2016
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
2017
- const Half<decltype(d)> dh;
2018
- VFromD<D> ret;
2019
- for (size_t i = 0; i < MaxLanes(dh); ++i) {
2020
- ret.raw[i] = lo.raw[2 * i];
2021
- }
2022
- for (size_t i = 0; i < MaxLanes(dh); ++i) {
2023
- ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
2024
- }
2025
- return ret;
2026
- }
2027
-
2028
- // 2023-11-23: workaround for incorrect codegen (reduction_test fails for
2029
- // SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
2030
- #if HWY_ARCH_RVV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
2031
- #define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
2032
- #else
2033
- #define HWY_EMU128_CONCAT_INLINE HWY_API
2034
- #endif
2035
-
2036
- template <class D>
2037
- HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
2038
- const Half<decltype(d)> dh;
2039
- VFromD<D> ret;
2040
- for (size_t i = 0; i < MaxLanes(dh); ++i) {
2041
- ret.raw[i] = lo.raw[2 * i + 1];
2042
- }
2043
- for (size_t i = 0; i < MaxLanes(dh); ++i) {
2044
- ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
2045
- }
2046
- return ret;
2047
- }
2048
-
2049
- // ------------------------------ CombineShiftRightBytes
2050
- template <int kBytes, class D>
2051
- HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
2052
- VFromD<D> ret;
2053
- const uint8_t* HWY_RESTRICT lo8 =
2054
- reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
2055
- uint8_t* HWY_RESTRICT ret8 =
2056
- reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2057
- CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
2058
- CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
2059
- return ret;
2060
- }
2061
-
2062
- // ------------------------------ ShiftLeftBytes
2063
-
2064
- template <int kBytes, class D>
2065
- HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
2066
- static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2067
- VFromD<D> ret;
2068
- uint8_t* HWY_RESTRICT ret8 =
2069
- reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2070
- ZeroBytes<kBytes>(ret8);
2071
- CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
2072
- return ret;
2073
- }
2074
-
2075
- template <int kBytes, typename T, size_t N>
2076
- HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
2077
- return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
2078
- }
2079
-
2080
- // ------------------------------ ShiftLeftLanes
2081
-
2082
- template <int kLanes, class D, typename T = TFromD<D>>
2083
- HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
2084
- const Repartition<uint8_t, decltype(d)> d8;
2085
- return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
2086
- }
2087
-
2088
- template <int kLanes, typename T, size_t N>
2089
- HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
2090
- return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
2091
- }
2092
-
2093
- // ------------------------------ ShiftRightBytes
2094
- template <int kBytes, class D>
2095
- HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
2096
- static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
2097
- VFromD<D> ret;
2098
- const uint8_t* HWY_RESTRICT v8 =
2099
- reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
2100
- uint8_t* HWY_RESTRICT ret8 =
2101
- reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
2102
- CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
2103
- ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
2104
- return ret;
2105
- }
2106
-
2107
- // ------------------------------ ShiftRightLanes
2108
- template <int kLanes, class D>
2109
- HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
2110
- const Repartition<uint8_t, decltype(d)> d8;
2111
- constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
2112
- return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
2113
- }
2114
-
2115
2170
  // ================================================== SWIZZLE
2116
2171
 
2117
2172
  template <typename T, size_t N>
@@ -2154,6 +2209,24 @@ HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
2154
2209
  return odd;
2155
2210
  }
2156
2211
 
2212
+ template <class D>
2213
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
2214
+ constexpr size_t N = HWY_MAX_LANES_D(D);
2215
+ for (size_t i = 1; i < N; i += 2) {
2216
+ a.raw[i] = b.raw[i - 1];
2217
+ }
2218
+ return a;
2219
+ }
2220
+
2221
+ template <class D>
2222
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
2223
+ constexpr size_t N = HWY_MAX_LANES_D(D);
2224
+ for (size_t i = 1; i < N; i += 2) {
2225
+ b.raw[i - 1] = a.raw[i];
2226
+ }
2227
+ return b;
2228
+ }
2229
+
2157
2230
  template <typename T, size_t N>
2158
2231
  HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
2159
2232
  return even;
@@ -2724,88 +2797,26 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
2724
2797
 
2725
2798
  // ------------------------------ WidenMulPairwiseAdd
2726
2799
 
2727
- template <class D, HWY_IF_F32_D(D), class VBF16>
2728
- HWY_API VFromD<D> WidenMulPairwiseAdd(D df32, VBF16 a, VBF16 b) {
2729
- const Rebind<uint32_t, decltype(df32)> du32;
2730
- using VU32 = VFromD<decltype(du32)>;
2731
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
2732
- // Avoid ZipLower/Upper so this also works on big-endian systems.
2733
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2734
- const VU32 ao = And(BitCast(du32, a), odd);
2735
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2736
- const VU32 bo = And(BitCast(du32, b), odd);
2737
- return Mul(BitCast(df32, ae), BitCast(df32, be)) +
2738
- Mul(BitCast(df32, ao), BitCast(df32, bo));
2739
- }
2740
-
2741
- template <class D, HWY_IF_I32_D(D), class VI16>
2742
- HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
2743
- using VI32 = VFromD<decltype(d32)>;
2744
- // Manual sign extension requires two shifts for even lanes.
2745
- const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
2746
- const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
2747
- const VI32 ao = ShiftRight<16>(BitCast(d32, a));
2748
- const VI32 bo = ShiftRight<16>(BitCast(d32, b));
2749
- return Add(Mul(ae, be), Mul(ao, bo));
2800
+ template <class DF, HWY_IF_F32_D(DF), class VBF>
2801
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
2802
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
2803
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
2750
2804
  }
2751
2805
 
2752
- template <class D, HWY_IF_U32_D(D), class VU16>
2753
- HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VU16 a, VU16 b) {
2754
- const auto lo16_mask = Set(du32, 0x0000FFFFu);
2755
-
2756
- const auto a0 = And(BitCast(du32, a), lo16_mask);
2757
- const auto b0 = And(BitCast(du32, b), lo16_mask);
2758
-
2759
- const auto a1 = ShiftRight<16>(BitCast(du32, a));
2760
- const auto b1 = ShiftRight<16>(BitCast(du32, b));
2761
-
2762
- return Add(Mul(a0, b0), Mul(a1, b1));
2806
+ template <class D, HWY_IF_UI32_D(D), class V16>
2807
+ HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
2808
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
2809
+ Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
2763
2810
  }
2764
2811
 
2765
2812
  // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2766
2813
 
2767
- template <class D, HWY_IF_F32_D(D), size_t N, class VBF16>
2768
- HWY_API VFromD<D> ReorderWidenMulAccumulate(D df32, VBF16 a, VBF16 b,
2769
- const Vec128<float, N> sum0,
2770
- Vec128<float, N>& sum1) {
2771
- const Rebind<uint32_t, decltype(df32)> du32;
2772
- using VU32 = VFromD<decltype(du32)>;
2773
- const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
2774
- // Avoid ZipLower/Upper so this also works on big-endian systems.
2775
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
2776
- const VU32 ao = And(BitCast(du32, a), odd);
2777
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
2778
- const VU32 bo = And(BitCast(du32, b), odd);
2779
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
2780
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
2781
- }
2782
-
2783
- template <class D, HWY_IF_I32_D(D), size_t N, class VI16>
2784
- HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, VI16 a, VI16 b,
2785
- const Vec128<int32_t, N> sum0,
2786
- Vec128<int32_t, N>& sum1) {
2787
- using VI32 = VFromD<decltype(d32)>;
2788
- // Manual sign extension requires two shifts for even lanes.
2789
- const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
2790
- const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
2791
- const VI32 ao = ShiftRight<16>(BitCast(d32, a));
2792
- const VI32 bo = ShiftRight<16>(BitCast(d32, b));
2793
- sum1 = Add(Mul(ao, bo), sum1);
2794
- return Add(Mul(ae, be), sum0);
2795
- }
2796
-
2797
- template <class D, HWY_IF_U32_D(D), size_t N, class VU16>
2798
- HWY_API VFromD<D> ReorderWidenMulAccumulate(D du32, VU16 a, VU16 b,
2799
- const Vec128<uint32_t, N> sum0,
2800
- Vec128<uint32_t, N>& sum1) {
2801
- using VU32 = VFromD<decltype(du32)>;
2802
- const VU32 lo16_mask = Set(du32, uint32_t{0x0000FFFFu});
2803
- const VU32 ae = And(BitCast(du32, a), lo16_mask);
2804
- const VU32 be = And(BitCast(du32, b), lo16_mask);
2805
- const VU32 ao = ShiftRight<16>(BitCast(du32, a));
2806
- const VU32 bo = ShiftRight<16>(BitCast(du32, b));
2807
- sum1 = Add(Mul(ao, bo), sum1);
2808
- return Add(Mul(ae, be), sum0);
2814
+ template <class D, HWY_IF_UI32_D(D), class V16>
2815
+ HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
2816
+ const VFromD<D> sum0,
2817
+ VFromD<D>& sum1) {
2818
+ sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
2819
+ return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
2809
2820
  }
2810
2821
 
2811
2822
  // ------------------------------ RearrangeToOddPlusEven
@@ -2866,18 +2877,20 @@ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
2866
2877
 
2867
2878
  // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
2868
2879
 
2869
- HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
2870
- alignas(16) uint64_t mul[2];
2880
+ template <class T, HWY_IF_UI64(T)>
2881
+ HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
2882
+ alignas(16) T mul[2];
2871
2883
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
2872
- return Load(Full128<uint64_t>(), mul);
2884
+ return Load(Full128<T>(), mul);
2873
2885
  }
2874
2886
 
2875
- HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
2876
- alignas(16) uint64_t mul[2];
2877
- const Half<Full128<uint64_t>> d2;
2887
+ template <class T, HWY_IF_UI64(T)>
2888
+ HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
2889
+ alignas(16) T mul[2];
2890
+ const Half<Full128<T>> d2;
2878
2891
  mul[0] =
2879
2892
  Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
2880
- return Load(Full128<uint64_t>(), mul);
2893
+ return Load(Full128<T>(), mul);
2881
2894
  }
2882
2895
 
2883
2896
  // NOLINTNEXTLINE(google-readability-namespace-comments)