@img/sharp-libvips-dev 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -54,6 +54,22 @@ namespace detail {
  #define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
  #endif
 
+ #undef HWY_AVX3_HAVE_F32_TO_BF16C
+ #if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL && \
+ (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \
+ !defined(HWY_AVX3_DISABLE_AVX512BF16)
+ #define HWY_AVX3_HAVE_F32_TO_BF16C 1
+ #else
+ #define HWY_AVX3_HAVE_F32_TO_BF16C 0
+ #endif
+
+ #undef HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT
+ #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
+ #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "v"
+ #else
+ #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "x"
+ #endif
+
  template <typename T>
  struct Raw128 {
  using type = __m128i;
@@ -228,9 +244,6 @@ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  template <class D>
  using VFromD = decltype(Zero(D()));
 
- // ------------------------------ Tuple (VFromD)
- #include "hwy/ops/tuple-inl.h"
-
  // ------------------------------ BitCast
 
  namespace detail {
@@ -242,6 +255,25 @@ HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); }
  HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
  HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
 
+ #if HWY_AVX3_HAVE_F32_TO_BF16C
+ HWY_INLINE __m128i BitCastToInteger(__m128bh v) {
+ // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
+ // bit cast a __m128bh to a __m128i as there is currently no intrinsic
+ // available (as of GCC 13 and Clang 17) that can bit cast a __m128bh vector
+ // to a __m128i vector
+
+ #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+ // On GCC or Clang, use reinterpret_cast to bit cast a __m128bh to a __m128i
+ return reinterpret_cast<__m128i>(v);
+ #else
+ // On MSVC, use BitCastScalar to bit cast a __m128bh to a __m128i as MSVC does
+ // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
+ // bit cast from one SSE/AVX vector type to a different SSE/AVX vector type
+ return BitCastScalar<__m128i>(v);
+ #endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+ }
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C
+
  template <typename T, size_t N>
  HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
@@ -502,6 +534,112 @@ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return VFromD<D>{_mm_setr_pd(t0, t1)};
  }
 
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ namespace detail {
+
+ template <class RawV>
+ static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+ hwy::SizeTag<1> /* num_of_lanes_tag*/, RawV v) {
+ return __builtin_constant_p(v[0]);
+ }
+
+ template <class RawV>
+ static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+ hwy::SizeTag<2> /* num_of_lanes_tag*/, RawV v) {
+ return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
+ }
+
+ template <class RawV>
+ static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+ hwy::SizeTag<4> /* num_of_lanes_tag*/, RawV v) {
+ return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+ __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
+ }
+
+ template <class RawV>
+ static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+ hwy::SizeTag<8> /* num_of_lanes_tag*/, RawV v) {
+ return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+ __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
+ __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
+ __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
+ }
+
+ template <class RawV>
+ static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+ hwy::SizeTag<16> /* num_of_lanes_tag*/, RawV v) {
+ return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+ __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
+ __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
+ __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
+ __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
+ __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
+ __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
+ __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
+ }
+
+ #if HWY_TARGET <= HWY_AVX2
+ template <class RawV>
+ static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
+ hwy::SizeTag<32> /* num_of_lanes_tag*/, RawV v) {
+ return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
+ __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
+ __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
+ __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
+ __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
+ __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
+ __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
+ __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]) &&
+ __builtin_constant_p(v[16]) && __builtin_constant_p(v[17]) &&
+ __builtin_constant_p(v[18]) && __builtin_constant_p(v[19]) &&
+ __builtin_constant_p(v[20]) && __builtin_constant_p(v[21]) &&
+ __builtin_constant_p(v[22]) && __builtin_constant_p(v[23]) &&
+ __builtin_constant_p(v[24]) && __builtin_constant_p(v[25]) &&
+ __builtin_constant_p(v[26]) && __builtin_constant_p(v[27]) &&
+ __builtin_constant_p(v[28]) && __builtin_constant_p(v[29]) &&
+ __builtin_constant_p(v[30]) && __builtin_constant_p(v[31]);
+ }
+ #endif
+
+ template <size_t kNumOfLanes, class V>
+ static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86Vec(
+ hwy::SizeTag<kNumOfLanes> num_of_lanes_tag, V v) {
+ using T = TFromV<V>;
+ #if HWY_HAVE_FLOAT16 && HWY_HAVE_SCALAR_F16_TYPE
+ using F16VecLaneT = hwy::float16_t::Native;
+ #else
+ using F16VecLaneT = uint16_t;
+ #endif
+ using RawVecLaneT = If<hwy::IsSame<T, hwy::float16_t>(), F16VecLaneT,
+ If<hwy::IsSame<T, hwy::bfloat16_t>(), uint16_t, T>>;
+
+ // Suppress the -Wignored-attributes warning that is emitted by
+ // RemoveCvRef<decltype(v.raw)> with GCC
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+ typedef RawVecLaneT GccRawVec
+ __attribute__((__vector_size__(sizeof(RemoveCvRef<decltype(v.raw)>))));
+ HWY_DIAGNOSTICS(pop)
+
+ return IsConstantRawX86Vec(num_of_lanes_tag,
+ reinterpret_cast<GccRawVec>(v.raw));
+ }
+
+ template <class TTo, class V>
+ static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86VecForF2IConv(V v) {
+ constexpr size_t kNumOfLanesInRawSrcVec =
+ HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TFromV<V>));
+ constexpr size_t kNumOfLanesInRawResultVec =
+ HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TTo));
+ constexpr size_t kNumOfLanesToCheck =
+ HWY_MIN(kNumOfLanesInRawSrcVec, kNumOfLanesInRawResultVec);
+
+ return IsConstantX86Vec(hwy::SizeTag<kNumOfLanesToCheck>(), v);
+ }
+
+ } // namespace detail
+ #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+
  // ================================================== LOGICAL
 
  // ------------------------------ And
@@ -587,7 +725,7 @@ HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
- #if HWY_TARGET <= HWY_AVX3
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  const __m128i vu = BitCast(du, v).raw;
  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
  #else
@@ -598,7 +736,7 @@ HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  // ------------------------------ Xor3
  template <typename T, size_t N>
  HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
- #if HWY_TARGET <= HWY_AVX3
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  const DFromV<decltype(x1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
@@ -613,7 +751,7 @@ HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  // ------------------------------ Or3
  template <typename T, size_t N>
  HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
- #if HWY_TARGET <= HWY_AVX3
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  const DFromV<decltype(o1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
@@ -628,7 +766,7 @@ HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  // ------------------------------ OrAnd
  template <typename T, size_t N>
  HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
- #if HWY_TARGET <= HWY_AVX3
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  const DFromV<decltype(o)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
@@ -644,7 +782,7 @@ HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  template <typename T, size_t N>
  HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
  Vec128<T, N> no) {
- #if HWY_TARGET <= HWY_AVX3
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  const DFromV<decltype(no)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
@@ -657,7 +795,7 @@ HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
  }
 
  // ------------------------------ BitwiseIfThenElse
- #if HWY_TARGET <= HWY_AVX3
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 
  #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
  #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
@@ -870,6 +1008,19 @@ HWY_API MFromD<D> MaskFalse(D /*d*/) {
  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)};
  }
 
+ // ------------------------------ IsNegative (MFromD)
+ #ifdef HWY_NATIVE_IS_NEGATIVE
+ #undef HWY_NATIVE_IS_NEGATIVE
+ #else
+ #define HWY_NATIVE_IS_NEGATIVE
+ #endif
+
+ // Generic for all vector lengths
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
+ HWY_API MFromD<DFromV<V>> IsNegative(V v) {
+ return MaskFromVec(v);
+ }
+
  // ------------------------------ PromoteMaskTo (MFromD)
 
  #ifdef HWY_NATIVE_PROMOTE_MASK_TO
@@ -1072,6 +1223,101 @@ HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
  MH{static_cast<RawMH>(a.raw)});
  }
 
+ // ------------------------------ Slide mask up/down
+ #ifdef HWY_NATIVE_SLIDE_MASK
+ #undef HWY_NATIVE_SLIDE_MASK
+ #else
+ #define HWY_NATIVE_SLIDE_MASK
+ #endif
+
+ template <class D, HWY_IF_LANES_LE_D(D, 8)>
+ HWY_API MFromD<D> SlideMask1Up(D d, MFromD<D> m) {
+ using RawM = decltype(MFromD<D>().raw);
+ constexpr size_t kN = MaxLanes(d);
+ constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
+
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ MFromD<D> result_mask{
+ static_cast<RawM>(_kshiftli_mask8(static_cast<__mmask8>(m.raw), 1))};
+
+ if (kN < 8) {
+ result_mask =
+ And(result_mask, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
+ }
+ #else
+ MFromD<D> result_mask{
+ static_cast<RawM>((static_cast<unsigned>(m.raw) << 1) & kValidLanesMask)};
+ #endif
+
+ return result_mask;
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 16)>
+ HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
+ using RawM = decltype(MFromD<D>().raw);
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return MFromD<D>{
+ static_cast<RawM>(_kshiftli_mask16(static_cast<__mmask16>(m.raw), 1))};
+ #else
+ return MFromD<D>{static_cast<RawM>(static_cast<unsigned>(m.raw) << 1)};
+ #endif
+ }
+
+ template <class D, HWY_IF_LANES_LE_D(D, 8)>
+ HWY_API MFromD<D> SlideMask1Down(D d, MFromD<D> m) {
+ using RawM = decltype(MFromD<D>().raw);
+ constexpr size_t kN = MaxLanes(d);
+ constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
+
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ if (kN < 8) {
+ m = And(m, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
+ }
+
+ return MFromD<D>{
+ static_cast<RawM>(_kshiftri_mask8(static_cast<__mmask8>(m.raw), 1))};
+ #else
+ return MFromD<D>{
+ static_cast<RawM>((static_cast<unsigned>(m.raw) & kValidLanesMask) >> 1)};
+ #endif
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 16)>
+ HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
+ using RawM = decltype(MFromD<D>().raw);
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return MFromD<D>{
+ static_cast<RawM>(_kshiftri_mask16(static_cast<__mmask16>(m.raw), 1))};
+ #else
+ return MFromD<D>{
+ static_cast<RawM>((static_cast<unsigned>(m.raw) & 0xFFFFu) >> 1)};
+ #endif
+ }
+
+ // Generic for all vector lengths
+ template <class D>
+ HWY_API MFromD<D> SlideMaskUpLanes(D d, MFromD<D> m, size_t amt) {
+ using RawM = decltype(MFromD<D>().raw);
+ constexpr size_t kN = MaxLanes(d);
+ constexpr uint64_t kValidLanesMask =
+ static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
+
+ return MFromD<D>{static_cast<RawM>(
+ (static_cast<uint64_t>(m.raw) << (amt & 63)) & kValidLanesMask)};
+ }
+
+ // Generic for all vector lengths
+ template <class D>
+ HWY_API MFromD<D> SlideMaskDownLanes(D d, MFromD<D> m, size_t amt) {
+ using RawM = decltype(MFromD<D>().raw);
+ constexpr size_t kN = MaxLanes(d);
+ constexpr uint64_t kValidLanesMask =
+ static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
+
+ return MFromD<D>{static_cast<RawM>(
+ (static_cast<uint64_t>(m.raw) & kValidLanesMask) >> (amt & 63))};
+ }
+
  // ------------------------------ VecFromMask
 
  template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
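A minimal, illustrative C++ sketch (not part of the package contents) of how the mask-slide ops added in the hunk above might be called, assuming Highway's usual static-dispatch setup via hwy/highway.h; the function name SlideMaskExample and the chosen lane type are hypothetical:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Shifts the set lanes of a mask up by one position; lane 0 becomes false.
void SlideMaskExample() {
  const hn::ScalableTag<int32_t> d;
  const auto v = hn::Iota(d, 0);                        // 0, 1, 2, ...
  const auto first_two = hn::Lt(v, hn::Set(d, 2));      // lanes 0..1 true
  const auto shifted = hn::SlideMask1Up(d, first_two);  // lanes 1..2 true
  (void)shifted;
}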
@@ -3660,6 +3906,12 @@ HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
  // ------------------------------ AddSub
 
  #if HWY_TARGET <= HWY_SSSE3
+
+ #undef HWY_IF_ADDSUB_V
+ #define HWY_IF_ADDSUB_V(V) \
+ HWY_IF_V_SIZE_GT_V( \
+ V, ((hwy::IsFloat3264<TFromV<V>>()) ? 32 : sizeof(TFromV<V>)))
+
  template <size_t N, HWY_IF_LANES_GT(N, 1)>
  HWY_API Vec128<float, N> AddSub(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{_mm_addsub_ps(a.raw, b.raw)};
@@ -3862,7 +4114,7 @@ HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
  return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
  }
 
- #if HWY_TARGET <= HWY_AVX3
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
  #undef HWY_NATIVE_I32_SATURATED_ADDSUB
  #else
@@ -3900,7 +4152,7 @@ HWY_API Vec128<int64_t, N> SaturatedAdd(Vec128<int64_t, N> a,
  i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
  return IfThenElse(overflow_mask, overflow_result, sum);
  }
- #endif // HWY_TARGET <= HWY_AVX3
+ #endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 
  // ------------------------------ SaturatedSub
 
@@ -3930,7 +4182,7 @@ HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
  return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
  }
 
- #if HWY_TARGET <= HWY_AVX3
+ #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
  template <size_t N>
  HWY_API Vec128<int32_t, N> SaturatedSub(Vec128<int32_t, N> a,
  Vec128<int32_t, N> b) {
@@ -3956,7 +4208,7 @@ HWY_API Vec128<int64_t, N> SaturatedSub(Vec128<int64_t, N> a,
  i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
  return IfThenElse(overflow_mask, overflow_result, diff);
  }
- #endif // HWY_TARGET <= HWY_AVX3
+ #endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
 
  // ------------------------------ AverageRound
 
@@ -3987,7 +4239,7 @@ HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
  return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
  }
 
- // Returns the upper 16 bits of a * b in each lane.
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
  template <size_t N>
  HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
  const Vec128<uint16_t, N> b) {
@@ -3999,6 +4251,26 @@ HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
  return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
  }
 
+ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)),
+ HWY_IF_LANES_D(DFromV<V>, 1)>
+ HWY_API V MulHigh(V a, V b) {
+ const DFromV<decltype(a)> d;
+ const Full128<TFromD<decltype(d)>> d_full;
+ return ResizeBitCast(
+ d, Slide1Down(d_full, ResizeBitCast(d_full, MulEven(a, b))));
+ }
+
+ // I8/U8/I32/U32 MulHigh is generic for all vector lengths >= 2 lanes
+ template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)),
+ HWY_IF_LANES_GT_D(DFromV<V>, 1)>
+ HWY_API V MulHigh(V a, V b) {
+ const DFromV<decltype(a)> d;
+
+ const auto p_even = BitCast(d, MulEven(a, b));
+ const auto p_odd = BitCast(d, MulOdd(a, b));
+ return InterleaveOdd(d, p_even, p_odd);
+ }
+
  // Multiplies even lanes (0, 2 ..) and places the double-wide result into
  // even and the upper half into its odd neighbor lane.
  template <class V, HWY_IF_U8_D(DFromV<V>)>
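An illustrative sketch (not part of the package contents) of the widened MulHigh added in the hunk above, which now also covers 8- and 32-bit lanes; it assumes Highway's static-dispatch setup and the hypothetical function name MulHighExample:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Upper half of a 32x32 -> 64-bit product, per lane.
void MulHighExample() {
  const hn::ScalableTag<uint32_t> d;
  const auto a = hn::Set(d, 0x80000000u);
  const auto b = hn::Set(d, 4u);
  const auto hi = hn::MulHigh(a, b);  // (0x80000000 * 4) >> 32 == 2 per lane
  (void)hi;
}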
@@ -4126,15 +4398,29 @@ HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
 
  // ------------------------------ RotateRight (ShiftRight, Or)
 
- template <int kBits, typename T, size_t N,
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
- HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
- constexpr size_t kSizeInBits = sizeof(T) * 8;
- static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+ // U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8
+ // RotateRight uses detail::GaloisAffine on AVX3_DL
+
+ #if HWY_TARGET > HWY_AVX3_DL
+ template <int kBits, size_t N>
+ HWY_API Vec128<uint8_t, N> RotateRight(const Vec128<uint8_t, N> v) {
+ static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+ if (kBits == 0) return v;
+ // AVX3 does not support 8-bit.
+ return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
+ }
+ #endif
+
+ template <int kBits, size_t N>
+ HWY_API Vec128<uint16_t, N> RotateRight(const Vec128<uint16_t, N> v) {
+ static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
  if (kBits == 0) return v;
- // AVX3 does not support 8/16-bit.
- return Or(ShiftRight<kBits>(v),
- ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
+ #if HWY_TARGET <= HWY_AVX3_DL
+ return Vec128<uint16_t, N>{_mm_shrdi_epi16(v.raw, v.raw, kBits)};
+ #else
+ // AVX3 does not support 16-bit.
+ return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
+ #endif
  }
 
  template <int kBits, size_t N>
@@ -4159,6 +4445,116 @@ HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
  #endif
  }
 
+ // I8/I16/I32/I64 RotateRight is generic for all vector lengths
+ template <int kBits, class V, HWY_IF_SIGNED_V(V)>
+ HWY_API V RotateRight(V v) {
+ const DFromV<decltype(v)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, RotateRight<kBits>(BitCast(du, v)));
+ }
+
+ // ------------------------------ Rol/Ror
+ #if HWY_TARGET <= HWY_AVX3_DL
+ #ifdef HWY_NATIVE_ROL_ROR_16
+ #undef HWY_NATIVE_ROL_ROR_16
+ #else
+ #define HWY_NATIVE_ROL_ROR_16
+ #endif
+
+ template <class T, size_t N, HWY_IF_UI16(T)>
+ HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+ return Vec128<T, N>{_mm_shrdv_epi16(a.raw, a.raw, b.raw)};
+ }
+
+ // U16/I16 Rol is generic for all vector lengths on AVX3_DL
+ template <class V, HWY_IF_UI16(TFromV<V>)>
+ HWY_API V Rol(V a, V b) {
+ const DFromV<decltype(a)> d;
+ const RebindToSigned<decltype(d)> di;
+ return Ror(a, BitCast(d, Neg(BitCast(di, b))));
+ }
+
+ #endif // HWY_TARGET <= HWY_AVX3_DL
+
+ #if HWY_TARGET <= HWY_AVX3
+
+ #ifdef HWY_NATIVE_ROL_ROR_32_64
+ #undef HWY_NATIVE_ROL_ROR_32_64
+ #else
+ #define HWY_NATIVE_ROL_ROR_32_64
+ #endif
+
+ template <class T, size_t N, HWY_IF_UI32(T)>
+ HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
+ return Vec128<T, N>{_mm_rolv_epi32(a.raw, b.raw)};
+ }
+
+ template <class T, size_t N, HWY_IF_UI32(T)>
+ HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+ return Vec128<T, N>{_mm_rorv_epi32(a.raw, b.raw)};
+ }
+
+ template <class T, size_t N, HWY_IF_UI64(T)>
+ HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
+ return Vec128<T, N>{_mm_rolv_epi64(a.raw, b.raw)};
+ }
+
+ template <class T, size_t N, HWY_IF_UI64(T)>
+ HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+ return Vec128<T, N>{_mm_rorv_epi64(a.raw, b.raw)};
+ }
+
+ #endif
+
+ // ------------------------------ RotateLeftSame/RotateRightSame
+
+ #if HWY_TARGET <= HWY_AVX3_DL
+
+ #ifdef HWY_NATIVE_ROL_ROR_SAME_16
+ #undef HWY_NATIVE_ROL_ROR_SAME_16
+ #else
+ #define HWY_NATIVE_ROL_ROR_SAME_16
+ #endif
+
+ // Generic for all vector lengths
+ template <class V, HWY_IF_UI16(TFromV<V>)>
+ HWY_API V RotateLeftSame(V v, int bits) {
+ const DFromV<decltype(v)> d;
+ return Ror(v,
+ Set(d, static_cast<TFromV<V>>(0u - static_cast<unsigned>(bits))));
+ }
+
+ template <class V, HWY_IF_UI16(TFromV<V>)>
+ HWY_API V RotateRightSame(V v, int bits) {
+ const DFromV<decltype(v)> d;
+ return Ror(v, Set(d, static_cast<TFromV<V>>(bits)));
+ }
+ #endif // HWY_TARGET <= HWY_AVX3_DL
+
+ #if HWY_TARGET <= HWY_AVX3
+
+ #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
+ #undef HWY_NATIVE_ROL_ROR_SAME_32_64
+ #else
+ #define HWY_NATIVE_ROL_ROR_SAME_32_64
+ #endif
+
+ // Generic for all vector lengths
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
+ HWY_API V RotateLeftSame(V v, int bits) {
+ const DFromV<decltype(v)> d;
+ return Rol(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
+ }
+
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
+ HWY_API V RotateRightSame(V v, int bits) {
+ const DFromV<decltype(v)> d;
+ return Ror(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
+ }
+ #endif // HWY_TARGET <= HWY_AVX3
+
  // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
 
  template <size_t N>
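An illustrative sketch (not part of the package contents) of the variable-rotate ops added in the hunk above; it assumes a bundled Highway version that provides Rol and RotateLeftSame for 32-bit lanes on the statically selected target, and the function name RotateExample is hypothetical:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Each lane of v is rotated left by the amount in the corresponding lane of r
// (AVX3 targets map this to _mm_rolv_epi32 per the hunk above).
void RotateExample() {
  const hn::ScalableTag<uint32_t> d;
  const auto v = hn::Set(d, 0x80000001u);
  const auto r = hn::Set(d, 1u);
  const auto rolled = hn::Rol(v, r);           // 0x00000003 in every lane
  const auto same = hn::RotateLeftSame(v, 1);  // same result, scalar amount
  (void)rolled;
  (void)same;
}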
@@ -4312,20 +4708,6 @@ HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
  #endif
  }
 
- // ------------------------------ ZeroIfNegative (BroadcastSignBit)
- template <typename T, size_t N>
- HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
- static_assert(IsFloat<T>(), "Only works for float");
- const DFromV<decltype(v)> d;
- #if HWY_TARGET >= HWY_SSSE3
- const RebindToSigned<decltype(d)> di;
- const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
- #else
- const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS
- #endif
- return IfThenElse(mask, Zero(d), v);
- }
-
  // ------------------------------ IfNegativeThenElse
  template <size_t N>
  HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v,
@@ -4389,6 +4771,48 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
  #endif
  }
 
+ #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
+
+ #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+ #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+ #else
+ #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
+ #endif
+
+ #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+ #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+ #else
+ #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
+ #endif
+
+ // SSE4/AVX2 IfNegativeThenElseZero/IfNegativeThenZeroElse is generic for all
+ // vector lengths
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V),
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))>
+ HWY_API V IfNegativeThenElseZero(V v, V yes) {
+ const DFromV<decltype(v)> d;
+ return IfNegativeThenElse(v, yes, Zero(d));
+ }
+
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
+ HWY_API V IfNegativeThenElseZero(V v, V yes) {
+ return IfThenElseZero(IsNegative(v), yes);
+ }
+
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V),
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))>
+ HWY_API V IfNegativeThenZeroElse(V v, V no) {
+ const DFromV<decltype(v)> d;
+ return IfNegativeThenElse(v, Zero(d), no);
+ }
+
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
+ HWY_API V IfNegativeThenZeroElse(V v, V no) {
+ return IfThenZeroElse(IsNegative(v), no);
+ }
+
+ #endif // HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
+
  // ------------------------------ IfNegativeThenNegOrUndefIfZero
 
  #if HWY_TARGET <= HWY_SSSE3
@@ -5157,6 +5581,14 @@ HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x,
 
  #if HWY_TARGET <= HWY_SSSE3
 
+ #undef HWY_IF_MULADDSUB_V
+ #define HWY_IF_MULADDSUB_V(V) \
+ HWY_IF_LANES_GT_D(DFromV<V>, 1), \
+ HWY_IF_T_SIZE_ONE_OF_V( \
+ V, (1 << 1) | ((hwy::IsFloat<TFromV<V>>()) \
+ ? 0 \
+ : ((1 << 2) | (1 << 4) | (1 << 8))))
+
  #if HWY_HAVE_FLOAT16
  template <size_t N, HWY_IF_LANES_GT(N, 1)>
  HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul,
@@ -5671,20 +6103,14 @@ HWY_INLINE Vec128<double, N> NativeMaskedGatherOr128(
  } // namespace detail
 
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
- HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+ HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
  VFromD<RebindToSigned<D>> offsets) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
  return detail::NativeGather128<1>(base, offsets);
  }
 
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
- HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
+ HWY_API VFromD<D> GatherIndex(D /*d*/, const T* HWY_RESTRICT base,
  VFromD<RebindToSigned<D>> indices) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
  return detail::NativeGather128<sizeof(T)>(base, indices);
  }
 
@@ -5695,9 +6121,6 @@ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
  // For partial vectors, ensure upper mask lanes are zero to prevent faults.
  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
 
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
  return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices);
  }
 
@@ -6816,37 +7239,258 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
  }
 
- // ------------------------------ Per4LaneBlockShuffle
- namespace detail {
+ // ================================================== CONVERT (1)
 
- #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
- #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
+ // ------------------------------ PromoteTo unsigned (TableLookupBytesOr0)
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ const __m128i zero = _mm_setzero_si128();
+ return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)};
  #else
- #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
+ return VFromD<D>{_mm_cvtepu8_epi16(v.raw)};
  #endif
-
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
- HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
- const uint32_t x2,
- const uint32_t x1,
- const uint32_t x0) {
- return ResizeBitCast(
- d, Vec128<uint32_t>{_mm_set_epi32(
- static_cast<int32_t>(x3), static_cast<int32_t>(x2),
- static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
  }
-
- template <size_t kIdx3210, class V>
- HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
- hwy::SizeTag<2> /*lane_size_tag*/,
- hwy::SizeTag<8> /*vect_size_tag*/, V v) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du; // for float16_t
- return BitCast(d,
- VFromD<decltype(du)>{_mm_shufflelo_epi16(
- BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))});
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
+ #else
+ return VFromD<D>{_mm_cvtepu16_epi32(v.raw)};
+ #endif
  }
-
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
+ #else
+ return VFromD<D>{_mm_cvtepu32_epi64(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
+ return VFromD<D>{_mm_unpacklo_epi16(u16, zero)};
+ #else
+ return VFromD<D>{_mm_cvtepu8_epi32(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
+ #if HWY_TARGET > HWY_SSSE3
+ const Rebind<uint32_t, decltype(d)> du32;
+ return PromoteTo(d, PromoteTo(du32, v));
+ #elif HWY_TARGET == HWY_SSSE3
+ alignas(16) static constexpr int8_t kShuffle[16] = {
+ 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1};
+ const Repartition<int8_t, decltype(d)> di8;
+ return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
+ #else
+ (void)d;
+ return VFromD<D>{_mm_cvtepu8_epi64(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
+ #if HWY_TARGET > HWY_SSSE3
+ const Rebind<uint32_t, decltype(d)> du32;
+ return PromoteTo(d, PromoteTo(du32, v));
+ #elif HWY_TARGET == HWY_SSSE3
+ alignas(16) static constexpr int8_t kShuffle[16] = {
+ 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1};
+ const Repartition<int8_t, decltype(d)> di8;
+ return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
+ #else
+ (void)d;
+ return VFromD<D>{_mm_cvtepu16_epi64(v.raw)};
+ #endif
+ }
+
+ // Unsigned to signed: same plus cast.
+ template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
+ HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)),
+ HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))>
+ HWY_API VFromD<D> PromoteTo(D di, V v) {
+ const RebindToUnsigned<decltype(di)> du;
+ return BitCast(di, PromoteTo(du, v));
+ }
+
+ // ------------------------------ PromoteTo signed (ShiftRight, ZipLower)
+
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)});
+ #else
+ return VFromD<D>{_mm_cvtepi8_epi16(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)});
+ #else
+ return VFromD<D>{_mm_cvtepi16_epi32(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)});
+ #else
+ return VFromD<D>{_mm_cvtepi32_epi64(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
+ const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
+ return ShiftRight<24>(VFromD<D>{x4});
+ #else
+ return VFromD<D>{_mm_cvtepi8_epi32(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ const Repartition<int32_t, decltype(d)> di32;
+ const Half<decltype(di32)> dh_i32;
+ const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw};
+ const VFromD<decltype(di32)> s4{
+ _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))};
+ return ZipLower(d, x4, s4);
+ #else
+ (void)d;
+ return VFromD<D>{_mm_cvtepi8_epi64(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) {
+ #if HWY_TARGET >= HWY_SSSE3
+ const Repartition<int32_t, decltype(d)> di32;
+ const Half<decltype(di32)> dh_i32;
+ const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw};
+ const VFromD<decltype(di32)> s2{
+ _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))};
+ return ZipLower(d, x2, s2);
+ #else
+ (void)d;
+ return VFromD<D>{_mm_cvtepi16_epi64(v.raw)};
+ #endif
+ }
+
+ // -------------------- PromoteTo float (ShiftLeft, IfNegativeThenElse)
+ #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
+
+ // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
+ #ifdef HWY_NATIVE_F16C
+ #undef HWY_NATIVE_F16C
+ #else
+ #define HWY_NATIVE_F16C
+ #endif
+
+ // Workaround for origin tracking bug in Clang msan prior to 11.0
+ // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
+ #if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
+ #define HWY_INLINE_F16 HWY_NOINLINE
+ #else
+ #define HWY_INLINE_F16 HWY_INLINE
+ #endif
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
+ HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
+ #if HWY_HAVE_FLOAT16
+ const RebindToUnsigned<DFromV<decltype(v)>> du16;
+ return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)};
+ #else
+ return VFromD<D>{_mm_cvtph_ps(v.raw)};
+ #endif
+ }
+
+ #endif // HWY_NATIVE_F16C
+
+ #if HWY_HAVE_FLOAT16
+
+ #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
+ #undef HWY_NATIVE_PROMOTE_F16_TO_F64
+ #else
+ #define HWY_NATIVE_PROMOTE_F16_TO_F64
+ #endif
+
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+ HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
+ return VFromD<D>{_mm_cvtph_pd(v.raw)};
+ }
+
+ #endif // HWY_HAVE_FLOAT16
+
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
+ HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
+ const Rebind<uint16_t, decltype(df32)> du16;
+ const RebindToSigned<decltype(df32)> di32;
+ return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
+ }
+
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
+ return VFromD<D>{_mm_cvtps_pd(v.raw)};
+ }
+
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
+ return VFromD<D>{_mm_cvtepi32_pd(v.raw)};
+ }
+
+ #if HWY_TARGET <= HWY_AVX3
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D /*df64*/, VFromD<Rebind<uint32_t, D>> v) {
+ return VFromD<D>{_mm_cvtepu32_pd(v.raw)};
+ }
+ #else
+ // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
+ template <class D, HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
+ const Rebind<int32_t, decltype(df64)> di32;
+ const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v));
+ return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result,
+ Set(df64, 4294967296.0),
+ Zero(df64));
+ }
+ #endif // HWY_TARGET <= HWY_AVX3
+
+ // ------------------------------ Per4LaneBlockShuffle
+ namespace detail {
+
+ #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
+ #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
+ #else
+ #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
+ #endif
+
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
+ HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
+ const uint32_t x2,
+ const uint32_t x1,
+ const uint32_t x0) {
+ return ResizeBitCast(
+ d, Vec128<uint32_t>{_mm_set_epi32(
+ static_cast<int32_t>(x3), static_cast<int32_t>(x2),
+ static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
+ }
+
+ template <size_t kIdx3210, class V>
+ HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
+ hwy::SizeTag<2> /*lane_size_tag*/,
+ hwy::SizeTag<8> /*vect_size_tag*/, V v) {
+ const DFromV<decltype(v)> d;
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
+ return BitCast(d,
+ VFromD<decltype(du)>{_mm_shufflelo_epi16(
+ BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))});
+ }
+
  #if HWY_TARGET == HWY_SSE2
  template <size_t kIdx3210, class V>
  HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
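An illustrative sketch (not part of the package contents) of the u32 to f64 promotion added in the hunk above: on targets without a native conversion, the value is promoted via i32 and then 2^32 is added wherever the reinterpreted i32 was negative, i.e. wherever bit 31 of the original u32 was set. The setup and the function name PromoteU32ToF64Example are assumptions:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Promote unsigned 32-bit lanes to double without losing values >= 2^31.
void PromoteU32ToF64Example() {
  const hn::ScalableTag<double> df64;
  const hn::Rebind<uint32_t, decltype(df64)> du32;
  const auto v = hn::Set(du32, 0xFFFFFFFFu);
  const auto f = hn::PromoteTo(df64, v);  // 4294967295.0 in every lane
  (void)f;
}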
@@ -8122,27 +8766,116 @@ HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
  #endif
  }
 
- // ------------------------------ OddEvenBlocks
- template <typename T, size_t N>
- HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
- return even;
+ // -------------------------- InterleaveEven
+
+ template <class D, HWY_IF_LANES_LE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+ return ConcatEven(d, b, a);
  }
 
- // ------------------------------ SwapAdjacentBlocks
+ // I8/U8 InterleaveEven is generic for all vector lengths that are >= 4 bytes
+ template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+ const Repartition<uint16_t, decltype(d)> du16;
+ return OddEven(BitCast(d, ShiftLeft<8>(BitCast(du16, b))), a);
+ }
 
- template <typename T, size_t N>
- HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
- return v;
+ // I16/U16 InterleaveEven is generic for all vector lengths that are >= 8 bytes
+ template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+ const Repartition<uint32_t, decltype(d)> du32;
+ return OddEven(BitCast(d, ShiftLeft<16>(BitCast(du32, b))), a);
  }
 
- // ------------------------------ Shl (ZipLower, Mul)
+ #if HWY_TARGET <= HWY_AVX3
+ template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return VFromD<D>{_mm_mask_shuffle_epi32(
+ a.raw, static_cast<__mmask8>(0x0A), b.raw,
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
+ }
+ template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return VFromD<D>{_mm_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0x0A),
+ b.raw, b.raw, _MM_SHUFFLE(2, 2, 0, 0))};
+ }
+ #else
+ template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
+ HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToFloat<decltype(d)> df;
+ const auto b2_b0_a2_a0 = ConcatEven(df, BitCast(df, b), BitCast(df, a));
+ return BitCast(
+ d, VFromD<decltype(df)>{_mm_shuffle_ps(b2_b0_a2_a0.raw, b2_b0_a2_a0.raw,
+ _MM_SHUFFLE(3, 1, 2, 0))});
+ }
+ #endif
 
- // Use AVX2/3 variable shifts where available, otherwise multiply by powers of
- // two from loading float exponents, which is considerably faster (according
- // to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
+ // -------------------------- InterleaveOdd
 
- namespace detail {
- #if HWY_TARGET == HWY_AVX2 // Unused for AVX3 - we use sllv directly
+ template <class D, HWY_IF_LANES_LE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+ return ConcatOdd(d, b, a);
+ }
+
+ // I8/U8 InterleaveOdd is generic for all vector lengths that are >= 4 bytes
+ template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+ const Repartition<uint16_t, decltype(d)> du16;
+ return OddEven(b, BitCast(d, ShiftRight<8>(BitCast(du16, a))));
+ }
+
+ // I16/U16 InterleaveOdd is generic for all vector lengths that are >= 8 bytes
+ template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+ const Repartition<uint32_t, decltype(d)> du32;
+ return OddEven(b, BitCast(d, ShiftRight<16>(BitCast(du32, a))));
+ }
+
+ #if HWY_TARGET <= HWY_AVX3
+ template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return VFromD<D>{_mm_mask_shuffle_epi32(
+ b.raw, static_cast<__mmask8>(0x05), a.raw,
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
+ }
+ template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return VFromD<D>{_mm_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x05),
+ a.raw, a.raw, _MM_SHUFFLE(3, 3, 1, 1))};
+ }
+ #else
+ template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
+ HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+ const RebindToFloat<decltype(d)> df;
+ const auto b3_b1_a3_a1 = ConcatOdd(df, BitCast(df, b), BitCast(df, a));
+ return BitCast(
+ d, VFromD<decltype(df)>{_mm_shuffle_ps(b3_b1_a3_a1.raw, b3_b1_a3_a1.raw,
+ _MM_SHUFFLE(3, 1, 2, 0))});
+ }
+ #endif
+
+ // ------------------------------ OddEvenBlocks
+ template <typename T, size_t N>
+ HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
+ return even;
+ }
+
+ // ------------------------------ SwapAdjacentBlocks
+
+ template <typename T, size_t N>
+ HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
+ return v;
+ }
+
+ // ------------------------------ Shl (ZipLower, Mul)
+
+ // Use AVX2/3 variable shifts where available, otherwise multiply by powers of
+ // two from loading float exponents, which is considerably faster (according
+ // to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
+
+ namespace detail {
+
+ #if HWY_TARGET == HWY_AVX2 // Unused for AVX3 - we use sllv directly
  template <class V>
  HWY_API V AVX2ShlU16Vec128(V v, V bits) {
  const DFromV<decltype(v)> d;
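An illustrative sketch (not part of the package contents) of the InterleaveEven/InterleaveOdd ops introduced in the hunk above, again assuming Highway's static-dispatch setup; InterleaveExample is a hypothetical name:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// InterleaveEven keeps the even-indexed lanes of a and b:
// result = {a[0], b[0], a[2], b[2], ...}; InterleaveOdd does the same for
// the odd-indexed lanes.
void InterleaveExample() {
  const hn::ScalableTag<uint32_t> d;
  const auto a = hn::Iota(d, 0);    // 0 1 2 3 ...
  const auto b = hn::Iota(d, 100);  // 100 101 102 103 ...
  const auto even = hn::InterleaveEven(d, a, b);  // 0 100 2 102 ...
  const auto odd = hn::InterleaveOdd(d, a, b);    // 1 101 3 103 ...
  (void)even;
  (void)odd;
}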
@@ -8150,6 +8883,22 @@ HWY_API V AVX2ShlU16Vec128(V v, V bits) {
  return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits));
  }
  #elif HWY_TARGET > HWY_AVX2
+
+ template <class D32>
+ static HWY_INLINE VFromD<D32> Pow2ConvF32ToI32(
+ D32 d32, VFromD<RebindToFloat<D32>> vf32) {
+ const RebindToSigned<decltype(d32)> di32;
+ #if HWY_COMPILER_GCC_ACTUAL
+ // ConvertInRangeTo is safe with GCC due the inline assembly workaround used
+ // for F32->I32 ConvertInRangeTo with GCC
+ return BitCast(d32, ConvertInRangeTo(di32, vf32));
+ #else
+ // Otherwise, use NearestIntInRange because we rely on the native 0x80..00
+ // overflow behavior
+ return BitCast(d32, NearestIntInRange(di32, vf32));
+ #endif
+ }
+
  // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
  template <typename T, HWY_IF_T_SIZE(T, 2)>
  HWY_INLINE Vec128<MakeUnsigned<T>> Pow2(const Vec128<T> v) {
@@ -8165,8 +8914,8 @@ HWY_INLINE Vec128<MakeUnsigned<T>> Pow2(const Vec128<T> v) {
  const auto f0 = ZipLower(dw, zero, upper);
  const auto f1 = ZipUpper(dw, zero, upper);
  // See cvtps comment below.
- const VFromD<decltype(dw)> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
- const VFromD<decltype(dw)> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
+ const VFromD<decltype(dw)> bits0 = Pow2ConvF32ToI32(dw, BitCast(df, f0));
+ const VFromD<decltype(dw)> bits1 = Pow2ConvF32ToI32(dw, BitCast(df, f1));
  #if HWY_TARGET <= HWY_SSE4
  return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits1.raw)};
  #else
@@ -8187,7 +8936,8 @@ HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
  // Insert 0 into lower halves for reinterpreting as binary32.
  const auto f0 = ZipLower(dt_w, Zero(dt_u), ResizeBitCast(dt_u, upper));
  // See cvtps comment below.
- const VFromD<decltype(dt_w)> bits0{_mm_cvtps_epi32(BitCast(dt_f, f0).raw)};
+ const VFromD<decltype(dt_w)> bits0 =
+ Pow2ConvF32ToI32(dt_w, BitCast(dt_f, f0));
  #if HWY_TARGET <= HWY_SSE4
  return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits0.raw)};
  #elif HWY_TARGET == HWY_SSSE3
@@ -8205,11 +8955,12 @@ HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
  template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
+ const RebindToFloat<decltype(d)> df;
  const auto exp = ShiftLeft<23>(v);
  const auto f = exp + Set(d, 0x3F800000); // 1.0f
  // Do not use ConvertTo because we rely on the native 0x80..00 overflow
  // behavior.
- return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
+ return Pow2ConvF32ToI32(d, BitCast(df, f));
  }
 
  #endif // HWY_TARGET > HWY_AVX2
@@ -8689,42 +9440,161 @@ HWY_API Vec128<int64_t, N> operator>>(Vec128<int64_t, N> v,
8689
9440
 
8690
9441
  // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
8691
9442
 
8692
- HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
9443
+ namespace detail {
9444
+
9445
+ template <class V, HWY_IF_U64(TFromV<V>)>
9446
+ static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) {
9447
+ const DFromV<decltype(a)> du64;
9448
+ const RepartitionToNarrow<decltype(du64)> du32;
9449
+ const auto maskL = Set(du64, 0xFFFFFFFFULL);
9450
+ const auto a32 = BitCast(du32, a);
9451
+ const auto b32 = BitCast(du32, b);
9452
+ // Inputs for MulEven: we only need the lower 32 bits
9453
+ const auto aH = Shuffle2301(a32);
9454
+ const auto bH = Shuffle2301(b32);
9455
+
9456
+ // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
9457
+ // the even (lower 64 bits of every 128-bit block) results. See
9458
+ // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
9459
+ const auto aLbL = MulEven(a32, b32);
9460
+ const auto w3 = aLbL & maskL;
9461
+
9462
+ const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
9463
+ const auto w2 = t2 & maskL;
9464
+ const auto w1 = ShiftRight<32>(t2);
9465
+
9466
+ const auto t = MulEven(a32, bH) + w2;
9467
+ const auto k = ShiftRight<32>(t);
9468
+
9469
+ mulH = MulEven(aH, bH) + w1 + k;
9470
+ return ShiftLeft<32>(t) + w3;
9471
+ }
9472
+
9473
+ template <class V, HWY_IF_I64(TFromV<V>)>
9474
+ static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) {
9475
+ const DFromV<decltype(a)> di64;
9476
+ const RebindToUnsigned<decltype(di64)> du64;
9477
+ using VU64 = VFromD<decltype(du64)>;
9478
+
9479
+ VU64 unsigned_mulH;
9480
+ const auto mulL = BitCast(
9481
+ di64, SSE2Mul128(BitCast(du64, a), BitCast(du64, b), unsigned_mulH));
9482
+ mulH = BitCast(di64, unsigned_mulH) - And(BroadcastSignBit(a), b) -
9483
+ And(a, BroadcastSignBit(b));
9484
+ return mulL;
9485
+ }
9486
+
9487
+ } // namespace detail
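For reference, the Knuth double-word multiplication used by SSE2Mul128 above can be written as a minimal scalar sketch (hypothetical function name, not part of Highway): split each 64-bit operand into 32-bit halves, form the four partial products, and propagate carries to obtain the low and high 64 bits, as in the muldwu routine referenced in the comment.

#include <cassert>
#include <cstdint>

// Returns the low 64 bits of a*b and stores the high 64 bits in *hi.
static uint64_t Mul64To128(uint64_t a, uint64_t b, uint64_t* hi) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;
  const uint64_t aLbL = aL * bL;                    // low x low
  const uint64_t t2 = aH * bL + (aLbL >> 32);       // high x low plus carry
  const uint64_t t = aL * bH + (t2 & 0xFFFFFFFFu);  // low x high plus middle
  *hi = aH * bH + (t2 >> 32) + (t >> 32);
  return (t << 32) + (aLbL & 0xFFFFFFFFu);
}

int main() {
  uint64_t hi;
  // (2^64 - 1)^2 = 2^128 - 2^65 + 1, i.e. hi = 2^64 - 2 and lo = 1.
  const uint64_t lo = Mul64To128(~uint64_t{0}, ~uint64_t{0}, &hi);
  assert(lo == 1u && hi == ~uint64_t{0} - 1u);
  return 0;
}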
9488
+
9489
+ #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
9490
+
9491
+ template <class V, HWY_IF_UI64(TFromV<V>),
9492
+ HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
9493
+ HWY_API V MulEven(V a, V b) {
9494
+ V mulH;
9495
+ const V mulL = detail::SSE2Mul128(a, b, mulH);
9496
+ return InterleaveLower(mulL, mulH);
9497
+ }
9498
+
9499
+ template <class V, HWY_IF_UI64(TFromV<V>),
9500
+ HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
9501
+ HWY_API V MulOdd(V a, V b) {
9502
+ const DFromV<decltype(a)> du64;
9503
+ V mulH;
9504
+ const V mulL = detail::SSE2Mul128(a, b, mulH);
9505
+ return InterleaveUpper(du64, mulL, mulH);
9506
+ }
9507
+
9508
+ #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
9509
+
9510
+ template <class V, HWY_IF_UI64(TFromV<V>),
9511
+ HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 8 : 0))>
9512
+ HWY_API V MulHigh(V a, V b) {
9513
+ V mulH;
9514
+ detail::SSE2Mul128(a, b, mulH);
9515
+ return mulH;
9516
+ }
9517
+
9518
+ #if HWY_ARCH_X86_64
9519
+
9520
+ template <class T, HWY_IF_UI64(T)>
9521
+ HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
8693
9522
  const DFromV<decltype(a)> d;
8694
- alignas(16) uint64_t mul[2];
9523
+ alignas(16) T mul[2];
8695
9524
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
8696
9525
  return Load(d, mul);
8697
9526
  }
8698
9527
 
8699
- HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
9528
+ template <class T, HWY_IF_UI64(T)>
9529
+ HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
8700
9530
  const DFromV<decltype(a)> d;
8701
9531
  const Half<decltype(d)> d2;
8702
- alignas(16) uint64_t mul[2];
8703
- const uint64_t a1 = GetLane(UpperHalf(d2, a));
8704
- const uint64_t b1 = GetLane(UpperHalf(d2, b));
9532
+ alignas(16) T mul[2];
9533
+ const T a1 = GetLane(UpperHalf(d2, a));
9534
+ const T b1 = GetLane(UpperHalf(d2, b));
8705
9535
  mul[0] = Mul128(a1, b1, &mul[1]);
8706
9536
  return Load(d, mul);
8707
9537
  }
8708
9538
 
8709
- // ------------------------------ WidenMulPairwiseAdd
9539
+ template <class T, HWY_IF_UI64(T)>
9540
+ HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
9541
+ T hi;
9542
+ Mul128(GetLane(a), GetLane(b), &hi);
9543
+ return Vec64<T>{_mm_cvtsi64_si128(static_cast<int64_t>(hi))};
9544
+ }
9545
+
9546
+ #endif // HWY_ARCH_X86_64
9547
+
9548
+ // ================================================== CONVERT (2)
9549
+
9550
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
9551
+
9552
+ #if HWY_TARGET > HWY_AVX3
9553
+ namespace detail {
9554
+
9555
+ // I32->I64 PromoteEvenTo/PromoteOddTo
9556
+
9557
+ template <class D, HWY_IF_LANES_D(D, 1)>
9558
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
9559
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
9560
+ hwy::SignedTag /*from_type_tag*/, D d_to,
9561
+ Vec64<int32_t> v) {
9562
+ return PromoteLowerTo(d_to, v);
9563
+ }
9564
+
9565
+ template <class D, HWY_IF_LANES_D(D, 2)>
9566
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
9567
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
9568
+ hwy::SignedTag /*from_type_tag*/, D d_to,
9569
+ Vec128<int32_t> v) {
9570
+ const Repartition<int32_t, D> d_from;
9571
+ return PromoteLowerTo(d_to, ConcatEven(d_from, v, v));
9572
+ }
9573
+
9574
+ template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
9575
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
9576
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
9577
+ hwy::SignedTag /*from_type_tag*/, D d_to,
9578
+ V v) {
9579
+ const Repartition<int32_t, D> d_from;
9580
+ return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v));
9581
+ }
9582
+
9583
+ } // namespace detail
9584
+ #endif
9585
+
9586
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
9587
+ #include "hwy/ops/inside-inl.h"
9588
+
9589
+ // ------------------------------ WidenMulPairwiseAdd (PromoteEvenTo)
8710
9590
 
8711
9591
  // Generic for all vector lengths.
8712
- template <class D32, HWY_IF_F32_D(D32),
8713
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
8714
- HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
9592
+ template <class DF, HWY_IF_F32_D(DF),
9593
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
9594
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
8715
9595
  // TODO(janwas): _mm_dpbf16_ps when available
8716
- const RebindToUnsigned<decltype(df32)> du32;
8717
- // Lane order within sum0/1 is undefined, hence we can avoid the
8718
- // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
8719
- // leads to the odd/even order that RearrangeToOddPlusEven prefers.
8720
- using VU32 = VFromD<decltype(du32)>;
8721
- const VU32 odd = Set(du32, 0xFFFF0000u);
8722
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
8723
- const VU32 ao = And(BitCast(du32, a), odd);
8724
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
8725
- const VU32 bo = And(BitCast(du32, b), odd);
8726
- return MulAdd(BitCast(df32, ae), BitCast(df32, be),
8727
- Mul(BitCast(df32, ao), BitCast(df32, bo)));
9596
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
9597
+ Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
8728
9598
  }
8729
9599
 
8730
9600
  // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
@@ -8768,29 +9638,48 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
8768
9638
 
8769
9639
  #endif
8770
9640
 
8771
- // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ShiftLeft)
9641
+ // ------------------------------ SatWidenMulPairwiseAccumulate
8772
9642
 
8773
- // Generic for all vector lengths.
8774
- template <class D32, HWY_IF_F32_D(D32),
8775
- class V16 = VFromD<Repartition<bfloat16_t, D32>>>
8776
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
8777
- const VFromD<D32> sum0,
8778
- VFromD<D32>& sum1) {
8779
- // TODO(janwas): _mm_dpbf16_ps when available
8780
- const RebindToUnsigned<decltype(df32)> du32;
8781
- // Lane order within sum0/1 is undefined, hence we can avoid the
8782
- // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
8783
- // leads to the odd/even order that RearrangeToOddPlusEven prefers.
8784
- using VU32 = VFromD<decltype(du32)>;
8785
- const VU32 odd = Set(du32, 0xFFFF0000u);
8786
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
8787
- const VU32 ao = And(BitCast(du32, a), odd);
8788
- const VU32 be = ShiftLeft<16>(BitCast(du32, b));
8789
- const VU32 bo = And(BitCast(du32, b), odd);
8790
- sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
8791
- return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
9643
+ #if HWY_TARGET <= HWY_AVX3_DL
9644
+
9645
+ #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
9646
+ #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
9647
+ #else
9648
+ #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
9649
+ #endif
9650
+
9651
+ // Even if N=1, the I16 vectors have at least 2 lanes, hence _mm_dpwssds_epi32
9652
+ // is safe.
9653
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
9654
+ HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
9655
+ DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
9656
+ VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
9657
+ return VFromD<DI32>{_mm_dpwssds_epi32(sum.raw, a.raw, b.raw)};
8792
9658
  }
8793
9659
 
9660
+ #endif // HWY_TARGET <= HWY_AVX3_DL
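The new saturating pairwise accumulate maps to VPDPWSSDS; a minimal scalar sketch of the per-lane semantics (hypothetical name, not part of Highway, and assuming the two products and the accumulator are summed exactly before a single signed-saturating clamp) is:

#include <algorithm>
#include <cassert>
#include <cstdint>

// One int32 lane: add two adjacent int16 products to the accumulator and
// clamp the result to the int32 range.
static int32_t SatWidenMulPairwiseAccumulateLane(int16_t a0, int16_t a1,
                                                 int16_t b0, int16_t b1,
                                                 int32_t sum) {
  const int64_t full = int64_t{sum} + int32_t{a0} * b0 + int32_t{a1} * b1;
  return static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(full, INT32_MIN), INT32_MAX));
}

int main() {
  assert(SatWidenMulPairwiseAccumulateLane(100, -3, 200, 4, 7) == 19995);
  assert(SatWidenMulPairwiseAccumulateLane(32767, 32767, 32767, 32767,
                                           INT32_MAX) == INT32_MAX);
  return 0;
}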
9661
+
9662
+ // ------------------------------ ReorderWidenMulAccumulate (PromoteEvenTo)
9663
+
9664
+ #if HWY_NATIVE_DOT_BF16
9665
+
9666
+ #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
9667
+ #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
9668
+ #else
9669
+ #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
9670
+ #endif
9671
+
9672
+ template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16),
9673
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
9674
+ HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
9675
+ const VFromD<DF> sum0,
9676
+ VFromD<DF>& /*sum1*/) {
9677
+ return VFromD<DF>{_mm_dpbf16_ps(sum0.raw, reinterpret_cast<__m128bh>(a.raw),
9678
+ reinterpret_cast<__m128bh>(b.raw))};
9679
+ }
9680
+
9681
+ #endif // HWY_NATIVE_DOT_BF16
9682
+
8794
9683
  // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
8795
9684
  template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
8796
9685
  class V16 = VFromD<RepartitionToNarrow<D32>>>
@@ -8893,263 +9782,6 @@ HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
8893
9782
 
8894
9783
  #endif // HWY_TARGET <= HWY_AVX3_DL
8895
9784
 
8896
- // ================================================== CONVERT
8897
-
8898
- // ------------------------------ Promotions (part w/ narrow lanes -> full)
8899
-
8900
- // Unsigned: zero-extend.
8901
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
8902
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
8903
- #if HWY_TARGET >= HWY_SSSE3
8904
- const __m128i zero = _mm_setzero_si128();
8905
- return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)};
8906
- #else
8907
- return VFromD<D>{_mm_cvtepu8_epi16(v.raw)};
8908
- #endif
8909
- }
8910
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
8911
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
8912
- #if HWY_TARGET >= HWY_SSSE3
8913
- return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
8914
- #else
8915
- return VFromD<D>{_mm_cvtepu16_epi32(v.raw)};
8916
- #endif
8917
- }
8918
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
8919
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
8920
- #if HWY_TARGET >= HWY_SSSE3
8921
- return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
8922
- #else
8923
- return VFromD<D>{_mm_cvtepu32_epi64(v.raw)};
8924
- #endif
8925
- }
8926
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
8927
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
8928
- #if HWY_TARGET >= HWY_SSSE3
8929
- const __m128i zero = _mm_setzero_si128();
8930
- const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
8931
- return VFromD<D>{_mm_unpacklo_epi16(u16, zero)};
8932
- #else
8933
- return VFromD<D>{_mm_cvtepu8_epi32(v.raw)};
8934
- #endif
8935
- }
8936
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
8937
- HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
8938
- #if HWY_TARGET > HWY_SSSE3
8939
- const Rebind<uint32_t, decltype(d)> du32;
8940
- return PromoteTo(d, PromoteTo(du32, v));
8941
- #elif HWY_TARGET == HWY_SSSE3
8942
- alignas(16) static constexpr int8_t kShuffle[16] = {
8943
- 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1};
8944
- const Repartition<int8_t, decltype(d)> di8;
8945
- return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
8946
- #else
8947
- (void)d;
8948
- return VFromD<D>{_mm_cvtepu8_epi64(v.raw)};
8949
- #endif
8950
- }
8951
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
8952
- HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
8953
- #if HWY_TARGET > HWY_SSSE3
8954
- const Rebind<uint32_t, decltype(d)> du32;
8955
- return PromoteTo(d, PromoteTo(du32, v));
8956
- #elif HWY_TARGET == HWY_SSSE3
8957
- alignas(16) static constexpr int8_t kShuffle[16] = {
8958
- 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1};
8959
- const Repartition<int8_t, decltype(d)> di8;
8960
- return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
8961
- #else
8962
- (void)d;
8963
- return VFromD<D>{_mm_cvtepu16_epi64(v.raw)};
8964
- #endif
8965
- }
8966
-
8967
- // Unsigned to signed: same plus cast.
8968
- template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
8969
- HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)),
8970
- HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))>
8971
- HWY_API VFromD<D> PromoteTo(D di, V v) {
8972
- const RebindToUnsigned<decltype(di)> du;
8973
- return BitCast(di, PromoteTo(du, v));
8974
- }
8975
-
8976
- // Signed: replicate sign bit.
8977
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
8978
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
8979
- #if HWY_TARGET >= HWY_SSSE3
8980
- return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)});
8981
- #else
8982
- return VFromD<D>{_mm_cvtepi8_epi16(v.raw)};
8983
- #endif
8984
- }
8985
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
8986
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
8987
- #if HWY_TARGET >= HWY_SSSE3
8988
- return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)});
8989
- #else
8990
- return VFromD<D>{_mm_cvtepi16_epi32(v.raw)};
8991
- #endif
8992
- }
8993
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
8994
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
8995
- #if HWY_TARGET >= HWY_SSSE3
8996
- return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)});
8997
- #else
8998
- return VFromD<D>{_mm_cvtepi32_epi64(v.raw)};
8999
- #endif
9000
- }
9001
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
9002
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
9003
- #if HWY_TARGET >= HWY_SSSE3
9004
- const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
9005
- const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
9006
- return ShiftRight<24>(VFromD<D>{x4});
9007
- #else
9008
- return VFromD<D>{_mm_cvtepi8_epi32(v.raw)};
9009
- #endif
9010
- }
9011
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
9012
- HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) {
9013
- #if HWY_TARGET >= HWY_SSSE3
9014
- const Repartition<int32_t, decltype(d)> di32;
9015
- const Half<decltype(di32)> dh_i32;
9016
- const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw};
9017
- const VFromD<decltype(di32)> s4{
9018
- _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))};
9019
- return ZipLower(d, x4, s4);
9020
- #else
9021
- (void)d;
9022
- return VFromD<D>{_mm_cvtepi8_epi64(v.raw)};
9023
- #endif
9024
- }
9025
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
9026
- HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) {
9027
- #if HWY_TARGET >= HWY_SSSE3
9028
- const Repartition<int32_t, decltype(d)> di32;
9029
- const Half<decltype(di32)> dh_i32;
9030
- const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw};
9031
- const VFromD<decltype(di32)> s2{
9032
- _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))};
9033
- return ZipLower(d, x2, s2);
9034
- #else
9035
- (void)d;
9036
- return VFromD<D>{_mm_cvtepi16_epi64(v.raw)};
9037
- #endif
9038
- }
9039
-
9040
- #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
9041
-
9042
- // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
9043
- #ifdef HWY_NATIVE_F16C
9044
- #undef HWY_NATIVE_F16C
9045
- #else
9046
- #define HWY_NATIVE_F16C
9047
- #endif
9048
-
9049
- // Workaround for origin tracking bug in Clang msan prior to 11.0
9050
- // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
9051
- #if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
9052
- #define HWY_INLINE_F16 HWY_NOINLINE
9053
- #else
9054
- #define HWY_INLINE_F16 HWY_INLINE
9055
- #endif
9056
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
9057
- HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
9058
- #if HWY_HAVE_FLOAT16
9059
- const RebindToUnsigned<DFromV<decltype(v)>> du16;
9060
- return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)};
9061
- #else
9062
- return VFromD<D>{_mm_cvtph_ps(v.raw)};
9063
- #endif
9064
- }
9065
-
9066
- #endif // HWY_NATIVE_F16C
9067
-
9068
- #if HWY_HAVE_FLOAT16
9069
-
9070
- #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
9071
- #undef HWY_NATIVE_PROMOTE_F16_TO_F64
9072
- #else
9073
- #define HWY_NATIVE_PROMOTE_F16_TO_F64
9074
- #endif
9075
-
9076
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
9077
- HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
9078
- return VFromD<D>{_mm_cvtph_pd(v.raw)};
9079
- }
9080
-
9081
- #endif // HWY_HAVE_FLOAT16
9082
-
9083
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
9084
- HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
9085
- const Rebind<uint16_t, decltype(df32)> du16;
9086
- const RebindToSigned<decltype(df32)> di32;
9087
- return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
9088
- }
9089
-
9090
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
9091
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
9092
- return VFromD<D>{_mm_cvtps_pd(v.raw)};
9093
- }
9094
-
9095
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
9096
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
9097
- return VFromD<D>{_mm_cvtepi32_pd(v.raw)};
9098
- }
9099
-
9100
- #if HWY_TARGET <= HWY_AVX3
9101
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
9102
- HWY_API VFromD<D> PromoteTo(D /*df64*/, VFromD<Rebind<uint32_t, D>> v) {
9103
- return VFromD<D>{_mm_cvtepu32_pd(v.raw)};
9104
- }
9105
- #else
9106
- // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
9107
- template <class D, HWY_IF_F64_D(D)>
9108
- HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
9109
- const Rebind<int32_t, decltype(df64)> di32;
9110
- const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v));
9111
- return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result,
9112
- Set(df64, 4294967296.0),
9113
- Zero(df64));
9114
- }
9115
- #endif
9116
-
9117
- // ------------------------------ PromoteEvenTo/PromoteOddTo
9118
-
9119
- #if HWY_TARGET > HWY_AVX3
9120
- namespace detail {
9121
-
9122
- // I32->I64 PromoteEvenTo/PromoteOddTo
9123
-
9124
- template <class D, HWY_IF_LANES_D(D, 1)>
9125
- HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
9126
- hwy::SizeTag<8> /*to_lane_size_tag*/,
9127
- hwy::SignedTag /*from_type_tag*/, D d_to,
9128
- Vec64<int32_t> v) {
9129
- return PromoteLowerTo(d_to, v);
9130
- }
9131
-
9132
- template <class D, HWY_IF_LANES_D(D, 2)>
9133
- HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
9134
- hwy::SizeTag<8> /*to_lane_size_tag*/,
9135
- hwy::SignedTag /*from_type_tag*/, D d_to,
9136
- Vec128<int32_t> v) {
9137
- const Repartition<int32_t, D> d_from;
9138
- return PromoteLowerTo(d_to, ConcatEven(d_from, v, v));
9139
- }
9140
-
9141
- template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
9142
- HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
9143
- hwy::SizeTag<8> /*to_lane_size_tag*/,
9144
- hwy::SignedTag /*from_type_tag*/, D d_to,
9145
- V v) {
9146
- const Repartition<int32_t, D> d_from;
9147
- return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v));
9148
- }
9149
-
9150
- } // namespace detail
9151
- #endif
9152
-
9153
9785
  // ------------------------------ Demotions (full -> part w/ narrow lanes)
9154
9786
 
9155
9787
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
@@ -9338,26 +9970,69 @@ HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) {
9338
9970
 
9339
9971
  #endif // HWY_HAVE_FLOAT16
9340
9972
 
9973
+ // The _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics require GCC 9 or later
9974
+ // or Clang 10 or later
9975
+
9976
+ // Also need GCC or Clang to bit cast the __m128bh, __m256bh, or __m512bh vector
9977
+ // returned by the _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics to a
9978
+ // __m128i, __m256i, or __m512i as there are currently no intrinsics available
9979
+ // (as of GCC 13 and Clang 17) to bit cast a __m128bh, __m256bh, or __m512bh
9980
+ // vector to a __m128i, __m256i, or __m512i vector
9981
+
9982
+ #if HWY_AVX3_HAVE_F32_TO_BF16C
9983
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
9984
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
9985
+ #else
9986
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
9987
+ #endif
9988
+
9341
9989
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
9342
- HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
9343
- // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
9344
- const Rebind<int32_t, decltype(dbf16)> di32;
9345
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
9346
- const Rebind<uint16_t, decltype(dbf16)> du16;
9347
- const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
9348
- return BitCast(dbf16, DemoteTo(du16, bits_in_32));
9990
+ HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
9991
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
9992
+ // Inline assembly workaround for LLVM codegen bug
9993
+ __m128i raw_result;
9994
+ __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
9995
+ return VFromD<D>{raw_result};
9996
+ #else
9997
+ // The _mm_cvtneps_pbh intrinsic returns a __m128bh vector that needs to be
9998
+ // bit casted to a __m128i vector
9999
+ return VFromD<D>{detail::BitCastToInteger(_mm_cvtneps_pbh(v.raw))};
10000
+ #endif
9349
10001
  }
9350
10002
 
9351
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D),
9352
- class V32 = VFromD<Repartition<float, D>>>
9353
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
9354
- // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
9355
- const RebindToUnsigned<decltype(dbf16)> du16;
9356
- const Repartition<uint32_t, decltype(dbf16)> du32;
9357
- const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
9358
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
10003
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
10004
+ HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec128<float> a,
10005
+ Vec128<float> b) {
10006
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
10007
+ // Inline assembly workaround for LLVM codegen bug
10008
+ __m128i raw_result;
10009
+ __asm__("vcvtne2ps2bf16 %2, %1, %0"
10010
+ : "=v"(raw_result)
10011
+ : "v"(b.raw), "v"(a.raw));
10012
+ return VFromD<D>{raw_result};
10013
+ #else
10014
+ // The _mm_cvtne2ps_pbh intrinsic returns a __m128bh vector that needs to be
10015
+ // bit casted to a __m128i vector
10016
+ return VFromD<D>{detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw))};
10017
+ #endif
9359
10018
  }
9360
10019
 
10020
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
10021
+ HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<float> a,
10022
+ Vec64<float> b) {
10023
+ return VFromD<D>{_mm_shuffle_epi32(
10024
+ detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw)),
10025
+ _MM_SHUFFLE(2, 0, 2, 0))};
10026
+ }
10027
+
10028
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
10029
+ HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec32<float> a, Vec32<float> b) {
10030
+ const DFromV<decltype(a)> d;
10031
+ const Twice<decltype(d)> dt;
10032
+ return DemoteTo(dbf16, Combine(dt, b, a));
10033
+ }
10034
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C
10035
+
9361
10036
  // Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
9362
10037
  template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
9363
10038
  HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) {
@@ -9515,11 +10190,15 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
9515
10190
  return ReorderDemote2To(d, a, b);
9516
10191
  }
9517
10192
 
9518
- template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
9519
- HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
9520
- const RebindToUnsigned<decltype(dbf16)> du16;
9521
- return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
10193
+ #if HWY_AVX3_HAVE_F32_TO_BF16C
10194
+ // F32 to BF16 OrderedDemote2To is generic for all vector lengths on targets
10195
+ // that support AVX512BF16
10196
+ template <class D, HWY_IF_BF16_D(D)>
10197
+ HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
10198
+ VFromD<Repartition<float, D>> b) {
10199
+ return ReorderDemote2To(dbf16, a, b);
9522
10200
  }
10201
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C
9523
10202
 
9524
10203
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
9525
10204
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
@@ -9536,65 +10215,176 @@ HWY_INLINE VFromD<D> ClampF64ToI32Max(D d, VFromD<D> v) {
9536
10215
  return Min(v, Set(d, 2147483647.0));
9537
10216
  }
9538
10217
 
9539
- // For ConvertTo float->int of same size, clamping before conversion would
9540
- // change the result because the max integer value is not exactly representable.
9541
- // Instead detect the overflow result after conversion and fix it.
9542
- // Generic for all vector lengths.
9543
- template <class DI>
9544
- HWY_INLINE VFromD<DI> FixConversionOverflow(DI di,
9545
- VFromD<RebindToFloat<DI>> original,
9546
- VFromD<DI> converted) {
9547
- // Combinations of original and output sign:
9548
- // --: normal <0 or -huge_val to 80..00: OK
9549
- // -+: -0 to 0 : OK
9550
- // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF
9551
- // ++: normal >0 : OK
9552
- const VFromD<DI> sign_wrong = AndNot(BitCast(di, original), converted);
9553
- #if HWY_COMPILER_GCC_ACTUAL
9554
- // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also
9555
- // Add() if using that instead. Work around with one more instruction.
9556
- const RebindToUnsigned<DI> du;
9557
- const VFromD<DI> mask = BroadcastSignBit(sign_wrong);
9558
- const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask)));
9559
- return IfVecThenElse(mask, max, converted);
10218
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
10219
+ template <class TTo, class TF>
10220
+ static constexpr HWY_INLINE TTo
10221
+ X86ConvertScalarFromFloat(hwy::FloatTag /* to_type_tag */, TF from_val) {
10222
+ return ConvertScalarTo<TTo>(from_val);
10223
+ }
10224
+
10225
+ template <class TTo, class TF>
10226
+ static HWY_BITCASTSCALAR_CONSTEXPR HWY_INLINE TTo
10227
+ X86ConvertScalarFromFloat(hwy::SpecialTag /* to_type_tag */, TF from_val) {
10228
+ return ConvertScalarTo<TTo>(from_val);
10229
+ }
10230
+
10231
+ template <class TTo, class TF>
10232
+ static HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_INLINE TTo
10233
+ X86ConvertScalarFromFloat(hwy::SignedTag /* to_type_tag */, TF from_val) {
10234
+ #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS
10235
+ using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float,
10236
+ RemoveCvRef<TF>>;
9560
10237
  #else
9561
- return Xor(converted, BroadcastSignBit(sign_wrong));
10238
+ using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
9562
10239
  #endif
10240
+
10241
+ const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val);
10242
+ constexpr TTo kMinResultVal = LimitsMin<TTo>();
10243
+ HWY_BITCASTSCALAR_CONSTEXPR const TFArith kMinOutOfRangePosVal =
10244
+ ScalarAbs(ConvertScalarTo<TFArith>(kMinResultVal));
10245
+
10246
+ return (ScalarAbs(from_val_in_arith_type) < kMinOutOfRangePosVal)
10247
+ ? ConvertScalarTo<TTo>(from_val_in_arith_type)
10248
+ : kMinResultVal;
9563
10249
  }
9564
10250
 
10251
+ template <class TTo, class TF>
10252
+ static HWY_CXX14_CONSTEXPR HWY_INLINE TTo
10253
+ X86ConvertScalarFromFloat(hwy::UnsignedTag /* to_type_tag */, TF from_val) {
10254
+ #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS
10255
+ using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float,
10256
+ RemoveCvRef<TF>>;
10257
+ #else
10258
+ using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
10259
+ #endif
10260
+
10261
+ const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val);
10262
+ constexpr TTo kTToMsb = static_cast<TTo>(TTo{1} << (sizeof(TTo) * 8 - 1));
10263
+ constexpr const TFArith kNegOne = ConvertScalarTo<TFArith>(-1.0);
10264
+ constexpr const TFArith kMinOutOfRangePosVal =
10265
+ ConvertScalarTo<TFArith>(static_cast<double>(kTToMsb) * 2.0);
10266
+
10267
+ return (from_val_in_arith_type > kNegOne &&
10268
+ from_val_in_arith_type < kMinOutOfRangePosVal)
10269
+ ? ConvertScalarTo<TTo>(from_val_in_arith_type)
10270
+ : LimitsMax<TTo>();
10271
+ }
10272
+
10273
+ template <class TTo, class TF>
10274
+ static constexpr HWY_INLINE HWY_MAYBE_UNUSED TTo
10275
+ X86ConvertScalarFromFloat(TF from_val) {
10276
+ return X86ConvertScalarFromFloat<TTo>(hwy::TypeTag<RemoveCvRef<TTo>>(),
10277
+ from_val);
10278
+ }
10279
+ #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
10280
+
9565
10281
  } // namespace detail
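The unsigned branch of X86ConvertScalarFromFloat above boils down to a range check before the scalar cast; a minimal scalar sketch for the F64->U32 case (hypothetical name, not part of Highway, with the out-of-range value chosen to match the all-ones LimitsMax fallback the code above returns) is:

#include <cassert>
#include <cstdint>
#include <limits>

// Values in (-1, 2^32) can be cast to uint32_t directly; everything else
// (too large, too negative, NaN) maps to the maximum value, mirroring the
// LimitsMax fallback above.
static uint32_t F64ToU32SaturateLikeAbove(double v) {
  if (v > -1.0 && v < 4294967296.0) {
    return static_cast<uint32_t>(v);
  }
  return std::numeric_limits<uint32_t>::max();
}

int main() {
  assert(F64ToU32SaturateLikeAbove(3.9) == 3u);
  assert(F64ToU32SaturateLikeAbove(-0.5) == 0u);
  assert(F64ToU32SaturateLikeAbove(1e10) == 4294967295u);
  assert(F64ToU32SaturateLikeAbove(-2.0) == 4294967295u);
  return 0;
}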
9566
10282
 
9567
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D),
9568
- class DF = Rebind<double, D>>
9569
- HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<DF> v) {
9570
- const VFromD<DF> clamped = detail::ClampF64ToI32Max(DF(), v);
9571
- return VFromD<D>{_mm_cvttpd_epi32(clamped.raw)};
10283
+ #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
10284
+ #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
10285
+ #else
10286
+ #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
10287
+ #endif
10288
+
10289
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
10290
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10291
+ #if HWY_COMPILER_GCC_ACTUAL
10292
+ // Workaround for undefined behavior in _mm_cvttpd_epi32 with GCC if any
10293
+ // values of v[i] are not within the range of an int32_t
10294
+
10295
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
10296
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
10297
+ typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
10298
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
10299
+ return Dup128VecFromValues(
10300
+ D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
10301
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]), int32_t{0},
10302
+ int32_t{0});
10303
+ }
10304
+ #endif
10305
+
10306
+ __m128i raw_result;
10307
+ __asm__("%vcvttpd2dq {%1, %0|%0, %1}"
10308
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
10309
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
10310
+ :);
10311
+ return VFromD<D>{raw_result};
10312
+ #else // !HWY_COMPILER_GCC_ACTUAL
10313
+ return VFromD<D>{_mm_cvttpd_epi32(v.raw)};
10314
+ #endif
10315
+ }
10316
+
10317
+ // F64 to I32 DemoteTo is generic for all vector lengths
10318
+ template <class D, HWY_IF_I32_D(D)>
10319
+ HWY_API VFromD<D> DemoteTo(D di32, VFromD<Rebind<double, D>> v) {
10320
+ const Rebind<double, decltype(di32)> df64;
10321
+ const VFromD<decltype(df64)> clamped = detail::ClampF64ToI32Max(df64, v);
10322
+ return DemoteInRangeTo(di32, clamped);
9572
10323
  }
9573
10324
 
9574
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
9575
- HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
9576
10325
  #if HWY_TARGET <= HWY_AVX3
9577
- (void)du32;
9578
- return VFromD<D>{
9579
- _mm_maskz_cvttpd_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
9580
- #else // AVX2 or earlier
10326
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
10327
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10328
+ #if HWY_COMPILER_GCC_ACTUAL
10329
+ // Workaround for undefined behavior in _mm_cvttpd_epu32 with GCC if any
10330
+ // values of v[i] are not within the range of an uint32_t
10331
+
10332
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
10333
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
10334
+ typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
10335
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
10336
+ return Dup128VecFromValues(
10337
+ D(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
10338
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]), uint32_t{0},
10339
+ uint32_t{0});
10340
+ }
10341
+ #endif
10342
+
10343
+ __m128i raw_result;
10344
+ __asm__("vcvttpd2udq {%1, %0|%0, %1}"
10345
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
10346
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
10347
+ :);
10348
+ return VFromD<D>{raw_result};
10349
+ #else
10350
+ return VFromD<D>{_mm_cvttpd_epu32(v.raw)};
10351
+ #endif
10352
+ }
10353
+
10354
+ // F64->U32 DemoteTo is generic for all vector lengths
10355
+ template <class D, HWY_IF_U32_D(D)>
10356
+ HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
10357
+ return DemoteInRangeTo(D(), ZeroIfNegative(v));
10358
+ }
10359
+ #else // HWY_TARGET > HWY_AVX3
10360
+
10361
+ // F64 to U32 DemoteInRangeTo is generic for all vector lengths on
10362
+ // SSE2/SSSE3/SSE4/AVX2
10363
+ template <class D, HWY_IF_U32_D(D)>
10364
+ HWY_API VFromD<D> DemoteInRangeTo(D du32, VFromD<Rebind<double, D>> v) {
10365
+ const RebindToSigned<decltype(du32)> di32;
9581
10366
  const Rebind<double, decltype(du32)> df64;
9582
10367
  const RebindToUnsigned<decltype(df64)> du64;
9583
10368
 
9584
- // Clamp v[i] to a value between 0 and 4294967295
9585
- const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0));
9586
-
9587
10369
  const auto k2_31 = Set(df64, 2147483648.0);
9588
- const auto clamped_is_ge_k2_31 = (clamped >= k2_31);
9589
- const auto clamped_lo31_f64 =
9590
- clamped - IfThenElseZero(clamped_is_ge_k2_31, k2_31);
9591
- const VFromD<D> clamped_lo31_u32{_mm_cvttpd_epi32(clamped_lo31_f64.raw)};
10370
+ const auto v_is_ge_k2_31 = (v >= k2_31);
10371
+ const auto clamped_lo31_f64 = v - IfThenElseZero(v_is_ge_k2_31, k2_31);
10372
+ const auto clamped_lo31_u32 =
10373
+ BitCast(du32, DemoteInRangeTo(di32, clamped_lo31_f64));
9592
10374
  const auto clamped_u32_msb = ShiftLeft<31>(
9593
- TruncateTo(du32, BitCast(du64, VecFromMask(df64, clamped_is_ge_k2_31))));
10375
+ TruncateTo(du32, BitCast(du64, VecFromMask(df64, v_is_ge_k2_31))));
9594
10376
  return Or(clamped_lo31_u32, clamped_u32_msb);
9595
- #endif
9596
10377
  }
9597
10378
 
10379
+ // F64 to U32 DemoteTo is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
10380
+ template <class D, HWY_IF_U32_D(D)>
10381
+ HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
10382
+ const Rebind<double, decltype(du32)> df64;
10383
+ const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0));
10384
+ return DemoteInRangeTo(du32, clamped);
10385
+ }
10386
+ #endif // HWY_TARGET <= HWY_AVX3
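On targets without vcvttpd2udq, the F64 to U32 DemoteInRangeTo above splits the value at 2^31 so that the signed conversion can be reused; a minimal scalar sketch (hypothetical name, not part of Highway, assuming the input is already in [0, 2^32)) is:

#include <cassert>
#include <cstdint>

static uint32_t DemoteInRangeF64ToU32(double v) {
  if (v >= 2147483648.0) {
    // Subtract 2^31 so the signed conversion is in range, then OR the top
    // bit back in, as the vector code does with the shifted mask.
    return static_cast<uint32_t>(static_cast<int32_t>(v - 2147483648.0)) |
           0x80000000u;
  }
  return static_cast<uint32_t>(static_cast<int32_t>(v));
}

int main() {
  assert(DemoteInRangeF64ToU32(123.9) == 123u);
  assert(DemoteInRangeF64ToU32(4294967295.0) == 4294967295u);
  return 0;
}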
10387
+
9598
10388
  #if HWY_TARGET <= HWY_AVX3
9599
10389
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
9600
10390
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
@@ -9683,23 +10473,85 @@ HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
9683
10473
  }
9684
10474
 
9685
10475
  // ------------------------------ F32->UI64 PromoteTo
10476
+ #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
10477
+ #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
10478
+ #else
10479
+ #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
10480
+ #endif
10481
+
9686
10482
  #if HWY_TARGET <= HWY_AVX3
9687
10483
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
10484
+ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
10485
+ #if HWY_COMPILER_GCC_ACTUAL
10486
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
10487
+ // within the range of an int64_t
10488
+
10489
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
10490
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
10491
+ typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
10492
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
10493
+ return Dup128VecFromValues(
10494
+ D(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
10495
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]));
10496
+ }
10497
+ #endif
10498
+
10499
+ __m128i raw_result;
10500
+ __asm__("vcvttps2qq {%1, %0|%0, %1}"
10501
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
10502
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
10503
+ :);
10504
+ return VFromD<D>{raw_result};
10505
+ #else
10506
+ return VFromD<D>{_mm_cvttps_epi64(v.raw)};
10507
+ #endif
10508
+ }
10509
+
10510
+ // Generic for all vector lengths.
10511
+ template <class D, HWY_IF_I64_D(D)>
9688
10512
  HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
9689
10513
  const Rebind<float, decltype(di64)> df32;
9690
10514
  const RebindToFloat<decltype(di64)> df64;
9691
- const Twice<decltype(df32)> dt_f32;
9692
-
9693
- return detail::FixConversionOverflow(
9694
- di64,
9695
- BitCast(df64, InterleaveLower(ResizeBitCast(dt_f32, v),
9696
- ResizeBitCast(dt_f32, v))),
9697
- VFromD<D>{_mm_cvttps_epi64(v.raw)});
10515
+ // We now avoid GCC UB in PromoteInRangeTo via assembly, see #2189 and
10516
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115115. Previously we fixed up
10517
+ // the result afterwards using three instructions. Now we instead check if
10518
+ // v >= 2^63, and if so replace the output with 2^63-1, which is likely more
10519
+ // efficient. Note that the previous representable f32 is less than 2^63 and
10520
+ // thus fits in i64.
10521
+ const MFromD<D> overflow = RebindMask(
10522
+ di64, PromoteMaskTo(df64, df32, Ge(v, Set(df32, 9.223372e18f))));
10523
+ return IfThenElse(overflow, Set(di64, LimitsMax<int64_t>()),
10524
+ PromoteInRangeTo(di64, v));
9698
10525
  }
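The overflow handling described in the comment above has a direct scalar analogue; a minimal sketch (hypothetical name, not part of Highway, illustrating only the upper-bound check the vector code performs) is:

#include <cassert>
#include <cstdint>
#include <limits>

// Casting a float >= 2^63 to int64_t would be undefined behavior, so such
// values are detected first and replaced with the maximum representable
// int64_t, matching the IfThenElse(overflow, ...) above.
static int64_t SaturatingF32ToI64(float v) {
  if (v >= 9223372036854775808.0f) {  // v >= 2^63
    return std::numeric_limits<int64_t>::max();
  }
  return static_cast<int64_t>(v);
}

int main() {
  assert(SaturatingF32ToI64(4294967296.0f) == 4294967296LL);  // 2^32
  assert(SaturatingF32ToI64(1e30f) == std::numeric_limits<int64_t>::max());
  return 0;
}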
9699
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
10526
+ template <class D, HWY_IF_U64_D(D)>
9700
10527
  HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
9701
- return VFromD<D>{
9702
- _mm_maskz_cvttps_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10528
+ return PromoteInRangeTo(D(), ZeroIfNegative(v));
10529
+ }
10530
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
10531
+ HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
10532
+ #if HWY_COMPILER_GCC_ACTUAL
10533
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
10534
+ // within the range of an uint64_t
10535
+
10536
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
10537
+ if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
10538
+ typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
10539
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
10540
+ return Dup128VecFromValues(
10541
+ D(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]),
10542
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1]));
10543
+ }
10544
+ #endif
10545
+
10546
+ __m128i raw_result;
10547
+ __asm__("vcvttps2uqq {%1, %0|%0, %1}"
10548
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
10549
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
10550
+ :);
10551
+ return VFromD<D>{raw_result};
10552
+ #else
10553
+ return VFromD<D>{_mm_cvttps_epu64(v.raw)};
10554
+ #endif
9703
10555
  }
9704
10556
  #else // AVX2 or below
9705
10557
 
@@ -9730,6 +10582,27 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
9730
10582
  lo64_or_mask);
9731
10583
  }
9732
10584
 
10585
+ // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
10586
+ template <class D, HWY_IF_UI64_D(D)>
10587
+ HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
10588
+ const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
10589
+ const RebindToSigned<decltype(d32)> di32;
10590
+ const RebindToFloat<decltype(d32)> df32;
10591
+ const RebindToUnsigned<decltype(d32)> du32;
10592
+ const Repartition<uint8_t, decltype(d32)> du32_as_du8;
10593
+
10594
+ const auto exponent_adj = BitCast(
10595
+ du32,
10596
+ SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
10597
+ BitCast(du32_as_du8, Set(du32, uint32_t{0xFFFFFF9Du}))));
10598
+ const auto adj_v =
10599
+ BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
10600
+
10601
+ const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v);
10602
+ return PromoteTo(d64, BitCast(d32, f32_to_i32_result))
10603
+ << PromoteTo(d64, exponent_adj);
10604
+ }
10605
+
9733
10606
  namespace detail {
9734
10607
 
9735
10608
  template <class DU64, HWY_IF_V_SIZE_LE_D(DU64, 16)>
@@ -9770,7 +10643,7 @@ HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
9770
10643
 
9771
10644
  const auto adj_v =
9772
10645
  BitCast(df32, BitCast(du32, non_neg_v) - ShiftLeft<23>(exponent_adj));
9773
- const VFromD<decltype(di32)> f32_to_i32_result{_mm_cvttps_epi32(adj_v.raw)};
10646
+ const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v);
9774
10647
 
9775
10648
  const auto i32_overflow_mask = BroadcastSignBit(f32_to_i32_result);
9776
10649
  const auto overflow_result =
@@ -9960,7 +10833,20 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
9960
10833
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
9961
10834
  return VFromD<D>{_mm_cvtusepi64_epi8(v.raw)};
9962
10835
  }
9963
- #else // AVX2 or below
10836
+ #else // AVX2 or below
10837
+
10838
+ // Disable the default unsigned to signed DemoteTo/ReorderDemote2To
10839
+ // implementations in generic_ops-inl.h for U64->I8/I16/I32 demotions on
10840
+ // SSE2/SSSE3/SSE4/AVX2 as U64->I8/I16/I32 DemoteTo/ReorderDemote2To for
10841
+ // SSE2/SSSE3/SSE4/AVX2 is implemented in x86_128-inl.h
10842
+
10843
+ // The default unsigned to signed DemoteTo/ReorderDemote2To
10844
+ // implementations in generic_ops-inl.h are still used for U32->I8/I16 and
10845
+ // U16->I8 demotions on SSE2/SSSE3/SSE4/AVX2
10846
+
10847
+ #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
10848
+ #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) HWY_IF_NOT_T_SIZE_V(V, 8)
10849
+
9964
10850
  namespace detail {
9965
10851
  template <class D, HWY_IF_UNSIGNED_D(D)>
9966
10852
  HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
@@ -10023,6 +10909,25 @@ HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
10023
10909
  return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals));
10024
10910
  }
10025
10911
 
10912
+ template <class D,
10913
+ HWY_IF_T_SIZE_ONE_OF_D(
10914
+ D, ((HWY_TARGET != HWY_SSE2) ? ((1 << 1) | (1 << 2)) : 0) |
10915
+ (1 << 4)),
10916
+ HWY_IF_SIGNED_D(D)>
10917
+ HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
10918
+ const RebindToUnsigned<decltype(dn)> dn_u;
10919
+ return BitCast(dn, TruncateTo(dn_u, detail::DemoteFromU64Saturate(dn, v)));
10920
+ }
10921
+
10922
+ #if HWY_TARGET == HWY_SSE2
10923
+ template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
10924
+ HWY_IF_SIGNED_D(D)>
10925
+ HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
10926
+ const Rebind<int32_t, decltype(dn)> di32;
10927
+ return DemoteTo(dn, DemoteTo(di32, v));
10928
+ }
10929
+ #endif // HWY_TARGET == HWY_SSE2
10930
+
10026
10931
  template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
10027
10932
  HWY_IF_UNSIGNED_D(D)>
10028
10933
  HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
@@ -10047,6 +10952,16 @@ HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
10047
10952
  return DemoteTo(dn, Combine(dt, b, a));
10048
10953
  }
10049
10954
 
10955
+ #if HWY_TARGET > HWY_AVX3
10956
+ template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_I32_D(D)>
10957
+ HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
10958
+ VFromD<Repartition<uint64_t, D>> b) {
10959
+ const DFromV<decltype(a)> d;
10960
+ const Twice<decltype(d)> dt;
10961
+ return DemoteTo(dn, Combine(dt, b, a));
10962
+ }
10963
+ #endif
10964
+
10050
10965
  #if HWY_TARGET > HWY_AVX2
10051
10966
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
10052
10967
  HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
@@ -10084,9 +10999,9 @@ HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
10084
10999
  return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
10085
11000
  }
10086
11001
 
10087
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
10088
- HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<uint64_t> a,
10089
- Vec128<uint64_t> b) {
11002
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
11003
+ HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint64_t> a,
11004
+ Vec128<uint64_t> b) {
10090
11005
  const Half<decltype(dn)> dnh;
10091
11006
 
10092
11007
  const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a);
@@ -10196,103 +11111,313 @@ HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) {
10196
11111
 
10197
11112
  // Truncates (rounds toward zero).
10198
11113
 
11114
+ #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
11115
+ #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
11116
+ #else
11117
+ #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
11118
+ #endif
11119
+
10199
11120
  #if HWY_HAVE_FLOAT16
10200
11121
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
11122
+ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
11123
+ #if HWY_COMPILER_GCC_ACTUAL
11124
+ // Workaround for undefined behavior in _mm_cvttph_epi16 if any values of v[i]
11125
+ // are not within the range of an int16_t
11126
+
11127
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
11128
+ HWY_HAVE_SCALAR_F16_TYPE
11129
+ if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
11130
+ typedef hwy::float16_t::Native GccF16RawVectType
11131
+ __attribute__((__vector_size__(16)));
11132
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
11133
+ return Dup128VecFromValues(
11134
+ D(), detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]),
11135
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
11136
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
11137
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
11138
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
11139
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
11140
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
11141
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]));
11142
+ }
11143
+ #endif
11144
+
11145
+ __m128i raw_result;
11146
+ __asm__("vcvttph2w {%1, %0|%0, %1}"
11147
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
11148
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
11149
+ :);
11150
+ return VFromD<D>{raw_result};
11151
+ #else // !HWY_COMPILER_GCC_ACTUAL
11152
+ return VFromD<D>{_mm_cvttph_epi16(v.raw)};
11153
+ #endif
11154
+ }
11155
+
11156
+ // F16 to I16 ConvertTo is generic for all vector lengths
11157
+ template <class D, HWY_IF_I16_D(D)>
10201
11158
  HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
10202
- return detail::FixConversionOverflow(
10203
- di, v, VFromD<RebindToSigned<D>>{_mm_cvttph_epi16(v.raw)});
11159
+ const RebindToFloat<decltype(di)> df;
11160
+ // See comment at the first occurrence of "IfThenElse(overflow,".
11161
+ const MFromD<D> overflow =
11162
+ RebindMask(di, Ge(v, Set(df, ConvertScalarTo<hwy::float16_t>(32768.0f))));
11163
+ return IfThenElse(overflow, Set(di, LimitsMax<int16_t>()),
11164
+ ConvertInRangeTo(di, v));
10204
11165
  }
11166
+
10205
11167
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
11168
+ HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
11169
+ #if HWY_COMPILER_GCC_ACTUAL
11170
+ // Workaround for undefined behavior in _mm_cvttph_epu16 if any values of v[i]
11171
+ // are not within the range of an uint16_t
11172
+
11173
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
11174
+ HWY_HAVE_SCALAR_F16_TYPE
11175
+ if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
11176
+ typedef hwy::float16_t::Native GccF16RawVectType
11177
+ __attribute__((__vector_size__(16)));
11178
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
11179
+ return Dup128VecFromValues(
11180
+ D(), detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0]),
11181
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1]),
11182
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2]),
11183
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3]),
11184
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4]),
11185
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5]),
11186
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6]),
11187
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7]));
11188
+ }
11189
+ #endif
11190
+
11191
+ __m128i raw_result;
11192
+ __asm__("vcvttph2uw {%1, %0|%0, %1}"
11193
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
11194
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
11195
+ :);
11196
+ return VFromD<D>{raw_result};
11197
+ #else // !HWY_COMPILER_GCC_ACTUAL
11198
+ return VFromD<D>{_mm_cvttph_epu16(v.raw)};
11199
+ #endif
11200
+ }
11201
+
11202
+ // F16->U16 ConvertTo is generic for all vector lengths
11203
+ template <class D, HWY_IF_U16_D(D)>
10206
11204
  HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
10207
- return VFromD<D>{
10208
- _mm_maskz_cvttph_epu16(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
11205
+ return ConvertInRangeTo(D(), ZeroIfNegative(v));
10209
11206
  }
10210
11207
  #endif // HWY_HAVE_FLOAT16
10211
11208
 
10212
11209
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
11210
+ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
11211
+ #if HWY_COMPILER_GCC_ACTUAL
11212
+ // Workaround for undefined behavior in _mm_cvttps_epi32 with GCC if any
11213
+ // values of v[i] are not within the range of an int32_t
11214
+
11215
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
11216
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
11217
+ typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
11218
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
11219
+ return Dup128VecFromValues(
11220
+ D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
11221
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
11222
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
11223
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]));
11224
+ }
11225
+ #endif
11226
+
11227
+ __m128i raw_result;
11228
+ __asm__("%vcvttps2dq {%1, %0|%0, %1}"
11229
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
11230
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
11231
+ :);
11232
+ return VFromD<D>{raw_result};
11233
+ #else // !HWY_COMPILER_GCC_ACTUAL
11234
+ return VFromD<D>{_mm_cvttps_epi32(v.raw)};
11235
+ #endif
11236
+ }
11237
+
11238
+ // F32 to I32 ConvertTo is generic for all vector lengths
11239
+ template <class D, HWY_IF_I32_D(D)>
10213
11240
  HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
10214
- return detail::FixConversionOverflow(
10215
- di, v, VFromD<RebindToSigned<D>>{_mm_cvttps_epi32(v.raw)});
11241
+ const RebindToFloat<decltype(di)> df;
11242
+ // See comment at the first occurrence of "IfThenElse(overflow,".
11243
+ const MFromD<D> overflow = RebindMask(di, Ge(v, Set(df, 2147483648.0f)));
11244
+ return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()),
11245
+ ConvertInRangeTo(di, v));
10216
11246
  }
10217
11247
 
10218
11248
  #if HWY_TARGET <= HWY_AVX3
10219
11249
  template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
11250
+ HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
11251
+ #if HWY_COMPILER_GCC_ACTUAL
11252
+ // Workaround for undefined behavior in _mm_cvttpd_epi64 with GCC if any
11253
+ // values of v[i] are not within the range of an int64_t
11254
+
11255
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
11256
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
11257
+ typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
11258
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
11259
+ return Dup128VecFromValues(
11260
+ DI(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
11261
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]));
11262
+ }
11263
+ #endif
11264
+
11265
+ __m128i raw_result;
11266
+ __asm__("vcvttpd2qq {%1, %0|%0, %1}"
11267
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
11268
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
11269
+ :);
11270
+ return VFromD<DI>{raw_result};
11271
+ #else // !HWY_COMPILER_GCC_ACTUAL
11272
+ return VFromD<DI>{_mm_cvttpd_epi64(v.raw)};
11273
+ #endif
11274
+ }
11275
+
11276
+ // F64 to I64 ConvertTo is generic for all vector lengths on AVX3
11277
+ template <class DI, HWY_IF_I64_D(DI)>
10220
11278
  HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) {
10221
- return detail::FixConversionOverflow(di, v,
10222
- VFromD<DI>{_mm_cvttpd_epi64(v.raw)});
11279
+ const RebindToFloat<decltype(di)> df;
11280
+ // See comment at the first occurrence of "IfThenElse(overflow,".
11281
+ const MFromD<DI> overflow =
11282
+ RebindMask(di, Ge(v, Set(df, 9.223372036854776e18)));
11283
+ return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()),
11284
+ ConvertInRangeTo(di, v));
10223
11285
  }
10224
11286
 
10225
11287
  template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
11288
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
11289
+ #if HWY_COMPILER_GCC_ACTUAL
11290
+ // Workaround for undefined behavior in _mm_cvttps_epu32 with GCC if any
11291
+ // values of v[i] are not within the range of an uint32_t
11292
+
11293
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
11294
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
11295
+ typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
11296
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
11297
+ return Dup128VecFromValues(
11298
+ DU(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
11299
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]),
11300
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2]),
11301
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3]));
11302
+ }
11303
+ #endif
11304
+
11305
+ __m128i raw_result;
11306
+ __asm__("vcvttps2udq {%1, %0|%0, %1}"
11307
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
11308
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
11309
+ :);
11310
+ return VFromD<DU>{raw_result};
11311
+ #else // !HWY_COMPILER_GCC_ACTUAL
11312
+ return VFromD<DU>{_mm_cvttps_epu32(v.raw)};
11313
+ #endif
11314
+ }
11315
+
11316
+ // F32->U32 ConvertTo is generic for all vector lengths
11317
+ template <class DU, HWY_IF_U32_D(DU)>
10226
11318
  HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
10227
- return VFromD<DU>{
10228
- _mm_maskz_cvttps_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
11319
+ return ConvertInRangeTo(DU(), ZeroIfNegative(v));
10229
11320
  }
10230
11321
 
10231
11322
  template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
11323
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
11324
+ #if HWY_COMPILER_GCC_ACTUAL
11325
+ // Workaround for undefined behavior in _mm_cvttpd_epu64 with GCC if any
11326
+ // values of v[i] are not within the range of a uint64_t
11327
+
11328
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
11329
+ if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
11330
+ typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
11331
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
11332
+ return Dup128VecFromValues(
11333
+ DU(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]),
11334
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1]));
11335
+ }
11336
+ #endif
11337
+
11338
+ __m128i raw_result;
11339
+ __asm__("vcvttpd2uqq {%1, %0|%0, %1}"
11340
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
11341
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
11342
+ :);
11343
+ return VFromD<DU>{raw_result};
11344
+ #else // !HWY_COMPILER_GCC_ACTUAL
11345
+ return VFromD<DU>{_mm_cvttpd_epu64(v.raw)};
11346
+ #endif
11347
+ }
11348
+
11349
+ // F64->U64 ConvertTo is generic for all vector lengths
11350
+ template <class DU, HWY_IF_U64_D(DU)>
10232
11351
  HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
10233
- return VFromD<DU>{
10234
- _mm_maskz_cvttpd_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
11352
+ return ConvertInRangeTo(DU(), ZeroIfNegative(v));
10235
11353
  }
10236
11354
 
10237
11355
  #else // AVX2 or below
10238
11356
 
10239
- template <class DU32, HWY_IF_V_SIZE_LE_D(DU32, 16), HWY_IF_U32_D(DU32)>
10240
- HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) {
11357
+ namespace detail {
11358
+
11359
+ template <class DU32, HWY_IF_U32_D(DU32)>
11360
+ static HWY_INLINE VFromD<DU32> ConvInRangeF32ToU32(
11361
+ DU32 du32, VFromD<RebindToFloat<DU32>> v, VFromD<DU32>& exp_diff) {
10241
11362
  const RebindToSigned<decltype(du32)> di32;
10242
11363
  const RebindToFloat<decltype(du32)> df32;
10243
11364
 
10244
- const auto non_neg_v = ZeroIfNegative(v);
10245
- const auto exp_diff = Set(di32, int32_t{158}) -
10246
- BitCast(di32, ShiftRight<23>(BitCast(du32, non_neg_v)));
11365
+ exp_diff = Set(du32, uint32_t{158}) - ShiftRight<23>(BitCast(du32, v));
10247
11366
  const auto scale_down_f32_val_mask =
10248
- BitCast(du32, VecFromMask(di32, Eq(exp_diff, Zero(di32))));
11367
+ VecFromMask(du32, Eq(exp_diff, Zero(du32)));
10249
11368
 
10250
- const auto v_scaled = BitCast(
10251
- df32, BitCast(du32, non_neg_v) + ShiftLeft<23>(scale_down_f32_val_mask));
10252
- const VFromD<decltype(du32)> f32_to_u32_result{
10253
- _mm_cvttps_epi32(v_scaled.raw)};
11369
+ const auto v_scaled =
11370
+ BitCast(df32, BitCast(du32, v) + ShiftLeft<23>(scale_down_f32_val_mask));
11371
+ const auto f32_to_u32_result =
11372
+ BitCast(du32, ConvertInRangeTo(di32, v_scaled));
10254
11373
 
10255
- return Or(
10256
- BitCast(du32, BroadcastSignBit(exp_diff)),
10257
- f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask));
11374
+ return f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask);
10258
11375
  }
10259
11376
 
10260
- #if HWY_ARCH_X86_64
10261
- template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
10262
- HWY_API VFromD<DI> ConvertTo(DI di, Vec64<double> v) {
10263
- const Vec64<int64_t> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
10264
- return detail::FixConversionOverflow(di, v, i0);
11377
+ } // namespace detail
11378
+
11379
+ // F32 to U32 ConvertInRangeTo is generic for all vector lengths on
11380
+ // SSE2/SSSE3/SSE4/AVX2
11381
+ template <class DU32, HWY_IF_U32_D(DU32)>
11382
+ HWY_API VFromD<DU32> ConvertInRangeTo(DU32 du32,
11383
+ VFromD<RebindToFloat<DU32>> v) {
11384
+ VFromD<DU32> exp_diff;
11385
+ const auto f32_to_u32_result = detail::ConvInRangeF32ToU32(du32, v, exp_diff);
11386
+ return f32_to_u32_result;
10265
11387
  }
10266
- template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
10267
- HWY_API VFromD<DI> ConvertTo(DI di, Vec128<double> v) {
10268
- const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
10269
- const Full64<double> dd2;
10270
- const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
10271
- return detail::FixConversionOverflow(
10272
- di, v, Vec128<int64_t>{_mm_unpacklo_epi64(i0, i1)});
11388
+
11389
+ // F32 to U32 ConvertTo is generic for all vector lengths on
11390
+ // SSE2/SSSE3/SSE4/AVX2
11391
+ template <class DU32, HWY_IF_U32_D(DU32)>
11392
+ HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) {
11393
+ const RebindToSigned<decltype(du32)> di32;
11394
+
11395
+ const auto non_neg_v = ZeroIfNegative(v);
11396
+ VFromD<DU32> exp_diff;
11397
+ const auto f32_to_u32_result =
11398
+ detail::ConvInRangeF32ToU32(du32, non_neg_v, exp_diff);
11399
+
11400
+ return Or(f32_to_u32_result,
11401
+ BitCast(du32, BroadcastSignBit(BitCast(di32, exp_diff))));
10273
11402
  }
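Note on this hunk: the pre-AVX3 path only has a signed truncating conversion, so detail::ConvInRangeF32ToU32 halves any input whose biased exponent is 158 (i.e. in [2^31, 2^32)) by decrementing the exponent field, converts, and doubles the result; the final Or with the broadcast sign of exp_diff then saturates lanes at or above 2^32 to all ones. A scalar sketch of the in-range part, assuming a non-negative finite input below 2^32 (helper name is hypothetical):

#include <cstdint>
#include <cstring>

// Illustrative scalar model of detail::ConvInRangeF32ToU32 above. Values in
// [2^31, 2^32) have biased exponent 158; they are halved by decrementing the
// exponent field, converted with the signed truncating conversion, and the
// result is doubled (exact, because such floats are multiples of 256).
inline uint32_t ScalarConvInRangeF32ToU32(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  const bool scale_down = (bits >> 23) == 158;  // sign bit assumed clear
  if (scale_down) {
    bits -= uint32_t{1} << 23;  // exponent - 1, i.e. v / 2
    std::memcpy(&v, &bits, sizeof(v));
  }
  const uint32_t half = static_cast<uint32_t>(static_cast<int32_t>(v));
  return scale_down ? half * 2u : half;
}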
10274
- #endif // HWY_ARCH_X86_64
10275
11403
 
10276
- #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
10277
- template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
10278
- HWY_IF_I64_D(DI)>
10279
- HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
10280
- using VI = VFromD<decltype(di)>;
10281
- const RebindToUnsigned<decltype(di)> du;
10282
- using VU = VFromD<decltype(du)>;
10283
- const Repartition<uint16_t, decltype(di)> du16;
10284
- const VI k1075 = Set(di, 1075); /* biased exponent of 2^52 */
11404
+ namespace detail {
11405
+
11406
+ template <class D64, HWY_IF_UI64_D(D64)>
11407
+ HWY_API VFromD<D64> ConvAbsInRangeF64ToUI64(D64 d64,
11408
+ VFromD<Rebind<double, D64>> v,
11409
+ VFromD<D64>& biased_exp) {
11410
+ const RebindToSigned<decltype(d64)> di64;
11411
+ const RebindToUnsigned<decltype(d64)> du64;
11412
+ using VU64 = VFromD<decltype(du64)>;
11413
+ const Repartition<uint16_t, decltype(di64)> du16;
11414
+ const VU64 k1075 = Set(du64, 1075); /* biased exponent of 2^52 */
10285
11415
 
10286
11416
  // Exponent indicates whether the number can be represented as int64_t.
10287
- const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF);
10288
- #if HWY_TARGET <= HWY_SSE4
10289
- const auto in_range = BitCast(di, biased_exp) < Set(di, 1086);
10290
- #else
10291
- const Repartition<int32_t, decltype(di)> di32;
10292
- const auto in_range = MaskFromVec(BitCast(
10293
- di,
10294
- VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086))));
10295
- #endif
11417
+ biased_exp = BitCast(d64, ShiftRight<52>(BitCast(du64, v)));
11418
+ HWY_IF_CONSTEXPR(IsSigned<TFromD<D64>>()) {
11419
+ biased_exp = And(biased_exp, Set(d64, TFromD<D64>{0x7FF}));
11420
+ }
10296
11421
 
10297
11422
  // If we were to cap the exponent at 51 and add 2^52, the number would be in
10298
11423
  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
@@ -10312,45 +11437,141 @@ HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
10312
11437
  // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
10313
11438
  // zero as the upper 48 bits of both k1075 and biased_exp are zero.
10314
11439
 
10315
- const VU shift_mnt = BitCast(
10316
- du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
10317
- const VU shift_int = BitCast(
10318
- du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
10319
- const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1);
11440
+ const VU64 shift_mnt = BitCast(
11441
+ du64, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
11442
+ const VU64 shift_int = BitCast(
11443
+ du64, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
11444
+ const VU64 mantissa = BitCast(du64, v) & Set(du64, (1ULL << 52) - 1);
10320
11445
  // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86
10321
11446
  // returning zero in that case.
10322
- const VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
11447
+ const VU64 int53 = (mantissa | Set(du64, 1ULL << 52)) >> shift_mnt;
10323
11448
 
10324
11449
  // For inputs larger than 2^53 - 1, insert zeros at the bottom.
10325
11450
 
10326
- // For inputs less than 2^63, the implicit 1-bit is guaranteed not to be
10327
- // shifted out of the left shift result below as shift_int[i] <= 10 is true
10328
- // for any inputs that are less than 2^63.
11451
+ // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be
11452
+ // shifted out of the left shift result below as shift_int[i] <= 11 is true
11453
+ // for any inputs that are less than 2^64.
11454
+
11455
+ return BitCast(d64, int53 << shift_int);
11456
+ }
11457
+
11458
+ } // namespace detail
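Note on this hunk: detail::ConvAbsInRangeF64ToUI64 avoids changing the MXCSR rounding mode by shifting the 53-bit significand into place, as the comments above describe. A scalar sketch of the same steps, assuming a non-negative finite input below 2^64 (helper name is illustrative):

#include <cstdint>
#include <cstring>

// Illustrative scalar model of detail::ConvAbsInRangeF64ToUI64 above: read
// the biased exponent, then shift the significand (mantissa plus implicit
// 1-bit) right for small values or left for values of 2^53 and above.
inline uint64_t ScalarAbsInRangeF64ToU64(double v) {
  uint64_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  const uint64_t biased_exp = (bits >> 52) & 0x7FF;
  const uint64_t k1075 = 1075;  // biased exponent of 2^52
  if (biased_exp + 52 < k1075) return 0;  // |v| < 1 truncates to zero
  const uint64_t shift_mnt = (k1075 > biased_exp) ? k1075 - biased_exp : 0;
  const uint64_t shift_int = (biased_exp > k1075) ? biased_exp - k1075 : 0;
  const uint64_t int53 =
      ((bits & ((1ULL << 52) - 1)) | (1ULL << 52)) >> shift_mnt;
  return int53 << shift_int;  // zeros inserted at the bottom for v >= 2^53
}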
11459
+
11460
+ #if HWY_ARCH_X86_64
11461
+
11462
+ namespace detail {
11463
+
11464
+ template <size_t N>
11465
+ static HWY_INLINE int64_t SSE2ConvFirstF64LaneToI64(Vec128<double, N> v) {
11466
+ #if HWY_COMPILER_GCC_ACTUAL
11467
+ // Workaround for undefined behavior in _mm_cvttsd_si64 with GCC if v[0] is
11468
+ // not within the range of an int64_t
11469
+
11470
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
11471
+ if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) {
11472
+ typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
11473
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
11474
+ return X86ConvertScalarFromFloat<int64_t>(raw_v[0]);
11475
+ }
11476
+ #endif
11477
+
11478
+ int64_t result;
11479
+ __asm__("%vcvttsd2si {%1, %0|%0, %1}"
11480
+ : "=r"(result)
11481
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
11482
+ :);
11483
+ return result;
11484
+ #else
11485
+ return _mm_cvttsd_si64(v.raw);
11486
+ #endif
11487
+ }
11488
+
11489
+ } // namespace detail
11490
+
11491
+ template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
11492
+ HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec64<double> v) {
11493
+ return VFromD<DI>{_mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v))};
11494
+ }
11495
+ template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
11496
+ HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec128<double> v) {
11497
+ const __m128i i0 = _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v));
11498
+ const Full64<double> dd2;
11499
+ const __m128i i1 =
11500
+ _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(UpperHalf(dd2, v)));
11501
+ return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)};
11502
+ }
11503
+
11504
+ template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
11505
+ HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
11506
+ const RebindToFloat<decltype(di)> df;
11507
+ // See comment at the first occurrence of "IfThenElse(overflow,".
11508
+ const MFromD<DI> overflow =
11509
+ RebindMask(di, Ge(v, Set(df, 9.223372036854776e18)));
11510
+ return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()),
11511
+ ConvertInRangeTo(di, v));
11512
+ }
11513
+ #endif // HWY_ARCH_X86_64
11514
+
11515
+ #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
11516
+ template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
11517
+ HWY_IF_I64_D(DI)>
11518
+ HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<Rebind<double, DI>> v) {
11519
+ using VI = VFromD<DI>;
11520
+
11521
+ VI biased_exp;
11522
+ const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
11523
+ const VI sign_mask = BroadcastSignBit(BitCast(di, v));
11524
+
11525
+ // If the input was negative, negate the integer (two's complement).
11526
+ return (shifted ^ sign_mask) - sign_mask;
11527
+ }
11528
+
11529
+ template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
11530
+ HWY_IF_I64_D(DI)>
11531
+ HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
11532
+ using VI = VFromD<DI>;
11533
+
11534
+ VI biased_exp;
11535
+ const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
10329
11536
 
10330
- const VU shifted = int53 << shift_int;
11537
+ #if HWY_TARGET <= HWY_SSE4
11538
+ const auto in_range = biased_exp < Set(di, 1086);
11539
+ #else
11540
+ const Repartition<int32_t, decltype(di)> di32;
11541
+ const auto in_range = MaskFromVec(BitCast(
11542
+ di,
11543
+ VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086))));
11544
+ #endif
10331
11545
 
10332
11546
  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
10333
11547
  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
10334
11548
  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
10335
- const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit);
11549
+ const VI magnitude = IfThenElse(in_range, shifted, limit);
10336
11550
 
10337
11551
  // If the input was negative, negate the integer (two's complement).
10338
11552
  return (magnitude ^ sign_mask) - sign_mask;
10339
11553
  }
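Note on this hunk: the last two statements use a branch-free conditional negate. sign_mask is zero for non-negative inputs and all ones for negative ones, so XOR-then-subtract produces the two's-complement negation exactly where needed. A one-line scalar illustration (function name is hypothetical):

#include <cstdint>

// (x ^ mask) - mask == x when mask is 0, and == -x when mask is ~0 (all ones).
inline int64_t ConditionalNegate(int64_t x, int64_t sign_mask) {
  return (x ^ sign_mask) - sign_mask;
}
// Example: ConditionalNegate(5, 0) == 5; ConditionalNegate(5, -1) == -5.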
10340
11554
  #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
10341
11555
 
11556
+ // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
11557
+ template <class DU, HWY_IF_U64_D(DU)>
11558
+ HWY_API VFromD<DU> ConvertInRangeTo(DU du, VFromD<Rebind<double, DU>> v) {
11559
+ VFromD<DU> biased_exp;
11560
+ const auto shifted = detail::ConvAbsInRangeF64ToUI64(du, v, biased_exp);
11561
+ return shifted;
11562
+ }
11563
+
10342
11564
  // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
10343
11565
  template <class DU, HWY_IF_U64_D(DU)>
10344
11566
  HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
10345
- const RebindToSigned<decltype(du)> di;
10346
- using VU = VFromD<decltype(du)>;
10347
- const Repartition<uint16_t, decltype(di)> du16;
10348
- const VU k1075 = Set(du, 1075); /* biased exponent of 2^52 */
11567
+ const RebindToSigned<DU> di;
11568
+ using VU = VFromD<DU>;
10349
11569
 
10350
- const auto non_neg_v = ZeroIfNegative(v);
11570
+ VU biased_exp;
11571
+ const VU shifted =
11572
+ detail::ConvAbsInRangeF64ToUI64(du, ZeroIfNegative(v), biased_exp);
10351
11573
 
10352
- // Exponent indicates whether the number can be represented as int64_t.
10353
- const VU biased_exp = ShiftRight<52>(BitCast(du, non_neg_v));
11574
+ // Exponent indicates whether the number can be represented as uint64_t.
10354
11575
  #if HWY_TARGET <= HWY_SSE4
10355
11576
  const VU out_of_range =
10356
11577
  BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086)));
@@ -10361,49 +11582,83 @@ HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
10361
11582
  VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) > Set(di32, 1086)));
10362
11583
  #endif
10363
11584
 
10364
- // If we were to cap the exponent at 51 and add 2^52, the number would be in
10365
- // [2^52, 2^53) and mantissa bits could be read out directly. We need to
10366
- // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
10367
- // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
10368
- // manually shift the mantissa into place (we already have many of the
10369
- // inputs anyway).
11585
+ return (shifted | out_of_range);
11586
+ }
11587
+ #endif // HWY_TARGET <= HWY_AVX3
10370
11588
 
10371
- // Use 16-bit saturated unsigned subtraction to compute shift_mnt and
10372
- // shift_int since biased_exp[i] is a non-negative integer that is less than
10373
- // or equal to 2047.
11589
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
11590
+ namespace detail {
10374
11591
 
10375
- // 16-bit saturated unsigned subtraction is also more efficient than a
10376
- // 64-bit subtraction followed by a 64-bit signed Max operation on
10377
- // SSE2/SSSE3/SSE4/AVX2.
11592
+ template <class TTo, class TF, HWY_IF_SIGNED(TTo)>
11593
+ static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CXX14_CONSTEXPR TTo
11594
+ X86ScalarNearestInt(TF flt_val) {
11595
+ #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS
11596
+ using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float,
11597
+ RemoveCvRef<TF>>;
11598
+ #else
11599
+ using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
11600
+ #endif
10378
11601
 
10379
- // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
10380
- // zero as the upper 48 bits of both k1075 and biased_exp are zero.
11602
+ const TTo trunc_int_val = X86ConvertScalarFromFloat<TTo>(flt_val);
11603
+ const TFArith abs_val_diff = ScalarAbs(
11604
+ ConvertScalarTo<TFArith>(ConvertScalarTo<TFArith>(flt_val) -
11605
+ ConvertScalarTo<TFArith>(trunc_int_val)));
11606
+ constexpr TFArith kHalf = ConvertScalarTo<TFArith>(0.5);
10381
11607
 
10382
- const VU shift_mnt = BitCast(
10383
- du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
10384
- const VU shift_int = BitCast(
10385
- du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
10386
- const VU mantissa = BitCast(du, non_neg_v) & Set(du, (1ULL << 52) - 1);
10387
- // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86
10388
- // returning zero in that case.
10389
- const VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
11608
+ const bool round_result_up =
11609
+ ((trunc_int_val ^ ScalarShr(trunc_int_val, sizeof(TTo) * 8 - 1)) !=
11610
+ LimitsMax<TTo>()) &&
11611
+ (abs_val_diff > kHalf ||
11612
+ (abs_val_diff == kHalf && (trunc_int_val & 1) != 0));
11613
+ return static_cast<TTo>(
11614
+ trunc_int_val +
11615
+ (round_result_up ? (ScalarSignBit(flt_val) ? (-1) : 1) : 0));
11616
+ }
10390
11617
 
10391
- // For inputs larger than 2^53 - 1, insert zeros at the bottom.
11618
+ } // namespace detail
11619
+ #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
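Note on this hunk: detail::X86ScalarNearestInt reproduces, in scalar code, the round-to-nearest-even behavior of the vector cvtps2dq/cvtpd2dq conversions, so the constant-folded path matches the SIMD result. A standalone sketch of just the rounding rule, assuming the input already fits in int32_t (the name and the omission of the saturation guard are illustrative):

#include <cmath>
#include <cstdint>

// Illustrative round-to-nearest, ties-to-even (the default x86 SSE rounding
// mode): round away from zero when the fraction exceeds one half, or when it
// equals one half and the truncated value is odd.
inline int32_t RoundHalfToEven(float v) {
  const float truncated = std::trunc(v);
  const float frac = std::fabs(v - truncated);
  int32_t result = static_cast<int32_t>(truncated);
  if (frac > 0.5f || (frac == 0.5f && (result & 1) != 0)) {
    result += (v < 0.0f) ? -1 : 1;
  }
  return result;
}
// Examples: RoundHalfToEven(2.5f) == 2, RoundHalfToEven(3.5f) == 4,
// RoundHalfToEven(-2.5f) == -2.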
10392
11620
 
10393
- // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be
10394
- // shifted out of the left shift result below as shift_int[i] <= 11 is true
10395
- // for any inputs that are less than 2^64.
11621
+ // If these are in namespace detail, the x86_256/512 templates are not found.
11622
+ template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I32_D(DI)>
11623
+ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
11624
+ #if HWY_COMPILER_GCC_ACTUAL
11625
+ // Workaround for undefined behavior in _mm_cvtps_epi32 with GCC if any values
11626
+ // of v[i] are not within the range of an int32_t
11627
+
11628
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
11629
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
11630
+ typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
11631
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
11632
+ return Dup128VecFromValues(DI(),
11633
+ detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
11634
+ detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
11635
+ detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
11636
+ detail::X86ScalarNearestInt<int32_t>(raw_v[3]));
11637
+ }
11638
+ #endif
10396
11639
 
10397
- const VU shifted = int53 << shift_int;
10398
- return (shifted | out_of_range);
11640
+ __m128i raw_result;
11641
+ __asm__("%vcvtps2dq {%1, %0|%0, %1}"
11642
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
11643
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
11644
+ :);
11645
+ return VFromD<DI>{raw_result};
11646
+ #else // !HWY_COMPILER_GCC_ACTUAL
11647
+ return VFromD<DI>{_mm_cvtps_epi32(v.raw)};
11648
+ #endif
10399
11649
  }
10400
- #endif // HWY_TARGET <= HWY_AVX3
10401
11650
 
10402
- template <size_t N>
10403
- HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
10404
- const RebindToSigned<DFromV<decltype(v)>> di;
10405
- return detail::FixConversionOverflow(
10406
- di, v, VFromD<decltype(di)>{_mm_cvtps_epi32(v.raw)});
11651
+ // Generic for all vector lengths.
11652
+ template <class VF, class DF = DFromV<VF>, class DI = RebindToSigned<DF>,
11653
+ HWY_IF_F32_D(DF)>
11654
+ HWY_API VFromD<DI> NearestInt(const VF v) {
11655
+ const DI di;
11656
+ // See comment at the first occurrence of "IfThenElse(overflow,".
11657
+ // Here we are rounding, whereas previous occurrences truncate, but there is
11658
+ // no difference because the previous float value is well below the max i32.
11659
+ const auto overflow = RebindMask(di, Ge(v, Set(DF(), 2147483648.0f)));
11660
+ return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()),
11661
+ NearestIntInRange(di, v));
10407
11662
  }
10408
11663
 
10409
11664
  // ------------------------------ Floating-point rounding (ConvertTo)
@@ -10447,7 +11702,7 @@ HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
10447
11702
  const DFromV<decltype(v)> df;
10448
11703
  const RebindToSigned<decltype(df)> di;
10449
11704
 
10450
- const auto integer = ConvertTo(di, v); // round toward 0
11705
+ const auto integer = ConvertInRangeTo(di, v); // round toward 0
10451
11706
  const auto int_f = ConvertTo(df, integer);
10452
11707
 
10453
11708
  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
@@ -10460,7 +11715,7 @@ HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
10460
11715
  const DFromV<decltype(v)> df;
10461
11716
  const RebindToSigned<decltype(df)> di;
10462
11717
 
10463
- const auto integer = ConvertTo(di, v); // round toward 0
11718
+ const auto integer = ConvertInRangeTo(di, v); // round toward 0
10464
11719
  const auto int_f = ConvertTo(df, integer);
10465
11720
 
10466
11721
  // Truncating a positive non-integer ends up smaller; if so, add 1.
@@ -10476,7 +11731,7 @@ HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
10476
11731
  const DFromV<decltype(v)> df;
10477
11732
  const RebindToSigned<decltype(df)> di;
10478
11733
 
10479
- const auto integer = ConvertTo(di, v); // round toward 0
11734
+ const auto integer = ConvertInRangeTo(di, v); // round toward 0
10480
11735
  const auto int_f = ConvertTo(df, integer);
10481
11736
 
10482
11737
  // Truncating a negative non-integer ends up larger; if so, subtract 1.
@@ -10584,6 +11839,16 @@ HWY_API Mask128<float16_t, N> IsNaN(const Vec128<float16_t, N> v) {
10584
11839
  _mm_fpclass_ph_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
10585
11840
  }
10586
11841
 
11842
+ template <size_t N>
11843
+ HWY_API Mask128<float16_t, N> IsEitherNaN(Vec128<float16_t, N> a,
11844
+ Vec128<float16_t, N> b) {
11845
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
11846
+ HWY_DIAGNOSTICS(push)
11847
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
11848
+ return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
11849
+ HWY_DIAGNOSTICS(pop)
11850
+ }
11851
+
10587
11852
  template <size_t N>
10588
11853
  HWY_API Mask128<float16_t, N> IsInf(const Vec128<float16_t, N> v) {
10589
11854
  return Mask128<float16_t, N>{_mm_fpclass_ph_mask(
@@ -10620,6 +11885,31 @@ HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
10620
11885
  #endif
10621
11886
  }
10622
11887
 
11888
+ #ifdef HWY_NATIVE_IS_EITHER_NAN
11889
+ #undef HWY_NATIVE_IS_EITHER_NAN
11890
+ #else
11891
+ #define HWY_NATIVE_IS_EITHER_NAN
11892
+ #endif
11893
+
11894
+ template <size_t N>
11895
+ HWY_API Mask128<float, N> IsEitherNaN(Vec128<float, N> a, Vec128<float, N> b) {
11896
+ #if HWY_TARGET <= HWY_AVX3
11897
+ return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
11898
+ #else
11899
+ return Mask128<float, N>{_mm_cmpunord_ps(a.raw, b.raw)};
11900
+ #endif
11901
+ }
11902
+
11903
+ template <size_t N>
11904
+ HWY_API Mask128<double, N> IsEitherNaN(Vec128<double, N> a,
11905
+ Vec128<double, N> b) {
11906
+ #if HWY_TARGET <= HWY_AVX3
11907
+ return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
11908
+ #else
11909
+ return Mask128<double, N>{_mm_cmpunord_pd(a.raw, b.raw)};
11910
+ #endif
11911
+ }
11912
+
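Note on this hunk: IsEitherNaN maps to the unordered comparison (_CMP_UNORD_Q / cmpunord), which is true in a lane exactly when at least one operand is NaN. A scalar statement of the predicate, for reference (name is illustrative):

#include <cmath>

// Per-lane meaning of IsEitherNaN: true iff a or b (or both) is NaN.
inline bool ScalarIsEitherNaN(double a, double b) {
  return std::isnan(a) || std::isnan(b);
}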
10623
11913
  #if HWY_TARGET <= HWY_AVX3
10624
11914
 
10625
11915
  // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
@@ -12016,6 +13306,31 @@ HWY_API Vec128<uint8_t> MaxOfLanes(D d, Vec128<uint8_t> v) {
12016
13306
 
12017
13307
  #endif // HWY_TARGET <= HWY_SSE4
12018
13308
 
13309
+ // ------------------------------ BitShuffle
13310
+ #if HWY_TARGET <= HWY_AVX3_DL
13311
+
13312
+ #ifdef HWY_NATIVE_BITSHUFFLE
13313
+ #undef HWY_NATIVE_BITSHUFFLE
13314
+ #else
13315
+ #define HWY_NATIVE_BITSHUFFLE
13316
+ #endif
13317
+
13318
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
13319
+ HWY_IF_V_SIZE_LE_V(V, 16),
13320
+ HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
13321
+ HWY_API V BitShuffle(V v, VI idx) {
13322
+ const DFromV<decltype(v)> d64;
13323
+ const RebindToUnsigned<decltype(d64)> du64;
13324
+ const Rebind<uint8_t, decltype(d64)> du8;
13325
+
13326
+ int32_t i32_bit_shuf_result = static_cast<int32_t>(
13327
+ static_cast<uint16_t>(_mm_bitshuffle_epi64_mask(v.raw, idx.raw)));
13328
+
13329
+ return BitCast(d64, PromoteTo(du64, VFromD<decltype(du8)>{_mm_cvtsi32_si128(
13330
+ i32_bit_shuf_result)}));
13331
+ }
13332
+ #endif // HWY_TARGET <= HWY_AVX3_DL
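Note on this hunk: BitShuffle wraps vpshufbitqmb. For each 64-bit lane, the eight index bytes each select one bit of that lane (only the low six bits of an index are used), and the selected bits are packed into the low byte of the result lane, which PromoteTo zero-extends to 64 bits. A scalar model of a single lane, assuming that semantics (function name is illustrative):

#include <cstdint>

// Illustrative single-lane model of BitShuffle above: result bit j is bit
// idx[j] (mod 64) of the source lane; the upper 56 bits of the result are 0.
inline uint64_t ScalarBitShuffleLane(uint64_t v, const uint8_t idx[8]) {
  uint64_t result = 0;
  for (int j = 0; j < 8; ++j) {
    result |= ((v >> (idx[j] & 63)) & 1u) << j;
  }
  return result;
}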
13333
+
12019
13334
  // ------------------------------ Lt128
12020
13335
 
12021
13336
  namespace detail {