@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.2

This diff shows the contents of publicly available package versions as released to the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (115)
  1. package/include/aom/aom_decoder.h +1 -1
  2. package/include/aom/aom_encoder.h +2 -0
  3. package/include/aom/aomcx.h +106 -25
  4. package/include/ffi.h +3 -3
  5. package/include/freetype2/freetype/config/ftconfig.h +1 -1
  6. package/include/freetype2/freetype/config/ftheader.h +1 -1
  7. package/include/freetype2/freetype/config/ftoption.h +37 -12
  8. package/include/freetype2/freetype/config/ftstdlib.h +1 -1
  9. package/include/freetype2/freetype/config/integer-types.h +29 -2
  10. package/include/freetype2/freetype/config/mac-support.h +1 -1
  11. package/include/freetype2/freetype/config/public-macros.h +3 -3
  12. package/include/freetype2/freetype/freetype.h +51 -47
  13. package/include/freetype2/freetype/ftadvanc.h +1 -1
  14. package/include/freetype2/freetype/ftbbox.h +1 -1
  15. package/include/freetype2/freetype/ftbdf.h +1 -1
  16. package/include/freetype2/freetype/ftbitmap.h +1 -1
  17. package/include/freetype2/freetype/ftbzip2.h +1 -1
  18. package/include/freetype2/freetype/ftcache.h +1 -1
  19. package/include/freetype2/freetype/ftcid.h +1 -1
  20. package/include/freetype2/freetype/ftcolor.h +13 -4
  21. package/include/freetype2/freetype/ftdriver.h +3 -3
  22. package/include/freetype2/freetype/fterrdef.h +1 -1
  23. package/include/freetype2/freetype/fterrors.h +1 -1
  24. package/include/freetype2/freetype/ftfntfmt.h +1 -1
  25. package/include/freetype2/freetype/ftgasp.h +1 -1
  26. package/include/freetype2/freetype/ftglyph.h +1 -1
  27. package/include/freetype2/freetype/ftgxval.h +1 -1
  28. package/include/freetype2/freetype/ftgzip.h +1 -1
  29. package/include/freetype2/freetype/ftimage.h +6 -2
  30. package/include/freetype2/freetype/ftincrem.h +1 -1
  31. package/include/freetype2/freetype/ftlcdfil.h +1 -1
  32. package/include/freetype2/freetype/ftlist.h +1 -1
  33. package/include/freetype2/freetype/ftlogging.h +184 -0
  34. package/include/freetype2/freetype/ftlzw.h +1 -1
  35. package/include/freetype2/freetype/ftmac.h +1 -1
  36. package/include/freetype2/freetype/ftmm.h +159 -103
  37. package/include/freetype2/freetype/ftmodapi.h +1 -1
  38. package/include/freetype2/freetype/ftmoderr.h +1 -1
  39. package/include/freetype2/freetype/ftotval.h +1 -1
  40. package/include/freetype2/freetype/ftoutln.h +1 -1
  41. package/include/freetype2/freetype/ftparams.h +1 -1
  42. package/include/freetype2/freetype/ftpfr.h +1 -1
  43. package/include/freetype2/freetype/ftrender.h +1 -1
  44. package/include/freetype2/freetype/ftsizes.h +1 -1
  45. package/include/freetype2/freetype/ftsnames.h +1 -1
  46. package/include/freetype2/freetype/ftstroke.h +1 -1
  47. package/include/freetype2/freetype/ftsynth.h +1 -1
  48. package/include/freetype2/freetype/ftsystem.h +1 -1
  49. package/include/freetype2/freetype/fttrigon.h +1 -1
  50. package/include/freetype2/freetype/fttypes.h +1 -1
  51. package/include/freetype2/freetype/ftwinfnt.h +2 -3
  52. package/include/freetype2/freetype/otsvg.h +1 -1
  53. package/include/freetype2/freetype/t1tables.h +1 -1
  54. package/include/freetype2/freetype/ttnameid.h +129 -129
  55. package/include/freetype2/freetype/tttables.h +8 -5
  56. package/include/freetype2/freetype/tttags.h +1 -1
  57. package/include/freetype2/ft2build.h +1 -1
  58. package/include/glib-2.0/gio/gdbuserror.h +9 -8
  59. package/include/glib-2.0/gio/ginetaddress.h +12 -0
  60. package/include/glib-2.0/gio/gioenums.h +9 -2
  61. package/include/glib-2.0/glib/gstring.h +2 -2
  62. package/include/glib-2.0/glib/gunicode.h +1 -1
  63. package/include/glib-2.0/gobject/glib-types.h +1 -1
  64. package/include/glib-2.0/gobject/gparam.h +1 -1
  65. package/include/glib-2.0/gobject/gvalue.h +78 -35
  66. package/include/harfbuzz/hb-script-list.h +12 -0
  67. package/include/harfbuzz/hb-version.h +3 -3
  68. package/include/hwy/abort.h +2 -19
  69. package/include/hwy/aligned_allocator.h +11 -7
  70. package/include/hwy/auto_tune.h +504 -0
  71. package/include/hwy/base.h +425 -104
  72. package/include/hwy/cache_control.h +16 -0
  73. package/include/hwy/detect_compiler_arch.h +32 -1
  74. package/include/hwy/detect_targets.h +251 -67
  75. package/include/hwy/foreach_target.h +35 -0
  76. package/include/hwy/highway.h +185 -76
  77. package/include/hwy/nanobenchmark.h +1 -19
  78. package/include/hwy/ops/arm_neon-inl.h +969 -458
  79. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  80. package/include/hwy/ops/emu128-inl.h +97 -11
  81. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  82. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  83. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  84. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  85. package/include/hwy/ops/rvv-inl.h +546 -51
  86. package/include/hwy/ops/scalar-inl.h +77 -22
  87. package/include/hwy/ops/set_macros-inl.h +138 -17
  88. package/include/hwy/ops/shared-inl.h +50 -10
  89. package/include/hwy/ops/wasm_128-inl.h +137 -92
  90. package/include/hwy/ops/x86_128-inl.h +773 -214
  91. package/include/hwy/ops/x86_256-inl.h +712 -255
  92. package/include/hwy/ops/x86_512-inl.h +429 -753
  93. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  94. package/include/hwy/per_target.h +2 -1
  95. package/include/hwy/profiler.h +622 -486
  96. package/include/hwy/targets.h +62 -20
  97. package/include/hwy/timer-inl.h +8 -160
  98. package/include/hwy/timer.h +170 -3
  99. package/include/hwy/x86_cpuid.h +81 -0
  100. package/include/libheif/heif_cxx.h +25 -5
  101. package/include/libheif/heif_regions.h +5 -5
  102. package/include/libheif/heif_version.h +2 -2
  103. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  104. package/include/libxml2/libxml/valid.h +0 -3
  105. package/include/libxml2/libxml/xmlerror.h +1 -1
  106. package/include/libxml2/libxml/xmlversion.h +4 -4
  107. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  108. package/include/pango-1.0/pango/pango-features.h +3 -3
  109. package/include/pango-1.0/pango/pango-font.h +30 -0
  110. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  111. package/include/vips/connection.h +4 -4
  112. package/include/vips/version.h +4 -4
  113. package/include/zlib.h +3 -3
  114. package/package.json +1 -1
  115. package/versions.json +13 -13
@@ -46,6 +46,36 @@ HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
  #include <f16cintrin.h>
  #include <fmaintrin.h>
  #include <smmintrin.h>
+
+ #if HWY_TARGET <= HWY_AVX10_2
+ #include <avx512bitalgintrin.h>
+ #include <avx512bwintrin.h>
+ #include <avx512cdintrin.h>
+ #include <avx512dqintrin.h>
+ #include <avx512fintrin.h>
+ #include <avx512vbmi2intrin.h>
+ #include <avx512vbmiintrin.h>
+ #include <avx512vbmivlintrin.h>
+ #include <avx512vlbitalgintrin.h>
+ #include <avx512vlbwintrin.h>
+ #include <avx512vlcdintrin.h>
+ #include <avx512vldqintrin.h>
+ #include <avx512vlintrin.h>
+ #include <avx512vlvbmi2intrin.h>
+ #include <avx512vlvnniintrin.h>
+ #include <avx512vnniintrin.h>
+ #include <avx512vpopcntdqintrin.h>
+ #include <avx512vpopcntdqvlintrin.h>
+ // Must come after avx512fintrin, else will not define 512-bit intrinsics.
+ #include <avx512fp16intrin.h>
+ #include <avx512vlfp16intrin.h>
+ #include <gfniintrin.h>
+ #include <vaesintrin.h>
+ #include <vpclmulqdqintrin.h>
+
+ #endif // HWY_TARGET <= HWY_AVX10_2
+
+ // clang-format on
  #endif // HWY_COMPILER_CLANGCL

  // For half-width vectors. Already includes base.h.
@@ -117,67 +147,90 @@ class Vec256 {
  Raw raw;
  };

- #if HWY_TARGET <= HWY_AVX3
-
  namespace detail {

+ #if HWY_TARGET <= HWY_AVX3
+
  // Template arg: sizeof(lane type)
  template <size_t size>
- struct RawMask256 {};
+ struct RawMask256T {};
  template <>
- struct RawMask256<1> {
+ struct RawMask256T<1> {
  using type = __mmask32;
  };
  template <>
- struct RawMask256<2> {
+ struct RawMask256T<2> {
  using type = __mmask16;
  };
  template <>
- struct RawMask256<4> {
+ struct RawMask256T<4> {
  using type = __mmask8;
  };
  template <>
- struct RawMask256<8> {
+ struct RawMask256T<8> {
  using type = __mmask8;
  };

+ template <typename T>
+ using RawMask256 = typename RawMask256T<sizeof(T)>::type;
+
+ #else // AVX2 or earlier
+
+ template <typename T>
+ using RawMask256 = typename Raw256<T>::type;
+
+ #endif // HWY_TARGET <= HWY_AVX3
+
  } // namespace detail

  template <typename T>
  struct Mask256 {
- using Raw = typename detail::RawMask256<sizeof(T)>::type;
+ using Raw = typename detail::RawMask256<T>;

+ using PrivateT = T; // only for DFromM
+ static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromM
+
+ #if HWY_TARGET <= HWY_AVX3
  static Mask256<T> FromBits(uint64_t mask_bits) {
  return Mask256<T>{static_cast<Raw>(mask_bits)};
  }
+ #else
+ // Lanes are either FF..FF or 0.
+ #endif // HWY_TARGET <= HWY_AVX3

  Raw raw;
  };

- #else // AVX2
-
- // FF..FF or 0.
  template <typename T>
- struct Mask256 {
- typename detail::Raw256<T>::type raw;
- };
-
- #endif // AVX2
+ using Full256 = Simd<T, 32 / sizeof(T), 0>;

- #if HWY_TARGET <= HWY_AVX3
- namespace detail {
+ // ------------------------------ Zero

- // Used by Expand() emulation, which is required for both AVX3 and AVX2.
- template <typename T>
- HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
- return mask.raw;
+ // Cannot use VFromD here because it is defined in terms of Zero.
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
+ HWY_API Vec256<TFromD<D>> Zero(D /* tag */) {
+ return Vec256<TFromD<D>>{_mm256_setzero_si256()};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
+ HWY_API Vec256<bfloat16_t> Zero(D /* tag */) {
+ return Vec256<bfloat16_t>{_mm256_setzero_si256()};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
+ HWY_API Vec256<float16_t> Zero(D /* tag */) {
+ #if HWY_HAVE_FLOAT16
+ return Vec256<float16_t>{_mm256_setzero_ph()};
+ #else
+ return Vec256<float16_t>{_mm256_setzero_si256()};
+ #endif // HWY_HAVE_FLOAT16
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+ HWY_API Vec256<float> Zero(D /* tag */) {
+ return Vec256<float>{_mm256_setzero_ps()};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_API Vec256<double> Zero(D /* tag */) {
+ return Vec256<double>{_mm256_setzero_pd()};
  }
-
- } // namespace detail
- #endif // HWY_TARGET <= HWY_AVX3
-
- template <typename T>
- using Full256 = Simd<T, 32 / sizeof(T), 0>;

  // ------------------------------ BitCast

@@ -250,34 +303,6 @@ HWY_API VFromD<D> BitCast(D d, Vec256<FromT> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
  }

- // ------------------------------ Zero
-
- // Cannot use VFromD here because it is defined in terms of Zero.
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
- HWY_API Vec256<TFromD<D>> Zero(D /* tag */) {
- return Vec256<TFromD<D>>{_mm256_setzero_si256()};
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
- HWY_API Vec256<bfloat16_t> Zero(D /* tag */) {
- return Vec256<bfloat16_t>{_mm256_setzero_si256()};
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
- HWY_API Vec256<float16_t> Zero(D /* tag */) {
- #if HWY_HAVE_FLOAT16
- return Vec256<float16_t>{_mm256_setzero_ph()};
- #else
- return Vec256<float16_t>{_mm256_setzero_si256()};
- #endif // HWY_HAVE_FLOAT16
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
- HWY_API Vec256<float> Zero(D /* tag */) {
- return Vec256<float>{_mm256_setzero_ps()};
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
- HWY_API Vec256<double> Zero(D /* tag */) {
- return Vec256<double>{_mm256_setzero_pd()};
- }
-
  // ------------------------------ Set

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
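The Zero overloads deleted in this hunk are the ones re-added verbatim earlier in the file (above BitCast, previous hunk); the move is behavior-neutral. A minimal usage sketch of the op, assuming a 256-bit x86 target and the conventional hn namespace alias (my example, not part of the diff):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    void ZeroExample() {
      const hn::Full256<float> d;  // 256-bit descriptor ("tag")
      const auto v = hn::Zero(d);  // lowers to _mm256_setzero_ps here
      (void)v;
    }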
@@ -1761,6 +1786,68 @@ HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
  return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
  }

+ // ------------------------------ MinNumber and MaxNumber
+
+ #if HWY_X86_HAVE_AVX10_2_OPS
+
+ #if HWY_HAVE_FLOAT16
+ HWY_API Vec256<float16_t> MinNumber(Vec256<float16_t> a, Vec256<float16_t> b) {
+ return Vec256<float16_t>{_mm256_minmax_ph(a.raw, b.raw, 0x14)};
+ }
+ #endif
+ HWY_API Vec256<float> MinNumber(Vec256<float> a, Vec256<float> b) {
+ return Vec256<float>{_mm256_minmax_ps(a.raw, b.raw, 0x14)};
+ }
+ HWY_API Vec256<double> MinNumber(Vec256<double> a, Vec256<double> b) {
+ return Vec256<double>{_mm256_minmax_pd(a.raw, b.raw, 0x14)};
+ }
+
+ #if HWY_HAVE_FLOAT16
+ HWY_API Vec256<float16_t> MaxNumber(Vec256<float16_t> a, Vec256<float16_t> b) {
+ return Vec256<float16_t>{_mm256_minmax_ph(a.raw, b.raw, 0x15)};
+ }
+ #endif
+ HWY_API Vec256<float> MaxNumber(Vec256<float> a, Vec256<float> b) {
+ return Vec256<float>{_mm256_minmax_ps(a.raw, b.raw, 0x15)};
+ }
+ HWY_API Vec256<double> MaxNumber(Vec256<double> a, Vec256<double> b) {
+ return Vec256<double>{_mm256_minmax_pd(a.raw, b.raw, 0x15)};
+ }
+
+ #endif
+
+ // ------------------------------ MinMagnitude and MaxMagnitude
+
+ #if HWY_X86_HAVE_AVX10_2_OPS
+
+ #if HWY_HAVE_FLOAT16
+ HWY_API Vec256<float16_t> MinMagnitude(Vec256<float16_t> a,
+ Vec256<float16_t> b) {
+ return Vec256<float16_t>{_mm256_minmax_ph(a.raw, b.raw, 0x16)};
+ }
+ #endif
+ HWY_API Vec256<float> MinMagnitude(Vec256<float> a, Vec256<float> b) {
+ return Vec256<float>{_mm256_minmax_ps(a.raw, b.raw, 0x16)};
+ }
+ HWY_API Vec256<double> MinMagnitude(Vec256<double> a, Vec256<double> b) {
+ return Vec256<double>{_mm256_minmax_pd(a.raw, b.raw, 0x16)};
+ }
+
+ #if HWY_HAVE_FLOAT16
+ HWY_API Vec256<float16_t> MaxMagnitude(Vec256<float16_t> a,
+ Vec256<float16_t> b) {
+ return Vec256<float16_t>{_mm256_minmax_ph(a.raw, b.raw, 0x17)};
+ }
+ #endif
+ HWY_API Vec256<float> MaxMagnitude(Vec256<float> a, Vec256<float> b) {
+ return Vec256<float>{_mm256_minmax_ps(a.raw, b.raw, 0x17)};
+ }
+ HWY_API Vec256<double> MaxMagnitude(Vec256<double> a, Vec256<double> b) {
+ return Vec256<double>{_mm256_minmax_pd(a.raw, b.raw, 0x17)};
+ }
+
+ #endif
+
  // ------------------------------ Iota

  namespace detail {
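The 0x14-0x17 immediates passed to the new AVX 10.2 _mm256_minmax_* intrinsics select minimum-number, maximum-number, minimum-magnitude and maximum-magnitude semantics per IEEE 754-2019, under which a quiet NaN loses against a number. A behavioral sketch (my example, assuming a target where HWY_X86_HAVE_AVX10_2_OPS is set, and the hn alias from above):

    const hn::Full256<float> d;
    const auto num = hn::Set(d, 2.0f);
    const auto lo = hn::MinNumber(hn::NaN(d), num);            // 2.0f in every lane
    const auto mag = hn::MaxMagnitude(hn::Set(d, -3.0f), num); // -3.0f (larger |x|)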
@@ -1952,6 +2039,47 @@ HWY_API Vec256<double> AddSub(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{_mm256_addsub_pd(a.raw, b.raw)};
  }

+ // ------------------------------ PairwiseAdd128/PairwiseSub128
+
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
+ HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return VFromD<D>{_mm256_hadd_epi16(a.raw, b.raw)};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
+ HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ const DFromV<decltype(a)> d;
+ const RebindToSigned<decltype(d)> di;
+ return BitCast(d,
+ Neg(BitCast(di, VFromD<D>{_mm256_hsub_epi16(a.raw, b.raw)})));
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return VFromD<D>{_mm256_hadd_epi32(a.raw, b.raw)};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ const DFromV<decltype(a)> d;
+ const RebindToSigned<decltype(d)> di;
+ return BitCast(d,
+ Neg(BitCast(di, VFromD<D>{_mm256_hsub_epi32(a.raw, b.raw)})));
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+ HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return VFromD<D>{_mm256_hadd_ps(a.raw, b.raw)};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+ HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return Neg(VFromD<D>{_mm256_hsub_ps(a.raw, b.raw)});
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return VFromD<D>{_mm256_hadd_pd(a.raw, b.raw)};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
+ return Neg(VFromD<D>{_mm256_hsub_pd(a.raw, b.raw)});
+ }
+
  // ------------------------------ SumsOf8
  HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
  return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
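These lower to the SSSE3-style horizontal add/subtract instructions, which operate independently within each 128-bit block: adjacent lane pairs of a fill a block's lower half, pairs of b its upper half. PairwiseSub128 additionally negates the hsub result, so it returns odd-minus-even differences. A worked sketch (mine, not from the diff):

    const hn::Full256<int32_t> d;
    const auto a = hn::Iota(d, 0);  // 0,1,2,...,7
    const auto b = hn::Iota(d, 8);  // 8,9,...,15
    // Block 0: a0+a1, a2+a3, b0+b1, b2+b3; block 1 likewise for lanes 4..7.
    const auto s = hn::PairwiseAdd128(d, a, b);  // 1,5,17,21, 9,13,25,29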
@@ -2146,6 +2274,11 @@ HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
  }
+ #if HWY_TARGET <= HWY_AVX3
+ HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
+ return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
+ }
+ #endif

  // Signed
  HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
@@ -2154,6 +2287,11 @@ HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
  HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
  }
+ #if HWY_TARGET <= HWY_AVX3
+ HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
+ return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
+ }
+ #endif

  // Returns the upper 16 bits of a * b in each lane.
  HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
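_mm256_mullo_epi64 (vpmullq) requires AVX-512DQ+VL, hence the AVX3 guard; on plain AVX2, 64-bit lane multiplication presumably continues to use the generic emulation. Call sites are unchanged either way (sketch, assuming an AVX3 target):

    const hn::Full256<uint64_t> d;
    const auto p = hn::Set(d, uint64_t{3}) * hn::Set(d, uint64_t{5});  // 15 per lane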
@@ -2377,29 +2515,37 @@ HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
  return ShiftRight<31>(v);
  }

+ #if HWY_TARGET <= HWY_AVX3
+
+ template <int kBits>
+ HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
+ return Vec256<int64_t>{
+ _mm256_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))};
+ }
+
+ HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
+ return ShiftRight<63>(v);
+ }
+
+ #else // AVX2
+
+ // Unlike above, this will be used to implement int64_t ShiftRight.
  HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
- #if HWY_TARGET == HWY_AVX2
  const DFromV<decltype(v)> d;
  return VecFromMask(v < Zero(d));
- #else
- return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)};
- #endif
  }

  template <int kBits>
  HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
- #if HWY_TARGET <= HWY_AVX3
- return Vec256<int64_t>{
- _mm256_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))};
- #else
  const Full256<int64_t> di;
  const Full256<uint64_t> du;
  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
  return right | sign;
- #endif
  }

+ #endif // #if HWY_TARGET <= HWY_AVX3
+
  // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
  HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
  Vec256<int8_t> no) {
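AVX2 lacks a 64-bit arithmetic right shift, so the fallback assembles one from a logical shift plus re-inserted sign bits. A scalar model of the identity the vector code computes (my sketch, assuming 0 < kBits < 64):

    int64_t ArithShiftRight(int64_t x, int kBits) {
      const uint64_t logical = static_cast<uint64_t>(x) >> kBits;  // top bits zero
      const uint64_t sign = (x < 0 ? ~uint64_t{0} : uint64_t{0}) << (64 - kBits);
      return static_cast<int64_t>(logical | sign);  // sign bits restored
    }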
@@ -2459,6 +2605,10 @@ HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,

  // ------------------------------ ShiftLeftSame

+ // Disable sign conversion warnings for GCC debug intrinsics.
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+
  HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
  const int bits) {
  #if HWY_COMPILER_GCC
@@ -2606,6 +2756,8 @@ HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
  return (shifted ^ shifted_sign) - shifted_sign;
  }

+ HWY_DIAGNOSTICS(pop)
+
  // ------------------------------ Neg (Xor, Sub)

  // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
@@ -2651,6 +2803,25 @@ HWY_API Vec256<double> operator*(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{_mm256_mul_pd(a.raw, b.raw)};
  }

+ #if HWY_TARGET <= HWY_AVX3
+
+ #if HWY_HAVE_FLOAT16
+ HWY_API Vec256<float16_t> MulByFloorPow2(Vec256<float16_t> a,
+ Vec256<float16_t> b) {
+ return Vec256<float16_t>{_mm256_scalef_ph(a.raw, b.raw)};
+ }
+ #endif
+
+ HWY_API Vec256<float> MulByFloorPow2(Vec256<float> a, Vec256<float> b) {
+ return Vec256<float>{_mm256_scalef_ps(a.raw, b.raw)};
+ }
+
+ HWY_API Vec256<double> MulByFloorPow2(Vec256<double> a, Vec256<double> b) {
+ return Vec256<double>{_mm256_scalef_pd(a.raw, b.raw)};
+ }
+
+ #endif // HWY_TARGET <= HWY_AVX3
+
  #if HWY_HAVE_FLOAT16
  HWY_API Vec256<float16_t> operator/(Vec256<float16_t> a, Vec256<float16_t> b) {
  return Vec256<float16_t>{_mm256_div_ph(a.raw, b.raw)};
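vscalef computes a * 2^floor(b) in a single instruction. A semantics sketch (mine, assuming an AVX3 target):

    const hn::Full256<float> d;
    const auto r = hn::MulByFloorPow2(hn::Set(d, 3.0f), hn::Set(d, 2.5f));
    // floor(2.5) = 2, so r = 3 * 2^2 = 12.0f in every lane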
@@ -2680,6 +2851,27 @@ HWY_API Vec256<double> ApproximateReciprocal(Vec256<double> v) {
  }
  #endif

+ // ------------------------------ GetExponent
+
+ #if HWY_TARGET <= HWY_AVX3
+
+ #if HWY_HAVE_FLOAT16
+ template <class V, HWY_IF_F16(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
+ HWY_API V GetExponent(V v) {
+ return V{_mm256_getexp_ph(v.raw)};
+ }
+ #endif
+ template <class V, HWY_IF_F32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
+ HWY_API V GetExponent(V v) {
+ return V{_mm256_getexp_ps(v.raw)};
+ }
+ template <class V, HWY_IF_F64(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
+ HWY_API V GetExponent(V v) {
+ return V{_mm256_getexp_pd(v.raw)};
+ }
+
+ #endif
+
  // ------------------------------ MaskedMinOr

  #if HWY_TARGET <= HWY_AVX3
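vgetexp returns each lane's unbiased IEEE exponent as a float, i.e. floor(log2|x|) for finite nonzero x. Sketch under the same AVX3 assumption:

    const hn::Full256<float> d;
    const auto e = hn::GetExponent(hn::Set(d, 10.0f));  // 3.0f (2^3 <= 10 < 2^4)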
@@ -4170,48 +4362,130 @@ HWY_API Vec256<double> Broadcast(const Vec256<double> v) {
  return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)};
  }

- // ------------------------------ BroadcastBlock
-
- template <int kBlockIdx, class T>
- HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
- static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
- const DFromV<decltype(v)> d;
- return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
- : ConcatUpperUpper(d, v, v);
- }
-
- // ------------------------------ BroadcastLane
-
- namespace detail {
+ // ------------------------------ Concat blocks (LowerHalf, ZeroExtendVector)

- template <class T, HWY_IF_T_SIZE(T, 1)>
- HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
- Vec256<T> v) {
- const Half<DFromV<decltype(v)>> dh;
- return Vec256<T>{_mm256_broadcastb_epi8(LowerHalf(dh, v).raw)};
- }
+ // _mm256_broadcastsi128_si256 has 7 cycle latency on ICL.
+ // _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no
+ // extra cost) for LowerLower and UpperLower.

- template <class T, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
- Vec256<T> v) {
- const DFromV<decltype(v)> d;
+ // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
+ HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du; // for float16_t
- const Half<decltype(d)> dh;
- const RebindToUnsigned<decltype(dh)> dh_u;
- return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastw_epi16(
- BitCast(dh_u, LowerHalf(dh, v)).raw)});
+ const Half<decltype(d)> d2;
+ const RebindToUnsigned<decltype(d2)> du2; // for float16_t
+ return BitCast(
+ d, VFromD<decltype(du)>{_mm256_inserti128_si256(
+ BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)});
  }
-
- template <class T, HWY_IF_UI32(T)>
- HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
- Vec256<T> v) {
- const Half<DFromV<decltype(v)>> dh;
- return Vec256<T>{_mm256_broadcastd_epi32(LowerHalf(dh, v).raw)};
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+ HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi,
+ Vec256<float> lo) {
+ const Half<decltype(d)> d2;
+ return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi,
+ Vec256<double> lo) {
+ const Half<decltype(d)> d2;
+ return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
  }

- template <class T, HWY_IF_UI64(T)>
- HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
- Vec256<T> v) {
+ // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
+ HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+ BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)});
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+ HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi,
+ Vec256<float> lo) {
+ return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi,
+ Vec256<double> lo) {
+ return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
+ }
+
+ // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
+ HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
+ return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
+ BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)});
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+ HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi,
+ Vec256<float> lo) {
+ return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi,
+ Vec256<double> lo) {
+ return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
+ }
+
+ // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
+ HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
+ return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
+ BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)});
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
+ HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi,
+ Vec256<float> lo) {
+ return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+ HWY_API Vec256<double> ConcatUpperUpper(D /* tag */, Vec256<double> hi,
+ Vec256<double> lo) {
+ return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
+ }
+
+ // ------------------------------ BroadcastBlock
+ template <int kBlockIdx, class T>
+ HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
+ static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
+ const DFromV<decltype(v)> d;
+ return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
+ : ConcatUpperUpper(d, v, v);
+ }
+
+ // ------------------------------ BroadcastLane
+
+ namespace detail {
+
+ template <class T, HWY_IF_T_SIZE(T, 1)>
+ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
+ Vec256<T> v) {
+ const Half<DFromV<decltype(v)>> dh;
+ return Vec256<T>{_mm256_broadcastb_epi8(LowerHalf(dh, v).raw)};
+ }
+
+ template <class T, HWY_IF_T_SIZE(T, 2)>
+ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
+ Vec256<T> v) {
+ const DFromV<decltype(v)> d;
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
+ const Half<decltype(d)> dh;
+ const RebindToUnsigned<decltype(dh)> dh_u;
+ return BitCast(d, VFromD<decltype(du)>{_mm256_broadcastw_epi16(
+ BitCast(dh_u, LowerHalf(dh, v)).raw)});
+ }
+
+ template <class T, HWY_IF_UI32(T)>
+ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
+ Vec256<T> v) {
+ const Half<DFromV<decltype(v)>> dh;
+ return Vec256<T>{_mm256_broadcastd_epi32(LowerHalf(dh, v).raw)};
+ }
+
+ template <class T, HWY_IF_UI64(T)>
+ HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
+ Vec256<T> v) {
  const Half<DFromV<decltype(v)>> dh;
  return Vec256<T>{_mm256_broadcastq_epi64(LowerHalf(dh, v).raw)};
  }
@@ -4651,6 +4925,18 @@ HWY_API Vec256<float> SwapAdjacentBlocks(Vec256<float> v) {
  return BitCast(d, SwapAdjacentBlocks(BitCast(dw, v)));
  }

+ // ------------------------------ InterleaveEvenBlocks (ConcatLowerLower)
+ template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
+ return ConcatLowerLower(d, b, a);
+ }
+
+ // ------------------------------ InterleaveOddBlocks (ConcatUpperUpper)
+ template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API V InterleaveOddBlocks(D d, V a, V b) {
+ return ConcatUpperUpper(d, b, a);
+ }
+
  // ------------------------------ Reverse (RotateRight)

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
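Writing the 128-bit blocks of a and b as aH,aL and bH,bL: InterleaveEvenBlocks(d, a, b) yields bL,aL and InterleaveOddBlocks(d, a, b) yields bH,aH, which is exactly what the two Concat* calls with swapped arguments produce.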
@@ -4807,89 +5093,6 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{_mm256_unpackhi_pd(a.raw, b.raw)};
  }

- // ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
-
- // _mm256_broadcastsi128_si256 has 7 cycle latency on ICL.
- // _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no
- // extra cost) for LowerLower and UpperLower.
-
- // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
- HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
- const RebindToUnsigned<decltype(d)> du; // for float16_t
- const Half<decltype(d)> d2;
- const RebindToUnsigned<decltype(d2)> du2; // for float16_t
- return BitCast(
- d, VFromD<decltype(du)>{_mm256_inserti128_si256(
- BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)});
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
- HWY_API Vec256<float> ConcatLowerLower(D d, Vec256<float> hi,
- Vec256<float> lo) {
- const Half<decltype(d)> d2;
- return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
- HWY_API Vec256<double> ConcatLowerLower(D d, Vec256<double> hi,
- Vec256<double> lo) {
- const Half<decltype(d)> d2;
- return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
- }
-
- // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
- HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
- BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)});
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
- HWY_API Vec256<float> ConcatLowerUpper(D /* tag */, Vec256<float> hi,
- Vec256<float> lo) {
- return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
- HWY_API Vec256<double> ConcatLowerUpper(D /* tag */, Vec256<double> hi,
- Vec256<double> lo) {
- return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
- }
-
- // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
- HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
- const RebindToUnsigned<decltype(d)> du; // for float16_t
- return BitCast(d, VFromD<decltype(du)>{_mm256_blend_epi32(
- BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)});
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
- HWY_API Vec256<float> ConcatUpperLower(D /* tag */, Vec256<float> hi,
- Vec256<float> lo) {
- return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
- HWY_API Vec256<double> ConcatUpperLower(D /* tag */, Vec256<double> hi,
- Vec256<double> lo) {
- return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
- }
-
- // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
- HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
- const RebindToUnsigned<decltype(d)> du; // for float16_t
- return BitCast(d, VFromD<decltype(du)>{_mm256_permute2x128_si256(
- BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)});
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
- HWY_API Vec256<float> ConcatUpperUpper(D /* tag */, Vec256<float> hi,
- Vec256<float> lo) {
- return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
- }
- template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
- HWY_API Vec256<double> ConcatUpperUpper(D /* tag */, Vec256<double> hi,
- Vec256<double> lo) {
- return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
- }
-
  // ---------------------------- InsertBlock (ConcatLowerLower, ConcatUpperLower)
  template <int kBlockIdx, class T>
  HWY_API Vec256<T> InsertBlock(Vec256<T> v, Vec128<T> blk_to_insert) {
@@ -6133,6 +6336,19 @@ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
  }

  // ------------------------------ WidenMulPairwiseAdd
+
+ #if HWY_NATIVE_DOT_BF16
+
+ template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 32),
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+ HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
+ return VFromD<DF>{_mm256_dpbf16_ps(Zero(df).raw,
+ reinterpret_cast<__m256bh>(a.raw),
+ reinterpret_cast<__m256bh>(b.raw))};
+ }
+
+ #endif // HWY_NATIVE_DOT_BF16
+
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
  HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
  Vec256<int16_t> b) {
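Where HWY_NATIVE_DOT_BF16 is set, the bf16 overload becomes a single vdpbf16ps with a zero accumulator: each f32 output lane is the dot product of one adjacent bf16 pair. Usage sketch (mine; hwy::ConvertScalarTo is assumed for the scalar conversion):

    const hn::Full256<float> df;
    const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf;
    const auto a = hn::Set(dbf, hwy::ConvertScalarTo<hwy::bfloat16_t>(1.5f));
    const auto b = hn::Set(dbf, hwy::ConvertScalarTo<hwy::bfloat16_t>(2.0f));
    const auto r = hn::WidenMulPairwiseAdd(df, a, b);  // 1.5*2 + 1.5*2 = 6.0f per lane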
@@ -6291,7 +6507,9 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<int8_t> v) {
  #if HWY_TARGET <= HWY_AVX3
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
  HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
- #if HWY_COMPILER_GCC_ACTUAL
+ #if HWY_X86_HAVE_AVX10_2_OPS
+ return VFromD<D>{_mm256_cvtts_ps_epi64(v.raw)};
+ #elif HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior with GCC if any values of v[i] are not
  // within the range of an int64_t

@@ -6319,7 +6537,9 @@ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
  }
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
  HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
- #if HWY_COMPILER_GCC_ACTUAL
+ #if HWY_X86_HAVE_AVX10_2_OPS
+ return VFromD<D>{_mm256_cvtts_ps_epu64(v.raw)};
+ #elif HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior with GCC if any values of v[i] are not
  // within the range of an uint64_t
  #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
@@ -6666,6 +6886,31 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  _MM_SHUFFLE(3, 1, 2, 0))};
  }

+ #if HWY_TARGET <= HWY_AVX3
+ template <class D, HWY_IF_V_SIZE_D(D, HWY_MAX_BYTES), HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a,
+ VFromD<Repartition<int64_t, D>> b) {
+ const Half<decltype(dn)> dnh;
+ return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
+ }
+
+ template <class D, HWY_IF_V_SIZE_D(D, HWY_MAX_BYTES), HWY_IF_U32_D(D)>
+ HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
+ VFromD<Repartition<uint64_t, D>> b) {
+ const Half<decltype(dn)> dnh;
+ return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a));
+ }
+
+ template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
+ HWY_IF_V_SIZE_GT_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+ HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
+ HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2),
+ HWY_IF_T_SIZE_V(V, 8)>
+ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
+ return ReorderDemote2To(d, a, b);
+ }
+ #endif
+
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
  return VFromD<D>{_mm256_cvtpd_ps(v.raw)};
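The new 64-bit to 32-bit ReorderDemote2To builds its result from two DemoteTo halves combined in order, so lane order is preserved and the matching OrderedDemote2To overload can simply forward to it. Sketch (mine, assuming an AVX3 target):

    const hn::Full256<int32_t> dn;
    const hn::Repartition<int64_t, decltype(dn)> dw;
    const auto lo = hn::Iota(dw, 0);                  // 0,1,2,3
    const auto hi = hn::Iota(dw, 4);                  // 4,5,6,7
    const auto n = hn::OrderedDemote2To(dn, lo, hi);  // 0..7 as i32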
@@ -6673,7 +6918,9 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
  HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
- #if HWY_COMPILER_GCC_ACTUAL
+ #if HWY_X86_HAVE_AVX10_2_OPS
+ return VFromD<D>{_mm256_cvtts_pd_epi32(v.raw)};
+ #elif HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior in _mm256_cvttpd_epi32 with GCC if any
  // values of v[i] are not within the range of an int32_t

@@ -6703,7 +6950,9 @@ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
  #if HWY_TARGET <= HWY_AVX3
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
  HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec256<double> v) {
- #if HWY_COMPILER_GCC_ACTUAL
+ #if HWY_X86_HAVE_AVX10_2_OPS
+ return VFromD<D>{_mm256_cvtts_pd_epu32(v.raw)};
+ #elif HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior in _mm256_cvttpd_epu32 with GCC if any
  // values of v[i] are not within the range of an uint32_t

@@ -6998,7 +7247,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
  HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float> v) {
- #if HWY_COMPILER_GCC_ACTUAL
+ #if HWY_X86_HAVE_AVX10_2_OPS
+ return VFromD<D>{_mm256_cvtts_ps_epi32(v.raw)};
+ #elif HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior in _mm256_cvttps_epi32 with GCC if any
  // values of v[i] are not within the range of an int32_t

@@ -7032,7 +7283,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec256<float> v) {
  #if HWY_TARGET <= HWY_AVX3
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
  HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec256<double> v) {
- #if HWY_COMPILER_GCC_ACTUAL
+ #if HWY_X86_HAVE_AVX10_2_OPS
+ return VFromD<D>{_mm256_cvtts_pd_epi64(v.raw)};
+ #elif HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior in _mm256_cvttpd_epi64 with GCC if any
  // values of v[i] are not within the range of an int64_t

@@ -7060,7 +7313,9 @@ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec256<double> v) {
  }
  template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
  HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
- #if HWY_COMPILER_GCC_ACTUAL
+ #if HWY_X86_HAVE_AVX10_2_OPS
+ return VFromD<DU>{_mm256_cvtts_ps_epu32(v.raw)};
+ #elif HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior in _mm256_cvttps_epu32 with GCC if any
  // values of v[i] are not within the range of an uint32_t

@@ -7100,7 +7355,9 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
  }
  template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
  HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
- #if HWY_COMPILER_GCC_ACTUAL
+ #if HWY_X86_HAVE_AVX10_2_OPS
+ return VFromD<DU>{_mm256_cvtts_pd_epu64(v.raw)};
+ #elif HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior in _mm256_cvttpd_epu64 with GCC if any
  // values of v[i] are not within the range of an uint64_t

@@ -7133,7 +7390,8 @@ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
  #endif // HWY_TARGET <= HWY_AVX3

  template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I32_D(DI)>
- HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI,
+ VFromD<RebindToFloat<DI>> v) {
  #if HWY_COMPILER_GCC_ACTUAL
  // Workaround for undefined behavior in _mm256_cvtps_epi32 if any values of
  // v[i] are not within the range of an int32_t
@@ -7165,6 +7423,113 @@ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
  #endif // HWY_COMPILER_GCC_ACTUAL
  }

+ #if HWY_HAVE_FLOAT16
+ template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I16_D(DI)>
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*d*/, Vec256<float16_t> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvtph_epi16 with GCC if any
+ // values of v[i] are not within the range of an int16_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+ HWY_HAVE_SCALAR_F16_TYPE
+ if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
+ typedef hwy::float16_t::Native GccF16RawVectType
+ __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+ return VFromD<DI>{
+ _mm256_setr_epi16(detail::X86ScalarNearestInt<int16_t>(raw_v[0]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[1]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[2]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[3]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[4]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[5]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[6]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[7]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[8]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[9]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[10]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[11]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[12]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[13]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[14]),
+ detail::X86ScalarNearestInt<int16_t>(raw_v[15]))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvtph2w {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<DI>{raw_result};
+ #else // HWY_COMPILER_GCC_ACTUAL
+ return VFromD<DI>{_mm256_cvtph_epi16(v.raw)};
+ #endif
+ }
+ #endif
+
+ #if HWY_TARGET <= HWY_AVX3
+ template <class DI, HWY_IF_V_SIZE_D(DI, 32), HWY_IF_I64_D(DI)>
+ static HWY_INLINE VFromD<DI> NearestIntInRange(DI,
+ VFromD<RebindToFloat<DI>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvtpd_epi64 with GCC if any
+ // values of v[i] are not within the range of an int64_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+ typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+ return VFromD<DI>{
+ _mm256_setr_epi64x(detail::X86ScalarNearestInt<int64_t>(raw_v[0]),
+ detail::X86ScalarNearestInt<int64_t>(raw_v[1]),
+ detail::X86ScalarNearestInt<int64_t>(raw_v[2]),
+ detail::X86ScalarNearestInt<int64_t>(raw_v[3]))};
+ }
+ #endif
+
+ __m256i raw_result;
+ __asm__("vcvtpd2qq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<DI>{raw_result};
+ #else // !HWY_COMPILER_GCC_ACTUAL
+ return VFromD<DI>{_mm256_cvtpd_epi64(v.raw)};
+ #endif // HWY_COMPILER_GCC_ACTUAL
+ }
+ #endif // HWY_TARGET <= HWY_AVX3
+
+ template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I32_D(DI)>
+ static HWY_INLINE VFromD<DI> DemoteToNearestIntInRange(
+ DI, VFromD<Rebind<double, DI>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm256_cvtpd_epi32 with GCC if any
+ // values of v[i] are not within the range of an int32_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+ typedef double GccF32RawVectType __attribute__((__vector_size__(32)));
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+ return Dup128VecFromValues(DI(),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[3]));
+ }
+ #endif
+
+ __m128i raw_result;
+ __asm__("vcvtpd2dq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<DI>{raw_result};
+ #else // !HWY_COMPILER_GCC_ACTUAL
+ return VFromD<DI>{_mm256_cvtpd_epi32(v.raw)};
+ #endif
+ }
+
  #ifndef HWY_DISABLE_F16C

  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
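The new float16 and double cases follow the same two-pronged GCC workaround as the existing f32/f64 ones: if the input is a compile-time constant, convert lane by lane in scalar code so that GCC's constant folding never evaluates the UB-prone intrinsic; otherwise emit the conversion instruction via inline asm, which GCC treats as opaque.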
@@ -7592,26 +7957,22 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  return detail::LoadMaskBits256<TFromD<D>>(mask_bits);
  }

- // ------------------------------ StoreMaskBits
-
- namespace detail {
+ // ------------------------------ BitsFromMask

- template <typename T, HWY_IF_T_SIZE(T, 1)>
- HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
- const Full256<T> d;
- const Full256<uint8_t> d8;
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
+ const RebindToUnsigned<D> d8;
  const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
  // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
  return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
  }

- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  #if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2)
- const Full256<T> d;
- const Full256<uint8_t> d8;
+ const Repartition<uint8_t, D> d8;
  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
- const uint64_t sign_bits8 = BitsFromMask(mask8);
+ const uint64_t sign_bits8 = BitsFromMask(d8, mask8);
  // Skip the bits from the lower byte of each u16 (better not to use the
  // same packs_epi16 as SSE4, because that requires an extra swizzle here).
  return _pext_u32(static_cast<uint32_t>(sign_bits8), 0xAAAAAAAAu);
@@ -7627,32 +7988,29 @@ HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
  #endif // HWY_ARCH_X86_64
  }

- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
- const Full256<T> d;
- const Full256<float> df;
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
+ const RebindToFloat<D> df;
  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
  return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
  }

- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
- const Full256<T> d;
- const Full256<double> df;
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 32)>
+ HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
+ const RebindToFloat<D> df;
  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
  return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
  }

- } // namespace detail
-
+ // ------------------------------ StoreMaskBits
  // `p` points to at least 8 writable bytes.
  template <class D, HWY_IF_V_SIZE_D(D, 32)>
  HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
- constexpr size_t N = Lanes(d);
- constexpr size_t kNumBytes = (N + 7) / 8;
+ HWY_LANES_CONSTEXPR size_t N = Lanes(d);
+ HWY_LANES_CONSTEXPR size_t kNumBytes = (N + 7) / 8;

- const uint64_t mask_bits = detail::BitsFromMask(mask);
- CopyBytes<kNumBytes>(&mask_bits, bits);
+ const uint64_t mask_bits = BitsFromMask(d, mask);
+ CopyBytes(&mask_bits, bits, kNumBytes);
  return kNumBytes;
  }

@@ -7664,59 +8022,59 @@ template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API bool AllFalse(D d, MFromD<D> mask) {
   const Repartition<uint8_t, decltype(d)> d8;
   const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  return detail::BitsFromMask(mask8) == 0;
+  return BitsFromMask(d8, mask8) == 0;
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
-HWY_API bool AllFalse(D /* tag */, MFromD<D> mask) {
+HWY_API bool AllFalse(D d, MFromD<D> mask) {
   // Cheaper than PTEST, which is 2 uop / 3L.
-  return detail::BitsFromMask(mask) == 0;
+  return BitsFromMask(d, mask) == 0;
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API bool AllTrue(D d, MFromD<D> mask) {
   const Repartition<uint8_t, decltype(d)> d8;
   const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  return detail::BitsFromMask(mask8) == (1ull << 32) - 1;
+  return BitsFromMask(d8, mask8) == (1ull << 32) - 1;
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
 HWY_API bool AllTrue(D d, MFromD<D> mask) {
-  constexpr uint64_t kAllBits = (1ull << Lanes(d)) - 1;
-  return detail::BitsFromMask(mask) == kAllBits;
+  constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1;
+  return BitsFromMask(d, mask) == kAllBits;
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API size_t CountTrue(D d, MFromD<D> mask) {
   const Repartition<uint8_t, decltype(d)> d8;
   const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  return PopCount(detail::BitsFromMask(mask8)) >> 1;
+  return PopCount(BitsFromMask(d8, mask8)) >> 1;
 }
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 2)>
-HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
-  return PopCount(detail::BitsFromMask(mask));
+HWY_API size_t CountTrue(D d, MFromD<D> mask) {
+  return PopCount(BitsFromMask(d, mask));
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD<D> mask) {
-  const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
+HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD<D> mask) {
-  const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
+HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD<D> mask) {
-  const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
+HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API intptr_t FindLastTrue(D /* tag */, MFromD<D> mask) {
-  const uint32_t mask_bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
+HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
+  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
                    : -1;
 }
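Note: the pattern repeated throughout this hunk replaces the internal detail::BitsFromMask(mask) with the public, descriptor-taking BitsFromMask(d, mask), so the previously unused `/* tag */` parameters become `d`. A hedged sketch of how these ops are typically consumed (reusing the `hn` alias from the first sketch; FindValue is an illustrative name; the scalar remainder loop is omitted):

    // Returns the index of the first lane of p[0, n) equal to `key`, or -1.
    intptr_t FindValue(const int32_t* HWY_RESTRICT p, size_t n, int32_t key) {
      const hn::ScalableTag<int32_t> d;
      const size_t N = hn::Lanes(d);
      const auto vkey = hn::Set(d, key);
      for (size_t i = 0; i + N <= n; i += N) {
        // FindFirstTrue is one of the ops above, now built on
        // BitsFromMask(d, mask).
        const auto m = hn::Eq(hn::LoadU(d, p + i), vkey);
        const intptr_t pos = hn::FindFirstTrue(d, m);
        if (pos >= 0) return static_cast<intptr_t>(i) + pos;
      }
      return -1;
    }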
@@ -7969,12 +8327,14 @@ HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
 
 template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
-  return detail::Compress(v, detail::BitsFromMask(m));
+  const DFromV<decltype(v)> d;
+  return detail::Compress(v, BitsFromMask(d, m));
 }
 
 template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
 HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
-  return detail::CompressNot(v, detail::BitsFromMask(m));
+  const DFromV<decltype(v)> d;
+  return detail::CompressNot(v, BitsFromMask(d, m));
 }
 
 HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
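Note: Compress and CompressNot receive no descriptor argument, so they now materialize one via DFromV in order to call the same public BitsFromMask(d, m). For orientation, a minimal stream-compaction loop built on the closely related CompressStore that appears below (reusing the `hn` alias from the first sketch; CopyIfPositive is an illustrative name; remainder handling omitted):

    // Writes the positive elements of `in` contiguously to `out` and
    // returns how many were written.
    size_t CopyIfPositive(const float* HWY_RESTRICT in, size_t n,
                          float* HWY_RESTRICT out) {
      const hn::ScalableTag<float> d;
      const size_t N = hn::Lanes(d);
      size_t written = 0;
      for (size_t i = 0; i + N <= n; i += N) {
        const auto v = hn::LoadU(d, in + i);
        // CompressStore packs the lanes selected by the mask and
        // returns their count.
        written +=
            hn::CompressStore(v, hn::Gt(v, hn::Zero(d)), d, out + written);
      }
      return written;
    }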
@@ -8002,7 +8362,7 @@ HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
                              TFromD<D>* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits = detail::BitsFromMask(m);
+  const uint64_t mask_bits = BitsFromMask(d, m);
   const size_t count = PopCount(mask_bits);
   StoreU(detail::Compress(v, mask_bits), d, unaligned);
   detail::MaybeUnpoison(unaligned, count);
@@ -8013,7 +8373,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 32),
           HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits = detail::BitsFromMask(m);
+  const uint64_t mask_bits = BitsFromMask(d, m);
   const size_t count = PopCount(mask_bits);
 
   const RebindToUnsigned<decltype(d)> du;
@@ -8040,7 +8400,7 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits = detail::BitsFromMask(m);
+  const uint64_t mask_bits = BitsFromMask(d, m);
   const size_t count = PopCount(mask_bits);
   const VFromD<D> compressed = detail::Compress(v, mask_bits);
 
@@ -8059,11 +8419,11 @@ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                  D d, TFromD<D>* HWY_RESTRICT unaligned) {
-  constexpr size_t N = Lanes(d);
-  constexpr size_t kNumBytes = (N + 7) / 8;
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
+  HWY_LANES_CONSTEXPR size_t kNumBytes = (N + 7) / 8;
 
   uint64_t mask_bits = 0;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
+  CopyBytes(bits, &mask_bits, kNumBytes);
 
   if (N < 8) {
     mask_bits &= (1ull << N) - 1;
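Note: Lanes(d) is a compile-time constant only on fixed-width targets; on scalable targets (SVE, RVV) it is a run-time value, so HWY_LANES_CONSTEXPR expands to constexpr only where that is legal. The same reasoning turns the CopyBytes<kNumBytes>() template form into a run-time CopyBytes(..., kNumBytes) call. A minimal sketch of the idiom (hypothetical MaskByteCount helper, reusing the `hn` alias from the first sketch):

    // Bytes needed to store one bit per lane; a constant where Lanes() is.
    template <class D>
    size_t MaskByteCount(D d) {
      HWY_LANES_CONSTEXPR size_t N = hn::Lanes(d);
      return (N + 7) / 8;  // round up to whole bytes
    }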
@@ -8157,7 +8517,7 @@ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
   // LUTs are infeasible for so many mask combinations, so Combine two
   // half-vector Expand.
   const Half<decltype(d)> dh;
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
+  const uint64_t mask_bits = BitsFromMask(d, mask);
   constexpr size_t N = 32 / sizeof(T);
   const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1));
   const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
@@ -8211,7 +8571,7 @@ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
   return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
 #else
   const RebindToUnsigned<decltype(d)> du;
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
+  const uint64_t mask_bits = BitsFromMask(d, mask);
 
   alignas(16) constexpr uint32_t packed_array[256] = {
       // PrintExpand32x8Nibble.
@@ -8280,7 +8640,7 @@ HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
   return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
 #else
   const RebindToUnsigned<decltype(d)> du;
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
+  const uint64_t mask_bits = BitsFromMask(d, mask);
 
   alignas(16) constexpr uint64_t packed_array[16] = {
       // PrintExpand64x4Nibble.
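Note: where the native AVX-512 VBMI2 expand is unavailable, Expand falls back to nibble-packed index tables selected by mask_bits (the packed_array tables generated by PrintExpand32x8Nibble / PrintExpand64x4Nibble). A scalar reference model of what Expand computes, for orientation only:

    // out[i] receives the next unconsumed lane of v if m[i] is set,
    // otherwise zero; this mirrors SIMD Expand lane-for-lane.
    void ExpandScalar(const float* v, const bool* m, size_t n, float* out) {
      size_t src = 0;
      for (size_t i = 0; i < n; ++i) {
        out[i] = m[i] ? v[src++] : 0.0f;
      }
    }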
@@ -8354,7 +8714,7 @@ namespace detail {
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                                    VFromD<D>& A, VFromD<D>& B, VFromD<D>& C) {
-  constexpr size_t N = Lanes(d);
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);  // 1 0
   const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
   const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
@@ -8378,7 +8738,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void LoadTransposedBlocks4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                                    VFromD<D>& vA, VFromD<D>& vB, VFromD<D>& vC,
                                    VFromD<D>& vD) {
-  constexpr size_t N = Lanes(d);
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);
   const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
   const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
@@ -8405,7 +8765,7 @@ namespace detail {
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void StoreTransposedBlocks2(VFromD<D> i, VFromD<D> j, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-  constexpr size_t N = Lanes(d);
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const auto out0 = ConcatLowerLower(d, j, i);
   const auto out1 = ConcatUpperUpper(d, j, i);
   StoreU(out0, d, unaligned + 0 * N);
@@ -8423,7 +8783,7 @@ HWY_API void StoreTransposedBlocks2(VFromD<D> i, VFromD<D> j, D d,
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void StoreTransposedBlocks3(VFromD<D> i, VFromD<D> j, VFromD<D> k, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-  constexpr size_t N = Lanes(d);
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   const auto out0 = ConcatLowerLower(d, j, i);
   const auto out1 = ConcatUpperLower(d, i, k);
   const auto out2 = ConcatUpperUpper(d, k, j);
@@ -8446,7 +8806,7 @@ template <class D, HWY_IF_V_SIZE_D(D, 32)>
 HWY_API void StoreTransposedBlocks4(VFromD<D> i, VFromD<D> j, VFromD<D> k,
                                     VFromD<D> l, D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
-  constexpr size_t N = Lanes(d);
+  HWY_LANES_CONSTEXPR size_t N = Lanes(d);
   // Write lower halves, then upper.
   const auto out0 = ConcatLowerLower(d, j, i);
   const auto out1 = ConcatLowerLower(d, l, k);
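Note: these detail::LoadTransposedBlocksN / StoreTransposedBlocksN helpers are the 256-bit building blocks behind the public LoadInterleavedN / StoreInterleavedN ops; the change here is again only constexpr → HWY_LANES_CONSTEXPR. Typical use of the public API (sketch reusing the `hn` alias from the first sketch; SplitRGB is an illustrative name; the tail loop is omitted):

    // De-interleaves packed RGB into three planes, N pixels at a time.
    void SplitRGB(const uint8_t* HWY_RESTRICT rgb, size_t num_pixels,
                  uint8_t* HWY_RESTRICT r, uint8_t* HWY_RESTRICT g,
                  uint8_t* HWY_RESTRICT b) {
      const hn::ScalableTag<uint8_t> d;
      const size_t N = hn::Lanes(d);
      for (size_t i = 0; i + N <= num_pixels; i += N) {
        hn::Vec<decltype(d)> vr, vg, vb;
        hn::LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
        hn::StoreU(vr, d, r + i);
        hn::StoreU(vg, d, g + i);
        hn::StoreU(vb, d, b + i);
      }
    }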
@@ -8464,7 +8824,7 @@ HWY_API void StoreTransposedBlocks4(VFromD<D> i, VFromD<D> j, VFromD<D> k,
 #if HWY_TARGET <= HWY_AVX3
 template <class T>
 HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
-  constexpr size_t N = Lanes(Full256<T>());
+  constexpr size_t N = MaxLanes(Full256<T>());
   constexpr uint32_t kActiveElemMask =
       static_cast<uint32_t>((uint64_t{1} << N) - 1);
   return Mask256<T>{static_cast<typename Mask256<T>::Raw>(
@@ -8472,7 +8832,7 @@ HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
 }
 template <class T>
 HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
-  constexpr size_t N = Lanes(Full256<T>());
+  constexpr size_t N = MaxLanes(Full256<T>());
   constexpr uint32_t kActiveElemMask =
       static_cast<uint32_t>((uint64_t{1} << N) - 1);
   return Mask256<T>{static_cast<typename Mask256<T>::Raw>(
@@ -8480,7 +8840,7 @@ HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
 }
 template <class T>
 HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
-  constexpr size_t N = Lanes(Full256<T>());
+  constexpr size_t N = MaxLanes(Full256<T>());
   constexpr uint32_t kActiveElemMask =
       static_cast<uint32_t>((uint64_t{1} << N) - 1);
   return Mask256<T>{static_cast<typename Mask256<T>::Raw>(
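Note: for the fixed-size Full256<T> descriptor, Lanes() and MaxLanes() return the same value, but only MaxLanes() is guaranteed usable in a constant expression, which the constexpr mask constants here require. A compile-time check of that assumption (valid only on targets where Full256 is defined; reuses the `hn` alias from the first sketch):

    // MaxLanes is constexpr on every target; 256 bits hold 8 x uint32_t.
    static_assert(hn::MaxLanes(hn::Full256<uint32_t>()) == 8,
                  "8 lanes of 32 bits in a 256-bit vector");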
@@ -8579,6 +8939,24 @@ HWY_API V BitShuffle(V v, VI idx) {
 }
 #endif  // HWY_TARGET <= HWY_AVX3_DL
 
+// ------------------------------ MultiRotateRight
+
+#if HWY_TARGET <= HWY_AVX3_DL
+
+#ifdef HWY_NATIVE_MULTIROTATERIGHT
+#undef HWY_NATIVE_MULTIROTATERIGHT
+#else
+#define HWY_NATIVE_MULTIROTATERIGHT
+#endif
+
+template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+          HWY_IF_V_SIZE_V(V, 32), HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
+HWY_API V MultiRotateRight(V v, VI idx) {
+  return V{_mm256_multishift_epi64_epi8(idx.raw, v.raw)};
+}
+
+#endif
+
 // ------------------------------ LeadingZeroCount
 
 #if HWY_TARGET <= HWY_AVX3
8591
8969
  HWY_API V LeadingZeroCount(V v) {
8592
8970
  return V{_mm256_lzcnt_epi64(v.raw)};
8593
8971
  }
8972
+
8973
+ namespace detail {
8974
+
8975
+ template <class V, HWY_IF_UNSIGNED_V(V),
8976
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
8977
+ HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
8978
+ static HWY_INLINE HWY_MAYBE_UNUSED V Lzcnt32ForU8OrU16OrU32(V v) {
8979
+ const DFromV<decltype(v)> d;
8980
+ const Rebind<int32_t, decltype(d)> di32;
8981
+ const Rebind<uint32_t, decltype(d)> du32;
8982
+
8983
+ const auto v_lz_count = LeadingZeroCount(PromoteTo(du32, v));
8984
+ return DemoteTo(d, BitCast(di32, v_lz_count));
8985
+ }
8986
+
8987
+ template <class V, HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 4)>
8988
+ static HWY_INLINE HWY_MAYBE_UNUSED V Lzcnt32ForU8OrU16OrU32(V v) {
8989
+ return LeadingZeroCount(v);
8990
+ }
8991
+
8992
+ template <class V, HWY_IF_UNSIGNED_V(V),
8993
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
8994
+ HWY_IF_LANES_GT_D(DFromV<V>, HWY_MAX_BYTES / 4)>
8995
+ static HWY_INLINE HWY_MAYBE_UNUSED V Lzcnt32ForU8OrU16OrU32(V v) {
8996
+ const DFromV<decltype(v)> d;
8997
+ const RepartitionToWide<decltype(d)> dw;
8998
+ const RebindToSigned<decltype(dw)> dw_i;
8999
+
9000
+ const auto lo_v_lz_count = Lzcnt32ForU8OrU16OrU32(PromoteLowerTo(dw, v));
9001
+ const auto hi_v_lz_count = Lzcnt32ForU8OrU16OrU32(PromoteUpperTo(dw, v));
9002
+ return OrderedDemote2To(d, BitCast(dw_i, lo_v_lz_count),
9003
+ BitCast(dw_i, hi_v_lz_count));
9004
+ }
9005
+
9006
+ } // namespace detail
9007
+
9008
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
9009
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
9010
+ HWY_API V LeadingZeroCount(V v) {
9011
+ const DFromV<decltype(v)> d;
9012
+ const RebindToUnsigned<decltype(d)> du;
9013
+ using TU = TFromD<decltype(du)>;
9014
+
9015
+ constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
9016
+ const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16OrU32(BitCast(du, v));
9017
+ return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}),
9018
+ Set(du, TU{kNumOfBitsInT})));
9019
+ }
9020
+
9021
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
9022
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
9023
+ HWY_API V HighestSetBitIndex(V v) {
9024
+ const DFromV<decltype(v)> d;
9025
+ const RebindToUnsigned<decltype(d)> du;
9026
+ using TU = TFromD<decltype(du)>;
9027
+ return BitCast(
9028
+ d, Set(du, TU{31}) - detail::Lzcnt32ForU8OrU16OrU32(BitCast(du, v)));
9029
+ }
9030
+
9031
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
9032
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
9033
+ HWY_API V HighestSetBitIndex(V v) {
9034
+ const DFromV<decltype(v)> d;
9035
+ using T = TFromD<decltype(d)>;
9036
+ return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
9037
+ }
9038
+
9039
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
9040
+ HWY_API V TrailingZeroCount(V v) {
9041
+ const DFromV<decltype(v)> d;
9042
+ const RebindToSigned<decltype(d)> di;
9043
+ using T = TFromD<decltype(d)>;
9044
+
9045
+ const auto vi = BitCast(di, v);
9046
+ const auto lowest_bit = BitCast(d, And(vi, Neg(vi)));
9047
+ constexpr T kNumOfBitsInT{sizeof(T) * 8};
9048
+ const auto bit_idx = HighestSetBitIndex(lowest_bit);
9049
+ return IfThenElse(MaskFromVec(bit_idx), Set(d, kNumOfBitsInT), bit_idx);
9050
+ }
8594
9051
  #endif // HWY_TARGET <= HWY_AVX3
8595
9052
 
8596
9053
  // NOLINTNEXTLINE(google-readability-namespace-comments)
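Note: the new TrailingZeroCount relies on the two's-complement identity v & -v, which isolates the lowest set bit, then reuses HighestSetBitIndex. For a zero lane the isolated value is 0, HighestSetBitIndex yields all-ones (-1), whose set sign bit makes MaskFromVec true, so IfThenElse substitutes the lane width, matching scalar tzcnt semantics. A scalar model of one 32-bit lane:

    // Scalar model of the TrailingZeroCount lane computation above.
    int TrailingZeroCount32(uint32_t x) {
      if (x == 0) return 32;                 // the IfThenElse fallback
      const uint32_t lowest = x & (0u - x);  // isolate lowest set bit
      int idx = 0;                           // HighestSetBitIndex(lowest)
      while ((lowest >> idx) != 1u) ++idx;
      return idx;                            // == count of trailing zeros
    }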