@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (169)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
The hunks below are from package/include/hwy/ops/x86_512-inl.h:

@@ -152,6 +152,9 @@ class Vec512 {
  HWY_INLINE Vec512& operator-=(const Vec512 other) {
  return *this = (*this - other);
  }
+ HWY_INLINE Vec512& operator%=(const Vec512 other) {
+ return *this = (*this % other);
+ }
  HWY_INLINE Vec512& operator&=(const Vec512 other) {
  return *this = (*this & other);
  }
@@ -190,6 +193,25 @@ HWY_INLINE __m512i BitCastToInteger(__m512d v) {
  return _mm512_castpd_si512(v);
  }

+ #if HWY_AVX3_HAVE_F32_TO_BF16C
+ HWY_INLINE __m512i BitCastToInteger(__m512bh v) {
+ // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
+ // bit cast a __m512bh to a __m512i as there is currently no intrinsic
+ // available (as of GCC 13 and Clang 17) that can bit cast a __m512bh vector
+ // to a __m512i vector
+
+ #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+ // On GCC or Clang, use reinterpret_cast to bit cast a __m512bh to a __m512i
+ return reinterpret_cast<__m512i>(v);
+ #else
+ // On MSVC, use BitCastScalar to bit cast a __m512bh to a __m512i as MSVC does
+ // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
+ // bit cast from one AVX vector type to a different AVX vector type
+ return BitCastScalar<__m512i>(v);
+ #endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+ }
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C
+
  template <typename T>
  HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
  return Vec512<uint8_t>{BitCastToInteger(v.raw)};
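For reference, a bit cast between same-sized trivially-copyable types can always be expressed via memcpy, which is presumably what BitCastScalar does under the hood. A minimal sketch (the helper name below is ours, not Highway's):

```cpp
// Minimal sketch: reinterpret the bytes of one trivially-copyable type as
// another without undefined behavior, unlike a union or pointer type pun.
#include <cstring>

template <class To, class From>
To BitCastSketch(const From& from) {
  static_assert(sizeof(To) == sizeof(From), "sizes must match");
  To to;
  std::memcpy(&to, &from, sizeof(To));  // well-defined for trivial types
  return to;
}
```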
@@ -373,6 +395,132 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  BitCast(Full256<uint8_t>(), v).raw)});
  }

+ // ------------------------------ Dup128VecFromValues
+
+ template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 64)>
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+ TFromD<D> t11, TFromD<D> t12,
+ TFromD<D> t13, TFromD<D> t14,
+ TFromD<D> t15) {
+ #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
+ // Missing set_epi8/16.
+ return BroadcastBlock<0>(ResizeBitCast(
+ d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2, t3, t4, t5, t6,
+ t7, t8, t9, t10, t11, t12, t13, t14, t15)));
+ #else
+ (void)d;
+ // Need to use _mm512_set_epi8 as there is no _mm512_setr_epi8 intrinsic
+ // available
+ return VFromD<D>{_mm512_set_epi8(
+ static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
+ static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
+ static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
+ static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
+ static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
+ static_cast<char>(t0), static_cast<char>(t15), static_cast<char>(t14),
+ static_cast<char>(t13), static_cast<char>(t12), static_cast<char>(t11),
+ static_cast<char>(t10), static_cast<char>(t9), static_cast<char>(t8),
+ static_cast<char>(t7), static_cast<char>(t6), static_cast<char>(t5),
+ static_cast<char>(t4), static_cast<char>(t3), static_cast<char>(t2),
+ static_cast<char>(t1), static_cast<char>(t0), static_cast<char>(t15),
+ static_cast<char>(t14), static_cast<char>(t13), static_cast<char>(t12),
+ static_cast<char>(t11), static_cast<char>(t10), static_cast<char>(t9),
+ static_cast<char>(t8), static_cast<char>(t7), static_cast<char>(t6),
+ static_cast<char>(t5), static_cast<char>(t4), static_cast<char>(t3),
+ static_cast<char>(t2), static_cast<char>(t1), static_cast<char>(t0),
+ static_cast<char>(t15), static_cast<char>(t14), static_cast<char>(t13),
+ static_cast<char>(t12), static_cast<char>(t11), static_cast<char>(t10),
+ static_cast<char>(t9), static_cast<char>(t8), static_cast<char>(t7),
+ static_cast<char>(t6), static_cast<char>(t5), static_cast<char>(t4),
+ static_cast<char>(t3), static_cast<char>(t2), static_cast<char>(t1),
+ static_cast<char>(t0))};
+ #endif
+ }
+
+ template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 64)>
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+ TFromD<D> t5, TFromD<D> t6,
+ TFromD<D> t7) {
+ #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
+ // Missing set_epi8/16.
+ return BroadcastBlock<0>(
+ ResizeBitCast(d, Dup128VecFromValues(Full128<TFromD<D>>(), t0, t1, t2, t3,
+ t4, t5, t6, t7)));
+ #else
+ (void)d;
+ // Need to use _mm512_set_epi16 as there is no _mm512_setr_epi16 intrinsic
+ // available
+ return VFromD<D>{
+ _mm512_set_epi16(static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+ static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+ static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+ static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+ static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+ static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+ static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+ static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+ static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+ static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+ static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+ static_cast<int16_t>(t1), static_cast<int16_t>(t0),
+ static_cast<int16_t>(t7), static_cast<int16_t>(t6),
+ static_cast<int16_t>(t5), static_cast<int16_t>(t4),
+ static_cast<int16_t>(t3), static_cast<int16_t>(t2),
+ static_cast<int16_t>(t1), static_cast<int16_t>(t0))};
+ #endif
+ }
+
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 64)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+ TFromD<D> t5, TFromD<D> t6,
+ TFromD<D> t7) {
+ return VFromD<D>{_mm512_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2,
+ t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5,
+ t6, t7, t0, t1, t2, t3, t4, t5, t6, t7)};
+ }
+ #endif
+
+ template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 64)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3) {
+ return VFromD<D>{
+ _mm512_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+ static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+ static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+ static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+ static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+ static_cast<int32_t>(t2), static_cast<int32_t>(t3),
+ static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+ static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+ }
+
+ template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 64)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3) {
+ return VFromD<D>{_mm512_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2,
+ t3, t0, t1, t2, t3)};
+ }
+
+ template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 64)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+ return VFromD<D>{
+ _mm512_setr_epi64(static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+ static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+ static_cast<int64_t>(t0), static_cast<int64_t>(t1),
+ static_cast<int64_t>(t0), static_cast<int64_t>(t1))};
+ }
+
+ template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 64)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+ return VFromD<D>{_mm512_setr_pd(t0, t1, t0, t1, t0, t1, t0, t1)};
+ }
+
  // ----------------------------- Iota

  namespace detail {
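For context, the new Dup128VecFromValues op fills every 128-bit block of the destination with the same lane values, which on a 512-bit target means the pattern appears four times. A hedged usage sketch (assumes the usual Highway setup boilerplate; not taken from the diff):

```cpp
// Usage sketch: broadcast the byte pattern 0..15 into each 128-bit block.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

hn::Vec<hn::ScalableTag<uint8_t>> BytePattern() {
  const hn::ScalableTag<uint8_t> d;
  // On a 512-bit target, bytes 0..15 are repeated in all four blocks.
  return hn::Dup128VecFromValues(d, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                 13, 14, 15);
}
```

Note the reversed t15..t0 argument order in the diff above: _mm512_set_epi8 takes the highest lane first, and no _mm512_setr_epi8 variant exists.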
@@ -480,7 +628,7 @@ HWY_INLINE VFromD<D> Iota0(D /*d*/) {

  template <class D, typename T2, HWY_IF_V_SIZE_D(D, 64)>
  HWY_API VFromD<D> Iota(D d, const T2 first) {
- return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
+ return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
  }

  // ================================================== LOGICAL
@@ -502,7 +650,8 @@ template <typename T>
  HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
  const DFromV<decltype(a)> d; // for float16_t
  const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, VFromD<decltype(du)>{_mm512_and_si512(a.raw, b.raw)});
+ return BitCast(d, VFromD<decltype(du)>{_mm512_and_si512(BitCast(du, a).raw,
+ BitCast(du, b).raw)});
  }

  HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
@@ -519,8 +668,8 @@ template <typename T>
  HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
  const DFromV<decltype(mask)> d; // for float16_t
  const RebindToUnsigned<decltype(d)> du;
- return BitCast(
- d, VFromD<decltype(du)>{_mm512_andnot_si512(not_mask.raw, mask.raw)});
+ return BitCast(d, VFromD<decltype(du)>{_mm512_andnot_si512(
+ BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
  }
  HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
  const Vec512<float> mask) {
@@ -537,7 +686,8 @@ template <typename T>
  HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
  const DFromV<decltype(a)> d; // for float16_t
  const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, VFromD<decltype(du)>{_mm512_or_si512(a.raw, b.raw)});
+ return BitCast(d, VFromD<decltype(du)>{_mm512_or_si512(BitCast(du, a).raw,
+ BitCast(du, b).raw)});
  }

  HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
@@ -553,7 +703,8 @@ template <typename T>
  HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
  const DFromV<decltype(a)> d; // for float16_t
  const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, VFromD<decltype(du)>{_mm512_xor_si512(a.raw, b.raw)});
+ return BitCast(d, VFromD<decltype(du)>{_mm512_xor_si512(BitCast(du, a).raw,
+ BitCast(du, b).raw)});
  }

  HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
@@ -566,45 +717,61 @@ HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
  // ------------------------------ Xor3
  template <typename T>
  HWY_API Vec512<T> Xor3(Vec512<T> x1, Vec512<T> x2, Vec512<T> x3) {
+ #if !HWY_IS_MSAN
  const DFromV<decltype(x1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m512i ret = _mm512_ternarylogic_epi64(
  BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
  return BitCast(d, VU{ret});
+ #else
+ return Xor(x1, Xor(x2, x3));
+ #endif
  }

  // ------------------------------ Or3
  template <typename T>
  HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) {
+ #if !HWY_IS_MSAN
  const DFromV<decltype(o1)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m512i ret = _mm512_ternarylogic_epi64(
  BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
  return BitCast(d, VU{ret});
+ #else
+ return Or(o1, Or(o2, o3));
+ #endif
  }

  // ------------------------------ OrAnd
  template <typename T>
  HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
+ #if !HWY_IS_MSAN
  const DFromV<decltype(o)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m512i ret = _mm512_ternarylogic_epi64(
  BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
+ #else
+ return Or(o, And(a1, a2));
+ #endif
  }

  // ------------------------------ IfVecThenElse
  template <typename T>
  HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
+ #if !HWY_IS_MSAN
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
  BitCast(du, yes).raw,
  BitCast(du, no).raw, 0xCA)});
+ #else
+ return IfThenElse(MaskFromVec(mask), yes, no);
+ #endif
  }

  // ------------------------------ Operator overloads (internal-only if float)
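The immediates 0x96, 0xFE, 0xF8 and 0xCA passed to _mm512_ternarylogic_epi64 above are the 8-bit truth tables of the desired three-input function: bit i of the immediate is f(a, b, c) evaluated with a, b, c taken from bits 2, 1, 0 of i. A standalone sketch (assuming C++17; not part of the diff) that derives and checks them:

```cpp
// Bit i of the ternarylogic immediate is f(bit2(i), bit1(i), bit0(i)).
#include <cstdint>

template <class F>
constexpr uint8_t TernLogImm(F f) {
  uint8_t imm = 0;
  for (int i = 0; i < 8; ++i) {
    imm = static_cast<uint8_t>(imm | (f((i >> 2) & 1, (i >> 1) & 1, i & 1) << i));
  }
  return imm;
}

static_assert(TernLogImm([](int a, int b, int c) { return a ^ b ^ c; }) == 0x96);
static_assert(TernLogImm([](int a, int b, int c) { return a | b | c; }) == 0xFE);
static_assert(TernLogImm([](int o, int a1, int a2) { return o | (a1 & a2); }) == 0xF8);
static_assert(TernLogImm([](int m, int yes, int no) { return m ? yes : no; }) == 0xCA);
```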
@@ -752,7 +919,7 @@ HWY_API MFromD<D> FirstN(D d, size_t n) {
  m.raw = static_cast<decltype(m.raw)>(_bzhi_u64(all, n));
  return m;
  #else
- return detail::FirstN<T>(n);
+ return detail::FirstN<TFromD<D>>(n);
  #endif // HWY_ARCH_X86_64
  }

@@ -790,7 +957,7 @@ HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<8> /* tag */,

  } // namespace detail

- template <typename T, HWY_IF_NOT_FLOAT(T)>
+ template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  HWY_API Vec512<T> IfThenElse(const Mask512<T> mask, const Vec512<T> yes,
  const Vec512<T> no) {
  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
@@ -840,7 +1007,7 @@ HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<8> /* tag */,

  } // namespace detail

- template <typename T, HWY_IF_NOT_FLOAT3264(T)>
+ template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  HWY_API Vec512<T> IfThenElseZero(const Mask512<T> mask, const Vec512<T> yes) {
  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
  }
@@ -878,7 +1045,7 @@ HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */,

  } // namespace detail

- template <typename T, HWY_IF_NOT_FLOAT3264(T)>
+ template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  HWY_API Vec512<T> IfThenZeroElse(const Mask512<T> mask, const Vec512<T> no) {
  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
  }
@@ -896,10 +1063,12 @@ HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
  return IfThenElse(MaskFromVec(v), yes, no);
  }

- template <typename T, HWY_IF_FLOAT(T)>
- HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
+ template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
+ HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
+ HWY_API Vec512<T> IfNegativeThenNegOrUndefIfZero(Vec512<T> mask, Vec512<T> v) {
  // AVX3 MaskFromVec only looks at the MSB
- return IfThenZeroElse(MaskFromVec(v), v);
+ const DFromV<decltype(v)> d;
+ return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
  }

  // ================================================== ARITHMETIC
@@ -1000,6 +1169,59 @@ HWY_API Vec512<uint64_t> SumsOf8AbsDiff(Vec512<uint8_t> a, Vec512<uint8_t> b) {
  return Vec512<uint64_t>{_mm512_sad_epu8(a.raw, b.raw)};
  }

+ // ------------------------------ SumsOf4
+ namespace detail {
+
+ HWY_INLINE Vec512<uint32_t> SumsOf4(hwy::UnsignedTag /*type_tag*/,
+ hwy::SizeTag<1> /*lane_size_tag*/,
+ Vec512<uint8_t> v) {
+ const DFromV<decltype(v)> d;
+
+ // _mm512_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
+ // zeroed out and the sums of the 4 consecutive lanes are already in the
+ // even uint16_t lanes of the _mm512_maskz_dbsad_epu8 result.
+ return Vec512<uint32_t>{_mm512_maskz_dbsad_epu8(
+ static_cast<__mmask32>(0x55555555), v.raw, Zero(d).raw, 0)};
+ }
+
+ // I8->I32 SumsOf4
+ // Generic for all vector lengths
+ template <class V>
+ HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
+ hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+ const DFromV<decltype(v)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ const RepartitionToWideX2<decltype(d)> di32;
+
+ // Adjust the values of v to be in the 0..255 range by adding 128 to each lane
+ // of v (which is the same as an bitwise XOR of each i8 lane by 128) and then
+ // bitcasting the Xor result to an u8 vector.
+ const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
+
+ // Need to add -512 to each i32 lane of the result of the
+ // SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj) operation to account
+ // for the adjustment made above.
+ return BitCast(di32, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v_adj)) +
+ Set(di32, int32_t{-512});
+ }
+
+ } // namespace detail
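The signed path works because XOR with the sign bit maps each i8 lane x to the u8 value x + 128; summing four biased lanes therefore overshoots the true sum by 4 × 128 = 512, which the final Set(di32, -512) removes. A scalar sketch of the same arithmetic (illustration only, not from the diff):

```cpp
// Verify the bias bookkeeping used by the signed SumsOf4 above.
#include <cstdint>

constexpr int32_t SumsOf4SignedViaUnsigned(int8_t a, int8_t b, int8_t c,
                                           int8_t d) {
  // x ^ 0x80 equals x + 128 when an i8 lane is reinterpreted as u8.
  const uint32_t biased = uint32_t(uint8_t(a ^ 0x80)) + uint8_t(b ^ 0x80) +
                          uint8_t(c ^ 0x80) + uint8_t(d ^ 0x80);
  return int32_t(biased) - 512;  // undo the 4 * 128 bias
}

static_assert(SumsOf4SignedViaUnsigned(-128, -1, 0, 127) == -2);  // true sum
```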
+
+ // ------------------------------ SumsOfShuffledQuadAbsDiff
+
+ #if HWY_TARGET <= HWY_AVX3
+ template <int kIdx3, int kIdx2, int kIdx1, int kIdx0>
+ static Vec512<uint16_t> SumsOfShuffledQuadAbsDiff(Vec512<uint8_t> a,
+ Vec512<uint8_t> b) {
+ static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+ static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+ static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+ static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+ return Vec512<uint16_t>{
+ _mm512_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
+ }
+ #endif
+
  // ------------------------------ SaturatedAdd

  // Returns a + b clamped to the destination range.
@@ -1075,27 +1297,6 @@ HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) {
  return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
  }

- // These aren't native instructions, they also involve AND with constant.
- #if HWY_HAVE_FLOAT16
- HWY_API Vec512<float16_t> Abs(const Vec512<float16_t> v) {
- return Vec512<float16_t>{_mm512_abs_ph(v.raw)};
- }
- #endif // HWY_HAVE_FLOAT16
-
- HWY_API Vec512<float> Abs(const Vec512<float> v) {
- return Vec512<float>{_mm512_abs_ps(v.raw)};
- }
- HWY_API Vec512<double> Abs(const Vec512<double> v) {
- // Workaround: _mm512_abs_pd expects __m512, so implement it ourselves.
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 803
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- return And(v, BitCast(d, Set(du, 0x7FFFFFFFFFFFFFFFULL)));
- #else
- return Vec512<double>{_mm512_abs_pd(v.raw)};
- #endif
- }
-
  // ------------------------------ ShiftLeft

  #if HWY_TARGET <= HWY_AVX3_DL
@@ -1245,14 +1446,45 @@ HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {

  // ------------------------------ RotateRight

- template <int kBits, typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
- HWY_API Vec512<T> RotateRight(const Vec512<T> v) {
- constexpr size_t kSizeInBits = sizeof(T) * 8;
- static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+ #if HWY_TARGET <= HWY_AVX3_DL
+ // U8 RotateRight is generic for all vector lengths on AVX3_DL
+ template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
+ HWY_API V RotateRight(V v) {
+ static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+
+ const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
- // AVX3 does not support 8/16-bit.
- return Or(ShiftRight<kBits>(v),
- ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
+
+ constexpr uint64_t kShrMatrix =
+ (0x0102040810204080ULL << kBits) &
+ (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
+ constexpr int kShlBits = (-kBits) & 7;
+ constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
+ (0x0101010101010101ULL * (0xFF >> kShlBits));
+ constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
+
+ return detail::GaloisAffine(v, Set(du64, kMatrix));
+ }
+ #else // HWY_TARGET > HWY_AVX3_DL
+ template <int kBits>
+ HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
+ static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+ if (kBits == 0) return v;
+ // AVX3 does not support 8-bit.
+ return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
+ }
+ #endif // HWY_TARGET <= HWY_AVX3_DL
+
+ template <int kBits>
+ HWY_API Vec512<uint16_t> RotateRight(const Vec512<uint16_t> v) {
+ static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
+ if (kBits == 0) return v;
+ #if HWY_TARGET <= HWY_AVX3_DL
+ return Vec512<uint16_t>{_mm512_shrdi_epi16(v.raw, v.raw, kBits)};
+ #else
+ // AVX3 does not support 16-bit.
+ return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
+ #endif
  }
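Both paths above compute the same lane-wise rotate: the AVX3_DL path encodes it as an 8×8 GF(2) bit matrix for GF2P8AFFINEQB (via detail::GaloisAffine), while the fallback composes two shifts. The operation being encoded, as a scalar reference (illustration only, not from the diff):

```cpp
// Scalar u8 rotate right; kBits == 0 returns early so shift counts stay valid.
#include <cstdint>

template <int kBits>
constexpr uint8_t RotateRight8(uint8_t x) {
  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
  if (kBits == 0) return x;
  return static_cast<uint8_t>((x >> kBits) | (x << (8 - kBits)));
}

static_assert(RotateRight8<3>(0b00000101) == 0b10100000);  // low bits wrap up
```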

  template <int kBits>
@@ -1269,6 +1501,34 @@ HWY_API Vec512<uint64_t> RotateRight(const Vec512<uint64_t> v) {
  return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
  }

+ // ------------------------------ Rol/Ror
+ #if HWY_TARGET <= HWY_AVX3_DL
+ template <class T, HWY_IF_UI16(T)>
+ HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
+ return Vec512<T>{_mm512_shrdv_epi16(a.raw, a.raw, b.raw)};
+ }
+ #endif // HWY_TARGET <= HWY_AVX3_DL
+
+ template <class T, HWY_IF_UI32(T)>
+ HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
+ return Vec512<T>{_mm512_rolv_epi32(a.raw, b.raw)};
+ }
+
+ template <class T, HWY_IF_UI32(T)>
+ HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
+ return Vec512<T>{_mm512_rorv_epi32(a.raw, b.raw)};
+ }
+
+ template <class T, HWY_IF_UI64(T)>
+ HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
+ return Vec512<T>{_mm512_rolv_epi64(a.raw, b.raw)};
+ }
+
+ template <class T, HWY_IF_UI64(T)>
+ HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
+ return Vec512<T>{_mm512_rorv_epi64(a.raw, b.raw)};
+ }
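The u16 Ror above exploits the funnel shift: _mm512_shrdv_epi16 shifts the concatenation of its two source vectors right, so feeding the same vector as both halves yields a rotate. Scalar sketch of that identity (illustration only, not from the diff):

```cpp
// shrd on the pair (x, x) by k bits is exactly a rotate right by k (mod 16).
#include <cstdint>

constexpr uint16_t Ror16(uint16_t x, int k) {
  const uint32_t both = (uint32_t{x} << 16) | x;  // concatenate x:x
  return static_cast<uint16_t>(both >> (k & 15));
}

static_assert(Ror16(0x8001, 1) == 0xC000);  // the low bit wraps to the top
```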
+
  // ------------------------------ ShiftLeftSame

  // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512
@@ -1643,6 +1903,322 @@ HWY_API Vec512<double> ApproximateReciprocal(Vec512<double> v) {
  return Vec512<double>{_mm512_rcp14_pd(v.raw)};
  }

+ // ------------------------------ MaskedMinOr
+
+ template <typename T, HWY_IF_U8(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
+ }
+ template <typename T, HWY_IF_I8(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U16(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
+ }
+ template <typename T, HWY_IF_I16(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U32(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
+ }
+ template <typename T, HWY_IF_I32(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U64(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
+ }
+ template <typename T, HWY_IF_I64(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_F32(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_F64(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ #if HWY_HAVE_FLOAT16
+ template <typename T, HWY_IF_F16(T)>
+ HWY_API Vec512<T> MaskedMinOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ // ------------------------------ MaskedMaxOr
+
+ template <typename T, HWY_IF_U8(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
+ }
+ template <typename T, HWY_IF_I8(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U16(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
+ }
+ template <typename T, HWY_IF_I16(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U32(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
+ }
+ template <typename T, HWY_IF_I32(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U64(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
+ }
+ template <typename T, HWY_IF_I64(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_F32(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_F64(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ #if HWY_HAVE_FLOAT16
+ template <typename T, HWY_IF_F16(T)>
+ HWY_API Vec512<T> MaskedMaxOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ // ------------------------------ MaskedAddOr
+
+ template <typename T, HWY_IF_UI8(T)>
+ HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_UI16(T)>
+ HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_UI32(T)>
+ HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_UI64(T)>
+ HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_F32(T)>
+ HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_F64(T)>
+ HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ #if HWY_HAVE_FLOAT16
+ template <typename T, HWY_IF_F16(T)>
+ HWY_API Vec512<T> MaskedAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ // ------------------------------ MaskedSubOr
+
+ template <typename T, HWY_IF_UI8(T)>
+ HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_UI16(T)>
+ HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_UI32(T)>
+ HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_UI64(T)>
+ HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_F32(T)>
+ HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_F64(T)>
+ HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ #if HWY_HAVE_FLOAT16
+ template <typename T, HWY_IF_F16(T)>
+ HWY_API Vec512<T> MaskedSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ // ------------------------------ MaskedMulOr
+
+ HWY_API Vec512<float> MaskedMulOr(Vec512<float> no, Mask512<float> m,
+ Vec512<float> a, Vec512<float> b) {
+ return Vec512<float>{_mm512_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ HWY_API Vec512<double> MaskedMulOr(Vec512<double> no, Mask512<double> m,
+ Vec512<double> a, Vec512<double> b) {
+ return Vec512<double>{_mm512_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ #if HWY_HAVE_FLOAT16
+ HWY_API Vec512<float16_t> MaskedMulOr(Vec512<float16_t> no,
+ Mask512<float16_t> m, Vec512<float16_t> a,
+ Vec512<float16_t> b) {
+ return Vec512<float16_t>{_mm512_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ // ------------------------------ MaskedDivOr
+
+ HWY_API Vec512<float> MaskedDivOr(Vec512<float> no, Mask512<float> m,
+ Vec512<float> a, Vec512<float> b) {
+ return Vec512<float>{_mm512_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ HWY_API Vec512<double> MaskedDivOr(Vec512<double> no, Mask512<double> m,
+ Vec512<double> a, Vec512<double> b) {
+ return Vec512<double>{_mm512_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ #if HWY_HAVE_FLOAT16
+ HWY_API Vec512<float16_t> MaskedDivOr(Vec512<float16_t> no,
+ Mask512<float16_t> m, Vec512<float16_t> a,
+ Vec512<float16_t> b) {
+ return Vec512<float16_t>{_mm512_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ // ------------------------------ MaskedSatAddOr
+
+ template <typename T, HWY_IF_I8(T)>
+ HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U8(T)>
+ HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_I16(T)>
+ HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U16(T)>
+ HWY_API Vec512<T> MaskedSatAddOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ // ------------------------------ MaskedSatSubOr
+
+ template <typename T, HWY_IF_I8(T)>
+ HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U8(T)>
+ HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_I16(T)>
+ HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
+ }
+
+ template <typename T, HWY_IF_U16(T)>
+ HWY_API Vec512<T> MaskedSatSubOr(Vec512<T> no, Mask512<T> m, Vec512<T> a,
+ Vec512<T> b) {
+ return Vec512<T>{_mm512_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
+ }
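All of these Masked*Or ops share one contract: lanes selected by m receive the arithmetic result, all other lanes receive `no`, fused into a single masked instruction instead of a compute-then-blend pair. A hedged usage sketch (the wrapper below is ours, not Highway's):

```cpp
// Add only in selected lanes; unselected lanes keep the fallback value.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

template <class D>
hn::VFromD<D> AddWhere(D /*d*/, hn::MFromD<D> m, hn::VFromD<D> no,
                       hn::VFromD<D> a, hn::VFromD<D> b) {
  // Equivalent to IfThenElse(m, Add(a, b), no), but a single op here.
  return hn::MaskedAddOr(no, m, a, b);
}
```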
+
  // ------------------------------ Floating-point multiply-add variants

  #if HWY_HAVE_FLOAT16
@@ -1709,6 +2285,23 @@ HWY_API Vec512<double> NegMulSub(Vec512<double> mul, Vec512<double> x,
  return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
  }

+ #if HWY_HAVE_FLOAT16
+ HWY_API Vec512<float16_t> MulAddSub(Vec512<float16_t> mul, Vec512<float16_t> x,
+ Vec512<float16_t> sub_or_add) {
+ return Vec512<float16_t>{_mm512_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ HWY_API Vec512<float> MulAddSub(Vec512<float> mul, Vec512<float> x,
+ Vec512<float> sub_or_add) {
+ return Vec512<float>{_mm512_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
+ }
+
+ HWY_API Vec512<double> MulAddSub(Vec512<double> mul, Vec512<double> x,
+ Vec512<double> sub_or_add) {
+ return Vec512<double>{_mm512_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
+ }
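MulAddSub computes mul * x - sub_or_add in even lanes and mul * x + sub_or_add in odd lanes (the fmaddsub pattern). Its classic use is interleaved complex multiplication; a hedged sketch under that assumption (not part of the diff):

```cpp
// Lanes hold (re0, im0, re1, im1, ...); compute the complex product a * b.
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

template <class V>
V ComplexMul(V a, V b) {
  const hn::DFromV<V> d;
  // Even lanes: a.re*b.re - a.im*b.im; odd lanes: a.re*b.im + a.im*b.re.
  return hn::MulAddSub(hn::DupEven(a), b,
                       hn::Mul(hn::DupOdd(a), hn::Reverse2(d, b)));
}
```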
+
  // ------------------------------ Floating-point square root

  // Full precision square root
@@ -1873,7 +2466,11 @@ HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
  #if HWY_HAVE_FLOAT16
  HWY_API Mask512<float16_t> operator==(Vec512<float16_t> a,
  Vec512<float16_t> b) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+ HWY_DIAGNOSTICS(pop)
  }
  #endif // HWY_HAVE_FLOAT16

@@ -1907,7 +2504,11 @@ HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
  #if HWY_HAVE_FLOAT16
  HWY_API Mask512<float16_t> operator!=(Vec512<float16_t> a,
  Vec512<float16_t> b) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+ HWY_DIAGNOSTICS(pop)
  }
  #endif // HWY_HAVE_FLOAT16

@@ -1949,7 +2550,11 @@ HWY_API Mask512<int64_t> operator>(Vec512<int64_t> a, Vec512<int64_t> b) {

  #if HWY_HAVE_FLOAT16
  HWY_API Mask512<float16_t> operator>(Vec512<float16_t> a, Vec512<float16_t> b) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+ HWY_DIAGNOSTICS(pop)
  }
  #endif // HWY_HAVE_FLOAT16

@@ -1965,7 +2570,11 @@ HWY_API Mask512<double> operator>(Vec512<double> a, Vec512<double> b) {
  #if HWY_HAVE_FLOAT16
  HWY_API Mask512<float16_t> operator>=(Vec512<float16_t> a,
  Vec512<float16_t> b) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+ HWY_DIAGNOSTICS(pop)
  }
  #endif // HWY_HAVE_FLOAT16

@@ -2328,11 +2937,63 @@ HWY_API Mask512<T> ExclusiveNeither(Mask512<T> a, Mask512<T> b) {
  return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
  }

+ template <class D, HWY_IF_LANES_D(D, 64)>
+ HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+ MFromD<Half<D>> lo) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const __mmask64 combined_mask = _mm512_kunpackd(
+ static_cast<__mmask64>(hi.raw), static_cast<__mmask64>(lo.raw));
+ #else
+ const __mmask64 combined_mask = static_cast<__mmask64>(
+ ((static_cast<uint64_t>(hi.raw) << 32) | (lo.raw & 0xFFFFFFFFULL)));
+ #endif
+
+ return MFromD<D>{combined_mask};
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 32)>
+ HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const auto shifted_mask = _kshiftri_mask64(static_cast<__mmask64>(m.raw), 32);
+ #else
+ const auto shifted_mask = static_cast<uint64_t>(m.raw) >> 32;
+ #endif
+
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 64)>
+ HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
+ using RawM = decltype(MFromD<D>().raw);
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return MFromD<D>{
+ static_cast<RawM>(_kshiftli_mask64(static_cast<__mmask64>(m.raw), 1))};
+ #else
+ return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) << 1)};
+ #endif
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 64)>
+ HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
+ using RawM = decltype(MFromD<D>().raw);
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return MFromD<D>{
+ static_cast<RawM>(_kshiftri_mask64(static_cast<__mmask64>(m.raw), 1))};
+ #else
+ return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) >> 1)};
+ #endif
+ }
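The portable branches above work because an AVX-512 mask register is just a bitfield with one bit per lane, so combining, halving and sliding masks are ordinary integer shifts. Scalar sketch (illustration only, not from the diff):

```cpp
// Two 32-lane masks concatenate into one 64-lane mask: hi fills bits 32..63.
#include <cstdint>

constexpr uint64_t CombineMaskBits(uint32_t hi, uint32_t lo) {
  return (uint64_t{hi} << 32) | lo;
}

static_assert(CombineMaskBits(0x1u, 0xFFFFFFFFu) == 0x1FFFFFFFFull);
static_assert((CombineMaskBits(0x1u, 0x0u) >> 32) == 0x1u);  // UpperHalfOfMask
```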
+
  // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)

  HWY_API Vec512<int8_t> BroadcastSignBit(Vec512<int8_t> v) {
+ #if HWY_TARGET <= HWY_AVX3_DL
+ const Repartition<uint64_t, DFromV<decltype(v)>> du64;
+ return detail::GaloisAffine(v, Set(du64, 0x8080808080808080ull));
+ #else
  const DFromV<decltype(v)> d;
  return VecFromMask(v < Zero(d));
+ #endif
  }

  HWY_API Vec512<int16_t> BroadcastSignBit(Vec512<int16_t> v) {
@@ -2344,7 +3005,7 @@ HWY_API Vec512<int32_t> BroadcastSignBit(Vec512<int32_t> v) {
  }

  HWY_API Vec512<int64_t> BroadcastSignBit(Vec512<int64_t> v) {
- return Vec512<int64_t>{_mm512_srai_epi64(v.raw, 63)};
+ return ShiftRight<63>(v);
  }

  // ------------------------------ Floating-point classification (Not)
@@ -2356,6 +3017,15 @@ HWY_API Mask512<float16_t> IsNaN(Vec512<float16_t> v) {
  v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
  }

+ HWY_API Mask512<float16_t> IsEitherNaN(Vec512<float16_t> a,
+ Vec512<float16_t> b) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
+ return Mask512<float16_t>{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+ HWY_DIAGNOSTICS(pop)
+ }
+
  HWY_API Mask512<float16_t> IsInf(Vec512<float16_t> v) {
  return Mask512<float16_t>{_mm512_fpclass_ph_mask(v.raw, 0x18)};
  }
@@ -2379,6 +3049,14 @@ HWY_API Mask512<double> IsNaN(Vec512<double> v) {
  v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
  }

+ HWY_API Mask512<float> IsEitherNaN(Vec512<float> a, Vec512<float> b) {
+ return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+ }
+
+ HWY_API Mask512<double> IsEitherNaN(Vec512<double> a, Vec512<double> b) {
+ return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
+ }
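_CMP_UNORD_Q is the quiet "unordered" predicate: the mask bit is set exactly when at least one of the two lane values is NaN. One plausible use (a hedged sketch; the wrapper is ours, not Highway's) is a NaN-propagating minimum, since x86's min/max otherwise pick an operand-order-dependent result for NaN inputs:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

template <class V>
V MinPropagateNaN(V a, V b) {
  const hn::DFromV<V> d;
  // Where either input is NaN, return NaN; otherwise the usual minimum.
  return hn::IfThenElse(hn::IsEitherNaN(a, b), hn::NaN(d), hn::Min(a, b));
}
```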
+
  HWY_API Mask512<float> IsInf(Vec512<float> v) {
  return Mask512<float>{_mm512_fpclass_ps_mask(
  v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
@@ -2410,16 +3088,13 @@ HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
  return VFromD<D>{_mm512_load_si512(aligned)};
  }
  // bfloat16_t is handled by x86_128-inl.h.
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
- HWY_API Vec512<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
  #if HWY_HAVE_FLOAT16
- (void)d;
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
+ HWY_API Vec512<float16_t> Load(D /* tag */,
+ const float16_t* HWY_RESTRICT aligned) {
  return Vec512<float16_t>{_mm512_load_ph(aligned)};
- #else
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
- #endif // HWY_HAVE_FLOAT16
  }
+ #endif // HWY_HAVE_FLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
  HWY_API Vec512<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
  return Vec512<float>{_mm512_load_ps(aligned)};
@@ -2435,16 +3110,12 @@ HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
  }

  // bfloat16_t is handled by x86_128-inl.h.
- template <class D, HWY_IF_V_SIZE_D(D, 64)>
- HWY_API Vec512<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
  #if HWY_HAVE_FLOAT16
- (void)d;
+ template <class D, HWY_IF_V_SIZE_D(D, 64)>
+ HWY_API Vec512<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
  return Vec512<float16_t>{_mm512_loadu_ph(p)};
- #else
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
- #endif // HWY_HAVE_FLOAT16
  }
+ #endif // HWY_HAVE_FLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 64)>
  HWY_API Vec512<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
  return Vec512<float>{_mm512_loadu_ps(p)};
@@ -2506,8 +3177,9 @@ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
  HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
  const TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du; // for float16_t
- return VFromD<D>{_mm512_mask_loadu_epi16(
- BitCast(du, v).raw, m.raw, reinterpret_cast<const uint16_t*>(p))};
+ return BitCast(
+ d, VFromD<decltype(du)>{_mm512_mask_loadu_epi16(
+ BitCast(du, v).raw, m.raw, reinterpret_cast<const uint16_t*>(p))});
  }

  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
@@ -2539,10 +3211,12 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, Mask512<double> m, D /* tag */,
  // Loads 128 bit and duplicates into both 128-bit halves. This avoids the
  // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
- HWY_API VFromD<D> LoadDup128(D /* tag */,
- const TFromD<D>* const HWY_RESTRICT p) {
+ HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
+ const RebindToUnsigned<decltype(d)> du;
  const Full128<TFromD<D>> d128;
- return VFromD<D>{_mm512_broadcast_i32x4(LoadU(d128, p).raw)};
+ const RebindToUnsigned<decltype(d128)> du128;
+ return BitCast(d, VFromD<decltype(du)>{_mm512_broadcast_i32x4(
+ BitCast(du128, LoadU(d128, p)).raw)});
  }
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> LoadDup128(D /* tag */, const float* HWY_RESTRICT p) {
@@ -2563,15 +3237,13 @@ HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
2563
3237
  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
2564
3238
  }
2565
3239
  // bfloat16_t is handled by x86_128-inl.h.
3240
+ #if HWY_HAVE_FLOAT16
2566
3241
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
2567
3242
  HWY_API void Store(Vec512<float16_t> v, D /* tag */,
2568
3243
  float16_t* HWY_RESTRICT aligned) {
2569
- #if HWY_HAVE_FLOAT16
2570
3244
  _mm512_store_ph(aligned, v.raw);
2571
- #else
2572
- _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
2573
- #endif
2574
3245
  }
3246
+ #endif
2575
3247
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
2576
3248
  HWY_API void Store(Vec512<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
2577
3249
  _mm512_store_ps(aligned, v.raw);
@@ -2586,15 +3258,13 @@ HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
2586
3258
  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
2587
3259
  }
2588
3260
  // bfloat16_t is handled by x86_128-inl.h.
3261
+ #if HWY_HAVE_FLOAT16
2589
3262
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
2590
3263
  HWY_API void StoreU(Vec512<float16_t> v, D /* tag */,
2591
3264
  float16_t* HWY_RESTRICT p) {
2592
- #if HWY_HAVE_FLOAT16
2593
3265
  _mm512_storeu_ph(p, v.raw);
2594
- #else
2595
- _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
2596
- #endif // HWY_HAVE_FLOAT16
2597
3266
  }
3267
+ #endif // HWY_HAVE_FLOAT16
2598
3268
 
2599
3269
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
2600
3270
  HWY_API void StoreU(Vec512<float> v, D /* tag */, float* HWY_RESTRICT p) {
@@ -2756,84 +3426,81 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D /* tag */,
2756
3426
  namespace detail {
2757
3427
 
2758
3428
  template <int kScale, typename T, HWY_IF_UI32(T)>
2759
- HWY_INLINE Vec512<T> NativeGather(const T* HWY_RESTRICT base,
2760
- Vec512<int32_t> index) {
2761
- return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, kScale)};
3429
+ HWY_INLINE Vec512<T> NativeGather512(const T* HWY_RESTRICT base,
3430
+ Vec512<int32_t> indices) {
3431
+ return Vec512<T>{_mm512_i32gather_epi32(indices.raw, base, kScale)};
2762
3432
  }
2763
3433
 
2764
3434
  template <int kScale, typename T, HWY_IF_UI64(T)>
2765
- HWY_INLINE Vec512<T> NativeGather(const T* HWY_RESTRICT base,
2766
- Vec512<int64_t> index) {
2767
- return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, kScale)};
3435
+ HWY_INLINE Vec512<T> NativeGather512(const T* HWY_RESTRICT base,
3436
+ Vec512<int64_t> indices) {
3437
+ return Vec512<T>{_mm512_i64gather_epi64(indices.raw, base, kScale)};
2768
3438
  }
2769
3439
 
2770
3440
  template <int kScale>
2771
- HWY_INLINE Vec512<float> NativeGather(const float* HWY_RESTRICT base,
2772
- Vec512<int32_t> index) {
2773
- return Vec512<float>{_mm512_i32gather_ps(index.raw, base, kScale)};
3441
+ HWY_INLINE Vec512<float> NativeGather512(const float* HWY_RESTRICT base,
3442
+ Vec512<int32_t> indices) {
3443
+ return Vec512<float>{_mm512_i32gather_ps(indices.raw, base, kScale)};
2774
3444
  }
2775
3445
 
2776
3446
  template <int kScale>
2777
- HWY_INLINE Vec512<double> NativeGather(const double* HWY_RESTRICT base,
2778
- Vec512<int64_t> index) {
2779
- return Vec512<double>{_mm512_i64gather_pd(index.raw, base, kScale)};
3447
+ HWY_INLINE Vec512<double> NativeGather512(const double* HWY_RESTRICT base,
3448
+ Vec512<int64_t> indices) {
3449
+ return Vec512<double>{_mm512_i64gather_pd(indices.raw, base, kScale)};
2780
3450
  }
2781
3451
 
2782
3452
  template <int kScale, typename T, HWY_IF_UI32(T)>
2783
- HWY_INLINE Vec512<T> NativeMaskedGather(Mask512<T> m,
2784
- const T* HWY_RESTRICT base,
2785
- Vec512<int32_t> index) {
2786
- const Full512<T> d;
3453
+ HWY_INLINE Vec512<T> NativeMaskedGatherOr512(Vec512<T> no, Mask512<T> m,
3454
+ const T* HWY_RESTRICT base,
3455
+ Vec512<int32_t> indices) {
2787
3456
  return Vec512<T>{
2788
- _mm512_mask_i32gather_epi32(Zero(d).raw, m.raw, index.raw, base, kScale)};
3457
+ _mm512_mask_i32gather_epi32(no.raw, m.raw, indices.raw, base, kScale)};
2789
3458
  }
2790
3459
 
2791
3460
  template <int kScale, typename T, HWY_IF_UI64(T)>
2792
- HWY_INLINE Vec512<T> NativeMaskedGather(Mask512<T> m,
2793
- const T* HWY_RESTRICT base,
2794
- Vec512<int64_t> index) {
2795
- const Full512<T> d;
3461
+ HWY_INLINE Vec512<T> NativeMaskedGatherOr512(Vec512<T> no, Mask512<T> m,
3462
+ const T* HWY_RESTRICT base,
3463
+ Vec512<int64_t> indices) {
2796
3464
  return Vec512<T>{
2797
- _mm512_mask_i64gather_epi64(Zero(d).raw, m.raw, index.raw, base, kScale)};
3465
+ _mm512_mask_i64gather_epi64(no.raw, m.raw, indices.raw, base, kScale)};
2798
3466
  }
2799
3467
 
2800
3468
  template <int kScale>
2801
- HWY_INLINE Vec512<float> NativeMaskedGather(Mask512<float> m,
2802
- const float* HWY_RESTRICT base,
2803
- Vec512<int32_t> index) {
2804
- const Full512<float> d;
3469
+ HWY_INLINE Vec512<float> NativeMaskedGatherOr512(Vec512<float> no,
3470
+ Mask512<float> m,
3471
+ const float* HWY_RESTRICT base,
3472
+ Vec512<int32_t> indices) {
2805
3473
  return Vec512<float>{
2806
- _mm512_mask_i32gather_ps(Zero(d).raw, m.raw, index.raw, base, kScale)};
3474
+ _mm512_mask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
2807
3475
  }
2808
3476
 
2809
3477
  template <int kScale>
2810
- HWY_INLINE Vec512<double> NativeMaskedGather(Mask512<double> m,
2811
- const double* HWY_RESTRICT base,
2812
- Vec512<int64_t> index) {
2813
- const Full512<double> d;
3478
+ HWY_INLINE Vec512<double> NativeMaskedGatherOr512(
3479
+ Vec512<double> no, Mask512<double> m, const double* HWY_RESTRICT base,
3480
+ Vec512<int64_t> indices) {
2814
3481
  return Vec512<double>{
2815
- _mm512_mask_i64gather_pd(Zero(d).raw, m.raw, index.raw, base, kScale)};
3482
+ _mm512_mask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
2816
3483
  }
2817
3484
  } // namespace detail
2818
3485
 
2819
- template <class D, HWY_IF_V_SIZE_D(D, 64), typename TI>
2820
- HWY_API VFromD<D> GatherOffset(D /* tag */, const TFromD<D>* HWY_RESTRICT base,
2821
- Vec512<TI> offset) {
2822
- static_assert(sizeof(TFromD<D>) == sizeof(TI), "Must match for portability");
2823
- return detail::NativeGather<1>(base, offset);
3486
+ template <class D, HWY_IF_V_SIZE_D(D, 64)>
3487
+ HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
3488
+ VFromD<RebindToSigned<D>> offsets) {
3489
+ return detail::NativeGather512<1>(base, offsets);
2824
3490
  }
2825
- template <class D, HWY_IF_V_SIZE_D(D, 64), typename TI>
2826
- HWY_API VFromD<D> GatherIndex(D /* tag */, const TFromD<D>* HWY_RESTRICT base,
2827
- Vec512<TI> index) {
2828
- static_assert(sizeof(TFromD<D>) == sizeof(TI), "Must match for portability");
2829
- return detail::NativeGather<sizeof(TFromD<D>)>(base, index);
3491
+
3492
+ template <class D, HWY_IF_V_SIZE_D(D, 64)>
3493
+ HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
3494
+ VFromD<RebindToSigned<D>> indices) {
3495
+ return detail::NativeGather512<sizeof(TFromD<D>)>(base, indices);
2830
3496
  }
2831
- template <class D, HWY_IF_V_SIZE_D(D, 64), typename TI>
2832
- HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D /* tag */,
2833
- const TFromD<D>* HWY_RESTRICT base,
2834
- Vec512<TI> index) {
2835
- static_assert(sizeof(TFromD<D>) == sizeof(TI), "Must match for portability");
2836
- return detail::NativeMaskedGather<sizeof(TFromD<D>)>(m, base, index);
3497
+
3498
+ template <class D, HWY_IF_V_SIZE_D(D, 64)>
3499
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
3500
+ const TFromD<D>* HWY_RESTRICT base,
3501
+ VFromD<RebindToSigned<D>> indices) {
3502
+ return detail::NativeMaskedGatherOr512<sizeof(TFromD<D>)>(no, m, base,
3503
+ indices);
2837
3504
  }
2838
3505
 
2839
3506
  HWY_DIAGNOSTICS(pop)
@@ -2878,7 +3545,7 @@ HWY_API Vec256<T> LowerHalf(Vec512<T> v) {
2878
3545
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
2879
3546
  HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
2880
3547
  const RebindToUnsigned<decltype(d)> du; // for float16_t
2881
- const Twice<decltype(d)> dut;
3548
+ const Twice<decltype(du)> dut;
2882
3549
  return BitCast(d, VFromD<decltype(du)>{
2883
3550
  _mm512_extracti32x8_epi32(BitCast(dut, v).raw, 1)});
2884
3551
  }
@@ -2920,7 +3587,11 @@ HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
2920
3587
  template <int kBlockIdx, class T, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
2921
3588
  HWY_API Vec128<T> ExtractBlock(Vec512<T> v) {
2922
3589
  static_assert(kBlockIdx <= 3, "Invalid block index");
2923
- return Vec128<T>{_mm512_extracti32x4_epi32(v.raw, kBlockIdx)};
3590
+ const DFromV<decltype(v)> d;
3591
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
3592
+ return BitCast(Full128<T>(),
3593
+ Vec128<MakeUnsigned<T>>{
3594
+ _mm512_extracti32x4_epi32(BitCast(du, v).raw, kBlockIdx)});
2924
3595
  }
2925
3596
 
2926
3597
  template <int kBlockIdx, hwy::EnableIf<(kBlockIdx > 1)>* = nullptr>
@@ -2955,8 +3626,13 @@ HWY_INLINE Vec512<T> InsertBlock(hwy::SizeTag<0> /* blk_idx_tag */, Vec512<T> v,
2955
3626
  template <size_t kBlockIdx, typename T>
2956
3627
  HWY_INLINE Vec512<T> InsertBlock(hwy::SizeTag<kBlockIdx> /* blk_idx_tag */,
2957
3628
  Vec512<T> v, Vec128<T> blk_to_insert) {
2958
- return Vec512<T>{_mm512_inserti32x4(v.raw, blk_to_insert.raw,
2959
- static_cast<int>(kBlockIdx & 3))};
3629
+ const DFromV<decltype(v)> d;
3630
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
3631
+ const Full128<MakeUnsigned<T>> du_blk_to_insert;
3632
+ return BitCast(
3633
+ d, VFromD<decltype(du)>{_mm512_inserti32x4(
3634
+ BitCast(du, v).raw, BitCast(du_blk_to_insert, blk_to_insert).raw,
3635
+ static_cast<int>(kBlockIdx & 3))});
2960
3636
  }
2961
3637
 
2962
3638
  template <size_t kBlockIdx, hwy::EnableIf<kBlockIdx != 0>* = nullptr>
@@ -2992,7 +3668,7 @@ HWY_API T GetLane(const Vec512<T> v) {
2992
3668
 
2993
3669
  // ------------------------------ ZeroExtendVector
2994
3670
 
2995
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_D(D)>
3671
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
2996
3672
  HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
2997
3673
  #if HWY_HAVE_ZEXT // See definition/comment in x86_256-inl.h.
2998
3674
  (void)d;
@@ -3042,11 +3718,13 @@ HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
3042
3718
  DTo d_to, DFrom d_from, VFromD<DFrom> v) {
3043
3719
  const Repartition<uint8_t, decltype(d_from)> du8_from;
3044
3720
  const auto vu8 = BitCast(du8_from, v);
3721
+ const RebindToUnsigned<decltype(d_to)> du_to;
3045
3722
  #if HWY_HAVE_ZEXT
3046
- (void)d_to;
3047
- return VFromD<DTo>{_mm512_zextsi128_si512(vu8.raw)};
3723
+ return BitCast(d_to,
3724
+ VFromD<decltype(du_to)>{_mm512_zextsi128_si512(vu8.raw)});
3048
3725
  #else
3049
- return VFromD<DTo>{_mm512_inserti32x4(Zero(d_to).raw, vu8.raw, 0)};
3726
+ return BitCast(d_to, VFromD<decltype(du_to)>{
3727
+ _mm512_inserti32x4(Zero(du_to).raw, vu8.raw, 0)});
3050
3728
  #endif
3051
3729
  }
3052
3730
 
@@ -3096,7 +3774,8 @@ HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
3096
3774
  const RebindToUnsigned<decltype(d)> du; // for float16_t
3097
3775
  const Half<decltype(du)> duh;
3098
3776
  const __m512i lo512 = ZeroExtendVector(du, BitCast(duh, lo)).raw;
3099
- return VFromD<D>{_mm512_inserti32x8(lo512, BitCast(duh, hi).raw, 1)};
3777
+ return BitCast(d, VFromD<decltype(du)>{
3778
+ _mm512_inserti32x8(lo512, BitCast(duh, hi).raw, 1)});
3100
3779
  }
3101
3780
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3102
3781
  HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
@@ -3181,7 +3860,11 @@ HWY_API Vec512<double> Broadcast(const Vec512<double> v) {
3181
3860
  template <int kBlockIdx, class T>
3182
3861
  HWY_API Vec512<T> BroadcastBlock(Vec512<T> v) {
3183
3862
  static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index");
3184
- return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, 0x55 * kBlockIdx)};
3863
+ const DFromV<decltype(v)> d;
3864
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
3865
+ return BitCast(
3866
+ d, VFromD<decltype(du)>{_mm512_shuffle_i32x4(
3867
+ BitCast(du, v).raw, BitCast(du, v).raw, 0x55 * kBlockIdx)});
3185
3868
  }
3186
3869
 
3187
3870
  template <int kBlockIdx>
@@ -3209,7 +3892,10 @@ HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3209
3892
  template <class T, HWY_IF_T_SIZE(T, 2)>
3210
3893
  HWY_INLINE Vec512<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
3211
3894
  Vec512<T> v) {
3212
- return Vec512<T>{_mm512_broadcastw_epi16(ResizeBitCast(Full128<T>(), v).raw)};
3895
+ const DFromV<decltype(v)> d;
3896
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
3897
+ return BitCast(d, VFromD<decltype(du)>{_mm512_broadcastw_epi16(
3898
+ ResizeBitCast(Full128<uint16_t>(), v).raw)});
3213
3899
  }
3214
3900
 
3215
3901
  template <class T, HWY_IF_UI32(T)>
@@ -3671,8 +4357,11 @@ HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
3671
4357
 
3672
4358
  // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
3673
4359
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3674
- HWY_API VFromD<D> ConcatLowerLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
3675
- return VFromD<D>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
4360
+ HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
4361
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4362
+ return BitCast(d,
4363
+ VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4364
+ BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BABA)});
3676
4365
  }
3677
4366
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3678
4367
  HWY_API VFromD<D> ConcatLowerLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3686,8 +4375,11 @@ HWY_API Vec512<double> ConcatLowerLower(D /* tag */, Vec512<double> hi,
3686
4375
 
3687
4376
  // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
3688
4377
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3689
- HWY_API VFromD<D> ConcatUpperUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
3690
- return VFromD<D>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
4378
+ HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4379
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4380
+ return BitCast(d,
4381
+ VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4382
+ BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_DCDC)});
3691
4383
  }
3692
4384
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3693
4385
  HWY_API VFromD<D> ConcatUpperUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3701,8 +4393,11 @@ HWY_API Vec512<double> ConcatUpperUpper(D /* tag */, Vec512<double> hi,
3701
4393
 
3702
4394
  // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
3703
4395
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3704
- HWY_API VFromD<D> ConcatLowerUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
3705
- return VFromD<D>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
4396
+ HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
4397
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4398
+ return BitCast(d,
4399
+ VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4400
+ BitCast(du, lo).raw, BitCast(du, hi).raw, _MM_PERM_BADC)});
3706
4401
  }
3707
4402
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3708
4403
  HWY_API VFromD<D> ConcatLowerUpper(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3716,11 +4411,13 @@ HWY_API Vec512<double> ConcatLowerUpper(D /* tag */, Vec512<double> hi,
3716
4411
 
3717
4412
  // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
3718
4413
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3719
- HWY_API VFromD<D> ConcatUpperLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
4414
+ HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
3720
4415
  // There are no imm8 blend in AVX512. Use blend16 because 32-bit masks
3721
4416
  // are efficiently loaded from 32-bit regs.
3722
4417
  const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
3723
- return VFromD<D>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)};
4418
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4419
+ return BitCast(d, VFromD<decltype(du)>{_mm512_mask_blend_epi16(
4420
+ mask, BitCast(du, hi).raw, BitCast(du, lo).raw)});
3724
4421
  }
3725
4422
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3726
4423
  HWY_API VFromD<D> ConcatUpperLower(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
@@ -3814,71 +4511,195 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
3814
4511
  const RebindToUnsigned<decltype(d)> du;
3815
4512
  #if HWY_TARGET <= HWY_AVX3_DL
3816
4513
  alignas(64) static constexpr uint8_t kIdx[64] = {
3817
- 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
3818
- 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
3819
- 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
3820
- 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
3821
- 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
3822
- return BitCast(
3823
- d, Vec512<uint32_t>{_mm512_permutex2var_epi8(
3824
- BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4514
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
4515
+ 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
4516
+ 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
4517
+ 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
4518
+ 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
4519
+ return BitCast(
4520
+ d, Vec512<uint32_t>{_mm512_permutex2var_epi8(
4521
+ BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4522
+ #else
4523
+ const RepartitionToWide<decltype(du)> dw;
4524
+ // Isolate lower 8 bits per u16 so we can pack.
4525
+ const Vec512<uint16_t> mask = Set(dw, 0x00FF);
4526
+ const Vec512<uint16_t> uH = And(BitCast(dw, hi), mask);
4527
+ const Vec512<uint16_t> uL = And(BitCast(dw, lo), mask);
4528
+ const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
4529
+ // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
4530
+ const Full512<uint64_t> du64;
4531
+ alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
4532
+ return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
4533
+ #endif
4534
+ }
4535
+
4536
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4537
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4538
+ const RebindToUnsigned<decltype(d)> du;
4539
+ alignas(64) static constexpr uint16_t kIdx[32] = {
4540
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
4541
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
4542
+ return BitCast(
4543
+ d, Vec512<uint32_t>{_mm512_permutex2var_epi16(
4544
+ BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4545
+ }
4546
+
4547
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4548
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4549
+ const RebindToUnsigned<decltype(d)> du;
4550
+ alignas(64) static constexpr uint32_t kIdx[16] = {
4551
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
4552
+ return BitCast(
4553
+ d, Vec512<uint32_t>{_mm512_permutex2var_epi32(
4554
+ BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4555
+ }
4556
+
4557
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4558
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4559
+ const RebindToUnsigned<decltype(d)> du;
4560
+ alignas(64) static constexpr uint32_t kIdx[16] = {
4561
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
4562
+ return VFromD<D>{_mm512_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
4563
+ }
4564
+
4565
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4566
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4567
+ const RebindToUnsigned<decltype(d)> du;
4568
+ alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
4569
+ return BitCast(
4570
+ d, Vec512<uint64_t>{_mm512_permutex2var_epi64(
4571
+ BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4572
+ }
4573
+
4574
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4575
+ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4576
+ const RebindToUnsigned<decltype(d)> du;
4577
+ alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
4578
+ return VFromD<D>{_mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
4579
+ }
4580
+
4581
+ // ------------------------------ InterleaveWholeLower
4582
+
4583
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4584
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4585
+ #if HWY_TARGET <= HWY_AVX3_DL
4586
+ const RebindToUnsigned<decltype(d)> du;
4587
+ alignas(64) static constexpr uint8_t kIdx[64] = {
4588
+ 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71,
4589
+ 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79,
4590
+ 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87,
4591
+ 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95};
4592
+ return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
4593
+ #else
4594
+ alignas(64) static constexpr uint64_t kIdx2[8] = {0, 1, 8, 9, 2, 3, 10, 11};
4595
+ const Repartition<uint64_t, decltype(d)> du64;
4596
+ return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
4597
+ Load(du64, kIdx2).raw,
4598
+ InterleaveUpper(d, a, b).raw)};
4599
+ #endif
4600
+ }
4601
+
4602
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
4603
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4604
+ const RebindToUnsigned<decltype(d)> du;
4605
+ alignas(64) static constexpr uint16_t kIdx[32] = {
4606
+ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
4607
+ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
4608
+ return BitCast(
4609
+ d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
4610
+ BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
4611
+ }
4612
+
4613
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
4614
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4615
+ const RebindToUnsigned<decltype(d)> du;
4616
+ alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
4617
+ 4, 20, 5, 21, 6, 22, 7, 23};
4618
+ return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
4619
+ }
4620
+
4621
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4622
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4623
+ const RebindToUnsigned<decltype(d)> du;
4624
+ alignas(64) static constexpr uint32_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
4625
+ 4, 20, 5, 21, 6, 22, 7, 23};
4626
+ return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
4627
+ }
4628
+
4629
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
4630
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4631
+ const RebindToUnsigned<decltype(d)> du;
4632
+ alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4633
+ return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
4634
+ }
4635
+
4636
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
4637
+ HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4638
+ const RebindToUnsigned<decltype(d)> du;
4639
+ alignas(64) static constexpr uint64_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4640
+ return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
4641
+ }
4642
+
4643
+ // ------------------------------ InterleaveWholeUpper
4644
+
4645
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 1)>
4646
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4647
+ #if HWY_TARGET <= HWY_AVX3_DL
4648
+ const RebindToUnsigned<decltype(d)> du;
4649
+ alignas(64) static constexpr uint8_t kIdx[64] = {
4650
+ 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103,
4651
+ 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111,
4652
+ 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119,
4653
+ 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127};
4654
+ return VFromD<D>{_mm512_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
3825
4655
  #else
3826
- const RepartitionToWide<decltype(du)> dw;
3827
- // Isolate lower 8 bits per u16 so we can pack.
3828
- const Vec512<uint16_t> mask = Set(dw, 0x00FF);
3829
- const Vec512<uint16_t> uH = And(BitCast(dw, hi), mask);
3830
- const Vec512<uint16_t> uL = And(BitCast(dw, lo), mask);
3831
- const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
3832
- // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
3833
- const Full512<uint64_t> du64;
3834
- alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
3835
- return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
4656
+ alignas(64) static constexpr uint64_t kIdx2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
4657
+ const Repartition<uint64_t, decltype(d)> du64;
4658
+ return VFromD<D>{_mm512_permutex2var_epi64(InterleaveLower(a, b).raw,
4659
+ Load(du64, kIdx2).raw,
4660
+ InterleaveUpper(d, a, b).raw)};
3836
4661
  #endif
3837
4662
  }
3838
4663
 
3839
4664
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_T_SIZE_D(D, 2)>
3840
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4665
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
3841
4666
  const RebindToUnsigned<decltype(d)> du;
3842
4667
  alignas(64) static constexpr uint16_t kIdx[32] = {
3843
- 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
3844
- 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
4668
+ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
4669
+ 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
3845
4670
  return BitCast(
3846
- d, Vec512<uint32_t>{_mm512_permutex2var_epi16(
3847
- BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4671
+ d, VFromD<decltype(du)>{_mm512_permutex2var_epi16(
4672
+ BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
3848
4673
  }
3849
4674
 
3850
4675
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI32_D(D)>
3851
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4676
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
3852
4677
  const RebindToUnsigned<decltype(d)> du;
3853
4678
  alignas(64) static constexpr uint32_t kIdx[16] = {
3854
- 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
3855
- return BitCast(
3856
- d, Vec512<uint32_t>{_mm512_permutex2var_epi32(
3857
- BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4679
+ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
4680
+ return VFromD<D>{_mm512_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
3858
4681
  }
3859
4682
 
3860
4683
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3861
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4684
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
3862
4685
  const RebindToUnsigned<decltype(d)> du;
3863
4686
  alignas(64) static constexpr uint32_t kIdx[16] = {
3864
- 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
3865
- return VFromD<D>{_mm512_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
4687
+ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
4688
+ return VFromD<D>{_mm512_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
3866
4689
  }
3867
4690
 
3868
4691
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_UI64_D(D)>
3869
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4692
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
3870
4693
  const RebindToUnsigned<decltype(d)> du;
3871
- alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3872
- return BitCast(
3873
- d, Vec512<uint64_t>{_mm512_permutex2var_epi64(
3874
- BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
4694
+ alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
4695
+ return VFromD<D>{_mm512_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
3875
4696
  }
3876
4697
 
3877
4698
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
3878
- HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
4699
+ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
3879
4700
  const RebindToUnsigned<decltype(d)> du;
3880
- alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3881
- return VFromD<D>{_mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
4701
+ alignas(64) static constexpr uint64_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
4702
+ return VFromD<D>{_mm512_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
3882
4703
  }
3883
4704
 
3884
4705
  // ------------------------------ DupEven (InterleaveLower)
@@ -3922,11 +4743,44 @@ HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {
3922
4743
  return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
3923
4744
  }
3924
4745
 
4746
+ // -------------------------- InterleaveEven
4747
+
4748
+ template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
4749
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
4750
+ return VFromD<D>{_mm512_mask_shuffle_epi32(
4751
+ a.raw, static_cast<__mmask16>(0xAAAA), b.raw,
4752
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
4753
+ }
4754
+ template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
4755
+ HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
4756
+ return VFromD<D>{_mm512_mask_shuffle_ps(a.raw, static_cast<__mmask16>(0xAAAA),
4757
+ b.raw, b.raw,
4758
+ _MM_SHUFFLE(2, 2, 0, 0))};
4759
+ }
4760
+ // -------------------------- InterleaveOdd
4761
+
4762
+ template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
4763
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
4764
+ return VFromD<D>{_mm512_mask_shuffle_epi32(
4765
+ b.raw, static_cast<__mmask16>(0x5555), a.raw,
4766
+ static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
4767
+ }
4768
+ template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
4769
+ HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
4770
+ return VFromD<D>{_mm512_mask_shuffle_ps(b.raw, static_cast<__mmask16>(0x5555),
4771
+ a.raw, a.raw,
4772
+ _MM_SHUFFLE(3, 3, 1, 1))};
4773
+ }
4774
+
3925
4775
  // ------------------------------ OddEvenBlocks
3926
4776
 
3927
4777
  template <typename T>
3928
4778
  HWY_API Vec512<T> OddEvenBlocks(Vec512<T> odd, Vec512<T> even) {
3929
- return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
4779
+ const DFromV<decltype(odd)> d;
4780
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4781
+ return BitCast(
4782
+ d, VFromD<decltype(du)>{_mm512_mask_blend_epi64(
4783
+ __mmask8{0x33u}, BitCast(du, odd).raw, BitCast(du, even).raw)});
3930
4784
  }
3931
4785
 
3932
4786
  HWY_API Vec512<float> OddEvenBlocks(Vec512<float> odd, Vec512<float> even) {
@@ -3943,7 +4797,11 @@ HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) {
3943
4797
 
3944
4798
  template <typename T>
3945
4799
  HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
3946
- return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
4800
+ const DFromV<decltype(v)> d;
4801
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4802
+ return BitCast(d,
4803
+ VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4804
+ BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_CDAB)});
3947
4805
  }
3948
4806
 
3949
4807
  HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) {
@@ -3957,8 +4815,11 @@ HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
3957
4815
  // ------------------------------ ReverseBlocks
3958
4816
 
3959
4817
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
3960
- HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
3961
- return VFromD<D>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
4818
+ HWY_API VFromD<D> ReverseBlocks(D d, VFromD<D> v) {
4819
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
4820
+ return BitCast(d,
4821
+ VFromD<decltype(du)>{_mm512_shuffle_i32x4(
4822
+ BitCast(du, v).raw, BitCast(du, v).raw, _MM_PERM_ABCD)});
3962
4823
  }
3963
4824
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
3964
4825
  HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
@@ -3974,7 +4835,10 @@ HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
3974
4835
  // Both full
3975
4836
  template <typename T, typename TI>
3976
4837
  HWY_API Vec512<TI> TableLookupBytes(Vec512<T> bytes, Vec512<TI> indices) {
3977
- return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, indices.raw)};
4838
+ const DFromV<decltype(indices)> d;
4839
+ return BitCast(d, Vec512<uint8_t>{_mm512_shuffle_epi8(
4840
+ BitCast(Full512<uint8_t>(), bytes).raw,
4841
+ BitCast(Full512<uint8_t>(), indices).raw)});
3978
4842
  }
3979
4843
 
3980
4844
  // Partial index vector
@@ -4632,6 +5496,15 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<float16_t> v) {
4632
5496
  #endif // HWY_HAVE_FLOAT16
4633
5497
  }
4634
5498
 
5499
+ #if HWY_HAVE_FLOAT16
5500
+
5501
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
5502
+ HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec128<float16_t> v) {
5503
+ return VFromD<D>{_mm512_cvtph_pd(v.raw)};
5504
+ }
5505
+
5506
+ #endif // HWY_HAVE_FLOAT16
5507
+
4635
5508
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
4636
5509
  HWY_API VFromD<D> PromoteTo(D df32, Vec256<bfloat16_t> v) {
4637
5510
  const Rebind<uint16_t, decltype(df32)> du16;
@@ -4655,19 +5528,76 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, Vec256<uint32_t> v) {
4655
5528
  }
4656
5529
 
4657
5530
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
4658
- HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
4659
- const Rebind<float, decltype(di64)> df32;
4660
- const RebindToFloat<decltype(di64)> df64;
4661
- const RebindToSigned<decltype(df32)> di32;
5531
+ HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
5532
+ #if HWY_COMPILER_GCC_ACTUAL
5533
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
5534
+ // within the range of an int64_t
5535
+
5536
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
5537
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
5538
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
5539
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
5540
+ return VFromD<D>{_mm512_setr_epi64(
5541
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
5542
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
5543
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
5544
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]),
5545
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[4]),
5546
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[5]),
5547
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[6]),
5548
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[7]))};
5549
+ }
5550
+ #endif
4662
5551
 
4663
- return detail::FixConversionOverflow(
4664
- di64, BitCast(df64, PromoteTo(di64, BitCast(di32, v))),
4665
- VFromD<D>{_mm512_cvttps_epi64(v.raw)});
5552
+ __m512i raw_result;
5553
+ __asm__("vcvttps2qq {%1, %0|%0, %1}"
5554
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
5555
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
5556
+ :);
5557
+ return VFromD<D>{raw_result};
5558
+ #else
5559
+ return VFromD<D>{_mm512_cvttps_epi64(v.raw)};
5560
+ #endif
4666
5561
  }
4667
5562
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
4668
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
4669
- return VFromD<D>{
4670
- _mm512_maskz_cvttps_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
5563
+ HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
5564
+ #if HWY_COMPILER_GCC_ACTUAL
5565
+ // Workaround for undefined behavior with GCC if any values of v[i] are not
5566
+ // within the range of an uint64_t
5567
+
5568
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
5569
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
5570
+ typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
5571
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
5572
+ return VFromD<D>{_mm512_setr_epi64(
5573
+ static_cast<int64_t>(
5574
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
5575
+ static_cast<int64_t>(
5576
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
5577
+ static_cast<int64_t>(
5578
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
5579
+ static_cast<int64_t>(
5580
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])),
5581
+ static_cast<int64_t>(
5582
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[4])),
5583
+ static_cast<int64_t>(
5584
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[5])),
5585
+ static_cast<int64_t>(
5586
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[6])),
5587
+ static_cast<int64_t>(
5588
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[7])))};
5589
+ }
5590
+ #endif
5591
+
5592
+ __m512i raw_result;
5593
+ __asm__("vcvttps2uqq {%1, %0|%0, %1}"
5594
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
5595
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
5596
+ :);
5597
+ return VFromD<D>{raw_result};
5598
+ #else
5599
+ return VFromD<D>{_mm512_cvttps_epu64(v.raw)};
5600
+ #endif
4671
5601
  }
4672
5602
 
4673
5603
  // ------------------------------ Demotions (full -> part w/ narrow lanes)
@@ -4709,8 +5639,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
4709
5639
  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
4710
5640
  const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
4711
5641
 
4712
- alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
4713
- const auto idx32 = LoadDup128(du32, kLanes);
5642
+ const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
4714
5643
  const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
4715
5644
  return LowerHalf(LowerHalf(fixed));
4716
5645
  }
@@ -4745,9 +5674,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int32_t> v) {
4745
5674
  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
4746
5675
  const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
4747
5676
 
4748
- alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
4749
- 0, 4, 8, 12, 0, 4, 8, 12};
4750
- const auto idx32 = LoadDup128(du32, kLanes);
5677
+ const VFromD<decltype(du32)> idx32 = Dup128VecFromValues(du32, 0, 4, 8, 12);
4751
5678
  const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
4752
5679
  return LowerHalf(LowerHalf(fixed));
4753
5680
  }
@@ -4779,32 +5706,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
4779
5706
 
4780
5707
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
4781
5708
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
4782
- const auto neg_mask = MaskFromVec(v);
4783
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
4784
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
4785
- #else
4786
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
4787
- #endif
5709
+ const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
4788
5710
  return VFromD<D>{_mm512_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
4789
5711
  }
4790
5712
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
4791
5713
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
4792
- const auto neg_mask = MaskFromVec(v);
4793
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
4794
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
4795
- #else
4796
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
4797
- #endif
5714
+ const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
4798
5715
  return VFromD<D>{_mm512_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
4799
5716
  }
4800
5717
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
4801
5718
  HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<int64_t> v) {
4802
- const auto neg_mask = MaskFromVec(v);
4803
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
4804
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
4805
- #else
4806
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
4807
- #endif
5719
+ const __mmask8 non_neg_mask = Not(MaskFromVec(v)).raw;
4808
5720
  return VFromD<D>{_mm512_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
4809
5721
  }
4810
5722
 
@@ -4822,32 +5734,55 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<uint64_t> v) {
4822
5734
  }
4823
5735
 
4824
5736
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
4825
- HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<float> v) {
5737
+ HWY_API VFromD<D> DemoteTo(D df16, Vec512<float> v) {
4826
5738
  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
4827
5739
  HWY_DIAGNOSTICS(push)
4828
5740
  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
4829
- return VFromD<D>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
5741
+ const RebindToUnsigned<decltype(df16)> du16;
5742
+ return BitCast(
5743
+ df16, VFromD<decltype(du16)>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
4830
5744
  HWY_DIAGNOSTICS(pop)
4831
5745
  }
4832
5746
 
5747
+ #if HWY_HAVE_FLOAT16
5748
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
5749
+ HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec512<double> v) {
5750
+ return VFromD<D>{_mm512_cvtpd_ph(v.raw)};
5751
+ }
5752
+ #endif // HWY_HAVE_FLOAT16
5753
+
5754
+ #if HWY_AVX3_HAVE_F32_TO_BF16C
4833
5755
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
4834
- HWY_API VFromD<D> DemoteTo(D dbf16, Vec512<float> v) {
4835
- // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
4836
- const Rebind<int32_t, decltype(dbf16)> di32;
4837
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4838
- const Rebind<uint16_t, decltype(dbf16)> du16;
4839
- const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4840
- return BitCast(dbf16, DemoteTo(du16, bits_in_32));
5756
+ HWY_API VFromD<D> DemoteTo(D /*dbf16*/, Vec512<float> v) {
5757
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
5758
+ // Inline assembly workaround for LLVM codegen bug
5759
+ __m256i raw_result;
5760
+ __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
5761
+ return VFromD<D>{raw_result};
5762
+ #else
5763
+ // The _mm512_cvtneps_pbh intrinsic returns a __m256bh vector that needs to be
5764
+ // bit casted to a __m256i vector
5765
+ return VFromD<D>{detail::BitCastToInteger(_mm512_cvtneps_pbh(v.raw))};
5766
+ #endif
4841
5767
  }
4842
5768
 
4843
5769
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_BF16_D(D)>
4844
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec512<float> a, Vec512<float> b) {
4845
- // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16.
4846
- const RebindToUnsigned<decltype(dbf16)> du16;
4847
- const Repartition<uint32_t, decltype(dbf16)> du32;
4848
- const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
4849
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
5770
+ HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec512<float> a,
5771
+ Vec512<float> b) {
5772
+ #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
5773
+ // Inline assembly workaround for LLVM codegen bug
5774
+ __m512i raw_result;
5775
+ __asm__("vcvtne2ps2bf16 %2, %1, %0"
5776
+ : "=v"(raw_result)
5777
+ : "v"(b.raw), "v"(a.raw));
5778
+ return VFromD<D>{raw_result};
5779
+ #else
5780
+ // The _mm512_cvtne2ps_pbh intrinsic returns a __m512bh vector that needs to
5781
+ // be bit casted to a __m512i vector
5782
+ return VFromD<D>{detail::BitCastToInteger(_mm512_cvtne2ps_pbh(b.raw, a.raw))};
5783
+ #endif
4850
5784
  }
5785
+ #endif // HWY_AVX3_HAVE_F32_TO_BF16C
4851
5786
 
4852
5787
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
4853
5788
  HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec512<int32_t> a,
@@ -4935,16 +5870,77 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
4935
5870
  }
4936
5871
 
4937
5872
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
4938
- HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
4939
- const Full512<double> d64;
4940
- const auto clamped = detail::ClampF64ToI32Max(d64, v);
4941
- return VFromD<D>{_mm512_cvttpd_epi32(clamped.raw)};
5873
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5874
+ #if HWY_COMPILER_GCC_ACTUAL
5875
+ // Workaround for undefined behavior in _mm512_cvttpd_epi32 with GCC if any
5876
+ // values of v[i] are not within the range of an int32_t
5877
+
5878
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
5879
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
5880
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
5881
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
5882
+ return VFromD<D>{_mm256_setr_epi32(
5883
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
5884
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
5885
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
5886
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
5887
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
5888
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
5889
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
5890
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]))};
5891
+ }
5892
+ #endif
5893
+
5894
+ __m256i raw_result;
5895
+ __asm__("vcvttpd2dq {%1, %0|%0, %1}"
5896
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
5897
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
5898
+ :);
5899
+ return VFromD<D>{raw_result};
5900
+ #else
5901
+ return VFromD<D>{_mm512_cvttpd_epi32(v.raw)};
5902
+ #endif
4942
5903
  }
4943
5904
 
4944
5905
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
4945
- HWY_API VFromD<D> DemoteTo(D /* tag */, Vec512<double> v) {
4946
- return VFromD<D>{
4947
- _mm512_maskz_cvttpd_epu32(_knot_mask8(MaskFromVec(v).raw), v.raw)};
5906
+ HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, Vec512<double> v) {
5907
+ #if HWY_COMPILER_GCC_ACTUAL
5908
+ // Workaround for undefined behavior in _mm512_cvttpd_epu32 with GCC if any
5909
+ // values of v[i] are not within the range of an uint32_t
5910
+
5911
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
5912
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
5913
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
5914
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
5915
+ return VFromD<D>{_mm256_setr_epi32(
5916
+ static_cast<int32_t>(
5917
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
5918
+ static_cast<int32_t>(
5919
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
5920
+ static_cast<int32_t>(
5921
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
5922
+ static_cast<int32_t>(
5923
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
5924
+ static_cast<int32_t>(
5925
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
5926
+ static_cast<int32_t>(
5927
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
5928
+ static_cast<int32_t>(
5929
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
5930
+ static_cast<int32_t>(
5931
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])))};
5932
+ }
5933
+ #endif
5934
+
5935
+ __m256i raw_result;
5936
+ __asm__("vcvttpd2udq {%1, %0|%0, %1}"
5937
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
5938
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
5939
+ :);
5940
+ return VFromD<D>{raw_result};
5941
+ #else
5942
+ return VFromD<D>{_mm512_cvttpd_epu32(v.raw)};
5943
+ #endif
4948
5944
  }
4949
5945
 
4950
5946
  template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
@@ -4962,13 +5958,12 @@ HWY_API Vec128<uint8_t> U8FromU32(const Vec512<uint32_t> v) {
4962
5958
  const DFromV<decltype(v)> d32;
4963
5959
  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
4964
5960
  // lowest 4 bytes.
4965
- alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
4966
- ~0u};
4967
- const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
5961
+ const VFromD<decltype(d32)> v8From32 =
5962
+ Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
5963
+ const auto quads = TableLookupBytes(v, v8From32);
4968
5964
  // Gather the lowest 4 bytes of 4 128-bit blocks.
4969
- alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
4970
- const Vec512<uint8_t> bytes{
4971
- _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
5965
+ const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
5966
+ const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
4972
5967
  return LowerHalf(LowerHalf(bytes));
4973
5968
  }
4974
5969
 
@@ -4979,10 +5974,9 @@ HWY_API VFromD<D> TruncateTo(D d, const Vec512<uint64_t> v) {
4979
5974
  #if HWY_TARGET <= HWY_AVX3_DL
4980
5975
  (void)d;
4981
5976
  const Full512<uint8_t> d8;
4982
- alignas(16) static constexpr uint8_t k8From64[16] = {
4983
- 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56};
4984
- const Vec512<uint8_t> bytes{
4985
- _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)};
5977
+ const VFromD<decltype(d8)> v8From64 = Dup128VecFromValues(
5978
+ d8, 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56);
5979
+ const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From64.raw, v.raw)};
4986
5980
  return LowerHalf(LowerHalf(LowerHalf(bytes)));
4987
5981
  #else
4988
5982
  const Full512<uint32_t> d32;
@@ -5018,21 +6012,19 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
5018
6012
  HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint32_t> v) {
5019
6013
  #if HWY_TARGET <= HWY_AVX3_DL
5020
6014
  const Full512<uint8_t> d8;
5021
- alignas(16) static constexpr uint8_t k8From32[16] = {
5022
- 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
5023
- const Vec512<uint8_t> bytes{
5024
- _mm512_permutexvar_epi8(LoadDup128(d8, k8From32).raw, v.raw)};
6015
+ const VFromD<decltype(d8)> v8From32 = Dup128VecFromValues(
6016
+ d8, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
6017
+ const Vec512<uint8_t> bytes{_mm512_permutexvar_epi8(v8From32.raw, v.raw)};
5025
6018
  #else
5026
6019
  const Full512<uint32_t> d32;
5027
6020
  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
5028
6021
  // lowest 4 bytes.
5029
- alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
5030
- ~0u};
5031
- const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
6022
+ const VFromD<decltype(d32)> v8From32 =
6023
+ Dup128VecFromValues(d32, 0x0C080400u, ~0u, ~0u, ~0u);
6024
+ const auto quads = TableLookupBytes(v, v8From32);
5032
6025
  // Gather the lowest 4 bytes of 4 128-bit blocks.
5033
- alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
5034
- const Vec512<uint8_t> bytes{
5035
- _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
6026
+ const VFromD<decltype(d32)> index32 = Dup128VecFromValues(d32, 0, 4, 8, 12);
6027
+ const Vec512<uint8_t> bytes{_mm512_permutexvar_epi32(index32.raw, quads.raw)};
5036
6028
  #endif
5037
6029
  return LowerHalf(LowerHalf(bytes));
5038
6030
  }
@@ -5061,9 +6053,9 @@ HWY_API VFromD<D> TruncateTo(D /* tag */, const Vec512<uint16_t> v) {
5061
6053
  _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
5062
6054
  #else
5063
6055
  const Full512<uint32_t> d32;
5064
- alignas(16) static constexpr uint32_t k16From32[4] = {
5065
- 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u};
5066
- const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32));
6056
+ const VFromD<decltype(d32)> v16From32 = Dup128VecFromValues(
6057
+ d32, 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u);
6058
+ const auto quads = TableLookupBytes(v, v16From32);
5067
6059
  alignas(64) static constexpr uint32_t kIndex32[16] = {
5068
6060
  0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
5069
  const Vec512<uint8_t> bytes{
@@ -5108,36 +6100,362 @@ HWY_API VFromD<D> ConvertTo(D /* tag*/, Vec512<uint64_t> v) {
  // Truncates (rounds toward zero).
  #if HWY_HAVE_FLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
- HWY_API VFromD<D> ConvertTo(D d, Vec512<float16_t> v) {
- return detail::FixConversionOverflow(d, v,
- VFromD<D>{_mm512_cvttph_epi16(v.raw)});
+ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float16_t> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm512_cvttph_epi16 with GCC if any
+ // values of v[i] are not within the range of an int16_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+ HWY_HAVE_SCALAR_F16_TYPE
+ if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
+ typedef hwy::float16_t::Native GccF16RawVectType
+ __attribute__((__vector_size__(64)));
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+ return VFromD<D>{
+ _mm512_set_epi16(detail::X86ConvertScalarFromFloat<int16_t>(raw_v[31]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[30]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[29]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[28]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[27]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[26]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[25]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[24]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[23]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[22]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[21]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[20]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[19]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[18]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[17]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[16]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[15]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[14]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[13]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[12]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[11]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[10]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[9]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[8]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
+ detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]))};
+ }
+ #endif
+
+ __m512i raw_result;
+ __asm__("vcvttph2w {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else
+ return VFromD<D>{_mm512_cvttph_epi16(v.raw)};
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
+ HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm512_cvttph_epu16 with GCC if any
+ // values of v[i] are not within the range of a uint16_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
+ HWY_HAVE_SCALAR_F16_TYPE
+ if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
+ typedef hwy::float16_t::Native GccF16RawVectType
+ __attribute__((__vector_size__(64)));
+ const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
+ return VFromD<D>{_mm512_set_epi16(
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[31])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[30])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[29])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[28])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[27])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[26])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[25])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[24])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[23])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[22])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[21])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[20])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[19])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[18])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[17])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[16])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[15])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[14])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[13])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[12])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[11])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[10])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[9])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[8])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1])),
+ static_cast<int16_t>(
+ detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0])))};
+ }
+ #endif
+
+ __m512i raw_result;
+ __asm__("vcvttph2uw {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else
+ return VFromD<D>{_mm512_cvttph_epu16(v.raw)};
+ #endif
  }
  #endif // HWY_HAVE_FLOAT16
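The ConvertInRangeTo overloads above define results only for inputs that fit the target integer type: in C++, converting an out-of-range float to an integer is undefined behavior, so GCC may constant-fold the cvtt intrinsics under that assumption; the inline-asm path prevents that. A minimal scalar sketch of a guarded truncating conversion, shown for float to int32_t (the clamping policy here is an illustrative assumption; the vcvtt instructions instead return a sentinel for invalid inputs):

    #include <cstdint>
    #include <limits>

    // Truncate toward zero; guard the range first so the cast itself is
    // always well defined. NaN maps to 0 here (an arbitrary choice).
    int32_t TruncateToI32(float f) {
      if (!(f == f)) return 0;                 // NaN
      if (f >= 2147483648.0f)                  // >= 2^31: too large
        return std::numeric_limits<int32_t>::max();
      if (f < -2147483648.0f)                  // < -2^31: too small
        return std::numeric_limits<int32_t>::min();
      return static_cast<int32_t>(f);          // in range: cast is defined
    }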
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
- HWY_API VFromD<D> ConvertTo(D d, Vec512<float> v) {
- return detail::FixConversionOverflow(d, v,
- VFromD<D>{_mm512_cvttps_epi32(v.raw)});
+ HWY_API VFromD<D> ConvertInRangeTo(D /*d*/, Vec512<float> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm512_cvttps_epi32 with GCC if any
+ // values of v[i] are not within the range of an int32_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+ typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+ return VFromD<D>{_mm512_setr_epi32(
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[4]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[5]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[6]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[7]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[8]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[9]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[10]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[11]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[12]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[13]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[14]),
+ detail::X86ConvertScalarFromFloat<int32_t>(raw_v[15]))};
+ }
+ #endif
+
+ __m512i raw_result;
+ __asm__("vcvttps2dq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else
+ return VFromD<D>{_mm512_cvttps_epi32(v.raw)};
+ #endif
  }
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
- HWY_API VFromD<D> ConvertTo(D di, Vec512<double> v) {
- return detail::FixConversionOverflow(di, v,
- VFromD<D>{_mm512_cvttpd_epi64(v.raw)});
+ HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, Vec512<double> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm512_cvttpd_epi64 with GCC if any
+ // values of v[i] are not within the range of an int64_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+ return VFromD<D>{_mm512_setr_epi64(
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[2]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[3]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[4]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[5]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[6]),
+ detail::X86ConvertScalarFromFloat<int64_t>(raw_v[7]))};
+ }
+ #endif
+
+ __m512i raw_result;
+ __asm__("vcvttpd2qq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<D>{raw_result};
+ #else
+ return VFromD<D>{_mm512_cvttpd_epi64(v.raw)};
+ #endif
  }
  template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U32_D(DU)>
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
- return VFromD<DU>{
- _mm512_maskz_cvttps_epu32(_knot_mask16(MaskFromVec(v).raw), v.raw)};
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm512_cvttps_epu32 with GCC if any
+ // values of v[i] are not within the range of a uint32_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
+ typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+ return VFromD<DU>{_mm512_setr_epi32(
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[4])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[5])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[6])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[7])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[8])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[9])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[10])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[11])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[12])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[13])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[14])),
+ static_cast<int32_t>(
+ detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[15])))};
+ }
+ #endif
+
+ __m512i raw_result;
+ __asm__("vcvttps2udq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<DU>{raw_result};
+ #else
+ return VFromD<DU>{_mm512_cvttps_epu32(v.raw)};
+ #endif
  }
  template <class DU, HWY_IF_V_SIZE_D(DU, 64), HWY_IF_U64_D(DU)>
- HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
- return VFromD<DU>{
- _mm512_maskz_cvttpd_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
+ HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm512_cvttpd_epu64 with GCC if any
+ // values of v[i] are not within the range of a uint64_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
+ typedef double GccF64RawVectType __attribute__((__vector_size__(64)));
+ const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
+ return VFromD<DU>{_mm512_setr_epi64(
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[2])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[3])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[4])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[5])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[6])),
+ static_cast<int64_t>(
+ detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[7])))};
+ }
+ #endif
+
+ __m512i raw_result;
+ __asm__("vcvttpd2uqq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<DU>{raw_result};
+ #else
+ return VFromD<DU>{_mm512_cvttpd_epu64(v.raw)};
+ #endif
  }

- HWY_API Vec512<int32_t> NearestInt(const Vec512<float> v) {
- const Full512<int32_t> di;
- return detail::FixConversionOverflow(
- di, v, Vec512<int32_t>{_mm512_cvtps_epi32(v.raw)});
+ template <class DI, HWY_IF_V_SIZE_D(DI, 64), HWY_IF_I32_D(DI)>
+ HWY_INLINE VFromD<DI> NearestIntInRange(DI, VFromD<RebindToFloat<DI>> v) {
+ #if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for undefined behavior in _mm512_cvtps_epi32 with GCC if any
+ // values of v[i] are not within the range of an int32_t
+
+ #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
+ if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
+ typedef float GccF32RawVectType __attribute__((__vector_size__(64)));
+ const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
+ return VFromD<DI>{
+ _mm512_setr_epi32(detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[3]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[4]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[5]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[6]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[7]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[8]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[9]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[10]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[11]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[12]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[13]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[14]),
+ detail::X86ScalarNearestInt<int32_t>(raw_v[15]))};
+ }
+ #endif
+
+ __m512i raw_result;
+ __asm__("vcvtps2dq {%1, %0|%0, %1}"
+ : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
+ : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
+ :);
+ return VFromD<DI>{raw_result};
+ #else
+ return VFromD<DI>{_mm512_cvtps_epi32(v.raw)};
+ #endif
  }
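NearestIntInRange likewise assumes in-range inputs; vcvtps2dq rounds with the current rounding mode, which is round-to-nearest-even by default. A scalar sketch of the same rounding behavior using std::lrintf (valid only for inputs representable as int32_t, as the "InRange" name requires):

    #include <cmath>
    #include <cstdint>

    // Round to nearest, ties to even (the default FP environment), like
    // vcvtps2dq: 2.5f -> 2, 3.5f -> 4, -2.5f -> -2.
    int32_t NearestI32(float f) {
      return static_cast<int32_t>(std::lrintf(f));
    }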
 
  // ================================================== CRYPTO
@@ -5198,14 +6516,14 @@ template <uint8_t kRcon>
  HWY_API Vec512<uint8_t> AESKeyGenAssist(Vec512<uint8_t> v) {
  const Full512<uint8_t> d;
  #if HWY_TARGET <= HWY_AVX3_DL
- alignas(16) static constexpr uint8_t kRconXorMask[16] = {
- 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
- alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
- 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
+ const VFromD<decltype(d)> rconXorMask = Dup128VecFromValues(
+ d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0);
+ const VFromD<decltype(d)> rotWordShuffle = Dup128VecFromValues(
+ d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12);
  const Repartition<uint32_t, decltype(d)> du32;
  const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
- const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask));
- return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle));
+ const auto sub_word_result = AESLastRound(w13, rconXorMask);
+ return TableLookupBytes(sub_word_result, rotWordShuffle);
  #else
  const Half<decltype(d)> d2;
  return Combine(d, AESKeyGenAssist<kRcon>(UpperHalf(d2, v)),
@@ -5253,6 +6571,29 @@ HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) {

  // ================================================== MISC

+ // ------------------------------ SumsOfAdjQuadAbsDiff (Broadcast,
+ // SumsOfShuffledQuadAbsDiff)
+
+ template <int kAOffset, int kBOffset>
+ static Vec512<uint16_t> SumsOfAdjQuadAbsDiff(Vec512<uint8_t> a,
+ Vec512<uint8_t> b) {
+ static_assert(0 <= kAOffset && kAOffset <= 1,
+ "kAOffset must be between 0 and 1");
+ static_assert(0 <= kBOffset && kBOffset <= 3,
+ "kBOffset must be between 0 and 3");
+
+ const DFromV<decltype(a)> d;
+ const RepartitionToWideX2<decltype(d)> du32;
+
+ // While AVX3 does not have a _mm512_mpsadbw_epu8 intrinsic, the
+ // SumsOfAdjQuadAbsDiff operation is implementable for 512-bit vectors on
+ // AVX3 using SumsOfShuffledQuadAbsDiff and U32 Broadcast.
+ return SumsOfShuffledQuadAbsDiff<kAOffset + 2, kAOffset + 1, kAOffset + 1,
+ kAOffset>(
+ a, BitCast(d, Broadcast<kBOffset>(BitCast(du32, b))));
+ }
+
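For reference, the per-block semantics of SumsOfAdjQuadAbsDiff follow the MPSADBW pattern: each u16 result lane is the sum of absolute differences between four consecutive bytes of a and a fixed four-byte group of b. A scalar model of one 128-bit block (an illustration of the documented semantics, not code from the package):

    #include <cstdint>
    #include <cstdlib>

    // r[i] = sum over j=0..3 of |a[kAOffset*4 + i + j] - b[kBOffset*4 + j]|
    template <int kAOffset, int kBOffset>  // kAOffset in [0,1], kBOffset in [0,3]
    void SumsOfAdjQuadAbsDiffBlock(const uint8_t a[16], const uint8_t b[16],
                                   uint16_t r[8]) {
      for (int i = 0; i < 8; ++i) {
        uint16_t sum = 0;
        for (int j = 0; j < 4; ++j) {
          sum += static_cast<uint16_t>(
              std::abs(int{a[kAOffset * 4 + i + j]} - int{b[kBOffset * 4 + j]}));
        }
        r[i] = sum;
      }
    }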
+ #if !HWY_IS_MSAN
  // ------------------------------ I32/I64 SaturatedAdd (MaskFromVec)

  HWY_API Vec512<int32_t> SaturatedAdd(Vec512<int32_t> a, Vec512<int32_t> b) {
@@ -5300,6 +6641,7 @@ HWY_API Vec512<int64_t> SaturatedSub(Vec512<int64_t> a, Vec512<int64_t> b) {
  i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
  return IfThenElse(overflow_mask, overflow_result, diff);
  }
+ #endif // !HWY_IS_MSAN
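The masked overflow handling above implements the usual signed-saturation rule; the vector code computes the same predicate with sign masks and vpternlog. A scalar sketch for I64 SaturatedSub:

    #include <cstdint>
    #include <limits>

    int64_t SaturatedSubI64(int64_t a, int64_t b) {
      // Wraparound subtraction is well defined on unsigned types.
      const int64_t diff = static_cast<int64_t>(
          static_cast<uint64_t>(a) - static_cast<uint64_t>(b));
      // Overflow iff a and b have different signs and diff's sign differs
      // from a's; saturate toward a's sign.
      if (((a ^ b) & (a ^ diff)) < 0) {
        return a < 0 ? std::numeric_limits<int64_t>::min()
                     : std::numeric_limits<int64_t>::max();
      }
      return diff;
    }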
 
  // ------------------------------ Mask testing

@@ -6165,7 +7507,10 @@ namespace detail {
  // Type-safe wrapper.
  template <_MM_PERM_ENUM kPerm, typename T>
  Vec512<T> Shuffle128(const Vec512<T> lo, const Vec512<T> hi) {
- return Vec512<T>{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)};
+ const DFromV<decltype(lo)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, VFromD<decltype(du)>{_mm512_shuffle_i64x2(
+ BitCast(du, lo).raw, BitCast(du, hi).raw, kPerm)});
  }
  template <_MM_PERM_ENUM kPerm>
  Vec512<float> Shuffle128(const Vec512<float> lo, const Vec512<float> hi) {
@@ -6345,7 +7690,7 @@ HWY_API Mask512<T> SetOnlyFirst(Mask512<T> mask) {
  static_cast<typename Mask512<T>::Raw>(detail::AVX3Blsi(mask.raw))};
  }

- // ------------------------------ Shl (LoadDup128)
+ // ------------------------------ Shl (Dup128VecFromValues)

  HWY_API Vec512<uint16_t> operator<<(Vec512<uint16_t> v, Vec512<uint16_t> bits) {
  return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
@@ -6356,13 +7701,15 @@ HWY_API Vec512<uint8_t> operator<<(Vec512<uint8_t> v, Vec512<uint8_t> bits) {
  const DFromV<decltype(v)> d;
  #if HWY_TARGET <= HWY_AVX3_DL
  // kMask[i] = 0xFF >> i
- alignas(16) static constexpr uint8_t kMasks[16] = {
- 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00};
+ const VFromD<decltype(d)> masks =
+ Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
+ 0, 0, 0, 0, 0, 0, 0);
  // kShl[i] = 1 << i
- alignas(16) static constexpr uint8_t kShl[16] = {0x01, 0x02, 0x04, 0x08,
- 0x10, 0x20, 0x40, 0x80};
- v = And(v, TableLookupBytes(LoadDup128(d, kMasks), bits));
- const VFromD<decltype(d)> mul = TableLookupBytes(LoadDup128(d, kShl), bits);
+ const VFromD<decltype(d)> shl =
+ Dup128VecFromValues(d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0,
+ 0, 0, 0, 0, 0, 0, 0);
+ v = And(v, TableLookupBytes(masks, bits));
+ const VFromD<decltype(d)> mul = TableLookupBytes(shl, bits);
  return VFromD<decltype(d)>{_mm512_gf2p8mul_epi8(v.raw, mul.raw)};
  #else
  const Repartition<uint16_t, decltype(d)> dw;
@@ -6457,64 +7804,6 @@ HWY_API Vec512<int64_t> operator>>(const Vec512<int64_t> v,
  return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
  }

- // ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
-
- HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
- const Vec512<uint64_t> b) {
- const DFromV<decltype(a)> du64;
- const RepartitionToNarrow<decltype(du64)> du32;
- const auto maskL = Set(du64, 0xFFFFFFFFULL);
- const auto a32 = BitCast(du32, a);
- const auto b32 = BitCast(du32, b);
- // Inputs for MulEven: we only need the lower 32 bits
- const auto aH = Shuffle2301(a32);
- const auto bH = Shuffle2301(b32);
-
- // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
- // the even (lower 64 bits of every 128-bit block) results. See
- // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
- const auto aLbL = MulEven(a32, b32);
- const auto w3 = aLbL & maskL;
-
- const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
- const auto w2 = t2 & maskL;
- const auto w1 = ShiftRight<32>(t2);
-
- const auto t = MulEven(a32, bH) + w2;
- const auto k = ShiftRight<32>(t);
-
- const auto mulH = MulEven(aH, bH) + w1 + k;
- const auto mulL = ShiftLeft<32>(t) + w3;
- return InterleaveLower(mulL, mulH);
- }
-
- HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
- const Vec512<uint64_t> b) {
- const DFromV<decltype(a)> du64;
- const RepartitionToNarrow<decltype(du64)> du32;
- const auto maskL = Set(du64, 0xFFFFFFFFULL);
- const auto a32 = BitCast(du32, a);
- const auto b32 = BitCast(du32, b);
- // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
- const auto aH = Shuffle2301(a32);
- const auto bH = Shuffle2301(b32);
-
- // Same as above, but we're using the odd results (upper 64 bits per block).
- const auto aLbL = MulEven(a32, b32);
- const auto w3 = aLbL & maskL;
-
- const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
- const auto w2 = t2 & maskL;
- const auto w1 = ShiftRight<32>(t2);
-
- const auto t = MulEven(a32, bH) + w2;
- const auto k = ShiftRight<32>(t);
-
- const auto mulH = MulEven(aH, bH) + w1 + k;
- const auto mulL = ShiftLeft<32>(t) + w3;
- return InterleaveUpper(du64, mulL, mulH);
- }
-
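The removed MulEven/MulOdd built a 64x64 -> 128-bit product from 32x32 -> 64-bit partial products, the Knuth/Hacker's Delight double-word scheme their comments cite. The same dataflow as a scalar sketch:

    #include <cstdint>

    // 64x64 -> 128-bit unsigned multiply from four 32x32 -> 64-bit products.
    void Mul64To128(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
      const uint64_t kMask = 0xFFFFFFFFu;
      const uint64_t aL = a & kMask, aH = a >> 32;
      const uint64_t bL = b & kMask, bH = b >> 32;
      const uint64_t aLbL = aL * bL;
      const uint64_t w3 = aLbL & kMask;
      const uint64_t t2 = aH * bL + (aLbL >> 32);  // cannot overflow 64 bits
      const uint64_t w2 = t2 & kMask;
      const uint64_t w1 = t2 >> 32;
      const uint64_t t = aL * bH + w2;
      const uint64_t k = t >> 32;
      *hi = aH * bH + w1 + k;
      *lo = (t << 32) + w3;
    }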
  // ------------------------------ WidenMulPairwiseAdd
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
  HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
@@ -6523,7 +7812,6 @@ HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec512<int16_t> a,
  }

  // ------------------------------ SatWidenMulPairwiseAdd
-
  template <class DI16, HWY_IF_V_SIZE_D(DI16, 64), HWY_IF_I16_D(DI16)>
  HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
  DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
@@ -6531,7 +7819,30 @@ HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
  return VFromD<DI16>{_mm512_maddubs_epi16(a.raw, b.raw)};
  }

+ // ------------------------------ SatWidenMulPairwiseAccumulate
+ #if HWY_TARGET <= HWY_AVX3_DL
+ template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 64)>
+ HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
+ DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
+ VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
+ return VFromD<DI32>{_mm512_dpwssds_epi32(sum.raw, a.raw, b.raw)};
+ }
+ #endif // HWY_TARGET <= HWY_AVX3_DL
+
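One i32 lane of SatWidenMulPairwiseAccumulate (vpdpwssds) as a scalar sketch: the two adjacent i16 products are summed exactly, and only the final accumulate saturates:

    #include <cstdint>
    #include <limits>

    int32_t DpwssdsLane(int16_t a0, int16_t a1, int16_t b0, int16_t b1,
                        int32_t sum) {
      // 64-bit holds the exact pair sum (at most 33 bits) plus the
      // accumulator; a single clamp then models the signed saturation.
      const int64_t t = int64_t{sum} + int64_t{a0} * b0 + int64_t{a1} * b1;
      if (t > std::numeric_limits<int32_t>::max())
        return std::numeric_limits<int32_t>::max();
      if (t < std::numeric_limits<int32_t>::min())
        return std::numeric_limits<int32_t>::min();
      return static_cast<int32_t>(t);
    }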
  // ------------------------------ ReorderWidenMulAccumulate
+
+ #if HWY_NATIVE_DOT_BF16
+ template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_D(DF, 64),
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+ HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
+ const VFromD<DF> sum0,
+ VFromD<DF>& /*sum1*/) {
+ return VFromD<DF>{_mm512_dpbf16_ps(sum0.raw,
+ reinterpret_cast<__m512bh>(a.raw),
+ reinterpret_cast<__m512bh>(b.raw))};
+ }
+ #endif // HWY_NATIVE_DOT_BF16
+
  template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
  HWY_API VFromD<D> ReorderWidenMulAccumulate(D d, Vec512<int16_t> a,
  Vec512<int16_t> b,
@@ -6570,161 +7881,47 @@ HWY_API VFromD<DI32> SumOfMulQuadAccumulate(

  // ------------------------------ Reductions

- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_epi32(v.raw);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_epi64(v.raw);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw));
- }
- #if HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_ph(v.raw);
- }
- #endif // HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_ps(v.raw);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
- HWY_API TFromD<D> ReduceSum(D, VFromD<D> v) {
- return _mm512_reduce_add_pd(v.raw);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
- HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto sum = ReduceSum(d32, even + odd);
- return static_cast<uint16_t>(sum);
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
- HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- // Sign-extend
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto sum = ReduceSum(d32, even + odd);
- return static_cast<int16_t>(sum);
- }
+ namespace detail {

- // Returns the sum in each lane.
- template <class D, HWY_IF_V_SIZE_D(D, 64)>
- HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
- return Set(d, ReduceSum(d, v));
+ // Used by generic_ops-inl
+ template <class D, class Func, HWY_IF_V_SIZE_D(D, 64)>
+ HWY_INLINE VFromD<D> ReduceAcrossBlocks(D d, Func f, VFromD<D> v) {
+ v = f(v, SwapAdjacentBlocks(v));
+ return f(v, ReverseBlocks(d, v));
  }

- // Returns the minimum in each lane.
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_epi32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_epi64(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_epu32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_epu64(v.raw));
- }
- #if HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_ph(v.raw));
- }
- #endif // HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_ps(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_min_pd(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MinOfLanes(d32, Min(even, odd));
- // Also broadcast into odd lanes.
- return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
- HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- // Sign-extend
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MinOfLanes(d32, Min(even, odd));
- // Also broadcast into odd lanes.
- return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
- }
+ } // namespace detail
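ReduceAcrossBlocks works by combining each 128-bit block first with its swap-adjacent partner, then with the block-reversed vector, after which every block holds the reduction over all four. Modeling the four blocks as four scalars:

    // After two combine steps, every slot holds f over all four inputs.
    // E.g. {1, 2, 3, 4} with f = + becomes {3, 3, 7, 7}, then {10, 10, 10, 10}.
    template <class Func>
    void ReduceAcross4(int blocks[4], Func f) {
      const int s[4] = {blocks[1], blocks[0], blocks[3], blocks[2]};  // swap
      for (int i = 0; i < 4; ++i) blocks[i] = f(blocks[i], s[i]);
      const int r[4] = {blocks[3], blocks[2], blocks[1], blocks[0]};  // reverse
      for (int i = 0; i < 4; ++i) blocks[i] = f(blocks[i], r[i]);
    }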
 
- // Returns the maximum in each lane.
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I32_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_epi32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I64_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_epi64(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U32_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_epu32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U64_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_epu64(v.raw));
- }
- #if HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F16_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_ph(v.raw));
- }
- #endif // HWY_HAVE_FLOAT16
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_ps(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- return Set(d, _mm512_reduce_max_pd(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_U16_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MaxOfLanes(d32, Max(even, odd));
- // Also broadcast into odd lanes.
- return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_I16_D(D)>
- HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
- const RepartitionToWide<decltype(d)> d32;
- // Sign-extend
- const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MaxOfLanes(d32, Max(even, odd));
- // Also broadcast into odd lanes.
- return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+ // ------------------------------ BitShuffle
+ #if HWY_TARGET <= HWY_AVX3_DL
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
+ HWY_IF_V_SIZE_V(V, 64), HWY_IF_V_SIZE_V(VI, 64)>
+ HWY_API V BitShuffle(V v, VI idx) {
+ const DFromV<decltype(v)> d64;
+ const RebindToUnsigned<decltype(d64)> du64;
+ const Rebind<uint8_t, decltype(d64)> du8;
+
+ const __mmask64 mmask64_bit_shuf_result =
+ _mm512_bitshuffle_epi64_mask(v.raw, idx.raw);
+
+ #if HWY_ARCH_X86_64
+ const VFromD<decltype(du8)> vu8_bit_shuf_result{
+ _mm_cvtsi64_si128(static_cast<int64_t>(mmask64_bit_shuf_result))};
+ #else
+ const int32_t i32_lo_bit_shuf_result =
+ static_cast<int32_t>(mmask64_bit_shuf_result);
+ const int32_t i32_hi_bit_shuf_result =
+ static_cast<int32_t>(_kshiftri_mask64(mmask64_bit_shuf_result, 32));
+
+ const VFromD<decltype(du8)> vu8_bit_shuf_result = ResizeBitCast(
+ du8, InterleaveLower(
+ Vec128<uint32_t>{_mm_cvtsi32_si128(i32_lo_bit_shuf_result)},
+ Vec128<uint32_t>{_mm_cvtsi32_si128(i32_hi_bit_shuf_result)}));
+ #endif
+
+ return BitCast(d64, PromoteTo(du64, vu8_bit_shuf_result));
  }
+ #endif // HWY_TARGET <= HWY_AVX3_DL
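The BitShuffle added above gathers, for each u64 lane, eight arbitrary bits selected by the low six bits of the corresponding index bytes; vpshufbitqmb produces them as a 64-bit mask, which the code then widens back into the u64 lanes. One lane as a scalar sketch:

    #include <cstdint>

    // Result bit b = bit (idx[b] & 63) of v; the upper 56 bits stay zero,
    // mirroring the mask -> u8 -> PromoteTo path above.
    uint64_t BitShuffleLane(uint64_t v, const uint8_t idx[8]) {
      uint64_t r = 0;
      for (int b = 0; b < 8; ++b) {
        r |= ((v >> (idx[b] & 63)) & 1u) << b;
      }
      return r;
    }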
 
  // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex