@img/sharp-libvips-dev 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/include/expat.h +21 -10
  2. package/include/expat_config.h +11 -5
  3. package/include/ffi.h +12 -25
  4. package/include/freetype2/freetype/config/ftoption.h +1 -1
  5. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  6. package/include/glib-2.0/gio/gapplication.h +6 -0
  7. package/include/glib-2.0/gio/giotypes.h +0 -1
  8. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  9. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  10. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  11. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  12. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  13. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  14. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  15. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  16. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  17. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  18. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  19. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  20. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  21. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  22. package/include/glib-2.0/girepository/girepository.h +53 -62
  23. package/include/glib-2.0/girepository/girffi.h +8 -7
  24. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  25. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  26. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  27. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  28. package/include/glib-2.0/girepository/gitypes.h +52 -104
  29. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  30. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  31. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  32. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  33. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  34. package/include/glib-2.0/glib/gbitlock.h +31 -0
  35. package/include/glib-2.0/glib/gmessages.h +8 -0
  36. package/include/glib-2.0/glib/gslice.h +2 -0
  37. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  38. package/include/glib-2.0/glib/gthread.h +191 -3
  39. package/include/glib-2.0/glib-unix.h +7 -1
  40. package/include/glib-2.0/gobject/genums.h +6 -6
  41. package/include/glib-2.0/gobject/glib-types.h +11 -0
  42. package/include/glib-2.0/gobject/gsignal.h +16 -6
  43. package/include/hwy/aligned_allocator.h +171 -6
  44. package/include/hwy/base.h +1765 -543
  45. package/include/hwy/cache_control.h +24 -6
  46. package/include/hwy/detect_compiler_arch.h +23 -2
  47. package/include/hwy/detect_targets.h +56 -13
  48. package/include/hwy/foreach_target.h +24 -0
  49. package/include/hwy/highway.h +20 -3
  50. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  51. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  52. package/include/hwy/ops/emu128-inl.h +271 -196
  53. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  54. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  55. package/include/hwy/ops/rvv-inl.h +1043 -311
  56. package/include/hwy/ops/scalar-inl.h +189 -159
  57. package/include/hwy/ops/set_macros-inl.h +66 -6
  58. package/include/hwy/ops/shared-inl.h +175 -56
  59. package/include/hwy/ops/wasm_128-inl.h +153 -136
  60. package/include/hwy/ops/x86_128-inl.h +1647 -646
  61. package/include/hwy/ops/x86_256-inl.h +1003 -370
  62. package/include/hwy/ops/x86_512-inl.h +948 -353
  63. package/include/hwy/per_target.h +4 -0
  64. package/include/hwy/profiler.h +648 -0
  65. package/include/hwy/robust_statistics.h +2 -2
  66. package/include/hwy/targets.h +18 -11
  67. package/include/hwy/timer.h +11 -0
  68. package/include/libpng16/png.h +32 -29
  69. package/include/libpng16/pngconf.h +2 -2
  70. package/include/libpng16/pnglibconf.h +7 -2
  71. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  72. package/include/libxml2/libxml/parser.h +16 -7
  73. package/include/libxml2/libxml/xmlIO.h +0 -1
  74. package/include/libxml2/libxml/xmlversion.h +4 -4
  75. package/include/pango-1.0/pango/pango-features.h +3 -3
  76. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  77. package/include/pixman-1/pixman-version.h +2 -2
  78. package/include/png.h +32 -29
  79. package/include/pngconf.h +2 -2
  80. package/include/pnglibconf.h +7 -2
  81. package/include/vips/connection.h +9 -3
  82. package/include/vips/util.h +0 -9
  83. package/include/vips/version.h +4 -4
  84. package/package.json +1 -1
  85. package/versions.json +11 -11
@@ -47,6 +47,13 @@ namespace hwy {
  namespace HWY_NAMESPACE {
  namespace detail {

+ // Enable generic functions for whichever of (f16, bf16) are not supported.
+ #if !HWY_HAVE_FLOAT16
+ #define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+ #else
+ #define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+ #endif
+
  template <typename T>
  struct Raw128 {
  using type = __m128i;
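The HWY_X86_IF_EMULATED_D guard introduced above drives most of the later changes: it matches exactly the 16-bit float lane types (f16 and/or bf16) that must be emulated via integer vectors on the current target. A minimal sketch of the dispatch pattern it enables, for illustration only (MyOp is a hypothetical name, not part of this package):

    // Hypothetical example, not part of the diff: one overload per lane-type family.
    template <class D, HWY_IF_F32_D(D)>
    VFromD<D> MyOp(D d) { /* native float path */ }

    // Emulated f16/bf16 path: reuse the u16 implementation via BitCast.
    template <class D, HWY_X86_IF_EMULATED_D(D)>
    VFromD<D> MyOp(D d) {
      const RebindToUnsigned<decltype(d)> du;
      return BitCast(d, MyOp(du));
    }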
@@ -90,6 +97,9 @@ class Vec128 {
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
  return *this = (*this - other);
  }
+ HWY_INLINE Vec128& operator%=(const Vec128 other) {
+ return *this = (*this % other);
+ }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
  return *this = (*this & other);
  }
@@ -194,18 +204,12 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
  }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D)>
- HWY_API Vec128<bfloat16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
- return Vec128<bfloat16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
- }
+ #if HWY_HAVE_FLOAT16
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
  HWY_API Vec128<float16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
- #if HWY_HAVE_FLOAT16
  return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_ph()};
- #else
- return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
- #endif
  }
+ #endif // HWY_HAVE_FLOAT16
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API Vec128<float, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<float, HWY_MAX_LANES_D(D)>{_mm_setzero_ps()};
@@ -214,6 +218,10 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  HWY_API Vec128<double, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<double, HWY_MAX_LANES_D(D)>{_mm_setzero_pd()};
  }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
+ HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
+ return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
+ }

  // Using the existing Zero function instead of a dedicated function for
  // deduction avoids having to forward-declare Vec256 here.
@@ -307,7 +315,7 @@ HWY_API VFromD<D> Set(D /* tag */, double t) {
  }

  // Generic for all vector lengths.
- template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
+ template <class D, HWY_X86_IF_EMULATED_D(D)>
  HWY_API VFromD<D> Set(D df, TFromD<D> t) {
  const RebindToUnsigned<decltype(df)> du;
  static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16");
@@ -328,18 +336,12 @@ HWY_API VFromD<D> Undefined(D /* tag */) {
  // generate an XOR instruction.
  return VFromD<D>{_mm_undefined_si128()};
  }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D)>
- HWY_API VFromD<D> Undefined(D /* tag */) {
- return VFromD<D>{_mm_undefined_si128()};
- }
+ #if HWY_HAVE_FLOAT16
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
  HWY_API VFromD<D> Undefined(D /* tag */) {
- #if HWY_HAVE_FLOAT16
  return VFromD<D>{_mm_undefined_ph()};
- #else
- return VFromD<D>{_mm_undefined_si128()};
- #endif
  }
+ #endif // HWY_HAVE_FLOAT16
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> Undefined(D /* tag */) {
  return VFromD<D>{_mm_undefined_ps()};
@@ -348,6 +350,10 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  HWY_API VFromD<D> Undefined(D /* tag */) {
  return VFromD<D>{_mm_undefined_pd()};
  }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
+ HWY_API VFromD<D> Undefined(D /* tag */) {
+ return VFromD<D>{_mm_undefined_si128()};
+ }

  HWY_DIAGNOSTICS(pop)

@@ -359,7 +365,11 @@ HWY_API T GetLane(const Vec128<T, N> v) {
  }
  template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
  HWY_API T GetLane(const Vec128<T, N> v) {
- return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
+ const DFromV<decltype(v)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ const uint16_t bits =
+ static_cast<uint16_t>(_mm_cvtsi128_si32(BitCast(du, v).raw) & 0xFFFF);
+ return BitCastScalar<T>(bits);
  }
  template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
  HWY_API T GetLane(const Vec128<T, N> v) {
@@ -394,6 +404,104 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)});
  }

+ // ------------------------------ Dup128VecFromValues
+
+ template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+ TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+ TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+ TFromD<D> t11, TFromD<D> t12,
+ TFromD<D> t13, TFromD<D> t14,
+ t15) {
+ return VFromD<D>{_mm_setr_epi8(
+ static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
+ static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
+ static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
+ static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
+ static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
+ static_cast<char>(t15))};
+ }
+
+ template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+ TFromD<D> t5, TFromD<D> t6,
+ TFromD<D> t7) {
+ return VFromD<D>{
+ _mm_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+ static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+ static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+ static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
+ }
+
+ // Generic for all vector lengths
+ template <class D, HWY_IF_BF16_D(D)>
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+ TFromD<D> t5, TFromD<D> t6,
+ TFromD<D> t7) {
+ const RebindToSigned<decltype(d)> di;
+ return BitCast(d,
+ Dup128VecFromValues(
+ di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
+ BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
+ BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
+ BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
+ }
+
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+ TFromD<D> t5, TFromD<D> t6,
+ TFromD<D> t7) {
+ return VFromD<D>{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)};
+ }
+ #else
+ // Generic for all vector lengths if HWY_HAVE_FLOAT16 is not true
+ template <class D, HWY_IF_F16_D(D)>
+ HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+ TFromD<D> t5, TFromD<D> t6,
+ TFromD<D> t7) {
+ const RebindToSigned<decltype(d)> di;
+ return BitCast(d,
+ Dup128VecFromValues(
+ di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
+ BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
+ BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
+ BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3) {
+ return VFromD<D>{
+ _mm_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+ static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+ }
+
+ template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+ TFromD<D> t2, TFromD<D> t3) {
+ return VFromD<D>{_mm_setr_ps(t0, t1, t2, t3)};
+ }
+
+ template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+ // Need to use _mm_set_epi64x as there is no _mm_setr_epi64x intrinsic
+ // available
+ return VFromD<D>{
+ _mm_set_epi64x(static_cast<int64_t>(t1), static_cast<int64_t>(t0))};
+ }
+
+ template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+ HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+ return VFromD<D>{_mm_setr_pd(t0, t1)};
+ }
+
  // ================================================== LOGICAL

  // ------------------------------ And
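Dup128VecFromValues, added above, builds a vector whose lowest 128-bit block holds the given lane values (wider vectors repeat that block); the ShuffleTwo* rewrites later in this file rely on it to replace per-type constant arrays. A hedged usage sketch, not part of the diff:

    // Illustration only: four u32 lanes set to 1, 2, 3, 4.
    namespace hn = hwy::HWY_NAMESPACE;
    const hn::Full128<uint32_t> d;
    const auto v = hn::Dup128VecFromValues(d, 1, 2, 3, 4);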
@@ -402,7 +510,8 @@ template <typename T, size_t N>
  HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d; // for float16_t
  const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, VFromD<decltype(du)>{_mm_and_si128(a.raw, b.raw)});
+ return BitCast(d, VFromD<decltype(du)>{
+ _mm_and_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
  }
  template <size_t N>
  HWY_API Vec128<float, N> And(Vec128<float, N> a, Vec128<float, N> b) {
@@ -420,8 +529,8 @@ template <typename T, size_t N>
  HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d; // for float16_t
  const RebindToUnsigned<decltype(d)> du;
- return BitCast(
- d, VFromD<decltype(du)>{_mm_andnot_si128(not_mask.raw, mask.raw)});
+ return BitCast(d, VFromD<decltype(du)>{_mm_andnot_si128(
+ BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
  }
  template <size_t N>
  HWY_API Vec128<float, N> AndNot(Vec128<float, N> not_mask,
@@ -440,7 +549,8 @@ template <typename T, size_t N>
  HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d; // for float16_t
  const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, VFromD<decltype(du)>{_mm_or_si128(a.raw, b.raw)});
+ return BitCast(d, VFromD<decltype(du)>{
+ _mm_or_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
  }

  template <size_t N>
@@ -458,7 +568,8 @@ template <typename T, size_t N>
  HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d; // for float16_t
  const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, VFromD<decltype(du)>{_mm_xor_si128(a.raw, b.raw)});
+ return BitCast(d, VFromD<decltype(du)>{
+ _mm_xor_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
  }

  template <size_t N>
@@ -651,8 +762,9 @@ HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) {
  }

  // ------------------------------ Floating-point Abs
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> Abs(const Vec128<T, N> v) {
+ // Generic for all vector lengths
+ template <class V, HWY_IF_FLOAT(TFromV<V>)>
+ HWY_API V Abs(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
@@ -691,14 +803,332 @@ HWY_API V CopySignToAbs(const V abs, const V sign) {
  // ================================================== MASK

  #if HWY_TARGET <= HWY_AVX3
+ // ------------------------------ MaskFromVec

- // ------------------------------ IfThenElse
+ namespace detail {
+
+ template <typename T, size_t N>
+ HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
+ const Vec128<T, N> v) {
+ return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
+ }
+ template <typename T, size_t N>
+ HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
+ const Vec128<T, N> v) {
+ return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
+ }
+ template <typename T, size_t N>
+ HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
+ const Vec128<T, N> v) {
+ return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
+ }
+ template <typename T, size_t N>
+ HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
+ const Vec128<T, N> v) {
+ return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
+ }
+
+ } // namespace detail
+
+ template <typename T, size_t N>
+ HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
+ return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
+ }
+ // There do not seem to be native floating-point versions of these instructions.
+ #if HWY_HAVE_FLOAT16
+ template <size_t N>
+ HWY_API Mask128<float16_t, N> MaskFromVec(const Vec128<float16_t, N> v) {
+ const RebindToSigned<DFromV<decltype(v)>> di;
+ return Mask128<float16_t, N>{MaskFromVec(BitCast(di, v)).raw};
+ }
+ #endif
+ template <size_t N>
+ HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
+ const RebindToSigned<DFromV<decltype(v)>> di;
+ return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
+ }
+ template <size_t N>
+ HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
+ const RebindToSigned<DFromV<decltype(v)>> di;
+ return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
+ }
+
+ template <class D>
+ using MFromD = decltype(MaskFromVec(VFromD<D>()));
+
+ // ------------------------------ MaskFalse (MFromD)
+
+ #ifdef HWY_NATIVE_MASK_FALSE
+ #undef HWY_NATIVE_MASK_FALSE
+ #else
+ #define HWY_NATIVE_MASK_FALSE
+ #endif
+
+ // Generic for all vector lengths
+ template <class D>
+ HWY_API MFromD<D> MaskFalse(D /*d*/) {
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)};
+ }
+
+ // ------------------------------ PromoteMaskTo (MFromD)
+
+ #ifdef HWY_NATIVE_PROMOTE_MASK_TO
+ #undef HWY_NATIVE_PROMOTE_MASK_TO
+ #else
+ #define HWY_NATIVE_PROMOTE_MASK_TO
+ #endif
+
+ // AVX3 PromoteMaskTo is generic for all vector lengths
+ template <class DTo, class DFrom,
+ HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
+ class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
+ hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
+ HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+ MFromD<DFrom> m) {
+ return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
+ }
+
+ // ------------------------------ DemoteMaskTo (MFromD)
+
+ #ifdef HWY_NATIVE_DEMOTE_MASK_TO
+ #undef HWY_NATIVE_DEMOTE_MASK_TO
+ #else
+ #define HWY_NATIVE_DEMOTE_MASK_TO
+ #endif
+
+ // AVX3 DemoteMaskTo is generic for all vector lengths
+ template <class DTo, class DFrom,
+ HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
+ class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
+ hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
+ HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+ MFromD<DFrom> m) {
+ return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
+ }
+
+ // ------------------------------ CombineMasks (MFromD)
+
+ #ifdef HWY_NATIVE_COMBINE_MASKS
+ #undef HWY_NATIVE_COMBINE_MASKS
+ #else
+ #define HWY_NATIVE_COMBINE_MASKS
+ #endif
+
+ template <class D, HWY_IF_LANES_D(D, 2)>
+ HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+ MFromD<Half<D>> lo) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const __mmask8 combined_mask = _kor_mask8(
+ _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 1),
+ _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(1)));
+ #else
+ const auto combined_mask =
+ (static_cast<unsigned>(hi.raw) << 1) | (lo.raw & 1);
+ #endif
+
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 4)>
+ HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+ MFromD<Half<D>> lo) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const __mmask8 combined_mask = _kor_mask8(
+ _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 2),
+ _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(3)));
+ #else
+ const auto combined_mask =
+ (static_cast<unsigned>(hi.raw) << 2) | (lo.raw & 3);
+ #endif
+
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 8)>
+ HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+ MFromD<Half<D>> lo) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const __mmask8 combined_mask = _kor_mask8(
+ _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 4),
+ _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(15)));
+ #else
+ const auto combined_mask =
+ (static_cast<unsigned>(hi.raw) << 4) | (lo.raw & 15u);
+ #endif
+
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 16)>
+ HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+ MFromD<Half<D>> lo) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const __mmask16 combined_mask = _mm512_kunpackb(
+ static_cast<__mmask16>(hi.raw), static_cast<__mmask16>(lo.raw));
+ #else
+ const auto combined_mask =
+ ((static_cast<unsigned>(hi.raw) << 8) | (lo.raw & 0xFFu));
+ #endif
+
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+ }
+
+ // ------------------------------ LowerHalfOfMask (MFromD)
+
+ #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
+ #undef HWY_NATIVE_LOWER_HALF_OF_MASK
+ #else
+ #define HWY_NATIVE_LOWER_HALF_OF_MASK
+ #endif
+
+ // Generic for all vector lengths
+ template <class D>
+ HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
+ using RawM = decltype(MFromD<D>().raw);
+ constexpr size_t kN = MaxLanes(d);
+ constexpr size_t kNumOfBitsInRawMask = sizeof(RawM) * 8;
+
+ MFromD<D> result_mask{static_cast<RawM>(m.raw)};
+
+ if (kN < kNumOfBitsInRawMask) {
+ result_mask =
+ And(result_mask, MFromD<D>{static_cast<RawM>((1ULL << kN) - 1)});
+ }
+
+ return result_mask;
+ }
+
+ // ------------------------------ UpperHalfOfMask (MFromD)
+
+ #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
+ #undef HWY_NATIVE_UPPER_HALF_OF_MASK
+ #else
+ #define HWY_NATIVE_UPPER_HALF_OF_MASK
+ #endif
+
+ template <class D, HWY_IF_LANES_D(D, 1)>
+ HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 1);
+ #else
+ const auto shifted_mask = static_cast<unsigned>(m.raw) >> 1;
+ #endif
+
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 2)>
+ HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 2);
+ #else
+ const auto shifted_mask = static_cast<unsigned>(m.raw) >> 2;
+ #endif
+
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 4)>
+ HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 4);
+ #else
+ const auto shifted_mask = static_cast<unsigned>(m.raw) >> 4;
+ #endif
+
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 8)>
+ HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ const auto shifted_mask = _kshiftri_mask16(static_cast<__mmask16>(m.raw), 8);
+ #else
+ const auto shifted_mask = static_cast<unsigned>(m.raw) >> 8;
+ #endif
+
+ return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+ }
+
+ // ------------------------------ OrderedDemote2MasksTo (MFromD, CombineMasks)
+
+ #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+ #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+ #else
+ #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+ #endif
+
+ // Generic for all vector lengths
+ template <class DTo, class DFrom,
+ HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
+ class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
+ hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
+ HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
+ MFromD<DFrom> a, MFromD<DFrom> b) {
+ using MH = MFromD<Half<DTo>>;
+ using RawMH = decltype(MH().raw);
+
+ return CombineMasks(d_to, MH{static_cast<RawMH>(b.raw)},
+ MH{static_cast<RawMH>(a.raw)});
+ }
+
+ // ------------------------------ VecFromMask
+
+ template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
+ HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
+ return Vec128<T, N>{_mm_movm_epi8(v.raw)};
+ }
+
+ template <typename T, size_t N, HWY_IF_UI16(T)>
+ HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
+ return Vec128<T, N>{_mm_movm_epi16(v.raw)};
+ }
+
+ template <typename T, size_t N, HWY_IF_UI32(T)>
+ HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
+ return Vec128<T, N>{_mm_movm_epi32(v.raw)};
+ }

- // Returns mask ? b : a.
+ template <typename T, size_t N, HWY_IF_UI64(T)>
+ HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
+ return Vec128<T, N>{_mm_movm_epi64(v.raw)};
+ }
+
+ #if HWY_HAVE_FLOAT16
+ template <size_t N>
+ HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) {
+ return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))};
+ }
+ #endif // HWY_HAVE_FLOAT16
+
+ template <size_t N>
+ HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
+ return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
+ }
+
+ template <size_t N>
+ HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
+ return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
+ }
+
+ // Generic for all vector lengths.
+ template <class D>
+ HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
+ return VecFromMask(v);
+ }
+
+ // ------------------------------ RebindMask (MaskFromVec)
+
+ template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)>
+ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
+ static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
+ return MFromD<DTo>{m.raw};
+ }
+
+ // ------------------------------ IfThenElse

  namespace detail {

- // Templates for signed/unsigned integer of a particular size.
  template <typename T, size_t N>
  HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
  Mask128<T, N> mask, Vec128<T, N> yes,
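On HWY_AVX3 targets these masks are bitfields with one bit per lane, so the CombineMasks/UpperHalfOfMask fallbacks above reduce to shifts, ORs, and ANDs. A scalar model of the same bit arithmetic, for illustration only (not part of the diff):

    // Two 4-lane halves packed into one 8-lane mask, and back.
    unsigned CombineMasks8(unsigned hi, unsigned lo) { return (hi << 4) | (lo & 15u); }
    unsigned UpperHalfOfMask8(unsigned m) { return m >> 4; }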
@@ -726,7 +1156,7 @@ HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,

  } // namespace detail

- template <typename T, size_t N>
+ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
  Vec128<T, N> no) {
  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
@@ -741,6 +1171,14 @@ HWY_API Vec128<float16_t, N> IfThenElse(Mask128<float16_t, N> mask,
  }
  #endif // HWY_HAVE_FLOAT16

+ // Generic for all vector lengths.
+ template <class V, class D = DFromV<V>, HWY_X86_IF_EMULATED_D(D)>
+ HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
+ const RebindToUnsigned<D> du;
+ return BitCast(
+ D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
+ }
+
  template <size_t N>
  HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
  Vec128<float, N> yes, Vec128<float, N> no) {
@@ -779,7 +1217,7 @@ HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,

  } // namespace detail

- template <typename T, size_t N>
+ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
  }
@@ -796,6 +1234,13 @@ HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
  return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
  }

+ // Generic for all vector lengths.
+ template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
+ HWY_API V IfThenElseZero(MFromD<D> mask, V yes) {
+ const RebindToUnsigned<D> du;
+ return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
+ }
+
  namespace detail {

  template <typename T, size_t N>
@@ -822,7 +1267,7 @@ HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,

  } // namespace detail

- template <typename T, size_t N>
+ template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
  }
@@ -839,6 +1284,13 @@ HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
  return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
  }

+ // Generic for all vector lengths.
+ template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
+ HWY_API V IfThenZeroElse(MFromD<D> mask, V no) {
+ const RebindToUnsigned<D> du;
+ return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
+ }
+
  // ------------------------------ Mask logical

  // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
@@ -1042,6 +1494,68 @@ HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
  #endif
  }

+ // UnmaskedNot returns ~m.raw without zeroing out any invalid bits
+ template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
+ HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask128<T, N>{static_cast<__mmask16>(_knot_mask16(m.raw))};
+ #else
+ return Mask128<T, N>{static_cast<__mmask16>(~m.raw)};
+ #endif
+ }
+
+ template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
+ HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
+ #if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask128<T, N>{static_cast<__mmask8>(_knot_mask8(m.raw))};
+ #else
+ return Mask128<T, N>{static_cast<__mmask8>(~m.raw)};
+ #endif
+ }
+
+ template <typename T>
+ HWY_INLINE Mask128<T> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
+ // sizeof(T) == 1 and N == 16: simply return ~m as all 16 bits of m are valid
+ return UnmaskedNot(m);
+ }
+ template <typename T, size_t N, HWY_IF_LANES_LE(N, 8)>
+ HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> m) {
+ // sizeof(T) == 1 and N <= 8: need to zero out the upper bits of ~m as there
+ // are fewer than 16 valid bits in m
+
+ // Return (~m) & ((1ull << N) - 1)
+ return AndNot(hwy::SizeTag<1>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
+ }
+ template <typename T>
+ HWY_INLINE Mask128<T> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
+ // sizeof(T) == 2 and N == 8: simply return ~m as all 8 bits of m are valid
+ return UnmaskedNot(m);
+ }
+ template <typename T, size_t N, HWY_IF_LANES_LE(N, 4)>
+ HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> m) {
+ // sizeof(T) == 2 and N <= 4: need to zero out the upper bits of ~m as there
+ // are fewer than 8 valid bits in m
+
+ // Return (~m) & ((1ull << N) - 1)
+ return AndNot(hwy::SizeTag<2>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
+ }
+ template <typename T, size_t N>
+ HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> m) {
+ // sizeof(T) == 4: need to zero out the upper bits of ~m as there are at most
+ // 4 valid bits in m
+
+ // Return (~m) & ((1ull << N) - 1)
+ return AndNot(hwy::SizeTag<4>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
+ }
+ template <typename T, size_t N>
+ HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> m) {
+ // sizeof(T) == 8: need to zero out the upper bits of ~m as there are at most
+ // 2 valid bits in m
+
+ // Return (~m) & ((1ull << N) - 1)
+ return AndNot(hwy::SizeTag<8>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
+ }
+
  } // namespace detail

  template <typename T, size_t N>
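The detail::Not overloads above flip only the valid lane bits: a plain ~m would also set the unused upper bits of the mask register, so every partial-vector case ANDs with (1ull << N) - 1. A worked example, for illustration only (not part of the diff):

    // N = 4 valid lanes: m = 0b0101 -> Not(m) = 0b1010, upper bits stay clear.
    unsigned m = 0x5u;
    unsigned not_m = ~m & ((1u << 4) - 1);  // == 0xAu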
@@ -1066,9 +1580,8 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {

  template <typename T, size_t N>
  HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
- // Flip only the valid bits.
- // TODO(janwas): use _knot intrinsics if N >= 8.
- return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
+ // Flip only the valid bits
+ return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
  }

  template <typename T, size_t N>
@@ -1309,20 +1822,17 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
  return VFromD<D>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
  }
- // Generic for all vector lengths greater than or equal to 16 bytes.
- template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_BF16_D(D)>
- HWY_API VFromD<D> Load(D d, const bfloat16_t* HWY_RESTRICT aligned) {
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
- HWY_API Vec128<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
  #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+ HWY_API Vec128<float16_t> Load(D, const float16_t* HWY_RESTRICT aligned) {
  return Vec128<float16_t>{_mm_load_ph(aligned)};
- #else
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
+ }
  #endif // HWY_HAVE_FLOAT16
+ // Generic for all vector lengths greater than or equal to 16 bytes.
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
+ HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, Load(du, detail::U16LanePointer(aligned)));
  }
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API Vec128<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
@@ -1337,21 +1847,17 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
  return VFromD<D>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
  }
- // Generic for all vector lengths greater than or equal to 16 bytes.
- template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_BF16_D(D)>
- HWY_API VFromD<D> LoadU(D d, const bfloat16_t* HWY_RESTRICT p) {
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
- HWY_API Vec128<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
  #if HWY_HAVE_FLOAT16
- (void)d;
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+ HWY_API Vec128<float16_t> LoadU(D, const float16_t* HWY_RESTRICT p) {
  return Vec128<float16_t>{_mm_loadu_ph(p)};
- #else
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
+ }
  #endif // HWY_HAVE_FLOAT16
+ // Generic for all vector lengths greater than or equal to 16 bytes.
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
+ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
  }
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
@@ -1445,21 +1951,17 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
  _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
  }
- // Generic for all vector lengths greater than or equal to 16 bytes.
- template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_BF16_D(D)>
- HWY_API void Store(VFromD<D> v, D d, bfloat16_t* HWY_RESTRICT aligned) {
- const RebindToUnsigned<decltype(d)> du;
- Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
- HWY_API void Store(Vec128<float16_t> v, D d, float16_t* HWY_RESTRICT aligned) {
  #if HWY_HAVE_FLOAT16
- (void)d;
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+ HWY_API void Store(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT aligned) {
  _mm_store_ph(aligned, v.raw);
- #else
+ }
+ #endif // HWY_HAVE_FLOAT16
+ // Generic for all vector lengths greater than or equal to 16 bytes.
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
+ HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  const RebindToUnsigned<decltype(d)> du;
  Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
- #endif // HWY_HAVE_FLOAT16
  }
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API void Store(Vec128<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
@@ -1475,21 +1977,17 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
  HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
  _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
  }
- // Generic for all vector lengths greater than or equal to 16 bytes.
- template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_BF16_D(D)>
- HWY_API void StoreU(VFromD<D> v, D d, bfloat16_t* HWY_RESTRICT p) {
- const RebindToUnsigned<decltype(d)> du;
- StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
- HWY_API void StoreU(Vec128<float16_t> v, D d, float16_t* HWY_RESTRICT p) {
  #if HWY_HAVE_FLOAT16
- (void)d;
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+ HWY_API void StoreU(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT p) {
  _mm_storeu_ph(p, v.raw);
- #else
+ }
+ #endif // HWY_HAVE_FLOAT16
+ // Generic for all vector lengths greater than or equal to 16 bytes.
+ template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
- #endif // HWY_HAVE_FLOAT16
  }
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API void StoreU(Vec128<float> v, D /* tag */, float* HWY_RESTRICT p) {
@@ -1553,20 +2051,24 @@ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  template <typename T, size_t N, typename TI, size_t NI>
  HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
  const Vec128<TI, NI> from) {
+ const DFromV<decltype(from)> d;
+ const Repartition<uint8_t, decltype(d)> du8;
+
+ const DFromV<decltype(bytes)> d_bytes;
+ const Repartition<uint8_t, decltype(d_bytes)> du8_bytes;
  #if HWY_TARGET == HWY_SSE2
  #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
  typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16)));
+ (void)d;
+ (void)du8;
+ (void)d_bytes;
+ (void)du8_bytes;
  return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
  __builtin_shuffle(reinterpret_cast<GccU8RawVectType>(bytes.raw),
  reinterpret_cast<GccU8RawVectType>(from.raw)))};
  #else
- const DFromV<decltype(from)> d;
- const Repartition<uint8_t, decltype(d)> du8;
  const Full128<uint8_t> du8_full;

- const DFromV<decltype(bytes)> d_bytes;
- const Repartition<uint8_t, decltype(d_bytes)> du8_bytes;
-
  alignas(16) uint8_t result_bytes[16];
  alignas(16) uint8_t u8_bytes[16];
  alignas(16) uint8_t from_bytes[16];
@@ -1581,7 +2083,9 @@ HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
  return BitCast(d, VFromD<decltype(du8)>{Load(du8_full, result_bytes).raw});
  #endif
  #else // SSSE3 or newer
- return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
+ return BitCast(
+ d, VFromD<decltype(du8)>{_mm_shuffle_epi8(BitCast(du8_bytes, bytes).raw,
+ BitCast(du8, from).raw)});
  #endif
  }

@@ -1636,8 +2140,11 @@ HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) {
  _mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
  return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled)));
  #else
- alignas(16) const T kShuffle[8] = {1, 0, 7, 6};
- return Vec32<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
+ const RebindToUnsigned<decltype(d2)> d2_u;
+ const auto shuffle_idx =
+ BitCast(d2, Dup128VecFromValues(d2_u, 1, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0));
+ return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
  #endif
  }
  template <typename T, HWY_IF_T_SIZE(T, 2)>
@@ -1651,8 +2158,11 @@ HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) {
  return Vec64<T>{
  _mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))};
  #else
- alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c};
- return Vec64<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
+ const RebindToUnsigned<decltype(d2)> d2_u;
+ const auto shuffle_idx = BitCast(
+ d2,
+ Dup128VecFromValues(d2_u, 0x0302, 0x0100, 0x0f0e, 0x0d0c, 0, 0, 0, 0));
+ return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
  #endif
  }
  template <typename T, HWY_IF_T_SIZE(T, 4)>
@@ -1679,8 +2189,11 @@ HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) {
  #else
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
- alignas(16) const T kShuffle[8] = {0, 3, 6, 5};
- return Vec32<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
+ const RebindToUnsigned<decltype(d2)> d2_u;
+ const auto shuffle_idx =
+ BitCast(d2, Dup128VecFromValues(d2_u, 0, 3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0));
+ return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
  #endif
  }
  template <typename T, HWY_IF_T_SIZE(T, 2)>
@@ -1695,8 +2208,11 @@ HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) {
  #else
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
- alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a};
- return Vec64<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
+ const RebindToUnsigned<decltype(d2)> d2_u;
+ const auto shuffle_idx = BitCast(
+ d2,
+ Dup128VecFromValues(d2_u, 0x0100, 0x0706, 0x0d0c, 0x0b0a, 0, 0, 0, 0));
+ return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
  #endif
  }
  template <typename T, HWY_IF_T_SIZE(T, 4)>
@@ -1723,8 +2239,11 @@ HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) {
  #else
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
- alignas(16) const T kShuffle[8] = {2, 1, 4, 7};
- return Vec32<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
+ const RebindToUnsigned<decltype(d2)> d2_u;
+ const auto shuffle_idx =
+ BitCast(d2, Dup128VecFromValues(d2_u, 2, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0));
+ return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
  #endif
  }
  template <typename T, HWY_IF_T_SIZE(T, 2)>
@@ -1739,8 +2258,11 @@ HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) {
  #else
  const Twice<decltype(d)> d2;
  const auto ba = Combine(d2, b, a);
- alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e};
- return Vec64<T>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
+ const RebindToUnsigned<decltype(d2)> d2_u;
+ const auto shuffle_idx = BitCast(
+ d2,
+ Dup128VecFromValues(d2_u, 0x0504, 0x0302, 0x0908, 0x0f0e, 0, 0, 0, 0));
+ return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
  #endif
  }
  template <typename T, HWY_IF_T_SIZE(T, 4)>
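TableLookupBytes, used throughout the ShuffleTwo* fallbacks above, is a byte-level shuffle: each byte of the index vector selects a byte of the table vector. The rewrite above also routes both operands through u8 so the _mm_shuffle_epi8 path accepts bfloat16_t/float16_t inputs. A hedged usage sketch, not part of the diff:

    // Illustration only: reverse the first four bytes of a 16-byte table.
    namespace hn = hwy::HWY_NAMESPACE;
    const hn::Full128<uint8_t> du8;
    const auto table = hn::Iota(du8, 0);  // bytes 0..15
    const auto idx = hn::Dup128VecFromValues(du8, 3, 2, 1, 0, 4, 5, 6, 7,
                                             8, 9, 10, 11, 12, 13, 14, 15);
    const auto out = hn::TableLookupBytes(table, idx);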
@@ -1812,105 +2334,6 @@ HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {

  // Comparisons set a mask bit to 1 if the condition is true, else 0.

- // ------------------------------ MaskFromVec
-
- namespace detail {
-
- template <typename T, size_t N>
- HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
- const Vec128<T, N> v) {
- return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
- }
- template <typename T, size_t N>
- HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
- const Vec128<T, N> v) {
- return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
- }
- template <typename T, size_t N>
- HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
- const Vec128<T, N> v) {
- return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
- }
- template <typename T, size_t N>
- HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
- const Vec128<T, N> v) {
- return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
- }
-
- } // namespace detail
-
- template <typename T, size_t N>
- HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
- return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
- }
- // There do not seem to be native floating-point versions of these instructions.
- template <size_t N>
- HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
- const RebindToSigned<DFromV<decltype(v)>> di;
- return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
- }
- template <size_t N>
- HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
- const RebindToSigned<DFromV<decltype(v)>> di;
- return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
- }
-
- template <class D>
- using MFromD = decltype(MaskFromVec(VFromD<D>()));
-
- // ------------------------------ VecFromMask
-
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
- return Vec128<T, N>{_mm_movm_epi8(v.raw)};
- }
-
- template <typename T, size_t N, HWY_IF_UI16(T)>
- HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
- return Vec128<T, N>{_mm_movm_epi16(v.raw)};
- }
-
- template <typename T, size_t N, HWY_IF_UI32(T)>
- HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
- return Vec128<T, N>{_mm_movm_epi32(v.raw)};
- }
-
- template <typename T, size_t N, HWY_IF_UI64(T)>
- HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
- return Vec128<T, N>{_mm_movm_epi64(v.raw)};
- }
-
- #if HWY_HAVE_FLOAT16
- template <size_t N>
- HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) {
- return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))};
- }
- #endif // HWY_HAVE_FLOAT16
-
- template <size_t N>
- HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
- return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
- }
-
- template <size_t N>
- HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
- return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
- }
-
- // Generic for all vector lengths.
- template <class D>
- HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
- return VecFromMask(v);
- }
-
- // ------------------------------ RebindMask (MaskFromVec)
-
- template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)>
- HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
- static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
- return MFromD<DTo>{m.raw};
- }
-
  // ------------------------------ TestBit

  namespace detail {
@@ -1970,7 +2393,11 @@ HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
  template <size_t N>
  HWY_API Mask128<float16_t, N> operator==(Vec128<float16_t, N> a,
  Vec128<float16_t, N> b) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+ HWY_DIAGNOSTICS(pop)
  }
  #endif // HWY_HAVE_FLOAT16
  template <size_t N>
@@ -2010,7 +2437,11 @@ HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
  template <size_t N>
  HWY_API Mask128<float16_t, N> operator!=(Vec128<float16_t, N> a,
  Vec128<float16_t, N> b) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+ HWY_DIAGNOSTICS(pop)
  }
  #endif // HWY_HAVE_FLOAT16
  template <size_t N>
@@ -2072,7 +2503,11 @@ HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
  template <size_t N>
  HWY_API Mask128<float16_t, N> operator>(Vec128<float16_t, N> a,
  Vec128<float16_t, N> b) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+ HWY_DIAGNOSTICS(pop)
  }
  #endif // HWY_HAVE_FLOAT16
  template <size_t N>
@@ -2090,7 +2525,11 @@ HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
  template <size_t N>
  HWY_API Mask128<float16_t, N> operator>=(Vec128<float16_t, N> a,
  Vec128<float16_t, N> b) {
+ // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+ HWY_DIAGNOSTICS(pop)
  }
  #endif // HWY_HAVE_FLOAT16
  template <size_t N>
@@ -2494,7 +2933,7 @@ static HWY_INLINE V MaskOutVec128Iota(V v) {
2494
2933
  template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)>
2495
2934
  HWY_API VFromD<D> Iota(D d, const T2 first) {
2496
2935
  const auto result_iota =
2497
- detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
2936
+ detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
2498
2937
  #if HWY_COMPILER_MSVC
2499
2938
  return detail::MaskOutVec128Iota(result_iota);
2500
2939
  #else
@@ -2619,9 +3058,11 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
2619
3058
  }
2620
3059
 
2621
3060
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
2622
- HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
3061
+ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
2623
3062
  const TFromD<D>* HWY_RESTRICT p) {
2624
- return VFromD<D>{_mm_mask_loadu_epi16(v.raw, m.raw, p)};
3063
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
3064
+ return BitCast(d, VFromD<decltype(du)>{
3065
+ _mm_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
2625
3066
  }
2626
3067
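The MaskedLoadOr overloads read p[i] into lanes where m is set and keep the corresponding lane of v elsewhere; the detour through RebindToUnsigned above only exists because _mm_mask_loadu_epi16 has no float16_t form. A minimal scalar sketch of the per-lane contract (standalone C++, names illustrative):

  #include <cstddef>
  #include <cstdint>

  // Scalar model of MaskedLoadOr: lane i becomes p[i] if mask[i] is set, else keeps v[i].
  void MaskedLoadOrModel(uint16_t* v, const bool* mask, const uint16_t* p, size_t lanes) {
    for (size_t i = 0; i < lanes; ++i) {
      if (mask[i]) v[i] = p[i];
    }
  }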
 
2627
3068
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
@@ -3216,23 +3657,182 @@ HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
3216
3657
  return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
3217
3658
  }
3218
3659
 
3219
- // ------------------------------ SumsOf8
3220
- template <size_t N>
3221
- HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
3222
- return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
3660
+ // ------------------------------ AddSub
3661
+
3662
+ #if HWY_TARGET <= HWY_SSSE3
3663
+ template <size_t N, HWY_IF_LANES_GT(N, 1)>
3664
+ HWY_API Vec128<float, N> AddSub(Vec128<float, N> a, Vec128<float, N> b) {
3665
+ return Vec128<float, N>{_mm_addsub_ps(a.raw, b.raw)};
3666
+ }
3667
+ HWY_API Vec128<double> AddSub(Vec128<double> a, Vec128<double> b) {
3668
+ return Vec128<double>{_mm_addsub_pd(a.raw, b.raw)};
3669
+ }
3670
+ #endif // HWY_TARGET <= HWY_SSSE3
3671
+
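AddSub wraps _mm_addsub_ps/_mm_addsub_pd, which subtract b in even-indexed lanes and add it in odd-indexed lanes. A scalar sketch of that lane pattern (standalone C++, illustrative; lane 0 is the lowest lane):

  #include <cstddef>

  // Scalar model of AddSub: r[i] = a[i] - b[i] for even i, a[i] + b[i] for odd i.
  void AddSubModel(const float* a, const float* b, float* r, size_t lanes) {
    for (size_t i = 0; i < lanes; ++i) {
      r[i] = (i & 1) ? (a[i] + b[i]) : (a[i] - b[i]);
    }
  }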
3672
+ // ------------------------------ SumsOf8
3673
+ template <size_t N>
3674
+ HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
3675
+ return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
3676
+ }
3677
+
3678
+ // Generic for all vector lengths
3679
+ template <class V, HWY_IF_I8_D(DFromV<V>)>
3680
+ HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
3681
+ const DFromV<decltype(v)> d;
3682
+ const RebindToUnsigned<decltype(d)> du;
3683
+ const Repartition<int64_t, decltype(d)> di64;
3684
+
3685
+ // Adjust the values of v to be in the 0..255 range by adding 128 to each lane
3686
  + // of v (which is the same as a bitwise XOR of each i8 lane by 128) and then
3687
  + // bitcasting the Xor result to a u8 vector.
3688
+ const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
3689
+
3690
+ // Need to add -1024 to each i64 lane of the result of the SumsOf8(v_adj)
3691
+ // operation to account for the adjustment made above.
3692
+ return BitCast(di64, SumsOf8(v_adj)) + Set(di64, int64_t{-1024});
3693
+ }
3694
+
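The correction above works because XOR with the sign bit maps each i8 lane x to the u8 value x + 128, and each output lane sums 8 such biased lanes, hence the constant 8 * 128 = 1024. A standalone scalar check of that identity (illustrative, not part of the header):

  #include <cassert>
  #include <cstdint>

  // Sum of 8 signed bytes == sum of the 8 biased (x ^ 0x80) unsigned bytes, minus 1024.
  int64_t SumsOf8SignedModel(const int8_t* x) {
    int64_t direct = 0;
    int64_t biased = 0;
    for (int i = 0; i < 8; ++i) {
      direct += x[i];
      biased += static_cast<uint8_t>(x[i] ^ 0x80);  // same value as x[i] + 128
    }
    assert(direct == biased - 1024);
    return direct;
  }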
3695
+ #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3696
+ #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3697
+ #else
3698
+ #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3699
+ #endif
3700
+
3701
+ template <size_t N>
3702
+ HWY_API Vec128<uint64_t, N / 8> SumsOf8AbsDiff(const Vec128<uint8_t, N> a,
3703
+ const Vec128<uint8_t, N> b) {
3704
+ return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.raw, b.raw)};
3705
+ }
3706
+
3707
+ // Generic for all vector lengths
3708
+ template <class V, HWY_IF_I8_D(DFromV<V>)>
3709
+ HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
3710
+ const DFromV<V> d;
3711
+ const RebindToUnsigned<decltype(d)> du;
3712
+ const RepartitionToWideX3<decltype(d)> di64;
3713
+
3714
+ // Adjust the values of a and b to be in the 0..255 range by adding 128 to
3715
  + // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
3716
+ // by 128) and then bitcasting the results of the Xor operations to u8
3717
+ // vectors.
3718
+ const auto i8_msb = SignBit(d);
3719
+ const auto a_adj = BitCast(du, Xor(a, i8_msb));
3720
+ const auto b_adj = BitCast(du, Xor(b, i8_msb));
3721
+
3722
+ // The result of SumsOf8AbsDiff(a_adj, b_adj) can simply be bitcasted to an
3723
  + // i64 vector because |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]|.
3724
+ return BitCast(di64, SumsOf8AbsDiff(a_adj, b_adj));
3725
+ }
3726
+
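Unlike SumsOf8 above, no constant correction is needed here: both operands receive the same +128 bias, which cancels inside each absolute difference. A standalone scalar check (illustrative):

  #include <cassert>
  #include <cstdint>
  #include <cstdlib>

  // The shared +128 bias cancels: |(a + 128) - (b + 128)| == |a - b|.
  void CheckAbsDiffBias(int8_t a, int8_t b) {
    const int biased_a = static_cast<uint8_t>(a ^ 0x80);
    const int biased_b = static_cast<uint8_t>(b ^ 0x80);
    assert(std::abs(biased_a - biased_b) == std::abs(int{a} - int{b}));
  }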
3727
+ // ------------------------------ SumsOf4
3728
+ #if HWY_TARGET <= HWY_AVX3
3729
+ namespace detail {
3730
+
3731
+ template <size_t N>
3732
+ HWY_INLINE Vec128<uint32_t, (N + 3) / 4> SumsOf4(
3733
+ hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/,
3734
+ Vec128<uint8_t, N> v) {
3735
+ const DFromV<decltype(v)> d;
3736
+
3737
+ // _mm_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
3738
+ // zeroed out and the sums of the 4 consecutive lanes are already in the
3739
+ // even uint16_t lanes of the _mm_maskz_dbsad_epu8 result.
3740
+ return Vec128<uint32_t, (N + 3) / 4>{
3741
+ _mm_maskz_dbsad_epu8(static_cast<__mmask8>(0x55), v.raw, Zero(d).raw, 0)};
3742
+ }
3743
+
3744
+ // detail::SumsOf4 for Vec128<int8_t, N> on AVX3 is implemented in x86_512-inl.h
3745
+
3746
+ } // namespace detail
3747
+ #endif // HWY_TARGET <= HWY_AVX3
3748
+
3749
+ // ------------------------------ SumsOfAdjQuadAbsDiff
3750
+
3751
+ #if HWY_TARGET <= HWY_SSE4
3752
+ #ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
3753
+ #undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
3754
+ #else
3755
+ #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
3756
+ #endif
3757
+
3758
+ template <int kAOffset, int kBOffset, size_t N>
3759
+ HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfAdjQuadAbsDiff(
3760
+ Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
3761
+ static_assert(0 <= kAOffset && kAOffset <= 1,
3762
+ "kAOffset must be between 0 and 1");
3763
+ static_assert(0 <= kBOffset && kBOffset <= 3,
3764
+ "kBOffset must be between 0 and 3");
3765
+ return Vec128<uint16_t, (N + 1) / 2>{
3766
+ _mm_mpsadbw_epu8(a.raw, b.raw, (kAOffset << 2) | kBOffset)};
3223
3767
  }
3224
3768
 
3225
- #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3226
- #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3769
+ // Generic for all vector lengths
3770
+ template <int kAOffset, int kBOffset, class V, HWY_IF_I8_D(DFromV<V>)>
3771
+ HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfAdjQuadAbsDiff(V a, V b) {
3772
+ const DFromV<decltype(a)> d;
3773
+ const RebindToUnsigned<decltype(d)> du;
3774
+ const RepartitionToWide<decltype(d)> dw;
3775
+
3776
+ // Adjust the values of a and b to be in the 0..255 range by adding 128 to
3777
  + // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
3778
+ // by 128) and then bitcasting the results of the Xor operations to u8
3779
+ // vectors.
3780
+ const auto i8_msb = SignBit(d);
3781
+ const auto a_adj = BitCast(du, Xor(a, i8_msb));
3782
+ const auto b_adj = BitCast(du, Xor(b, i8_msb));
3783
+
3784
+ // The result of SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj) can
3785
+ // simply be bitcasted to an i16 vector as
3786
+ // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true.
3787
+ return BitCast(dw, SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj));
3788
+ }
3789
+ #endif
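The immediate passed to _mm_mpsadbw_epu8 in the u8 overload above packs both template offsets into one byte: bit 2 selects the 4-byte-aligned source window in a and bits 1..0 select the 4-byte group in b, which is what (kAOffset << 2) | kBOffset encodes. A small constexpr sketch of the encoding (illustrative):

  // Mirrors the (kAOffset << 2) | kBOffset immediate from SumsOfAdjQuadAbsDiff.
  constexpr int MpsadbwImm(int a_offset, int b_offset) {
    return (a_offset << 2) | b_offset;
  }
  static_assert(MpsadbwImm(0, 0) == 0, "lowest windows of a and b");
  static_assert(MpsadbwImm(1, 2) == 6, "a window at byte 4, b group at byte 8");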
3790
+
3791
+ // ------------------------------ SumsOfShuffledQuadAbsDiff
3792
+
3793
+ #if HWY_TARGET <= HWY_AVX3
3794
+ #ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
3795
+ #undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
3227
3796
  #else
3228
- #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
3797
+ #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
3229
3798
  #endif
3230
3799
 
3231
- template <size_t N>
3232
- HWY_API Vec128<uint64_t, N / 8> SumsOf8AbsDiff(const Vec128<uint8_t, N> a,
3233
- const Vec128<uint8_t, N> b) {
3234
- return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.raw, b.raw)};
3800
+ template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, size_t N>
3801
+ HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfShuffledQuadAbsDiff(
3802
+ Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
3803
+ static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
3804
+ static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
3805
+ static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
3806
+ static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
3807
+ return Vec128<uint16_t, (N + 1) / 2>{
3808
+ _mm_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
3809
+ }
3810
+
3811
+ // Generic for all vector lengths
3812
+ template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V,
3813
+ HWY_IF_I8_D(DFromV<V>)>
3814
+ HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfShuffledQuadAbsDiff(V a,
3815
+ V b) {
3816
+ const DFromV<decltype(a)> d;
3817
+ const RebindToUnsigned<decltype(d)> du;
3818
+ const RepartitionToWide<decltype(d)> dw;
3819
+
3820
+ // Adjust the values of a and b to be in the 0..255 range by adding 128 to
3821
  + // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
3822
+ // by 128) and then bitcasting the results of the Xor operations to u8
3823
+ // vectors.
3824
+ const auto i8_msb = SignBit(d);
3825
+ const auto a_adj = BitCast(du, Xor(a, i8_msb));
3826
+ const auto b_adj = BitCast(du, Xor(b, i8_msb));
3827
+
3828
+ // The result of
3829
+ // SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj) can
3830
+ // simply be bitcasted to an i16 vector as
3831
+ // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true.
3832
+ return BitCast(
3833
+ dw, SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj));
3235
3834
  }
3835
+ #endif
3236
3836
 
3237
3837
  // ------------------------------ SaturatedAdd
3238
3838
 
@@ -3631,16 +4231,62 @@ HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
3631
4231
  #endif
3632
4232
  }
3633
4233
 
4234
+ #if HWY_TARGET <= HWY_AVX3
3634
4235
  template <size_t N>
3635
4236
  HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
3636
- #if HWY_TARGET <= HWY_AVX3
3637
4237
  return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
4238
+ }
3638
4239
  #else
4240
+ // I64 Abs is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
4241
+ template <class V, HWY_IF_I64(TFromV<V>)>
4242
+ HWY_API V Abs(V v) {
3639
4243
  const auto zero = Zero(DFromV<decltype(v)>());
3640
- return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
4244
+ return IfNegativeThenElse(v, zero - v, v);
4245
+ }
4246
+ #endif
4247
+
4248
+ #ifdef HWY_NATIVE_SATURATED_ABS
4249
+ #undef HWY_NATIVE_SATURATED_ABS
4250
+ #else
4251
+ #define HWY_NATIVE_SATURATED_ABS
4252
+ #endif
4253
+
4254
+ // Generic for all vector lengths
4255
+ template <class V, HWY_IF_I8(TFromV<V>)>
4256
+ HWY_API V SaturatedAbs(V v) {
4257
+ const DFromV<decltype(v)> d;
4258
+ const RebindToUnsigned<decltype(d)> du;
4259
+ return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v))));
4260
+ }
4261
+
4262
+ // Generic for all vector lengths
4263
+ template <class V, HWY_IF_I16(TFromV<V>)>
4264
+ HWY_API V SaturatedAbs(V v) {
4265
+ return Max(v, SaturatedSub(Zero(DFromV<V>()), v));
4266
+ }
4267
+
4268
+ // Generic for all vector lengths
4269
+ template <class V, HWY_IF_I32(TFromV<V>)>
4270
+ HWY_API V SaturatedAbs(V v) {
4271
+ const auto abs_v = Abs(v);
4272
+
4273
+ #if HWY_TARGET <= HWY_SSE4
4274
+ const DFromV<decltype(v)> d;
4275
+ const RebindToUnsigned<decltype(d)> du;
4276
+ return BitCast(d, Min(BitCast(du, abs_v),
4277
+ Set(du, static_cast<uint32_t>(LimitsMax<int32_t>()))));
4278
+ #else
4279
+ return Add(abs_v, BroadcastSignBit(abs_v));
3641
4280
  #endif
3642
4281
  }
3643
4282
 
4283
+ // Generic for all vector lengths
4284
+ template <class V, HWY_IF_I64(TFromV<V>)>
4285
+ HWY_API V SaturatedAbs(V v) {
4286
+ const auto abs_v = Abs(v);
4287
+ return Add(abs_v, BroadcastSignBit(abs_v));
4288
+ }
4289
+
3644
4290
  // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512VL
3645
4291
  // srli_epi64: the count should be unsigned int. Note that this is not the same
3646
4292
  // as the Shift3264Count in x86_512-inl.h (GCC also requires int).
@@ -3743,6 +4389,49 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
3743
4389
  #endif
3744
4390
  }
3745
4391
 
4392
+ // ------------------------------ IfNegativeThenNegOrUndefIfZero
4393
+
4394
+ #if HWY_TARGET <= HWY_SSSE3
4395
+
4396
+ #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
4397
+ #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
4398
+ #else
4399
+ #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
4400
+ #endif
4401
+
4402
+ template <size_t N>
4403
+ HWY_API Vec128<int8_t, N> IfNegativeThenNegOrUndefIfZero(Vec128<int8_t, N> mask,
4404
+ Vec128<int8_t, N> v) {
4405
+ return Vec128<int8_t, N>{_mm_sign_epi8(v.raw, mask.raw)};
4406
+ }
4407
+
4408
+ template <size_t N>
4409
+ HWY_API Vec128<int16_t, N> IfNegativeThenNegOrUndefIfZero(
4410
+ Vec128<int16_t, N> mask, Vec128<int16_t, N> v) {
4411
+ return Vec128<int16_t, N>{_mm_sign_epi16(v.raw, mask.raw)};
4412
+ }
4413
+
4414
+ template <size_t N>
4415
+ HWY_API Vec128<int32_t, N> IfNegativeThenNegOrUndefIfZero(
4416
+ Vec128<int32_t, N> mask, Vec128<int32_t, N> v) {
4417
+ return Vec128<int32_t, N>{_mm_sign_epi32(v.raw, mask.raw)};
4418
+ }
4419
+
4420
+ // Generic for all vector lengths
4421
+ template <class V, HWY_IF_I64_D(DFromV<V>)>
4422
+ HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
4423
+ #if HWY_TARGET <= HWY_AVX3
4424
+ // MaskedSubOr is more efficient than IfNegativeThenElse on AVX3
4425
+ const DFromV<decltype(v)> d;
4426
+ return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
4427
+ #else
4428
+ // IfNegativeThenElse is more efficient than MaskedSubOr on SSE4/AVX2
4429
+ return IfNegativeThenElse(mask, Neg(v), v);
4430
+ #endif
4431
+ }
4432
+
4433
+ #endif // HWY_TARGET <= HWY_SSSE3
4434
+
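The SSSE3 psign instructions behind IfNegativeThenNegOrUndefIfZero negate v where mask is negative, pass v through where mask is positive, and produce zero where mask is zero; the op's name only guarantees the first two cases, hence "undefined if zero". A per-lane scalar model (standalone C++, illustrative; hardware negation of the most negative value wraps):

  #include <cstdint>

  // Scalar model of _mm_sign_epi8/epi16/epi32 for one lane.
  int32_t SignModel(int32_t mask, int32_t v) {
    if (mask < 0) return -v;  // the case IfNegativeThenNegOrUndefIfZero relies on
    if (mask > 0) return v;
    return 0;  // mask == 0: psign yields 0; the Highway op leaves this unspecified
  }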
3746
4435
  // ------------------------------ ShiftLeftSame
3747
4436
 
3748
4437
  template <size_t N>
@@ -4000,6 +4689,361 @@ HWY_API V AbsDiff(V a, V b) {
4000
4689
  return Abs(a - b);
4001
4690
  }
4002
4691
 
4692
+ // ------------------------------ MaskedMinOr
4693
+
4694
+ #if HWY_TARGET <= HWY_AVX3
4695
+
4696
+ #ifdef HWY_NATIVE_MASKED_ARITH
4697
+ #undef HWY_NATIVE_MASKED_ARITH
4698
+ #else
4699
+ #define HWY_NATIVE_MASKED_ARITH
4700
+ #endif
4701
+
4702
+ template <typename T, size_t N, HWY_IF_U8(T)>
4703
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4704
+ Vec128<T, N> a, Vec128<T, N> b) {
4705
+ return Vec128<T, N>{_mm_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
4706
+ }
4707
+ template <typename T, size_t N, HWY_IF_I8(T)>
4708
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4709
+ Vec128<T, N> a, Vec128<T, N> b) {
4710
+ return Vec128<T, N>{_mm_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
4711
+ }
4712
+
4713
+ template <typename T, size_t N, HWY_IF_U16(T)>
4714
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4715
+ Vec128<T, N> a, Vec128<T, N> b) {
4716
+ return Vec128<T, N>{_mm_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
4717
+ }
4718
+ template <typename T, size_t N, HWY_IF_I16(T)>
4719
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4720
+ Vec128<T, N> a, Vec128<T, N> b) {
4721
+ return Vec128<T, N>{_mm_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
4722
+ }
4723
+
4724
+ template <typename T, size_t N, HWY_IF_U32(T)>
4725
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4726
+ Vec128<T, N> a, Vec128<T, N> b) {
4727
+ return Vec128<T, N>{_mm_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
4728
+ }
4729
+ template <typename T, size_t N, HWY_IF_I32(T)>
4730
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4731
+ Vec128<T, N> a, Vec128<T, N> b) {
4732
+ return Vec128<T, N>{_mm_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
4733
+ }
4734
+
4735
+ template <typename T, size_t N, HWY_IF_U64(T)>
4736
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4737
+ Vec128<T, N> a, Vec128<T, N> b) {
4738
+ return Vec128<T, N>{_mm_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
4739
+ }
4740
+ template <typename T, size_t N, HWY_IF_I64(T)>
4741
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4742
+ Vec128<T, N> a, Vec128<T, N> b) {
4743
+ return Vec128<T, N>{_mm_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
4744
+ }
4745
+
4746
+ template <typename T, size_t N, HWY_IF_F32(T)>
4747
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4748
+ Vec128<T, N> a, Vec128<T, N> b) {
4749
+ return Vec128<T, N>{_mm_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
4750
+ }
4751
+
4752
+ template <typename T, size_t N, HWY_IF_F64(T)>
4753
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4754
+ Vec128<T, N> a, Vec128<T, N> b) {
4755
+ return Vec128<T, N>{_mm_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
4756
+ }
4757
+
4758
+ #if HWY_HAVE_FLOAT16
4759
+ template <typename T, size_t N, HWY_IF_F16(T)>
4760
+ HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
4761
+ Vec128<T, N> a, Vec128<T, N> b) {
4762
+ return Vec128<T, N>{_mm_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
4763
+ }
4764
+ #endif // HWY_HAVE_FLOAT16
4765
+
4766
+ // ------------------------------ MaskedMaxOr
4767
+
4768
+ template <typename T, size_t N, HWY_IF_U8(T)>
4769
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4770
+ Vec128<T, N> a, Vec128<T, N> b) {
4771
+ return Vec128<T, N>{_mm_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
4772
+ }
4773
+ template <typename T, size_t N, HWY_IF_I8(T)>
4774
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4775
+ Vec128<T, N> a, Vec128<T, N> b) {
4776
+ return Vec128<T, N>{_mm_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
4777
+ }
4778
+
4779
+ template <typename T, size_t N, HWY_IF_U16(T)>
4780
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4781
+ Vec128<T, N> a, Vec128<T, N> b) {
4782
+ return Vec128<T, N>{_mm_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
4783
+ }
4784
+ template <typename T, size_t N, HWY_IF_I16(T)>
4785
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4786
+ Vec128<T, N> a, Vec128<T, N> b) {
4787
+ return Vec128<T, N>{_mm_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
4788
+ }
4789
+
4790
+ template <typename T, size_t N, HWY_IF_U32(T)>
4791
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4792
+ Vec128<T, N> a, Vec128<T, N> b) {
4793
+ return Vec128<T, N>{_mm_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
4794
+ }
4795
+ template <typename T, size_t N, HWY_IF_I32(T)>
4796
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4797
+ Vec128<T, N> a, Vec128<T, N> b) {
4798
+ return Vec128<T, N>{_mm_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
4799
+ }
4800
+
4801
+ template <typename T, size_t N, HWY_IF_U64(T)>
4802
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4803
+ Vec128<T, N> a, Vec128<T, N> b) {
4804
+ return Vec128<T, N>{_mm_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
4805
+ }
4806
+ template <typename T, size_t N, HWY_IF_I64(T)>
4807
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4808
+ Vec128<T, N> a, Vec128<T, N> b) {
4809
+ return Vec128<T, N>{_mm_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
4810
+ }
4811
+
4812
+ template <typename T, size_t N, HWY_IF_F32(T)>
4813
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4814
+ Vec128<T, N> a, Vec128<T, N> b) {
4815
+ return Vec128<T, N>{_mm_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
4816
+ }
4817
+
4818
+ template <typename T, size_t N, HWY_IF_F64(T)>
4819
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4820
+ Vec128<T, N> a, Vec128<T, N> b) {
4821
+ return Vec128<T, N>{_mm_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
4822
+ }
4823
+
4824
+ #if HWY_HAVE_FLOAT16
4825
+ template <typename T, size_t N, HWY_IF_F16(T)>
4826
+ HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
4827
+ Vec128<T, N> a, Vec128<T, N> b) {
4828
+ return Vec128<T, N>{_mm_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
4829
+ }
4830
+ #endif // HWY_HAVE_FLOAT16
4831
+
4832
+ // ------------------------------ MaskedAddOr
4833
+
4834
+ template <typename T, size_t N, HWY_IF_UI8(T)>
4835
+ HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
4836
+ Vec128<T, N> a, Vec128<T, N> b) {
4837
+ return Vec128<T, N>{_mm_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
4838
+ }
4839
+
4840
+ template <typename T, size_t N, HWY_IF_UI16(T)>
4841
+ HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
4842
+ Vec128<T, N> a, Vec128<T, N> b) {
4843
+ return Vec128<T, N>{_mm_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
4844
+ }
4845
+
4846
+ template <typename T, size_t N, HWY_IF_UI32(T)>
4847
+ HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
4848
+ Vec128<T, N> a, Vec128<T, N> b) {
4849
+ return Vec128<T, N>{_mm_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
4850
+ }
4851
+
4852
+ template <typename T, size_t N, HWY_IF_UI64(T)>
4853
+ HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
4854
+ Vec128<T, N> a, Vec128<T, N> b) {
4855
+ return Vec128<T, N>{_mm_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
4856
+ }
4857
+
4858
+ template <typename T, size_t N, HWY_IF_F32(T)>
4859
+ HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
4860
+ Vec128<T, N> a, Vec128<T, N> b) {
4861
+ return Vec128<T, N>{_mm_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
4862
+ }
4863
+
4864
+ template <typename T, size_t N, HWY_IF_F64(T)>
4865
+ HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
4866
+ Vec128<T, N> a, Vec128<T, N> b) {
4867
+ return Vec128<T, N>{_mm_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
4868
+ }
4869
+
4870
+ #if HWY_HAVE_FLOAT16
4871
+ template <typename T, size_t N, HWY_IF_F16(T)>
4872
+ HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
4873
+ Vec128<T, N> a, Vec128<T, N> b) {
4874
+ return Vec128<T, N>{_mm_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
4875
+ }
4876
+ #endif // HWY_HAVE_FLOAT16
4877
+
4878
+ // ------------------------------ MaskedSubOr
4879
+
4880
+ template <typename T, size_t N, HWY_IF_UI8(T)>
4881
+ HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
4882
+ Vec128<T, N> a, Vec128<T, N> b) {
4883
+ return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
4884
+ }
4885
+
4886
+ template <typename T, size_t N, HWY_IF_UI16(T)>
4887
+ HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
4888
+ Vec128<T, N> a, Vec128<T, N> b) {
4889
+ return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
4890
+ }
4891
+
4892
+ template <typename T, size_t N, HWY_IF_UI32(T)>
4893
+ HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
4894
+ Vec128<T, N> a, Vec128<T, N> b) {
4895
+ return Vec128<T, N>{_mm_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
4896
+ }
4897
+
4898
+ template <typename T, size_t N, HWY_IF_UI64(T)>
4899
+ HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
4900
+ Vec128<T, N> a, Vec128<T, N> b) {
4901
+ return Vec128<T, N>{_mm_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
4902
+ }
4903
+
4904
+ template <typename T, size_t N, HWY_IF_F32(T)>
4905
+ HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
4906
+ Vec128<T, N> a, Vec128<T, N> b) {
4907
+ return Vec128<T, N>{_mm_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
4908
+ }
4909
+
4910
+ template <typename T, size_t N, HWY_IF_F64(T)>
4911
+ HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
4912
+ Vec128<T, N> a, Vec128<T, N> b) {
4913
+ return Vec128<T, N>{_mm_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
4914
+ }
4915
+
4916
+ #if HWY_HAVE_FLOAT16
4917
+ template <typename T, size_t N, HWY_IF_F16(T)>
4918
+ HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
4919
+ Vec128<T, N> a, Vec128<T, N> b) {
4920
+ return Vec128<T, N>{_mm_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
4921
+ }
4922
+ #endif // HWY_HAVE_FLOAT16
4923
+
4924
+ // ------------------------------ MaskedMulOr
4925
+
4926
  + // There are no elementwise integer mask_mul intrinsics. Generic for all vector lengths.
4927
+ template <class V, class M>
4928
+ HWY_API V MaskedMulOr(V no, M m, V a, V b) {
4929
+ return IfThenElse(m, a * b, no);
4930
+ }
4931
+
4932
+ template <size_t N>
4933
+ HWY_API Vec128<float, N> MaskedMulOr(Vec128<float, N> no, Mask128<float, N> m,
4934
+ Vec128<float, N> a, Vec128<float, N> b) {
4935
+ return Vec128<float, N>{_mm_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
4936
+ }
4937
+
4938
+ template <size_t N>
4939
+ HWY_API Vec128<double, N> MaskedMulOr(Vec128<double, N> no,
4940
+ Mask128<double, N> m, Vec128<double, N> a,
4941
+ Vec128<double, N> b) {
4942
+ return Vec128<double, N>{_mm_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
4943
+ }
4944
+
4945
+ #if HWY_HAVE_FLOAT16
4946
+ template <size_t N>
4947
+ HWY_API Vec128<float16_t, N> MaskedMulOr(Vec128<float16_t, N> no,
4948
+ Mask128<float16_t, N> m,
4949
+ Vec128<float16_t, N> a,
4950
+ Vec128<float16_t, N> b) {
4951
+ return Vec128<float16_t, N>{_mm_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
4952
+ }
4953
+ #endif // HWY_HAVE_FLOAT16
4954
+
4955
+ // ------------------------------ MaskedDivOr
4956
+
4957
+ template <size_t N>
4958
+ HWY_API Vec128<float, N> MaskedDivOr(Vec128<float, N> no, Mask128<float, N> m,
4959
+ Vec128<float, N> a, Vec128<float, N> b) {
4960
+ return Vec128<float, N>{_mm_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
4961
+ }
4962
+
4963
+ template <size_t N>
4964
+ HWY_API Vec128<double, N> MaskedDivOr(Vec128<double, N> no,
4965
+ Mask128<double, N> m, Vec128<double, N> a,
4966
+ Vec128<double, N> b) {
4967
+ return Vec128<double, N>{_mm_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
4968
+ }
4969
+
4970
+ #if HWY_HAVE_FLOAT16
4971
+ template <size_t N>
4972
+ HWY_API Vec128<float16_t, N> MaskedDivOr(Vec128<float16_t, N> no,
4973
+ Mask128<float16_t, N> m,
4974
+ Vec128<float16_t, N> a,
4975
+ Vec128<float16_t, N> b) {
4976
+ return Vec128<float16_t, N>{_mm_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
4977
+ }
4978
+ #endif // HWY_HAVE_FLOAT16
4979
+
4980
+ // Generic for all vector lengths
4981
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4982
+ HWY_API V MaskedDivOr(V no, MFromD<DFromV<V>> m, V a, V b) {
4983
+ return IfThenElse(m, Div(a, b), no);
4984
+ }
4985
+
4986
+ // ------------------------------ MaskedModOr
4987
+ // Generic for all vector lengths
4988
+ template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
4989
+ HWY_API V MaskedModOr(V no, MFromD<DFromV<V>> m, V a, V b) {
4990
+ return IfThenElse(m, Mod(a, b), no);
4991
+ }
4992
+
4993
+ // ------------------------------ MaskedSatAddOr
4994
+
4995
+ template <typename T, size_t N, HWY_IF_I8(T)>
4996
+ HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
4997
+ Vec128<T, N> a, Vec128<T, N> b) {
4998
+ return Vec128<T, N>{_mm_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
4999
+ }
5000
+
5001
+ template <typename T, size_t N, HWY_IF_U8(T)>
5002
+ HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
5003
+ Vec128<T, N> a, Vec128<T, N> b) {
5004
+ return Vec128<T, N>{_mm_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
5005
+ }
5006
+
5007
+ template <typename T, size_t N, HWY_IF_I16(T)>
5008
+ HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
5009
+ Vec128<T, N> a, Vec128<T, N> b) {
5010
+ return Vec128<T, N>{_mm_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
5011
+ }
5012
+
5013
+ template <typename T, size_t N, HWY_IF_U16(T)>
5014
+ HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
5015
+ Vec128<T, N> a, Vec128<T, N> b) {
5016
+ return Vec128<T, N>{_mm_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
5017
+ }
5018
+
5019
+ // ------------------------------ MaskedSatSubOr
5020
+
5021
+ template <typename T, size_t N, HWY_IF_I8(T)>
5022
+ HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
5023
+ Vec128<T, N> a, Vec128<T, N> b) {
5024
+ return Vec128<T, N>{_mm_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
5025
+ }
5026
+
5027
+ template <typename T, size_t N, HWY_IF_U8(T)>
5028
+ HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
5029
+ Vec128<T, N> a, Vec128<T, N> b) {
5030
+ return Vec128<T, N>{_mm_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
5031
+ }
5032
+
5033
+ template <typename T, size_t N, HWY_IF_I16(T)>
5034
+ HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
5035
+ Vec128<T, N> a, Vec128<T, N> b) {
5036
+ return Vec128<T, N>{_mm_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
5037
+ }
5038
+
5039
+ template <typename T, size_t N, HWY_IF_U16(T)>
5040
+ HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
5041
+ Vec128<T, N> a, Vec128<T, N> b) {
5042
+ return Vec128<T, N>{_mm_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
5043
+ }
5044
+
5045
+ #endif // HWY_TARGET <= HWY_AVX3
5046
+
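All of the Masked*Or ops above share one contract: lanes where m is set receive the arithmetic result and the remaining lanes receive the corresponding lane of no, so e.g. MaskedMinOr(no, m, a, b) behaves like IfThenElse(m, Min(a, b), no); the AVX3 intrinsics merely fuse that select into one instruction. A scalar sketch of the contract (standalone C++, illustrative):

  #include <algorithm>
  #include <cstddef>
  #include <cstdint>

  // Scalar model of MaskedMinOr: r[i] = m[i] ? min(a[i], b[i]) : no[i].
  void MaskedMinOrModel(const int32_t* no, const bool* m, const int32_t* a,
                        const int32_t* b, int32_t* r, size_t lanes) {
    for (size_t i = 0; i < lanes; ++i) {
      r[i] = m[i] ? std::min(a[i], b[i]) : no[i];
    }
  }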
4003
5047
  // ------------------------------ Floating-point multiply-add variants
4004
5048
 
4005
5049
  #if HWY_HAVE_FLOAT16
@@ -4035,7 +5079,7 @@ HWY_API Vec128<float16_t, N> NegMulSub(Vec128<float16_t, N> mul,
4035
5079
  template <size_t N>
4036
5080
  HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
4037
5081
  Vec128<float, N> add) {
4038
- #if HWY_TARGET >= HWY_SSE4
5082
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
4039
5083
  return mul * x + add;
4040
5084
  #else
4041
5085
  return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
@@ -4044,7 +5088,7 @@ HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
4044
5088
  template <size_t N>
4045
5089
  HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x,
4046
5090
  Vec128<double, N> add) {
4047
- #if HWY_TARGET >= HWY_SSE4
5091
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
4048
5092
  return mul * x + add;
4049
5093
  #else
4050
5094
  return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
@@ -4055,7 +5099,7 @@ HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x,
4055
5099
  template <size_t N>
4056
5100
  HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
4057
5101
  Vec128<float, N> add) {
4058
- #if HWY_TARGET >= HWY_SSE4
5102
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
4059
5103
  return add - mul * x;
4060
5104
  #else
4061
5105
  return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
@@ -4064,7 +5108,7 @@ HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
4064
5108
  template <size_t N>
4065
5109
  HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x,
4066
5110
  Vec128<double, N> add) {
4067
- #if HWY_TARGET >= HWY_SSE4
5111
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
4068
5112
  return add - mul * x;
4069
5113
  #else
4070
5114
  return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
@@ -4075,7 +5119,7 @@ HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x,
4075
5119
  template <size_t N>
4076
5120
  HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x,
4077
5121
  Vec128<float, N> sub) {
4078
- #if HWY_TARGET >= HWY_SSE4
5122
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
4079
5123
  return mul * x - sub;
4080
5124
  #else
4081
5125
  return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
@@ -4084,33 +5128,65 @@ HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x,
4084
5128
  template <size_t N>
4085
5129
  HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x,
4086
5130
  Vec128<double, N> sub) {
4087
- #if HWY_TARGET >= HWY_SSE4
5131
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
4088
5132
  return mul * x - sub;
4089
5133
  #else
4090
5134
  return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
4091
5135
  #endif
4092
5136
  }
4093
5137
 
4094
- // Returns -mul * x - sub
4095
- template <size_t N>
4096
- HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x,
4097
- Vec128<float, N> sub) {
4098
- #if HWY_TARGET >= HWY_SSE4
4099
- return Neg(mul) * x - sub;
5138
+ // Returns -mul * x - sub
5139
+ template <size_t N>
5140
+ HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x,
5141
+ Vec128<float, N> sub) {
5142
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5143
+ return Neg(mul) * x - sub;
5144
+ #else
5145
+ return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
5146
+ #endif
5147
+ }
5148
+ template <size_t N>
5149
+ HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x,
5150
+ Vec128<double, N> sub) {
5151
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5152
+ return Neg(mul) * x - sub;
5153
+ #else
5154
+ return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
5155
+ #endif
5156
+ }
5157
+
5158
+ #if HWY_TARGET <= HWY_SSSE3
5159
+
5160
+ #if HWY_HAVE_FLOAT16
5161
+ template <size_t N, HWY_IF_LANES_GT(N, 1)>
5162
+ HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul,
5163
+ Vec128<float16_t, N> x,
5164
+ Vec128<float16_t, N> sub_or_add) {
5165
+ return Vec128<float16_t, N>{_mm_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
5166
+ }
5167
+ #endif // HWY_HAVE_FLOAT16
5168
+
5169
+ template <size_t N, HWY_IF_LANES_GT(N, 1)>
5170
+ HWY_API Vec128<float, N> MulAddSub(Vec128<float, N> mul, Vec128<float, N> x,
5171
+ Vec128<float, N> sub_or_add) {
5172
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5173
+ return AddSub(mul * x, sub_or_add);
4100
5174
  #else
4101
- return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
5175
+ return Vec128<float, N>{_mm_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
4102
5176
  #endif
4103
5177
  }
4104
- template <size_t N>
4105
- HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x,
4106
- Vec128<double, N> sub) {
4107
- #if HWY_TARGET >= HWY_SSE4
4108
- return Neg(mul) * x - sub;
5178
+
5179
+ HWY_API Vec128<double> MulAddSub(Vec128<double> mul, Vec128<double> x,
5180
+ Vec128<double> sub_or_add) {
5181
+ #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
5182
+ return AddSub(mul * x, sub_or_add);
4109
5183
  #else
4110
- return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
5184
+ return Vec128<double>{_mm_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
4111
5185
  #endif
4112
5186
  }
4113
5187
 
5188
+ #endif // HWY_TARGET <= HWY_SSSE3
5189
+
4114
5190
  // ------------------------------ Floating-point square root
4115
5191
 
4116
5192
  // Full precision square root
@@ -4508,116 +5584,129 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
4508
5584
 
4509
5585
  namespace detail {
4510
5586
 
4511
- template <int kScale, class D, class VI, HWY_IF_UI32_D(D)>
4512
- HWY_INLINE VFromD<D> NativeGather128(D /* tag */,
4513
- const TFromD<D>* HWY_RESTRICT base,
4514
- VI index) {
4515
- return VFromD<D>{_mm_i32gather_epi32(reinterpret_cast<const int32_t*>(base),
4516
- index.raw, kScale)};
5587
+ template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
5588
+ HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
5589
+ Vec128<int32_t, N> indices) {
5590
+ return Vec128<T, N>{_mm_i32gather_epi32(
5591
+ reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
4517
5592
  }
4518
5593
 
4519
- template <int kScale, class D, class VI, HWY_IF_UI64_D(D)>
4520
- HWY_INLINE VFromD<D> NativeGather128(D /* tag */,
4521
- const TFromD<D>* HWY_RESTRICT base,
4522
- VI index) {
4523
- return VFromD<D>{_mm_i64gather_epi64(
4524
- reinterpret_cast<const GatherIndex64*>(base), index.raw, kScale)};
5594
+ template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
5595
+ HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
5596
+ Vec128<int64_t, N> indices) {
5597
+ return Vec128<T, N>{_mm_i64gather_epi64(
5598
+ reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
4525
5599
  }
4526
5600
 
4527
- template <int kScale, class D, class VI, HWY_IF_F32_D(D)>
4528
- HWY_INLINE VFromD<D> NativeGather128(D /* tag */,
4529
- const float* HWY_RESTRICT base, VI index) {
4530
- return VFromD<D>{_mm_i32gather_ps(base, index.raw, kScale)};
5601
+ template <int kScale, size_t N>
5602
+ HWY_INLINE Vec128<float, N> NativeGather128(const float* HWY_RESTRICT base,
5603
+ Vec128<int32_t, N> indices) {
5604
+ return Vec128<float, N>{_mm_i32gather_ps(base, indices.raw, kScale)};
4531
5605
  }
4532
5606
 
4533
- template <int kScale, class D, class VI, HWY_IF_F64_D(D)>
4534
- HWY_INLINE VFromD<D> NativeGather128(D /* tag */,
4535
- const double* HWY_RESTRICT base,
4536
- VI index) {
4537
- return VFromD<D>{_mm_i64gather_pd(base, index.raw, kScale)};
5607
+ template <int kScale, size_t N>
5608
+ HWY_INLINE Vec128<double, N> NativeGather128(const double* HWY_RESTRICT base,
5609
+ Vec128<int64_t, N> indices) {
5610
+ return Vec128<double, N>{_mm_i64gather_pd(base, indices.raw, kScale)};
4538
5611
  }
4539
5612
 
4540
- template <int kScale, class D, class VI, HWY_IF_UI32_D(D)>
4541
- HWY_INLINE VFromD<D> NativeMaskedGather128(MFromD<D> m, D d,
4542
- const TFromD<D>* HWY_RESTRICT base,
4543
- VI index) {
4544
- // For partial vectors, ensure upper mask lanes are zero to prevent faults.
4545
- if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5613
+ template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
5614
+ HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
5615
+ Mask128<T, N> m,
5616
+ const T* HWY_RESTRICT base,
5617
+ Vec128<int32_t, N> indices) {
4546
5618
  #if HWY_TARGET <= HWY_AVX3
4547
- return VFromD<D>{_mm_mmask_i32gather_epi32(
4548
- Zero(d).raw, m.raw, index.raw, reinterpret_cast<const int32_t*>(base),
5619
+ return Vec128<T, N>{_mm_mmask_i32gather_epi32(
5620
+ no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
4549
5621
  kScale)};
4550
5622
  #else
4551
- return VFromD<D>{_mm_mask_i32gather_epi32(
4552
- Zero(d).raw, reinterpret_cast<const int32_t*>(base), index.raw, m.raw,
4553
- kScale)};
5623
+ return Vec128<T, N>{
5624
+ _mm_mask_i32gather_epi32(no.raw, reinterpret_cast<const int32_t*>(base),
5625
+ indices.raw, m.raw, kScale)};
4554
5626
  #endif
4555
5627
  }
4556
5628
 
4557
- template <int kScale, class D, class VI, HWY_IF_UI64_D(D)>
4558
- HWY_INLINE VFromD<D> NativeMaskedGather128(MFromD<D> m, D d,
4559
- const TFromD<D>* HWY_RESTRICT base,
4560
- VI index) {
4561
- // For partial vectors, ensure upper mask lanes are zero to prevent faults.
4562
- if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5629
+ template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
5630
+ HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
5631
+ Mask128<T, N> m,
5632
+ const T* HWY_RESTRICT base,
5633
+ Vec128<int64_t, N> indices) {
4563
5634
  #if HWY_TARGET <= HWY_AVX3
4564
- return VFromD<D>{_mm_mmask_i64gather_epi64(
4565
- Zero(d).raw, m.raw, index.raw,
4566
- reinterpret_cast<const GatherIndex64*>(base), kScale)};
5635
+ return Vec128<T, N>{_mm_mmask_i64gather_epi64(
5636
+ no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
5637
+ kScale)};
4567
5638
  #else
4568
- return VFromD<D>{_mm_mask_i64gather_epi64(
4569
- Zero(d).raw, reinterpret_cast<const GatherIndex64*>(base), index.raw,
4570
- m.raw, kScale)};
5639
+ return Vec128<T, N>{_mm_mask_i64gather_epi64(
5640
+ no.raw, reinterpret_cast<const GatherIndex64*>(base), indices.raw, m.raw,
5641
+ kScale)};
4571
5642
  #endif
4572
5643
  }
4573
5644
 
4574
- template <int kScale, class D, class VI, HWY_IF_F32_D(D)>
4575
- HWY_INLINE VFromD<D> NativeMaskedGather128(MFromD<D> m, D d,
4576
- const float* HWY_RESTRICT base,
4577
- VI index) {
4578
- // For partial vectors, ensure upper mask lanes are zero to prevent faults.
4579
- if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5645
+ template <int kScale, size_t N>
5646
+ HWY_INLINE Vec128<float, N> NativeMaskedGatherOr128(
5647
+ Vec128<float, N> no, Mask128<float, N> m, const float* HWY_RESTRICT base,
5648
+ Vec128<int32_t, N> indices) {
4580
5649
  #if HWY_TARGET <= HWY_AVX3
4581
- return VFromD<D>{
4582
- _mm_mmask_i32gather_ps(Zero(d).raw, m.raw, index.raw, base, kScale)};
5650
+ return Vec128<float, N>{
5651
+ _mm_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
4583
5652
  #else
4584
- return VFromD<D>{
4585
- _mm_mask_i32gather_ps(Zero(d).raw, base, index.raw, m.raw, kScale)};
5653
+ return Vec128<float, N>{
5654
+ _mm_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
4586
5655
  #endif
4587
5656
  }
4588
5657
 
4589
- template <int kScale, class D, class VI, HWY_IF_F64_D(D)>
4590
- HWY_INLINE VFromD<D> NativeMaskedGather128(MFromD<D> m, D d,
4591
- const double* HWY_RESTRICT base,
4592
- VI index) {
4593
- // For partial vectors, ensure upper mask lanes are zero to prevent faults.
4594
- if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5658
+ template <int kScale, size_t N>
5659
+ HWY_INLINE Vec128<double, N> NativeMaskedGatherOr128(
5660
+ Vec128<double, N> no, Mask128<double, N> m, const double* HWY_RESTRICT base,
5661
+ Vec128<int64_t, N> indices) {
4595
5662
  #if HWY_TARGET <= HWY_AVX3
4596
- return VFromD<D>{
4597
- _mm_mmask_i64gather_pd(Zero(d).raw, m.raw, index.raw, base, kScale)};
5663
+ return Vec128<double, N>{
5664
+ _mm_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
4598
5665
  #else
4599
- return VFromD<D>{
4600
- _mm_mask_i64gather_pd(Zero(d).raw, base, index.raw, m.raw, kScale)};
5666
+ return Vec128<double, N>{
5667
+ _mm_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
4601
5668
  #endif
4602
5669
  }
4603
5670
 
4604
5671
  } // namespace detail
4605
5672
 
4606
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI>
4607
- HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base, VI offset) {
4608
- static_assert(sizeof(T) == sizeof(TFromV<VI>), "Index/lane size must match");
4609
- return detail::NativeGather128<1>(d, base, offset);
5673
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
5674
+ HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
5675
+ VFromD<RebindToSigned<D>> offsets) {
5676
+ const RebindToSigned<decltype(d)> di;
5677
+ (void)di; // for HWY_DASSERT
5678
+ HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
5679
+ return detail::NativeGather128<1>(base, offsets);
5680
+ }
5681
+
5682
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
5683
+ HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
5684
+ VFromD<RebindToSigned<D>> indices) {
5685
+ const RebindToSigned<decltype(d)> di;
5686
+ (void)di; // for HWY_DASSERT
5687
+ HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
5688
+ return detail::NativeGather128<sizeof(T)>(base, indices);
4610
5689
  }
4611
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI>
4612
- HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base, VI index) {
4613
- static_assert(sizeof(T) == sizeof(TFromV<VI>), "Index/lane size must match");
4614
- return detail::NativeGather128<sizeof(T)>(d, base, index);
5690
+
5691
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
5692
+ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
5693
+ const T* HWY_RESTRICT base,
5694
+ VFromD<RebindToSigned<D>> indices) {
5695
+ // For partial vectors, ensure upper mask lanes are zero to prevent faults.
5696
+ if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
5697
+
5698
+ const RebindToSigned<decltype(d)> di;
5699
+ (void)di; // for HWY_DASSERT
5700
+ HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
5701
+ return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices);
4615
5702
  }
4616
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>, class VI>
5703
+
5704
+ // Generic for all vector lengths.
5705
+ template <class D>
4617
5706
  HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
4618
- const T* HWY_RESTRICT base, VI index) {
4619
- static_assert(sizeof(T) == sizeof(TFromV<VI>), "Index/lane size must match");
4620
- return detail::NativeMaskedGather128<sizeof(T)>(m, d, base, index);
5707
+ const TFromD<D>* HWY_RESTRICT base,
5708
+ VFromD<RebindToSigned<D>> indices) {
5709
+ return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
4621
5710
  }
4622
5711
 
4623
5712
  #endif // HWY_TARGET <= HWY_AVX2
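GatherIndex loads base[indices[i]] into lane i, with indices given as same-width signed integers that, per the HWY_DASSERTs above, must be non-negative (GatherOffset takes byte offsets instead). A minimal static-dispatch usage sketch, assuming the caller's table holds at least 2 * Lanes(d) floats (function and parameter names are illustrative):

  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // Gathers table[0], table[2], table[4], ... into one vector.
  HWY_ATTR hn::VFromD<hn::ScalableTag<float>> GatherEveryOther(const float* table) {
    const hn::ScalableTag<float> d;
    const hn::RebindToSigned<decltype(d)> di;
    const auto indices = hn::ShiftLeft<1>(hn::Iota(di, 0));  // 0, 2, 4, ...
    return hn::GatherIndex(d, table, indices);
  }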
@@ -4740,9 +5829,7 @@ HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
4740
5829
  const RebindToUnsigned<decltype(d)> du;
4741
5830
  const uint16_t lane = static_cast<uint16_t>(
4742
5831
  _mm_extract_epi16(BitCast(du, v).raw, kLane) & 0xFFFF);
4743
- T ret;
4744
- CopySameSize(&lane, &ret); // for float16_t
4745
- return ret;
5832
+ return BitCastScalar<T>(lane);
4746
5833
  }
4747
5834
 
4748
5835
  template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)>
@@ -4780,9 +5867,7 @@ HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
4780
5867
  #else
4781
5868
  // Bug in the intrinsic, returns int but should be float.
4782
5869
  const int32_t bits = _mm_extract_ps(v.raw, kLane);
4783
- float ret;
4784
- CopySameSize(&bits, &ret);
4785
- return ret;
5870
+ return BitCastScalar<float>(bits);
4786
5871
  #endif
4787
5872
  }
4788
5873
 
@@ -4958,8 +6043,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
4958
6043
  static_assert(kLane < N, "Lane index out of bounds");
4959
6044
  const DFromV<decltype(v)> d;
4960
6045
  const RebindToUnsigned<decltype(d)> du;
4961
- uint16_t bits;
4962
- CopySameSize(&t, &bits); // for float16_t
6046
+ const uint16_t bits = BitCastScalar<uint16_t>(t);
4963
6047
  return BitCast(d, VFromD<decltype(du)>{
4964
6048
  _mm_insert_epi16(BitCast(du, v).raw, bits, kLane)});
4965
6049
  }
@@ -4970,8 +6054,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
4970
6054
  #if HWY_TARGET >= HWY_SSSE3
4971
6055
  return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
4972
6056
  #else
4973
- MakeSigned<T> ti;
4974
- CopySameSize(&t, &ti); // don't just cast because T might be float.
6057
+ const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
4975
6058
  return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
4976
6059
  #endif
4977
6060
  }
@@ -4990,8 +6073,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
4990
6073
  return BitCast(
4991
6074
  d, Vec128<double, N>{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)});
4992
6075
  #else
4993
- MakeSigned<T> ti;
4994
- CopySameSize(&t, &ti); // don't just cast because T might be float.
6076
+ const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
4995
6077
  return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
4996
6078
  #endif
4997
6079
  }
@@ -5527,9 +6609,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
5527
6609
  return BitCast(d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))});
5528
6610
  #else
5529
6611
  const RebindToSigned<decltype(d)> di;
5530
- alignas(16) static constexpr int16_t kShuffle[8] = {
5531
- 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100};
5532
- return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle)));
6612
+ const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
6613
+ di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
6614
+ return BitCast(d, TableLookupBytes(v, shuffle));
5533
6615
  #endif
5534
6616
  }
5535
6617
 
@@ -5578,9 +6660,9 @@ HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
5578
6660
  return BitCast(d, VU{shuf_result});
5579
6661
  #else
5580
6662
  const RebindToSigned<decltype(d)> di;
5581
- alignas(16) static constexpr int16_t kShuffle[8] = {
5582
- 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C};
5583
- return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle)));
6663
+ const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
6664
+ di, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C);
6665
+ return BitCast(d, TableLookupBytes(v, shuffle));
5584
6666
  #endif
5585
6667
  }
5586
6668
 
@@ -5615,9 +6697,9 @@ HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
5615
6697
  _MM_SHUFFLE(0, 1, 2, 3))});
5616
6698
  #else
5617
6699
  const RebindToSigned<decltype(d)> di;
5618
- alignas(16) static constexpr int16_t kShuffle[8] = {
5619
- 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908};
5620
- return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle)));
6700
+ const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
6701
+ di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
6702
+ return BitCast(d, TableLookupBytes(v, shuffle));
5621
6703
  #endif
5622
6704
  }
5623
6705
 
@@ -5641,9 +6723,9 @@ HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
5641
6723
  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
5642
6724
  #else
5643
6725
  const RebindToSigned<decltype(d)> di;
5644
- alignas(16) static constexpr int16_t kShuffle[8] = {
5645
- 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100};
5646
- return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle)));
6726
+ const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
6727
+ di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
6728
+ return BitCast(d, TableLookupBytes(v, shuffle));
5647
6729
  #endif
5648
6730
  }
5649
6731
 
@@ -5758,7 +6840,11 @@ template <size_t kIdx3210, class V>
5758
6840
  HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
5759
6841
  hwy::SizeTag<2> /*lane_size_tag*/,
5760
6842
  hwy::SizeTag<8> /*vect_size_tag*/, V v) {
5761
- return V{_mm_shufflelo_epi16(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
6843
+ const DFromV<decltype(v)> d;
6844
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
6845
+ return BitCast(d,
6846
+ VFromD<decltype(du)>{_mm_shufflelo_epi16(
6847
+ BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))});
5762
6848
  }
5763
6849
 
5764
6850
  #if HWY_TARGET == HWY_SSE2
@@ -5766,8 +6852,12 @@ template <size_t kIdx3210, class V>
5766
6852
  HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
5767
6853
  hwy::SizeTag<2> /*lane_size_tag*/,
5768
6854
  hwy::SizeTag<16> /*vect_size_tag*/, V v) {
6855
+ const DFromV<decltype(v)> d;
6856
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
5769
6857
  constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
5770
- return V{_mm_shufflehi_epi16(_mm_shufflelo_epi16(v.raw, kShuffle), kShuffle)};
6858
+ return BitCast(
6859
+ d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
6860
+ _mm_shufflelo_epi16(BitCast(du, v).raw, kShuffle), kShuffle)});
5771
6861
  }
5772
6862
 
5773
6863
  template <size_t kIdx3210, size_t kVectSize, class V,
@@ -6173,7 +7263,7 @@ template <class D, HWY_IF_T_SIZE_ONE_OF_D(
6173
7263
  (1 << 4) | (1 << 8))>
6174
7264
  HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
6175
7265
  size_t max_lanes_to_store) {
6176
- const size_t num_of_lanes_to_store =
7266
+ const size_t num_lanes_to_store =
6177
7267
  HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));
6178
7268
 
6179
7269
  #if HWY_COMPILER_MSVC
@@ -6181,12 +7271,14 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
6181
7271
  HWY_FENCE;
6182
7272
  #endif
6183
7273
 
6184
- BlendedStore(v, FirstN(d, num_of_lanes_to_store), d, p);
7274
+ BlendedStore(v, FirstN(d, num_lanes_to_store), d, p);
6185
7275
 
6186
7276
  #if HWY_COMPILER_MSVC
6187
7277
  // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
6188
7278
  HWY_FENCE;
6189
7279
  #endif
7280
+
7281
+ detail::MaybeUnpoison(p, num_lanes_to_store);
6190
7282
  }
6191
7283
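StoreN writes the first HWY_MIN(max_lanes_to_store, Lanes(d)) lanes of v to p and must not touch any byte past that, which is why the partial paths above go through BlendedStore instead of a full Store. A minimal static-dispatch usage sketch (illustrative; `out` is assumed to hold at least `count` elements):

  #include <cstddef>
  #include <cstdint>
  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // Writes min(count, Lanes(d)) lanes of 1, 2, 3, ...; never writes past out[count - 1].
  HWY_ATTR void StoreFirstLanes(uint16_t* out, size_t count) {
    const hn::ScalableTag<uint16_t> d;
    hn::StoreN(hn::Iota(d, 1), d, out, count);
  }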
 
6192
7284
  #if HWY_TARGET > HWY_AVX3
@@ -6214,36 +7306,35 @@ namespace detail {
6214
7306
  template <class D, HWY_IF_T_SIZE_D(D, 1)>
6215
7307
  HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
6216
7308
  TFromD<D>* HWY_RESTRICT p,
6217
- size_t num_of_lanes_to_store) {
7309
+ size_t num_lanes_to_store) {
6218
7310
  // AVX2UIF8Or16StoreTrailingN should only be called for an I8/U8 vector if
6219
- // (num_of_lanes_to_store & 3) != 0 is true
7311
+ // (num_lanes_to_store & 3) != 0 is true
6220
7312
  const auto v_full128 = ResizeBitCast(Full128<TFromD<D>>(), v_trailing);
6221
- if ((num_of_lanes_to_store & 2) != 0) {
7313
+ if ((num_lanes_to_store & 2) != 0) {
6222
7314
  const uint16_t u16_bits = GetLane(BitCast(Full128<uint16_t>(), v_full128));
6223
- p[num_of_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128);
7315
+ p[num_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128);
6224
7316
  CopyBytes<sizeof(uint16_t)>(&u16_bits,
6225
- p + (num_of_lanes_to_store & ~size_t{3}));
7317
+ p + (num_lanes_to_store & ~size_t{3}));
6226
7318
  } else {
6227
- p[num_of_lanes_to_store - 1] = GetLane(v_full128);
7319
+ p[num_lanes_to_store - 1] = GetLane(v_full128);
6228
7320
  }
6229
7321
  }
6230
7322
 
6231
7323
  template <class D, HWY_IF_T_SIZE_D(D, 2)>
6232
7324
  HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
6233
- TFromD<D>* HWY_RESTRICT p,
6234
- size_t num_of_lanes_to_store) {
7325
+ TFromD<D>* p,
7326
+ size_t num_lanes_to_store) {
6235
7327
  // AVX2UIF8Or16StoreTrailingN should only be called for an I16/U16/F16/BF16
6236
- // vector if (num_of_lanes_to_store & 1) == 1 is true
6237
- p[num_of_lanes_to_store - 1] = GetLane(v_trailing);
7328
+ // vector if (num_lanes_to_store & 1) == 1 is true
7329
+ p[num_lanes_to_store - 1] = GetLane(v_trailing);
6238
7330
  }
6239
7331
 
6240
7332
  } // namespace detail
6241
7333
 
6242
7334
  template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
6243
7335
  HWY_IF_LANES_GT_D(D, 2)>
6244
- HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
6245
- size_t max_lanes_to_store) {
6246
- const size_t num_of_lanes_to_store =
7336
+ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* p, size_t max_lanes_to_store) {
7337
+ const size_t num_lanes_to_store =
6247
7338
  HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));
6248
7339
 
6249
7340
  const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
@@ -6252,7 +7343,7 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
6252
7343
  const Repartition<int32_t, decltype(d_full)> di32_full;
6253
7344
 
6254
7345
  const auto i32_store_mask = BitCast(
6255
- di32_full, VecFromMask(du_full, FirstN(du_full, num_of_lanes_to_store)));
7346
+ di32_full, VecFromMask(du_full, FirstN(du_full, num_lanes_to_store)));
6256
7347
  const auto vi32 = ResizeBitCast(di32_full, v);
6257
7348
 
6258
7349
  #if HWY_COMPILER_MSVC
@@ -6265,19 +7356,21 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
6265
7356
 
6266
7357
  constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD<D>);
6267
7358
  constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1;
6268
- const size_t trailing_n = (num_of_lanes_to_store & kTrailingLenMask);
7359
+ const size_t trailing_n = (num_lanes_to_store & kTrailingLenMask);
6269
7360
 
6270
7361
  if (trailing_n != 0) {
6271
- const auto v_trailing = ResizeBitCast(
7362
+ const VFromD<D> v_trailing = ResizeBitCast(
6272
7363
  d, SlideDownLanes(di32_full, vi32,
6273
- num_of_lanes_to_store / kNumOfLanesPerI32));
6274
- detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_of_lanes_to_store);
7364
+ num_lanes_to_store / kNumOfLanesPerI32));
7365
+ detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_lanes_to_store);
6275
7366
  }
6276
7367
 
6277
7368
  #if HWY_COMPILER_MSVC
6278
7369
  // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
6279
7370
  HWY_FENCE;
6280
7371
  #endif
7372
+
7373
+ detail::MaybeUnpoison(p, num_lanes_to_store);
6281
7374
  }
6282
7375
  #endif // HWY_TARGET > HWY_AVX3
6283
7376
  #endif // HWY_TARGET <= HWY_AVX2
@@ -6300,19 +7393,36 @@ HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
6300
7393
 
6301
7394
  // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
6302
7395
 
6303
- template <class D, HWY_IF_V_SIZE_D(D, 16)>
7396
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
6304
7397
  HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
6305
7398
  const RebindToUnsigned<decltype(d)> du;
6306
7399
  const Half<decltype(du)> duh;
6307
7400
  return BitCast(d, VFromD<decltype(du)>{_mm_move_epi64(BitCast(duh, lo).raw)});
6308
7401
  }
6309
7402
 
6310
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
7403
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
6311
7404
  HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
6312
7405
  const Half<D> dh;
6313
7406
  return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
6314
7407
  }
6315
7408
 
7409
+ #if HWY_HAVE_FLOAT16
7410
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
7411
+ HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
7412
+ const RebindToUnsigned<decltype(d)> du;
7413
+ const Half<decltype(du)> duh;
7414
+ return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
7415
+ }
7416
+ #endif
7417
+
7418
+ // Generic for all vector lengths.
7419
+ template <class D, HWY_X86_IF_EMULATED_D(D)>
7420
+ HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
7421
+ const RebindToUnsigned<decltype(d)> du;
7422
+ const Half<decltype(du)> duh;
7423
+ return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
7424
+ }
7425
+
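ZeroExtendVector widens a half-width vector by placing it in the lower half and zeroing the upper half; the float16/bfloat16 overloads above simply route through the unsigned type because _mm_move_epi64 has no special-float form. A minimal static-dispatch usage sketch (illustrative):

  #include "hwy/highway.h"

  namespace hn = hwy::HWY_NAMESPACE;

  // Lower half = 1, 2, ... from the half-width vector; upper half = zero.
  HWY_ATTR hn::VFromD<hn::ScalableTag<float>> WidenWithZeros() {
    const hn::ScalableTag<float> d;
    const hn::Half<decltype(d)> dh;
    return hn::ZeroExtendVector(d, hn::Iota(dh, 1.0f));
  }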
6316
7426
  // ------------------------------ Concat full (InterleaveLower)
6317
7427
 
6318
7428
  // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
@@ -6459,10 +7569,11 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
6459
7569
  HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
6460
7570
  // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
6461
7571
  // 0xFFFF8000, which correctly saturates to 0x8000.
7572
+ const RebindToUnsigned<decltype(d)> du;
6462
7573
  const Repartition<int32_t, decltype(d)> dw;
6463
7574
  const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
6464
7575
  const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
6465
- return VFromD<D>{_mm_packs_epi32(uL.raw, uH.raw)};
7576
+ return BitCast(d, VFromD<decltype(du)>{_mm_packs_epi32(uL.raw, uH.raw)});
6466
7577
  }
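The change above routes the _mm_packs_epi32 result through the unsigned lane type so the final BitCast also works for float16_t descriptors. ConcatOdd gathers the odd-indexed lanes of lo into the lower half of the result and the odd-indexed lanes of hi into the upper half; a scalar sketch with a hypothetical helper name:

#include <cstddef>

// Scalar reference for ConcatOdd(d, hi, lo).
template <typename T>
void ConcatOddRef(const T* hi, const T* lo, size_t lanes, T* out) {
  for (size_t i = 0; i < lanes / 2; ++i) {
    out[i] = lo[2 * i + 1];              // lower half <- odd lanes of lo
    out[lanes / 2 + i] = hi[2 * i + 1];  // upper half <- odd lanes of hi
  }
}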
6467
7578
 
6468
7579
  // 16-bit x4
@@ -6565,11 +7676,12 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
6565
7676
  HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
6566
7677
  #if HWY_TARGET <= HWY_SSE4
6567
7678
  // Isolate lower 16 bits per u32 so we can pack.
7679
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
6568
7680
  const Repartition<uint32_t, decltype(d)> dw;
6569
7681
  const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
6570
7682
  const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
6571
7683
  const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
6572
- return VFromD<D>{_mm_packus_epi32(uL.raw, uH.raw)};
7684
+ return BitCast(d, VFromD<decltype(du)>{_mm_packus_epi32(uL.raw, uH.raw)});
6573
7685
  #elif HWY_TARGET == HWY_SSE2
6574
7686
  const Repartition<uint32_t, decltype(d)> dw;
6575
7687
  return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))),
@@ -6642,9 +7754,9 @@ HWY_API V DupEven(V v) {
6642
7754
 
6643
7755
  #if HWY_TARGET <= HWY_SSSE3
6644
7756
  const RebindToUnsigned<decltype(d)> du;
6645
- alignas(16) static constexpr uint8_t kShuffle[16] = {
6646
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
6647
- return TableLookupBytes(v, BitCast(d, LoadDup128(du, kShuffle)));
7757
+ const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
7758
+ du, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
7759
+ return TableLookupBytes(v, BitCast(d, shuffle));
6648
7760
  #else
6649
7761
  const Repartition<uint16_t, decltype(d)> du16;
6650
7762
  return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0xFF00})),
@@ -6656,8 +7768,8 @@ template <typename T, HWY_IF_T_SIZE(T, 2)>
6656
7768
  HWY_API Vec64<T> DupEven(const Vec64<T> v) {
6657
7769
  const DFromV<decltype(v)> d;
6658
7770
  const RebindToUnsigned<decltype(d)> du; // for float16_t
6659
- return BitCast(d, VFromD<decltype(du)>{
6660
- _mm_shufflelo_epi16(v.raw, _MM_SHUFFLE(2, 2, 0, 0))});
7771
+ return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16(
7772
+ BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0))});
6661
7773
  }
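DupEven copies each even-indexed lane into the following odd lane; the edits above only add the missing BitCast through the unsigned type (for float16_t) and build the SSSE3 shuffle with Dup128VecFromValues instead of a static table. Scalar sketch, hypothetical helper name:

#include <cstddef>

// Scalar reference for DupEven: out[2i] = out[2i + 1] = v[2i].
template <typename T>
void DupEvenRef(const T* v, size_t lanes, T* out) {
  for (size_t i = 0; i + 1 < lanes; i += 2) {
    out[i] = v[i];
    out[i + 1] = v[i];
  }
}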
6662
7774
 
6663
7775
  // Generic for all vector lengths.
@@ -6666,9 +7778,9 @@ HWY_API V DupEven(const V v) {
6666
7778
  const DFromV<decltype(v)> d;
6667
7779
  const RebindToUnsigned<decltype(d)> du; // for float16_t
6668
7780
  #if HWY_TARGET <= HWY_SSSE3
6669
- alignas(16) static constexpr uint16_t kShuffle[8] = {
6670
- 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c};
6671
- return TableLookupBytes(v, BitCast(d, LoadDup128(du, kShuffle)));
7781
+ const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
7782
+ du, 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c);
7783
+ return TableLookupBytes(v, BitCast(d, shuffle));
6672
7784
  #else
6673
7785
  return BitCast(
6674
7786
  d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
@@ -6699,9 +7811,9 @@ HWY_API V DupOdd(V v) {
6699
7811
 
6700
7812
  #if HWY_TARGET <= HWY_SSSE3
6701
7813
  const RebindToUnsigned<decltype(d)> du;
6702
- alignas(16) static constexpr uint8_t kShuffle[16] = {
6703
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
6704
- return TableLookupBytes(v, BitCast(d, LoadDup128(du, kShuffle)));
7814
+ const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
7815
+ du, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
7816
+ return TableLookupBytes(v, BitCast(d, shuffle));
6705
7817
  #else
6706
7818
  const Repartition<uint16_t, decltype(d)> du16;
6707
7819
  return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0x00FF})),
@@ -6723,9 +7835,9 @@ HWY_API V DupOdd(V v) {
6723
7835
  const DFromV<decltype(v)> d;
6724
7836
  const RebindToUnsigned<decltype(d)> du; // for float16_t
6725
7837
  #if HWY_TARGET <= HWY_SSSE3
6726
- alignas(16) static constexpr uint16_t kShuffle[8] = {
6727
- 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e};
6728
- return TableLookupBytes(v, BitCast(d, LoadDup128(du, kShuffle)));
7838
+ const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
7839
+ du, 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e);
7840
+ return TableLookupBytes(v, BitCast(d, shuffle));
6729
7841
  #else
6730
7842
  return BitCast(
6731
7843
  d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
@@ -6952,14 +8064,16 @@ HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
6952
8064
 
6953
8065
  template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
6954
8066
  HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
6955
- #if HWY_TARGET >= HWY_SSSE3
6956
8067
  const DFromV<decltype(a)> d;
8068
+ #if HWY_TARGET >= HWY_SSSE3
6957
8069
  const Repartition<uint8_t, decltype(d)> d8;
6958
8070
  alignas(16) static constexpr uint8_t mask[16] = {
6959
8071
  0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
6960
8072
  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
6961
8073
  #else
6962
- return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
8074
+ const RebindToUnsigned<decltype(d)> du; // for float16_t
8075
+ return BitCast(d, VFromD<decltype(du)>{_mm_blend_epi16(
8076
+ BitCast(du, a).raw, BitCast(du, b).raw, 0x55)});
6963
8077
  #endif
6964
8078
  }
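OddEven(a, b) keeps the odd-indexed lanes of a and the even-indexed lanes of b; the blend immediate 0x55 selects the even 16-bit lanes from b, and the new BitCasts make the SSE4 path valid for float16_t as well. Scalar sketch, hypothetical helper name:

#include <cstddef>

// Scalar reference for OddEven(a, b): odd lanes from a, even lanes from b.
template <typename T>
void OddEvenRef(const T* a, const T* b, size_t lanes, T* out) {
  for (size_t i = 0; i < lanes; ++i) out[i] = (i & 1) ? a[i] : b[i];
}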
6965
8079
 
@@ -7941,11 +9055,31 @@ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) {
7941
9055
  #endif
7942
9056
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
7943
9057
  HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
9058
+ #if HWY_HAVE_FLOAT16
9059
+ const RebindToUnsigned<DFromV<decltype(v)>> du16;
9060
+ return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)};
9061
+ #else
7944
9062
  return VFromD<D>{_mm_cvtph_ps(v.raw)};
9063
+ #endif
7945
9064
  }
7946
9065
 
7947
9066
  #endif // HWY_NATIVE_F16C
7948
9067
 
9068
+ #if HWY_HAVE_FLOAT16
9069
+
9070
+ #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
9071
+ #undef HWY_NATIVE_PROMOTE_F16_TO_F64
9072
+ #else
9073
+ #define HWY_NATIVE_PROMOTE_F16_TO_F64
9074
+ #endif
9075
+
9076
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
9077
+ HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
9078
+ return VFromD<D>{_mm_cvtph_pd(v.raw)};
9079
+ }
9080
+
9081
+ #endif // HWY_HAVE_FLOAT16
9082
+
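The new HWY_NATIVE_PROMOTE_F16_TO_F64 overload above converts float16 lanes straight to double with _mm_cvtph_pd. Every binary16 value is exactly representable in binary64, so the same result can also be composed from the existing f16-to-f32 and f32-to-f64 promotions; a hedged sketch assuming the usual `namespace hn = hwy::HWY_NAMESPACE;` static-dispatch setup and a hypothetical helper name (this is not the diff's code path):

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Hypothetical fallback: promote f16 -> f64 through an f32 intermediate.
template <class D64>
hn::VFromD<D64> PromoteF16ToF64ViaF32(
    D64 df64, hn::VFromD<hn::Rebind<hwy::float16_t, D64>> v) {
  const hn::Rebind<float, D64> df32;
  return hn::PromoteTo(df64, hn::PromoteTo(df32, v));
}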
7949
9083
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
7950
9084
  HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
7951
9085
  const Rebind<uint16_t, decltype(df32)> du16;
@@ -7980,6 +9114,42 @@ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
7980
9114
  }
7981
9115
  #endif
7982
9116
 
9117
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
9118
+
9119
+ #if HWY_TARGET > HWY_AVX3
9120
+ namespace detail {
9121
+
9122
+ // I32->I64 PromoteEvenTo/PromoteOddTo
9123
+
9124
+ template <class D, HWY_IF_LANES_D(D, 1)>
9125
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
9126
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
9127
+ hwy::SignedTag /*from_type_tag*/, D d_to,
9128
+ Vec64<int32_t> v) {
9129
+ return PromoteLowerTo(d_to, v);
9130
+ }
9131
+
9132
+ template <class D, HWY_IF_LANES_D(D, 2)>
9133
+ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
9134
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
9135
+ hwy::SignedTag /*from_type_tag*/, D d_to,
9136
+ Vec128<int32_t> v) {
9137
+ const Repartition<int32_t, D> d_from;
9138
+ return PromoteLowerTo(d_to, ConcatEven(d_from, v, v));
9139
+ }
9140
+
9141
+ template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
9142
+ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
9143
+ hwy::SizeTag<8> /*to_lane_size_tag*/,
9144
+ hwy::SignedTag /*from_type_tag*/, D d_to,
9145
+ V v) {
9146
+ const Repartition<int32_t, D> d_from;
9147
+ return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v));
9148
+ }
9149
+
9150
+ } // namespace detail
9151
+ #endif
9152
+
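On targets below AVX3, the detail overloads above implement I32 to I64 PromoteEvenTo/PromoteOddTo by first gathering the even (or odd) lanes with ConcatEven/ConcatOdd and then widening the lower half via PromoteLowerTo. The lane-level meaning, as a scalar sketch with a hypothetical helper name:

#include <cstddef>
#include <cstdint>

// Scalar reference for PromoteEvenTo: widen the even-indexed int32 lanes.
void PromoteEvenToRef(const int32_t* v, size_t lanes64, int64_t* out) {
  for (size_t i = 0; i < lanes64; ++i) {
    out[i] = static_cast<int64_t>(v[2 * i]);
  }
}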
7983
9153
  // ------------------------------ Demotions (full -> part w/ narrow lanes)
7984
9154
 
7985
9155
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
@@ -8143,14 +9313,31 @@ HWY_DIAGNOSTICS(push)
8143
9313
  HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain")
8144
9314
 
8145
9315
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
8146
- HWY_API VFromD<D> DemoteTo(D /*tag*/, VFromD<Rebind<float, D>> v) {
8147
- return VFromD<D>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
9316
+ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
9317
+ const RebindToUnsigned<decltype(df16)> du16;
9318
+ return BitCast(
9319
+ df16, VFromD<decltype(du16)>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
8148
9320
  }
8149
9321
 
8150
9322
  HWY_DIAGNOSTICS(pop)
8151
9323
 
8152
9324
  #endif // F16C
8153
9325
 
9326
+ #if HWY_HAVE_FLOAT16
9327
+
9328
+ #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
9329
+ #undef HWY_NATIVE_DEMOTE_F64_TO_F16
9330
+ #else
9331
+ #define HWY_NATIVE_DEMOTE_F64_TO_F16
9332
+ #endif
9333
+
9334
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
9335
+ HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) {
9336
+ return VFromD<D>{_mm_cvtpd_ph(v.raw)};
9337
+ }
9338
+
9339
+ #endif // HWY_HAVE_FLOAT16
9340
+
8154
9341
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
8155
9342
  HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
8156
9343
  // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
@@ -8389,7 +9576,7 @@ HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
8389
9576
  #if HWY_TARGET <= HWY_AVX3
8390
9577
  (void)du32;
8391
9578
  return VFromD<D>{
8392
- _mm_maskz_cvttpd_epu32(_knot_mask8(MaskFromVec(v).raw), v.raw)};
9579
+ _mm_maskz_cvttpd_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
8393
9580
  #else // AVX2 or earlier
8394
9581
  const Rebind<double, decltype(du32)> df64;
8395
9582
  const RebindToUnsigned<decltype(df64)> du64;
@@ -8512,7 +9699,7 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
8512
9699
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
8513
9700
  HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
8514
9701
  return VFromD<D>{
8515
- _mm_maskz_cvttps_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
9702
+ _mm_maskz_cvttps_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
8516
9703
  }
8517
9704
  #else // AVX2 or below
8518
9705
 
@@ -8747,32 +9934,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
8747
9934
 
8748
9935
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
8749
9936
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
8750
- const auto neg_mask = MaskFromVec(v);
8751
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
8752
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
8753
- #else
8754
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
8755
- #endif
9937
+ const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
8756
9938
  return VFromD<D>{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
8757
9939
  }
8758
9940
  template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
8759
9941
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
8760
- const auto neg_mask = MaskFromVec(v);
8761
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
8762
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
8763
- #else
8764
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
8765
- #endif
9942
+ const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
8766
9943
  return VFromD<D>{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
8767
9944
  }
8768
9945
  template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
8769
9946
  HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
8770
- const auto neg_mask = MaskFromVec(v);
8771
- #if HWY_COMPILER_HAS_MASK_INTRINSICS
8772
- const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
8773
- #else
8774
- const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
8775
- #endif
9947
+ const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
8776
9948
  return VFromD<D>{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
8777
9949
  }
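The three demotions above share one trick: MaskFromVec(v) is true exactly for lanes whose sign bit is set, so its complement (now obtained via detail::UnmaskedNot rather than _knot_mask8) zero-masks the negative lanes before the unsigned saturating conversion. Scalar reference for the i64 to u32 case; the u16 and u8 variants are analogous (hypothetical helper name):

#include <cstdint>

// Scalar reference: negatives clamp to 0, values above 0xFFFFFFFF saturate,
// everything else converts exactly.
uint32_t DemoteI64ToU32Ref(int64_t x) {
  if (x < 0) return 0u;
  if (x > int64_t{0xFFFFFFFF}) return 0xFFFFFFFFu;
  return static_cast<uint32_t>(x);
}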
8778
9950
 
@@ -9030,6 +10202,11 @@ HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
9030
10202
  return detail::FixConversionOverflow(
9031
10203
  di, v, VFromD<RebindToSigned<D>>{_mm_cvttph_epi16(v.raw)});
9032
10204
  }
10205
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
10206
+ HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
10207
+ return VFromD<D>{
10208
+ _mm_maskz_cvttph_epu16(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
10209
+ }
9033
10210
  #endif // HWY_HAVE_FLOAT16
9034
10211
 
9035
10212
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
@@ -9048,13 +10225,13 @@ HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) {
9048
10225
  template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
9049
10226
  HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
9050
10227
  return VFromD<DU>{
9051
- _mm_maskz_cvttps_epu32(_knot_mask8(MaskFromVec(v).raw), v.raw)};
10228
+ _mm_maskz_cvttps_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
9052
10229
  }
9053
10230
 
9054
10231
  template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
9055
10232
  HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
9056
10233
  return VFromD<DU>{
9057
- _mm_maskz_cvttpd_epu64(_knot_mask8(MaskFromVec(v).raw), v.raw)};
10234
+ _mm_maskz_cvttpd_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
9058
10235
  }
9059
10236
 
9060
10237
  #else // AVX2 or below
@@ -9445,6 +10622,13 @@ HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
9445
10622
 
9446
10623
  #if HWY_TARGET <= HWY_AVX3
9447
10624
 
10625
+ // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
10626
+ #ifdef HWY_NATIVE_ISINF
10627
+ #undef HWY_NATIVE_ISINF
10628
+ #else
10629
+ #define HWY_NATIVE_ISINF
10630
+ #endif
10631
+
9448
10632
  template <size_t N>
9449
10633
  HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) {
9450
10634
  return Mask128<float, N>{_mm_fpclass_ps_mask(
@@ -9472,35 +10656,6 @@ HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
9472
10656
  HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
9473
10657
  }
9474
10658
 
9475
- #else
9476
-
9477
- template <typename T, size_t N>
9478
- HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
9479
- static_assert(IsFloat<T>(), "Only for float");
9480
- const DFromV<decltype(v)> d;
9481
- const RebindToSigned<decltype(d)> di;
9482
- const VFromD<decltype(di)> vi = BitCast(di, v);
9483
- // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
9484
- return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
9485
- }
9486
-
9487
- // Returns whether normal/subnormal/zero.
9488
- template <typename T, size_t N>
9489
- HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
9490
- static_assert(IsFloat<T>(), "Only for float");
9491
- const DFromV<decltype(v)> d;
9492
- const RebindToUnsigned<decltype(d)> du;
9493
- const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
9494
- const VFromD<decltype(du)> vu = BitCast(du, v);
9495
- // Shift left to clear the sign bit, then right so we can compare with the
9496
- // max exponent (cannot compare with MaxExponentTimes2 directly because it is
9497
- // negative and non-negative floats would be greater). MSVC seems to generate
9498
- // incorrect code if we instead add vu + vu.
9499
- const VFromD<decltype(di)> exp =
9500
- BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
9501
- return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
9502
- }
9503
-
9504
10659
  #endif // HWY_TARGET <= HWY_AVX3
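The new HWY_NATIVE_ISINF flag tells generic_ops-inl.h not to define IsInf/IsFinite on AVX3, where _mm_fpclass_ps_mask/_mm_fpclass_pd_mask classify directly; the removed non-AVX3 fallback above relied on the standard bit-level test that a float is infinite iff its exponent field is all ones and its mantissa is zero. Scalar sketch of that test for binary32 (hypothetical helper name):

#include <cstdint>
#include <cstring>

// Scalar reference: binary32 is infinite iff exponent == 0xFF and mantissa == 0.
bool IsInfRef(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return (bits & 0x7FFFFFFFu) == 0x7F800000u;
}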
9505
10660
 
9506
10661
  // ================================================== CRYPTO
@@ -9586,10 +10741,9 @@ HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
9586
10741
  1, 1, 1, 1, 1, 1, 1, 1};
9587
10742
  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
9588
10743
  #endif
9589
-
9590
- alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
9591
- 1, 2, 4, 8, 16, 32, 64, 128};
9592
- return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
10744
+ const VFromD<decltype(du)> bit = Dup128VecFromValues(
10745
+ du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
10746
+ return RebindMask(d, TestBit(rep8, bit));
9593
10747
  }
9594
10748
 
9595
10749
  template <class D, HWY_IF_T_SIZE_D(D, 2)>
@@ -9644,6 +10798,20 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
9644
10798
  #endif
9645
10799
  }
9646
10800
 
10801
+ // ------------------------------ Dup128MaskFromMaskBits
10802
+
10803
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
10804
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
10805
+ constexpr size_t kN = MaxLanes(d);
10806
+ if (kN < 8) mask_bits &= (1u << kN) - 1;
10807
+
10808
+ #if HWY_TARGET <= HWY_AVX3
10809
+ return MFromD<D>::FromBits(mask_bits);
10810
+ #else
10811
+ return detail::LoadMaskBits128(d, mask_bits);
10812
+ #endif
10813
+ }
10814
+
9647
10815
  template <typename T>
9648
10816
  struct CompressIsPartition {
9649
10817
  #if HWY_TARGET <= HWY_AVX3
@@ -10779,243 +11947,74 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
10779
11947
 
10780
11948
  // ------------------------------ Reductions
10781
11949
 
10782
- namespace detail {
11950
+ // Nothing fully native, generic_ops-inl defines SumOfLanes and ReduceSum.
10783
11951
 
10784
- // N=1: no-op
10785
- template <typename T>
10786
- HWY_INLINE Vec128<T, 1> SumOfLanes(Vec128<T, 1> v) {
10787
- return v;
10788
- }
10789
- template <typename T>
10790
- HWY_INLINE Vec128<T, 1> MinOfLanes(Vec128<T, 1> v) {
10791
- return v;
10792
- }
10793
- template <typename T>
10794
- HWY_INLINE Vec128<T, 1> MaxOfLanes(Vec128<T, 1> v) {
10795
- return v;
10796
- }
11952
+ // We provide specializations of u8x8 and u8x16, so exclude those.
11953
+ #undef HWY_IF_SUM_OF_LANES_D
11954
+ #define HWY_IF_SUM_OF_LANES_D(D) \
11955
+ HWY_IF_LANES_GT_D(D, 1), \
11956
+ hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() || \
11957
+ (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \
11958
+ nullptr
10797
11959
 
10798
- // N=2
10799
- template <typename T>
10800
- HWY_INLINE Vec128<T, 2> SumOfLanes(Vec128<T, 2> v10) {
10801
- const DFromV<decltype(v10)> d;
10802
- return Add(v10, Reverse2(d, v10));
11960
+ template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)>
11961
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
11962
+ return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF));
10803
11963
  }
10804
- template <typename T>
10805
- HWY_INLINE Vec128<T, 2> MinOfLanes(Vec128<T, 2> v10) {
10806
- const DFromV<decltype(v10)> d;
10807
- return Min(v10, Reverse2(d, v10));
10808
- }
10809
- template <typename T>
10810
- HWY_INLINE Vec128<T, 2> MaxOfLanes(Vec128<T, 2> v10) {
10811
- const DFromV<decltype(v10)> d;
10812
- return Max(v10, Reverse2(d, v10));
10813
- }
10814
-
10815
- // N=4 (only 16/32-bit, else >128-bit)
10816
- template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
10817
- HWY_INLINE Vec128<T, 4> SumOfLanes(Vec128<T, 4> v3210) {
10818
- using V = decltype(v3210);
10819
- const DFromV<V> d;
10820
- const V v0123 = Reverse4(d, v3210);
10821
- const V v03_12_12_03 = Add(v3210, v0123);
10822
- const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
10823
- return Add(v03_12_12_03, v12_03_03_12);
10824
- }
10825
- template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
10826
- HWY_INLINE Vec128<T, 4> MinOfLanes(Vec128<T, 4> v3210) {
10827
- using V = decltype(v3210);
10828
- const DFromV<V> d;
10829
- const V v0123 = Reverse4(d, v3210);
10830
- const V v03_12_12_03 = Min(v3210, v0123);
10831
- const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
10832
- return Min(v03_12_12_03, v12_03_03_12);
10833
- }
10834
- template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
10835
- HWY_INLINE Vec128<T, 4> MaxOfLanes(Vec128<T, 4> v3210) {
10836
- using V = decltype(v3210);
10837
- const DFromV<V> d;
10838
- const V v0123 = Reverse4(d, v3210);
10839
- const V v03_12_12_03 = Max(v3210, v0123);
10840
- const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
10841
- return Max(v03_12_12_03, v12_03_03_12);
11964
+ template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)>
11965
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
11966
+ const Repartition<uint64_t, decltype(d)> d64;
11967
+ VFromD<decltype(d64)> sums = SumsOf8(v);
11968
+ sums = SumOfLanes(d64, sums);
11969
+ return Broadcast<0>(BitCast(d, sums));
10842
11970
  }
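The u8 specializations above reduce via SumsOf8 (one u64 partial sum per group of 8 bytes) and keep only the low byte, i.e. the lane sum modulo 256, broadcast to every lane. Scalar reference (hypothetical helper name):

#include <cstddef>
#include <cstdint>

// Scalar reference for the u8 reduction: sum all lanes, truncate to 8 bits.
uint8_t ReduceSumU8Ref(const uint8_t* v, size_t lanes) {
  uint32_t sum = 0;
  for (size_t i = 0; i < lanes; ++i) sum += v[i];
  return static_cast<uint8_t>(sum & 0xFFu);
}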
10843
11971
 
10844
- #undef HWY_X86_IF_NOT_MINPOS
10845
11972
  #if HWY_TARGET <= HWY_SSE4
10846
- // Skip the T_SIZE = 2 overload in favor of the following two.
10847
- #define HWY_X86_IF_NOT_MINPOS(T) \
10848
- hwy::EnableIf<!IsSame<T, uint16_t>()>* = nullptr
11973
+ // We provide specializations of u8x8, u8x16, and u16x8, so exclude those.
11974
+ #undef HWY_IF_MINMAX_OF_LANES_D
11975
+ #define HWY_IF_MINMAX_OF_LANES_D(D) \
11976
+ HWY_IF_LANES_GT_D(D, 1), \
11977
+ hwy::EnableIf<(!hwy::IsSame<TFromD<D>, uint8_t>() || \
11978
+ ((HWY_V_SIZE_D(D) < 8) || (HWY_V_SIZE_D(D) > 16))) && \
11979
+ (!hwy::IsSame<TFromD<D>, uint16_t>() || \
11980
+ (HWY_V_SIZE_D(D) != 16))>* = nullptr
10849
11981
 
10850
- HWY_INLINE Vec128<uint16_t> MinOfLanes(Vec128<uint16_t> v) {
11982
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
11983
+ HWY_API Vec128<uint16_t> MinOfLanes(D /* tag */, Vec128<uint16_t> v) {
10851
11984
  return Broadcast<0>(Vec128<uint16_t>{_mm_minpos_epu16(v.raw)});
10852
11985
  }
10853
11986
 
10854
- HWY_INLINE Vec128<uint16_t> MaxOfLanes(Vec128<uint16_t> v) {
10855
- const DFromV<decltype(v)> d;
11987
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
11988
+ HWY_API Vec128<uint16_t> MaxOfLanes(D d, Vec128<uint16_t> v) {
10856
11989
  const Vec128<uint16_t> max = Set(d, LimitsMax<uint16_t>());
10857
- return max - MinOfLanes(max - v);
10858
- }
10859
- #else
10860
- #define HWY_X86_IF_NOT_MINPOS(T) hwy::EnableIf<true>* = nullptr
10861
- #endif // HWY_TARGET <= HWY_SSE4
10862
-
10863
- // N=8 (only 16-bit, else >128-bit)
10864
- template <typename T, HWY_IF_T_SIZE(T, 2)>
10865
- HWY_INLINE Vec128<T, 8> SumOfLanes(Vec128<T, 8> v76543210) {
10866
- using V = decltype(v76543210);
10867
- const DFromV<V> d;
10868
- // The upper half is reversed from the lower half; omit for brevity.
10869
- const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210));
10870
- const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07));
10871
- return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
10872
- }
10873
- template <typename T, HWY_IF_T_SIZE(T, 2), HWY_X86_IF_NOT_MINPOS(T)>
10874
- HWY_INLINE Vec128<T, 8> MinOfLanes(Vec128<T, 8> v76543210) {
10875
- using V = decltype(v76543210);
10876
- const DFromV<V> d;
10877
- // The upper half is reversed from the lower half; omit for brevity.
10878
- const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210));
10879
- const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07));
10880
- return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
10881
- }
10882
- template <typename T, HWY_IF_T_SIZE(T, 2), HWY_X86_IF_NOT_MINPOS(T)>
10883
- HWY_INLINE Vec128<T, 8> MaxOfLanes(Vec128<T, 8> v76543210) {
10884
- using V = decltype(v76543210);
10885
- const DFromV<V> d;
10886
- // The upper half is reversed from the lower half; omit for brevity.
10887
- const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210));
10888
- const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07));
10889
- return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
10890
- }
10891
-
10892
- template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
10893
- HWY_INLINE T ReduceSum(Vec128<T, N> v) {
10894
- return GetLane(SumOfLanes(v));
10895
- }
10896
-
10897
- // u8, N=8, N=16:
10898
- HWY_INLINE uint8_t ReduceSum(Vec64<uint8_t> v) {
10899
- return static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF);
10900
- }
10901
- HWY_INLINE Vec64<uint8_t> SumOfLanes(Vec64<uint8_t> v) {
10902
- const Full64<uint8_t> d;
10903
- return Set(d, ReduceSum(v));
10904
- }
10905
- HWY_INLINE uint8_t ReduceSum(Vec128<uint8_t> v) {
10906
- uint64_t sums = ReduceSum(SumsOf8(v));
10907
- return static_cast<uint8_t>(sums & 0xFF);
10908
- }
10909
- HWY_INLINE Vec128<uint8_t> SumOfLanes(Vec128<uint8_t> v) {
10910
- const DFromV<decltype(v)> d;
10911
- return Set(d, ReduceSum(v));
10912
- }
10913
- template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
10914
- HWY_INLINE int8_t ReduceSum(const Vec128<int8_t, N> v) {
10915
- const DFromV<decltype(v)> d;
10916
- const RebindToUnsigned<decltype(d)> du;
10917
- const auto is_neg = v < Zero(d);
10918
-
10919
- // Sum positive and negative lanes separately, then combine to get the result.
10920
- const auto positive = SumsOf8(BitCast(du, IfThenZeroElse(is_neg, v)));
10921
- const auto negative = SumsOf8(BitCast(du, IfThenElseZero(is_neg, Abs(v))));
10922
- return static_cast<int8_t>(ReduceSum(positive - negative) & 0xFF);
10923
- }
10924
- template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
10925
- HWY_INLINE Vec128<int8_t, N> SumOfLanes(const Vec128<int8_t, N> v) {
10926
- const DFromV<decltype(v)> d;
10927
- return Set(d, ReduceSum(v));
11990
+ return max - MinOfLanes(d, max - v);
10928
11991
  }
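On SSE4, MinOfLanes for u16x8 maps directly to _mm_minpos_epu16, and MaxOfLanes reuses it through the unsigned identity max(v) = 0xFFFF - min(0xFFFF - v). A scalar check of that identity (hypothetical helper name):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Computes max(v) as 0xFFFF - min(0xFFFF - v), mirroring the code above.
uint16_t MaxViaMinRef(const uint16_t* v, size_t lanes) {
  uint16_t min_flipped = 0xFFFF;
  for (size_t i = 0; i < lanes; ++i) {
    min_flipped = std::min<uint16_t>(min_flipped, uint16_t(0xFFFF - v[i]));
  }
  return uint16_t(0xFFFF - min_flipped);
}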
10929
11992
 
10930
- #if HWY_TARGET <= HWY_SSE4
10931
- HWY_INLINE Vec64<uint8_t> MinOfLanes(Vec64<uint8_t> v) {
10932
- const DFromV<decltype(v)> d;
11993
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
11994
+ HWY_API Vec64<uint8_t> MinOfLanes(D d, Vec64<uint8_t> v) {
10933
11995
  const Rebind<uint16_t, decltype(d)> d16;
10934
- return TruncateTo(d, MinOfLanes(PromoteTo(d16, v)));
11996
+ return TruncateTo(d, MinOfLanes(d16, PromoteTo(d16, v)));
10935
11997
  }
10936
- HWY_INLINE Vec128<uint8_t> MinOfLanes(Vec128<uint8_t> v) {
10937
- const Half<DFromV<decltype(v)>> d;
11998
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
11999
+ HWY_API Vec128<uint8_t> MinOfLanes(D d, Vec128<uint8_t> v) {
12000
+ const Half<decltype(d)> dh;
10938
12001
  Vec64<uint8_t> result =
10939
- Min(MinOfLanes(UpperHalf(d, v)), MinOfLanes(LowerHalf(d, v)));
10940
- return Combine(DFromV<decltype(v)>(), result, result);
12002
+ Min(MinOfLanes(dh, UpperHalf(dh, v)), MinOfLanes(dh, LowerHalf(dh, v)));
12003
+ return Combine(d, result, result);
10941
12004
  }
10942
12005
 
10943
- HWY_INLINE Vec64<uint8_t> MaxOfLanes(Vec64<uint8_t> v) {
10944
- const Vec64<uint8_t> m(Set(DFromV<decltype(v)>(), LimitsMax<uint8_t>()));
10945
- return m - MinOfLanes(m - v);
10946
- }
10947
- HWY_INLINE Vec128<uint8_t> MaxOfLanes(Vec128<uint8_t> v) {
10948
- const Vec128<uint8_t> m(Set(DFromV<decltype(v)>(), LimitsMax<uint8_t>()));
10949
- return m - MinOfLanes(m - v);
10950
- }
10951
- #elif HWY_TARGET >= HWY_SSSE3
10952
- template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
10953
- HWY_API Vec128<uint8_t, N> MaxOfLanes(Vec128<uint8_t, N> v) {
10954
- const DFromV<decltype(v)> d;
10955
- const RepartitionToWide<decltype(d)> d16;
10956
- const RepartitionToWide<decltype(d16)> d32;
10957
- Vec128<uint8_t, N> vm = Max(v, Reverse2(d, v));
10958
- vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
10959
- vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
10960
- if (N > 8) {
10961
- const RepartitionToWide<decltype(d32)> d64;
10962
- vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
10963
- }
10964
- return vm;
10965
- }
10966
-
10967
- template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
10968
- HWY_API Vec128<uint8_t, N> MinOfLanes(Vec128<uint8_t, N> v) {
10969
- const DFromV<decltype(v)> d;
10970
- const RepartitionToWide<decltype(d)> d16;
10971
- const RepartitionToWide<decltype(d16)> d32;
10972
- Vec128<uint8_t, N> vm = Min(v, Reverse2(d, v));
10973
- vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
10974
- vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
10975
- if (N > 8) {
10976
- const RepartitionToWide<decltype(d32)> d64;
10977
- vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
10978
- }
10979
- return vm;
10980
- }
10981
- #endif
10982
-
10983
- // Implement min/max of i8 in terms of u8 by toggling the sign bit.
10984
- template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
10985
- HWY_INLINE Vec128<int8_t, N> MinOfLanes(Vec128<int8_t, N> v) {
10986
- const DFromV<decltype(v)> d;
10987
- const RebindToUnsigned<decltype(d)> du;
10988
- const auto mask = SignBit(du);
10989
- const auto vu = Xor(BitCast(du, v), mask);
10990
- return BitCast(d, Xor(MinOfLanes(vu), mask));
12006
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
12007
+ HWY_API Vec64<uint8_t> MaxOfLanes(D d, Vec64<uint8_t> v) {
12008
+ const Vec64<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
12009
+ return m - MinOfLanes(d, m - v);
10991
12010
  }
10992
- template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
10993
- HWY_INLINE Vec128<int8_t, N> MaxOfLanes(Vec128<int8_t, N> v) {
10994
- const DFromV<decltype(v)> d;
10995
- const RebindToUnsigned<decltype(d)> du;
10996
- const auto mask = SignBit(du);
10997
- const auto vu = Xor(BitCast(du, v), mask);
10998
- return BitCast(d, Xor(MaxOfLanes(vu), mask));
12011
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
12012
+ HWY_API Vec128<uint8_t> MaxOfLanes(D d, Vec128<uint8_t> v) {
12013
+ const Vec128<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
12014
+ return m - MinOfLanes(d, m - v);
10999
12015
  }
11000
12016
 
11001
- } // namespace detail
11002
-
11003
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11004
- HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
11005
- return detail::SumOfLanes(v);
11006
- }
11007
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11008
- HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
11009
- return detail::ReduceSum(v);
11010
- }
11011
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11012
- HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
11013
- return detail::MinOfLanes(v);
11014
- }
11015
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
11016
- HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
11017
- return detail::MaxOfLanes(v);
11018
- }
12017
+ #endif // HWY_TARGET <= HWY_SSE4
11019
12018
 
11020
12019
  // ------------------------------ Lt128
11021
12020
 
@@ -11168,6 +12167,8 @@ HWY_API V LeadingZeroCount(V v) {
11168
12167
  } // namespace hwy
11169
12168
  HWY_AFTER_NAMESPACE();
11170
12169
 
12170
+ #undef HWY_X86_IF_EMULATED_D
12171
+
11171
12172
  // Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
11172
12173
  // the warning seems to be issued at the call site of intrinsics, i.e. our code.
11173
12174
  HWY_DIAGNOSTICS(pop)