@img/sharp-libvips-dev 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/freetype2/freetype/config/ftoption.h +1 -1
  12. package/include/fribidi/fribidi-config.h +2 -2
  13. package/include/fribidi/fribidi-unicode-version.h +3 -3
  14. package/include/glib-2.0/gio/gappinfo.h +40 -25
  15. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  16. package/include/glib-2.0/gio/gconverter.h +5 -0
  17. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  18. package/include/glib-2.0/gio/gfile.h +16 -0
  19. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  20. package/include/glib-2.0/gio/gsettings.h +8 -0
  21. package/include/glib-2.0/gio/gvfs.h +2 -2
  22. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  23. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  24. package/include/glib-2.0/glib/giochannel.h +2 -2
  25. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  26. package/include/glib-2.0/glib/gmacros.h +12 -5
  27. package/include/glib-2.0/glib/gmain.h +93 -7
  28. package/include/glib-2.0/glib/gqsort.h +8 -1
  29. package/include/glib-2.0/glib/gstrfuncs.h +0 -12
  30. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  31. package/include/glib-2.0/glib/gunicode.h +1 -1
  32. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  33. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  34. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  35. package/include/glib-2.0/gobject/gtype.h +6 -6
  36. package/include/harfbuzz/hb-buffer.h +6 -0
  37. package/include/harfbuzz/hb-common.h +6 -9
  38. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  39. package/include/harfbuzz/hb-subset.h +17 -4
  40. package/include/harfbuzz/hb-version.h +3 -3
  41. package/include/hwy/abort.h +28 -0
  42. package/include/hwy/aligned_allocator.h +48 -1
  43. package/include/hwy/base.h +235 -34
  44. package/include/hwy/detect_compiler_arch.h +84 -10
  45. package/include/hwy/detect_targets.h +95 -29
  46. package/include/hwy/foreach_target.h +12 -1
  47. package/include/hwy/highway.h +205 -50
  48. package/include/hwy/ops/arm_neon-inl.h +841 -99
  49. package/include/hwy/ops/arm_sve-inl.h +413 -141
  50. package/include/hwy/ops/emu128-inl.h +373 -360
  51. package/include/hwy/ops/generic_ops-inl.h +804 -401
  52. package/include/hwy/ops/inside-inl.h +691 -0
  53. package/include/hwy/ops/ppc_vsx-inl.h +456 -166
  54. package/include/hwy/ops/rvv-inl.h +537 -249
  55. package/include/hwy/ops/scalar-inl.h +169 -79
  56. package/include/hwy/ops/set_macros-inl.h +106 -18
  57. package/include/hwy/ops/shared-inl.h +23 -0
  58. package/include/hwy/ops/wasm_128-inl.h +130 -108
  59. package/include/hwy/ops/x86_128-inl.h +1892 -577
  60. package/include/hwy/ops/x86_256-inl.h +625 -184
  61. package/include/hwy/ops/x86_512-inl.h +733 -131
  62. package/include/hwy/targets.h +22 -21
  63. package/include/hwy/timer-inl.h +3 -3
  64. package/include/hwy/timer.h +5 -1
  65. package/include/libheif/heif.h +170 -15
  66. package/include/libheif/heif_items.h +237 -0
  67. package/include/libheif/heif_properties.h +38 -2
  68. package/include/libheif/heif_regions.h +1 -1
  69. package/include/libheif/heif_version.h +2 -2
  70. package/include/libpng16/pnglibconf.h +1 -1
  71. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  72. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  73. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  74. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  75. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  76. package/include/libxml2/libxml/HTMLparser.h +12 -19
  77. package/include/libxml2/libxml/c14n.h +1 -12
  78. package/include/libxml2/libxml/debugXML.h +1 -1
  79. package/include/libxml2/libxml/encoding.h +9 -0
  80. package/include/libxml2/libxml/entities.h +12 -1
  81. package/include/libxml2/libxml/hash.h +19 -0
  82. package/include/libxml2/libxml/list.h +2 -2
  83. package/include/libxml2/libxml/nanohttp.h +17 -0
  84. package/include/libxml2/libxml/parser.h +61 -55
  85. package/include/libxml2/libxml/parserInternals.h +9 -1
  86. package/include/libxml2/libxml/pattern.h +6 -0
  87. package/include/libxml2/libxml/tree.h +32 -12
  88. package/include/libxml2/libxml/uri.h +11 -0
  89. package/include/libxml2/libxml/valid.h +29 -2
  90. package/include/libxml2/libxml/xinclude.h +7 -0
  91. package/include/libxml2/libxml/xmlIO.h +21 -4
  92. package/include/libxml2/libxml/xmlerror.h +14 -0
  93. package/include/libxml2/libxml/xmlexports.h +111 -15
  94. package/include/libxml2/libxml/xmlmemory.h +8 -45
  95. package/include/libxml2/libxml/xmlreader.h +2 -0
  96. package/include/libxml2/libxml/xmlsave.h +5 -0
  97. package/include/libxml2/libxml/xmlunicode.h +165 -1
  98. package/include/libxml2/libxml/xmlversion.h +15 -179
  99. package/include/libxml2/libxml/xmlwriter.h +1 -0
  100. package/include/libxml2/libxml/xpath.h +4 -0
  101. package/include/pango-1.0/pango/pango-features.h +3 -3
  102. package/include/pango-1.0/pango/pango-item.h +4 -2
  103. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  104. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  105. package/include/pnglibconf.h +1 -1
  106. package/include/vips/util.h +1 -2
  107. package/include/vips/version.h +4 -4
  108. package/include/webp/decode.h +58 -56
  109. package/include/webp/demux.h +25 -21
  110. package/include/webp/encode.h +44 -39
  111. package/include/webp/mux.h +76 -15
  112. package/include/webp/mux_types.h +2 -1
  113. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  114. package/include/webp/types.h +29 -8
  115. package/include/zconf.h +1 -1
  116. package/include/zlib.h +12 -12
  117. package/package.json +1 -1
  118. package/versions.json +14 -15
@@ -1,5 +1,6 @@
1
1
  // Copyright 2021 Google LLC
2
- // Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
2
+ // Copyright 2023,2024 Arm Limited and/or
3
+ // its affiliates <open-source-office@arm.com>
3
4
  // SPDX-License-Identifier: Apache-2.0
4
5
  // SPDX-License-Identifier: BSD-3-Clause
5
6
  //
@@ -59,7 +60,7 @@ HWY_API V Clamp(const V v, const V lo, const V hi) {
59
60
 
60
61
  // CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
61
62
  // and RVV has its own implementation of -Lanes.
62
- #if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
63
+ #if (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV) || HWY_IDE
63
64
 
64
65
  template <size_t kLanes, class D>
65
66
  HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
@@ -197,6 +198,23 @@ HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
197
198
  #endif
198
199
  }
199
200
 
201
+ // ------------------------------ IsNegative
202
+ #if (defined(HWY_NATIVE_IS_NEGATIVE) == defined(HWY_TARGET_TOGGLE))
203
+ #ifdef HWY_NATIVE_IS_NEGATIVE
204
+ #undef HWY_NATIVE_IS_NEGATIVE
205
+ #else
206
+ #define HWY_NATIVE_IS_NEGATIVE
207
+ #endif
208
+
209
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
210
+ HWY_API Mask<DFromV<V>> IsNegative(V v) {
211
+ const DFromV<decltype(v)> d;
212
+ const RebindToSigned<decltype(d)> di;
213
+ return RebindMask(d, MaskFromVec(BroadcastSignBit(BitCast(di, v))));
214
+ }
215
+
216
+ #endif // HWY_NATIVE_IS_NEGATIVE
217
+
200
218
  // ------------------------------ MaskFalse
201
219
  #if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
202
220
  #ifdef HWY_NATIVE_MASK_FALSE
@@ -212,6 +230,44 @@ HWY_API Mask<D> MaskFalse(D d) {
212
230
 
213
231
  #endif // HWY_NATIVE_MASK_FALSE
214
232
 
233
+ // ------------------------------ IfNegativeThenElseZero
234
+ #if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE))
235
+ #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
236
+ #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
237
+ #else
238
+ #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
239
+ #endif
240
+
241
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
242
+ HWY_API V IfNegativeThenElseZero(V v, V yes) {
243
+ return IfThenElseZero(IsNegative(v), yes);
244
+ }
245
+
246
+ #endif // HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
247
+
248
+ // ------------------------------ IfNegativeThenZeroElse
249
+ #if (defined(HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE) == defined(HWY_TARGET_TOGGLE))
250
+ #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
251
+ #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
252
+ #else
253
+ #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
254
+ #endif
255
+
256
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
257
+ HWY_API V IfNegativeThenZeroElse(V v, V no) {
258
+ return IfThenZeroElse(IsNegative(v), no);
259
+ }
260
+
261
+ #endif // HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
262
+
263
+ // ------------------------------ ZeroIfNegative (IfNegativeThenZeroElse)
264
+
265
+ // ZeroIfNegative is generic for all vector lengths
266
+ template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
267
+ HWY_API V ZeroIfNegative(V v) {
268
+ return IfNegativeThenZeroElse(v, v);
269
+ }
270
+
215
271
  // ------------------------------ BitwiseIfThenElse
216
272
  #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
217
273
  #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
@@ -289,7 +345,7 @@ HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
289
345
  #define HWY_NATIVE_COMBINE_MASKS
290
346
  #endif
291
347
 
292
- #if HWY_TARGET != HWY_SCALAR
348
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
293
349
  template <class D>
294
350
  HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
295
351
  const Half<decltype(d)> dh;
@@ -325,7 +381,7 @@ HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
325
381
  #define HWY_NATIVE_UPPER_HALF_OF_MASK
326
382
  #endif
327
383
 
328
- #if HWY_TARGET != HWY_SCALAR
384
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
329
385
  template <class D>
330
386
  HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
331
387
  const Twice<decltype(d)> dt;
@@ -345,7 +401,7 @@ HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
345
401
  #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
346
402
  #endif
347
403
 
348
- #if HWY_TARGET != HWY_SCALAR
404
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
349
405
  template <class DTo, class DFrom>
350
406
  HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
351
407
  Mask<DFrom> b) {
@@ -367,6 +423,17 @@ HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
367
423
 
368
424
  #endif // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
369
425
 
426
+ // ------------------------------ RotateLeft
427
+ template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
428
+ HWY_API V RotateLeft(V v) {
429
+ constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
430
+ static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
431
+
432
+ constexpr int kRotateRightAmt =
433
+ (kBits == 0) ? 0 : static_cast<int>(kSizeInBits) - kBits;
434
+ return RotateRight<kRotateRightAmt>(v);
435
+ }
436
+
370
437
  // ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
371
438
  #if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
372
439
  #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
@@ -375,7 +442,7 @@ HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
375
442
  #define HWY_NATIVE_INTERLEAVE_WHOLE
376
443
  #endif
377
444
 
378
- #if HWY_TARGET != HWY_SCALAR
445
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
379
446
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
380
447
  HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
381
448
  // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
@@ -401,7 +468,7 @@ HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
401
468
 
402
469
  #endif // HWY_NATIVE_INTERLEAVE_WHOLE
403
470
 
404
- #if HWY_TARGET != HWY_SCALAR
471
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
405
472
  // The InterleaveWholeLower without the optional D parameter is generic for all
406
473
  // vector lengths.
407
474
  template <class V>
@@ -410,6 +477,17 @@ HWY_API V InterleaveWholeLower(V a, V b) {
410
477
  }
411
478
  #endif // HWY_TARGET != HWY_SCALAR
412
479
 
480
+ // ------------------------------ InterleaveEven
481
+
482
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
483
+ // InterleaveEven without the optional D parameter is generic for all vector
484
+ // lengths
485
+ template <class V>
486
+ HWY_API V InterleaveEven(V a, V b) {
487
+ return InterleaveEven(DFromV<V>(), a, b);
488
+ }
489
+ #endif
490
+
413
491
  // ------------------------------ AddSub
414
492
 
415
493
  template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
@@ -423,10 +501,11 @@ HWY_API V AddSub(V a, V b) {
423
501
 
424
502
  // AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
425
503
  // AVX2/AVX3
426
- template <class V, HWY_IF_V_SIZE_GT_V(V, ((HWY_TARGET <= HWY_SSSE3 &&
427
- hwy::IsFloat3264<TFromV<V>>())
428
- ? 32
429
- : sizeof(TFromV<V>)))>
504
+
505
+ // AddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h
506
+
507
+ // AddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
508
+ template <class V, HWY_IF_ADDSUB_V(V)>
430
509
  HWY_API V AddSub(V a, V b) {
431
510
  using D = DFromV<decltype(a)>;
432
511
  using T = TFromD<D>;
@@ -507,7 +586,7 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
507
586
 
508
587
  template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
509
588
  HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
510
- #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
589
+ #if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
511
590
  // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
512
591
  const auto zero = Zero(DFromV<V>());
513
592
  return MaskedSubOr(v, Lt(mask, zero), zero, v);
@@ -543,10 +622,9 @@ template <class V, HWY_IF_I32(TFromV<V>)>
543
622
  HWY_API V SaturatedNeg(V v) {
544
623
  const DFromV<decltype(v)> d;
545
624
 
546
- #if HWY_TARGET == HWY_RVV || \
547
- (HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \
548
- (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
549
- // RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions
625
+ #if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_PPC || HWY_TARGET_IS_SVE || \
626
+ HWY_TARGET_IS_NEON
627
+ // RVV/PPC/SVE/NEON have native I32 SaturatedSub instructions
550
628
  return SaturatedSub(Zero(d), v);
551
629
  #else
552
630
  // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
@@ -567,9 +645,8 @@ HWY_API V SaturatedNeg(V v) {
567
645
 
568
646
  template <class V, HWY_IF_I64(TFromV<V>)>
569
647
  HWY_API V SaturatedNeg(V v) {
570
- #if HWY_TARGET == HWY_RVV || \
571
- (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES)
572
- // RVV/NEON/SVE have native I64 SaturatedSub instructions
648
+ #if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_SVE || HWY_TARGET_IS_NEON
649
+ // RVV/SVE/NEON have native I64 SaturatedSub instructions
573
650
  const DFromV<decltype(v)> d;
574
651
  return SaturatedSub(Zero(d), v);
575
652
  #else
@@ -805,6 +882,21 @@ HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
805
882
  }
806
883
  #endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8
807
884
 
885
+ // ------------------------------ IsEitherNaN
886
+ #if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
887
+ #ifdef HWY_NATIVE_IS_EITHER_NAN
888
+ #undef HWY_NATIVE_IS_EITHER_NAN
889
+ #else
890
+ #define HWY_NATIVE_IS_EITHER_NAN
891
+ #endif
892
+
893
+ template <class V, HWY_IF_FLOAT_V(V)>
894
+ HWY_API MFromD<DFromV<V>> IsEitherNaN(V a, V b) {
895
+ return Or(IsNaN(a), IsNaN(b));
896
+ }
897
+
898
+ #endif // HWY_NATIVE_IS_EITHER_NAN
899
+
808
900
  // ------------------------------ IsInf, IsFinite
809
901
 
810
902
  // AVX3 has target-specific implementations of these.
@@ -1290,8 +1382,9 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1290
1382
  TFromD<D>* HWY_RESTRICT unaligned) {
1291
1383
  const RebindToUnsigned<decltype(d)> du;
1292
1384
  using TU = TFromD<decltype(du)>;
1293
- const auto k5 = Set(du, TU{5});
1294
- const auto k6 = Set(du, TU{6});
1385
+ using VU = VFromD<decltype(du)>;
1386
+ const VU k5 = Set(du, TU{5});
1387
+ const VU k6 = Set(du, TU{6});
1295
1388
 
1296
1389
  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
1297
1390
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
@@ -1307,29 +1400,29 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1307
1400
  // The interleaved vectors will be named A, B, C; temporaries with suffix
1308
1401
  // 0..2 indicate which input vector's lanes they hold.
1309
1402
  // cannot reuse shuf_A0 (has 5)
1310
- const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
1311
- const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
1312
- const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
1313
- const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
1314
- const VFromD<D> A = BitCast(d, A0 | A1 | A2);
1403
+ const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
1404
+ const VU vA0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
1405
+ const VU vA1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
1406
+ const VU vA2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
1407
+ const VFromD<D> A = BitCast(d, vA0 | vA1 | vA2);
1315
1408
 
1316
1409
  // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
1317
- const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6..
1318
- const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5
1319
- const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5.
1320
- const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
1321
- const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
1322
- const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
1323
- const VFromD<D> B = BitCast(d, B0 | B1 | B2);
1410
+ const VU shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6..
1411
+ const VU shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5
1412
+ const VU shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5.
1413
+ const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
1414
+ const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
1415
+ const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
1416
+ const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);
1324
1417
 
1325
1418
  // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
1326
- const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B.
1327
- const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B..
1328
- const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A
1329
- const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
1330
- const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
1331
- const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
1332
- const VFromD<D> C = BitCast(d, C0 | C1 | C2);
1419
+ const VU shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B.
1420
+ const VU shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B..
1421
+ const VU shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A
1422
+ const VU vC0 = TableLookupBytesOr0(v0, shuf_C0);
1423
+ const VU vC1 = TableLookupBytesOr0(v1, shuf_C1);
1424
+ const VU vC2 = TableLookupBytesOr0(v2, shuf_C2);
1425
+ const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);
1333
1426
 
1334
1427
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
1335
1428
  }
@@ -1339,8 +1432,9 @@ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
1339
1432
  HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1340
1433
  TFromD<D>* HWY_RESTRICT unaligned) {
1341
1434
  const Repartition<uint8_t, decltype(d)> du8;
1342
- const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
1343
- const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
1435
+ using VU8 = VFromD<decltype(du8)>;
1436
+ const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
1437
+ const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
1344
1438
 
1345
1439
  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
1346
1440
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
@@ -1355,30 +1449,30 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
1355
1449
 
1356
1450
  // The interleaved vectors will be named A, B, C; temporaries with suffix
1357
1451
  // 0..2 indicate which input vector's lanes they hold.
1358
- const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
1452
+ const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
1359
1453
 
1360
- const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
1361
- const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
1362
- const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
1454
+ const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
1455
+ const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
1456
+ const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
1363
1457
  const VFromD<D> A = BitCast(d, A0 | A1 | A2);
1364
1458
 
1365
1459
  // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
1366
- const auto shuf_B0 = shuf_A1 + k3; // 5..4..3.
1367
- const auto shuf_B1 = shuf_A2 + k3; // ..4..3..
1368
- const auto shuf_B2 = shuf_A0 + k2; // .4..3..2
1369
- const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
1370
- const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
1371
- const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
1372
- const VFromD<D> B = BitCast(d, B0 | B1 | B2);
1460
+ const VU8 shuf_B0 = shuf_A1 + k3; // 5..4..3.
1461
+ const VU8 shuf_B1 = shuf_A2 + k3; // ..4..3..
1462
+ const VU8 shuf_B2 = shuf_A0 + k2; // .4..3..2
1463
+ const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
1464
+ const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
1465
+ const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
1466
+ const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);
1373
1467
 
1374
1468
  // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
1375
- const auto shuf_C0 = shuf_B1 + k3; // ..7..6..
1376
- const auto shuf_C1 = shuf_B2 + k3; // .7..6..5
1377
- const auto shuf_C2 = shuf_B0 + k2; // 7..6..5.
1378
- const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
1379
- const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
1380
- const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
1381
- const VFromD<D> C = BitCast(d, C0 | C1 | C2);
1469
+ const VU8 shuf_C0 = shuf_B1 + k3; // ..7..6..
1470
+ const VU8 shuf_C1 = shuf_B2 + k3; // .7..6..5
1471
+ const VU8 shuf_C2 = shuf_B0 + k2; // 7..6..5.
1472
+ const VU8 vC0 = TableLookupBytesOr0(v0, shuf_C0);
1473
+ const VU8 vC1 = TableLookupBytesOr0(v1, shuf_C1);
1474
+ const VU8 vC2 = TableLookupBytesOr0(v2, shuf_C2);
1475
+ const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);
1382
1476
 
1383
1477
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
1384
1478
  }
@@ -1431,9 +1525,10 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1431
1525
  // Use full vectors for the shuffles and first result.
1432
1526
  constexpr size_t kFullN = 16 / sizeof(TFromD<D>);
1433
1527
  const Full128<uint8_t> du;
1528
+ using VU = VFromD<decltype(du)>;
1434
1529
  const Full128<TFromD<D>> d_full;
1435
- const auto k5 = Set(du, uint8_t{5});
1436
- const auto k6 = Set(du, uint8_t{6});
1530
+ const VU k5 = Set(du, uint8_t{5});
1531
+ const VU k6 = Set(du, uint8_t{6});
1437
1532
 
1438
1533
  const VFromD<decltype(d_full)> v0{part0.raw};
1439
1534
  const VFromD<decltype(d_full)> v1{part1.raw};
@@ -1450,23 +1545,23 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1450
1545
  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
1451
1546
  // The interleaved vectors will be named A, B, C; temporaries with suffix
1452
1547
  // 0..2 indicate which input vector's lanes they hold.
1453
- const auto shuf_A0 = Load(du, tbl_v0);
1454
- const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB)
1455
- const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
1456
- const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
1457
- const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
1458
- const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
1548
+ const VU shuf_A0 = Load(du, tbl_v0);
1549
+ const VU shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB)
1550
+ const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
1551
+ const VU A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
1552
+ const VU A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
1553
+ const VU A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
1459
1554
  const auto A = BitCast(d_full, A0 | A1 | A2);
1460
1555
  StoreU(A, d_full, unaligned + 0 * kFullN);
1461
1556
 
1462
1557
  // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
1463
- const auto shuf_B0 = shuf_A2 + k6; // ..7..6..
1464
- const auto shuf_B1 = shuf_A0 + k5; // .7..6..5
1465
- const auto shuf_B2 = shuf_A1 + k5; // 7..6..5.
1466
- const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
1467
- const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
1468
- const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
1469
- const VFromD<D> B{BitCast(d_full, B0 | B1 | B2).raw};
1558
+ const VU shuf_B0 = shuf_A2 + k6; // ..7..6..
1559
+ const VU shuf_B1 = shuf_A0 + k5; // .7..6..5
1560
+ const VU shuf_B2 = shuf_A1 + k5; // 7..6..5.
1561
+ const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
1562
+ const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
1563
+ const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
1564
+ const VFromD<D> B{BitCast(d_full, vB0 | vB1 | vB2).raw};
1470
1565
  StoreU(B, d, unaligned + 1 * kFullN);
1471
1566
  }
1472
1567
 
@@ -1477,8 +1572,9 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1477
1572
  TFromD<D>* HWY_RESTRICT unaligned) {
1478
1573
  const Twice<D> d_full;
1479
1574
  const Full128<uint8_t> du8;
1480
- const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
1481
- const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
1575
+ using VU8 = VFromD<decltype(du8)>;
1576
+ const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
1577
+ const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
1482
1578
 
1483
1579
  const VFromD<decltype(d_full)> v0{part0.raw};
1484
1580
  const VFromD<decltype(d_full)> v1{part1.raw};
@@ -1497,25 +1593,25 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1497
1593
 
1498
1594
  // The interleaved vectors will be named A, B; temporaries with suffix
1499
1595
  // 0..2 indicate which input vector's lanes they hold.
1500
- const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0.
1501
- // .2..1..0
1502
- const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
1503
- const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0..
1504
-
1505
- const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
1506
- const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
1507
- const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
1596
+ const VU8 shuf_A1 = Load(du8, tbl_v1); // 2..1..0.
1597
+ // .2..1..0
1598
+ const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
1599
+ const VU8 shuf_A2 = Load(du8, tbl_v2); // ..1..0..
1600
+
1601
+ const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
1602
+ const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
1603
+ const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
1508
1604
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
1509
1605
  StoreU(A, d_full, unaligned);
1510
1606
 
1511
1607
  // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
1512
- const auto shuf_B0 = shuf_A1 + k3; // ..3.
1513
- const auto shuf_B1 = shuf_A2 + k3; // .3..
1514
- const auto shuf_B2 = shuf_A0 + k2; // 3..2
1515
- const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
1516
- const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
1517
- const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
1518
- const VFromD<decltype(d_full)> B = BitCast(d_full, B0 | B1 | B2);
1608
+ const VU8 shuf_B0 = shuf_A1 + k3; // ..3.
1609
+ const VU8 shuf_B1 = shuf_A2 + k3; // .3..
1610
+ const VU8 shuf_B2 = shuf_A0 + k2; // 3..2
1611
+ const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
1612
+ const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
1613
+ const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
1614
+ const VFromD<decltype(d_full)> B = BitCast(d_full, vB0 | vB1 | vB2);
1519
1615
  StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full));
1520
1616
  }
1521
1617
 
@@ -1543,6 +1639,7 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1543
1639
  TFromD<D>* HWY_RESTRICT unaligned) {
1544
1640
  // Use full vectors for the shuffles and result.
1545
1641
  const Full128<uint8_t> du;
1642
+ using VU = VFromD<decltype(du)>;
1546
1643
  const Full128<TFromD<D>> d_full;
1547
1644
 
1548
1645
  const VFromD<decltype(d_full)> v0{part0.raw};
@@ -1557,12 +1654,12 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1557
1654
  0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1558
1655
  // The interleaved vector will be named A; temporaries with suffix
1559
1656
  // 0..2 indicate which input vector's lanes they hold.
1560
- const auto shuf_A0 = Load(du, tbl_v0);
1561
- const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
1562
- const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
1563
- const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0
1564
- const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0.
1565
- const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0..
1657
+ const VU shuf_A0 = Load(du, tbl_v0);
1658
+ const VU shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
1659
+ const VU shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
1660
+ const VU A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0
1661
+ const VU A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0.
1662
+ const VU A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0..
1566
1663
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
1567
1664
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
1568
1665
  StoreU(A, d_full, buf);
@@ -1576,6 +1673,7 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1576
1673
  TFromD<D>* HWY_RESTRICT unaligned) {
1577
1674
  // Use full vectors for the shuffles and result.
1578
1675
  const Full128<uint8_t> du8;
1676
+ using VU8 = VFromD<decltype(du8)>;
1579
1677
  const Full128<TFromD<D>> d_full;
1580
1678
 
1581
1679
  const VFromD<decltype(d_full)> v0{part0.raw};
@@ -1590,15 +1688,14 @@ HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
1590
1688
  0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
1591
1689
  // The interleaved vector will be named A; temporaries with suffix
1592
1690
  // 0..2 indicate which input vector's lanes they hold.
1593
- const auto shuf_A2 = // ..1..0..
1594
- Load(du8, tbl_v2);
1595
- const auto shuf_A1 = // ...1..0.
1596
- CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
1597
- const auto shuf_A0 = // ....1..0
1598
- CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
1599
- const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0
1600
- const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0.
1601
- const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0..
1691
+ const VU8 shuf_A2 = Load(du8, tbl_v2); // ..1..0..
1692
+ const VU8 shuf_A1 =
1693
+ CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2); // ...1..0.
1694
+ const VU8 shuf_A0 =
1695
+ CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2); // ....1..0
1696
+ const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0
1697
+ const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0.
1698
+ const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0..
1602
1699
  const auto A = BitCast(d_full, A0 | A1 | A2);
1603
1700
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
1604
1701
  StoreU(A, d_full, buf);
@@ -2089,8 +2186,7 @@ namespace detail {
2089
2186
 
2090
2187
  template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
2091
2188
  HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
2092
- constexpr size_t kMinShrVectBytes =
2093
- (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) ? 8 : 16;
2189
+ constexpr size_t kMinShrVectBytes = HWY_TARGET_IS_NEON ? 8 : 16;
2094
2190
  const FixedTag<uint8_t, kMinShrVectBytes> d_shift;
2095
2191
  return ResizeBitCast(
2096
2192
  dh, ShiftRightBytes<dh.MaxBytes()>(d_shift, ResizeBitCast(d_shift, v)));
@@ -2299,6 +2395,25 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
2299
2395
  }
2300
2396
  }
2301
2397
 
2398
+ template <class D, typename T = TFromD<D>>
2399
+ HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
2400
+ VFromD<RebindToSigned<D>> index,
2401
+ const size_t max_lanes_to_store) {
2402
+ const RebindToSigned<decltype(d)> di;
2403
+ using TI = TFromD<decltype(di)>;
2404
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2405
+
2406
+ for (size_t i = 0; i < MaxLanes(d); ++i) {
2407
+ if (i < max_lanes_to_store) base[ExtractLane(index, i)] = ExtractLane(v, i);
2408
+ }
2409
+ }
2410
+ #else
2411
+ template <class D, typename T = TFromD<D>>
2412
+ HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
2413
+ VFromD<RebindToSigned<D>> index,
2414
+ const size_t max_lanes_to_store) {
2415
+ MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index);
2416
+ }
2302
2417
  #endif // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
2303
2418
 
2304
2419
  // ------------------------------ Gather
@@ -2394,23 +2509,49 @@ HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
2394
2509
  return Load(d, lanes);
2395
2510
  }
2396
2511
 
2397
- #endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
2512
+ template <class D, typename T = TFromD<D>>
2513
+ HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
2514
+ VFromD<RebindToSigned<D>> index,
2515
+ const size_t max_lanes_to_load) {
2516
+ const RebindToSigned<D> di;
2517
+ using TI = TFromD<decltype(di)>;
2518
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2398
2519
 
2399
- // ------------------------------ ScatterN/GatherN
2520
+ VFromD<D> v = Zero(d);
2521
+ for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
2522
+ v = InsertLane(v, i, base[ExtractLane(index, i)]);
2523
+ }
2524
+ return v;
2525
+ }
2400
2526
 
2401
2527
  template <class D, typename T = TFromD<D>>
2402
- HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
2403
- VFromD<RebindToSigned<D>> index,
2404
- const size_t max_lanes_to_store) {
2405
- MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index);
2406
- }
2528
+ HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
2529
+ VFromD<RebindToSigned<D>> index,
2530
+ const size_t max_lanes_to_load) {
2531
+ const RebindToSigned<D> di;
2532
+ using TI = TFromD<decltype(di)>;
2533
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
2407
2534
 
2535
+ VFromD<D> v = no;
2536
+ for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
2537
+ v = InsertLane(v, i, base[ExtractLane(index, i)]);
2538
+ }
2539
+ return v;
2540
+ }
2541
+ #else
2408
2542
  template <class D, typename T = TFromD<D>>
2409
2543
  HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
2410
2544
  VFromD<RebindToSigned<D>> index,
2411
2545
  const size_t max_lanes_to_load) {
2412
2546
  return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index);
2413
2547
  }
2548
+ template <class D, typename T = TFromD<D>>
2549
+ HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
2550
+ VFromD<RebindToSigned<D>> index,
2551
+ const size_t max_lanes_to_load) {
2552
+ return MaskedGatherIndexOr(no, FirstN(d, max_lanes_to_load), d, base, index);
2553
+ }
2554
+ #endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
2414
2555
 
2415
2556
  // ------------------------------ Integer AbsDiff and SumsOf8AbsDiff
2416
2557
 
@@ -2548,6 +2689,7 @@ HWY_API V SaturatedSub(V a, V b) {
2548
2689
  // ------------------------------ Unsigned to signed demotions
2549
2690
 
2550
2691
  template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
2692
+ HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
2551
2693
  class V2 = VFromD<Rebind<TFromV<V>, DN>>,
2552
2694
  hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr,
2553
2695
  HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
@@ -2571,6 +2713,7 @@ HWY_API VFromD<DN> DemoteTo(DN dn, V v) {
2571
2713
 
2572
2714
  #if HWY_TARGET != HWY_SCALAR || HWY_IDE
2573
2715
  template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
2716
+ HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
2574
2717
  class V2 = VFromD<Repartition<TFromV<V>, DN>>,
2575
2718
  HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
2576
2719
  HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
@@ -2629,248 +2772,6 @@ HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
2629
2772
  #endif // HWY_TARGET != HWY_SCALAR
2630
2773
  #endif // HWY_NATIVE_PROMOTE_UPPER_TO
2631
2774
 
2632
- // ------------------------------ PromoteEvenTo/PromoteOddTo
2633
-
2634
- #if HWY_TARGET != HWY_SCALAR
2635
- namespace detail {
2636
-
2637
- // Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as
2638
- // there are target-specific specializations for some of the
2639
- // detail::PromoteEvenTo and detail::PromoteOddTo cases on
2640
- // SVE/PPC/SSE2/SSSE3/SSE4/AVX2.
2641
-
2642
- // All targets except HWY_SCALAR use the implementations of
2643
- // detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at
2644
- // least some of the PromoteEvenTo and PromoteOddTo cases.
2645
-
2646
- // Signed to signed PromoteEvenTo/PromoteOddTo
2647
- template <size_t kToLaneSize, class D, class V>
2648
- HWY_INLINE VFromD<D> PromoteEvenTo(
2649
- hwy::SignedTag /*to_type_tag*/,
2650
- hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2651
- hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
2652
- #if HWY_IS_LITTLE_ENDIAN
2653
- // On little-endian targets, need to shift each lane of the bitcasted vector
2654
- // left by kToLaneSize * 4 bits to get the bits of the even source lanes into
2655
- // the upper kToLaneSize * 4 bits of even_in_hi.
2656
- const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
2657
- #else
2658
- // On big-endian targets, the bits of the even source lanes are already in
2659
- // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2660
- const auto even_in_hi = BitCast(d_to, v);
2661
- #endif
2662
-
2663
- // Right-shift even_in_hi by kToLaneSize * 4 bits
2664
- return ShiftRight<kToLaneSize * 4>(even_in_hi);
2665
- }
2666
-
2667
- template <size_t kToLaneSize, class D, class V>
2668
- HWY_INLINE VFromD<D> PromoteOddTo(
2669
- hwy::SignedTag /*to_type_tag*/,
2670
- hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2671
- hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
2672
- #if HWY_IS_LITTLE_ENDIAN
2673
- // On little-endian targets, the bits of the odd source lanes are already in
2674
- // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2675
- const auto odd_in_hi = BitCast(d_to, v);
2676
- #else
2677
- // On big-endian targets, need to shift each lane of the bitcasted vector left
2678
- // by kToLaneSize * 4 bits to get the bits of the odd source lanes into the
2679
- // upper kToLaneSize * 4 bits of odd_in_hi.
2680
- const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
2681
- #endif
2682
-
2683
- // Right-shift odd_in_hi by kToLaneSize * 4 bits
2684
- return ShiftRight<kToLaneSize * 4>(odd_in_hi);
2685
- }
2686
-
2687
- // Unsigned to unsigned PromoteEvenTo/PromoteOddTo
2688
- template <size_t kToLaneSize, class D, class V>
2689
- HWY_INLINE VFromD<D> PromoteEvenTo(
2690
- hwy::UnsignedTag /*to_type_tag*/,
2691
- hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2692
- hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2693
- #if HWY_IS_LITTLE_ENDIAN
2694
- // On little-endian targets, the bits of the even source lanes are already
2695
- // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2696
-
2697
- // Simply need to zero out the upper bits of each lane of the bitcasted
2698
- // vector.
2699
- return And(BitCast(d_to, v),
2700
- Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
2701
- #else
2702
- // On big-endian targets, need to shift each lane of the bitcasted vector
2703
- // right by kToLaneSize * 4 bits to get the bits of the even source lanes into
2704
- // the lower kToLaneSize * 4 bits of the result.
2705
-
2706
- // The right shift below will zero out the upper kToLaneSize * 4 bits of the
2707
- // result.
2708
- return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
2709
- #endif
2710
- }
2711
-
2712
- template <size_t kToLaneSize, class D, class V>
2713
- HWY_INLINE VFromD<D> PromoteOddTo(
2714
- hwy::UnsignedTag /*to_type_tag*/,
2715
- hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2716
- hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2717
- #if HWY_IS_LITTLE_ENDIAN
2718
- // On little-endian targets, need to shift each lane of the bitcasted vector
2719
- // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
2720
- // the lower kToLaneSize * 4 bits of the result.
2721
-
2722
- // The right shift below will zero out the upper kToLaneSize * 4 bits of the
2723
- // result.
2724
- return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
2725
- #else
2726
- // On big-endian targets, the bits of the even source lanes are already
2727
- // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
2728
-
2729
- // Simply need to zero out the upper bits of each lane of the bitcasted
2730
- // vector.
2731
- return And(BitCast(d_to, v),
2732
- Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
2733
- #endif
2734
- }
2735
-
2736
- // Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
2737
- // followed by BitCast to signed
2738
- template <size_t kToLaneSize, class D, class V>
2739
- HWY_INLINE VFromD<D> PromoteEvenTo(
2740
- hwy::SignedTag /*to_type_tag*/,
2741
- hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2742
- hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2743
- const RebindToUnsigned<decltype(d_to)> du_to;
2744
- return BitCast(d_to,
2745
- PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
2746
- hwy::UnsignedTag(), du_to, v));
2747
- }
2748
-
2749
- template <size_t kToLaneSize, class D, class V>
2750
- HWY_INLINE VFromD<D> PromoteOddTo(
2751
- hwy::SignedTag /*to_type_tag*/,
2752
- hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2753
- hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
2754
- const RebindToUnsigned<decltype(d_to)> du_to;
2755
- return BitCast(d_to,
2756
- PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
2757
- hwy::UnsignedTag(), du_to, v));
2758
- }
2759
-
2760
- // BF16->F32 PromoteEvenTo
2761
-
2762
- // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
2763
- // instead of hwy::FloatTag on targets that use scalable vectors.
2764
-
2765
- // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
2766
- // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
2767
-
2768
- // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
2769
- // to be a bfloat16_t vector.
2770
- template <class FromTypeTag, class DF32, class VBF16,
2771
- class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
2772
- hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
2773
- HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
2774
- hwy::SizeTag<4> /*to_lane_size_tag*/,
2775
- FromTypeTag /*from_type_tag*/, DF32 d_to,
2776
- VBF16 v) {
2777
- const RebindToUnsigned<decltype(d_to)> du_to;
2778
- #if HWY_IS_LITTLE_ENDIAN
2779
- // On little-endian platforms, need to shift left each lane of the bitcasted
2780
- // vector by 16 bits.
2781
- return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
2782
- #else
2783
- // On big-endian platforms, the even lanes of the source vector are already
2784
- // in the upper 16 bits of the lanes of the bitcasted vector.
2785
-
2786
- // Need to simply zero out the lower 16 bits of each lane of the bitcasted
2787
- // vector.
2788
- return BitCast(d_to,
2789
- And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
2790
- #endif
2791
- }
2792
-
2793
- // BF16->F32 PromoteOddTo
2794
-
2795
- // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
2796
- // instead of hwy::FloatTag on targets that use scalable vectors.
2797
-
2798
- // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
2799
- // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
2800
-
2801
- // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
2802
- // to be a bfloat16_t vector.
2803
- template <class FromTypeTag, class DF32, class VBF16,
2804
- class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
2805
- hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
2806
- HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
2807
- hwy::SizeTag<4> /*to_lane_size_tag*/,
2808
- FromTypeTag /*from_type_tag*/, DF32 d_to,
2809
- VBF16 v) {
2810
- const RebindToUnsigned<decltype(d_to)> du_to;
2811
- #if HWY_IS_LITTLE_ENDIAN
2812
- // On little-endian platforms, the odd lanes of the source vector are already
2813
- // in the upper 16 bits of the lanes of the bitcasted vector.
2814
-
2815
- // Need to simply zero out the lower 16 bits of each lane of the bitcasted
2816
- // vector.
2817
- return BitCast(d_to,
2818
- And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
2819
- #else
2820
- // On big-endian platforms, need to shift left each lane of the bitcasted
2821
- // vector by 16 bits.
2822
- return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
2823
- #endif
2824
- }
2825
-
2826
- // Default PromoteEvenTo/PromoteOddTo implementations
2827
- template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
2828
- class V, HWY_IF_LANES_D(D, 1)>
2829
- HWY_INLINE VFromD<D> PromoteEvenTo(
2830
- ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2831
- FromTypeTag /*from_type_tag*/, D d_to, V v) {
2832
- return PromoteLowerTo(d_to, v);
2833
- }
2834
-
2835
- template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
2836
- class V, HWY_IF_LANES_GT_D(D, 1)>
2837
- HWY_INLINE VFromD<D> PromoteEvenTo(
2838
- ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2839
- FromTypeTag /*from_type_tag*/, D d_to, V v) {
2840
- const DFromV<decltype(v)> d;
2841
- return PromoteLowerTo(d_to, ConcatEven(d, v, v));
2842
- }
2843
-
2844
- template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
2845
- class V>
2846
- HWY_INLINE VFromD<D> PromoteOddTo(
2847
- ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
2848
- FromTypeTag /*from_type_tag*/, D d_to, V v) {
2849
- const DFromV<decltype(v)> d;
2850
- return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
2851
- }
2852
-
2853
- } // namespace detail
2854
-
2855
- template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
2856
- class V2 = VFromD<Repartition<TFromV<V>, D>>,
2857
- HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
2858
- HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
2859
- return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
2860
- hwy::SizeTag<sizeof(TFromD<D>)>(),
2861
- hwy::TypeTag<TFromV<V>>(), d, v);
2862
- }
2863
-
2864
- template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
2865
- class V2 = VFromD<Repartition<TFromV<V>, D>>,
2866
- HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
2867
- HWY_API VFromD<D> PromoteOddTo(D d, V v) {
2868
- return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
2869
- hwy::SizeTag<sizeof(TFromD<D>)>(),
2870
- hwy::TypeTag<TFromV<V>>(), d, v);
2871
- }
2872
- #endif // HWY_TARGET != HWY_SCALAR
2873
-
2874
2775
  // ------------------------------ float16_t <-> float
2875
2776
 
2876
2777
  #if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
@@ -2924,7 +2825,7 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
2924
2825
  // We also want to biased exponent of round_incr[i] to be less than or equal
2925
2826
  // to 255 (which is equal to MaxExponentField<float>())
2926
2827
 
2927
- // The biased F64 exponent of round_incr is equal to
2828
+ // The biased F32 exponent of round_incr is equal to
2928
2829
  // HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126)
2929
2830
 
2930
2831
  // hi9_bits[i] is equal to the upper 9 bits of v[i]
@@ -3010,24 +2911,31 @@ HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
3010
2911
  // ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126
3011
2912
 
3012
2913
  #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
2914
+ const auto k157Shl10 = Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10));
3013
2915
  auto f16_exp_bits =
3014
2916
  Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)),
3015
2917
  And(rounded_val_bits,
3016
2918
  Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))),
3017
- Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10)));
2919
+ k157Shl10);
2920
+ const auto f16_result_is_inf_mask =
2921
+ RebindMask(df32, Eq(f16_exp_bits, k157Shl10));
3018
2922
  #else
3019
- auto f16_exp_bits = ShiftLeft<10>(BitCast(
2923
+ const auto k157 = Set(du32, uint32_t{157});
2924
+ auto f16_exp_bits = BitCast(
3020
2925
  du32,
3021
2926
  Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits),
3022
2927
  BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
3023
- BitCast(du32_as_u8, Set(du32, uint32_t{157})))));
2928
+ BitCast(du32_as_u8, k157)));
2929
+ const auto f16_result_is_inf_mask = RebindMask(df32, Eq(f16_exp_bits, k157));
2930
+ f16_exp_bits = ShiftLeft<10>(f16_exp_bits);
3024
2931
  #endif
3025
2932
 
3026
2933
  f16_exp_bits =
3027
2934
  Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10)));
3028
2935
 
3029
2936
  const auto f16_unmasked_mant_bits =
3030
- BitCast(di32, Or(rounded_val, VecFromMask(df32, IsNaN(rounded_val))));
2937
+ BitCast(di32, Or(IfThenZeroElse(f16_result_is_inf_mask, rounded_val),
2938
+ VecFromMask(df32, IsNaN(rounded_val))));
3031
2939
 
3032
2940
  const auto f16_exp_mant_bits =
3033
2941
  OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
@@ -3094,9 +3002,224 @@ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
3094
3002
 
3095
3003
  #endif // HWY_NATIVE_PROMOTE_F16_TO_F64
3096
3004
 
3005
+ // ------------------------------ F32 to BF16 DemoteTo
3006
+ #if (defined(HWY_NATIVE_DEMOTE_F32_TO_BF16) == defined(HWY_TARGET_TOGGLE))
3007
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
3008
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
3009
+ #else
3010
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
3011
+ #endif
3012
+
3013
+ namespace detail {
3014
+
3015
+ // Round a F32 value to the nearest BF16 value, with the result returned as the
3016
+ // rounded F32 value bitcasted to an U32
3017
+
3018
+ // RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
3019
+ // NaN F32 values from being converted to an infinity
3020
+ template <class V, HWY_IF_F32(TFromV<V>)>
3021
+ HWY_INLINE VFromD<RebindToUnsigned<DFromV<V>>> RoundF32ForDemoteToBF16(V v) {
3022
+ const DFromV<decltype(v)> d;
3023
+ const RebindToUnsigned<decltype(d)> du32;
3024
+
3025
+ const auto is_non_nan = Not(IsNaN(v));
3026
+ const auto bits32 = BitCast(du32, v);
3027
+
3028
+ const auto round_incr =
3029
+ Add(And(ShiftRight<16>(bits32), Set(du32, uint32_t{1})),
3030
+ Set(du32, uint32_t{0x7FFFu}));
3031
+ return MaskedAddOr(Or(bits32, Set(du32, uint32_t{0x00400000u})),
3032
+ RebindMask(du32, is_non_nan), bits32, round_incr);
3033
+ }
3034
+
3035
+ } // namespace detail
3036
+
3037
+ template <class D, HWY_IF_BF16_D(D)>
3038
+ HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
3039
+ const RebindToUnsigned<decltype(dbf16)> du16;
3040
+ const Twice<decltype(du16)> dt_u16;
3041
+
3042
+ const auto rounded_bits = BitCast(dt_u16, detail::RoundF32ForDemoteToBF16(v));
3043
+ #if HWY_IS_LITTLE_ENDIAN
3044
+ return BitCast(
3045
+ dbf16, LowerHalf(du16, ConcatOdd(dt_u16, rounded_bits, rounded_bits)));
3046
+ #else
3047
+ return BitCast(
3048
+ dbf16, LowerHalf(du16, ConcatEven(dt_u16, rounded_bits, rounded_bits)));
3049
+ #endif
3050
+ }
3051
+
3052
+ template <class D, HWY_IF_BF16_D(D)>
3053
+ HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
3054
+ VFromD<Repartition<float, D>> b) {
3055
+ const RebindToUnsigned<decltype(dbf16)> du16;
3056
+
3057
+ const auto rounded_a_bits32 =
3058
+ BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
3059
+ const auto rounded_b_bits32 =
3060
+ BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
3061
+ #if HWY_IS_LITTLE_ENDIAN
3062
+ return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, rounded_b_bits32),
3063
+ BitCast(du16, rounded_a_bits32)));
3064
+ #else
3065
+ return BitCast(dbf16, ConcatEven(du16, BitCast(du16, rounded_b_bits32),
3066
+ BitCast(du16, rounded_a_bits32)));
3067
+ #endif
3068
+ }
3069
+
3070
+ template <class D, HWY_IF_BF16_D(D)>
3071
+ HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
3072
+ VFromD<Repartition<float, D>> b) {
3073
+ const RebindToUnsigned<decltype(dbf16)> du16;
3074
+
3075
+ #if HWY_IS_LITTLE_ENDIAN
3076
+ const auto a_in_odd = detail::RoundF32ForDemoteToBF16(a);
3077
+ const auto b_in_even = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(b));
3078
+ #else
3079
+ const auto a_in_odd = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(a));
3080
+ const auto b_in_even = detail::RoundF32ForDemoteToBF16(b);
3081
+ #endif
3082
+
3083
+ return BitCast(dbf16,
3084
+ OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even)));
3085
+ }
3086
+
3087
+ #endif // HWY_NATIVE_DEMOTE_F32_TO_BF16
3088
+
3089
+ // ------------------------------ PromoteInRangeTo
3090
+ #if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \
3091
+ defined(HWY_TARGET_TOGGLE))
3092
+ #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3093
+ #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3094
+ #else
3095
+ #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3096
+ #endif
3097
+
3098
+ #if HWY_HAVE_INTEGER64
3099
+ template <class D64, HWY_IF_UI64_D(D64)>
3100
+ HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
3101
+ return PromoteTo(d64, v);
3102
+ }
3103
+ #endif
3104
+
3105
+ #endif // HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
3106
+
3107
+ // ------------------------------ ConvertInRangeTo
3108
+ #if (defined(HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO) == defined(HWY_TARGET_TOGGLE))
3109
+ #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3110
+ #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3111
+ #else
3112
+ #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3113
+ #endif
3114
+
3115
+ template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
3116
+ HWY_IF_T_SIZE_ONE_OF_D(DI, (HWY_HAVE_FLOAT16 ? (1 << 2) : 0) |
3117
+ (1 << 4) |
3118
+ (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
3119
+ HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
3120
+ return ConvertTo(di, v);
3121
+ }
3122
+
3123
+ #endif // HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
3124
+
3125
+ // ------------------------------ DemoteInRangeTo
3126
+ #if (defined(HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO) == \
3127
+ defined(HWY_TARGET_TOGGLE))
3128
+ #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3129
+ #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3130
+ #else
3131
+ #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3132
+ #endif
3133
+
3134
+ #if HWY_HAVE_FLOAT64
3135
+ template <class D32, HWY_IF_UI32_D(D32)>
3136
+ HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
3137
+ return DemoteTo(d32, v);
3138
+ }
3139
+ #endif
3140
+
3141
+ #endif // HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
3142
+
3143
+ // ------------------------------ PromoteInRangeLowerTo/PromoteInRangeUpperTo
3144
+
3145
+ template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
3146
+ HWY_API VFromD<D> PromoteInRangeLowerTo(D d, V v) {
3147
+ // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
3148
+ // because it cannot be deduced from D (could be either bf16 or f16).
3149
+ const Rebind<TFromV<V>, decltype(d)> dh;
3150
+ return PromoteInRangeTo(d, LowerHalf(dh, v));
3151
+ }
3152
+
3153
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
3154
+ template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
3155
+ HWY_API VFromD<D> PromoteInRangeUpperTo(D d, V v) {
3156
+ #if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
3157
+ (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
3158
+ // On targets that provide target-specific implementations of F32->UI64
3159
+ // PromoteInRangeTo, promote the upper half of v using PromoteInRangeTo
3160
+
3161
+ // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
3162
+ // because it cannot be deduced from D (could be either bf16 or f16).
3163
+ const Rebind<TFromV<V>, decltype(d)> dh;
3164
+ return PromoteInRangeTo(d, UpperHalf(dh, v));
3165
+ #else
3166
+ // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
3167
+ // around F32->UI64 PromoteTo, promote the upper half of v to TFromD<D> using
3168
+ // PromoteUpperTo
3169
+ return PromoteUpperTo(d, v);
3170
+ #endif
3171
+ }
3172
+ #endif // HWY_TARGET != HWY_SCALAR
3173
+
3174
+ // ------------------------------ PromoteInRangeEvenTo/PromoteInRangeOddTo
3175
+
3176
+ template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
3177
+ HWY_API VFromD<D> PromoteInRangeEvenTo(D d, V v) {
3178
+ #if HWY_TARGET == HWY_SCALAR
3179
+ return PromoteInRangeTo(d, v);
3180
+ #elif (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
3181
+ (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
3182
+ // On targets that provide target-specific implementations of F32->UI64
3183
+ // PromoteInRangeTo, promote the even lanes of v using PromoteInRangeTo
3184
+
3185
+ // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
3186
+ // because it cannot be deduced from D (could be either bf16 or f16).
3187
+ const DFromV<decltype(v)> d_from;
3188
+ const Rebind<TFromV<V>, decltype(d)> dh;
3189
+ return PromoteInRangeTo(d, LowerHalf(dh, ConcatEven(d_from, v, v)));
3190
+ #else
3191
+ // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
3192
+ // around F32->UI64 PromoteTo, promote the even lanes of v to TFromD<D> using
3193
+ // PromoteEvenTo
3194
+ return PromoteEvenTo(d, v);
3195
+ #endif // HWY_TARGET == HWY_SCALAR
3196
+ }
3197
+
3198
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
3199
+ template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
3200
+ HWY_API VFromD<D> PromoteInRangeOddTo(D d, V v) {
3201
+ #if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
3202
+ (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
3203
+ // On targets that provide target-specific implementations of F32->UI64
3204
+ // PromoteInRangeTo, promote the odd lanes of v using PromoteInRangeTo
3205
+
3206
+ // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
3207
+ // because it cannot be deduced from D (could be either bf16 or f16).
3208
+ const DFromV<decltype(v)> d_from;
3209
+ const Rebind<TFromV<V>, decltype(d)> dh;
3210
+ return PromoteInRangeTo(d, LowerHalf(dh, ConcatOdd(d_from, v, v)));
3211
+ #else
3212
+ // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
3213
+ // around F32->UI64 PromoteTo, promote the odd lanes of v to TFromD<D> using
3214
+ // PromoteOddTo
3215
+ return PromoteOddTo(d, v);
3216
+ #endif
3217
+ }
3218
+ #endif // HWY_TARGET != HWY_SCALAR
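A sketch of the half- and even/odd-lane promotions defined above (assuming 64-bit lanes are available, a non-scalar target, and inputs already within int64 range; names are illustrative):

    // Splits one full f32 vector into two i64 vectors.
    void F32ToI64Pairs(hn::Vec<hn::ScalableTag<float>> vf,
                       hn::Vec<hn::ScalableTag<int64_t>>& lo,
                       hn::Vec<hn::ScalableTag<int64_t>>& hi) {
      const hn::ScalableTag<int64_t> di64;
      lo = hn::PromoteInRangeLowerTo(di64, vf);  // from lanes [0, N/2)
      hi = hn::PromoteInRangeUpperTo(di64, vf);  // from lanes [N/2, N)
      // PromoteInRangeEvenTo/PromoteInRangeOddTo select alternating lanes
      // instead, which avoids lane-crossing work on some targets.
    }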
3219
+
3097
3220
  // ------------------------------ SumsOf2
3098
3221
 
3099
- #if HWY_TARGET != HWY_SCALAR
3222
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
3100
3223
  namespace detail {
3101
3224
 
3102
3225
  template <class TypeTag, size_t kLaneSize, class V>
@@ -3220,7 +3343,7 @@ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
3220
3343
  return TruncateTo(d, f32_biased_exp_as_u32);
3221
3344
  }
3222
3345
 
3223
- #if HWY_TARGET != HWY_SCALAR
3346
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
3224
3347
  template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
3225
3348
  HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
3226
3349
  const Half<decltype(d)> dh;
@@ -3252,7 +3375,7 @@ HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
3252
3375
  return U8FromU32(f32_biased_exp_as_u32);
3253
3376
  }
3254
3377
 
3255
- #if HWY_TARGET != HWY_SCALAR
3378
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
3256
3379
  template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4),
3257
3380
  HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 2)>
3258
3381
  HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
@@ -3549,7 +3672,7 @@ HWY_INLINE V InvSubBytes(V state) {
3549
3672
  #endif
3550
3673
 
3551
3674
  // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
3552
- #if HWY_TARGET != HWY_SCALAR
3675
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
3553
3676
 
3554
3677
  namespace detail {
3555
3678
 
@@ -3972,12 +4095,11 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
3972
4095
  // MulAddSub for F16/F32/F64 vectors with 2 or more lanes on
3973
4096
  // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and
3974
4097
  // x86_512-inl.h
3975
- template <class V, HWY_IF_LANES_GT_D(DFromV<V>, 1),
3976
- HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | ((HWY_TARGET <= HWY_SSSE3 &&
3977
- hwy::IsFloat<TFromV<V>>())
3978
- ? 0
3979
- : ((1 << 2) | (1 << 4) |
3980
- (1 << 8))))>
4098
+
4099
+ // MulAddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h
4100
+
4101
+ // MulAddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
4102
+ template <class V, HWY_IF_MULADDSUB_V(V)>
3981
4103
  HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
3982
4104
  using D = DFromV<V>;
3983
4105
  using T = TFromD<D>;
@@ -4001,9 +4123,17 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
4001
4123
 
4002
4124
  namespace detail {
4003
4125
 
4126
+ // DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo are okay to use in
4127
+ // the implementation of detail::IntDiv in generic_ops-inl.h as the current
4128
+ // implementations of DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo
4129
+ // will convert values that are outside of the range of TFromD<DI> by either
4130
+ // saturation, truncation, or converting values that are outside of the
4131
+ // destination range to LimitsMin<TFromD<DI>>() (which is equal to
4132
+ // static_cast<TFromD<DI>>(LimitsMax<TFromD<DI>>() + 1))
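Concretely, for 32-bit lanes the wrapped value coincides with the type minimum (an illustrative aside, not part of the header):

    static_assert(hwy::LimitsMax<int32_t>() == 2147483647, "");
    static_assert(hwy::LimitsMin<int32_t>() == -2147483647 - 1, "");
    // 2147483647 + 1 wraps modulo 2^32 to 0x80000000, i.e. LimitsMin<int32_t>(),
    // which the remainder fix-up in IntDiv below tolerates.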
4133
+
4004
4134
  template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
4005
4135
  HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) {
4006
- return ConvertTo(di, vf);
4136
+ return ConvertInRangeTo(di, vf);
4007
4137
  }
4008
4138
 
4009
4139
  template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
@@ -4014,7 +4144,7 @@ HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) {
4014
4144
  #if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
4015
4145
  template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
4016
4146
  HWY_INLINE Vec<D> IntDivConvFloatToInt(D df, V vi) {
4017
- return PromoteTo(df, vi);
4147
+ return PromoteInRangeTo(df, vi);
4018
4148
  }
4019
4149
 
4020
4150
  // If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32
@@ -4085,8 +4215,13 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4085
4215
  // the case where the magnitude of an inexact floating point division result
4086
4216
  // is rounded up.
4087
4217
 
4088
- #if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4089
- !HWY_HAVE_FLOAT64
4218
+ // It is okay to do conversions from MakeFloat<TFromV<V>> to TFromV<V> using
4219
+ // ConvertInRangeTo if sizeof(TFromV<V>) > kOrigLaneSize as the result of the
4220
+ // floating point division is always greater than LimitsMin<TFromV<V>>() and
4221
+ // less than LimitsMax<TFromV<V>>() if sizeof(TFromV<V>) > kOrigLaneSize and
4222
+ // b[i] != 0.
4223
+
4224
+ #if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64
4090
4225
  // On Armv7, do division by multiplying by the ApproximateReciprocal
4091
4226
  // to avoid unnecessary overhead as F32 Div refines the approximate
4092
4227
  // reciprocal using 4 Newton-Raphson iterations
@@ -4101,7 +4236,7 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4101
4236
  Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
4102
4237
  }
4103
4238
 
4104
- auto q0 = ConvertTo(d, Mul(ConvertTo(df, a), flt_recip_b));
4239
+ auto q0 = ConvertInRangeTo(d, Mul(ConvertTo(df, a), flt_recip_b));
4105
4240
  const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
4106
4241
 
4107
4242
  auto r1 = r0;
@@ -4143,7 +4278,7 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4143
4278
  #else
4144
4279
  // On targets other than Armv7 NEON, use F16 or F32 division as most targets
4145
4280
  // other than Armv7 NEON have native F32 divide instructions
4146
- return ConvertTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
4281
+ return ConvertInRangeTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
4147
4282
  #endif
4148
4283
  }
4149
4284
 
@@ -4184,8 +4319,7 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4184
4319
 
4185
4320
  const auto flt_b = IntDivConvIntToFloat(df, b);
4186
4321
 
4187
- #if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \
4188
- !HWY_HAVE_FLOAT64
4322
+ #if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64
4189
4323
  auto flt_recip_b = ApproximateReciprocal(flt_b);
4190
4324
  flt_recip_b =
4191
4325
  Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
@@ -4193,10 +4327,40 @@ HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
4193
4327
  const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b);
4194
4328
  #endif
4195
4329
 
4330
+ // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
4331
+ // IntDivConvFloatToInt returns incorrect results in any lanes where b[i] == 0
4332
+ // as the result of IntDivUsingFloatDiv(a, b) is implementation-defined in any
4333
+ // lanes where b[i] == 0.
4334
+
4335
+ // If ScalarAbs(b[i]) == 1 is true, then it is possible for
4336
+ // a[i] * flt_recip_b[i] to be rounded up to a value that is outside of the
4337
+ // range of T. If a[i] * flt_recip_b[i] is outside of the range of T,
4338
+ // IntDivConvFloatToInt will convert any values that are out of the range of T
4339
+ // by either saturation, truncation, or wrapping around to LimitsMin<T>().
4340
+
4341
+ // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
4342
+ // IntDivConvFloatToInt wraps around if ScalarAbs(b[i]) == 1 as r0 will have
4343
+ // the correct sign if ScalarAbs(b[i]) == 1, even in the cases where the
4344
+ // conversion of a[i] * flt_recip_b[i] to T using IntDivConvFloatToInt is
4345
+ // truncated or wraps around.
4346
+
4347
+ // If ScalarAbs(b[i]) >= 2 is true, a[i] * flt_recip_b[i] will be within the
4348
+ // range of T, even in the cases where the conversion of a[i] to TF is
4349
+ // rounded up or the result of multiplying a[i] by flt_recip_b[i] is rounded
4350
+ // up.
4351
+
4352
+ // ScalarAbs(r0[i]) will also always be less than (LimitsMax<T>() / 2) if
4353
+ // b[i] != 0, even in the cases where the conversion of a[i] * flt_recip_b[i]
4354
+ // to T using IntDivConvFloatToInt is truncated or is wrapped around.
4355
+
4196
4356
  auto q0 =
4197
4357
  IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
4198
4358
  const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
4199
4359
 
4360
+ // If b[i] != 0 is true, r0[i] * flt_recip_b[i] is always within the range of
4361
+ // T, even in the cases where the conversion of r0[i] to TF is rounded up or
4362
+ // the multiplication of r0[i] by flt_recip_b[i] is rounded up.
4363
+
4200
4364
  auto q1 =
4201
4365
  IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
4202
4366
  const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);
@@ -4380,7 +4544,12 @@ HWY_INLINE V IntDiv(V a, V b) {
4380
4544
  const DFromV<decltype(a)> d;
4381
4545
  const Rebind<double, decltype(d)> df64;
4382
4546
 
4383
- return DemoteTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
4547
+ // It is okay to demote the F64 Div result to int32_t or uint32_t using
4548
+ // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
4549
+ // will always be within the range of TFromV<V> if b[i] != 0 and
4550
+ // sizeof(TFromV<V>) <= 4.
4551
+
4552
+ return DemoteInRangeTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
4384
4553
  }
4385
4554
  template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
4386
4555
  HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
@@ -4389,9 +4558,16 @@ HWY_INLINE V IntDiv(V a, V b) {
4389
4558
  const Half<decltype(d)> dh;
4390
4559
  const Repartition<double, decltype(d)> df64;
4391
4560
 
4392
- return Combine(
4393
- d, DemoteTo(dh, Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b))),
4394
- DemoteTo(dh, Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b))));
4561
+ // It is okay to demote the F64 Div result to int32_t or uint32_t using
4562
+ // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
4563
+ // will always be within the range of TFromV<V> if b[i] != 0 and
4564
+ // sizeof(TFromV<V>) <= 4.
4565
+
4566
+ const VFromD<decltype(df64)> div1 =
4567
+ Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b));
4568
+ const VFromD<decltype(df64)> div0 =
4569
+ Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b));
4570
+ return Combine(d, DemoteInRangeTo(dh, div1), DemoteInRangeTo(dh, div0));
4395
4571
  }
4396
4572
  #endif // HWY_HAVE_FLOAT64
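Why the f64 path needs no correction step: 32-bit operands and their quotient are exactly representable in an f64 (53-bit significand), so one divide plus a truncating demote already produces the exact integer quotient. A scalar model of the lane-wise computation above (illustrative; excludes b == 0, whose result is unspecified, and the signed case is analogous apart from INT_MIN / -1, which is undefined for integer division anyway):

    static inline uint32_t ScalarDivViaF64(uint32_t a, uint32_t b) {
      // The divide's rounding error is below 1/b because a < 2^53, so
      // truncation recovers the exact integer quotient.
      return static_cast<uint32_t>(static_cast<double>(a) /
                                   static_cast<double>(b));
    }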
4397
4573
 
@@ -4479,6 +4655,96 @@ HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
4479
4655
 
4480
4656
  #endif // HWY_NATIVE_INT_DIV
4481
4657
 
4658
+ // ------------------------------ MulEvenAdd (PromoteEvenTo)
4659
+
4660
+ // SVE with bf16 and NEON with bf16 override this.
4661
+ #if (defined(HWY_NATIVE_MUL_EVEN_BF16) == defined(HWY_TARGET_TOGGLE))
4662
+ #ifdef HWY_NATIVE_MUL_EVEN_BF16
4663
+ #undef HWY_NATIVE_MUL_EVEN_BF16
4664
+ #else
4665
+ #define HWY_NATIVE_MUL_EVEN_BF16
4666
+ #endif
4667
+
4668
+ template <class DF, HWY_IF_F32_D(DF),
4669
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
4670
+ HWY_API VFromD<DF> MulEvenAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
4671
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), c);
4672
+ }
4673
+
4674
+ template <class DF, HWY_IF_F32_D(DF),
4675
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
4676
+ HWY_API VFromD<DF> MulOddAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
4677
+ return MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), c);
4678
+ }
4679
+
4680
+ #endif // HWY_NATIVE_MUL_EVEN_BF16
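A sketch of how the even/odd bf16 multiply-adds combine into a per-f32-lane pairwise dot product (same assumptions as before; VBF has twice as many lanes as the f32 accumulator, and the function name is illustrative):

    // c[i] += a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]
    template <class DF,
              class VBF = hn::VFromD<hn::Repartition<hwy::bfloat16_t, DF>>>
    hn::VFromD<DF> PairwiseDotAccumulate(DF df, VBF a, VBF b, hn::VFromD<DF> c) {
      c = hn::MulEvenAdd(df, a, b, c);
      return hn::MulOddAdd(df, a, b, c);
    }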
4681
+
4682
+ // ------------------------------ ReorderWidenMulAccumulate (MulEvenAdd)
4683
+
4684
+ // AVX3_SPR/ZEN4, and NEON with bf16 but not(!) SVE override this.
4685
+ #if (defined(HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16) == \
4686
+ defined(HWY_TARGET_TOGGLE))
4687
+ #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
4688
+ #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
4689
+ #else
4690
+ #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
4691
+ #endif
4692
+
4693
+ template <class DF, HWY_IF_F32_D(DF),
4694
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
4695
+ HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
4696
+ VFromD<DF> sum0,
4697
+ VFromD<DF>& sum1) {
4698
+ // Lane order within sum0/1 is undefined, hence we can avoid the
4699
+ // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
4700
+ sum1 = MulOddAdd(df, a, b, sum1);
4701
+ return MulEvenAdd(df, a, b, sum0);
4702
+ }
4703
+
4704
+ #endif // HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
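The canonical use is a bf16 dot product: accumulate into two f32 vectors whose lane order is unspecified, then reduce once at the end (sketch under the same assumptions; `size` is assumed to be a multiple of Lanes(dbf)):

    float DotBF16(const hwy::bfloat16_t* HWY_RESTRICT a,
                  const hwy::bfloat16_t* HWY_RESTRICT b, size_t size) {
      const hn::ScalableTag<float> df;
      const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf;
      const size_t NBF = hn::Lanes(dbf);
      auto sum0 = hn::Zero(df);
      auto sum1 = hn::Zero(df);
      for (size_t i = 0; i < size; i += NBF) {
        sum0 = hn::ReorderWidenMulAccumulate(df, hn::LoadU(dbf, a + i),
                                             hn::LoadU(dbf, b + i), sum0, sum1);
      }
      // The per-lane order is unspecified, but the total does not depend on it.
      return hn::ReduceSum(df, hn::Add(sum0, sum1));
    }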
4705
+
4706
+ // ------------------------------ WidenMulAccumulate
4707
+
4708
+ #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE) == defined(HWY_TARGET_TOGGLE))
4709
+ #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
4710
+ #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
4711
+ #else
4712
+ #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
4713
+ #endif
4714
+
4715
+ template<class D, HWY_IF_INTEGER(TFromD<D>),
4716
+ class DN = RepartitionToNarrow<D>>
4717
+ HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
4718
+ VFromD<D> low, VFromD<D>& high) {
4719
+ high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high);
4720
+ return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
4721
+ }
4722
+
4723
+ #endif // HWY_NATIVE_WIDEN_MUL_ACCUMULATE
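WidenMulAccumulate is the lane-ordered counterpart: products of the narrow vector's lower half accumulate into the returned value and products of its upper half into `high`. A sketch for i16 -> i32 (assumptions and names as before):

    void WidenAcc(const int16_t* HWY_RESTRICT pm, const int16_t* HWY_RESTRICT px,
                  int32_t* HWY_RESTRICT out_lo, int32_t* HWY_RESTRICT out_hi) {
      const hn::ScalableTag<int32_t> d32;
      const hn::RepartitionToNarrow<decltype(d32)> d16;  // int16_t, 2x lanes
      auto lo = hn::Zero(d32);
      auto hi = hn::Zero(d32);
      lo = hn::WidenMulAccumulate(d32, hn::LoadU(d16, pm), hn::LoadU(d16, px),
                                  lo, hi);
      hn::StoreU(lo, d32, out_lo);  // widened products of i16 lanes [0, N)
      hn::StoreU(hi, d32, out_hi);  // widened products of i16 lanes [N, 2N)
    }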
4724
+
4725
+ #if 0
4726
+ #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16) == defined(HWY_TARGET_TOGGLE))
4727
+
4728
+ #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
4729
+ #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
4730
+ #else
4731
+ #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
4732
+ #endif
4733
+
4734
+ #if HWY_HAVE_FLOAT16
4735
+
4736
+ template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
4737
+ HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
4738
+ VFromD<D> low, VFromD<D>& high) {
4739
+ high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high);
4740
+ return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
4741
+ }
4742
+
4743
+ #endif // HWY_HAVE_FLOAT16
4744
+
4745
+ #endif // HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
4746
+ #endif // #if 0
4747
+
4482
4748
  // ------------------------------ SatWidenMulPairwiseAdd
4483
4749
 
4484
4750
  #if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \
@@ -4509,6 +4775,66 @@ HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
4509
4775
 
4510
4776
  #endif
4511
4777
 
4778
+ // ------------------------------ SatWidenMulPairwiseAccumulate
4779
+
4780
+ #if (defined(HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM) == \
4781
+ defined(HWY_TARGET_TOGGLE))
4782
+
4783
+ #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
4784
+ #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
4785
+ #else
4786
+ #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
4787
+ #endif
4788
+
4789
+ template <class DI32, HWY_IF_I32_D(DI32)>
4790
+ HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
4791
+ DI32 di32, VFromD<Repartition<int16_t, DI32>> a,
4792
+ VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
4793
+ // WidenMulPairwiseAdd(di32, a, b) is okay here as
4794
+ // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as
4795
+ // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if
4796
+ // a[0], b[0], a[1], and b[1] are all equal to -32768.
4797
+
4798
+ const auto product = WidenMulPairwiseAdd(di32, a, b);
4799
+
4800
+ const auto mul_overflow =
4801
+ VecFromMask(di32, Eq(product, Set(di32, LimitsMin<int32_t>())));
4802
+
4803
+ return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
4804
+ Add(product, mul_overflow));
4805
+ }
4806
+
4807
+ #endif // HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
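A scalar model of the intended result may make the compensation above clearer: the widened pair sum can only exceed int32 range when all four i16 inputs are -32768, and the vector code corrects exactly that case before the saturated add (sketch, not part of the header):

    int32_t ScalarSatWidenMulPairwiseAccumulate(int16_t a0, int16_t b0,
                                                int16_t a1, int16_t b1,
                                                int32_t sum) {
      // pair is in [-2147418112, 2147483648]; only the upper end overflows i32.
      const int64_t pair = int64_t{a0} * b0 + int64_t{a1} * b1;
      const int64_t total = pair + sum;
      return static_cast<int32_t>(
          HWY_MIN(HWY_MAX(total, int64_t{hwy::LimitsMin<int32_t>()}),
                  int64_t{hwy::LimitsMax<int32_t>()}));
    }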
4808
+
4809
+ // ------------------------------ SatWidenMulAccumFixedPoint
4810
+
4811
+ #if (defined(HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT) == \
4812
+ defined(HWY_TARGET_TOGGLE))
4813
+
4814
+ #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
4815
+ #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
4816
+ #else
4817
+ #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
4818
+ #endif
4819
+
4820
+ template <class DI32, HWY_IF_I32_D(DI32)>
4821
+ HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
4822
+ VFromD<Rebind<int16_t, DI32>> a,
4823
+ VFromD<Rebind<int16_t, DI32>> b,
4824
+ VFromD<DI32> sum) {
4825
+ const Repartition<int16_t, DI32> dt_i16;
4826
+
4827
+ const auto vt_a = ResizeBitCast(dt_i16, a);
4828
+ const auto vt_b = ResizeBitCast(dt_i16, b);
4829
+
4830
+ const auto dup_a = InterleaveWholeLower(dt_i16, vt_a, vt_a);
4831
+ const auto dup_b = InterleaveWholeLower(dt_i16, vt_b, vt_b);
4832
+
4833
+ return SatWidenMulPairwiseAccumulate(di32, dup_a, dup_b, sum);
4834
+ }
4835
+
4836
+ #endif // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
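This is a generic "doubling" (Q15 fixed-point) multiply-accumulate in the spirit of Arm's SQDMLAL: each lane is approximately saturate(sum[i] + 2 * a[i] * b[i]); when the doubled product itself overflows int32 (only possible if both inputs are -32768) the exact result follows the saturation path above. A scalar sketch of the intended value (illustrative only):

    int32_t ScalarSatWidenMulAccumFixedPoint(int16_t a, int16_t b, int32_t sum) {
      const int64_t total = int64_t{sum} + 2 * int64_t{a} * b;  // Q15*Q15 -> Q31
      return static_cast<int32_t>(
          HWY_MIN(HWY_MAX(total, int64_t{hwy::LimitsMin<int32_t>()}),
                  int64_t{hwy::LimitsMax<int32_t>()}));
    }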
4837
+
4512
4838
  // ------------------------------ SumOfMulQuadAccumulate
4513
4839
 
4514
4840
  #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
@@ -5588,9 +5914,7 @@ using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned<D>())));
5588
5914
 
5589
5915
  // RVV/SVE have their own implementations of
5590
5916
  // TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, IndicesFromD<D> idx)
5591
- #if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \
5592
- HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
5593
- HWY_TARGET != HWY_SVE2_128
5917
+ #if HWY_TARGET != HWY_RVV && !HWY_TARGET_IS_SVE
5594
5918
  template <class D>
5595
5919
  HWY_API VFromD<D> TwoTablesLookupLanes(D /*d*/, VFromD<D> a, VFromD<D> b,
5596
5920
  IndicesFromD<D> idx) {
@@ -5780,7 +6104,7 @@ HWY_API V ReverseBits(V v) {
5780
6104
  #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
5781
6105
  #endif
5782
6106
 
5783
- #if HWY_TARGET != HWY_SCALAR
6107
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
5784
6108
  namespace detail {
5785
6109
 
5786
6110
  template <class D>
@@ -5794,7 +6118,7 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
5794
6118
  const ScalableTag<uint32_t, kLoadPow2> d_load;
5795
6119
  #else
5796
6120
  constexpr size_t kMaxBytes = d.MaxBytes();
5797
- #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
6121
+ #if HWY_TARGET_IS_NEON
5798
6122
  constexpr size_t kMinLanesToLoad = 2;
5799
6123
  #else
5800
6124
  constexpr size_t kMinLanesToLoad = 4;
@@ -5811,7 +6135,7 @@ HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
5811
6135
 
5812
6136
  #endif // HWY_NATIVE_PER4LANEBLKSHUF_DUP32
5813
6137
 
5814
- #if HWY_TARGET != HWY_SCALAR
6138
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
5815
6139
  namespace detail {
5816
6140
 
5817
6141
  template <class V>
@@ -5863,8 +6187,7 @@ HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3,
5863
6187
  d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0)));
5864
6188
  }
5865
6189
 
5866
- #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
5867
- HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_EMU128
6190
+ #if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_EMU128
5868
6191
  #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr
5869
6192
  #else
5870
6193
  #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8)
@@ -5965,7 +6288,7 @@ HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
5965
6288
  const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
5966
6289
  const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
5967
6290
  const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
5968
- #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
6291
+ #if HWY_TARGET_IS_NEON
5969
6292
  constexpr size_t kMinLanesToLoad = 4;
5970
6293
  #else
5971
6294
  constexpr size_t kMinLanesToLoad = 8;
@@ -6195,7 +6518,7 @@ HWY_API V Per4LaneBlockShuffle(V v) {
6195
6518
  return v;
6196
6519
  }
6197
6520
 
6198
- #if HWY_TARGET != HWY_SCALAR
6521
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
6199
6522
  template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
6200
6523
  HWY_IF_LANES_D(DFromV<V>, 2)>
6201
6524
  HWY_API V Per4LaneBlockShuffle(V v) {
@@ -6294,7 +6617,7 @@ HWY_API VFromD<D> Slide1Down(D d, VFromD<D> /*v*/) {
6294
6617
  return Zero(d);
6295
6618
  }
6296
6619
 
6297
- #if HWY_TARGET != HWY_SCALAR
6620
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
6298
6621
  template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
6299
6622
  HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
6300
6623
  return ShiftLeftLanes<1>(d, v);
@@ -6343,6 +6666,37 @@ HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
6343
6666
  }
6344
6667
  #endif
6345
6668
 
6669
+ // ------------------------------ Slide mask up/down
6670
+ #if (defined(HWY_NATIVE_SLIDE_MASK) == defined(HWY_TARGET_TOGGLE))
6671
+
6672
+ #ifdef HWY_NATIVE_SLIDE_MASK
6673
+ #undef HWY_NATIVE_SLIDE_MASK
6674
+ #else
6675
+ #define HWY_NATIVE_SLIDE_MASK
6676
+ #endif
6677
+
6678
+ template <class D>
6679
+ HWY_API Mask<D> SlideMask1Up(D d, Mask<D> m) {
6680
+ return MaskFromVec(Slide1Up(d, VecFromMask(d, m)));
6681
+ }
6682
+
6683
+ template <class D>
6684
+ HWY_API Mask<D> SlideMask1Down(D d, Mask<D> m) {
6685
+ return MaskFromVec(Slide1Down(d, VecFromMask(d, m)));
6686
+ }
6687
+
6688
+ template <class D>
6689
+ HWY_API Mask<D> SlideMaskUpLanes(D d, Mask<D> m, size_t amt) {
6690
+ return MaskFromVec(SlideUpLanes(d, VecFromMask(d, m), amt));
6691
+ }
6692
+
6693
+ template <class D>
6694
+ HWY_API Mask<D> SlideMaskDownLanes(D d, Mask<D> m, size_t amt) {
6695
+ return MaskFromVec(SlideDownLanes(d, VecFromMask(d, m), amt));
6696
+ }
6697
+
6698
+ #endif // HWY_NATIVE_SLIDE_MASK
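These wrappers make mask shifts available on every target by round-tripping through a vector. A typical use is carrying a per-lane predicate to the next lane (sketch, assumptions and names as before):

    // True in lanes whose *previous* lane exceeded `threshold`; lane 0 is false.
    template <class D>
    hn::Mask<D> PrevLaneExceeded(D d, hn::VFromD<D> v, hn::TFromD<D> threshold) {
      return hn::SlideMask1Up(d, hn::Gt(v, hn::Set(d, threshold)));
    }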
6699
+
6346
6700
  // ------------------------------ SumsOfAdjQuadAbsDiff
6347
6701
 
6348
6702
  #if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
@@ -6353,7 +6707,7 @@ HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
6353
6707
  #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
6354
6708
  #endif
6355
6709
 
6356
- #if HWY_TARGET != HWY_SCALAR
6710
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
6357
6711
  template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
6358
6712
  HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
6359
6713
  static_assert(0 <= kAOffset && kAOffset <= 1,
@@ -6377,8 +6731,7 @@ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
6377
6731
  // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
6378
6732
  constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
6379
6733
  const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
6380
- #elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
6381
- HWY_TARGET == HWY_SVE2_128
6734
+ #elif HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
6382
6735
  // On SVE targets, Lanes(d8_interleave) >= 16 and
6383
6736
  // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
6384
6737
  // tag for a full u8/i8 vector on SVE.
@@ -6457,7 +6810,7 @@ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
6457
6810
  #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
6458
6811
  #endif
6459
6812
 
6460
- #if HWY_TARGET != HWY_SCALAR
6813
+ #if HWY_TARGET != HWY_SCALAR || HWY_IDE
6461
6814
  template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
6462
6815
  HWY_IF_UI8_D(DFromV<V8>)>
6463
6816
  HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
@@ -6499,7 +6852,7 @@ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
6499
6852
  a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
6500
6853
  a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
6501
6854
  a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
6502
- #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
6855
+ #if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
6503
6856
  // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
6504
6857
  // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
6505
6858
  // lanes that are shifted into an adjacent 16-byte block as any lanes that are
@@ -6539,6 +6892,56 @@ HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
6539
6892
 
6540
6893
  #endif // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
6541
6894
 
6895
+ // ------------------------------ BitShuffle (Rol)
6896
+ #if (defined(HWY_NATIVE_BITSHUFFLE) == defined(HWY_TARGET_TOGGLE))
6897
+ #ifdef HWY_NATIVE_BITSHUFFLE
6898
+ #undef HWY_NATIVE_BITSHUFFLE
6899
+ #else
6900
+ #define HWY_NATIVE_BITSHUFFLE
6901
+ #endif
6902
+
6903
+ #if HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR
6904
+ template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>)>
6905
+ HWY_API V BitShuffle(V v, VI idx) {
6906
+ const DFromV<decltype(v)> d64;
6907
+ const RebindToUnsigned<decltype(d64)> du64;
6908
+ const Repartition<uint8_t, decltype(d64)> du8;
6909
+
6910
+ #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
6911
+ HWY_TARGET == HWY_WASM_EMU256
6912
+ const Repartition<uint16_t, decltype(d64)> d_idx_shr;
6913
+ #else
6914
+ const Repartition<uint8_t, decltype(d64)> d_idx_shr;
6915
+ #endif
6916
+
6917
+ #if HWY_IS_LITTLE_ENDIAN
6918
+ constexpr uint64_t kExtractedBitsMask =
6919
+ static_cast<uint64_t>(0x8040201008040201u);
6920
+ #else
6921
+ constexpr uint64_t kExtractedBitsMask =
6922
+ static_cast<uint64_t>(0x0102040810204080u);
6923
+ #endif
6924
+
6925
+ const auto byte_idx = BitwiseIfThenElse(
6926
+ Set(du8, uint8_t{0x07}),
6927
+ BitCast(du8, ShiftRight<3>(BitCast(d_idx_shr, idx))),
6928
+ BitCast(du8, Dup128VecFromValues(du64, uint64_t{0},
6929
+ uint64_t{0x0808080808080808u})));
6930
+ // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
6931
+ // and left by iota & 7 to put it in the correct output bit. To correctly
6932
+ // handle shift counts from -7 to 7, we rotate.
6933
+ const auto rotate_left_bits = Sub(Iota(du8, uint8_t{0}), BitCast(du8, idx));
6934
+
6935
+ const auto extracted_bits =
6936
+ And(Rol(TableLookupBytes(v, byte_idx), rotate_left_bits),
6937
+ BitCast(du8, Set(du64, kExtractedBitsMask)));
6938
+ // Combine bit-sliced (one bit per byte) into one 64-bit sum.
6939
+ return BitCast(d64, SumsOf8(extracted_bits));
6940
+ }
6941
+ #endif // HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR
6942
+
6943
+ #endif // HWY_NATIVE_BITSHUFFLE
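BitShuffle gathers, for each 64-bit lane, the eight bits selected by that lane's eight index bytes (each in [0, 63]) into the low byte of the result; the upper 56 bits are zero. A sketch that collects the lowest bit of every byte (little-endian byte order shown; the index constant and function name are illustrative):

    // result bit i = input bit 8*i, i.e. the LSB of source byte i.
    hn::Vec<hn::ScalableTag<uint64_t>> LsbOfEachByte(
        hn::Vec<hn::ScalableTag<uint64_t>> v) {
      const hn::ScalableTag<uint64_t> d64;
      const hn::Repartition<uint8_t, decltype(d64)> du8;
      // Index bytes 0, 8, 16, ..., 56: one bit from each source byte.
      const auto idx =
          hn::BitCast(du8, hn::Set(d64, uint64_t{0x3830282018100800u}));
      return hn::BitShuffle(v, idx);
    }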
6944
+
6542
6945
  // ================================================== Operator wrapper
6543
6946
 
6544
6947
  // SVE* and RVV currently cannot define operators and have already defined