@img/sharp-libvips-dev 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/include/expat.h +21 -10
  2. package/include/expat_config.h +11 -5
  3. package/include/ffi.h +12 -25
  4. package/include/freetype2/freetype/config/ftoption.h +1 -1
  5. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  6. package/include/glib-2.0/gio/gapplication.h +6 -0
  7. package/include/glib-2.0/gio/giotypes.h +0 -1
  8. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  9. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  10. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  11. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  12. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  13. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  14. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  15. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  16. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  17. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  18. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  19. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  20. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  21. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  22. package/include/glib-2.0/girepository/girepository.h +53 -62
  23. package/include/glib-2.0/girepository/girffi.h +8 -7
  24. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  25. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  26. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  27. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  28. package/include/glib-2.0/girepository/gitypes.h +52 -104
  29. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  30. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  31. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  32. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  33. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  34. package/include/glib-2.0/glib/gbitlock.h +31 -0
  35. package/include/glib-2.0/glib/gmessages.h +8 -0
  36. package/include/glib-2.0/glib/gslice.h +2 -0
  37. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  38. package/include/glib-2.0/glib/gthread.h +191 -3
  39. package/include/glib-2.0/glib-unix.h +7 -1
  40. package/include/glib-2.0/gobject/genums.h +6 -6
  41. package/include/glib-2.0/gobject/glib-types.h +11 -0
  42. package/include/glib-2.0/gobject/gsignal.h +16 -6
  43. package/include/hwy/aligned_allocator.h +171 -6
  44. package/include/hwy/base.h +1765 -543
  45. package/include/hwy/cache_control.h +24 -6
  46. package/include/hwy/detect_compiler_arch.h +23 -2
  47. package/include/hwy/detect_targets.h +56 -13
  48. package/include/hwy/foreach_target.h +24 -0
  49. package/include/hwy/highway.h +20 -3
  50. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  51. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  52. package/include/hwy/ops/emu128-inl.h +271 -196
  53. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  54. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  55. package/include/hwy/ops/rvv-inl.h +1043 -311
  56. package/include/hwy/ops/scalar-inl.h +189 -159
  57. package/include/hwy/ops/set_macros-inl.h +66 -6
  58. package/include/hwy/ops/shared-inl.h +175 -56
  59. package/include/hwy/ops/wasm_128-inl.h +153 -136
  60. package/include/hwy/ops/x86_128-inl.h +1647 -646
  61. package/include/hwy/ops/x86_256-inl.h +1003 -370
  62. package/include/hwy/ops/x86_512-inl.h +948 -353
  63. package/include/hwy/per_target.h +4 -0
  64. package/include/hwy/profiler.h +648 -0
  65. package/include/hwy/robust_statistics.h +2 -2
  66. package/include/hwy/targets.h +18 -11
  67. package/include/hwy/timer.h +11 -0
  68. package/include/libpng16/png.h +32 -29
  69. package/include/libpng16/pngconf.h +2 -2
  70. package/include/libpng16/pnglibconf.h +7 -2
  71. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  72. package/include/libxml2/libxml/parser.h +16 -7
  73. package/include/libxml2/libxml/xmlIO.h +0 -1
  74. package/include/libxml2/libxml/xmlversion.h +4 -4
  75. package/include/pango-1.0/pango/pango-features.h +3 -3
  76. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  77. package/include/pixman-1/pixman-version.h +2 -2
  78. package/include/png.h +32 -29
  79. package/include/pngconf.h +2 -2
  80. package/include/pnglibconf.h +7 -2
  81. package/include/vips/connection.h +9 -3
  82. package/include/vips/util.h +0 -9
  83. package/include/vips/version.h +4 -4
  84. package/package.json +1 -1
  85. package/versions.json +11 -11
package/include/hwy/ops/arm_neon-inl.h

@@ -143,7 +143,8 @@ namespace detail { // for code folding and Raw128
   HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
   HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
 
-#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
+    (HWY_COMPILER_GCC_ACTUAL >= 1300 || HWY_COMPILER_CLANG >= 1100)
 #define HWY_NEON_HAVE_BFLOAT16 1
 #else
 #define HWY_NEON_HAVE_BFLOAT16 0
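
Note: the tightened guard means native bf16 is claimed only on GCC >= 13 (HWY_COMPILER_GCC_ACTUAL >= 1300) or Clang >= 11 (HWY_COMPILER_CLANG >= 1100), since some older compilers define __ARM_FEATURE_BF16_VECTOR_ARITHMETIC yet ship incomplete bf16 intrinsics. A standalone sketch of the same detection using raw compiler macros (DEMO_HAVE_NEON_BF16 is our illustrative name, not part of the package):

// Mirrors the diff's condition without Highway's compiler-detection macros.
#include <cstdio>

#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) &&                 \
    ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 13) || \
     (defined(__clang__) && __clang_major__ >= 11))
#define DEMO_HAVE_NEON_BF16 1  // native bfloat16x8_t arithmetic is usable
#else
#define DEMO_HAVE_NEON_BF16 0  // fall back to uint16 lanes
#endif

int main() { std::printf("NEON bf16 arithmetic: %d\n", DEMO_HAVE_NEON_BF16); }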
@@ -160,7 +161,7 @@ namespace detail { // for code folding and Raw128
 #define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
 #endif
 
-// Used for conversion instructions if HWY_NEON_HAVE_FLOAT16C.
+// Used for conversion instructions if HWY_NEON_HAVE_F16C.
 #define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
                                                      args)                \
   HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args) \
@@ -176,6 +177,19 @@ namespace detail { // for code folding and Raw128
 #define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
 #endif
 
+// Enable generic functions for whichever of (f16, bf16) are not supported.
+#if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
+#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+#elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
+#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
+#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
+#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
+#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<false>* = nullptr
+#else
+#error "Logic error, handled all four cases"
+#endif
+
 // float
 #define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
   HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
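
Note: HWY_NEON_IF_EMULATED_D is a SFINAE guard. It admits an overload only for the special-float lane types (f16/bf16) that this build cannot represent natively, and dissolves into an unsatisfiable constraint when both are native. A simplified, self-contained model of the pattern (our names; Highway's real macros expand to hwy::EnableIf with per-type predicates):

#include <cstdio>

template <bool B> struct EnableIfT {};
template <> struct EnableIfT<true> { using type = void; };
template <bool B> using EnableIf = typename EnableIfT<B>::type;

struct F16Tag {};  // stands in for a descriptor of emulated float16 lanes
struct F32Tag {};  // natively supported lane type

template <class D> constexpr bool IsEmulated(D) { return false; }
constexpr bool IsEmulated(F16Tag) { return true; }

// Mirrors: template <class D, HWY_NEON_IF_EMULATED_D(D)> HWY_API ... Op(D);
template <class D, EnableIf<IsEmulated(D())>* = nullptr>
void Op(D) { std::puts("emulated path: bit-cast to uint16 lanes"); }

template <class D, EnableIf<!IsEmulated(D())>* = nullptr>
void Op(D) { std::puts("native path"); }

int main() {
  Op(F16Tag());  // emulated
  Op(F32Tag());  // native
}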
@@ -397,39 +411,6 @@ struct Tuple2<int64_t, N> {
   int64x1x2_t raw;
 };
 
-template <>
-struct Tuple2<float16_t, 8> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x8x2_t raw;
-#else
-  uint16x8x2_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple2<float16_t, N> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x4x2_t raw;
-#else
-  uint16x4x2_t raw;
-#endif
-};
-template <>
-struct Tuple2<bfloat16_t, 8> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x8x2_t raw;
-#else
-  uint16x8x2_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple2<bfloat16_t, N> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x4x2_t raw;
-#else
-  uint16x4x2_t raw;
-#endif
-};
-
 template <>
 struct Tuple2<float32_t, 4> {
   float32x4x2_t raw;
@@ -514,39 +495,6 @@ struct Tuple3<int64_t, N> {
   int64x1x3_t raw;
 };
 
-template <>
-struct Tuple3<float16_t, 8> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x8x3_t raw;
-#else
-  uint16x8x3_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple3<float16_t, N> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x4x3_t raw;
-#else
-  uint16x4x3_t raw;
-#endif
-};
-template <>
-struct Tuple3<bfloat16_t, 8> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x8x3_t raw;
-#else
-  uint16x8x3_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple3<bfloat16_t, N> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x4x3_t raw;
-#else
-  uint16x4x3_t raw;
-#endif
-};
-
 template <>
 struct Tuple3<float32_t, 4> {
   float32x4x3_t raw;
@@ -631,39 +579,6 @@ struct Tuple4<int64_t, N> {
   int64x1x4_t raw;
 };
 
-template <>
-struct Tuple4<float16_t, 8> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x8x4_t raw;
-#else
-  uint16x8x4_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple4<float16_t, N> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x4x4_t raw;
-#else
-  uint16x4x4_t raw;
-#endif
-};
-template <>
-struct Tuple4<bfloat16_t, 8> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x8x4_t raw;
-#else
-  uint16x8x4_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple4<bfloat16_t, N> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x4x4_t raw;
-#else
-  uint16x4x4_t raw;
-#endif
-};
-
 template <>
 struct Tuple4<float32_t, 4> {
   float32x4x4_t raw;
@@ -686,201 +601,199 @@ struct Tuple4<float64_t, N> {
 template <typename T, size_t N>
 struct Raw128;
 
-// 128
 template <>
 struct Raw128<uint8_t, 16> {
   using type = uint8x16_t;
 };
+template <size_t N>
+struct Raw128<uint8_t, N> {
+  using type = uint8x8_t;
+};
 
 template <>
 struct Raw128<uint16_t, 8> {
   using type = uint16x8_t;
 };
+template <size_t N>
+struct Raw128<uint16_t, N> {
+  using type = uint16x4_t;
+};
 
 template <>
 struct Raw128<uint32_t, 4> {
   using type = uint32x4_t;
 };
+template <size_t N>
+struct Raw128<uint32_t, N> {
+  using type = uint32x2_t;
+};
 
 template <>
 struct Raw128<uint64_t, 2> {
   using type = uint64x2_t;
 };
+template <>
+struct Raw128<uint64_t, 1> {
+  using type = uint64x1_t;
+};
 
 template <>
 struct Raw128<int8_t, 16> {
   using type = int8x16_t;
 };
+template <size_t N>
+struct Raw128<int8_t, N> {
+  using type = int8x8_t;
+};
 
 template <>
 struct Raw128<int16_t, 8> {
   using type = int16x8_t;
 };
+template <size_t N>
+struct Raw128<int16_t, N> {
+  using type = int16x4_t;
+};
 
 template <>
 struct Raw128<int32_t, 4> {
   using type = int32x4_t;
 };
+template <size_t N>
+struct Raw128<int32_t, N> {
+  using type = int32x2_t;
+};
 
 template <>
 struct Raw128<int64_t, 2> {
   using type = int64x2_t;
 };
-
 template <>
-struct Raw128<float16_t, 8> {
-#if HWY_NEON_HAVE_FLOAT16C
-  using type = float16x8_t;
-#else
-  using type = uint16x8_t;
-#endif
-};
-
-template <>
-struct Raw128<bfloat16_t, 8> {
-#if HWY_NEON_HAVE_BFLOAT16
-  using type = bfloat16x8_t;
-#else
-  using type = uint16x8_t;
-#endif
+struct Raw128<int64_t, 1> {
+  using type = int64x1_t;
 };
 
 template <>
 struct Raw128<float, 4> {
   using type = float32x4_t;
 };
+template <size_t N>
+struct Raw128<float, N> {
+  using type = float32x2_t;
+};
 
 #if HWY_HAVE_FLOAT64
 template <>
 struct Raw128<double, 2> {
   using type = float64x2_t;
 };
-#endif  // HWY_HAVE_FLOAT64
-
-// 64
 template <>
-struct Raw128<uint8_t, 8> {
-  using type = uint8x8_t;
-};
-
-template <>
-struct Raw128<uint16_t, 4> {
-  using type = uint16x4_t;
-};
-
-template <>
-struct Raw128<uint32_t, 2> {
-  using type = uint32x2_t;
+struct Raw128<double, 1> {
+  using type = float64x1_t;
 };
+#endif  // HWY_HAVE_FLOAT64
 
-template <>
-struct Raw128<uint64_t, 1> {
-  using type = uint64x1_t;
-};
+#if HWY_NEON_HAVE_F16C
 
 template <>
-struct Raw128<int8_t, 8> {
-  using type = int8x8_t;
+struct Tuple2<float16_t, 8> {
+  float16x8x2_t raw;
 };
-
-template <>
-struct Raw128<int16_t, 4> {
-  using type = int16x4_t;
+template <size_t N>
+struct Tuple2<float16_t, N> {
+  float16x4x2_t raw;
 };
 
 template <>
-struct Raw128<int32_t, 2> {
-  using type = int32x2_t;
+struct Tuple3<float16_t, 8> {
+  float16x8x3_t raw;
 };
-
-template <>
-struct Raw128<int64_t, 1> {
-  using type = int64x1_t;
+template <size_t N>
+struct Tuple3<float16_t, N> {
+  float16x4x3_t raw;
 };
 
 template <>
-struct Raw128<float16_t, 4> {
-#if HWY_NEON_HAVE_FLOAT16C
-  using type = float16x4_t;
-#else
-  using type = uint16x4_t;
-#endif
+struct Tuple4<float16_t, 8> {
+  float16x8x4_t raw;
 };
-
-template <>
-struct Raw128<bfloat16_t, 4> {
-#if HWY_NEON_HAVE_BFLOAT16
-  using type = bfloat16x4_t;
-#else
-  using type = uint16x4_t;
-#endif
+template <size_t N>
+struct Tuple4<float16_t, N> {
+  float16x4x4_t raw;
 };
 
 template <>
-struct Raw128<float, 2> {
-  using type = float32x2_t;
+struct Raw128<float16_t, 8> {
+  using type = float16x8_t;
 };
-
-#if HWY_HAVE_FLOAT64
-template <>
-struct Raw128<double, 1> {
-  using type = float64x1_t;
+template <size_t N>
+struct Raw128<float16_t, N> {
+  using type = float16x4_t;
 };
-#endif  // HWY_HAVE_FLOAT64
-
-// 32 (same as 64)
-template <>
-struct Raw128<uint8_t, 4> : public Raw128<uint8_t, 8> {};
 
-template <>
-struct Raw128<uint16_t, 2> : public Raw128<uint16_t, 4> {};
+#else  // !HWY_NEON_HAVE_F16C
 
-template <>
-struct Raw128<uint32_t, 1> : public Raw128<uint32_t, 2> {};
-
-template <>
-struct Raw128<int8_t, 4> : public Raw128<int8_t, 8> {};
-
-template <>
-struct Raw128<int16_t, 2> : public Raw128<int16_t, 4> {};
-
-template <>
-struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {};
-
-template <>
-struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {};
-
-template <>
-struct Raw128<bfloat16_t, 2> : public Raw128<bfloat16_t, 4> {};
+template <size_t N>
+struct Tuple2<float16_t, N> : public Tuple2<uint16_t, N> {};
+template <size_t N>
+struct Tuple3<float16_t, N> : public Tuple3<uint16_t, N> {};
+template <size_t N>
+struct Tuple4<float16_t, N> : public Tuple4<uint16_t, N> {};
+template <size_t N>
+struct Raw128<float16_t, N> : public Raw128<uint16_t, N> {};
 
-template <>
-struct Raw128<float, 1> : public Raw128<float, 2> {};
+#endif  // HWY_NEON_HAVE_F16C
 
-// 16 (same as 64)
-template <>
-struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {};
+#if HWY_NEON_HAVE_BFLOAT16
 
 template <>
-struct Raw128<uint16_t, 1> : public Raw128<uint16_t, 4> {};
+struct Tuple2<bfloat16_t, 8> {
+  bfloat16x8x2_t raw;
+};
+template <size_t N>
+struct Tuple2<bfloat16_t, N> {
+  bfloat16x4x2_t raw;
+};
 
 template <>
-struct Raw128<int8_t, 2> : public Raw128<int8_t, 8> {};
+struct Tuple3<bfloat16_t, 8> {
+  bfloat16x8x3_t raw;
+};
+template <size_t N>
+struct Tuple3<bfloat16_t, N> {
+  bfloat16x4x3_t raw;
+};
 
 template <>
-struct Raw128<int16_t, 1> : public Raw128<int16_t, 4> {};
+struct Tuple4<bfloat16_t, 8> {
+  bfloat16x8x4_t raw;
+};
+template <size_t N>
+struct Tuple4<bfloat16_t, N> {
+  bfloat16x4x4_t raw;
+};
 
 template <>
-struct Raw128<float16_t, 1> : public Raw128<float16_t, 4> {};
+struct Raw128<bfloat16_t, 8> {
+  using type = bfloat16x8_t;
+};
+template <size_t N>
+struct Raw128<bfloat16_t, N> {
+  using type = bfloat16x4_t;
+};
 
-template <>
-struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {};
+#else  // !HWY_NEON_HAVE_BFLOAT16
 
-// 8 (same as 64)
-template <>
-struct Raw128<uint8_t, 1> : public Raw128<uint8_t, 8> {};
+template <size_t N>
+struct Tuple2<bfloat16_t, N> : public Tuple2<uint16_t, N> {};
+template <size_t N>
+struct Tuple3<bfloat16_t, N> : public Tuple3<uint16_t, N> {};
+template <size_t N>
+struct Tuple4<bfloat16_t, N> : public Tuple4<uint16_t, N> {};
+template <size_t N>
+struct Raw128<bfloat16_t, N> : public Raw128<uint16_t, N> {};
 
-template <>
-struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {};
+#endif  // HWY_NEON_HAVE_BFLOAT16
 
 }  // namespace detail
 
@@ -910,6 +823,9 @@ class Vec128 {
   HWY_INLINE Vec128& operator-=(const Vec128 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec128& operator%=(const Vec128 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
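
Note: operator%= completes the compound-assignment set and presumes a lane-wise operator% for integer vectors, which this Highway update provides through its generic integer Div/Mod support. A usage sketch through the public API (static-dispatch boilerplate per Highway's README; treat the operator's availability as an assumption of this sketch):

#include <cstdio>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::ScalableTag<int32_t> d;
  auto v = hn::Iota(d, 10);         // 10, 11, 12, ...
  const auto divisor = hn::Set(d, 4);
  v = v % divisor;                  // lane-wise remainder: 2, 3, 0, 1, ...
  std::printf("lane 0 = %d\n", static_cast<int>(hn::GetLane(v)));  // 2
}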
@@ -978,26 +894,22 @@ namespace detail {
 #define HWY_NEON_BUILD_ARG_HWY_SET t
 
 HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)
-HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)
-#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_FLOAT16C
+#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C
 HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET)
 #endif
+HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)
+
+template <class D, HWY_NEON_IF_EMULATED_D(D)>
+HWY_API Vec128<TFromD<D>, MaxLanes(D())> NativeSet(D d, TFromD<D> t) {
+  const uint16_t tu = BitCastScalar<uint16_t>(t);
+  return Vec128<TFromD<D>, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
+}
 
 #undef HWY_NEON_BUILD_TPL_HWY_SET
 #undef HWY_NEON_BUILD_RET_HWY_SET
 #undef HWY_NEON_BUILD_PARAM_HWY_SET
 #undef HWY_NEON_BUILD_ARG_HWY_SET
 
-#if !HWY_NEON_HAVE_BFLOAT16
-// BF16: return u16.
-template <class D, HWY_IF_BF16_D(D)>
-HWY_API Vec128<bfloat16_t, MaxLanes(D())> NativeSet(D d, bfloat16_t t) {
-  uint16_t tu;
-  CopyBytes<sizeof(tu)>(&t, &tu);
-  return Vec128<bfloat16_t, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
-}
-#endif  // !HWY_NEON_HAVE_BFLOAT16
-
 }  // namespace detail
 
 // Full vector. Cannot yet use VFromD because that is defined in terms of Set.
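
Note: the single HWY_NEON_IF_EMULATED_D overload of NativeSet replaces the old bf16-only fallback: the scalar's bits are moved into a uint16_t (BitCastScalar instead of CopyBytes) and broadcast through the unsigned vector. From the public API this is simply Set with an f16/bf16 value; a sketch (BF16FromF32 is a hwy/base.h helper; exact names assumed):

#include <cstdio>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::ScalableTag<hwy::bfloat16_t> d;
  // Broadcasts the 16-bit pattern of 1.5f; without native bf16 vectors this
  // resolves to the new uint16-based NativeSet path.
  const auto v = hn::Set(d, hwy::BF16FromF32(1.5f));
  (void)v;
  std::printf("bf16 lanes: %d\n", static_cast<int>(hn::Lanes(d)));
}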
@@ -1039,159 +951,313 @@ HWY_API VFromD<D> Undefined(D /*tag*/) {
 HWY_DIAGNOSTICS(pop)
 
+#if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
 namespace detail {
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+#pragma pack(push, 1)
+
+template <class T>
+struct alignas(8) Vec64ValsWrapper {
+  static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
+  static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
+  T vals[8 / sizeof(T)];
+};
+
+#pragma pack(pop)
+
+}  // namespace detail
+#endif  // !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> /*t8*/, TFromD<D> /*t9*/,
+                                      TFromD<D> /*t10*/, TFromD<D> /*t11*/,
+                                      TFromD<D> /*t12*/, TFromD<D> /*t13*/,
+                                      TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(8)));
-  constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
-  const VFromD<decltype(du)> vu8_iota0(reinterpret_cast<uint8x8_t>(kU8Iota0));
+  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccI8RawVectType raw = {
+      static_cast<int8_t>(t0), static_cast<int8_t>(t1), static_cast<int8_t>(t2),
+      static_cast<int8_t>(t3), static_cast<int8_t>(t4), static_cast<int8_t>(t5),
+      static_cast<int8_t>(t6), static_cast<int8_t>(t7)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(8) static constexpr uint8_t kU8Iota0[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  const VFromD<decltype(du)> vu8_iota0(
-      Load(Full64<TFromD<decltype(du)>>(), kU8Iota0).raw);
+  return ResizeBitCast(
+      d, Set(Full64<uint64_t>(),
+             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
+                 {t0, t1, t2, t3, t4, t5, t6, t7}})));
 #endif
-  return BitCast(d, vu8_iota0);
 }
 
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3,
+                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
+                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7,
-                                         8, 9, 10, 11, 12, 13, 14, 15};
-  const VFromD<decltype(du)> vu8_iota0(reinterpret_cast<uint8x16_t>(kU8Iota0));
+  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccI16RawVectType raw = {
+      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+      static_cast<int16_t>(t2), static_cast<int16_t>(t3)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr uint8_t kU8Iota0[16] = {
-      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-  const auto vu8_iota0 = Load(du, kU8Iota0);
+  return ResizeBitCast(
+      d, Set(Full64<uint64_t>(),
+             BitCastScalar<uint64_t>(
+                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}})));
 #endif
-  return BitCast(d, vu8_iota0);
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  using T = TFromD<decltype(d)>;
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef detail::NativeLaneType<T> GccRawVectType
-      __attribute__((__vector_size__(8)));
-  constexpr GccRawVectType kIota0 = {T{0}, T{1}, T{2}, static_cast<T>(3)};
-  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(kIota0));
+  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccI32RawVectType raw = {static_cast<int32_t>(t0),
+                                 static_cast<int32_t>(t1)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(8) static constexpr T kIota0[4] = {T{0}, T{1}, T{2},
-                                             static_cast<T>(3)};
-  return Load(d, kIota0);
+  return ResizeBitCast(d,
+                       Set(Full64<uint64_t>(),
+                           BitCastScalar<uint64_t>(
+                               detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
 #endif
 }
 
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  using T = TFromD<decltype(d)>;
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef detail::NativeLaneType<T> GccRawVectType
-      __attribute__((__vector_size__(16)));
-  constexpr GccRawVectType kIota0 = {T{0}, T{1}, T{2}, static_cast<T>(3),
-                                     T{4}, T{5}, T{6}, static_cast<T>(7)};
-  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(kIota0));
+  typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccF32RawVectType raw = {t0, t1};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr T kU16Iota0[8] = {
-      T{0}, T{1}, T{2}, static_cast<T>(3), T{4}, T{5}, T{6}, static_cast<T>(7)};
-  return Load(d, kIota0);
+  return ResizeBitCast(d,
+                       Set(Full64<uint64_t>(),
+                           BitCastScalar<uint64_t>(
+                               detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
 #endif
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_UI32_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
+  return Set(d, t0);
+}
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8)));
-  constexpr GccU32RawVectType kU32Iota0 = {0, 1};
-  const VFromD<decltype(du)> vu32_iota0(
-      reinterpret_cast<uint32x2_t>(kU32Iota0));
+  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccI8RawVectType raw = {
+      static_cast<int8_t>(t0),  static_cast<int8_t>(t1),
+      static_cast<int8_t>(t2),  static_cast<int8_t>(t3),
+      static_cast<int8_t>(t4),  static_cast<int8_t>(t5),
+      static_cast<int8_t>(t6),  static_cast<int8_t>(t7),
+      static_cast<int8_t>(t8),  static_cast<int8_t>(t9),
+      static_cast<int8_t>(t10), static_cast<int8_t>(t11),
+      static_cast<int8_t>(t12), static_cast<int8_t>(t13),
+      static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(8) static constexpr uint32_t kU32Iota0[2] = {0, 1};
-  const VFromD<decltype(du)> vu32_iota0{
-      Load(Full64<TFromD<decltype(du)>>(), kU32Iota0).raw};
+  const Half<decltype(d)> dh;
+  return Combine(d,
+                 Dup128VecFromValues(dh, t8, t9, t10, t11, t12, t13, t14, t15,
+                                     t8, t9, t10, t11, t12, t13, t14, t15),
+                 Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1,
+                                     t2, t3, t4, t5, t6, t7));
 #endif
-  return BitCast(d, vu32_iota0);
 }
 
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccU32RawVectType kU32Iota0 = {0, 1, 2, 3};
-  const VFromD<decltype(du)> vu32_iota0(
-      reinterpret_cast<uint32x4_t>(kU32Iota0));
+  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccI16RawVectType raw = {
+      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+      static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr uint32_t kU32Iota0[4] = {0, 1, 2, 3};
-  const auto vu32_iota0 = Load(du, kU32Iota0);
+  const Half<decltype(d)> dh;
+  return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7),
+                 Dup128VecFromValues(dh, t0, t1, t2, t3, t0, t1, t2, t3));
 #endif
-  return BitCast(d, vu32_iota0);
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
-  constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f};
-  return VFromD<decltype(d)>(reinterpret_cast<float32x2_t>(kF32Iota0));
+  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccI32RawVectType raw = {
+      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(8) static constexpr float kF32Iota0[2] = {0.0f, 1.0f};
-  return VFromD<decltype(d)>{
-      Load(Full64<TFromD<decltype(d)>>(), kF32Iota0).raw};
+  const Half<decltype(d)> dh;
+  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
+                 Dup128VecFromValues(dh, t0, t1, t0, t1));
 #endif
 }
 
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
   typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
-  return VFromD<decltype(d)>(reinterpret_cast<float32x4_t>(kF32Iota0));
+  (void)d;
+  const GccF32RawVectType raw = {t0, t1, t2, t3};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr float kF32Iota0[4] = {0.0f, 1.0f, 2.0f, 3.0f};
-  return Load(d, kF32Iota0);
+  const Half<decltype(d)> dh;
+  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
+                 Dup128VecFromValues(dh, t0, t1, t0, t1));
 #endif
 }
 
-template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 8)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  return Zero(d);
-}
-
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint64_t GccU64RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccU64RawVectType kU64Iota0 = {0, 1};
-  const VFromD<decltype(du)> vu64_iota0(
-      reinterpret_cast<uint64x2_t>(kU64Iota0));
+  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccI64RawVectType raw = {static_cast<int64_t>(t0),
+                                 static_cast<int64_t>(t1)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr uint64_t kU64Iota0[4] = {0, 1};
-  const auto vu64_iota0 = Load(du, kU64Iota0);
+  const Half<decltype(d)> dh;
+  return Combine(d, Set(dh, t1), Set(dh, t0));
 #endif
-  return BitCast(d, vu64_iota0);
 }
 
 #if HWY_HAVE_FLOAT64
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
   typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccF64RawVectType kF64Iota0 = {0.0, 1.0};
-  return VFromD<decltype(d)>(reinterpret_cast<float64x2_t>(kF64Iota0));
+  (void)d;
+  const GccF64RawVectType raw = {t0, t1};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr double kF64Iota0[4] = {0.0, 1.0};
-  return Load(d, kF64Iota0);
+  const Half<decltype(d)> dh;
+  return Combine(d, Set(dh, t1), Set(dh, t0));
 #endif
 }
-#endif  // HWY_HAVE_FLOAT64
+#endif
+
+// Generic for all vector lengths
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d,
+                 Dup128VecFromValues(
+                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
+                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
+                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
+                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
+}
+
+#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3,
+                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
+                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
+  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccF16RawVectType raw = {
+      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
+      static_cast<__fp16>(t3)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
+}
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccF16RawVectType raw = {
+      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
+      static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5),
+      static_cast<__fp16>(t6), static_cast<__fp16>(t7)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
+}
+#else
+// Generic for all vector lengths if MSVC or !HWY_NEON_HAVE_F16C
+template <class D, HWY_IF_F16_D(D)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d,
+                 Dup128VecFromValues(
+                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
+                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
+                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
+                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
+}
+#endif  // (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
+
+namespace detail {
+
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  return Dup128VecFromValues(
+      d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4},
+      TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9},
+      TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14},
+      TFromD<D>{15});
+}
+
+template <class D, HWY_IF_UI16_D(D)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
+                             TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
+                             TFromD<D>{6}, TFromD<D>{7});
+}
+
+template <class D, HWY_IF_F16_D(D)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Dup128VecFromValues(du, uint16_t{0}, uint16_t{0x3C00},
+                                        uint16_t{0x4000}, uint16_t{0x4200},
+                                        uint16_t{0x4400}, uint16_t{0x4500},
+                                        uint16_t{0x4600}, uint16_t{0x4700}));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
+                             TFromD<D>{3});
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1});
+}
 
 #if HWY_COMPILER_MSVC
 template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
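
Note: the per-type Iota0 ladder is now a thin client of Dup128VecFromValues, which fills every 128-bit block of a vector with the given lane values (extra arguments are ignored for narrow vectors; f16 iota constants are the bit patterns of 0.0 through 7.0). A usage sketch of the public op (signature per Highway's quick reference; assumed):

#include <cstdio>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::Full128<int32_t> d;
  // The (only) 128-bit block is filled with {7, -1, 42, 0}.
  const auto v = hn::Dup128VecFromValues(d, 7, -1, 42, 0);
  int32_t out[4];
  hn::StoreU(v, d, out);
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
}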
@@ -1274,30 +1340,25 @@ HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi,
   return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
 }
 
-template <class D, HWY_IF_F16_D(D)>
-HWY_API Vec128<float16_t> Combine(D d, Vec64<float16_t> hi,
-                                  Vec64<float16_t> lo) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_F16_D(D)>
+HWY_API Vec128<float16_t> Combine(D, Vec64<float16_t> hi, Vec64<float16_t> lo) {
   return Vec128<float16_t>(vcombine_f16(lo.raw, hi.raw));
-#else
-  const RebindToUnsigned<D> du;
-  const Half<decltype(du)> duh;
-  return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
-#endif
 }
+#endif  // HWY_HAVE_FLOAT16
 
-template <class D, HWY_IF_BF16_D(D)>
-HWY_API Vec128<bfloat16_t> Combine(D d, Vec64<bfloat16_t> hi,
-                                   Vec64<bfloat16_t> lo) {
 #if HWY_NEON_HAVE_BFLOAT16
-  (void)d;
-  return Vec128<bfloat16_t>(vcombine_bf16(lo.raw, hi.raw));
-#else
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> Combine(D, Vec64<bfloat16_t> hi, Vec64<bfloat16_t> lo) {
+  return VFromD<D>(vcombine_bf16(lo.raw, hi.raw));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
+
+template <class D, class DH = Half<D>, HWY_NEON_IF_EMULATED_D(D)>
+HWY_API VFromD<D> Combine(D d, VFromD<DH> hi, VFromD<DH> lo) {
   const RebindToUnsigned<D> du;
   const Half<decltype(du)> duh;
   return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
-#endif
 }
 
 template <class D, HWY_IF_F32_D(D)>
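
Note: native f16/bf16 Combine overloads are now compiled only when the corresponding vcombine intrinsic exists; every emulated case shares one HWY_NEON_IF_EMULATED_D overload that bit-casts both halves to uint16. A sketch of the unchanged public behavior (type aliases assumed from Highway's API):

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

using DF16 = hn::Full128<hwy::float16_t>;
using DHalf = hn::Half<DF16>;

// Joins two 64-bit halves into one 128-bit f16 vector; where float16_t is
// emulated, this resolves to the uint16 bit-cast overload in the diff.
hn::VFromD<DF16> JoinHalves(hn::VFromD<DHalf> hi, hn::VFromD<DHalf> lo) {
  return hn::Combine(DF16(), hi, lo);
}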
@@ -1341,7 +1402,7 @@ HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
 HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
 
 #if !HWY_HAVE_FLOAT16
-#if HWY_NEON_HAVE_FLOAT16C
+#if HWY_NEON_HAVE_F16C
 HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
                                              HWY_CAST_TO_U8)
 #else
@@ -1349,7 +1410,7 @@ template <size_t N>
 HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
   return BitCastToByte(Vec128<uint16_t, N>(v.raw));
 }
-#endif  // HWY_NEON_HAVE_FLOAT16C
+#endif  // HWY_NEON_HAVE_F16C
 #endif  // !HWY_HAVE_FLOAT16
 
 #if !HWY_NEON_HAVE_BFLOAT16
@@ -1406,14 +1467,24 @@ HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
   return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
 }
 
+// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D d, VFromD<Repartition<uint8_t, D>> v) {
-#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_FLOAT16C
-  (void)d;
+HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
+#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
   return VFromD<D>(vreinterpret_f16_u8(v.raw));
 #else
   const RebindToUnsigned<D> du;
-  return VFromD<decltype(d)>(BitCastFromByte(du, v).raw);
+  return VFromD<D>(BitCastFromByte(du, v).raw);
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
+#if HWY_NEON_HAVE_BFLOAT16
+  return VFromD<D>(vreinterpret_bf16_u8(v.raw));
+#else
+  const RebindToUnsigned<D> du;
+  return VFromD<D>(BitCastFromByte(du, v).raw);
 #endif
 }
 
@@ -1461,15 +1532,6 @@ HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
   return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
 }
 
-template <class D, HWY_IF_F16_D(D)>
-HWY_INLINE Vec128<float16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_FLOAT16C
-  return Vec128<float16_t>(vreinterpretq_f16_u8(v.raw));
-#else
-  return Vec128<float16_t>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
-#endif
-}
-
 template <class D, HWY_IF_F32_D(D)>
 HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
   return Vec128<float>(vreinterpretq_f32_u8(v.raw));
@@ -1482,11 +1544,23 @@ HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
 }
 #endif  // HWY_HAVE_FLOAT64
 
-// Special case for bfloat16_t, which may have the same Raw as uint16_t.
+// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
+template <class D, HWY_IF_F16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
+#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
+  return VFromD<D>(vreinterpretq_f16_u8(v.raw));
+#else
+  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
+#endif
+}
+
 template <class D, HWY_IF_BF16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
-                                     VFromD<Repartition<uint8_t, D>> v) {
+HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
+#if HWY_NEON_HAVE_BFLOAT16
+  return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
+#else
   return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
+#endif
 }
 
 }  // namespace detail
@@ -1694,6 +1768,14 @@ HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
 #undef HWY_NEON_BUILD_PARAM_HWY_INSERT
 #undef HWY_NEON_BUILD_ARG_HWY_INSERT
 
+template <size_t kLane, class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
+HWY_API V InsertLane(const V v, TFromD<D> t) {
+  const D d;
+  const RebindToUnsigned<D> du;
+  const uint16_t tu = BitCastScalar<uint16_t>(t);
+  return BitCast(d, InsertLane<kLane>(BitCast(du, v), tu));
+}
+
 }  // namespace detail
 
 // Requires one overload per vector length because InsertLane<3> may be a
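
Note: the emulated-type InsertLane follows the same recipe: bit-cast the scalar to uint16_t, insert into the unsigned view, bit-cast back. Through the public index-taking InsertLane this looks like the following sketch (signature and F16FromF32 helper assumed per Highway's API):

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

using D = hn::Full128<hwy::float16_t>;

// Replaces lane 1 with 0.25; works with or without native float16 lanes.
hn::VFromD<D> PatchLane1(hn::VFromD<D> v) {
  return hn::InsertLane(v, 1, hwy::F16FromF32(0.25f));
}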
@@ -1842,6 +1924,89 @@ HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
 HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
   return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
 }
+HWY_API Vec128<int64_t> SumsOf8(const Vec128<int8_t> v) {
+  return Vec128<int64_t>(vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(v.raw))));
+}
+HWY_API Vec64<int64_t> SumsOf8(const Vec64<int8_t> v) {
+  return Vec64<int64_t>(vpaddl_s32(vpaddl_s16(vpaddl_s8(v.raw))));
+}
+
+// ------------------------------ SumsOf2
+namespace detail {
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s8(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s8(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u8(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u8(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s16(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s16(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u16(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u16(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s32(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s32(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u32(v.raw));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
+    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
+  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u32(v.raw));
+}
+
+}  // namespace detail
 
 // ------------------------------ SaturatedAdd
 
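
Note: these SumsOf2 overloads map Highway's pairwise widening sum directly onto NEON's vpaddl/vpaddlq family for 8/16/32-bit lanes. Usage sketch of the public op (per Highway's quick reference; assumed):

#include <cstdio>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::Full128<uint8_t> d8;
  const hn::RepartitionToWide<decltype(d8)> d16;  // uint16 lanes
  const auto v = hn::Iota(d8, 0);       // 0, 1, 2, ..., 15
  const auto sums = hn::SumsOf2(v);     // 1, 5, 9, ..., 29 as uint16
  uint16_t out[8];
  hn::StoreU(sums, d16, out);
  std::printf("first pair sum = %u\n", static_cast<unsigned>(out[0]));  // 1
}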
@@ -1922,6 +2087,31 @@ HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
 #endif
 }
 
+// ------------------------------ SaturatedNeg
+#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
+#undef HWY_NATIVE_SATURATED_NEG_8_16_32
+#else
+#define HWY_NATIVE_SATURATED_NEG_8_16_32
+#endif
+
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedNeg, vqneg, _, 1)
+
+#if HWY_ARCH_ARM_A64
+#ifdef HWY_NATIVE_SATURATED_NEG_64
+#undef HWY_NATIVE_SATURATED_NEG_64
+#else
+#define HWY_NATIVE_SATURATED_NEG_64
+#endif
+
+HWY_API Vec64<int64_t> SaturatedNeg(const Vec64<int64_t> v) {
+  return Vec64<int64_t>(vqneg_s64(v.raw));
+}
+
+HWY_API Vec128<int64_t> SaturatedNeg(const Vec128<int64_t> v) {
+  return Vec128<int64_t>(vqnegq_s64(v.raw));
+}
+#endif
+
 // ------------------------------ ShiftLeft
 
 // Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
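
Note: SaturatedNeg (vqneg) differs from Neg in exactly one case: negating the most negative integer clamps to the maximum instead of wrapping. A sketch (public op per Highway's quick reference; assumed):

#include <cstdio>
#include <limits>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::ScalableTag<int32_t> d;
  const auto v = hn::Set(d, std::numeric_limits<int32_t>::min());
  // Plain Neg would wrap back to INT32_MIN; SaturatedNeg yields INT32_MAX.
  const auto n = hn::SaturatedNeg(v);
  std::printf("%d\n", static_cast<int>(hn::GetLane(n)));  // 2147483647
}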
@@ -2310,13 +2500,13 @@ HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
   return detail::NegMulAdd(add, mul, x);
 }
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                             Vec128<T, N> sub) {
   return MulAdd(mul, x, Neg(sub));
 }
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                                Vec128<T, N> sub) {
   return Neg(MulAdd(mul, x, sub));
@@ -2612,6 +2802,15 @@ HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Abs, vabs, _, 1)
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1)
 
+// ------------------------------ SaturatedAbs
+#ifdef HWY_NATIVE_SATURATED_ABS
+#undef HWY_NATIVE_SATURATED_ABS
+#else
+#define HWY_NATIVE_SATURATED_ABS
+#endif
+
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1)
+
 // ------------------------------ CopySign
 template <typename T, size_t N>
 HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
@@ -2675,22 +2874,42 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
 
 HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
 
+template <class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
+HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
+  const DFromV<decltype(yes)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(
+      d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
+}
+
 #undef HWY_NEON_BUILD_TPL_HWY_IF
 #undef HWY_NEON_BUILD_RET_HWY_IF
 #undef HWY_NEON_BUILD_PARAM_HWY_IF
 #undef HWY_NEON_BUILD_ARG_HWY_IF
 
 // mask ? yes : 0
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
   return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
 }
+template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
+HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
+  const DFromV<decltype(yes)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
+}
 
 // mask ? 0 : no
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
   return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
 }
+template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
+HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
+  const DFromV<decltype(no)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
+}
 
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
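
Note: the special-float overloads exist because `&`/AndNot are not defined for f16/bf16 vectors; the mask and data are bit-cast to uint16 lanes, selected there, and cast back. Public-API sketch (FirstN and IfThenElseZero per Highway's quick reference; assumed):

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

using D = hn::Full128<hwy::bfloat16_t>;

// Keeps the first two bf16 lanes and zeros the rest; internally this is the
// uint16 bit-cast path added above.
hn::VFromD<D> KeepFirstTwo(hn::VFromD<D> v) {
  return hn::IfThenElseZero(hn::FirstN(D(), 2), v);
}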
@@ -2957,6 +3176,23 @@ HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
 #endif
 }
 
+HWY_API Vec128<int64_t> SaturatedAbs(const Vec128<int64_t> v) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int64_t>(vqabsq_s64(v.raw));
+#else
+  const auto zero = Zero(DFromV<decltype(v)>());
+  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
+#endif
+}
+HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
+#if HWY_ARCH_ARM_A64
+  return Vec64<int64_t>(vqabs_s64(v.raw));
+#else
+  const auto zero = Zero(DFromV<decltype(v)>());
+  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
+#endif
+}
+
 // ------------------------------ Min (IfThenElse, BroadcastSignBit)
 
 // Unsigned
@@ -3133,6 +3369,20 @@ HWY_API Vec128<int64_t> LoadU(D /* tag */,
                               const int64_t* HWY_RESTRICT unaligned) {
   return Vec128<int64_t>(vld1q_s64(unaligned));
 }
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API Vec128<float16_t> LoadU(D /* tag */,
+                                const float16_t* HWY_RESTRICT unaligned) {
+  return Vec128<float16_t>(vld1q_f16(detail::NativeLanePointer(unaligned)));
+}
+#endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
+HWY_API Vec128<bfloat16_t> LoadU(D /* tag */,
+                                 const bfloat16_t* HWY_RESTRICT unaligned) {
+  return Vec128<bfloat16_t>(vld1q_bf16(detail::NativeLanePointer(unaligned)));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) {
   return Vec128<float>(vld1q_f32(unaligned));
@@ -3179,6 +3429,18 @@ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
 HWY_API Vec64<int64_t> LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) {
   return Vec64<int64_t>(vld1_s64(p));
 }
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+HWY_API Vec64<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
+  return Vec64<float16_t>(vld1_f16(detail::NativeLanePointer(p)));
+}
+#endif  // HWY_HAVE_FLOAT16
+#if HWY_NEON_HAVE_BFLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
+HWY_API Vec64<bfloat16_t> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
+  return Vec64<bfloat16_t>(vld1_bf16(detail::NativeLanePointer(p)));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
 HWY_API Vec64<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
   return Vec64<float>(vld1_f32(p));
@@ -3207,14 +3469,34 @@ HWY_API Vec32<float> LoadU(D /*tag*/, const float* HWY_RESTRICT p) {
3207
3469
  return Vec32<float>(vld1_dup_f32(p));
3208
3470
  }
3209
3471
 
3210
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_NOT_SPECIAL_FLOAT_D(D),
3211
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
3472
+ // {u,i}{8,16}
3473
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
3474
+ HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
3475
+ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
3476
+ const Repartition<uint32_t, decltype(d)> d32;
3477
+ uint32_t buf;
3478
+ CopyBytes<4>(p, &buf);
3479
+ return BitCast(d, LoadU(d32, &buf));
3480
+ }
3481
+
3482
+ #if HWY_HAVE_FLOAT16
3483
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
3484
+ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
3485
+ const Repartition<uint32_t, decltype(d)> d32;
3486
+ uint32_t buf;
3487
+ CopyBytes<4>(p, &buf);
3488
+ return BitCast(d, LoadU(d32, &buf));
3489
+ }
3490
+ #endif // HWY_HAVE_FLOAT16
3491
+ #if HWY_NEON_HAVE_BFLOAT16
3492
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
3212
3493
  HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
3213
3494
  const Repartition<uint32_t, decltype(d)> d32;
3214
3495
  uint32_t buf;
3215
3496
  CopyBytes<4>(p, &buf);
3216
3497
  return BitCast(d, LoadU(d32, &buf));
3217
3498
  }
3499
+ #endif // HWY_NEON_HAVE_BFLOAT16
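Note the pattern shared by the three 32-bit overloads above: CopyBytes<4> moves exactly four bytes into a uint32_t stack buffer before bit-casting, so a 4 x u8 or 2 x u16 partial load can never read past the caller's buffer. The same idea in standalone C++ (Load4Bytes is a hypothetical name for illustration):

    #include <cstdint>
    #include <cstring>
    // Reads exactly 4 bytes; the memcpy compiles down to one 32-bit load.
    inline uint32_t Load4Bytes(const void* from) {
      uint32_t bits;
      std::memcpy(&bits, from, sizeof(bits));
      return bits;
    }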

  // ------------------------------ Load 16

@@ -3228,6 +3510,18 @@ template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
  HWY_API VFromD<D> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_s16(p));
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_F16_D(D)>
+ HWY_API VFromD<D> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
+ return VFromD<D>(vld1_dup_f16(detail::NativeLanePointer(p)));
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_BF16_D(D)>
+ HWY_API VFromD<D> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
+ return VFromD<D>(vld1_dup_bf16(detail::NativeLanePointer(p)));
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16

  // 8-bit x2
  template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
@@ -3250,12 +3544,10 @@ HWY_API VFromD<D> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {

  // ------------------------------ Load misc

- // [b]float16_t may use the same Raw as uint16_t, so forward to that.
- template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
+ template <class D, HWY_NEON_IF_EMULATED_D(D)>
  HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
- const RebindToUnsigned<decltype(d)> du16;
- const auto pu16 = reinterpret_cast<const uint16_t*>(p);
- return BitCast(d, LoadU(du16, pu16));
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
  }

  // On Arm, Load is the same as LoadU.
@@ -3324,6 +3616,20 @@ HWY_API void StoreU(Vec128<int64_t> v, D /* tag */,
  int64_t* HWY_RESTRICT unaligned) {
  vst1q_s64(unaligned, v.raw);
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+ HWY_API void StoreU(Vec128<float16_t> v, D /* tag */,
+ float16_t* HWY_RESTRICT unaligned) {
+ vst1q_f16(detail::NativeLanePointer(unaligned), v.raw);
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
+ HWY_API void StoreU(Vec128<bfloat16_t> v, D /* tag */,
+ bfloat16_t* HWY_RESTRICT unaligned) {
+ vst1q_bf16(detail::NativeLanePointer(unaligned), v.raw);
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API void StoreU(Vec128<float> v, D /* tag */,
  float* HWY_RESTRICT unaligned) {
@@ -3371,6 +3677,20 @@ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
  HWY_API void StoreU(Vec64<int64_t> v, D /* tag */, int64_t* HWY_RESTRICT p) {
  vst1_s64(p, v.raw);
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+ HWY_API void StoreU(Vec64<float16_t> v, D /* tag */,
+ float16_t* HWY_RESTRICT p) {
+ vst1_f16(detail::NativeLanePointer(p), v.raw);
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
+ HWY_API void StoreU(Vec64<bfloat16_t> v, D /* tag */,
+ bfloat16_t* HWY_RESTRICT p) {
+ vst1_bf16(detail::NativeLanePointer(p), v.raw);
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
  HWY_API void StoreU(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
  vst1_f32(p, v.raw);
@@ -3397,28 +3717,31 @@ HWY_API void StoreU(Vec32<float> v, D, float* HWY_RESTRICT p) {
  vst1_lane_f32(p, v.raw, 0);
  }

- // Overload 16-bit types directly to avoid ambiguity with [b]float16_t.
- template <class D, HWY_IF_V_SIZE_D(D, 4), typename T = TFromD<D>,
- HWY_IF_T_SIZE(T, 1)>
- HWY_API void StoreU(Vec32<T> v, D d, T* HWY_RESTRICT p) {
+ // {u,i}{8,16}
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
+ HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
  }

- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
- HWY_API void StoreU(Vec32<uint16_t> v, D d, uint16_t* HWY_RESTRICT p) {
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
  }
-
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
- HWY_API void StoreU(Vec32<int16_t> v, D d, int16_t* HWY_RESTRICT p) {
+ #endif
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
  }
+ #endif // HWY_NEON_HAVE_BFLOAT16

  // ------------------------------ Store 16

@@ -3430,6 +3753,18 @@ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)>
  HWY_API void StoreU(Vec16<int16_t> v, D, int16_t* HWY_RESTRICT p) {
  vst1_lane_s16(p, v.raw, 0);
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_F16_D(D)>
+ HWY_API void StoreU(Vec16<float16_t> v, D, float16_t* HWY_RESTRICT p) {
+ vst1_lane_f16(detail::NativeLanePointer(p), v.raw, 0);
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_BF16_D(D)>
+ HWY_API void StoreU(Vec16<bfloat16_t> v, D, bfloat16_t* HWY_RESTRICT p) {
+ vst1_lane_bf16(detail::NativeLanePointer(p), v.raw, 0);
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16

  template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
  HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
@@ -3449,12 +3784,12 @@ HWY_API void StoreU(Vec128<int8_t, 1> v, D, int8_t* HWY_RESTRICT p) {
  vst1_lane_s8(p, v.raw, 0);
  }

- // [b]float16_t may use the same Raw as uint16_t, so forward to that.
- template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
+ // ------------------------------ Store misc
+
+ template <class D, HWY_NEON_IF_EMULATED_D(D)>
  HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
- const RebindToUnsigned<decltype(d)> du16;
- const auto pu16 = reinterpret_cast<uint16_t*>(p);
- return StoreU(BitCast(du16, v), du16, pu16);
+ const RebindToUnsigned<decltype(d)> du;
+ return StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
  }

  HWY_DIAGNOSTICS(push)
@@ -3541,24 +3876,6 @@ HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToUnsigned<D>> v) {
  return VFromD<D>(vcvt_f32_u32(v.raw));
  }

- // Truncates (rounds toward zero).
- template <class D, HWY_IF_I32_D(D)>
- HWY_API Vec128<int32_t> ConvertTo(D /* tag */, Vec128<float> v) {
- return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
- return VFromD<D>(vcvt_s32_f32(v.raw));
- }
- template <class D, HWY_IF_U32_D(D)>
- HWY_API Vec128<uint32_t> ConvertTo(D /* tag */, Vec128<float> v) {
- return Vec128<uint32_t>(vcvtq_u32_f32(ZeroIfNegative(v).raw));
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
- return VFromD<D>(vcvt_u32_f32(ZeroIfNegative(v).raw));
- }
-
  #if HWY_HAVE_FLOAT64

  template <class D, HWY_IF_F64_D(D)>
@@ -3590,38 +3907,156 @@ HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
  #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  }

+ #endif // HWY_HAVE_FLOAT64
+
+ namespace detail {
  // Truncates (rounds toward zero).
- template <class D, HWY_IF_I64_D(D)>
- HWY_API Vec128<int64_t> ConvertTo(D /* tag */, Vec128<double> v) {
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
+ HWY_INLINE Vec128<int32_t> ConvertFToI(D /* tag */, Vec128<float> v) {
+ #if HWY_COMPILER_CLANG && \
+ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+ // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+ // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+ // outside of the range of an int32_t.
+
+ int32x4_t raw_result;
+ __asm__(
+ #if HWY_ARCH_ARM_A64
+ "fcvtzs %0.4s, %1.4s"
+ #else
+ "vcvt.s32.f32 %0, %1"
+ #endif
+ : "=w"(raw_result)
+ : "w"(v.raw));
+ return Vec128<int32_t>(raw_result);
+ #else
+ return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
+ HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<RebindToFloat<D>> v) {
+ #if HWY_COMPILER_CLANG && \
+ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+ // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+ // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+ // outside of the range of an int32_t.
+
+ int32x2_t raw_result;
+ __asm__(
+ #if HWY_ARCH_ARM_A64
+ "fcvtzs %0.2s, %1.2s"
+ #else
+ "vcvt.s32.f32 %0, %1"
+ #endif
+ : "=w"(raw_result)
+ : "w"(v.raw));
+ return VFromD<D>(raw_result);
+ #else
+ return VFromD<D>(vcvt_s32_f32(v.raw));
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
+ HWY_INLINE Vec128<uint32_t> ConvertFToU(D /* tag */, Vec128<float> v) {
+ #if HWY_COMPILER_CLANG && \
+ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+ // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+ // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+ // outside of the range of a uint32_t.
+
+ uint32x4_t raw_result;
+ __asm__(
+ #if HWY_ARCH_ARM_A64
+ "fcvtzu %0.4s, %1.4s"
+ #else
+ "vcvt.u32.f32 %0, %1"
+ #endif
+ : "=w"(raw_result)
+ : "w"(v.raw));
+ return Vec128<uint32_t>(raw_result);
+ #else
+ return Vec128<uint32_t>(vcvtq_u32_f32(v.raw));
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
+ HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<RebindToFloat<D>> v) {
+ #if HWY_COMPILER_CLANG && \
+ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+ // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+ // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+ // outside of the range of a uint32_t.
+
+ uint32x2_t raw_result;
+ __asm__(
+ #if HWY_ARCH_ARM_A64
+ "fcvtzu %0.2s, %1.2s"
+ #else
+ "vcvt.u32.f32 %0, %1"
+ #endif
+ : "=w"(raw_result)
+ : "w"(v.raw));
+ return VFromD<D>(raw_result);
+ #else
+ return VFromD<D>(vcvt_u32_f32(v.raw));
+ #endif
+ }
+
+ #if HWY_HAVE_FLOAT64
+
+ // Truncates (rounds toward zero).
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
+ HWY_INLINE Vec128<int64_t> ConvertFToI(D /* tag */, Vec128<double> v) {
+ #if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
+ int64x2_t raw_result;
+ __asm__("fcvtzs %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
+ return Vec128<int64_t>(raw_result);
+ #else
  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
+ #endif
  }
- template <class D, HWY_IF_I64_D(D)>
- HWY_API Vec64<int64_t> ConvertTo(D di, Vec64<double> v) {
- // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. Use the
- // 128-bit version to avoid UB from casting double -> int64_t.
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
- const Full128<double> ddt;
- const Twice<decltype(di)> dit;
- return LowerHalf(di, ConvertTo(dit, Combine(ddt, v, v)));
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
+ HWY_INLINE Vec64<int64_t> ConvertFToI(D /* tag */, Vec64<double> v) {
+ #if HWY_ARCH_ARM_A64 && \
+ ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
+ // If compiling for AArch64 NEON with GCC 6 or earlier, use inline assembly to
+ // work around the missing vcvt_s64_f64 intrinsic.
+ int64x1_t raw_result;
+ __asm__("fcvtzs %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
+ return Vec64<int64_t>(raw_result);
  #else
- (void)di;
  return Vec64<int64_t>(vcvt_s64_f64(v.raw));
  #endif
  }
- template <class D, HWY_IF_U64_D(D)>
- HWY_API Vec128<uint64_t> ConvertTo(D /* tag */, Vec128<double> v) {
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
+ HWY_INLINE Vec128<uint64_t> ConvertFToU(D /* tag */, Vec128<double> v) {
+ #if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of a uint64_t.
+ uint64x2_t raw_result;
+ __asm__("fcvtzu %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
+ return Vec128<uint64_t>(raw_result);
+ #else
  return Vec128<uint64_t>(vcvtq_u64_f64(v.raw));
+ #endif
  }
- template <class D, HWY_IF_U64_D(D)>
- HWY_API Vec64<uint64_t> ConvertTo(D du, Vec64<double> v) {
- // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. Use the
- // 128-bit version to avoid UB from casting double -> uint64_t.
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
- const Full128<double> ddt;
- const Twice<decltype(du)> du_t;
- return LowerHalf(du, ConvertTo(du_t, Combine(ddt, v, v)));
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
+ HWY_INLINE Vec64<uint64_t> ConvertFToU(D /* tag */, Vec64<double> v) {
+ #if HWY_ARCH_ARM_A64 && \
+ ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of a uint64_t.
+
+ // Inline assembly is also used if compiling for AArch64 NEON with GCC 6 or
+ // earlier to work around the missing vcvt_u64_f64 intrinsic.
+ uint64x1_t raw_result;
+ __asm__("fcvtzu %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
+ return Vec64<uint64_t>(raw_result);
  #else
- (void)du;
  return Vec64<uint64_t>(vcvt_u64_f64(v.raw));
  #endif
  }
@@ -3631,25 +4066,76 @@ HWY_API Vec64<uint64_t> ConvertTo(D du, Vec64<double> v) {
  #if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16

  // Truncates (rounds toward zero).
- template <class D, HWY_IF_I16_D(D)>
- HWY_API Vec128<int16_t> ConvertTo(D /* tag */, Vec128<float16_t> v) {
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
+ HWY_INLINE Vec128<int16_t> ConvertFToI(D /* tag */, Vec128<float16_t> v) {
+ #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
+ int16x8_t raw_result;
+ __asm__("fcvtzs %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
+ return Vec128<int16_t>(raw_result);
+ #else
  return Vec128<int16_t>(vcvtq_s16_f16(v.raw));
+ #endif
  }
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+ HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+ #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
+ int16x4_t raw_result;
+ __asm__("fcvtzs %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
+ return VFromD<D>(raw_result);
+ #else
  return VFromD<D>(vcvt_s16_f16(v.raw));
+ #endif
  }

- template <class D, HWY_IF_U16_D(D)>
- HWY_API Vec128<uint16_t> ConvertTo(D /* tag */, Vec128<float16_t> v) {
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
+ HWY_INLINE Vec128<uint16_t> ConvertFToU(D /* tag */, Vec128<float16_t> v) {
+ #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of a uint16_t.
+ uint16x8_t raw_result;
+ __asm__("fcvtzu %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
+ return Vec128<uint16_t>(raw_result);
+ #else
  return Vec128<uint16_t>(vcvtq_u16_f16(v.raw));
+ #endif
  }
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+ HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+ #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of a uint16_t.
+ uint16x4_t raw_result;
+ __asm__("fcvtzu %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
+ return VFromD<D>(raw_result);
+ #else
  return VFromD<D>(vcvt_u16_f16(v.raw));
+ #endif
  }

  #endif // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
+ } // namespace detail
+
+ template <class D, HWY_IF_SIGNED_D(D),
+ HWY_IF_T_SIZE_ONE_OF_D(
+ D, (1 << 4) |
+ ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
+ (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
+ HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
+ return detail::ConvertFToI(di, v);
+ }
+
+ template <class D, HWY_IF_UNSIGNED_D(D),
+ HWY_IF_T_SIZE_ONE_OF_D(
+ D, (1 << 4) |
+ ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
+ (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
+ HWY_API VFromD<D> ConvertTo(D du, VFromD<RebindToFloat<D>> v) {
+ return detail::ConvertFToU(du, v);
+ }
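Usage note: these two wrappers rebuild the public ConvertTo overloads on top of the detail::ConvertFToI/ConvertFToU helpers above. Truncation toward zero is unchanged, and on AArch64 the underlying fcvtzs/fcvtzu instructions also saturate out-of-range inputs instead of triggering undefined behavior. Sketch:

    hn::Full128<float> df;
    const hn::RebindToSigned<decltype(df)> di;
    const auto t = hn::ConvertTo(di, hn::Set(df, -3.9f));  // -3 in every lane
    // With saturating fcvtzs semantics, 3e9f clamps to 2147483647.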

  // ------------------------------ PromoteTo (ConvertTo)

@@ -3782,7 +4268,7 @@ HWY_API VFromD<D> PromoteTo(D d, V v) {
  return PromoteTo(d, PromoteTo(di32, v));
  }

- #if HWY_NEON_HAVE_FLOAT16C
+ #if HWY_NEON_HAVE_F16C

  // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
  #ifdef HWY_NATIVE_F16C
@@ -3800,7 +4286,7 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
  return VFromD<D>(vget_low_f32(vcvt_f32_f16(v.raw)));
  }

- #endif // HWY_NEON_HAVE_FLOAT16C
+ #endif // HWY_NEON_HAVE_F16C

  #if HWY_HAVE_FLOAT64

@@ -3946,14 +4432,14 @@ HWY_API Vec128<int64_t> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
  return Vec128<int64_t>(vmovl_high_s32(v.raw));
  }

- #if HWY_NEON_HAVE_FLOAT16C
+ #if HWY_NEON_HAVE_F16C

  template <class D, HWY_IF_F32_D(D)>
  HWY_API Vec128<float> PromoteUpperTo(D /* tag */, Vec128<float16_t> v) {
  return Vec128<float>(vcvt_high_f32_f16(v.raw));
  }

- #endif // HWY_NEON_HAVE_FLOAT16C
+ #endif // HWY_NEON_HAVE_F16C

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) {
@@ -4149,7 +4635,7 @@ HWY_API VFromD<D> DemoteTo(D d, Vec64<uint64_t> v) {
  return DemoteTo(d, DemoteTo(du32, v));
  }

- #if HWY_NEON_HAVE_FLOAT16C
+ #if HWY_NEON_HAVE_F16C

  // We already toggled HWY_NATIVE_F16C above.

@@ -4162,7 +4648,7 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>(vcvt_f16_f32(vcombine_f32(v.raw, v.raw)));
  }

- #endif // HWY_NEON_HAVE_FLOAT16C
+ #endif // HWY_NEON_HAVE_F16C

  template <class D, HWY_IF_BF16_D(D)>
  HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
@@ -4184,32 +4670,10 @@ HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
  }

- template <class D, HWY_IF_I32_D(D)>
- HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<double> v) {
- const int64x2_t i64 = vcvtq_s64_f64(v.raw);
- return Vec64<int32_t>(vqmovn_s64(i64));
- }
- template <class D, HWY_IF_I32_D(D)>
- HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<double> v) {
- // There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the
- // f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4.
- const Full128<double> ddt;
- const Full128<int64_t> dit;
- return Vec32<int32_t>(vqmovn_s64(ConvertTo(dit, Combine(ddt, v, v)).raw));
- }
-
- template <class D, HWY_IF_U32_D(D)>
- HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<double> v) {
- const uint64x2_t u64 = vcvtq_u64_f64(v.raw);
- return Vec64<uint32_t>(vqmovn_u64(u64));
- }
- template <class D, HWY_IF_U32_D(D)>
- HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<double> v) {
- // There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the
- // f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4.
- const Full128<double> ddt;
- const Full128<uint64_t> du_t;
- return Vec32<uint32_t>(vqmovn_u64(ConvertTo(du_t, Combine(ddt, v, v)).raw));
+ template <class D, HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> DemoteTo(D d32, VFromD<Rebind<double, D>> v) {
+ const Rebind<MakeWide<TFromD<D>>, D> d64;
+ return DemoteTo(d32, ConvertTo(d64, v));
  }
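Usage note: the replacement funnels signed and unsigned f64 to 32-bit demotions through one path: ConvertTo to the same-width integer, then the existing saturating 64 to 32 narrowing. Sketch, assuming HWY_HAVE_FLOAT64:

    hn::Full128<double> dd;
    const hn::Rebind<int32_t, decltype(dd)> di32;
    // 1e12 exceeds int32_t; the narrowing step saturates to 2147483647.
    const auto r = hn::DemoteTo(di32, hn::Set(dd, 1e12));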

  #endif // HWY_HAVE_FLOAT64
@@ -4466,30 +4930,6 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
  }

- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const RebindToSigned<decltype(d)> di;
- const VFromD<decltype(di)> vi = BitCast(di, v);
- // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
- return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
- }
-
- // Returns whether normal/subnormal/zero.
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
- const VFromD<decltype(du)> vu = BitCast(du, v);
- // 'Shift left' to clear the sign bit, then right so we can compare with the
- // max exponent (cannot compare with MaxExponentTimes2 directly because it is
- // negative and non-negative floats would be greater).
- const VFromD<decltype(di)> exp =
- BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
- return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
- }
-
  // ================================================== SWIZZLE

  // ------------------------------ LowerHalf
@@ -4749,7 +5189,7 @@ HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
  }
  #endif // HWY_HAVE_FLOAT64

- template <class D, HWY_IF_SPECIAL_FLOAT_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
+ template <class D, HWY_NEON_IF_EMULATED_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
  HWY_API VFromD<D> UpperHalf(D dh, VFromD<Twice<D>> v) {
  const RebindToUnsigned<Twice<decltype(dh)>> du;
  const Half<decltype(du)> duh;
@@ -5393,6 +5833,16 @@ HWY_API Vec128<T> InterleaveLower(Vec128<T> a, Vec128<T> b) {
  }
  #endif

+ #if !HWY_HAVE_FLOAT16
+ template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 4)>
+ HWY_API Vec128<float16_t, N> InterleaveLower(Vec128<float16_t, N> a,
+ Vec128<float16_t, N> b) {
+ const DFromV<decltype(a)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
+ }
+ #endif // !HWY_HAVE_FLOAT16
+
  // < 64 bit parts
  template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
  HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
@@ -6266,6 +6716,23 @@ namespace detail {
  // There is no vuzpq_u64.
  HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatEven, vuzp1, _, 2)
  HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatOdd, vuzp2, _, 2)
+
+ #if !HWY_HAVE_FLOAT16
+ template <size_t N>
+ HWY_INLINE Vec128<float16_t, N> ConcatEven(Vec128<float16_t, N> hi,
+ Vec128<float16_t, N> lo) {
+ const DFromV<decltype(hi)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo)));
+ }
+ template <size_t N>
+ HWY_INLINE Vec128<float16_t, N> ConcatOdd(Vec128<float16_t, N> hi,
+ Vec128<float16_t, N> lo) {
+ const DFromV<decltype(hi)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, ConcatOdd(BitCast(du, hi), BitCast(du, lo)));
+ }
+ #endif // !HWY_HAVE_FLOAT16
  } // namespace detail

  // Full/half vector
@@ -7045,44 +7512,19 @@ HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {

  // ------------------------------ Reductions

- namespace detail {
-
- // N=1 for any T: no-op
- template <typename T>
- HWY_INLINE T ReduceMin(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
- return GetLane(v);
- }
- template <typename T>
- HWY_INLINE T ReduceMax(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
- return GetLane(v);
- }
- template <typename T>
- HWY_INLINE T ReduceSum(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
- return GetLane(v);
- }
- template <typename T>
- HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
- Vec128<T, 1> v) {
- return v;
- }
- template <typename T>
- HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
- Vec128<T, 1> v) {
- return v;
- }
- template <typename T>
- HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
- Vec128<T, 1> v) {
- return v;
- }
-
- // full vectors
+ // On Armv8 we define ReduceSum and generic_ops defines SumOfLanes via Set.
  #if HWY_ARCH_ARM_A64

+ #ifdef HWY_NATIVE_REDUCE_SCALAR
+ #undef HWY_NATIVE_REDUCE_SCALAR
+ #else
+ #define HWY_NATIVE_REDUCE_SCALAR
+ #endif
+
  // TODO(janwas): use normal HWY_NEON_DEF, then FULL type list.
  #define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \
- HWY_API type##_t name(hwy::SizeTag<sizeof(type##_t)>, \
- Vec128<type##_t, size> v) { \
+ template <class D, HWY_IF_LANES_D(D, size)> \
+ HWY_API type##_t name(D /* tag */, Vec128<type##_t, size> v) { \
  return HWY_NEON_EVAL(prefix##infix##suffix, v.raw); \
  }

@@ -7125,83 +7567,110 @@ HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv)
  HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv)
  HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv)

+ // Emulate missing UI64 and partial N=2.
+ template <class D, HWY_IF_LANES_D(D, 2),
+ HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
+ HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v10) {
+ return GetLane(v10) + ExtractLane(v10, 1);
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
+ HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
+ HWY_API TFromD<D> ReduceMin(D /* tag */, VFromD<D> v10) {
+ return HWY_MIN(GetLane(v10), ExtractLane(v10, 1));
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
+ HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
+ HWY_API TFromD<D> ReduceMax(D /* tag */, VFromD<D> v10) {
+ return HWY_MAX(GetLane(v10), ExtractLane(v10, 1));
+ }
+
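Usage note: these overloads cover two-lane vectors, which the vaddv/vminv/vmaxv-based macros above do not handle; two scalar lane reads are cheaper than widening to a supported shape. The call sites look like any other reduction:

    hn::FixedTag<uint16_t, 2> d2;
    const uint16_t lanes[2] = {7, 9};
    const auto v = hn::LoadU(d2, lanes);
    const uint16_t sum = hn::ReduceSum(d2, v);  // 16
    const uint16_t mn = hn::ReduceMin(d2, v);   // 7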
  #if HWY_HAVE_FLOAT16
- HWY_API float16_t ReduceSum(hwy::SizeTag<2>, Vec64<float16_t> v) {
+ template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
+ HWY_API float16_t ReduceMin(D d, VFromD<D> v10) {
+ return GetLane(Min(v10, Reverse2(d, v10)));
+ }
+
+ template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
+ HWY_API float16_t ReduceMax(D d, VFromD<D> v10) {
+ return GetLane(Max(v10, Reverse2(d, v10)));
+ }
+
+ template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 8)>
+ HWY_API float16_t ReduceSum(D /* tag */, VFromD<D> v) {
  const float16x4_t x2 = vpadd_f16(v.raw, v.raw);
- return GetLane(Vec64<float16_t>(vpadd_f16(x2, x2)));
+ return GetLane(VFromD<D>(vpadd_f16(x2, x2)));
  }
- HWY_API float16_t ReduceSum(hwy::SizeTag<2> tag, Vec128<float16_t> v) {
- return ReduceSum(tag, LowerHalf(Vec128<float16_t>(vpaddq_f16(v.raw, v.raw))));
+ template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
+ HWY_API float16_t ReduceSum(D d, VFromD<D> v) {
+ const Half<decltype(d)> dh;
+ return ReduceSum(dh, LowerHalf(dh, VFromD<D>(vpaddq_f16(v.raw, v.raw))));
  }
- #endif
+ #endif // HWY_HAVE_FLOAT16

  #undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
  #undef HWY_NEON_DEF_REDUCTION_F16
  #undef HWY_NEON_DEF_REDUCTION_UI64
  #undef HWY_NEON_DEF_REDUCTION

- // Need some fallback implementations for [ui]64x2 and [ui]16x2.
- #define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2)
- #define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, (1 << 8) | (1 << 2))
+ // ------------------------------ SumOfLanes

- // Implement Min/Max/SumOfLanes in terms of the corresponding reduction.
- template <size_t N, typename V>
- HWY_API V MinOfLanes(hwy::SizeTag<N> tag, V v) {
- return Set(DFromV<decltype(v)>(), ReduceMin(tag, v));
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
+ HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+ return Set(d, ReduceSum(d, v));
  }
- template <size_t N, typename V>
- HWY_API V MaxOfLanes(hwy::SizeTag<N> tag, V v) {
- return Set(DFromV<decltype(v)>(), ReduceMax(tag, v));
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
+ HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+ return Set(d, ReduceMin(d, v));
  }
- template <size_t N, typename V>
- HWY_API V SumOfLanes(hwy::SizeTag<N> tag, V v) {
- return Set(DFromV<decltype(v)>(), ReduceSum(tag, v));
+ template <class D, HWY_IF_LANES_GT_D(D, 1)>
+ HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+ return Set(d, ReduceMax(d, v));
  }

- #else
+ // On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane.
+ #else // !HWY_ARCH_ARM_A64
+
+ // Armv7 lacks N=2 and 8-bit x4, so enable generic versions of those.
+ #undef HWY_IF_SUM_OF_LANES_D
+ #define HWY_IF_SUM_OF_LANES_D(D) \
+ hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
+ (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
+ nullptr
+ #undef HWY_IF_MINMAX_OF_LANES_D
+ #define HWY_IF_MINMAX_OF_LANES_D(D) \
+ hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
+ (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
+ nullptr

  // For arm7, we implement reductions using a series of pairwise operations. This
  // produces the full vector result, so we express Reduce* in terms of *OfLanes.
  #define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
- #define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128<type##_t, size>
  #define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
- HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes( \
- hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) { \
+ template <class D, HWY_IF_LANES_D(D, size)> \
+ HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
+ Vec128<type##_t, size> v) { \
  HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
  if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
  if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
- return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size)(tmp); \
- } \
- HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)> tag, \
- Vec128<type##_t, size> v) { \
- return GetLane(name##OfLanes(tag, v)); \
+ return Vec128<type##_t, size>(tmp); \
  }

  // For the wide versions, the pairwise operations produce a half-length vector.
- // We produce that value with a Reduce*Vector helper method, and express Reduce*
- // and *OfLanes in terms of the helper.
+ // We produce that `tmp` and then Combine.
  #define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
  suffix) \
- HWY_API HWY_NEON_BUILD_TYPE_T(type, half) \
- Reduce##name##Vector(Vec128<type##_t, size> v) { \
+ template <class D, HWY_IF_LANES_D(D, size)> \
+ HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */, \
+ Vec128<type##_t, size> v) { \
  HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
  tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
  vget_low_##suffix(v.raw)); \
  if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
  if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
  if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
- return tmp; \
- } \
- HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)>, \
- Vec128<type##_t, size> v) { \
- const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \
- return HWY_NEON_EVAL(vget_lane_##suffix, tmp, 0); \
- } \
- HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes( \
- hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) { \
- const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \
- return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \
- type, size)(vcombine_##suffix(tmp, tmp)); \
+ return Vec128<type##_t, size>(vcombine_##suffix(tmp, tmp)); \
  }

  #define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
@@ -7227,56 +7696,22 @@ HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax)
  #undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
  #undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
  #undef HWY_NEON_DEF_PAIRWISE_REDUCTION
- #undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION
  #undef HWY_NEON_BUILD_TYPE_T

- // Need fallback min/max implementations for [ui]64x2 and [ui]16x2.
- #define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8)
- #define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8)
-
+ // GetLane(SumsOf4(v)) is more efficient on Armv7 NEON than the default
+ // N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h.
+ #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+ #undef HWY_NATIVE_REDUCE_SUM_4_UI8
+ #else
+ #define HWY_NATIVE_REDUCE_SUM_4_UI8
  #endif

- } // namespace detail
-
- // [ui]16/[ui]64: N=2 -- special case for pairs of very small or large lanes
- template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
- HWY_API Vec128<T, 2> SumOfLanes(D /* tag */, Vec128<T, 2> v10) {
- return v10 + Reverse2(Simd<T, 2, 0>(), v10);
- }
-
- template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
- HWY_API T ReduceSum(D d, Vec128<T, 2> v10) {
- return GetLane(SumOfLanes(d, v10));
- }
-
- template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
- HWY_API Vec128<T, 2> MinOfLanes(D /* tag */, Vec128<T, 2> v10) {
- return Min(v10, Reverse2(Simd<T, 2, 0>(), v10));
- }
- template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
- HWY_API Vec128<T, 2> MaxOfLanes(D /* tag */, Vec128<T, 2> v10) {
- return Max(v10, Reverse2(Simd<T, 2, 0>(), v10));
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
+ HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
+ return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
  }

- #undef HWY_IF_SUM_REDUCTION
- #undef HWY_IF_MINMAX_REDUCTION
-
- template <class D>
- HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
- return detail::SumOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
- }
- template <class D>
- HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
- return detail::ReduceSum(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
- }
- template <class D>
- HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
- return detail::MinOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
- }
- template <class D>
- HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
- return detail::MaxOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
- }
+ #endif // HWY_ARCH_ARM_A64

  // ------------------------------ LoadMaskBits (TestBit)

@@ -7345,6 +7780,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  return detail::LoadMaskBits(d, mask_bits);
  }

+ // ------------------------------ Dup128MaskFromMaskBits
+
+ template <class D>
+ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+ constexpr size_t kN = MaxLanes(d);
+ if (kN < 8) mask_bits &= (1u << kN) - 1;
+ return detail::LoadMaskBits(d, mask_bits);
+ }
+
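Usage note: Dup128MaskFromMaskBits turns the low bits of an integer into a lane mask, replicated per 128-bit block (a single block on this target); bit i of mask_bits selects lane i. Sketch:

    hn::Full64<uint16_t> d;  // 4 lanes, so only the low 4 bits are used
    const auto m = hn::Dup128MaskFromMaskBits(d, 0b0101u);  // lanes 0 and 2
    const auto v = hn::IfThenElseZero(m, hn::Set(d, 1));    // 1, 0, 1, 0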
  // ------------------------------ Mask

  namespace detail {
@@ -7674,7 +8118,7 @@ namespace detail {
  template <class D, HWY_IF_V_SIZE_D(D, 16)>
  HWY_INLINE Vec128<uint8_t> Load8Bytes(D /*tag*/, const uint8_t* bytes) {
  return Vec128<uint8_t>(vreinterpretq_u8_u64(
- vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
+ vld1q_dup_u64(HWY_RCAST_ALIGNED(const uint64_t*, bytes))));
  }

  // Load 8 bytes and return half-reg with N <= 8 bytes.
@@ -8287,9 +8731,8 @@ HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
  template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
  HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
  VFromD<D>& v0, VFromD<D>& v1) {
- auto raw = detail::LoadInterleaved2(
- reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned),
- detail::Tuple2<T, d.MaxLanes()>());
+ auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(unaligned),
+ detail::Tuple2<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  }
@@ -8301,9 +8744,8 @@ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
  // The smallest vector registers are 64-bits and we want space for two.
  alignas(16) T buf[2 * 8 / sizeof(T)] = {};
  CopyBytes<d.MaxBytes() * 2>(unaligned, buf);
- auto raw = detail::LoadInterleaved2(
- reinterpret_cast<const detail::NativeLaneType<T>*>(buf),
- detail::Tuple2<T, d.MaxLanes()>());
+ auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(buf),
+ detail::Tuple2<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  }
@@ -8315,12 +8757,8 @@ HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
  Vec128<T>& v1) {
  const Half<decltype(d)> dh;
  VFromD<decltype(dh)> v00, v10, v01, v11;
- LoadInterleaved2(
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned), v00,
- v10);
- LoadInterleaved2(
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 2),
- v01, v11);
+ LoadInterleaved2(dh, detail::NativeLanePointer(unaligned), v00, v10);
+ LoadInterleaved2(dh, detail::NativeLanePointer(unaligned + 2), v01, v11);
  v0 = Combine(d, v01, v00);
  v1 = Combine(d, v11, v10);
  }
@@ -8331,9 +8769,8 @@ HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
  template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
  HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
- auto raw = detail::LoadInterleaved3(
- reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned),
- detail::Tuple3<T, d.MaxLanes()>());
+ auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(unaligned),
+ detail::Tuple3<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  v2 = VFromD<D>(raw.val[2]);
@@ -8346,9 +8783,8 @@ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
  // The smallest vector registers are 64-bits and we want space for three.
  alignas(16) T buf[3 * 8 / sizeof(T)] = {};
  CopyBytes<d.MaxBytes() * 3>(unaligned, buf);
- auto raw = detail::LoadInterleaved3(
- reinterpret_cast<const detail::NativeLaneType<T>*>(buf),
- detail::Tuple3<T, d.MaxLanes()>());
+ auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(buf),
+ detail::Tuple3<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  v2 = VFromD<D>(raw.val[2]);
@@ -8361,12 +8797,8 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
  Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
  const Half<decltype(d)> dh;
  VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
- LoadInterleaved3(
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned), v00,
- v10, v20);
- LoadInterleaved3(
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 3),
- v01, v11, v21);
+ LoadInterleaved3(dh, detail::NativeLanePointer(unaligned), v00, v10, v20);
+ LoadInterleaved3(dh, detail::NativeLanePointer(unaligned + 3), v01, v11, v21);
  v0 = Combine(d, v01, v00);
  v1 = Combine(d, v11, v10);
  v2 = Combine(d, v21, v20);
@@ -8379,9 +8811,8 @@ template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
  HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
  VFromD<D>& v3) {
- auto raw = detail::LoadInterleaved4(
- reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned),
- detail::Tuple4<T, d.MaxLanes()>());
+ auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(unaligned),
+ detail::Tuple4<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  v2 = VFromD<D>(raw.val[2]);
@@ -8395,9 +8826,8 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
  VFromD<D>& v3) {
  alignas(16) T buf[4 * 8 / sizeof(T)] = {};
  CopyBytes<d.MaxBytes() * 4>(unaligned, buf);
- auto raw = detail::LoadInterleaved4(
- reinterpret_cast<const detail::NativeLaneType<T>*>(buf),
- detail::Tuple4<T, d.MaxLanes()>());
+ auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(buf),
+ detail::Tuple4<T, d.MaxLanes()>());
  v0 = VFromD<D>(raw.val[0]);
  v1 = VFromD<D>(raw.val[1]);
  v2 = VFromD<D>(raw.val[2]);
@@ -8412,12 +8842,10 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
  Vec128<T>& v3) {
  const Half<decltype(d)> dh;
  VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
- LoadInterleaved4(
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned), v00,
- v10, v20, v30);
- LoadInterleaved4(
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 4),
- v01, v11, v21, v31);
+ LoadInterleaved4(dh, detail::NativeLanePointer(unaligned), v00, v10, v20,
+ v30);
+ LoadInterleaved4(dh, detail::NativeLanePointer(unaligned + 4), v01, v11, v21,
+ v31);
  v0 = Combine(d, v01, v00);
  v1 = Combine(d, v11, v10);
  v2 = Combine(d, v21, v20);
@@ -8476,8 +8904,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
  HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
  T* HWY_RESTRICT unaligned) {
  detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
- detail::StoreInterleaved2(
- tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
+ detail::StoreInterleaved2(tup, detail::NativeLanePointer(unaligned));
  }

  // <= 32 bits: avoid writing more than N bytes by copying to buffer
@@ -8486,8 +8913,7 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
  T* HWY_RESTRICT unaligned) {
  alignas(16) T buf[2 * 8 / sizeof(T)];
  detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
- detail::StoreInterleaved2(tup,
- reinterpret_cast<detail::NativeLaneType<T>*>(buf));
+ detail::StoreInterleaved2(tup, detail::NativeLanePointer(buf));
  CopyBytes<d.MaxBytes() * 2>(buf, unaligned);
  }

@@ -8498,10 +8924,9 @@ HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d,
  T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
  StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh,
- reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
- StoreInterleaved2(
- UpperHalf(dh, v0), UpperHalf(dh, v1), dh,
- reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 2));
+ detail::NativeLanePointer(unaligned));
+ StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh,
+ detail::NativeLanePointer(unaligned + 2));
  }
  #endif // HWY_ARCH_ARM_V7

@@ -8511,8 +8936,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
  HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
  T* HWY_RESTRICT unaligned) {
  detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
- detail::StoreInterleaved3(
- tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
+ detail::StoreInterleaved3(tup, detail::NativeLanePointer(unaligned));
  }

  // <= 32 bits: avoid writing more than N bytes by copying to buffer
@@ -8521,8 +8945,7 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
  T* HWY_RESTRICT unaligned) {
  alignas(16) T buf[3 * 8 / sizeof(T)];
  detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
- detail::StoreInterleaved3(tup,
- reinterpret_cast<detail::NativeLaneType<T>*>(buf));
+ detail::StoreInterleaved3(tup, detail::NativeLanePointer(buf));
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
  }

@@ -8533,10 +8956,9 @@ HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
  T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
  StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
- reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
- StoreInterleaved3(
- UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
- reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 3));
+ detail::NativeLanePointer(unaligned));
+ StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
+ detail::NativeLanePointer(unaligned + 3));
  }
  #endif // HWY_ARCH_ARM_V7

@@ -8546,8 +8968,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
  HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
  VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
  detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
- detail::StoreInterleaved4(
- tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
+ detail::StoreInterleaved4(tup, detail::NativeLanePointer(unaligned));
  }

  // <= 32 bits: avoid writing more than N bytes by copying to buffer
@@ -8556,8 +8977,7 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
  VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
  alignas(16) T buf[4 * 8 / sizeof(T)];
  detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
- detail::StoreInterleaved4(tup,
- reinterpret_cast<detail::NativeLaneType<T>*>(buf));
+ detail::StoreInterleaved4(tup, detail::NativeLanePointer(buf));
  CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
  }

@@ -8569,11 +8989,10 @@ HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
  const Half<decltype(d)> dh;
  StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
  LowerHalf(dh, v3), dh,
- reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
- StoreInterleaved4(
- UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
- UpperHalf(dh, v3), dh,
- reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 4));
+ detail::NativeLanePointer(unaligned));
+ StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
+ UpperHalf(dh, v3), dh,
+ detail::NativeLanePointer(unaligned + 4));
  }
  #endif // HWY_ARCH_ARM_V7

@@ -8904,7 +9323,7 @@ namespace detail { // for code folding
  #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
  #undef HWY_NEON_DEF_FUNCTION_UINTS
  #undef HWY_NEON_EVAL
-
+ #undef HWY_NEON_IF_EMULATED_D
  } // namespace detail

  // NOLINTNEXTLINE(google-readability-namespace-comments)