@img/sharp-libvips-dev 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. package/README.md +1 -2
  2. package/include/aom/aom_decoder.h +1 -1
  3. package/include/aom/aom_encoder.h +7 -1
  4. package/include/aom/aom_image.h +24 -12
  5. package/include/aom/aom_integer.h +3 -3
  6. package/include/aom/aomcx.h +15 -0
  7. package/include/aom/aomdx.h +5 -2
  8. package/include/archive.h +7 -5
  9. package/include/archive_entry.h +5 -3
  10. package/include/cgif.h +3 -0
  11. package/include/expat.h +21 -10
  12. package/include/expat_config.h +11 -5
  13. package/include/ffi.h +12 -25
  14. package/include/freetype2/freetype/config/ftoption.h +2 -2
  15. package/include/fribidi/fribidi-config.h +2 -2
  16. package/include/fribidi/fribidi-unicode-version.h +3 -3
  17. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  18. package/include/glib-2.0/gio/gappinfo.h +40 -25
  19. package/include/glib-2.0/gio/gapplication.h +6 -0
  20. package/include/glib-2.0/gio/gasyncresult.h +1 -1
  21. package/include/glib-2.0/gio/gconverter.h +5 -0
  22. package/include/glib-2.0/gio/gdbusintrospection.h +1 -1
  23. package/include/glib-2.0/gio/gfile.h +16 -0
  24. package/include/glib-2.0/gio/gio-visibility.h +34 -0
  25. package/include/glib-2.0/gio/giotypes.h +0 -1
  26. package/include/glib-2.0/gio/gsettings.h +8 -0
  27. package/include/glib-2.0/gio/gvfs.h +2 -2
  28. package/include/glib-2.0/girepository/gi-visibility.h +34 -0
  29. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  30. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  31. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  32. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  33. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  34. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  35. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  36. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  37. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  38. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  39. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  40. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  41. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  42. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  43. package/include/glib-2.0/girepository/girepository.h +53 -62
  44. package/include/glib-2.0/girepository/girffi.h +8 -7
  45. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  46. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  47. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  48. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  49. package/include/glib-2.0/girepository/gitypes.h +52 -104
  50. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  51. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  52. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  53. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  54. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  55. package/include/glib-2.0/glib/gbitlock.h +31 -0
  56. package/include/glib-2.0/glib/gbookmarkfile.h +1 -1
  57. package/include/glib-2.0/glib/giochannel.h +2 -2
  58. package/include/glib-2.0/glib/glib-visibility.h +34 -0
  59. package/include/glib-2.0/glib/gmacros.h +12 -5
  60. package/include/glib-2.0/glib/gmain.h +93 -7
  61. package/include/glib-2.0/glib/gmessages.h +8 -0
  62. package/include/glib-2.0/glib/gqsort.h +8 -1
  63. package/include/glib-2.0/glib/gslice.h +2 -0
  64. package/include/glib-2.0/glib/gstrfuncs.h +24 -30
  65. package/include/glib-2.0/glib/gstrvbuilder.h +3 -0
  66. package/include/glib-2.0/glib/gthread.h +191 -3
  67. package/include/glib-2.0/glib/gunicode.h +1 -1
  68. package/include/glib-2.0/glib/gversionmacros.h +9 -0
  69. package/include/glib-2.0/glib-unix.h +7 -1
  70. package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
  71. package/include/glib-2.0/gobject/genums.h +6 -6
  72. package/include/glib-2.0/gobject/glib-types.h +11 -0
  73. package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
  74. package/include/glib-2.0/gobject/gsignal.h +16 -6
  75. package/include/glib-2.0/gobject/gtype.h +6 -6
  76. package/include/harfbuzz/hb-buffer.h +6 -0
  77. package/include/harfbuzz/hb-common.h +6 -9
  78. package/include/harfbuzz/hb-cplusplus.hh +8 -11
  79. package/include/harfbuzz/hb-subset.h +17 -4
  80. package/include/harfbuzz/hb-version.h +3 -3
  81. package/include/hwy/abort.h +28 -0
  82. package/include/hwy/aligned_allocator.h +218 -6
  83. package/include/hwy/base.h +1935 -512
  84. package/include/hwy/cache_control.h +24 -6
  85. package/include/hwy/detect_compiler_arch.h +105 -10
  86. package/include/hwy/detect_targets.h +146 -37
  87. package/include/hwy/foreach_target.h +36 -1
  88. package/include/hwy/highway.h +222 -50
  89. package/include/hwy/ops/arm_neon-inl.h +2055 -894
  90. package/include/hwy/ops/arm_sve-inl.h +1476 -348
  91. package/include/hwy/ops/emu128-inl.h +711 -623
  92. package/include/hwy/ops/generic_ops-inl.h +4431 -2157
  93. package/include/hwy/ops/inside-inl.h +691 -0
  94. package/include/hwy/ops/ppc_vsx-inl.h +2186 -673
  95. package/include/hwy/ops/rvv-inl.h +1556 -536
  96. package/include/hwy/ops/scalar-inl.h +353 -233
  97. package/include/hwy/ops/set_macros-inl.h +171 -23
  98. package/include/hwy/ops/shared-inl.h +198 -56
  99. package/include/hwy/ops/wasm_128-inl.h +283 -244
  100. package/include/hwy/ops/x86_128-inl.h +3673 -1357
  101. package/include/hwy/ops/x86_256-inl.h +1737 -663
  102. package/include/hwy/ops/x86_512-inl.h +1697 -500
  103. package/include/hwy/per_target.h +4 -0
  104. package/include/hwy/profiler.h +648 -0
  105. package/include/hwy/robust_statistics.h +2 -2
  106. package/include/hwy/targets.h +40 -32
  107. package/include/hwy/timer-inl.h +3 -3
  108. package/include/hwy/timer.h +16 -1
  109. package/include/libheif/heif.h +170 -15
  110. package/include/libheif/heif_items.h +237 -0
  111. package/include/libheif/heif_properties.h +38 -2
  112. package/include/libheif/heif_regions.h +1 -1
  113. package/include/libheif/heif_version.h +2 -2
  114. package/include/libpng16/png.h +32 -29
  115. package/include/libpng16/pngconf.h +2 -2
  116. package/include/libpng16/pnglibconf.h +8 -3
  117. package/include/librsvg-2.0/librsvg/rsvg-cairo.h +1 -1
  118. package/include/librsvg-2.0/librsvg/rsvg-features.h +3 -4
  119. package/include/librsvg-2.0/librsvg/rsvg-pixbuf.h +235 -0
  120. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  121. package/include/librsvg-2.0/librsvg/rsvg.h +55 -176
  122. package/include/libxml2/libxml/HTMLparser.h +12 -19
  123. package/include/libxml2/libxml/c14n.h +1 -12
  124. package/include/libxml2/libxml/debugXML.h +1 -1
  125. package/include/libxml2/libxml/encoding.h +9 -0
  126. package/include/libxml2/libxml/entities.h +12 -1
  127. package/include/libxml2/libxml/hash.h +19 -0
  128. package/include/libxml2/libxml/list.h +2 -2
  129. package/include/libxml2/libxml/nanohttp.h +17 -0
  130. package/include/libxml2/libxml/parser.h +73 -58
  131. package/include/libxml2/libxml/parserInternals.h +9 -1
  132. package/include/libxml2/libxml/pattern.h +6 -0
  133. package/include/libxml2/libxml/tree.h +32 -12
  134. package/include/libxml2/libxml/uri.h +11 -0
  135. package/include/libxml2/libxml/valid.h +29 -2
  136. package/include/libxml2/libxml/xinclude.h +7 -0
  137. package/include/libxml2/libxml/xmlIO.h +21 -5
  138. package/include/libxml2/libxml/xmlerror.h +14 -0
  139. package/include/libxml2/libxml/xmlexports.h +111 -15
  140. package/include/libxml2/libxml/xmlmemory.h +8 -45
  141. package/include/libxml2/libxml/xmlreader.h +2 -0
  142. package/include/libxml2/libxml/xmlsave.h +5 -0
  143. package/include/libxml2/libxml/xmlunicode.h +165 -1
  144. package/include/libxml2/libxml/xmlversion.h +15 -179
  145. package/include/libxml2/libxml/xmlwriter.h +1 -0
  146. package/include/libxml2/libxml/xpath.h +4 -0
  147. package/include/pango-1.0/pango/pango-features.h +2 -2
  148. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  149. package/include/pango-1.0/pango/pango-item.h +4 -2
  150. package/include/pango-1.0/pango/pango-version-macros.h +25 -0
  151. package/include/pango-1.0/pango/pangofc-font.h +2 -1
  152. package/include/pixman-1/pixman-version.h +2 -2
  153. package/include/png.h +32 -29
  154. package/include/pngconf.h +2 -2
  155. package/include/pnglibconf.h +8 -3
  156. package/include/vips/connection.h +9 -3
  157. package/include/vips/util.h +1 -11
  158. package/include/vips/version.h +4 -4
  159. package/include/webp/decode.h +58 -56
  160. package/include/webp/demux.h +25 -21
  161. package/include/webp/encode.h +44 -39
  162. package/include/webp/mux.h +76 -15
  163. package/include/webp/mux_types.h +2 -1
  164. package/include/webp/sharpyuv/sharpyuv.h +77 -8
  165. package/include/webp/types.h +29 -8
  166. package/include/zconf.h +1 -1
  167. package/include/zlib.h +12 -12
  168. package/package.json +1 -1
  169. package/versions.json +18 -19
package/include/hwy/ops/arm_neon-inl.h

@@ -1,5 +1,7 @@
 // Copyright 2019 Google LLC
+// Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 // SPDX-License-Identifier: Apache-2.0
+// SPDX-License-Identifier: BSD-3-Clause
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,16 +23,12 @@

 #include "hwy/ops/shared-inl.h"

-HWY_BEFORE_NAMESPACE();
-
-// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
-// the same target attribute as our code, see #834.
 HWY_DIAGNOSTICS(push)
 HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
 #include <arm_neon.h>  // NOLINT(build/include_order)
 HWY_DIAGNOSTICS(pop)

-// Must come after arm_neon.h.
+HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {

@@ -143,12 +141,29 @@ namespace detail {  // for code folding and Raw128
   HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
   HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)

-#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+// Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
+#undef HWY_NEON_HAVE_BFLOAT16
+#if HWY_HAVE_SCALAR_BF16_TYPE &&                              \
+    ((HWY_TARGET == HWY_NEON_BF16 &&                          \
+      (!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
+     defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
 #define HWY_NEON_HAVE_BFLOAT16 1
 #else
 #define HWY_NEON_HAVE_BFLOAT16 0
 #endif

+// HWY_NEON_HAVE_F32_TO_BF16C is defined if NEON vcvt_bf16_f32 and
+// vbfdot_f32 are available, even if the __bf16 type is disabled due to
+// GCC/Clang bugs.
+#undef HWY_NEON_HAVE_F32_TO_BF16C
+#if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
+    (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) &&        \
+     (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
+#define HWY_NEON_HAVE_F32_TO_BF16C 1
+#else
+#define HWY_NEON_HAVE_F32_TO_BF16C 0
+#endif
+
 // bfloat16_t
 #if HWY_NEON_HAVE_BFLOAT16
 #define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
@@ -160,7 +175,7 @@ namespace detail {  // for code folding and Raw128
 #define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
 #endif

-// Used for conversion instructions if HWY_NEON_HAVE_FLOAT16C.
+// Used for conversion instructions if HWY_NEON_HAVE_F16C.
 #define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
                                                      args)                \
   HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args)    \
@@ -176,6 +191,24 @@ namespace detail {  // for code folding and Raw128
 #define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
 #endif

+// Enable generic functions for whichever of (f16, bf16) are not supported.
+#if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
+#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+#elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
+#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
+#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
+#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
+// NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
+// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
+// !hwy::IsSame<D, D>() is always false and as !hwy::IsSame<D, D>() will cause
+// SFINAE to occur instead of a hard error due to a dependency on the D template
+// argument
+#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
+#else
+#error "Logic error, handled all four cases"
+#endif
+
 // float
 #define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
   HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
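The four-way `HWY_NEON_IF_EMULATED_D` dispatch added in the hunk above ends with a deliberately never-true constraint, and the NOTE in the diff explains why it cannot simply be `EnableIf<false>`: only a dependent false condition is removed by SFINAE instead of being rejected outright at definition time. Below is a minimal standalone sketch of that idiom; the `EnableIf` and `IsSame` helpers are simplified stand-ins, not Highway's actual definitions.

    #include <iostream>
    #include <type_traits>

    // Simplified stand-ins for hwy::EnableIf / hwy::IsSame (illustration only).
    template <bool B>
    using EnableIf = typename std::enable_if<B>::type;
    template <class T, class U>
    constexpr bool IsSame() { return std::is_same<T, U>::value; }

    // A non-dependent EnableIf<false> here would be ill-formed as soon as the
    // template is defined. !IsSame<D, D>() is also always false, but it depends
    // on D, so the overload is silently discarded during substitution (SFINAE).
    template <class D, EnableIf<!IsSame<D, D>()>* = nullptr>
    const char* WhichOverload(D) { return "never selected"; }

    template <class D, EnableIf<IsSame<D, D>()>* = nullptr>
    const char* WhichOverload(D) { return "always selected"; }

    int main() { std::cout << WhichOverload(42) << "\n"; }  // always selected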
@@ -397,39 +430,6 @@ struct Tuple2<int64_t, N> {
   int64x1x2_t raw;
 };

-template <>
-struct Tuple2<float16_t, 8> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x8x2_t raw;
-#else
-  uint16x8x2_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple2<float16_t, N> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x4x2_t raw;
-#else
-  uint16x4x2_t raw;
-#endif
-};
-template <>
-struct Tuple2<bfloat16_t, 8> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x8x2_t raw;
-#else
-  uint16x8x2_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple2<bfloat16_t, N> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x4x2_t raw;
-#else
-  uint16x4x2_t raw;
-#endif
-};
-
 template <>
 struct Tuple2<float32_t, 4> {
   float32x4x2_t raw;
@@ -514,39 +514,6 @@ struct Tuple3<int64_t, N> {
   int64x1x3_t raw;
 };

-template <>
-struct Tuple3<float16_t, 8> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x8x3_t raw;
-#else
-  uint16x8x3_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple3<float16_t, N> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x4x3_t raw;
-#else
-  uint16x4x3_t raw;
-#endif
-};
-template <>
-struct Tuple3<bfloat16_t, 8> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x8x3_t raw;
-#else
-  uint16x8x3_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple3<bfloat16_t, N> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x4x3_t raw;
-#else
-  uint16x4x3_t raw;
-#endif
-};
-
 template <>
 struct Tuple3<float32_t, 4> {
   float32x4x3_t raw;
@@ -631,39 +598,6 @@ struct Tuple4<int64_t, N> {
   int64x1x4_t raw;
 };

-template <>
-struct Tuple4<float16_t, 8> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x8x4_t raw;
-#else
-  uint16x8x4_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple4<float16_t, N> {
-#if HWY_NEON_HAVE_FLOAT16C
-  float16x4x4_t raw;
-#else
-  uint16x4x4_t raw;
-#endif
-};
-template <>
-struct Tuple4<bfloat16_t, 8> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x8x4_t raw;
-#else
-  uint16x8x4_t raw;
-#endif
-};
-template <size_t N>
-struct Tuple4<bfloat16_t, N> {
-#if HWY_NEON_HAVE_BFLOAT16
-  bfloat16x4x4_t raw;
-#else
-  uint16x4x4_t raw;
-#endif
-};
-
 template <>
 struct Tuple4<float32_t, 4> {
   float32x4x4_t raw;
@@ -686,215 +620,213 @@ struct Tuple4<float64_t, N> {
 template <typename T, size_t N>
 struct Raw128;

-// 128
 template <>
 struct Raw128<uint8_t, 16> {
   using type = uint8x16_t;
 };
+template <size_t N>
+struct Raw128<uint8_t, N> {
+  using type = uint8x8_t;
+};

 template <>
 struct Raw128<uint16_t, 8> {
   using type = uint16x8_t;
 };
+template <size_t N>
+struct Raw128<uint16_t, N> {
+  using type = uint16x4_t;
+};

 template <>
 struct Raw128<uint32_t, 4> {
   using type = uint32x4_t;
 };
+template <size_t N>
+struct Raw128<uint32_t, N> {
+  using type = uint32x2_t;
+};

 template <>
 struct Raw128<uint64_t, 2> {
   using type = uint64x2_t;
 };
+template <>
+struct Raw128<uint64_t, 1> {
+  using type = uint64x1_t;
+};

 template <>
 struct Raw128<int8_t, 16> {
   using type = int8x16_t;
 };
+template <size_t N>
+struct Raw128<int8_t, N> {
+  using type = int8x8_t;
+};

 template <>
 struct Raw128<int16_t, 8> {
   using type = int16x8_t;
 };
+template <size_t N>
+struct Raw128<int16_t, N> {
+  using type = int16x4_t;
+};

 template <>
 struct Raw128<int32_t, 4> {
   using type = int32x4_t;
 };
+template <size_t N>
+struct Raw128<int32_t, N> {
+  using type = int32x2_t;
+};

 template <>
 struct Raw128<int64_t, 2> {
   using type = int64x2_t;
 };
-
-template <>
-struct Raw128<float16_t, 8> {
-#if HWY_NEON_HAVE_FLOAT16C
-  using type = float16x8_t;
-#else
-  using type = uint16x8_t;
-#endif
-};
-
 template <>
-struct Raw128<bfloat16_t, 8> {
-#if HWY_NEON_HAVE_BFLOAT16
-  using type = bfloat16x8_t;
-#else
-  using type = uint16x8_t;
-#endif
+struct Raw128<int64_t, 1> {
+  using type = int64x1_t;
 };

 template <>
 struct Raw128<float, 4> {
   using type = float32x4_t;
 };
+template <size_t N>
+struct Raw128<float, N> {
+  using type = float32x2_t;
+};

 #if HWY_HAVE_FLOAT64
 template <>
 struct Raw128<double, 2> {
   using type = float64x2_t;
 };
-#endif  // HWY_HAVE_FLOAT64
-
-// 64
 template <>
-struct Raw128<uint8_t, 8> {
-  using type = uint8x8_t;
+struct Raw128<double, 1> {
+  using type = float64x1_t;
 };
+#endif  // HWY_HAVE_FLOAT64

-template <>
-struct Raw128<uint16_t, 4> {
-  using type = uint16x4_t;
-};
+#if HWY_NEON_HAVE_F16C

 template <>
-struct Raw128<uint32_t, 2> {
-  using type = uint32x2_t;
+struct Tuple2<float16_t, 8> {
+  float16x8x2_t raw;
 };
-
-template <>
-struct Raw128<uint64_t, 1> {
-  using type = uint64x1_t;
+template <size_t N>
+struct Tuple2<float16_t, N> {
+  float16x4x2_t raw;
 };

 template <>
-struct Raw128<int8_t, 8> {
-  using type = int8x8_t;
+struct Tuple3<float16_t, 8> {
+  float16x8x3_t raw;
 };
-
-template <>
-struct Raw128<int16_t, 4> {
-  using type = int16x4_t;
+template <size_t N>
+struct Tuple3<float16_t, N> {
+  float16x4x3_t raw;
 };

 template <>
-struct Raw128<int32_t, 2> {
-  using type = int32x2_t;
+struct Tuple4<float16_t, 8> {
+  float16x8x4_t raw;
 };
-
-template <>
-struct Raw128<int64_t, 1> {
-  using type = int64x1_t;
+template <size_t N>
+struct Tuple4<float16_t, N> {
+  float16x4x4_t raw;
 };

 template <>
-struct Raw128<float16_t, 4> {
-#if HWY_NEON_HAVE_FLOAT16C
+struct Raw128<float16_t, 8> {
+  using type = float16x8_t;
+};
+template <size_t N>
+struct Raw128<float16_t, N> {
   using type = float16x4_t;
-#else
-  using type = uint16x4_t;
-#endif
 };

-template <>
-struct Raw128<bfloat16_t, 4> {
+#else  // !HWY_NEON_HAVE_F16C
+
+template <size_t N>
+struct Tuple2<float16_t, N> : public Tuple2<uint16_t, N> {};
+template <size_t N>
+struct Tuple3<float16_t, N> : public Tuple3<uint16_t, N> {};
+template <size_t N>
+struct Tuple4<float16_t, N> : public Tuple4<uint16_t, N> {};
+template <size_t N>
+struct Raw128<float16_t, N> : public Raw128<uint16_t, N> {};
+
+#endif  // HWY_NEON_HAVE_F16C
+
 #if HWY_NEON_HAVE_BFLOAT16
-  using type = bfloat16x4_t;
-#else
-  using type = uint16x4_t;
-#endif
-};

 template <>
-struct Raw128<float, 2> {
-  using type = float32x2_t;
+struct Tuple2<bfloat16_t, 8> {
+  bfloat16x8x2_t raw;
 };
-
-#if HWY_HAVE_FLOAT64
-template <>
-struct Raw128<double, 1> {
-  using type = float64x1_t;
+template <size_t N>
+struct Tuple2<bfloat16_t, N> {
+  bfloat16x4x2_t raw;
 };
-#endif  // HWY_HAVE_FLOAT64

-// 32 (same as 64)
 template <>
-struct Raw128<uint8_t, 4> : public Raw128<uint8_t, 8> {};
+struct Tuple3<bfloat16_t, 8> {
+  bfloat16x8x3_t raw;
+};
+template <size_t N>
+struct Tuple3<bfloat16_t, N> {
+  bfloat16x4x3_t raw;
+};

 template <>
-struct Raw128<uint16_t, 2> : public Raw128<uint16_t, 4> {};
+struct Tuple4<bfloat16_t, 8> {
+  bfloat16x8x4_t raw;
+};
+template <size_t N>
+struct Tuple4<bfloat16_t, N> {
+  bfloat16x4x4_t raw;
+};

 template <>
-struct Raw128<uint32_t, 1> : public Raw128<uint32_t, 2> {};
+struct Raw128<bfloat16_t, 8> {
+  using type = bfloat16x8_t;
+};
+template <size_t N>
+struct Raw128<bfloat16_t, N> {
+  using type = bfloat16x4_t;
+};

-template <>
-struct Raw128<int8_t, 4> : public Raw128<int8_t, 8> {};
+#else  // !HWY_NEON_HAVE_BFLOAT16

-template <>
-struct Raw128<int16_t, 2> : public Raw128<int16_t, 4> {};
+template <size_t N>
+struct Tuple2<bfloat16_t, N> : public Tuple2<uint16_t, N> {};
+template <size_t N>
+struct Tuple3<bfloat16_t, N> : public Tuple3<uint16_t, N> {};
+template <size_t N>
+struct Tuple4<bfloat16_t, N> : public Tuple4<uint16_t, N> {};
+template <size_t N>
+struct Raw128<bfloat16_t, N> : public Raw128<uint16_t, N> {};

-template <>
-struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {};
+#endif  // HWY_NEON_HAVE_BFLOAT16

-template <>
-struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {};
+}  // namespace detail

-template <>
-struct Raw128<bfloat16_t, 2> : public Raw128<bfloat16_t, 4> {};
+template <typename T, size_t N = 16 / sizeof(T)>
+class Vec128 {
+ public:
+  using Raw = typename detail::Raw128<T, N>::type;
+  using PrivateT = T;                     // only for DFromV
+  static constexpr size_t kPrivateN = N;  // only for DFromV

-template <>
-struct Raw128<float, 1> : public Raw128<float, 2> {};
-
-// 16 (same as 64)
-template <>
-struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {};
-
-template <>
-struct Raw128<uint16_t, 1> : public Raw128<uint16_t, 4> {};
-
-template <>
-struct Raw128<int8_t, 2> : public Raw128<int8_t, 8> {};
-
-template <>
-struct Raw128<int16_t, 1> : public Raw128<int16_t, 4> {};
-
-template <>
-struct Raw128<float16_t, 1> : public Raw128<float16_t, 4> {};
-
-template <>
-struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {};
-
-// 8 (same as 64)
-template <>
-struct Raw128<uint8_t, 1> : public Raw128<uint8_t, 8> {};
-
-template <>
-struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {};
-
-}  // namespace detail
-
-template <typename T, size_t N = 16 / sizeof(T)>
-class Vec128 {
- public:
-  using Raw = typename detail::Raw128<T, N>::type;
-  using PrivateT = T;                     // only for DFromV
-  static constexpr size_t kPrivateN = N;  // only for DFromV
-
-  HWY_INLINE Vec128() {}
-  Vec128(const Vec128&) = default;
-  Vec128& operator=(const Vec128&) = default;
-  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
+  HWY_INLINE Vec128() {}
+  Vec128(const Vec128&) = default;
+  Vec128& operator=(const Vec128&) = default;
+  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}

   // Compound assignment. Only usable if there is a corresponding non-member
   // binary operator overload. For example, only f32 and f64 support division.
@@ -910,6 +842,9 @@ class Vec128 {
   HWY_INLINE Vec128& operator-=(const Vec128 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec128& operator%=(const Vec128 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec128& operator&=(const Vec128 other) {
     return *this = (*this & other);
   }
@@ -978,26 +913,22 @@ namespace detail {
 #define HWY_NEON_BUILD_ARG_HWY_SET t

 HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)
-HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)
-#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_FLOAT16C
+#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C
 HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET)
 #endif
+HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)
+
+template <class D, HWY_NEON_IF_EMULATED_D(D)>
+HWY_API Vec128<TFromD<D>, MaxLanes(D())> NativeSet(D d, TFromD<D> t) {
+  const uint16_t tu = BitCastScalar<uint16_t>(t);
+  return Vec128<TFromD<D>, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
+}

 #undef HWY_NEON_BUILD_TPL_HWY_SET
 #undef HWY_NEON_BUILD_RET_HWY_SET
 #undef HWY_NEON_BUILD_PARAM_HWY_SET
 #undef HWY_NEON_BUILD_ARG_HWY_SET

-#if !HWY_NEON_HAVE_BFLOAT16
-// BF16: return u16.
-template <class D, HWY_IF_BF16_D(D)>
-HWY_API Vec128<bfloat16_t, MaxLanes(D())> NativeSet(D d, bfloat16_t t) {
-  uint16_t tu;
-  CopyBytes<sizeof(tu)>(&t, &tu);
-  return Vec128<bfloat16_t, d.MaxLanes()>(Set(RebindToUnsigned<D>(), tu).raw);
-}
-#endif  // !HWY_NEON_HAVE_BFLOAT16
-
 }  // namespace detail

 // Full vector. Cannot yet use VFromD because that is defined in terms of Set.
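The emulated `NativeSet` overload added above handles f16/bf16 lane types without native support by reinterpreting the scalar as `uint16_t` and splatting on the unsigned vector type. A minimal sketch of that bit-cast step; the `BF16` struct is a hypothetical storage-only stand-in, not Highway's type:

    #include <cstdint>
    #include <cstring>

    struct BF16 { uint16_t bits; };  // storage-only bfloat16 stand-in

    // memcpy-based type pun, the well-defined equivalent of BitCastScalar.
    template <class To, class From>
    To BitCastScalar(const From& from) {
      static_assert(sizeof(To) == sizeof(From), "size mismatch");
      To to;
      std::memcpy(&to, &from, sizeof(to));
      return to;
    }

    int main() {
      // bfloat16 1.0f is the top half of float 1.0f (0x3F800000) -> 0x3F80.
      const BF16 one{0x3F80};
      return BitCastScalar<uint16_t>(one) == 0x3F80 ? 0 : 1;
    }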
@@ -1033,165 +964,323 @@ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")

 template <class D>
 HWY_API VFromD<D> Undefined(D /*tag*/) {
+#if HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
+  return VFromD<D>{__builtin_nondeterministic_value(Zero(D()).raw)};
+#else
   VFromD<D> v;
   return v;
+#endif
 }

 HWY_DIAGNOSTICS(pop)

+#if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
 namespace detail {

-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+#pragma pack(push, 1)
+
+template <class T>
+struct alignas(8) Vec64ValsWrapper {
+  static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
+  static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
+  T vals[8 / sizeof(T)];
+};
+
+#pragma pack(pop)
+
+}  // namespace detail
+#endif  // !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> /*t8*/, TFromD<D> /*t9*/,
+                                      TFromD<D> /*t10*/, TFromD<D> /*t11*/,
+                                      TFromD<D> /*t12*/, TFromD<D> /*t13*/,
+                                      TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(8)));
-  constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
-  const VFromD<decltype(du)> vu8_iota0(reinterpret_cast<uint8x8_t>(kU8Iota0));
+  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccI8RawVectType raw = {
+      static_cast<int8_t>(t0), static_cast<int8_t>(t1), static_cast<int8_t>(t2),
+      static_cast<int8_t>(t3), static_cast<int8_t>(t4), static_cast<int8_t>(t5),
+      static_cast<int8_t>(t6), static_cast<int8_t>(t7)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(8) static constexpr uint8_t kU8Iota0[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  const VFromD<decltype(du)> vu8_iota0(
-      Load(Full64<TFromD<decltype(du)>>(), kU8Iota0).raw);
+  return ResizeBitCast(
+      d, Set(Full64<uint64_t>(),
+             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
+                 {t0, t1, t2, t3, t4, t5, t6, t7}})));
 #endif
-  return BitCast(d, vu8_iota0);
 }

-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3,
+                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
+                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7,
-                                         8, 9, 10, 11, 12, 13, 14, 15};
-  const VFromD<decltype(du)> vu8_iota0(reinterpret_cast<uint8x16_t>(kU8Iota0));
+  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccI16RawVectType raw = {
+      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+      static_cast<int16_t>(t2), static_cast<int16_t>(t3)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr uint8_t kU8Iota0[16] = {
-      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-  const auto vu8_iota0 = Load(du, kU8Iota0);
+  return ResizeBitCast(
+      d, Set(Full64<uint64_t>(),
+             BitCastScalar<uint64_t>(
+                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}})));
 #endif
-  return BitCast(d, vu8_iota0);
 }

-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  using T = TFromD<decltype(d)>;
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef detail::NativeLaneType<T> GccRawVectType
-      __attribute__((__vector_size__(8)));
-  constexpr GccRawVectType kIota0 = {T{0}, T{1}, T{2}, static_cast<T>(3)};
-  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(kIota0));
+  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccI32RawVectType raw = {static_cast<int32_t>(t0),
+                                 static_cast<int32_t>(t1)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(8) static constexpr T kIota0[4] = {T{0}, T{1}, T{2},
-                                             static_cast<T>(3)};
-  return Load(d, kIota0);
+  return ResizeBitCast(d,
+                       Set(Full64<uint64_t>(),
+                           BitCastScalar<uint64_t>(
+                               detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
 #endif
 }

-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  using T = TFromD<decltype(d)>;
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef detail::NativeLaneType<T> GccRawVectType
-      __attribute__((__vector_size__(16)));
-  constexpr GccRawVectType kIota0 = {T{0}, T{1}, T{2}, static_cast<T>(3),
-                                     T{4}, T{5}, T{6}, static_cast<T>(7)};
-  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(kIota0));
+  typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccF32RawVectType raw = {t0, t1};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr T kU16Iota0[8] = {
-      T{0}, T{1}, T{2}, static_cast<T>(3), T{4}, T{5}, T{6}, static_cast<T>(7)};
-  return Load(d, kIota0);
+  return ResizeBitCast(d,
+                       Set(Full64<uint64_t>(),
+                           BitCastScalar<uint64_t>(
+                               detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
 #endif
 }

-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_UI32_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
+  return Set(d, t0);
+}
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+                                      TFromD<D> t11, TFromD<D> t12,
+                                      TFromD<D> t13, TFromD<D> t14,
+                                      TFromD<D> t15) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8)));
-  constexpr GccU32RawVectType kU32Iota0 = {0, 1};
-  const VFromD<decltype(du)> vu32_iota0(
-      reinterpret_cast<uint32x2_t>(kU32Iota0));
+  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccI8RawVectType raw = {
+      static_cast<int8_t>(t0),  static_cast<int8_t>(t1),
+      static_cast<int8_t>(t2),  static_cast<int8_t>(t3),
+      static_cast<int8_t>(t4),  static_cast<int8_t>(t5),
+      static_cast<int8_t>(t6),  static_cast<int8_t>(t7),
+      static_cast<int8_t>(t8),  static_cast<int8_t>(t9),
+      static_cast<int8_t>(t10), static_cast<int8_t>(t11),
+      static_cast<int8_t>(t12), static_cast<int8_t>(t13),
+      static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(8) static constexpr uint32_t kU32Iota0[2] = {0, 1};
-  const VFromD<decltype(du)> vu32_iota0{
-      Load(Full64<TFromD<decltype(du)>>(), kU32Iota0).raw};
+  const Half<decltype(d)> dh;
+  return Combine(d,
+                 Dup128VecFromValues(dh, t8, t9, t10, t11, t12, t13, t14, t15,
+                                     t8, t9, t10, t11, t12, t13, t14, t15),
+                 Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1,
+                                     t2, t3, t4, t5, t6, t7));
 #endif
-  return BitCast(d, vu32_iota0);
 }

-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccU32RawVectType kU32Iota0 = {0, 1, 2, 3};
-  const VFromD<decltype(du)> vu32_iota0(
-      reinterpret_cast<uint32x4_t>(kU32Iota0));
+  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccI16RawVectType raw = {
+      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+      static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr uint32_t kU32Iota0[4] = {0, 1, 2, 3};
-  const auto vu32_iota0 = Load(du, kU32Iota0);
+  const Half<decltype(d)> dh;
+  return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7),
+                 Dup128VecFromValues(dh, t0, t1, t2, t3, t0, t1, t2, t3));
 #endif
-  return BitCast(d, vu32_iota0);
 }

-template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
-  constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f};
-  return VFromD<decltype(d)>(reinterpret_cast<float32x2_t>(kF32Iota0));
+  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccI32RawVectType raw = {
+      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(8) static constexpr float kF32Iota0[2] = {0.0f, 1.0f};
-  return VFromD<decltype(d)>{
-      Load(Full64<TFromD<decltype(d)>>(), kF32Iota0).raw};
+  const Half<decltype(d)> dh;
+  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
+                 Dup128VecFromValues(dh, t0, t1, t0, t1));
 #endif
 }

-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
   typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
-  return VFromD<decltype(d)>(reinterpret_cast<float32x4_t>(kF32Iota0));
+  (void)d;
+  const GccF32RawVectType raw = {t0, t1, t2, t3};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr float kF32Iota0[4] = {0.0f, 1.0f, 2.0f, 3.0f};
-  return Load(d, kF32Iota0);
+  const Half<decltype(d)> dh;
+  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
+                 Dup128VecFromValues(dh, t0, t1, t0, t1));
 #endif
 }

-template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 8)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  return Zero(d);
-}
-
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
-  const RebindToUnsigned<decltype(d)> du;
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
-  typedef uint64_t GccU64RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccU64RawVectType kU64Iota0 = {0, 1};
-  const VFromD<decltype(du)> vu64_iota0(
-      reinterpret_cast<uint64x2_t>(kU64Iota0));
+  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccI64RawVectType raw = {static_cast<int64_t>(t0),
+                                 static_cast<int64_t>(t1)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr uint64_t kU64Iota0[4] = {0, 1};
-  const auto vu64_iota0 = Load(du, kU64Iota0);
+  const Half<decltype(d)> dh;
+  return Combine(d, Set(dh, t1), Set(dh, t0));
 #endif
-  return BitCast(d, vu64_iota0);
 }

 #if HWY_HAVE_FLOAT64
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
-HWY_INLINE VFromD<D> Iota0(D d) {
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
   typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
-  constexpr GccF64RawVectType kF64Iota0 = {0.0, 1.0};
-  return VFromD<decltype(d)>(reinterpret_cast<float64x2_t>(kF64Iota0));
+  (void)d;
+  const GccF64RawVectType raw = {t0, t1};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
 #else
-  alignas(16) static constexpr double kF64Iota0[4] = {0.0, 1.0};
-  return Load(d, kF64Iota0);
+  const Half<decltype(d)> dh;
+  return Combine(d, Set(dh, t1), Set(dh, t0));
 #endif
 }
-#endif  // HWY_HAVE_FLOAT64
+#endif
+
+// Generic for all vector lengths
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d,
+                 Dup128VecFromValues(
+                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
+                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
+                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
+                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
+}
+
+#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3,
+                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
+                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
+  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8)));
+  (void)d;
+  const GccF16RawVectType raw = {
+      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
+      static_cast<__fp16>(t3)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
+}
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16)));
+  (void)d;
+  const GccF16RawVectType raw = {
+      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
+      static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5),
+      static_cast<__fp16>(t6), static_cast<__fp16>(t7)};
+  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
+}
+#else
+// Generic for all vector lengths if MSVC or !HWY_NEON_HAVE_F16C
+template <class D, HWY_IF_F16_D(D)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+                                      TFromD<D> t5, TFromD<D> t6,
+                                      TFromD<D> t7) {
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d,
+                 Dup128VecFromValues(
+                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
+                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
+                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
+                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
+}
+#endif  // (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
+
+namespace detail {
+
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  return Dup128VecFromValues(
+      d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4},
+      TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9},
+      TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14},
+      TFromD<D>{15});
+}
+
+template <class D, HWY_IF_UI16_D(D)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
+                             TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
+                             TFromD<D>{6}, TFromD<D>{7});
+}
+
+template <class D, HWY_IF_F16_D(D)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Dup128VecFromValues(du, uint16_t{0}, uint16_t{0x3C00},
+                                        uint16_t{0x4000}, uint16_t{0x4200},
+                                        uint16_t{0x4400}, uint16_t{0x4500},
+                                        uint16_t{0x4600}, uint16_t{0x4700}));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
+                             TFromD<D>{3});
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_INLINE VFromD<D> Iota0(D d) {
+  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1});
+}

 #if HWY_COMPILER_MSVC
 template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
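The f16 `Iota0` overload in the hunk above builds {0, 1, ..., 7} from raw IEEE binary16 bit patterns (0x3C00 is 1.0, 0x4000 is 2.0, 0x4400 is 4.0, and so on) so no runtime float conversion is needed. A small sketch deriving those constants; `F16FromSmallInt` is a hypothetical helper that only covers small exact integers, not a general float-to-half cast:

    #include <cstdint>

    // Encode a small non-negative integer as an IEEE binary16 bit pattern
    // (sign 0, 5 exponent bits with bias 15, 10 mantissa bits). Exact for the
    // values 1..15 used by Iota0-style tables.
    constexpr uint16_t F16FromSmallInt(int v) {
      if (v == 0) return 0;
      int exp = 0;
      while ((v >> (exp + 1)) != 0) ++exp;  // exp = floor(log2(v))
      const uint16_t mant =
          static_cast<uint16_t>((v - (1 << exp)) << (10 - exp));  // drop hidden bit
      return static_cast<uint16_t>(((exp + 15) << 10) | mant);
    }

    static_assert(F16FromSmallInt(1) == 0x3C00, "1.0");
    static_assert(F16FromSmallInt(2) == 0x4000, "2.0");
    static_assert(F16FromSmallInt(3) == 0x4200, "3.0");
    static_assert(F16FromSmallInt(7) == 0x4700, "7.0");

    int main() { return 0; }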
@@ -1226,9 +1315,6 @@ HWY_API VFromD<D> Iota(D d, const T2 first) {
 #endif
 }

-// ------------------------------ Tuple (VFromD)
-#include "hwy/ops/tuple-inl.h"
-
 // ------------------------------ Combine

 // Full result
@@ -1274,30 +1360,25 @@ HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi,
   return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
 }

-template <class D, HWY_IF_F16_D(D)>
-HWY_API Vec128<float16_t> Combine(D d, Vec64<float16_t> hi,
-                                  Vec64<float16_t> lo) {
 #if HWY_HAVE_FLOAT16
-  (void)d;
+template <class D, HWY_IF_F16_D(D)>
+HWY_API Vec128<float16_t> Combine(D, Vec64<float16_t> hi, Vec64<float16_t> lo) {
   return Vec128<float16_t>(vcombine_f16(lo.raw, hi.raw));
-#else
-  const RebindToUnsigned<D> du;
-  const Half<decltype(du)> duh;
-  return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
-#endif
 }
+#endif  // HWY_HAVE_FLOAT16

-template <class D, HWY_IF_BF16_D(D)>
-HWY_API Vec128<bfloat16_t> Combine(D d, Vec64<bfloat16_t> hi,
-                                   Vec64<bfloat16_t> lo) {
 #if HWY_NEON_HAVE_BFLOAT16
-  (void)d;
-  return Vec128<bfloat16_t>(vcombine_bf16(lo.raw, hi.raw));
-#else
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> Combine(D, Vec64<bfloat16_t> hi, Vec64<bfloat16_t> lo) {
+  return VFromD<D>(vcombine_bf16(lo.raw, hi.raw));
+}
+#endif  // HWY_NEON_HAVE_BFLOAT16
+
+template <class D, class DH = Half<D>, HWY_NEON_IF_EMULATED_D(D)>
+HWY_API VFromD<D> Combine(D d, VFromD<DH> hi, VFromD<DH> lo) {
   const RebindToUnsigned<D> du;
   const Half<decltype(du)> duh;
   return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
-#endif
 }

 template <class D, HWY_IF_F32_D(D)>
@@ -1341,7 +1422,7 @@ HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
 HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)

 #if !HWY_HAVE_FLOAT16
-#if HWY_NEON_HAVE_FLOAT16C
+#if HWY_NEON_HAVE_F16C
 HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
                                              HWY_CAST_TO_U8)
 #else
@@ -1349,7 +1430,7 @@ template <size_t N>
 HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
   return BitCastToByte(Vec128<uint16_t, N>(v.raw));
 }
-#endif  // HWY_NEON_HAVE_FLOAT16C
+#endif  // HWY_NEON_HAVE_F16C
 #endif  // !HWY_HAVE_FLOAT16

 #if !HWY_NEON_HAVE_BFLOAT16
@@ -1406,14 +1487,24 @@ HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
   return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
 }

+// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D d, VFromD<Repartition<uint8_t, D>> v) {
-#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_FLOAT16C
-  (void)d;
+HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
+#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
   return VFromD<D>(vreinterpret_f16_u8(v.raw));
 #else
   const RebindToUnsigned<D> du;
-  return VFromD<decltype(d)>(BitCastFromByte(du, v).raw);
+  return VFromD<D>(BitCastFromByte(du, v).raw);
+#endif
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
+#if HWY_NEON_HAVE_BFLOAT16
+  return VFromD<D>(vreinterpret_bf16_u8(v.raw));
+#else
+  const RebindToUnsigned<D> du;
+  return VFromD<D>(BitCastFromByte(du, v).raw);
 #endif
 }

@@ -1461,15 +1552,6 @@ HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
   return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
 }

-template <class D, HWY_IF_F16_D(D)>
-HWY_INLINE Vec128<float16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
-#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_FLOAT16C
-  return Vec128<float16_t>(vreinterpretq_f16_u8(v.raw));
-#else
-  return Vec128<float16_t>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
-#endif
-}
-
 template <class D, HWY_IF_F32_D(D)>
 HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
   return Vec128<float>(vreinterpretq_f32_u8(v.raw));
@@ -1482,11 +1564,23 @@ HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
 }
 #endif  // HWY_HAVE_FLOAT64

-// Special case for bfloat16_t, which may have the same Raw as uint16_t.
+// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
+template <class D, HWY_IF_F16_D(D)>
+HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
+#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
+  return VFromD<D>(vreinterpretq_f16_u8(v.raw));
+#else
+  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
+#endif
+}
+
 template <class D, HWY_IF_BF16_D(D)>
-HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
-                                     VFromD<Repartition<uint8_t, D>> v) {
+HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
+#if HWY_NEON_HAVE_BFLOAT16
+  return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
+#else
   return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
+#endif
 }

 }  // namespace detail
@@ -1542,6 +1636,14 @@ namespace detail {
 #define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane

 HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
+HWY_NEON_DEF_FUNCTION_BFLOAT_16(GetLane, vget, _lane_, HWY_GET)
+
+template <size_t kLane, class V, HWY_NEON_IF_EMULATED_D(DFromV<V>)>
+static HWY_INLINE HWY_MAYBE_UNUSED TFromV<V> GetLane(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCastScalar<TFromV<V>>(GetLane<kLane>(BitCast(du, v)));
+}

 #undef HWY_NEON_BUILD_TPL_HWY_GET
 #undef HWY_NEON_BUILD_RET_HWY_GET
@@ -1688,12 +1790,21 @@ namespace detail {
 #define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane

 HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
+HWY_NEON_DEF_FUNCTION_BFLOAT_16(InsertLane, vset, _lane_, HWY_INSERT)

 #undef HWY_NEON_BUILD_TPL_HWY_INSERT
 #undef HWY_NEON_BUILD_RET_HWY_INSERT
 #undef HWY_NEON_BUILD_PARAM_HWY_INSERT
 #undef HWY_NEON_BUILD_ARG_HWY_INSERT

+template <size_t kLane, class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
+HWY_API V InsertLane(const V v, TFromD<D> t) {
+  const D d;
+  const RebindToUnsigned<D> du;
+  const uint16_t tu = BitCastScalar<uint16_t>(t);
+  return BitCast(d, InsertLane<kLane>(BitCast(du, v), tu));
+}
+
 }  // namespace detail

 // Requires one overload per vector length because InsertLane<3> may be a
@@ -1842,6 +1953,89 @@ HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
1842
1953
  HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
1843
1954
  return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
1844
1955
  }
1956
+ HWY_API Vec128<int64_t> SumsOf8(const Vec128<int8_t> v) {
1957
+ return Vec128<int64_t>(vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(v.raw))));
1958
+ }
1959
+ HWY_API Vec64<int64_t> SumsOf8(const Vec64<int8_t> v) {
1960
+ return Vec64<int64_t>(vpaddl_s32(vpaddl_s16(vpaddl_s8(v.raw))));
1961
+ }
1962
+
1963
+ // ------------------------------ SumsOf2
1964
+ namespace detail {
1965
+
1966
+ template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
1967
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1968
+ hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
1969
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s8(v.raw));
1970
+ }
1971
+
1972
+ template <class V, HWY_IF_V_SIZE_V(V, 16)>
1973
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1974
+ hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
1975
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s8(v.raw));
1976
+ }
1977
+
1978
+ template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
1979
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1980
+ hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
1981
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u8(v.raw));
1982
+ }
1983
+
1984
+ template <class V, HWY_IF_V_SIZE_V(V, 16)>
1985
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1986
+ hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
1987
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u8(v.raw));
1988
+ }
1989
+
1990
+ template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
1991
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1992
+ hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
1993
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s16(v.raw));
1994
+ }
1995
+
1996
+ template <class V, HWY_IF_V_SIZE_V(V, 16)>
1997
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
1998
+ hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
1999
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s16(v.raw));
2000
+ }
2001
+
2002
+ template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
2003
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2004
+ hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
2005
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u16(v.raw));
2006
+ }
2007
+
2008
+ template <class V, HWY_IF_V_SIZE_V(V, 16)>
2009
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2010
+ hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
2011
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u16(v.raw));
2012
+ }
2013
+
2014
+ template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
2015
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2016
+ hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
2017
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s32(v.raw));
2018
+ }
2019
+
2020
+ template <class V, HWY_IF_V_SIZE_V(V, 16)>
2021
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2022
+ hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
2023
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s32(v.raw));
2024
+ }
2025
+
2026
+ template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
2027
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2028
+ hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
2029
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u32(v.raw));
2030
+ }
2031
+
2032
+ template <class V, HWY_IF_V_SIZE_V(V, 16)>
2033
+ HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
2034
+ hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
2035
+ return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u32(v.raw));
2036
+ }
2037
+
2038
+ } // namespace detail
1845
2039
 
1846
2040
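The new SumsOf2 overloads map directly onto NEON's pairwise widening add (the vpaddl family): each output lane is the widened sum of two adjacent input lanes. A scalar sketch of the u8 to u16 case as a sanity check; SumsOf2U8 is an illustrative name, not part of the header:

#include <cstdint>
#include <cstdio>

// Scalar model of vpaddl_u8: out[i] = in[2i] + in[2i+1], widened to u16.
void SumsOf2U8(const uint8_t in[8], uint16_t out[4]) {
  for (int i = 0; i < 4; ++i) {
    out[i] = static_cast<uint16_t>(in[2 * i]) + in[2 * i + 1];
  }
}

int main() {
  const uint8_t in[8] = {250, 10, 1, 2, 3, 4, 200, 100};
  uint16_t out[4];
  SumsOf2U8(in, out);
  for (uint16_t x : out) printf("%d ", static_cast<int>(x));  // 260 3 7 300
  printf("\n");
}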
  // ------------------------------ SaturatedAdd

@@ -1922,6 +2116,31 @@ HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
  #endif
  }

+ // ------------------------------ SaturatedNeg
+ #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
+ #undef HWY_NATIVE_SATURATED_NEG_8_16_32
+ #else
+ #define HWY_NATIVE_SATURATED_NEG_8_16_32
+ #endif
+
+ HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedNeg, vqneg, _, 1)
+
+ #if HWY_ARCH_ARM_A64
+ #ifdef HWY_NATIVE_SATURATED_NEG_64
+ #undef HWY_NATIVE_SATURATED_NEG_64
+ #else
+ #define HWY_NATIVE_SATURATED_NEG_64
+ #endif
+
+ HWY_API Vec64<int64_t> SaturatedNeg(const Vec64<int64_t> v) {
+ return Vec64<int64_t>(vqneg_s64(v.raw));
+ }
+
+ HWY_API Vec128<int64_t> SaturatedNeg(const Vec128<int64_t> v) {
+ return Vec128<int64_t>(vqnegq_s64(v.raw));
+ }
+ #endif
+
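vqneg differs from plain negation only at the most negative value, whose negation is not representable and would otherwise overflow. A hedged scalar model of the 8-bit lane semantics (illustrative name):

#include <cstdint>
#include <limits>

// SaturatedNeg: -x, except INT8_MIN saturates to INT8_MAX.
int8_t SaturatedNegI8(int8_t x) {
  return x == std::numeric_limits<int8_t>::min()
             ? std::numeric_limits<int8_t>::max()
             : static_cast<int8_t>(-x);
}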
  // ------------------------------ ShiftLeft

  // Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
@@ -1943,12 +2162,16 @@ HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
  #pragma pop_macro("HWY_NEON_DEF_FUNCTION")

  // ------------------------------ RotateRight (ShiftRight, Or)
- template <int kBits, typename T, size_t N>
+ template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
+ const DFromV<decltype(v)> d;
+ const RebindToUnsigned<decltype(d)> du;
+
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
-
- return Or(ShiftRight<kBits>(v),
+
+ return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
  ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
  }

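The BitCast to RebindToUnsigned matters because an arithmetic right shift on a signed lane would replicate the sign bit into the rotated-in positions; shifting the unsigned bit pattern shifts in zeros, so the rotate is exact. A small well-defined sketch:

#include <cstdint>
#include <cstdio>

// Logical rotate of one 32-bit lane, operating on the unsigned bit pattern.
uint32_t RotateRight8(uint32_t v) { return (v >> 8) | (v << 24); }

int main() {
  // With the sign bit set, a signed >> would smear ones into the top byte;
  // the unsigned shift used here yields the exact rotation instead.
  printf("%08x\n", static_cast<unsigned>(RotateRight8(0x80000001u)));  // 01800000
}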
@@ -2138,7 +2361,39 @@ HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)

  // ------------------------------ Integer multiplication

- // Returns the upper 16 bits of a * b in each lane.
+ // Returns the upper sizeof(T)*8 bits of a * b in each lane.
+ HWY_API Vec128<int8_t> MulHigh(Vec128<int8_t> a, Vec128<int8_t> b) {
+ int16x8_t rlo = vmull_s8(vget_low_s8(a.raw), vget_low_s8(b.raw));
+ #if HWY_ARCH_ARM_A64
+ int16x8_t rhi = vmull_high_s8(a.raw, b.raw);
+ #else
+ int16x8_t rhi = vmull_s8(vget_high_s8(a.raw), vget_high_s8(b.raw));
+ #endif
+ return Vec128<int8_t>(
+ vuzp2q_s8(vreinterpretq_s8_s16(rlo), vreinterpretq_s8_s16(rhi)));
+ }
+ HWY_API Vec128<uint8_t> MulHigh(Vec128<uint8_t> a, Vec128<uint8_t> b) {
+ uint16x8_t rlo = vmull_u8(vget_low_u8(a.raw), vget_low_u8(b.raw));
+ #if HWY_ARCH_ARM_A64
+ uint16x8_t rhi = vmull_high_u8(a.raw, b.raw);
+ #else
+ uint16x8_t rhi = vmull_u8(vget_high_u8(a.raw), vget_high_u8(b.raw));
+ #endif
+ return Vec128<uint8_t>(
+ vuzp2q_u8(vreinterpretq_u8_u16(rlo), vreinterpretq_u8_u16(rhi)));
+ }
+
+ template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
+ HWY_API Vec128<int8_t, N> MulHigh(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
+ int8x16_t hi_lo = vreinterpretq_s8_s16(vmull_s8(a.raw, b.raw));
+ return Vec128<int8_t, N>(vget_low_s8(vuzp2q_s8(hi_lo, hi_lo)));
+ }
+ template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
+ HWY_API Vec128<uint8_t, N> MulHigh(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
+ uint8x16_t hi_lo = vreinterpretq_u8_u16(vmull_u8(a.raw, b.raw));
+ return Vec128<uint8_t, N>(vget_low_u8(vuzp2q_u8(hi_lo, hi_lo)));
+ }
+
  HWY_API Vec128<int16_t> MulHigh(Vec128<int16_t> a, Vec128<int16_t> b) {
  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
  #if HWY_ARCH_ARM_A64
@@ -2172,10 +2427,61 @@ HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
  }

- HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
- return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
- }
- template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
+ HWY_API Vec128<int32_t> MulHigh(Vec128<int32_t> a, Vec128<int32_t> b) {
+ int64x2_t rlo = vmull_s32(vget_low_s32(a.raw), vget_low_s32(b.raw));
+ #if HWY_ARCH_ARM_A64
+ int64x2_t rhi = vmull_high_s32(a.raw, b.raw);
+ #else
+ int64x2_t rhi = vmull_s32(vget_high_s32(a.raw), vget_high_s32(b.raw));
+ #endif
+ return Vec128<int32_t>(
+ vuzp2q_s32(vreinterpretq_s32_s64(rlo), vreinterpretq_s32_s64(rhi)));
+ }
+ HWY_API Vec128<uint32_t> MulHigh(Vec128<uint32_t> a, Vec128<uint32_t> b) {
+ uint64x2_t rlo = vmull_u32(vget_low_u32(a.raw), vget_low_u32(b.raw));
+ #if HWY_ARCH_ARM_A64
+ uint64x2_t rhi = vmull_high_u32(a.raw, b.raw);
+ #else
+ uint64x2_t rhi = vmull_u32(vget_high_u32(a.raw), vget_high_u32(b.raw));
+ #endif
+ return Vec128<uint32_t>(
+ vuzp2q_u32(vreinterpretq_u32_u64(rlo), vreinterpretq_u32_u64(rhi)));
+ }
+
+ template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
+ HWY_API Vec128<int32_t, N> MulHigh(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
+ int32x4_t hi_lo = vreinterpretq_s32_s64(vmull_s32(a.raw, b.raw));
+ return Vec128<int32_t, N>(vget_low_s32(vuzp2q_s32(hi_lo, hi_lo)));
+ }
+ template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
+ HWY_API Vec128<uint32_t, N> MulHigh(Vec128<uint32_t, N> a,
+ Vec128<uint32_t, N> b) {
+ uint32x4_t hi_lo = vreinterpretq_u32_u64(vmull_u32(a.raw, b.raw));
+ return Vec128<uint32_t, N>(vget_low_u32(vuzp2q_u32(hi_lo, hi_lo)));
+ }
+
+ template <class T, HWY_IF_UI64(T)>
+ HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
+ T hi_0;
+ T hi_1;
+
+ Mul128(GetLane(a), GetLane(b), &hi_0);
+ Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi_1);
+
+ return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
+ }
+
+ template <class T, HWY_IF_UI64(T)>
+ HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
+ T hi;
+ Mul128(GetLane(a), GetLane(b), &hi);
+ return Set(Full64<T>(), hi);
+ }
+
+ HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
+ return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
+ }
+ template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
  HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
  Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
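Both families have simple scalar definitions; the vector code extracts the same bits with vmull/vuzp2 and vqrdmulh. A hedged sketch with illustrative names:

#include <cstdint>

// MulHigh for 32-bit lanes: the upper half of the full 64-bit product.
uint32_t MulHighU32(uint32_t a, uint32_t b) {
  return static_cast<uint32_t>((static_cast<uint64_t>(a) * b) >> 32);
}

// MulFixedPoint15 (vqrdmulh): doubling multiply with rounding, keeping the
// high half; the single overflowing input a = b = -32768 saturates.
int16_t MulFixedPoint15Scalar(int16_t a, int16_t b) {
  const int64_t p = (2 * static_cast<int64_t>(a) * b + 0x8000) >> 16;
  return static_cast<int16_t>(p > 32767 ? 32767 : p);
}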
@@ -2277,7 +2583,7 @@ HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,

  namespace detail {

- #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
+ #if HWY_NATIVE_FMA
  // Wrappers for changing argument order to what intrinsics expect.
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(MulAdd, vfma, _, 3)
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(NegMulAdd, vfms, _, 3)
@@ -2295,7 +2601,7 @@ HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> add, Vec128<float, N> mul,
  return add - mul * x;
  }

- #endif // defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
+ #endif // HWY_NATIVE_FMA
  } // namespace detail

  template <typename T, size_t N, HWY_IF_FLOAT(T)>
@@ -2310,13 +2616,13 @@ HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
  return detail::NegMulAdd(add, mul, x);
  }

- template <typename T, size_t N>
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
  HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
  Vec128<T, N> sub) {
  return MulAdd(mul, x, Neg(sub));
  }

- template <typename T, size_t N>
+ template <typename T, size_t N, HWY_IF_FLOAT(T)>
  HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
  Vec128<T, N> sub) {
  return Neg(MulAdd(mul, x, sub));
@@ -2612,6 +2918,15 @@ HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(Abs, vabs, _, 1)
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1)

+ // ------------------------------ SaturatedAbs
+ #ifdef HWY_NATIVE_SATURATED_ABS
+ #undef HWY_NATIVE_SATURATED_ABS
+ #else
+ #define HWY_NATIVE_SATURATED_ABS
+ #endif
+
+ HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1)
+
  // ------------------------------ CopySign
  template <typename T, size_t N>
  HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
@@ -2675,22 +2990,49 @@ HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {

  HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)

+ #if HWY_HAVE_FLOAT16
+ #define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>)
+ #else
+ #define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_SPECIAL_FLOAT_V(V)
+ #endif
+
+ template <class V, HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V)>
+ HWY_API V IfThenElse(MFromD<DFromV<V>> mask, V yes, V no) {
+ const DFromV<decltype(yes)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(
+ d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
+ }
+
+ #undef HWY_NEON_IF_EMULATED_IF_THEN_ELSE
  #undef HWY_NEON_BUILD_TPL_HWY_IF
  #undef HWY_NEON_BUILD_RET_HWY_IF
  #undef HWY_NEON_BUILD_PARAM_HWY_IF
  #undef HWY_NEON_BUILD_ARG_HWY_IF

  // mask ? yes : 0
- template <typename T, size_t N>
+ template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
  HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
  }
+ template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
+ HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
+ const DFromV<decltype(yes)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
+ }

  // mask ? 0 : no
- template <typename T, size_t N>
+ template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
  HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
  }
+ template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
+ HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
+ const DFromV<decltype(no)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
+ }

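These emulated selects are value-preserving because a lane mask is all-ones or all-zeros, so selecting on the uint16_t bit pattern returns exactly one of the two inputs. A one-line scalar model (illustrative name):

#include <cstdint>

// Per-lane select on bit patterns: mask is 0xFFFF or 0x0000.
uint16_t SelectBits(uint16_t mask, uint16_t yes, uint16_t no) {
  return static_cast<uint16_t>((yes & mask) | (no & ~mask));
}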
  template <typename T, size_t N>
  HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
@@ -2703,12 +3045,6 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
  return IfThenElse(m, yes, no);
  }

- template <typename T, size_t N>
- HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
- const auto zero = Zero(DFromV<decltype(v)>());
- return Max(zero, v);
- }
-
  // ------------------------------ Mask logical

  template <typename T, size_t N>
@@ -2957,6 +3293,23 @@ HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
  #endif
  }

+ HWY_API Vec128<int64_t> SaturatedAbs(const Vec128<int64_t> v) {
+ #if HWY_ARCH_ARM_A64
+ return Vec128<int64_t>(vqabsq_s64(v.raw));
+ #else
+ const auto zero = Zero(DFromV<decltype(v)>());
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
+ #endif
+ }
+ HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
+ #if HWY_ARCH_ARM_A64
+ return Vec64<int64_t>(vqabs_s64(v.raw));
+ #else
+ const auto zero = Zero(DFromV<decltype(v)>());
+ return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v);
+ #endif
+ }
+
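On targets without vqabs_s64, the fallback selects SaturatedSub(0, v) for negative lanes, which gives the same saturating semantics. A scalar sketch, including the one input that saturates:

#include <cstdint>
#include <limits>

// |x| with INT64_MIN clamped to INT64_MAX instead of overflowing.
int64_t SaturatedAbsI64(int64_t x) {
  if (x >= 0) return x;
  return x == std::numeric_limits<int64_t>::min()
             ? std::numeric_limits<int64_t>::max()
             : -x;
}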
  // ------------------------------ Min (IfThenElse, BroadcastSignBit)

  // Unsigned
@@ -3133,6 +3486,20 @@ HWY_API Vec128<int64_t> LoadU(D /* tag */,
  const int64_t* HWY_RESTRICT unaligned) {
  return Vec128<int64_t>(vld1q_s64(unaligned));
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+ HWY_API Vec128<float16_t> LoadU(D /* tag */,
+ const float16_t* HWY_RESTRICT unaligned) {
+ return Vec128<float16_t>(vld1q_f16(detail::NativeLanePointer(unaligned)));
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
+ HWY_API Vec128<bfloat16_t> LoadU(D /* tag */,
+ const bfloat16_t* HWY_RESTRICT unaligned) {
+ return Vec128<bfloat16_t>(vld1q_bf16(detail::NativeLanePointer(unaligned)));
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) {
  return Vec128<float>(vld1q_f32(unaligned));
@@ -3179,6 +3546,18 @@ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
  HWY_API Vec64<int64_t> LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) {
  return Vec64<int64_t>(vld1_s64(p));
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+ HWY_API Vec64<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
+ return Vec64<float16_t>(vld1_f16(detail::NativeLanePointer(p)));
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
+ HWY_API Vec64<bfloat16_t> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
+ return Vec64<bfloat16_t>(vld1_bf16(detail::NativeLanePointer(p)));
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
  HWY_API Vec64<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
  return Vec64<float>(vld1_f32(p));
@@ -3207,14 +3586,34 @@ HWY_API Vec32<float> LoadU(D /*tag*/, const float* HWY_RESTRICT p) {
  return Vec32<float>(vld1_dup_f32(p));
  }

- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_NOT_SPECIAL_FLOAT_D(D),
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
+ // {u,i}{8,16}
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
+ HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
+ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+ const Repartition<uint32_t, decltype(d)> d32;
+ uint32_t buf;
+ CopyBytes<4>(p, &buf);
+ return BitCast(d, LoadU(d32, &buf));
+ }
+
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
+ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+ const Repartition<uint32_t, decltype(d)> d32;
+ uint32_t buf;
+ CopyBytes<4>(p, &buf);
+ return BitCast(d, LoadU(d32, &buf));
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
  HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf;
  CopyBytes<4>(p, &buf);
  return BitCast(d, LoadU(d32, &buf));
  }
+ #endif // HWY_NEON_HAVE_BFLOAT16

  // ------------------------------ Load 16

@@ -3228,6 +3627,18 @@ template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
  HWY_API VFromD<D> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_s16(p));
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_F16_D(D)>
+ HWY_API VFromD<D> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
+ return VFromD<D>(vld1_dup_f16(detail::NativeLanePointer(p)));
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_BF16_D(D)>
+ HWY_API VFromD<D> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
+ return VFromD<D>(vld1_dup_bf16(detail::NativeLanePointer(p)));
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16

  // 8-bit x2
  template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
@@ -3250,12 +3661,10 @@ HWY_API VFromD<D> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {

  // ------------------------------ Load misc

- // [b]float16_t may use the same Raw as uint16_t, so forward to that.
- template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
+ template <class D, HWY_NEON_IF_EMULATED_D(D)>
  HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
- const RebindToUnsigned<decltype(d)> du16;
- const auto pu16 = reinterpret_cast<const uint16_t*>(p);
- return BitCast(d, LoadU(du16, pu16));
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
  }

  // On Arm, Load is the same as LoadU.
@@ -3324,6 +3733,20 @@ HWY_API void StoreU(Vec128<int64_t> v, D /* tag */,
  int64_t* HWY_RESTRICT unaligned) {
  vst1q_s64(unaligned, v.raw);
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+ HWY_API void StoreU(Vec128<float16_t> v, D /* tag */,
+ float16_t* HWY_RESTRICT unaligned) {
+ vst1q_f16(detail::NativeLanePointer(unaligned), v.raw);
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
+ HWY_API void StoreU(Vec128<bfloat16_t> v, D /* tag */,
+ bfloat16_t* HWY_RESTRICT unaligned) {
+ vst1q_bf16(detail::NativeLanePointer(unaligned), v.raw);
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API void StoreU(Vec128<float> v, D /* tag */,
  float* HWY_RESTRICT unaligned) {
@@ -3371,6 +3794,20 @@ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
  HWY_API void StoreU(Vec64<int64_t> v, D /* tag */, int64_t* HWY_RESTRICT p) {
  vst1_s64(p, v.raw);
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+ HWY_API void StoreU(Vec64<float16_t> v, D /* tag */,
+ float16_t* HWY_RESTRICT p) {
+ vst1_f16(detail::NativeLanePointer(p), v.raw);
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
+ HWY_API void StoreU(Vec64<bfloat16_t> v, D /* tag */,
+ bfloat16_t* HWY_RESTRICT p) {
+ vst1_bf16(detail::NativeLanePointer(p), v.raw);
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
  HWY_API void StoreU(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
  vst1_f32(p, v.raw);
@@ -3397,28 +3834,31 @@ HWY_API void StoreU(Vec32<float> v, D, float* HWY_RESTRICT p) {
  vst1_lane_f32(p, v.raw, 0);
  }

- // Overload 16-bit types directly to avoid ambiguity with [b]float16_t.
- template <class D, HWY_IF_V_SIZE_D(D, 4), typename T = TFromD<D>,
- HWY_IF_T_SIZE(T, 1)>
- HWY_API void StoreU(Vec32<T> v, D d, T* HWY_RESTRICT p) {
+ // {u,i}{8,16}
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
+ HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
  }

- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
- HWY_API void StoreU(Vec32<uint16_t> v, D d, uint16_t* HWY_RESTRICT p) {
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
  }
-
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
- HWY_API void StoreU(Vec32<int16_t> v, D d, int16_t* HWY_RESTRICT p) {
+ #endif
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
+ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
  }
+ #endif // HWY_NEON_HAVE_BFLOAT16

  // ------------------------------ Store 16

@@ -3430,6 +3870,18 @@ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)>
  HWY_API void StoreU(Vec16<int16_t> v, D, int16_t* HWY_RESTRICT p) {
  vst1_lane_s16(p, v.raw, 0);
  }
+ #if HWY_HAVE_FLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_F16_D(D)>
+ HWY_API void StoreU(Vec16<float16_t> v, D, float16_t* HWY_RESTRICT p) {
+ vst1_lane_f16(detail::NativeLanePointer(p), v.raw, 0);
+ }
+ #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_BF16_D(D)>
+ HWY_API void StoreU(Vec16<bfloat16_t> v, D, bfloat16_t* HWY_RESTRICT p) {
+ vst1_lane_bf16(detail::NativeLanePointer(p), v.raw, 0);
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16

  template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
  HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
@@ -3449,12 +3901,12 @@ HWY_API void StoreU(Vec128<int8_t, 1> v, D, int8_t* HWY_RESTRICT p) {
  vst1_lane_s8(p, v.raw, 0);
  }

- // [b]float16_t may use the same Raw as uint16_t, so forward to that.
- template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
+ // ------------------------------ Store misc
+
+ template <class D, HWY_NEON_IF_EMULATED_D(D)>
  HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
- const RebindToUnsigned<decltype(d)> du16;
- const auto pu16 = reinterpret_cast<uint16_t*>(p);
- return StoreU(BitCast(du16, v), du16, pu16);
+ const RebindToUnsigned<decltype(d)> du;
+ return StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
  }

  HWY_DIAGNOSTICS(push)
@@ -3541,24 +3993,6 @@ HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToUnsigned<D>> v) {
  return VFromD<D>(vcvt_f32_u32(v.raw));
  }

- // Truncates (rounds toward zero).
- template <class D, HWY_IF_I32_D(D)>
- HWY_API Vec128<int32_t> ConvertTo(D /* tag */, Vec128<float> v) {
- return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
- return VFromD<D>(vcvt_s32_f32(v.raw));
- }
- template <class D, HWY_IF_U32_D(D)>
- HWY_API Vec128<uint32_t> ConvertTo(D /* tag */, Vec128<float> v) {
- return Vec128<uint32_t>(vcvtq_u32_f32(ZeroIfNegative(v).raw));
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
- return VFromD<D>(vcvt_u32_f32(ZeroIfNegative(v).raw));
- }
-
  #if HWY_HAVE_FLOAT64

  template <class D, HWY_IF_F64_D(D)>
@@ -3577,51 +4011,168 @@ HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<int64_t> v) {

  template <class D, HWY_IF_F64_D(D)>
  HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<uint64_t> v) {
- return Vec128<double>(vcvtq_f64_u64(ZeroIfNegative(v).raw));
+ return Vec128<double>(vcvtq_f64_u64(v.raw));
  }
  template <class D, HWY_IF_F64_D(D)>
  HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
  // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
- const auto non_neg_v = ZeroIfNegative(v);
  #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
- return Set(Full64<double>(), static_cast<double>(GetLane(non_neg_v)));
+ return Set(Full64<double>(), static_cast<double>(GetLane(v)));
  #else
- return Vec64<double>(vcvt_f64_u64(non_neg_v.raw));
+ return Vec64<double>(vcvt_f64_u64(v.raw));
  #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  }

+ #endif // HWY_HAVE_FLOAT64
+
+ namespace detail {
  // Truncates (rounds toward zero).
- template <class D, HWY_IF_I64_D(D)>
- HWY_API Vec128<int64_t> ConvertTo(D /* tag */, Vec128<double> v) {
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
+ HWY_INLINE Vec128<int32_t> ConvertFToI(D /* tag */, Vec128<float> v) {
+ #if HWY_COMPILER_CLANG && \
+ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+ // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+ // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+ // outside of the range of an int32_t.
+
+ int32x4_t raw_result;
+ __asm__(
+ #if HWY_ARCH_ARM_A64
+ "fcvtzs %0.4s, %1.4s"
+ #else
+ "vcvt.s32.f32 %0, %1"
+ #endif
+ : "=w"(raw_result)
+ : "w"(v.raw));
+ return Vec128<int32_t>(raw_result);
+ #else
+ return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
+ HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<RebindToFloat<D>> v) {
+ #if HWY_COMPILER_CLANG && \
+ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+ // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+ // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+ // outside of the range of an int32_t.
+
+ int32x2_t raw_result;
+ __asm__(
+ #if HWY_ARCH_ARM_A64
+ "fcvtzs %0.2s, %1.2s"
+ #else
+ "vcvt.s32.f32 %0, %1"
+ #endif
+ : "=w"(raw_result)
+ : "w"(v.raw));
+ return VFromD<D>(raw_result);
+ #else
+ return VFromD<D>(vcvt_s32_f32(v.raw));
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
+ HWY_INLINE Vec128<uint32_t> ConvertFToU(D /* tag */, Vec128<float> v) {
+ #if HWY_COMPILER_CLANG && \
+ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+ // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+ // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+ // outside of the range of a uint32_t.
+
+ uint32x4_t raw_result;
+ __asm__(
+ #if HWY_ARCH_ARM_A64
+ "fcvtzu %0.4s, %1.4s"
+ #else
+ "vcvt.u32.f32 %0, %1"
+ #endif
+ : "=w"(raw_result)
+ : "w"(v.raw));
+ return Vec128<uint32_t>(raw_result);
+ #else
+ return Vec128<uint32_t>(vcvtq_u32_f32(v.raw));
+ #endif
+ }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
+ HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<RebindToFloat<D>> v) {
+ #if HWY_COMPILER_CLANG && \
+ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
+ // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
+ // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
+ // outside of the range of a uint32_t.
+
+ uint32x2_t raw_result;
+ __asm__(
+ #if HWY_ARCH_ARM_A64
+ "fcvtzu %0.2s, %1.2s"
+ #else
+ "vcvt.u32.f32 %0, %1"
+ #endif
+ : "=w"(raw_result)
+ : "w"(v.raw));
+ return VFromD<D>(raw_result);
+ #else
+ return VFromD<D>(vcvt_u32_f32(v.raw));
+ #endif
+ }
+
+ #if HWY_HAVE_FLOAT64
+
+ // Truncates (rounds toward zero).
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
+ HWY_INLINE Vec128<int64_t> ConvertFToI(D /* tag */, Vec128<double> v) {
+ #if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
+ int64x2_t raw_result;
+ __asm__("fcvtzs %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
+ return Vec128<int64_t>(raw_result);
+ #else
  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
+ #endif
  }
- template <class D, HWY_IF_I64_D(D)>
- HWY_API Vec64<int64_t> ConvertTo(D di, Vec64<double> v) {
- // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. Use the
- // 128-bit version to avoid UB from casting double -> int64_t.
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
- const Full128<double> ddt;
- const Twice<decltype(di)> dit;
- return LowerHalf(di, ConvertTo(dit, Combine(ddt, v, v)));
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
+ HWY_INLINE Vec64<int64_t> ConvertFToI(D /* tag */, Vec64<double> v) {
+ #if HWY_ARCH_ARM_A64 && \
+ ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
+ // If compiling for AArch64 NEON with GCC 6 or earlier, use inline assembly to
+ // work around the missing vcvt_s64_f64 intrinsic.
+ int64x1_t raw_result;
+ __asm__("fcvtzs %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
+ return Vec64<int64_t>(raw_result);
  #else
- (void)di;
  return Vec64<int64_t>(vcvt_s64_f64(v.raw));
  #endif
  }
- template <class D, HWY_IF_U64_D(D)>
- HWY_API Vec128<uint64_t> ConvertTo(D /* tag */, Vec128<double> v) {
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
+ HWY_INLINE Vec128<uint64_t> ConvertFToU(D /* tag */, Vec128<double> v) {
+ #if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of a uint64_t.
+ uint64x2_t raw_result;
+ __asm__("fcvtzu %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
+ return Vec128<uint64_t>(raw_result);
+ #else
  return Vec128<uint64_t>(vcvtq_u64_f64(v.raw));
+ #endif
  }
- template <class D, HWY_IF_U64_D(D)>
- HWY_API Vec64<uint64_t> ConvertTo(D du, Vec64<double> v) {
- // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. Use the
- // 128-bit version to avoid UB from casting double -> uint64_t.
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
- const Full128<double> ddt;
- const Twice<decltype(du)> du_t;
- return LowerHalf(du, ConvertTo(du_t, Combine(ddt, v, v)));
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
+ HWY_INLINE Vec64<uint64_t> ConvertFToU(D /* tag */, Vec64<double> v) {
+ #if HWY_ARCH_ARM_A64 && \
+ ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of a uint64_t.
+
+ // Inline assembly is also used if compiling for AArch64 NEON with GCC 6 or
+ // earlier to work around the issue of the missing vcvt_u64_f64 intrinsic.
+ uint64x1_t raw_result;
+ __asm__("fcvtzu %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
+ return Vec64<uint64_t>(raw_result);
  #else
- (void)du;
  return Vec64<uint64_t>(vcvt_u64_f64(v.raw));
  #endif
  }
@@ -3631,25 +4182,76 @@ HWY_API Vec64<uint64_t> ConvertTo(D du, Vec64<double> v) {
  #if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16

  // Truncates (rounds toward zero).
- template <class D, HWY_IF_I16_D(D)>
- HWY_API Vec128<int16_t> ConvertTo(D /* tag */, Vec128<float16_t> v) {
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
+ HWY_INLINE Vec128<int16_t> ConvertFToI(D /* tag */, Vec128<float16_t> v) {
+ #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
+ int16x8_t raw_result;
+ __asm__("fcvtzs %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
+ return Vec128<int16_t>(raw_result);
+ #else
  return Vec128<int16_t>(vcvtq_s16_f16(v.raw));
+ #endif
  }
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+ HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+ #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
+ int16x4_t raw_result;
+ __asm__("fcvtzs %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
+ return VFromD<D>(raw_result);
+ #else
  return VFromD<D>(vcvt_s16_f16(v.raw));
+ #endif
  }

- template <class D, HWY_IF_U16_D(D)>
- HWY_API Vec128<uint16_t> ConvertTo(D /* tag */, Vec128<float16_t> v) {
+ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
+ HWY_INLINE Vec128<uint16_t> ConvertFToU(D /* tag */, Vec128<float16_t> v) {
+ #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of a uint16_t.
+ uint16x8_t raw_result;
+ __asm__("fcvtzu %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
+ return Vec128<uint16_t>(raw_result);
+ #else
  return Vec128<uint16_t>(vcvtq_u16_f16(v.raw));
+ #endif
  }
  template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+ HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
+ #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
+ // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
+ // to avoid undefined behavior if v[i] is outside of the range of a uint16_t.
+ uint16x4_t raw_result;
+ __asm__("fcvtzu %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
+ return VFromD<D>(raw_result);
+ #else
  return VFromD<D>(vcvt_u16_f16(v.raw));
+ #endif
  }

  #endif // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
+ } // namespace detail
+
+ template <class D, HWY_IF_SIGNED_D(D),
+ HWY_IF_T_SIZE_ONE_OF_D(
+ D, (1 << 4) |
+ ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
+ (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
+ HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
+ return detail::ConvertFToI(di, v);
+ }
+
+ template <class D, HWY_IF_UNSIGNED_D(D),
+ HWY_IF_T_SIZE_ONE_OF_D(
+ D, (1 << 4) |
+ ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
+ (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
+ HWY_API VFromD<D> ConvertTo(D du, VFromD<RebindToFloat<D>> v) {
+ return detail::ConvertFToU(du, v);
+ }

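The inline-assembly paths exist because a C-level float-to-integer cast is undefined for out-of-range inputs, while fcvtzs/fcvtzu saturate and map NaN to zero. A hedged scalar model of the behavior those instructions guarantee (illustrative name):

#include <cmath>
#include <cstdint>
#include <limits>

// Defined-for-all-inputs f32 -> i32, matching what fcvtzs provides.
int32_t SaturatingF32ToI32(float f) {
  if (std::isnan(f)) return 0;
  if (f >= 2147483648.0f) return std::numeric_limits<int32_t>::max();
  if (f < -2147483648.0f) return std::numeric_limits<int32_t>::min();
  return static_cast<int32_t>(f);  // in range, so the cast is well defined
}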
  // ------------------------------ PromoteTo (ConvertTo)

@@ -3782,7 +4384,7 @@ HWY_API VFromD<D> PromoteTo(D d, V v) {
  return PromoteTo(d, PromoteTo(di32, v));
  }

- #if HWY_NEON_HAVE_FLOAT16C
+ #if HWY_NEON_HAVE_F16C

  // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
  #ifdef HWY_NATIVE_F16C
@@ -3800,7 +4402,7 @@ HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
  return VFromD<D>(vget_low_f32(vcvt_f32_f16(v.raw)));
  }

- #endif // HWY_NEON_HAVE_FLOAT16C
+ #endif // HWY_NEON_HAVE_F16C

  #if HWY_HAVE_FLOAT64

@@ -3893,8 +4495,36 @@ HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
  lo32_or_mask);
  }

+ #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+ #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+ #else
+ #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
+ #endif
+
+ template <class D, HWY_IF_UI64_D(D)>
+ HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
+ const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
+ const RebindToFloat<decltype(d32)> df32;
+ const RebindToUnsigned<decltype(d32)> du32;
+ const Repartition<uint8_t, decltype(d32)> du32_as_du8;
+
+ constexpr uint32_t kExpAdjDecr =
+ 0xFFFFFF9Du + static_cast<uint32_t>(!IsSigned<TFromD<D>>());
+
+ const auto exponent_adj = BitCast(
+ du32, SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
+ BitCast(du32_as_du8, Set(du32, kExpAdjDecr))));
+ const auto adj_v =
+ BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
+
+ return PromoteTo(d64, ConvertTo(d32, adj_v)) << PromoteTo(d64, exponent_adj);
+ }
+
  #endif // HWY_HAVE_FLOAT64

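For inputs that fit the destination type, PromoteInRangeTo is simply a widening truncation; the exponent adjustment above only exists to keep the intermediate 32-bit ConvertTo in range before the result is shifted back up. A scalar statement of that contract, as an illustration only:

#include <cstdint>

// Contract for in-range inputs: truncate toward zero, then widen.
uint64_t PromoteInRangeToU64(float f) {
  return static_cast<uint64_t>(f);  // caller guarantees 0 <= f < 2^64
}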
+ // ------------------------------ PromoteEvenTo/PromoteOddTo
+ #include "hwy/ops/inside-inl.h"
+
  // ------------------------------ PromoteUpperTo

  #if HWY_ARCH_ARM_A64
@@ -3946,14 +4576,14 @@ HWY_API Vec128<int64_t> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
  return Vec128<int64_t>(vmovl_high_s32(v.raw));
  }

- #if HWY_NEON_HAVE_FLOAT16C
+ #if HWY_NEON_HAVE_F16C

  template <class D, HWY_IF_F32_D(D)>
  HWY_API Vec128<float> PromoteUpperTo(D /* tag */, Vec128<float16_t> v) {
  return Vec128<float>(vcvt_high_f32_f16(v.raw));
  }

- #endif // HWY_NEON_HAVE_FLOAT16C
+ #endif // HWY_NEON_HAVE_F16C

  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) {
@@ -4149,7 +4779,7 @@ HWY_API VFromD<D> DemoteTo(D d, Vec64<uint64_t> v) {
  return DemoteTo(d, DemoteTo(du32, v));
  }

- #if HWY_NEON_HAVE_FLOAT16C
+ #if HWY_NEON_HAVE_F16C

  // We already toggled HWY_NATIVE_F16C above.

@@ -4162,16 +4792,47 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>(vcvt_f16_f32(vcombine_f32(v.raw, v.raw)));
  }

- #endif // HWY_NEON_HAVE_FLOAT16C
+ #endif // HWY_NEON_HAVE_F16C

- template <class D, HWY_IF_BF16_D(D)>
- HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
- const Rebind<int32_t, decltype(dbf16)> di32;
- const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
- const Rebind<uint16_t, decltype(dbf16)> du16;
- const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
- return BitCast(dbf16, DemoteTo(du16, bits_in_32));
+ #if HWY_NEON_HAVE_F32_TO_BF16C
+ #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
+ #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
+ #else
+ #define HWY_NATIVE_DEMOTE_F32_TO_BF16
+ #endif
+
+ namespace detail {
+ #if HWY_NEON_HAVE_BFLOAT16
+ // If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
+ // bfloat16x4_t or bfloat16x8_t.
+ static HWY_INLINE bfloat16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
+ return raw;
+ }
+ #else
+ // If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
+ // detail::Vec128<bfloat16_t, N>::type is uint16x4_t or uint16x8_t vector to
+ // work around compiler bugs that are there with GCC 13 or earlier or Clang 16
+ // or earlier on AArch64.
+
+ // The bfloat16x4_t vector returned by vcvt_bf16_f32 needs to be bitcasted to
+ // a uint16x4_t vector if HWY_NEON_HAVE_F32_TO_BF16C &&
+ // !HWY_NEON_HAVE_BFLOAT16 is true.
+ static HWY_INLINE uint16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
+ return vreinterpret_u16_bf16(raw);
+ }
+ #endif
+ } // namespace detail
+
+ template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
+ HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
+ return VFromD<D>(detail::BitCastFromRawNeonBF16(vcvt_bf16_f32(v.raw)));
  }
+ template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_BF16_D(D)>
+ HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
+ return VFromD<D>(detail::BitCastFromRawNeonBF16(
+ vcvt_bf16_f32(vcombine_f32(v.raw, v.raw))));
+ }
+ #endif // HWY_NEON_HAVE_F32_TO_BF16C

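bfloat16 is the upper half of an f32's bit pattern, which is why the removed fallback used ShiftRight<16>; vcvt_bf16_f32 additionally rounds to nearest-even. A truncating scalar sketch of the encoding relationship:

#include <cstdint>
#include <cstring>

// f32 -> bf16 by truncation: keep the high 16 bits of the float's encoding.
uint16_t F32ToBF16Trunc(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}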
  #if HWY_HAVE_FLOAT64

@@ -4184,32 +4845,10 @@ HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
  }

- template <class D, HWY_IF_I32_D(D)>
- HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<double> v) {
- const int64x2_t i64 = vcvtq_s64_f64(v.raw);
- return Vec64<int32_t>(vqmovn_s64(i64));
- }
- template <class D, HWY_IF_I32_D(D)>
- HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<double> v) {
- // There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the
- // f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4.
- const Full128<double> ddt;
- const Full128<int64_t> dit;
- return Vec32<int32_t>(vqmovn_s64(ConvertTo(dit, Combine(ddt, v, v)).raw));
- }
-
- template <class D, HWY_IF_U32_D(D)>
- HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<double> v) {
- const uint64x2_t u64 = vcvtq_u64_f64(v.raw);
- return Vec64<uint32_t>(vqmovn_u64(u64));
- }
- template <class D, HWY_IF_U32_D(D)>
- HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<double> v) {
- // There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the
- // f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4.
- const Full128<double> ddt;
- const Full128<uint64_t> du_t;
- return Vec32<uint32_t>(vqmovn_u64(ConvertTo(du_t, Combine(ddt, v, v)).raw));
+ template <class D, HWY_IF_UI32_D(D)>
+ HWY_API VFromD<D> DemoteTo(D d32, VFromD<Rebind<double, D>> v) {
+ const Rebind<MakeWide<TFromD<D>>, D> d64;
+ return DemoteTo(d32, ConvertTo(d64, v));
  }

  #endif // HWY_HAVE_FLOAT64
@@ -4466,31 +5105,7 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
  }

- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const RebindToSigned<decltype(d)> di;
- const VFromD<decltype(di)> vi = BitCast(di, v);
- // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
- return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
- }
-
- // Returns whether normal/subnormal/zero.
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
- const VFromD<decltype(du)> vu = BitCast(du, v);
- // 'Shift left' to clear the sign bit, then right so we can compare with the
- // max exponent (cannot compare with MaxExponentTimes2 directly because it is
- // negative and non-negative floats would be greater).
- const VFromD<decltype(di)> exp =
- BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
- return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
- }
-
- // ================================================== SWIZZLE
+ // ================================================== SWIZZLE

  // ------------------------------ LowerHalf

@@ -4532,13 +5147,18 @@ HWY_API Vec64<float16_t> LowerHalf(Vec128<float16_t> v) {
  return Vec64<float16_t>(vget_low_f16(v.raw));
  }
  #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ HWY_API Vec64<bfloat16_t> LowerHalf(Vec128<bfloat16_t> v) {
+ return Vec64<bfloat16_t>(vget_low_bf16(v.raw));
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
  #if HWY_HAVE_FLOAT64
  HWY_API Vec64<double> LowerHalf(Vec128<double> v) {
  return Vec64<double>(vget_low_f64(v.raw));
  }
  #endif // HWY_HAVE_FLOAT64

- template <class V, HWY_IF_SPECIAL_FLOAT_V(V), HWY_IF_V_SIZE_V(V, 16)>
+ template <class V, HWY_NEON_IF_EMULATED_D(DFromV<V>), HWY_IF_V_SIZE_V(V, 16)>
  HWY_API VFromD<Half<DFromV<V>>> LowerHalf(V v) {
  const Full128<uint16_t> du;
  const Half<DFromV<V>> dh;
@@ -4738,6 +5358,12 @@ HWY_API Vec64<float16_t> UpperHalf(D /* tag */, Vec128<float16_t> v) {
  return Vec64<float16_t>(vget_high_f16(v.raw));
  }
  #endif
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <class D, HWY_IF_BF16_D(D)>
+ HWY_API Vec64<bfloat16_t> UpperHalf(D /* tag */, Vec128<bfloat16_t> v) {
+ return Vec64<bfloat16_t>(vget_high_bf16(v.raw));
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
  template <class D, HWY_IF_F32_D(D)>
  HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
  return Vec64<float>(vget_high_f32(v.raw));
@@ -4749,7 +5375,7 @@ HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
  }
  #endif // HWY_HAVE_FLOAT64

- template <class D, HWY_IF_SPECIAL_FLOAT_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
+ template <class D, HWY_NEON_IF_EMULATED_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
  HWY_API VFromD<D> UpperHalf(D dh, VFromD<Twice<D>> v) {
  const RebindToUnsigned<Twice<decltype(dh)>> du;
  const Half<decltype(du)> duh;
@@ -4869,6 +5495,20 @@ HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
  }
  #endif // HWY_HAVE_FLOAT16

+ #if HWY_NEON_HAVE_BFLOAT16
+ template <int kLane>
+ HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
+ static_assert(0 <= kLane && kLane < 8, "Invalid lane");
+ return Vec128<bfloat16_t>(vdupq_laneq_bf16(v.raw, kLane));
+ }
+ template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
+ HWY_IF_LANES_GT(N, 1)>
+ HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
+ static_assert(0 <= kLane && kLane < N, "Invalid lane");
+ return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
+
  template <int kLane>
  HWY_API Vec128<float> Broadcast(Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
@@ -4976,7 +5616,26 @@ HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<float16_t>(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane)));
  }
+ template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
+ HWY_IF_LANES_GT(N, 1)>
+ HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
+ static_assert(0 <= kLane && kLane < N, "Invalid lane");
+ return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
+ }
  #endif // HWY_HAVE_FLOAT16
+ #if HWY_NEON_HAVE_BFLOAT16
+ template <int kLane>
+ HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
+ static_assert(0 <= kLane && kLane < 8, "Invalid lane");
+ return Vec128<bfloat16_t>(vdupq_n_bf16(vgetq_lane_bf16(v.raw, kLane)));
+ }
+ template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
+ HWY_IF_LANES_GT(N, 1)>
+ HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
+ static_assert(0 <= kLane && kLane < N, "Invalid lane");
+ return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
+ }
+ #endif // HWY_NEON_HAVE_BFLOAT16
  template <int kLane>
  HWY_API Vec128<float> Broadcast(Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
@@ -4991,6 +5650,14 @@ HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {

  #endif // HWY_ARCH_ARM_A64

+ template <int kLane, typename V, HWY_NEON_IF_EMULATED_D(DFromV<V>),
+ HWY_IF_LANES_GT_D(DFromV<V>, 1)>
+ HWY_API V Broadcast(V v) {
+ const DFromV<V> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, Broadcast<kLane>(BitCast(du, v)));
+ }
+
  // ------------------------------ TableLookupLanes

  // Returned by SetTableIndices for use by TableLookupLanes.
@@ -5393,6 +6060,16 @@ HWY_API Vec128<T> InterleaveLower(Vec128<T> a, Vec128<T> b) {
  }
  #endif

+ #if !HWY_HAVE_FLOAT16
+ template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 4)>
+ HWY_API Vec128<float16_t, N> InterleaveLower(Vec128<float16_t, N> a,
+ Vec128<float16_t, N> b) {
+ const DFromV<decltype(a)> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
+ }
+ #endif // !HWY_HAVE_FLOAT16
+
  // < 64 bit parts
  template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
  HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
@@ -5676,158 +6353,656 @@ HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
 
 namespace detail {
 
-template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
-HWY_INLINE V SlideDownLanes(V v, size_t amt) {
-  const DFromV<decltype(v)> d;
-  using TU = UnsignedFromSize<d.MaxBytes()>;
-  const Repartition<TU, decltype(d)> du;
-  return BitCast(d,
-                 BitCast(du, v) << Set(
-                     du, static_cast<TU>(TU{0} - amt * sizeof(TFromV<V>) * 8)));
+template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
+HWY_INLINE V SlideDownLanes(V v, size_t amt) {
+  const DFromV<decltype(v)> d;
+  using TU = UnsignedFromSize<d.MaxBytes()>;
+  const Repartition<TU, decltype(d)> du;
+  return BitCast(d,
+                 BitCast(du, v) << Set(
+                     du, static_cast<TU>(TU{0} - amt * sizeof(TFromV<V>) * 8)));
+}
+
+template <class V, HWY_IF_V_SIZE_V(V, 16)>
+HWY_INLINE V SlideDownLanes(V v, size_t amt) {
+  const DFromV<decltype(v)> d;
+  const Repartition<int8_t, decltype(d)> di8;
+  auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
+  idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
+  return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
+  return v;
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
+HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
+  if (__builtin_constant_p(amt)) {
+    switch (amt) {
+      case 0:
+        return v;
+      case 1:
+        return ShiftRightLanes<1>(d, v);
+    }
+  }
+#else
+  (void)d;
+#endif
+
+  return detail::SlideDownLanes(v, amt);
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
+HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
+  if (__builtin_constant_p(amt)) {
+    switch (amt) {
+      case 0:
+        return v;
+      case 1:
+        return ShiftRightLanes<1>(d, v);
+      case 2:
+        return ShiftRightLanes<2>(d, v);
+      case 3:
+        return ShiftRightLanes<3>(d, v);
+    }
+  }
+#else
+  (void)d;
+#endif
+
+  return detail::SlideDownLanes(v, amt);
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
+HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
+  if (__builtin_constant_p(amt)) {
+    switch (amt) {
+      case 0:
+        return v;
+      case 1:
+        return ShiftRightLanes<1>(d, v);
+      case 2:
+        return ShiftRightLanes<2>(d, v);
+      case 3:
+        return ShiftRightLanes<3>(d, v);
+      case 4:
+        return ShiftRightLanes<4>(d, v);
+      case 5:
+        return ShiftRightLanes<5>(d, v);
+      case 6:
+        return ShiftRightLanes<6>(d, v);
+      case 7:
+        return ShiftRightLanes<7>(d, v);
+    }
+  }
+#else
+  (void)d;
+#endif
+
+  return detail::SlideDownLanes(v, amt);
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
+HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
+#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
+  if (__builtin_constant_p(amt)) {
+    switch (amt) {
+      case 0:
+        return v;
+      case 1:
+        return ShiftRightLanes<1>(d, v);
+      case 2:
+        return ShiftRightLanes<2>(d, v);
+      case 3:
+        return ShiftRightLanes<3>(d, v);
+      case 4:
+        return ShiftRightLanes<4>(d, v);
+      case 5:
+        return ShiftRightLanes<5>(d, v);
+      case 6:
+        return ShiftRightLanes<6>(d, v);
+      case 7:
+        return ShiftRightLanes<7>(d, v);
+      case 8:
+        return ShiftRightLanes<8>(d, v);
+      case 9:
+        return ShiftRightLanes<9>(d, v);
+      case 10:
+        return ShiftRightLanes<10>(d, v);
+      case 11:
+        return ShiftRightLanes<11>(d, v);
+      case 12:
+        return ShiftRightLanes<12>(d, v);
+      case 13:
+        return ShiftRightLanes<13>(d, v);
+      case 14:
+        return ShiftRightLanes<14>(d, v);
+      case 15:
+        return ShiftRightLanes<15>(d, v);
+    }
+  }
+#else
+  (void)d;
+#endif
+
+  return detail::SlideDownLanes(v, amt);
+}
+
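SlideDownLanes accepts a runtime lane count; the `__builtin_constant_p(amt)` switch above lets amounts known at compile time lower to the cheaper ShiftRightLanes path, while other amounts fall back to the table lookup in detail::SlideDownLanes. A hedged usage sketch follows; the tag and values are illustrative.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    float SecondLane() {
      const hn::FixedTag<float, 4> d;
      const auto v = hn::Iota(d, 0.0f);               // {0, 1, 2, 3}
      const auto down = hn::SlideDownLanes(d, v, 1);  // {1, 2, 3, 0}
      return hn::GetLane(down);                       // 1.0f
    }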
+// ------------------------------- WidenHighMulAdd
+
+#ifdef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
+#undef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
+#else
+#define HWY_NATIVE_WIDEN_HIGH_MUL_ADD
+#endif
+
+namespace detail {
+
+template <class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_GT_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<uint64_t>(vmlal_high_u32(add.raw, mul.raw, x.raw));
+#else
+  const Full64<uint32_t> dh;
+  return Vec128<uint64_t>(
+      vmlal_u32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template <class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_LE_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint64_t> mulResult = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
+  return UpperHalf(d, mulResult) + add;
+}
+
+template <class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_GT_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int64_t>(vmlal_high_s32(add.raw, mul.raw, x.raw));
+#else
+  const Full64<int32_t> dh;
+  return Vec128<int64_t>(
+      vmlal_s32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template <class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_LE_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int64_t> mulResult = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
+  return UpperHalf(d, mulResult) + add;
+}
+
+template <class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_GT_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int32_t>(vmlal_high_s16(add.raw, mul.raw, x.raw));
+#else
+  const Full64<int16_t> dh;
+  return Vec128<int32_t>(
+      vmlal_s16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template <class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
+  Vec64<int32_t> hi = UpperHalf(d, widen);
+  return hi + add;
+}
+
+template <class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_D(DN, 2)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
+  Vec32<int32_t> hi = UpperHalf(d, Vec64<int32_t>(vget_high_s32(widen.raw)));
+  return hi + add;
+}
+
+template <class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_GT_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<uint32_t>(vmlal_high_u16(add.raw, mul.raw, x.raw));
+#else
+  const Full64<uint16_t> dh;
+  return Vec128<uint32_t>(
+      vmlal_u16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template <class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
+  VFromD<D> hi = UpperHalf(d, widen);
+  return hi + add;
+}
+
+template <class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
+          class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
+  VFromD<D> hi = UpperHalf(d, Vec64<uint32_t>(vget_high_u32(widen.raw)));
+  return hi + add;
+}
+
+template <class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_GT_D(DN, 8)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<uint16_t>(vmlal_high_u8(add.raw, mul.raw, x.raw));
+#else
+  const Full64<uint8_t> dh;
+  return Vec128<uint16_t>(
+      vmlal_u8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template <class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_D(DN, 8)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
+  VFromD<D> hi = UpperHalf(d, widen);
+  return hi + add;
+}
+
+template <class D, HWY_IF_U16(TFromD<D>), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_LE_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
+  const Twice<decltype(d)> d16F;
+  VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_u16(widen.raw)));
+  return hi + add;
+}
+
+template <class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_GT_D(DN, 8)>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+#if HWY_ARCH_ARM_A64
+  return Vec128<int16_t>(vmlal_high_s8(add.raw, mul.raw, x.raw));
+#else
+  const Full64<int8_t> dh;
+  return Vec128<int16_t>(
+      vmlal_s8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
+#endif
+}
+
+template <class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_D(DN, 8)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
+  VFromD<D> hi = UpperHalf(d, widen);
+  return hi + add;
+}
+
+template <class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_LE_D(DN, 4)>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
+  const Twice<decltype(d)> d16F;
+  VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_s16(widen.raw)));
+  return hi + add;
+}
+
+#if 0
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 4),
+          class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vfmlalq_high_f16(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 2),
+          class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  return Vec64<float32_t>(vfmlal_high_f16(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
+          class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
+                                  VFromD<DN> x, VFromD<D> add) {
+  return MulAdd(add, PromoteUpperTo(d, mul), PromoteUpperTo(d, x));
+}
+#endif
+#endif
+
+}  // namespace detail
+
+// ------------------------------- WidenMulAdd
+
+#ifdef HWY_NATIVE_WIDEN_MUL_ADD
+#undef HWY_NATIVE_WIDEN_MUL_ADD
+#else
+#define HWY_NATIVE_WIDEN_MUL_ADD
+#endif
+
+namespace detail {
+
+template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 4),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return Vec128<uint16_t>(vmlal_u8(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, 4),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
+                              VFromD<D> add) {
+  return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
+}
+
+template <class D, HWY_IF_I16_D(D), HWY_IF_LANES_GT_D(D, 4),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vmlal_s8(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_I16_D(D), HWY_IF_LANES_LE_D(D, 4),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
+                              VFromD<D> add) {
+  return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
+}
+
+template <class D, HWY_IF_I32_D(D),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+          HWY_IF_LANES_GT_D(DN, 2)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return Vec128<int32_t>(vmlal_s16(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_I32_D(D),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+          HWY_IF_LANES_D(DN, 2)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec128<int32_t> mulRs = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
+  const VFromD<D> mul10 = LowerHalf(mulRs);
+  return add + mul10;
+}
+
+template <class D, HWY_IF_I32_D(D),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+          HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec64<int32_t> mulRs = LowerHalf(Vec128<int32_t>(vmull_s16(mul.raw, x.raw)));
+  const Vec32<int32_t> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+template <class D, HWY_IF_U32_D(D), HWY_IF_LANES_GT_D(D, 2),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return Vec128<uint32_t>(vmlal_u16(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 2),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint32_t> mulRs = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
+  const Vec64<uint32_t> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+template <class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec64<uint32_t> mulRs =
+      LowerHalf(Vec128<uint32_t>(vmull_u16(mul.raw, x.raw)));
+  const Vec32<uint32_t> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+template <class D, HWY_IF_I64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+          HWY_IF_LANES_D(DN, 2)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vmlal_s32(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_I64_D(D), HWY_IF_LANES_D(D, 1),
+          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec128<int64_t> mulRs = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
+  const VFromD<D> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+template <class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+          HWY_IF_LANES_D(DN, 2)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vmlal_u32(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
+          HWY_IF_LANES_D(DN, 1)>
+HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
+                              VFromD<DN> x, VFromD<D> add) {
+  Vec128<uint64_t> mulRs = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
+  const VFromD<D> mul10(LowerHalf(mulRs));
+  return add + mul10;
+}
+
+#if 0
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_D(D, 4)>
+HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
+                                 VFromD<DN> x, VFromD<D> add) {
+  return VFromD<D>(vfmlalq_low_f16(add.raw, mul.raw, x.raw));
+}
+
+template <class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
+          HWY_IF_LANES_D(DN, 4)>
+HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
+                                 VFromD<DN> x, VFromD<D> add) {
+  return Vec64<float32_t>(vfmlal_low_f16(add.raw, mul.raw, x.raw));
 }
 
-template <class V, HWY_IF_V_SIZE_V(V, 16)>
-HWY_INLINE V SlideDownLanes(V v, size_t amt) {
-  const DFromV<decltype(v)> d;
-  const Repartition<int8_t, decltype(d)> di8;
-  auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
-  idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
-  return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
+template <class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
+          class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenLowMulAdd(D d, VFromD<DN> mul,
+                                 VFromD<DN> x, VFromD<D> add) {
+  return MulAdd(add, PromoteLowerTo(d, mul), PromoteLowerTo(d, x));
 }
+#endif
+#endif
 
 }  // namespace detail
 
-template <class D, HWY_IF_LANES_D(D, 1)>
-HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
-  return v;
-}
+// ------------------------------ WidenMulAccumulate
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
-HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(amt)) {
-    switch (amt) {
-      case 0:
-        return v;
-      case 1:
-        return ShiftRightLanes<1>(d, v);
-    }
-  }
+#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
+#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
 #else
-  (void)d;
+#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
 #endif
 
-  return detail::SlideDownLanes(v, amt);
+template <class D, HWY_IF_INTEGER(TFromD<D>), class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
+                                     VFromD<D> low, VFromD<D>& high) {
+  high = detail::WidenHighMulAdd(d, mul, x, high);
+  return detail::WidenMulAdd(d, LowerHalf(mul), LowerHalf(x), low);
 }
 
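Per the definition above, WidenMulAccumulate widens the products of the lower half of `mul`/`x` into the returned `low` accumulator and folds the upper half into `high`, so one call consumes a full narrow vector. A hedged sketch under the assumption of full 128-bit vectors; the function name is illustrative.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // i16 x i16 -> i32 multiply-accumulate; low/high cover lanes 0..3 / 4..7.
    hn::Vec128<int32_t> MacWiden(hn::Vec128<int16_t> mul, hn::Vec128<int16_t> x,
                                 hn::Vec128<int32_t> low,
                                 hn::Vec128<int32_t>& high) {
      const hn::Full128<int32_t> d32;
      return hn::WidenMulAccumulate(d32, mul, x, low, high);
    }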
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
-HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(amt)) {
-    switch (amt) {
-      case 0:
-        return v;
-      case 1:
-        return ShiftRightLanes<1>(d, v);
-      case 2:
-        return ShiftRightLanes<2>(d, v);
-      case 3:
-        return ShiftRightLanes<3>(d, v);
-    }
-  }
+#if 0
+#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
+#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
 #else
-  (void)d;
+#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
 #endif
 
-  return detail::SlideDownLanes(v, amt);
+#if HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
+HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
+                                     VFromD<D> low, VFromD<D>& high) {
+  high = detail::WidenHighMulAdd(d, mul, x, high);
+  return detail::WidenLowMulAdd(d, mul, x, low);
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
-HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(amt)) {
-    switch (amt) {
-      case 0:
-        return v;
-      case 1:
-        return ShiftRightLanes<1>(d, v);
-      case 2:
-        return ShiftRightLanes<2>(d, v);
-      case 3:
-        return ShiftRightLanes<3>(d, v);
-      case 4:
-        return ShiftRightLanes<4>(d, v);
-      case 5:
-        return ShiftRightLanes<5>(d, v);
-      case 6:
-        return ShiftRightLanes<6>(d, v);
-      case 7:
-        return ShiftRightLanes<7>(d, v);
-    }
-  }
-#else
-  (void)d;
+#endif
 #endif
 
-  return detail::SlideDownLanes(v, amt);
-}
+// ------------------------------ SatWidenMulAccumFixedPoint
 
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
-HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(amt)) {
-    switch (amt) {
-      case 0:
-        return v;
-      case 1:
-        return ShiftRightLanes<1>(d, v);
-      case 2:
-        return ShiftRightLanes<2>(d, v);
-      case 3:
-        return ShiftRightLanes<3>(d, v);
-      case 4:
-        return ShiftRightLanes<4>(d, v);
-      case 5:
-        return ShiftRightLanes<5>(d, v);
-      case 6:
-        return ShiftRightLanes<6>(d, v);
-      case 7:
-        return ShiftRightLanes<7>(d, v);
-      case 8:
-        return ShiftRightLanes<8>(d, v);
-      case 9:
-        return ShiftRightLanes<9>(d, v);
-      case 10:
-        return ShiftRightLanes<10>(d, v);
-      case 11:
-        return ShiftRightLanes<11>(d, v);
-      case 12:
-        return ShiftRightLanes<12>(d, v);
-      case 13:
-        return ShiftRightLanes<13>(d, v);
-      case 14:
-        return ShiftRightLanes<14>(d, v);
-      case 15:
-        return ShiftRightLanes<15>(d, v);
-    }
-  }
+#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
+#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
 #else
-  (void)d;
+#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
 #endif
 
-  return detail::SlideDownLanes(v, amt);
+template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
+HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
+                                                VFromD<Rebind<int16_t, DI32>> a,
+                                                VFromD<Rebind<int16_t, DI32>> b,
+                                                VFromD<DI32> sum) {
+  return VFromD<DI32>(vqdmlal_s16(sum.raw, a.raw, b.raw));
+}
+
+template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
+HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
+                                                VFromD<Rebind<int16_t, DI32>> a,
+                                                VFromD<Rebind<int16_t, DI32>> b,
+                                                VFromD<DI32> sum) {
+  const Full128<TFromD<DI32>> di32_full;
+  const Rebind<int16_t, decltype(di32_full)> di16_full64;
+  return ResizeBitCast(
+      di32, SatWidenMulAccumFixedPoint(di32_full, ResizeBitCast(di16_full64, a),
+                                       ResizeBitCast(di16_full64, b),
+                                       ResizeBitCast(di32_full, sum)));
 }
 
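SatWidenMulAccumFixedPoint maps to vqdmlal_s16: each i16 product is doubled (the Q15 fixed-point convention) and added to the i32 accumulator with saturation. A hedged sketch; the function name is illustrative.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // sum[i] = SatAdd(sum[i], 2 * a[i] * b[i]) for Q15 inputs.
    hn::Vec128<int32_t> Q15MulAcc(hn::Vec64<int16_t> a, hn::Vec64<int16_t> b,
                                  hn::Vec128<int32_t> sum) {
      const hn::Full128<int32_t> di32;
      return hn::SatWidenMulAccumFixedPoint(di32, a, b, sum);
    }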
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
 
+#if HWY_NEON_HAVE_F32_TO_BF16C
+
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
+#else
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
+namespace detail {
 #if HWY_NEON_HAVE_BFLOAT16
+// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
+// bfloat16x4_t or bfloat16x8_t.
+static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(bfloat16x4_t raw) {
+  return raw;
+}
+static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(bfloat16x8_t raw) {
+  return raw;
+}
+#else
+// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
+// detail::Vec128<bfloat16_t, N>::type is a uint16x4_t or uint16x8_t vector,
+// to work around compiler bugs present in GCC 13 or earlier and Clang 16 or
+// earlier on AArch64.
+
+// The uint16x4_t or uint16x8_t vector needs to be bitcast to a bfloat16x4_t
+// or a bfloat16x8_t vector for the vbfdot_f32 and vbfdotq_f32 intrinsics if
+// HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true.
+static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(uint16x4_t raw) {
+  return vreinterpret_bf16_u16(raw);
+}
+static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(uint16x8_t raw) {
+  return vreinterpretq_bf16_u16(raw);
+}
+#endif
+}  // namespace detail
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API Vec128<float> MulEvenAdd(D /*d32*/, Vec128<bfloat16_t> a,
+                                 Vec128<bfloat16_t> b, const Vec128<float> c) {
+  return Vec128<float>(vbfmlalbq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+                                     detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API Vec128<float> MulOddAdd(D /*d32*/, Vec128<bfloat16_t> a,
+                                Vec128<bfloat16_t> b, const Vec128<float> c) {
+  return Vec128<float>(vbfmlaltq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+                                     detail::BitCastToRawNeonBF16(b.raw)));
+}
 
 template <class D, HWY_IF_V_SIZE_D(D, 16)>
 HWY_API Vec128<float> ReorderWidenMulAccumulate(D /*d32*/, Vec128<bfloat16_t> a,
                                                 Vec128<bfloat16_t> b,
                                                 const Vec128<float> sum0,
                                                 Vec128<float>& /*sum1*/) {
-  return Vec128<float>(vbfdotq_f32(sum0.raw, a.raw, b.raw));
+  return Vec128<float>(vbfdotq_f32(sum0.raw,
+                                   detail::BitCastToRawNeonBF16(a.raw),
+                                   detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+// There is no non-q version of these instructions.
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> MulEvenAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
+                             VFromD<Repartition<bfloat16_t, D>> b,
+                             const VFromD<D> c) {
+  const Full128<float> d32f;
+  const Full128<bfloat16_t> d16f;
+  return ResizeBitCast(
+      d32, MulEvenAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
+                      ResizeBitCast(d32f, c)));
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> MulOddAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
+                            VFromD<Repartition<bfloat16_t, D>> b,
+                            const VFromD<D> c) {
+  const Full128<float> d32f;
+  const Full128<bfloat16_t> d16f;
+  return ResizeBitCast(
+      d32, MulOddAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
+                     ResizeBitCast(d32f, c)));
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
@@ -5835,28 +7010,11 @@ HWY_API VFromD<D> ReorderWidenMulAccumulate(
     D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
     VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> sum0,
     VFromD<D>& /*sum1*/) {
-  return VFromD<D>(vbfdot_f32(sum0.raw, a.raw, b.raw));
-}
-
-#else
-
-template <class D32, HWY_IF_F32_D(D32),
-          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
-HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
-                                              const VFromD<D32> sum0,
-                                              VFromD<D32>& sum1) {
-  const RebindToUnsigned<decltype(df32)> du32;
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 odd = Set(du32, 0xFFFF0000u);
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
-  const VU32 ao = And(BitCast(du32, a), odd);
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
-  const VU32 bo = And(BitCast(du32, b), odd);
-  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
-  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
+  return VFromD<D>(vbfdot_f32(sum0.raw, detail::BitCastToRawNeonBF16(a.raw),
+                              detail::BitCastToRawNeonBF16(b.raw)));
 }
 
-#endif  // HWY_NEON_HAVE_BFLOAT16
+#endif  // HWY_NEON_HAVE_F32_TO_BF16C
 
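With the BitCastToRawNeonBF16 shim, the vbfdot path now also compiles where bf16 vectors are represented as u16. A hedged usage sketch for the bf16 dot product, assuming a target where HWY_NEON_HAVE_F32_TO_BF16C is set (otherwise generic_ops-inl.h supplies the fallback); the function name is illustrative.

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // Accumulates pairwise bf16 products into f32 sums. The lane order is
    // "reordered"; combine sum0/sum1 via RearrangeToOddPlusEven at the end.
    hn::Vec128<float> Bf16Dot(hn::Vec128<hwy::bfloat16_t> a,
                              hn::Vec128<hwy::bfloat16_t> b,
                              hn::Vec128<float> sum0, hn::Vec128<float>& sum1) {
      const hn::Full128<float> d32;
      return hn::ReorderWidenMulAccumulate(d32, a, b, sum0, sum1);
    }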
 template <class D, HWY_IF_I32_D(D)>
 HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(D /*d32*/, Vec128<int16_t> a,
@@ -6026,37 +7184,34 @@ HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0,
 
 // ------------------------------ WidenMulPairwiseAdd
 
-#if HWY_NEON_HAVE_BFLOAT16
+#if HWY_NEON_HAVE_F32_TO_BF16C
 
-template <class D, HWY_IF_V_SIZE_D(D, 16)>
-HWY_API Vec128<float> WidenMulPairwiseAdd(D d32, Vec128<bfloat16_t> a,
+template <class DF, HWY_IF_V_SIZE_D(DF, 16)>
+HWY_API Vec128<float> WidenMulPairwiseAdd(DF df, Vec128<bfloat16_t> a,
                                           Vec128<bfloat16_t> b) {
-  return Vec128<float>(vbfdotq_f32(Zero(d32).raw, a.raw, b.raw));
+  return Vec128<float>(vbfdotq_f32(Zero(df).raw,
+                                   detail::BitCastToRawNeonBF16(a.raw),
+                                   detail::BitCastToRawNeonBF16(b.raw)));
 }
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
-HWY_API VFromD<D> WidenMulPairwiseAdd(D d32,
-                                      VFromD<Repartition<bfloat16_t, D>> a,
-                                      VFromD<Repartition<bfloat16_t, D>> b) {
-  return VFromD<D>(vbfdot_f32(Zero(d32).raw, a.raw, b.raw));
+template <class DF, HWY_IF_V_SIZE_LE_D(DF, 8)>
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
+                                       VFromD<Repartition<bfloat16_t, DF>> a,
+                                       VFromD<Repartition<bfloat16_t, DF>> b) {
+  return VFromD<DF>(vbfdot_f32(Zero(df).raw,
+                               detail::BitCastToRawNeonBF16(a.raw),
+                               detail::BitCastToRawNeonBF16(b.raw)));
 }
 
 #else
-template <class D32, HWY_IF_F32_D(D32)>
-HWY_API VFromD<D32> WidenMulPairwiseAdd(
-    D32 df32, VFromD<Repartition<bfloat16_t, D32>> a,
-    VFromD<Repartition<bfloat16_t, D32>> b) {
-  const RebindToUnsigned<decltype(df32)> du32;
-  using VU32 = VFromD<decltype(du32)>;
-  const VU32 odd = Set(du32, 0xFFFF0000u);
-  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
-  const VU32 ao = And(BitCast(du32, a), odd);
-  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
-  const VU32 bo = And(BitCast(du32, b), odd);
-  return MulAdd(BitCast(df32, ae), BitCast(df32, be),
-                Mul(BitCast(df32, ao), BitCast(df32, bo)));
+template <class DF, HWY_IF_F32_D(DF)>
+HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
+                                       VFromD<Repartition<bfloat16_t, DF>> a,
+                                       VFromD<Repartition<bfloat16_t, DF>> b) {
+  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
+                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
 }
-#endif  // HWY_NEON_HAVE_BFLOAT16
+#endif  // HWY_NEON_HAVE_F32_TO_BF16C
 
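The fallback now uses PromoteEvenTo/PromoteOddTo instead of hand-rolled shift-and-mask bit twiddling; both paths compute one f32 lane from each pair of adjacent bf16 products. A short hedged sketch (names illustrative):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1], widened from bf16 to f32.
    hn::Vec128<float> PairwiseBf16(hn::Vec128<hwy::bfloat16_t> a,
                                   hn::Vec128<hwy::bfloat16_t> b) {
      const hn::Full128<float> df;
      return hn::WidenMulPairwiseAdd(df, a, b);
    }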
 template <class D, HWY_IF_I32_D(D)>
 HWY_API Vec128<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<int16_t> a,
@@ -6266,6 +7421,23 @@ namespace detail {
 // There is no vuzpq_u64.
 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatEven, vuzp1, _, 2)
 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatOdd, vuzp2, _, 2)
+
+#if !HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_INLINE Vec128<float16_t, N> ConcatEven(Vec128<float16_t, N> hi,
+                                           Vec128<float16_t, N> lo) {
+  const DFromV<decltype(hi)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo)));
+}
+template <size_t N>
+HWY_INLINE Vec128<float16_t, N> ConcatOdd(Vec128<float16_t, N> hi,
+                                          Vec128<float16_t, N> lo) {
+  const DFromV<decltype(hi)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, ConcatOdd(BitCast(du, hi), BitCast(du, lo)));
+}
+#endif  // !HWY_HAVE_FLOAT16
 
 }  // namespace detail
 
 // Full/half vector
@@ -6374,6 +7546,36 @@ HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   return IfThenElse(MaskFromVec(vec), b, a);
 }
 
+// ------------------------------ InterleaveEven
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+#if HWY_ARCH_ARM_A64
+  return detail::InterleaveEven(a, b);
+#else
+  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[0]);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return InterleaveLower(a, b);
+}
+
+// ------------------------------ InterleaveOdd
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+#if HWY_ARCH_ARM_A64
+  return detail::InterleaveOdd(a, b);
+#else
+  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[1]);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  return InterleaveUpper(d, a, b);
+}
+
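InterleaveEven/InterleaveOdd combine the even (respectively odd) indexed lanes of both inputs, keeping them interleaved rather than packed as ConcatEven/ConcatOdd do. A small sketch with illustrative values:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    void InterleaveDemo() {
      const hn::FixedTag<int32_t, 4> d;
      const auto a = hn::Iota(d, 0);    // {0, 1, 2, 3}
      const auto b = hn::Iota(d, 10);   // {10, 11, 12, 13}
      const auto even = hn::InterleaveEven(d, a, b);  // {0, 10, 2, 12}
      const auto odd = hn::InterleaveOdd(d, a, b);    // {1, 11, 3, 13}
      (void)even;
      (void)odd;
    }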
 // ------------------------------ OddEvenBlocks
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
@@ -6395,12 +7597,14 @@ HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
 
 // ------------------------------ ReorderDemote2To (OddEven)
 
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D),
-          class V32 = VFromD<Repartition<float, D>>>
-HWY_API VFromD<D> ReorderDemote2To(D dbf16, V32 a, V32 b) {
-  const RebindToUnsigned<decltype(dbf16)> du16;
-  return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
+#if HWY_NEON_HAVE_F32_TO_BF16C
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
+                                   VFromD<Repartition<float, D>> b) {
+  const Half<decltype(dbf16)> dh_bf16;
+  return Combine(dbf16, DemoteTo(dh_bf16, b), DemoteTo(dh_bf16, a));
 }
+#endif  // HWY_NEON_HAVE_F32_TO_BF16C
 
 template <class D, HWY_IF_I32_D(D)>
 HWY_API Vec128<int32_t> ReorderDemote2To(D d32, Vec128<int64_t> a,
@@ -6616,16 +7820,19 @@ HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
   return ReorderDemote2To(d, a, b);
 }
 
-template <class D, HWY_IF_BF16_D(D), class V32 = VFromD<Repartition<float, D>>>
-HWY_API VFromD<D> OrderedDemote2To(D dbf16, V32 a, V32 b) {
+#if HWY_NEON_HAVE_F32_TO_BF16C
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
+                                   VFromD<Repartition<float, D>> b) {
   return ReorderDemote2To(dbf16, a, b);
 }
+#endif  // HWY_NEON_HAVE_F32_TO_BF16C
 
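The bf16 demote overloads are now gated on HWY_NEON_HAVE_F32_TO_BF16C and use hardware DemoteTo conversions instead of bit-level ConcatOdd truncation. A hedged usage sketch; the function name is illustrative, and on other targets generic_ops-inl.h supplies this op:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // Packs two f32 vectors into one bf16 vector, a's lanes first.
    void DemotePair(hn::Vec128<float> a, hn::Vec128<float> b,
                    hwy::bfloat16_t* HWY_RESTRICT out) {
      const hn::Repartition<hwy::bfloat16_t, hn::Full128<float>> dbf16;
      hn::StoreU(hn::OrderedDemote2To(dbf16, a, b), dbf16, out);
    }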
 // ================================================== CRYPTO
 
 // (aarch64 or Arm7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH).
 // Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*.
-#if HWY_TARGET == HWY_NEON
+#if HWY_TARGET != HWY_NEON_WITHOUT_AES
 
 #ifdef HWY_NATIVE_AES
 #undef HWY_NATIVE_AES
@@ -6676,7 +7883,7 @@ HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
       (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
 }
 
-#endif  // HWY_TARGET == HWY_NEON
+#endif  // HWY_TARGET != HWY_NEON_WITHOUT_AES
 
 // ================================================== MISC
 
@@ -6851,10 +8058,11 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
       vget_low_u64(vmull_u32(a_packed, b_packed)));
 }
 
-HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  uint64_t hi;
-  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
-  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
+template <class T, HWY_IF_UI64(T)>
+HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
+  T hi;
+  T lo = Mul128(GetLane(a), GetLane(b), &hi);
+  return Dup128VecFromValues(Full128<T>(), lo, hi);
 }
 
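The rewritten MulEven computes the full 64x64 -> 128-bit product of lane 0 via Mul128 and returns {low, high} as the two 64-bit lanes; it now also covers int64_t. A worked sketch with illustrative values:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    void MulEvenU64() {
      const hn::Full128<uint64_t> d;
      const auto a = hn::Set(d, 0xFFFFFFFFFFFFFFFFull);  // 2^64 - 1
      const auto b = hn::Set(d, 2ull);
      const auto lo_hi = hn::MulEven(a, b);              // (2^64-1)*2 = 2^65-2
      const uint64_t lo = hn::GetLane(lo_hi);            // 0xFFFFFFFFFFFFFFFE
      const uint64_t hi = hn::ExtractLane(lo_hi, 1);     // 1
      (void)lo;
      (void)hi;
    }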
 // Multiplies odd lanes (1, 3 ..) and places the double-wide result into
@@ -6957,10 +8165,11 @@ HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
       vget_low_u64(vmull_u32(a_packed, b_packed)));
 }
 
-HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  uint64_t hi;
-  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
-  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
+template <class T, HWY_IF_UI64(T)>
+HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
+  T hi;
+  T lo = Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi);
+  return Dup128VecFromValues(Full128<T>(), lo, hi);
 }
 
 // ------------------------------ TableLookupBytes (Combine, LowerHalf)
@@ -7025,7 +8234,7 @@ HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
 
 // ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes)
 
-#if HWY_TARGET == HWY_NEON
+#if HWY_TARGET != HWY_NEON_WITHOUT_AES
 template <uint8_t kRcon>
 HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
   alignas(16) static constexpr uint8_t kRconXorMask[16] = {
@@ -7038,51 +8247,26 @@ HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
   const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask));
   return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle));
 }
-#endif  // HWY_TARGET == HWY_NEON
+#endif  // HWY_TARGET != HWY_NEON_WITHOUT_AES
 
 // ------------------------------ Scatter in generic_ops-inl.h
 // ------------------------------ Gather in generic_ops-inl.h
 
 // ------------------------------ Reductions
 
-namespace detail {
-
-// N=1 for any T: no-op
-template <typename T>
-HWY_INLINE T ReduceMin(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
-  return GetLane(v);
-}
-template <typename T>
-HWY_INLINE T ReduceMax(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
-  return GetLane(v);
-}
-template <typename T>
-HWY_INLINE T ReduceSum(hwy::SizeTag<sizeof(T)> /* tag */, Vec128<T, 1> v) {
-  return GetLane(v);
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   Vec128<T, 1> v) {
-  return v;
-}
-
-// full vectors
+// On Armv8 we define ReduceSum and generic_ops defines SumOfLanes via Set.
 #if HWY_ARCH_ARM_A64
 
+#ifdef HWY_NATIVE_REDUCE_SCALAR
+#undef HWY_NATIVE_REDUCE_SCALAR
+#else
+#define HWY_NATIVE_REDUCE_SCALAR
+#endif
+
 // TODO(janwas): use normal HWY_NEON_DEF, then FULL type list.
 #define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \
-  HWY_API type##_t name(hwy::SizeTag<sizeof(type##_t)>,                 \
-                        Vec128<type##_t, size> v) {                     \
+  template <class D, HWY_IF_LANES_D(D, size)>                           \
+  HWY_API type##_t name(D /* tag */, Vec128<type##_t, size> v) {        \
     return HWY_NEON_EVAL(prefix##infix##suffix, v.raw);                 \
   }
 
@@ -7125,83 +8309,110 @@ HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv)
 HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv)
 HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv)
 
+// Emulate missing UI64 and partial N=2.
+template <class D, HWY_IF_LANES_D(D, 2),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
+HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v10) {
+  return GetLane(v10) + ExtractLane(v10, 1);
+}
+
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
+HWY_API TFromD<D> ReduceMin(D /* tag */, VFromD<D> v10) {
+  return HWY_MIN(GetLane(v10), ExtractLane(v10, 1));
+}
+
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
+HWY_API TFromD<D> ReduceMax(D /* tag */, VFromD<D> v10) {
+  return HWY_MAX(GetLane(v10), ExtractLane(v10, 1));
+}
+
 #if HWY_HAVE_FLOAT16
-HWY_API float16_t ReduceSum(hwy::SizeTag<2>, Vec64<float16_t> v) {
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
+HWY_API float16_t ReduceMin(D d, VFromD<D> v10) {
+  return GetLane(Min(v10, Reverse2(d, v10)));
+}
+
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
+HWY_API float16_t ReduceMax(D d, VFromD<D> v10) {
+  return GetLane(Max(v10, Reverse2(d, v10)));
+}
+
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 8)>
+HWY_API float16_t ReduceSum(D /* tag */, VFromD<D> v) {
   const float16x4_t x2 = vpadd_f16(v.raw, v.raw);
-  return GetLane(Vec64<float16_t>(vpadd_f16(x2, x2)));
+  return GetLane(VFromD<D>(vpadd_f16(x2, x2)));
 }
-HWY_API float16_t ReduceSum(hwy::SizeTag<2> tag, Vec128<float16_t> v) {
-  return ReduceSum(tag, LowerHalf(Vec128<float16_t>(vpaddq_f16(v.raw, v.raw))));
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
+HWY_API float16_t ReduceSum(D d, VFromD<D> v) {
+  const Half<decltype(d)> dh;
+  return ReduceSum(dh, LowerHalf(dh, VFromD<D>(vpaddq_f16(v.raw, v.raw))));
 }
-#endif
+#endif  // HWY_HAVE_FLOAT16
 
 #undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
 #undef HWY_NEON_DEF_REDUCTION_F16
 #undef HWY_NEON_DEF_REDUCTION_UI64
 #undef HWY_NEON_DEF_REDUCTION
 
-// Need some fallback implementations for [ui]64x2 and [ui]16x2.
-#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2)
-#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, (1 << 8) | (1 << 2))
+// ------------------------------ SumOfLanes
 
-// Implement Min/Max/SumOfLanes in terms of the corresponding reduction.
-template <size_t N, typename V>
-HWY_API V MinOfLanes(hwy::SizeTag<N> tag, V v) {
-  return Set(DFromV<decltype(v)>(), ReduceMin(tag, v));
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceSum(d, v));
 }
-template <size_t N, typename V>
-HWY_API V MaxOfLanes(hwy::SizeTag<N> tag, V v) {
-  return Set(DFromV<decltype(v)>(), ReduceMax(tag, v));
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMin(d, v));
 }
-template <size_t N, typename V>
-HWY_API V SumOfLanes(hwy::SizeTag<N> tag, V v) {
-  return Set(DFromV<decltype(v)>(), ReduceSum(tag, v));
+template <class D, HWY_IF_LANES_GT_D(D, 1)>
+HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
+  return Set(d, ReduceMax(d, v));
 }
 
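On Armv8 these reductions map to single instructions (vaddv/vminv/vmaxv), and SumOfLanes is simply Set(d, ReduceSum(d, v)). A brief usage sketch:

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    uint32_t SumFour(const uint32_t* HWY_RESTRICT data) {
      const hn::Full128<uint32_t> d;
      return hn::ReduceSum(d, hn::LoadU(d, data));  // vaddvq_u32 on A64
    }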
-#else
+// On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane.
+#else  // !HWY_ARCH_ARM_A64
+
+// Armv7 lacks N=2 and 8-bit x4, so enable generic versions of those.
+#undef HWY_IF_SUM_OF_LANES_D
+#define HWY_IF_SUM_OF_LANES_D(D)                                        \
+  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) ||                            \
+                (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
+      nullptr
+#undef HWY_IF_MINMAX_OF_LANES_D
+#define HWY_IF_MINMAX_OF_LANES_D(D)                                     \
+  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) ||                            \
+                (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
+      nullptr
 
 // For Armv7, we implement reductions using a series of pairwise operations. This
 // produces the full vector result, so we express Reduce* in terms of *OfLanes.
 #define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
-#define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128<type##_t, size>
 #define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix)  \
-  HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes( \
-      hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) {          \
+  template <class D, HWY_IF_LANES_D(D, size)>                              \
+  HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */,                  \
+                                               Vec128<type##_t, size> v) { \
    HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
    if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
-    return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size)(tmp);         \
-  }                                                                        \
-  HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)> tag,        \
-                                Vec128<type##_t, size> v) {                \
-    return GetLane(name##OfLanes(tag, v));                                 \
+    return Vec128<type##_t, size>(tmp);                                    \
  }
 
 // For the wide versions, the pairwise operations produce a half-length vector.
-// We produce that value with a Reduce*Vector helper method, and express Reduce*
-// and *OfLanes in terms of the helper.
+// We produce that `tmp` and then Combine.
 #define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
                                              suffix)                         \
-  HWY_API HWY_NEON_BUILD_TYPE_T(type, half)                                  \
-      Reduce##name##Vector(Vec128<type##_t, size> v) {                       \
+  template <class D, HWY_IF_LANES_D(D, size)>                                \
+  HWY_API Vec128<type##_t, size> name##OfLanes(D /* d */,                    \
+                                               Vec128<type##_t, size> v) {   \
    HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
    tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
                            vget_low_##suffix(v.raw)); \
    if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
-    return tmp;                                                              \
-  }                                                                          \
-  HWY_API type##_t Reduce##name(hwy::SizeTag<sizeof(type##_t)>,              \
-                                Vec128<type##_t, size> v) {                  \
-    const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v);   \
-    return HWY_NEON_EVAL(vget_lane_##suffix, tmp, 0);                        \
-  }                                                                          \
-  HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes(   \
-      hwy::SizeTag<sizeof(type##_t)>, Vec128<type##_t, size> v) {            \
-    const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v);   \
-    return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(                            \
-        type, size)(vcombine_##suffix(tmp, tmp));                            \
+    return Vec128<type##_t, size>(vcombine_##suffix(tmp, tmp));              \
  }
 
 #define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
@@ -7227,56 +8438,22 @@ HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax)
 #undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
 #undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
 #undef HWY_NEON_DEF_PAIRWISE_REDUCTION
-#undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION
 #undef HWY_NEON_BUILD_TYPE_T
 
-// Need fallback min/max implementations for [ui]64x2 and [ui]16x2.
-#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8)
-#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8)
-
+// GetLane(SumsOf4(v)) is more efficient on Armv7 NEON than the default
+// N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h.
+#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
+#undef HWY_NATIVE_REDUCE_SUM_4_UI8
+#else
+#define HWY_NATIVE_REDUCE_SUM_4_UI8
 #endif
 
-}  // namespace detail
-
-// [ui]16/[ui]64: N=2 -- special case for pairs of very small or large lanes
-template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
-HWY_API Vec128<T, 2> SumOfLanes(D /* tag */, Vec128<T, 2> v10) {
-  return v10 + Reverse2(Simd<T, 2, 0>(), v10);
-}
-
-template <class D, typename T, HWY_IF_SUM_REDUCTION(T)>
-HWY_API T ReduceSum(D d, Vec128<T, 2> v10) {
-  return GetLane(SumOfLanes(d, v10));
-}
-
-template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
-HWY_API Vec128<T, 2> MinOfLanes(D /* tag */, Vec128<T, 2> v10) {
-  return Min(v10, Reverse2(Simd<T, 2, 0>(), v10));
-}
-template <class D, typename T, HWY_IF_MINMAX_REDUCTION(T)>
-HWY_API Vec128<T, 2> MaxOfLanes(D /* tag */, Vec128<T, 2> v10) {
-  return Max(v10, Reverse2(Simd<T, 2, 0>(), v10));
+template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
+HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
+  return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
 }
 
-#undef HWY_IF_SUM_REDUCTION
-#undef HWY_IF_MINMAX_REDUCTION
-
-template <class D>
-HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
-  return detail::SumOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
-}
-template <class D>
-HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
-  return detail::ReduceSum(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
-}
-template <class D>
-HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
-  return detail::MinOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
-}
-template <class D>
-HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
-  return detail::MaxOfLanes(hwy::SizeTag<sizeof(TFromD<D>)>(), v);
-}
+#endif  // HWY_ARCH_ARM_A64
 
 // ------------------------------ LoadMaskBits (TestBit)
 
@@ -7345,6 +8522,15 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
   return detail::LoadMaskBits(d, mask_bits);
 }
 
+// ------------------------------ Dup128MaskFromMaskBits
+
+template <class D>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= (1u << kN) - 1;
+  return detail::LoadMaskBits(d, mask_bits);
+}
+
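Dup128MaskFromMaskBits turns an integer bit pattern into a lane mask, with bit i controlling lane i of each 128-bit block; bits beyond the lane count are cleared first. A hedged sketch (function name illustrative):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    size_t CountSelected() {
      const hn::Full128<uint32_t> d;
      const auto m = hn::Dup128MaskFromMaskBits(d, 0b0101u);  // lanes 0 and 2
      return hn::CountTrue(d, m);  // 2
    }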
 
 // ------------------------------ Mask
 namespace detail {
@@ -7674,7 +8860,7 @@ namespace detail {
 template <class D, HWY_IF_V_SIZE_D(D, 16)>
 HWY_INLINE Vec128<uint8_t> Load8Bytes(D /*tag*/, const uint8_t* bytes) {
   return Vec128<uint8_t>(vreinterpretq_u8_u64(
-      vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
+      vld1q_dup_u64(HWY_RCAST_ALIGNED(const uint64_t*, bytes))));
 }
 
 // Load 8 bytes and return half-reg with N <= 8 bytes.
@@ -8287,9 +9473,8 @@ HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
 template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1) {
-  auto raw = detail::LoadInterleaved2(
-      reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned),
-      detail::Tuple2<T, d.MaxLanes()>());
+  auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(unaligned),
+                                      detail::Tuple2<T, d.MaxLanes()>());
   v0 = VFromD<D>(raw.val[0]);
   v1 = VFromD<D>(raw.val[1]);
 }
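LoadInterleaved2 wraps vld2: it splits interleaved {x0,y0,x1,y1,...} storage into separate x and y vectors in a single load. A usage sketch (function name illustrative):

    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    void SplitXY(const float* HWY_RESTRICT xy, float* HWY_RESTRICT out_x,
                 float* HWY_RESTRICT out_y) {
      const hn::Full128<float> d;
      hn::Vec128<float> x, y;
      hn::LoadInterleaved2(d, xy, x, y);
      hn::StoreU(x, d, out_x);
      hn::StoreU(y, d, out_y);
    }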
@@ -8301,9 +9486,8 @@ HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
8301
9486
  // The smallest vector registers are 64-bits and we want space for two.
8302
9487
  alignas(16) T buf[2 * 8 / sizeof(T)] = {};
8303
9488
  CopyBytes<d.MaxBytes() * 2>(unaligned, buf);
8304
- auto raw = detail::LoadInterleaved2(
8305
- reinterpret_cast<const detail::NativeLaneType<T>*>(buf),
8306
- detail::Tuple2<T, d.MaxLanes()>());
9489
+ auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(buf),
9490
+ detail::Tuple2<T, d.MaxLanes()>());
8307
9491
  v0 = VFromD<D>(raw.val[0]);
8308
9492
  v1 = VFromD<D>(raw.val[1]);
8309
9493
  }
@@ -8315,12 +9499,8 @@ HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
8315
9499
  Vec128<T>& v1) {
8316
9500
  const Half<decltype(d)> dh;
8317
9501
  VFromD<decltype(dh)> v00, v10, v01, v11;
8318
- LoadInterleaved2(
8319
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned), v00,
8320
- v10);
8321
- LoadInterleaved2(
8322
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 2),
8323
- v01, v11);
9502
+ LoadInterleaved2(dh, detail::NativeLanePointer(unaligned), v00, v10);
9503
+ LoadInterleaved2(dh, detail::NativeLanePointer(unaligned + 2), v01, v11);
8324
9504
  v0 = Combine(d, v01, v00);
8325
9505
  v1 = Combine(d, v11, v10);
8326
9506
  }
@@ -8331,9 +9511,8 @@ HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
8331
9511
  template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
8332
9512
  HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
8333
9513
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
8334
- auto raw = detail::LoadInterleaved3(
8335
- reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned),
8336
- detail::Tuple3<T, d.MaxLanes()>());
9514
+ auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(unaligned),
9515
+ detail::Tuple3<T, d.MaxLanes()>());
8337
9516
  v0 = VFromD<D>(raw.val[0]);
8338
9517
  v1 = VFromD<D>(raw.val[1]);
8339
9518
  v2 = VFromD<D>(raw.val[2]);
@@ -8346,9 +9525,8 @@ HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
8346
9525
  // The smallest vector registers are 64-bits and we want space for three.
8347
9526
  alignas(16) T buf[3 * 8 / sizeof(T)] = {};
8348
9527
  CopyBytes<d.MaxBytes() * 3>(unaligned, buf);
8349
- auto raw = detail::LoadInterleaved3(
8350
- reinterpret_cast<const detail::NativeLaneType<T>*>(buf),
8351
- detail::Tuple3<T, d.MaxLanes()>());
9528
+ auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(buf),
9529
+ detail::Tuple3<T, d.MaxLanes()>());
8352
9530
  v0 = VFromD<D>(raw.val[0]);
8353
9531
  v1 = VFromD<D>(raw.val[1]);
8354
9532
  v2 = VFromD<D>(raw.val[2]);
@@ -8361,12 +9539,8 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
8361
9539
  Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
8362
9540
  const Half<decltype(d)> dh;
8363
9541
  VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
8364
- LoadInterleaved3(
8365
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned), v00,
8366
- v10, v20);
8367
- LoadInterleaved3(
8368
- dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 3),
8369
- v01, v11, v21);
9542
+ LoadInterleaved3(dh, detail::NativeLanePointer(unaligned), v00, v10, v20);
9543
+ LoadInterleaved3(dh, detail::NativeLanePointer(unaligned + 3), v01, v11, v21);
8370
9544
  v0 = Combine(d, v01, v00);
8371
9545
  v1 = Combine(d, v11, v10);
8372
9546
  v2 = Combine(d, v21, v20);
@@ -8379,9 +9553,8 @@ template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
8379
9553
  HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
8380
9554
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
8381
9555
  VFromD<D>& v3) {
8382
- auto raw = detail::LoadInterleaved4(
8383
- reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned),
8384
- detail::Tuple4<T, d.MaxLanes()>());
9556
+ auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(unaligned),
9557
+ detail::Tuple4<T, d.MaxLanes()>());
8385
9558
  v0 = VFromD<D>(raw.val[0]);
8386
9559
  v1 = VFromD<D>(raw.val[1]);
8387
9560
  v2 = VFromD<D>(raw.val[2]);
@@ -8395,9 +9568,8 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
8395
9568
  VFromD<D>& v3) {
8396
9569
  alignas(16) T buf[4 * 8 / sizeof(T)] = {};
8397
9570
  CopyBytes<d.MaxBytes() * 4>(unaligned, buf);
8398
- auto raw = detail::LoadInterleaved4(
8399
- reinterpret_cast<const detail::NativeLaneType<T>*>(buf),
8400
- detail::Tuple4<T, d.MaxLanes()>());
9571
+ auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(buf),
9572
+ detail::Tuple4<T, d.MaxLanes()>());
8401
9573
  v0 = VFromD<D>(raw.val[0]);
8402
9574
  v1 = VFromD<D>(raw.val[1]);
8403
9575
  v2 = VFromD<D>(raw.val[2]);
@@ -8412,12 +9584,10 @@ HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                               Vec128<T>& v3) {
   const Half<decltype(d)> dh;
   VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
-  LoadInterleaved4(
-      dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned), v00,
-      v10, v20, v30);
-  LoadInterleaved4(
-      dh, reinterpret_cast<const detail::NativeLaneType<T>*>(unaligned + 4),
-      v01, v11, v21, v31);
+  LoadInterleaved4(dh, detail::NativeLanePointer(unaligned), v00, v10, v20,
+                   v30);
+  LoadInterleaved4(dh, detail::NativeLanePointer(unaligned + 4), v01, v11, v21,
+                   v31);
   v0 = Combine(d, v01, v00);
   v1 = Combine(d, v11, v10);
   v2 = Combine(d, v21, v20);
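
Every LoadInterleaved hunk above makes the same substitution: the repeated reinterpret_cast<const detail::NativeLaneType<T>*>(...) expressions are funneled through a detail::NativeLanePointer helper. That helper's definition is introduced elsewhere in the header and is not shown in these hunks; a minimal sketch of what it could look like, assuming it does nothing beyond wrapping the cast (hypothetical reconstruction, not the diff's actual code):

    // Hypothetical sketch of detail::NativeLanePointer; the real definition
    // lives earlier in arm_neon-inl.h. It converts a pointer to the public
    // lane type T into a pointer to the type the NEON intrinsics accept
    // (e.g. float16_t for hwy::float16_t lanes).
    template <typename T>
    HWY_INLINE detail::NativeLaneType<T>* NativeLanePointer(T* p) {
      return reinterpret_cast<detail::NativeLaneType<T>*>(p);
    }
    template <typename T>
    HWY_INLINE const detail::NativeLaneType<T>* NativeLanePointer(const T* p) {
      return reinterpret_cast<const detail::NativeLaneType<T>*>(p);
    }

Centralizing the cast keeps each call site on a single line and leaves one place to handle const and special float lane types, which is why most hunks in this region shrink by a line or two.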
@@ -8476,8 +9646,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
-  detail::StoreInterleaved2(
-      tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
+  detail::StoreInterleaved2(tup, detail::NativeLanePointer(unaligned));
 }
 
 // <= 32 bits: avoid writing more than N bytes by copying to buffer
@@ -8486,8 +9655,7 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   alignas(16) T buf[2 * 8 / sizeof(T)];
   detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
-  detail::StoreInterleaved2(tup,
-                            reinterpret_cast<detail::NativeLaneType<T>*>(buf));
+  detail::StoreInterleaved2(tup, detail::NativeLanePointer(buf));
   CopyBytes<d.MaxBytes() * 2>(buf, unaligned);
 }
 
@@ -8498,10 +9666,9 @@ HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   const Half<decltype(d)> dh;
   StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh,
-                    reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
-  StoreInterleaved2(
-      UpperHalf(dh, v0), UpperHalf(dh, v1), dh,
-      reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 2));
+                    detail::NativeLanePointer(unaligned));
+  StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh,
+                    detail::NativeLanePointer(unaligned + 2));
 }
 #endif  // HWY_ARCH_ARM_V7
 
@@ -8511,8 +9678,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                                T* HWY_RESTRICT unaligned) {
   detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
-  detail::StoreInterleaved3(
-      tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
+  detail::StoreInterleaved3(tup, detail::NativeLanePointer(unaligned));
 }
 
 // <= 32 bits: avoid writing more than N bytes by copying to buffer
@@ -8521,8 +9687,7 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                                T* HWY_RESTRICT unaligned) {
   alignas(16) T buf[3 * 8 / sizeof(T)];
   detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
-  detail::StoreInterleaved3(tup,
-                            reinterpret_cast<detail::NativeLaneType<T>*>(buf));
+  detail::StoreInterleaved3(tup, detail::NativeLanePointer(buf));
   CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
 }
 
@@ -8533,10 +9698,9 @@ HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
                                T* HWY_RESTRICT unaligned) {
   const Half<decltype(d)> dh;
   StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
-                    reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
-  StoreInterleaved3(
-      UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
-      reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 3));
+                    detail::NativeLanePointer(unaligned));
+  StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
+                    detail::NativeLanePointer(unaligned + 3));
 }
 #endif  // HWY_ARCH_ARM_V7
 
@@ -8546,8 +9710,7 @@ template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                                VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
   detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
-  detail::StoreInterleaved4(
-      tup, reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
+  detail::StoreInterleaved4(tup, detail::NativeLanePointer(unaligned));
 }
 
 // <= 32 bits: avoid writing more than N bytes by copying to buffer
@@ -8556,8 +9719,7 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                                VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
   alignas(16) T buf[4 * 8 / sizeof(T)];
   detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
-  detail::StoreInterleaved4(tup,
-                            reinterpret_cast<detail::NativeLaneType<T>*>(buf));
+  detail::StoreInterleaved4(tup, detail::NativeLanePointer(buf));
   CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
 }
 
@@ -8569,11 +9731,10 @@ HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
   const Half<decltype(d)> dh;
   StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
                     LowerHalf(dh, v3), dh,
-                    reinterpret_cast<detail::NativeLaneType<T>*>(unaligned));
-  StoreInterleaved4(
-      UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
-      UpperHalf(dh, v3), dh,
-      reinterpret_cast<detail::NativeLaneType<T>*>(unaligned + 4));
+                    detail::NativeLanePointer(unaligned));
+  StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
+                    UpperHalf(dh, v3), dh,
+                    detail::NativeLanePointer(unaligned + 4));
 }
 #endif  // HWY_ARCH_ARM_V7
 
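For context, LoadInterleaved2/3/4 and StoreInterleaved2/3/4 are public Highway ops; the hunks above only change their internal pointer casts, not their signatures or behavior. A minimal caller sketch, not part of this diff, assuming the static-dispatch pattern from hwy/highway.h and a pixel count that is a multiple of the vector length:

    #include <stddef.h>
    #include <stdint.h>
    #include "hwy/highway.h"
    namespace hn = hwy::HWY_NAMESPACE;

    // Swap the R and B channels of packed RGB bytes, one vector of pixels per
    // iteration. LoadInterleaved3 splits the interleaved bytes into three
    // planar vectors; StoreInterleaved3 re-interleaves them on the way out.
    void SwapRB(uint8_t* HWY_RESTRICT rgb, size_t num_pixels) {
      const hn::ScalableTag<uint8_t> d;
      const size_t N = hn::Lanes(d);
      for (size_t i = 0; i + N <= num_pixels; i += N) {
        hn::Vec<decltype(d)> r, g, b;
        hn::LoadInterleaved3(d, rgb + 3 * i, r, g, b);
        hn::StoreInterleaved3(b, g, r, d, rgb + 3 * i);
      }
    }

On NEON these ops lower to the vld3/vst3 family patched above; the <= 32-bit overloads bounce through a stack buffer so that no more than N lanes are read or written.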
@@ -8904,7 +10065,7 @@ namespace detail {  // for code folding
 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
 #undef HWY_NEON_DEF_FUNCTION_UINTS
 #undef HWY_NEON_EVAL
-
+#undef HWY_NEON_IF_EMULATED_D
 }  // namespace detail
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)