sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp
@@ -0,0 +1,3142 @@
+ /***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+ #ifndef XSIMD_NEON_HPP
+ #define XSIMD_NEON_HPP
+
+ #include <algorithm>
+ #include <array>
+ #include <complex>
+ #include <tuple>
+ #include <type_traits>
+
+ #include "../types/xsimd_neon_register.hpp"
+ #include "../types/xsimd_utils.hpp"
+
+ // Wrap intrinsics so we can pass them as function pointers
+ // - OP: intrinsics name prefix, e.g., vorrq
+ // - RT: type traits to deduce intrinsics return types
+ #define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \
+ { \
+ return ::OP##_u8(a, b); \
+ } \
+ XSIMD_INLINE RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
+ { \
+ return ::OP##_u16(a, b); \
+ } \
+ XSIMD_INLINE RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
+ { \
+ return ::OP##_u32(a, b); \
+ } \
+ }
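Editorial note, not part of the diffed file: the macro just defined is purely mechanical. Expanding WRAP_BINARY_UINT_EXCLUDING_64(vorrq, detail::identity_return_type), with vorrq being the example named in the comment above, yields plain forwarding functions. The sketch below writes XSIMD_INLINE as plain inline, applies the identity return type, and ignores the enclosing xsimd::kernel namespace where the macro is actually invoked later in this header.

// Illustration only: approximate expansion of the wrapper macro for vorrq.
#include <arm_neon.h> // assumes an ARM toolchain with NEON support
namespace wrap
{
    inline uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b) noexcept { return ::vorrq_u8(a, b); }
    inline uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b) noexcept { return ::vorrq_u16(a, b); }
    inline uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b) noexcept { return ::vorrq_u32(a, b); }
}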
+
+ #define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+ WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
+ { \
+ return ::OP##_s8(a, b); \
+ } \
+ XSIMD_INLINE RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
+ { \
+ return ::OP##_s16(a, b); \
+ } \
+ XSIMD_INLINE RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
+ { \
+ return ::OP##_s32(a, b); \
+ } \
+ }
+
+ #define WRAP_BINARY_INT(OP, RT) \
+ WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
+ { \
+ return ::OP##_u64(a, b); \
+ } \
+ XSIMD_INLINE RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept \
+ { \
+ return ::OP##_s64(a, b); \
+ } \
+ }
+
+ #define WRAP_BINARY_FLOAT(OP, RT) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
+ { \
+ return ::OP##_f32(a, b); \
+ } \
+ }
+
+ #define WRAP_UNARY_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE uint8x16_t OP##_u8(uint8x16_t a) noexcept \
+ { \
+ return ::OP##_u8(a); \
+ } \
+ XSIMD_INLINE int8x16_t OP##_s8(int8x16_t a) noexcept \
+ { \
+ return ::OP##_s8(a); \
+ } \
+ XSIMD_INLINE uint16x8_t OP##_u16(uint16x8_t a) noexcept \
+ { \
+ return ::OP##_u16(a); \
+ } \
+ XSIMD_INLINE int16x8_t OP##_s16(int16x8_t a) noexcept \
+ { \
+ return ::OP##_s16(a); \
+ } \
+ XSIMD_INLINE uint32x4_t OP##_u32(uint32x4_t a) noexcept \
+ { \
+ return ::OP##_u32(a); \
+ } \
+ XSIMD_INLINE int32x4_t OP##_s32(int32x4_t a) noexcept \
+ { \
+ return ::OP##_s32(a); \
+ } \
+ }
+
+ #define WRAP_UNARY_INT(OP) \
+ WRAP_UNARY_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE uint64x2_t OP##_u64(uint64x2_t a) noexcept \
+ { \
+ return ::OP##_u64(a); \
+ } \
+ XSIMD_INLINE int64x2_t OP##_s64(int64x2_t a) noexcept \
+ { \
+ return ::OP##_s64(a); \
+ } \
+ }
+
+ #define WRAP_UNARY_FLOAT(OP) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE float32x4_t OP##_f32(float32x4_t a) noexcept \
+ { \
+ return ::OP##_f32(a); \
+ } \
+ }
+
+ // Dummy identity caster to ease coding
+ XSIMD_INLINE uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; }
+ XSIMD_INLINE int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; }
+ XSIMD_INLINE uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; }
+ XSIMD_INLINE int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; }
+ XSIMD_INLINE uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; }
+ XSIMD_INLINE int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; }
+ XSIMD_INLINE uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
+ XSIMD_INLINE int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
+ XSIMD_INLINE float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }
+
+ namespace xsimd
+ {
+ template <typename T, class A, bool... Values>
+ struct batch_bool_constant;
+
+ namespace kernel
+ {
+ using namespace types;
+
+ namespace detail
+ {
+ template <template <class> class return_type, class... T>
+ struct neon_dispatcher_base
+ {
+ struct unary
+ {
+ using container_type = std::tuple<return_type<T> (*)(T)...>;
+ const container_type m_func;
+
+ template <class U>
+ return_type<U> apply(U rhs) const noexcept
+ {
+ using func_type = return_type<U> (*)(U);
+ auto func = xsimd::detail::get<func_type>(m_func);
+ return func(rhs);
+ }
+ };
+
+ struct binary
+ {
+ using container_type = std::tuple<return_type<T> (*)(T, T)...>;
+ const container_type m_func;
+
+ template <class U>
+ return_type<U> apply(U lhs, U rhs) const noexcept
+ {
+ using func_type = return_type<U> (*)(U, U);
+ auto func = xsimd::detail::get<func_type>(m_func);
+ return func(lhs, rhs);
+ }
+ };
+ };
+
+ /***************************
+ * arithmetic dispatchers *
+ ***************************/
+
+ template <class T>
+ using identity_return_type = T;
+
+ template <class... T>
+ struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...>
+ {
+ };
+
+ using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t>;
+
+ using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ float32x4_t>;
+
+ using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t>;
+
+ /**************************
+ * comparison dispatchers *
+ **************************/
+
+ template <class T>
+ struct comp_return_type_impl;
+
+ template <>
+ struct comp_return_type_impl<uint8x16_t>
+ {
+ using type = uint8x16_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<int8x16_t>
+ {
+ using type = uint8x16_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<uint16x8_t>
+ {
+ using type = uint16x8_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<int16x8_t>
+ {
+ using type = uint16x8_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<uint32x4_t>
+ {
+ using type = uint32x4_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<int32x4_t>
+ {
+ using type = uint32x4_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<uint64x2_t>
+ {
+ using type = uint64x2_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<int64x2_t>
+ {
+ using type = uint64x2_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<float32x4_t>
+ {
+ using type = uint32x4_t;
+ };
+
+ template <class T>
+ using comp_return_type = typename comp_return_type_impl<T>::type;
+
+ template <class... T>
+ struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...>
+ {
+ };
+
+ using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ float32x4_t>;
+
+ /**************************************
+ * enabling / disabling metafunctions *
+ **************************************/
+
+ template <class T>
+ using enable_neon_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value,
+ int>::type;
+
+ template <class T>
+ using exclude_int64_neon_t
+ = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>::type;
+ }
+
+ /*************
+ * broadcast *
+ *************/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_u8(uint8_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_s8(int8_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_u16(uint16_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_s16(int16_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_u32(uint32_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_s32(int32_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_u64(uint64_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_s64(int64_t(val));
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_f32(val);
+ }
+
+ /*******
+ * set *
+ *******/
+
+ template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept
+ {
+ return xsimd::types::detail::neon_vector_type<T> { args... };
+ }
+
+ template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ using unsigned_type = as_unsigned_integer_t<T>;
+ return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
+ {
+ return float32x4_t { f0, f1, f2, f3 };
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
+ std::complex<float> c0, std::complex<float> c1,
+ std::complex<float> c2, std::complex<float> c3) noexcept
+ {
+ return batch<std::complex<float>, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
+ float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
+ }
+
+ template <class A, class... Args>
+ XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
+ {
+ using register_type = typename batch_bool<float, A>::register_type;
+ using unsigned_type = as_unsigned_integer_t<float>;
+ return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
+ }
+
+ /*************
+ * from_bool *
+ *************/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_u8(arg, vdupq_n_u8(1));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_u16(arg, vdupq_n_u16(1));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_u32(arg, vdupq_n_u32(1));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_u64(arg, vdupq_n_u64(1));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
+ }
+
+ /********
+ * load *
+ ********/
+
+ // It is not possible to use a call to A::alignment() here, so use an
+ // immediate instead.
+ #if defined(__clang__) || defined(__GNUC__)
+ #define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
+ #elif defined(_MSC_VER)
+ #define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
+ #else
+ #define xsimd_aligned_load(inst, type, expr) inst((type)expr)
+ #endif
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_u8, uint8_t*, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_s8, int8_t*, src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_u16, uint16_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_s16, int16_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_u32, uint32_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_s32, int32_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_u64, uint64_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_s64, int64_t*, src);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_f32, float*, src);
+ }
+
+ #undef xsimd_aligned_load
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_u8((uint8_t*)src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_s8((int8_t*)src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_u16((uint16_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_s16((int16_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_u32((uint32_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_s32((int32_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_u64((uint64_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_s64((int64_t*)src);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
+ {
+ return vld1q_f32(src);
+ }
+
+ /*********
+ * store *
+ *********/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_u8((uint8_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_s8((int8_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_u16((uint16_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_s16((int16_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_u32((uint32_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_s32((int32_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_u64((uint64_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_s64((int64_t*)dst, src);
+ }
+
+ template <class A>
+ XSIMD_INLINE void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_f32(dst, src);
+ }
+
+ template <class A, class T>
+ XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ store_aligned<A>(dst, src, A {});
+ }
+
+ /****************
+ * load_complex *
+ ****************/
+
+ template <class A>
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept
+ {
+ using real_batch = batch<float, A>;
+ const float* buf = reinterpret_cast<const float*>(mem);
+ float32x4x2_t tmp = vld2q_f32(buf);
+ real_batch real = tmp.val[0],
+ imag = tmp.val[1];
+ return batch<std::complex<float>, A> { real, imag };
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept
+ {
+ return load_complex_aligned<A>(mem, cvt, A {});
+ }
+
+ /*****************
+ * store_complex *
+ *****************/
+
+ template <class A>
+ XSIMD_INLINE void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
+ {
+ float32x4x2_t tmp;
+ tmp.val[0] = src.real();
+ tmp.val[1] = src.imag();
+ float* buf = reinterpret_cast<float*>(dst);
+ vst2q_f32(buf, tmp);
+ }
+
+ template <class A>
+ XSIMD_INLINE void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
+ {
+ store_complex_aligned(dst, src, A {});
+ }
+
+ /*******
+ * neg *
+ *******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs)));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vnegq_s8(rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs)));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vnegq_s16(rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs)));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vnegq_s32(rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return 0 - rhs;
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vnegq_f32(rhs);
+ }
+
+ /*******
+ * add *
+ *******/
+
+ WRAP_BINARY_INT(vaddq, detail::identity_return_type)
+ WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type)
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16,
+ wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64,
+ wrap::vaddq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /*******
+ * avg *
+ *******/
+
+ WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type)
+
+ template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
+ std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /********
+ * avgr *
+ ********/
+
+ WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type)
+
+ template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
+ std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /********
+ * sadd *
+ ********/
+
+ WRAP_BINARY_INT(vqaddq, detail::identity_return_type)
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16,
+ wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64,
+ wrap::vaddq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /*******
+ * sub *
+ *******/
+
+ WRAP_BINARY_INT(vsubq, detail::identity_return_type)
+ WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type)
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16,
+ wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64,
+ wrap::vsubq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /********
+ * ssub *
+ ********/
+
+ WRAP_BINARY_INT(vqsubq, detail::identity_return_type)
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16,
+ wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64,
+ wrap::vsubq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /*******
+ * mul *
+ *******/
+
+ WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type)
+ WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type)
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16,
+ wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /*******
+ * div *
+ *******/
+
+ #if defined(XSIMD_FAST_INTEGER_DIVISION)
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs));
+ }
+ #endif
+
+ template <class A>
+ XSIMD_INLINE batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html
+ // get an initial estimate of 1/b.
+ float32x4_t rcp = reciprocal(rhs);
+
+ // use a couple Newton-Raphson steps to refine the estimate. Depending on your
+ // application's accuracy requirements, you may be able to get away with only
+ // one refinement (instead of the two used here). Be sure to test!
+ rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);
+ rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);
+
+ // and finally, compute a / b = a * (1 / b)
+ return vmulq_f32(lhs, rcp);
+ }
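Editorial note on the two refinement lines above, not part of the diffed file: vrecpsq_f32(b, x) computes $2 - b\,x$, so each line performs one Newton-Raphson step for the reciprocal, $x_{n+1} = x_n\,(2 - b\,x_n)$, which roughly doubles the number of correct bits per step. Two steps applied to the coarse initial estimate returned by reciprocal() (presumably a wrapper around vrecpeq_f32) bring the reciprocal close to full single-precision accuracy before the final multiply by the numerator.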
891
+
892
+ /******
893
+ * eq *
894
+ ******/
895
+
896
+ WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type)
897
+ WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type)
898
+
899
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
900
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
901
+ {
902
+ using register_type = typename batch<T, A>::register_type;
903
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
904
+ std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16,
905
+ wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32)
906
+ };
907
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
908
+ }
909
+
910
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
911
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
912
+ {
913
+ using register_type = typename batch_bool<T, A>::register_type;
914
+ using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary;
915
+ const dispatcher_type dispatcher = {
916
+ std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
917
+ };
918
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
919
+ }
920
+
921
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
922
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
923
+ {
924
+ auto eq32 = vceqq_u32(vreinterpretq_u32_u64(lhs.data), vreinterpretq_u32_u64(rhs.data));
925
+ auto rev32 = vrev64q_u32(eq32);
926
+ auto eq64 = vandq_u32(eq32, rev32);
927
+ return batch_bool<T, A>(vreinterpretq_u64_u32(eq64));
928
+ }
929
+
930
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
931
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
932
+ {
933
+ auto eq32 = vceqq_u32(vreinterpretq_u32_s64(lhs.data), vreinterpretq_u32_s64(rhs.data));
934
+ auto rev32 = vrev64q_u32(eq32);
935
+ auto eq64 = vandq_u32(eq32, rev32);
936
+ return batch_bool<T, A>(vreinterpretq_u64_u32(eq64));
937
+ }
938
+
939
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
940
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
941
+ {
942
+ return eq(batch<T, A> { lhs.data }, batch<T, A> { rhs.data }, A {});
943
+ }
+
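The 64-bit eq overloads above work around the missing vceqq_u64 on ARMv7 NEON. A standalone sketch of the trick (hypothetical name eq_u64):

#include <arm_neon.h>

// Compare the 32-bit halves, swap the two halves of the result within each
// 64-bit lane, and AND: a lane is all-ones only if both halves compared equal.
inline uint64x2_t eq_u64(uint64x2_t a, uint64x2_t b)
{
    uint32x4_t eq32 = vceqq_u32(vreinterpretq_u32_u64(a), vreinterpretq_u32_u64(b));
    uint32x4_t rev32 = vrev64q_u32(eq32);
    return vreinterpretq_u64_u32(vandq_u32(eq32, rev32));
}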
+ /*************
946
+ * fast_cast *
947
+ *************/
948
+
949
+ namespace detail
950
+ {
951
+ template <class A>
952
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
953
+ {
954
+ return vcvtq_f32_s32(self);
955
+ }
956
+
957
+ template <class A>
958
+ XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
959
+ {
960
+ return vcvtq_f32_u32(self);
961
+ }
962
+
963
+ template <class A>
964
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept
965
+ {
966
+ return vcvtq_s32_f32(self);
967
+ }
968
+
969
+ template <class A>
970
+ XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept
971
+ {
972
+ return vcvtq_u32_f32(self);
973
+ }
974
+
975
+ }
976
+
977
+ /******
978
+ * lt *
979
+ ******/
980
+
981
+ WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type)
982
+ WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type)
983
+
984
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
985
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
986
+ {
987
+ using register_type = typename batch<T, A>::register_type;
988
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
989
+ std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16,
990
+ wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32)
991
+ };
992
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
993
+ }
994
+
995
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
996
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
997
+ {
998
+ using register_type = typename batch<T, A>::register_type;
999
+ return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vqsubq_s64(register_type(lhs), register_type(rhs)), 63)));
1000
+ }
1001
+
1002
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1003
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1004
+ {
1005
+ using register_type = typename batch<T, A>::register_type;
1006
+ register_type acc = { 0x7FFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull };
1007
+ return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(rhs), register_type(lhs)), acc)), 63)));
1008
+ }
+
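Similarly, the int64 lt overloads above compensate for the missing vcltq_s64/vcltq_u64. A sketch of the signed case (hypothetical name lt_s64): the saturating difference cannot wrap around, so its sign bit is exactly the comparison result, and an arithmetic shift right by 63 smears that bit into a full-lane mask.

#include <arm_neon.h>

inline uint64x2_t lt_s64(int64x2_t a, int64x2_t b)
{
    // a < b  <=>  saturate(a - b) < 0
    return vreinterpretq_u64_s64(vshrq_n_s64(vqsubq_s64(a, b), 63));
}

The unsigned variant above instead forms the saturating difference rhs - lhs and adds 0x7FFFFFFFFFFFFFFF with saturation, so the top bit ends up set exactly when that difference is nonzero.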
+ /******
1011
+ * le *
1012
+ ******/
1013
+
1014
+ WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type)
1015
+ WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type)
1016
+
1017
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1018
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1019
+ {
1020
+ using register_type = typename batch<T, A>::register_type;
1021
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
1022
+ std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
1023
+ wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
1024
+ };
1025
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
1026
+ }
1027
+
1028
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1029
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1030
+ {
1031
+ return !(lhs > rhs);
1032
+ }
1033
+
1034
+ /******
1035
+ * gt *
1036
+ ******/
1037
+ namespace detail
1038
+ {
1039
+ XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
1040
+ {
1041
+ return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
1042
+ }
1043
+
1044
+ XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
1045
+ {
1046
+ return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
1047
+ }
1048
+ }
1049
+
1050
+ WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
1051
+ WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
1052
+
1053
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1054
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1055
+ {
1056
+ using register_type = typename batch<T, A>::register_type;
1057
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
1058
+ std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
1059
+ wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
1060
+ };
1061
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
1062
+ }
1063
+
1064
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1065
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1066
+ {
1067
+ using register_type = typename batch<T, A>::register_type;
1068
+ return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vqsubq_s64(register_type(rhs), register_type(lhs)), 63)));
1069
+ }
1070
+
1071
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1072
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1073
+ {
1074
+ using register_type = typename batch<T, A>::register_type;
1075
+ register_type acc = { 0x7FFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull };
1076
+ return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(lhs), register_type(rhs)), acc)), 63)));
1077
+ }
1078
+
1079
+ /******
1080
+ * ge *
1081
+ ******/
1082
+
1083
+ WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type)
1084
+ WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)
1085
+
1086
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1087
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1088
+ {
1089
+ using register_type = typename batch<T, A>::register_type;
1090
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
1091
+ std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
1092
+ wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
1093
+ };
1094
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
1095
+ }
1096
+
1097
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1098
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1099
+ {
1100
+ return !(lhs < rhs);
1101
+ }
1102
+
1103
+ /*******************
1104
+ * batch_bool_cast *
1105
+ *******************/
1106
+
1107
+ template <class A, class T_out, class T_in>
1108
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept
1109
+ {
1110
+ using register_type = typename batch_bool<T_out, A>::register_type;
1111
+ return register_type(self);
1112
+ }
1113
+
1114
+ /***************
1115
+ * bitwise_and *
1116
+ ***************/
1117
+
1118
+ WRAP_BINARY_INT(vandq, detail::identity_return_type)
1119
+
1120
+ namespace detail
1121
+ {
1122
+ XSIMD_INLINE float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1123
+ {
1124
+ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
1125
+ vreinterpretq_u32_f32(rhs)));
1126
+ }
1127
+
1128
+ template <class V>
1129
+ V bitwise_and_neon(V const& lhs, V const& rhs)
1130
+ {
1131
+ const neon_dispatcher::binary dispatcher = {
1132
+ std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
1133
+ wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
1134
+ bitwise_and_f32)
1135
+ };
1136
+ return dispatcher.apply(lhs, rhs);
1137
+ }
1138
+ }
1139
+
1140
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1141
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1142
+ {
1143
+ using register_type = typename batch<T, A>::register_type;
1144
+ return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
1145
+ }
1146
+
1147
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1148
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1149
+ {
1150
+ using register_type = typename batch_bool<T, A>::register_type;
1151
+ return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
1152
+ }
1153
+
1154
+ /**************
1155
+ * bitwise_or *
1156
+ **************/
1157
+
1158
+ WRAP_BINARY_INT(vorrq, detail::identity_return_type)
1159
+
1160
+ namespace detail
1161
+ {
1162
+ XSIMD_INLINE float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1163
+ {
1164
+ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
1165
+ vreinterpretq_u32_f32(rhs)));
1166
+ }
1167
+
1168
+ template <class V>
1169
+ XSIMD_INLINE V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
1170
+ {
1171
+ const neon_dispatcher::binary dispatcher = {
1172
+ std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
1173
+ wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
1174
+ bitwise_or_f32)
1175
+ };
1176
+ return dispatcher.apply(lhs, rhs);
1177
+ }
1178
+ }
1179
+
1180
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1181
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1182
+ {
1183
+ using register_type = typename batch<T, A>::register_type;
1184
+ return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
1185
+ }
1186
+
1187
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1188
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1189
+ {
1190
+ using register_type = typename batch_bool<T, A>::register_type;
1191
+ return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
1192
+ }
1193
+
1194
+ /***************
1195
+ * bitwise_xor *
1196
+ ***************/
1197
+
1198
+ WRAP_BINARY_INT(veorq, detail::identity_return_type)
1199
+
1200
+ namespace detail
1201
+ {
1202
+ XSIMD_INLINE float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1203
+ {
1204
+ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
1205
+ vreinterpretq_u32_f32(rhs)));
1206
+ }
1207
+
1208
+ template <class V>
1209
+ XSIMD_INLINE V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
1210
+ {
1211
+ const neon_dispatcher::binary dispatcher = {
1212
+ std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
1213
+ wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
1214
+ bitwise_xor_f32)
1215
+ };
1216
+ return dispatcher.apply(lhs, rhs);
1217
+ }
1218
+ }
1219
+
1220
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1221
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1222
+ {
1223
+ using register_type = typename batch<T, A>::register_type;
1224
+ return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
1225
+ }
1226
+
1227
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1228
+ XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1229
+ {
1230
+ using register_type = typename batch_bool<T, A>::register_type;
1231
+ return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
1232
+ }
1233
+
1234
+ /*******
1235
+ * neq *
1236
+ *******/
1237
+
1238
+ template <class A, class T>
1239
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1240
+ {
1241
+ return bitwise_xor(lhs, rhs, A {});
1242
+ }
1243
+
1244
+ /***************
1245
+ * bitwise_not *
1246
+ ***************/
1247
+
1248
+ WRAP_UNARY_INT_EXCLUDING_64(vmvnq)
1249
+
1250
+ namespace detail
1251
+ {
1252
+ XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
1253
+ {
1254
+ return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
1255
+ }
1256
+
1257
+ template <class V>
1258
+ XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
1259
+ {
1260
+ const neon_dispatcher::unary dispatcher = {
1261
+ std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
1262
+ wrap::vmvnq_u32, wrap::vmvnq_s32,
1263
+ bitwise_not_u64, bitwise_not_s64,
1264
+ bitwise_not_f32)
1265
+ };
1266
+ return dispatcher.apply(arg);
1267
+ }
1268
+ }
1269
+
1270
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1271
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
1272
+ {
1273
+ using register_type = typename batch<T, A>::register_type;
1274
+ return detail::bitwise_not_neon(register_type(arg));
1275
+ }
1276
+
1277
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1278
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
1279
+ {
1280
+ using register_type = typename batch_bool<T, A>::register_type;
1281
+ return detail::bitwise_not_neon(register_type(arg));
1282
+ }
1283
+
1284
+ /******************
1285
+ * bitwise_andnot *
1286
+ ******************/
1287
+
1288
+ WRAP_BINARY_INT(vbicq, detail::identity_return_type)
1289
+
1290
+ namespace detail
1291
+ {
1292
+ XSIMD_INLINE float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1293
+ {
1294
+ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
1295
+ }
1296
+
1297
+ template <class V>
1298
+ XSIMD_INLINE V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
1299
+ {
1300
+ const detail::neon_dispatcher::binary dispatcher = {
1301
+ std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
1302
+ wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
1303
+ bitwise_andnot_f32)
1304
+ };
1305
+ return dispatcher.apply(lhs, rhs);
1306
+ }
1307
+ }
1308
+
1309
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1310
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1311
+ {
1312
+ using register_type = typename batch<T, A>::register_type;
1313
+ return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
1314
+ }
1315
+
1316
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1317
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1318
+ {
1319
+ using register_type = typename batch_bool<T, A>::register_type;
1320
+ return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
1321
+ }
1322
+
1323
+ /*******
1324
+ * min *
1325
+ *******/
1326
+
1327
+ WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type)
1328
+ WRAP_BINARY_FLOAT(vminq, detail::identity_return_type)
1329
+
1330
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1331
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1332
+ {
1333
+ using register_type = typename batch<T, A>::register_type;
1334
+ const detail::excluding_int64_dispatcher::binary dispatcher = {
1335
+ std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16,
1336
+ wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32)
1337
+ };
1338
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
1339
+ }
1340
+
1341
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1342
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1343
+ {
1344
+ return select(lhs > rhs, rhs, lhs);
1345
+ }
1346
+
1347
+ /*******
1348
+ * max *
1349
+ *******/
1350
+
1351
+ WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type)
1352
+ WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type)
1353
+
1354
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1355
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1356
+ {
1357
+ using register_type = typename batch<T, A>::register_type;
1358
+ const detail::excluding_int64_dispatcher::binary dispatcher = {
1359
+ std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16,
1360
+ wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32)
1361
+ };
1362
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
1363
+ }
1364
+
1365
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1366
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1367
+ {
1368
+ return select(lhs > rhs, lhs, rhs);
1369
+ }
1370
+
1371
+ /*******
1372
+ * abs *
1373
+ *******/
1374
+
1375
+ namespace wrap
1376
+ {
1377
+ XSIMD_INLINE int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
1378
+ XSIMD_INLINE int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
1379
+ XSIMD_INLINE int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
1380
+ }
1381
+ WRAP_UNARY_FLOAT(vabsq)
1382
+
1383
+ namespace detail
1384
+ {
1385
+ XSIMD_INLINE uint8x16_t abs_u8(uint8x16_t arg) noexcept
1386
+ {
1387
+ return arg;
1388
+ }
1389
+
1390
+ XSIMD_INLINE uint16x8_t abs_u16(uint16x8_t arg) noexcept
1391
+ {
1392
+ return arg;
1393
+ }
1394
+
1395
+ XSIMD_INLINE uint32x4_t abs_u32(uint32x4_t arg) noexcept
1396
+ {
1397
+ return arg;
1398
+ }
1399
+ }
1400
+
1401
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1402
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
1403
+ {
1404
+ using register_type = typename batch<T, A>::register_type;
1405
+ const detail::excluding_int64_dispatcher::unary dispatcher = {
1406
+ std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16,
1407
+ detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32)
1408
+ };
1409
+ return dispatcher.apply(register_type(arg));
1410
+ }
1411
+
1412
+ /********
1413
+ * rsqrt *
1414
+ ********/
1415
+
1416
+ template <class A>
1417
+ XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
1418
+ {
1419
+ return vrsqrteq_f32(arg);
1420
+ }
1421
+
1422
+ /********
1423
+ * sqrt *
1424
+ ********/
1425
+
1426
+ template <class A>
+ XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ {
+ batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg);
+ // one Newton-Raphson refinement of the reciprocal square root estimate
+ sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
+ batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
+ batch<float, A> zero(0.f);
+ return select(arg == zero, zero, sqrt_approx);
+ }
+
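A standalone sketch of the estimate-and-refine square root above (hypothetical name sqrt_f32): sqrt(x) is computed as x * (1/sqrt(x)), where vrsqrtsq_f32(a, b) returns (3 - a*b) / 2, i.e. the Newton-Raphson correction factor for a reciprocal square root estimate.

#include <arm_neon.h>

inline float32x4_t sqrt_f32(float32x4_t x)
{
    float32x4_t r = vrsqrteq_f32(x);                     // rough 1/sqrt(x)
    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(x, r), r));  // one refinement step
    return vmulq_f32(x, r);                              // x * 1/sqrt(x) ~= sqrt(x)
}

As in the kernel above, a zero input has to be special-cased (the select against zero), because the reciprocal estimate of 0 is infinity and 0 * inf is NaN.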
+ /********************
1438
+ * Fused operations *
1439
+ ********************/
1440
+
1441
+ #ifdef __ARM_FEATURE_FMA
1442
+ template <class A>
1443
+ XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
1444
+ {
1445
+ return vfmaq_f32(z, x, y);
1446
+ }
1447
+
1448
+ template <class A>
1449
+ XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
1450
+ {
1451
+ return vfmaq_f32(-z, x, y);
1452
+ }
1453
+ #endif
+
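One detail worth spelling out: fms(x, y, z) here computes x*y - z, and the mapping onto vfmaq_f32 works by negating the accumulator, since vfmaq_f32(acc, a, b) computes acc + a*b. A minimal equivalent sketch (requires __ARM_FEATURE_FMA, hypothetical name fms_f32):

#include <arm_neon.h>

inline float32x4_t fms_f32(float32x4_t x, float32x4_t y, float32x4_t z)
{
    return vfmaq_f32(vnegq_f32(z), x, y);   // (-z) + x*y == x*y - z
}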
+ /*********
1456
+ * haddp *
1457
+ *********/
1458
+
1459
+ template <class A>
+ XSIMD_INLINE batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept
+ {
+ // row = (a,b,c,d)
+ float32x2_t tmp1, tmp2, tmp3;
+ // tmp1 = (a0 + a2, a1 + a3)
+ tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0]));
+ // tmp2 = (b0 + b2, b1 + b3)
+ tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1]));
+ // tmp1 = (a0..3, b0..3)
+ tmp1 = vpadd_f32(tmp1, tmp2);
+ // tmp2 = (c0 + c2, c1 + c3)
+ tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2]));
+ // tmp3 = (d0 + d2, d1 + d3)
+ tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3]));
+ // tmp2 = (c0..3, d0..3)
+ tmp2 = vpadd_f32(tmp2, tmp3);
+ // return = (a0..3, b0..3, c0..3, d0..3)
+ return vcombine_f32(tmp1, tmp2);
+ }
+
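A scalar reference for haddp above (hypothetical name haddp_reference): lane i of the result is the full horizontal sum of row i; the NEON version reaches the same result purely with pairwise vpadd_f32 additions and one vcombine_f32.

#include <arm_neon.h>

inline float32x4_t haddp_reference(const float32x4_t* row)
{
    float out[4];
    for (int i = 0; i < 4; ++i)
    {
        float lanes[4];
        vst1q_f32(lanes, row[i]);                            // spill row i to scalars
        out[i] = lanes[0] + lanes[1] + lanes[2] + lanes[3];  // horizontal sum
    }
    return vld1q_f32(out);
}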
+ /**************
1481
+ * reciprocal *
1482
+ **************/
1483
+
1484
+ template <class A>
1485
+ XSIMD_INLINE batch<float, A>
1486
+ reciprocal(const batch<float, A>& x,
1487
+ kernel::requires_arch<neon>) noexcept
1488
+ {
1489
+ return vrecpeq_f32(x);
1490
+ }
1491
+
1492
+ /**********
1493
+ * insert *
1494
+ **********/
1495
+
1496
+ template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0>
1497
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1498
+ {
1499
+ return vsetq_lane_u8(val, self, I);
1500
+ }
1501
+
1502
+ template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0>
1503
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1504
+ {
1505
+ return vsetq_lane_s8(val, self, I);
1506
+ }
1507
+
1508
+ template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0>
1509
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1510
+ {
1511
+ return vsetq_lane_u16(val, self, I);
1512
+ }
1513
+
1514
+ template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0>
1515
+ XSIMD_INLINE batch<int16_t, A> insert(batch<int16_t, A> const& self, int16_t val, index<I>, requires_arch<neon>) noexcept
1516
+ {
1517
+ return vsetq_lane_s16(val, self, I);
1518
+ }
1519
+
1520
+ template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0>
1521
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1522
+ {
1523
+ return vsetq_lane_u32(val, self, I);
1524
+ }
1525
+
1526
+ template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0>
1527
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1528
+ {
1529
+ return vsetq_lane_s32(val, self, I);
1530
+ }
1531
+
1532
+ template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0>
1533
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1534
+ {
1535
+ return vsetq_lane_u64(val, self, I);
1536
+ }
1537
+
1538
+ template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0>
1539
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1540
+ {
1541
+ return vsetq_lane_s64(val, self, I);
1542
+ }
1543
+
1544
+ template <class A, size_t I>
1545
+ XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept
1546
+ {
1547
+ return vsetq_lane_f32(val, self, I);
1548
+ }
1549
+
1550
+ /********************
1551
+ * nearbyint_as_int *
1552
+ *******************/
1553
+
1554
+ template <class A>
1555
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
1556
+ requires_arch<neon>) noexcept
1557
+ {
1558
+ /* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */
1559
+ // Contributors to this work are:
1560
+ // John W. Ratcliff <jratcliffscarab@gmail.com>
1561
+ // Brandon Rowlett <browlett@nvidia.com>
1562
+ // Ken Fast <kfast@gdeb.com>
1563
+ // Eric van Beurden <evanbeurden@nvidia.com>
1564
+ // Alexander Potylitsin <apotylitsin@nvidia.com>
1565
+ // Hasindu Gamaarachchi <hasindu2008@gmail.com>
1566
+ // Jim Huang <jserv@biilabs.io>
1567
+ // Mark Cheng <marktwtn@biilabs.io>
1568
+ // Malcolm James MacLeod <malcolm@gulden.com>
1569
+ // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
1570
+ // Sebastian Pop <spop@amazon.com>
1571
+ // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
1572
+ // Danila Kutenin <danilak@google.com>
1573
+ // François Turban (JishinMaster) <francois.turban@gmail.com>
1574
+ // Pei-Hsuan Hung <afcidk@gmail.com>
1575
+ // Yang-Hao Yuan <yanghau@biilabs.io>
1576
+ // Syoyo Fujita <syoyo@lighttransport.com>
1577
+ // Brecht Van Lommel <brecht@blender.org>
1578
+
1579
+ /*
1580
+ * sse2neon is freely redistributable under the MIT License.
1581
+ *
1582
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
1583
+ * of this software and associated documentation files (the "Software"), to deal
1584
+ * in the Software without restriction, including without limitation the rights
1585
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1586
+ * copies of the Software, and to permit persons to whom the Software is
1587
+ * furnished to do so, subject to the following conditions:
1588
+ *
1589
+ * The above copyright notice and this permission notice shall be included in
1590
+ * all copies or substantial portions of the Software.
1591
+ *
1592
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1593
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1594
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1595
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1596
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1597
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1598
+ * SOFTWARE.
1599
+ */
1600
+
1601
+ const auto signmask = vdupq_n_u32(0x80000000);
1602
+ const auto half = vbslq_f32(signmask, self,
1603
+ vdupq_n_f32(0.5f)); /* +/- 0.5 */
1604
+ const auto r_normal = vcvtq_s32_f32(vaddq_f32(
1605
+ self, half)); /* round to integer: [a + 0.5]*/
1606
+ const auto r_trunc = vcvtq_s32_f32(self); /* truncate to integer: [a] */
1607
+ const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32(
1608
+ vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
1609
+ const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
1610
+ vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
1611
+ const auto delta = vsubq_f32(
1612
+ self,
1613
+ vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
1614
+ const auto is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
1615
+ return vbslq_s32(is_delta_half, r_even, r_normal);
1616
+ }
+
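A scalar reference for the rounding scheme above (hypothetical name, ignoring the overflow and NaN edge cases the vector code also ignores): round half away from zero normally, but on an exact .5 tie pick the even neighbour, which is what the bit manipulation with r_trunc, plusone and r_even achieves lane-wise.

#include <cmath>
#include <cstdint>

inline int32_t nearbyint_as_int_reference(float a)
{
    const float   half     = std::copysign(0.5f, a);
    const int32_t r_normal = static_cast<int32_t>(a + half);  // ordinary rounding
    const int32_t r_trunc  = static_cast<int32_t>(a);         // truncation toward zero
    const int32_t plusone  = r_trunc > 0 ? 1 : 0;              // mirrors ((-r_trunc) >> 31)
    const int32_t r_even   = (r_trunc + plusone) & ~1;         // even candidate for ties
    const float   delta    = a - static_cast<float>(r_trunc);
    return (delta == half) ? r_even : r_normal;                // ties go to even
}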
+ /**************
1619
+ * reduce_add *
1620
+ **************/
1621
+
1622
+ namespace detail
1623
+ {
1624
+ template <class T, class A, class V>
1625
+ XSIMD_INLINE T sum_batch(V const& arg) noexcept
1626
+ {
1627
+ T res = T(0);
1628
+ for (std::size_t i = 0; i < batch<T, A>::size; ++i)
1629
+ {
1630
+ res += arg[i];
1631
+ }
1632
+ return res;
1633
+ }
1634
+ }
1635
+
1636
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1637
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1638
+ {
1639
+ uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg));
1640
+ tmp = vpadd_u8(tmp, tmp);
1641
+ tmp = vpadd_u8(tmp, tmp);
1642
+ tmp = vpadd_u8(tmp, tmp);
1643
+ return vget_lane_u8(tmp, 0);
1644
+ }
1645
+
1646
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1647
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1648
+ {
1649
+ int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg));
1650
+ tmp = vpadd_s8(tmp, tmp);
1651
+ tmp = vpadd_s8(tmp, tmp);
1652
+ tmp = vpadd_s8(tmp, tmp);
1653
+ return vget_lane_s8(tmp, 0);
1654
+ }
1655
+
1656
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1657
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1658
+ {
1659
+ uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg));
1660
+ tmp = vpadd_u16(tmp, tmp);
1661
+ tmp = vpadd_u16(tmp, tmp);
1662
+ return vget_lane_u16(tmp, 0);
1663
+ }
1664
+
1665
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1666
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1667
+ {
1668
+ int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg));
1669
+ tmp = vpadd_s16(tmp, tmp);
1670
+ tmp = vpadd_s16(tmp, tmp);
1671
+ return vget_lane_s16(tmp, 0);
1672
+ }
1673
+
1674
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1675
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1676
+ {
1677
+ uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg));
1678
+ tmp = vpadd_u32(tmp, tmp);
1679
+ return vget_lane_u32(tmp, 0);
1680
+ }
1681
+
1682
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1683
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1684
+ {
1685
+ int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg));
1686
+ tmp = vpadd_s32(tmp, tmp);
1687
+ return vget_lane_s32(tmp, 0);
1688
+ }
1689
+
1690
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1691
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1692
+ {
1693
+ return arg.get(0) + arg.get(1);
1694
+ }
1695
+
1696
+ template <class A>
1697
+ XSIMD_INLINE float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept
1698
+ {
1699
+ float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg));
1700
+ tmp = vpadd_f32(tmp, tmp);
1701
+ return vget_lane_f32(tmp, 0);
1702
+ }
1703
+
1704
+ /**************
1705
+ * reduce_max *
1706
+ **************/
1707
+
1708
+ // Using common implementation because ARM does not provide intrinsics
1709
+ // for this operation
1710
+
1711
+ /**************
1712
+ * reduce_min *
1713
+ **************/
1714
+
1715
+ // Using common implementation because ARM does not provide intrinsics
1716
+ // for this operation
1717
+
1718
+ /**************
1719
+ * reduce_mul *
1720
+ **************/
1721
+
1722
+ // Using common implementation because ARM does not provide intrinsics
1723
+ // for this operation
1724
+
1725
+ /**********
1726
+ * select *
1727
+ **********/
1728
+
1729
+ namespace wrap
1730
+ {
1731
+ XSIMD_INLINE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
1732
+ XSIMD_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
1733
+ XSIMD_INLINE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
1734
+ XSIMD_INLINE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
1735
+ XSIMD_INLINE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
1736
+ XSIMD_INLINE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
1737
+ XSIMD_INLINE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
1738
+ XSIMD_INLINE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
1739
+ XSIMD_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
1740
+ }
1741
+
1742
+ namespace detail
1743
+ {
1744
+ template <class... T>
1745
+ struct neon_select_dispatcher_impl
1746
+ {
1747
+ using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>;
1748
+ const container_type m_func;
1749
+
1750
+ template <class U>
1751
+ U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept
1752
+ {
1753
+ using func_type = U (*)(comp_return_type<U>, U, U);
1754
+ auto func = xsimd::detail::get<func_type>(m_func);
1755
+ return func(cond, lhs, rhs);
1756
+ }
1757
+ };
1758
+
1759
+ using neon_select_dispatcher = neon_select_dispatcher_impl<uint8x16_t, int8x16_t,
1760
+ uint16x8_t, int16x8_t,
1761
+ uint32x4_t, int32x4_t,
1762
+ uint64x2_t, int64x2_t,
1763
+ float32x4_t>;
1764
+ }
1765
+
1766
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
1767
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept
1768
+ {
1769
+ using bool_register_type = typename batch_bool<T, A>::register_type;
1770
+ using register_type = typename batch<T, A>::register_type;
1771
+ const detail::neon_select_dispatcher dispatcher = {
1772
+ std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16,
1773
+ wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64,
1774
+ wrap::vbslq_f32)
1775
+ };
1776
+ return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b));
1777
+ }
1778
+
1779
+ template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
1780
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
1781
+ {
1782
+ return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
1783
+ }
+
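The select machinery above bottoms out in NEON's bit-select instruction. A sketch of its semantics (hypothetical name select_f32): vbslq picks bits from the second operand where the mask bits are 1 and from the third where they are 0, and since NEON comparisons return all-ones/all-zeros lanes this acts as a lane-wise conditional.

#include <arm_neon.h>

inline float32x4_t select_f32(uint32x4_t mask, float32x4_t if_true, float32x4_t if_false)
{
    return vbslq_f32(mask, if_true, if_false);
}

// usage: select_f32(vcgtq_f32(x, y), x, y) is a lane-wise max(x, y)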
+ /*************
1786
+ * transpose *
1787
+ *************/
1788
+ template <class A>
1789
+ XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<neon>) noexcept
1790
+ {
1791
+ assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
1792
+ (void)matrix_end;
1793
+ auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1794
+ auto t01 = vtrnq_f32(r0, r1);
1795
+ auto t23 = vtrnq_f32(r2, r3);
1796
+ matrix_begin[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
1797
+ matrix_begin[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
1798
+ matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
1799
+ matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
1800
+ }
1801
+ template <class A>
1802
+ XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<neon>) noexcept
1803
+ {
1804
+ assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
1805
+ (void)matrix_end;
1806
+ auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1807
+ auto t01 = vtrnq_u32(r0, r1);
1808
+ auto t23 = vtrnq_u32(r2, r3);
1809
+ matrix_begin[0] = vcombine_u32(vget_low_u32(t01.val[0]), vget_low_u32(t23.val[0]));
1810
+ matrix_begin[1] = vcombine_u32(vget_low_u32(t01.val[1]), vget_low_u32(t23.val[1]));
1811
+ matrix_begin[2] = vcombine_u32(vget_high_u32(t01.val[0]), vget_high_u32(t23.val[0]));
1812
+ matrix_begin[3] = vcombine_u32(vget_high_u32(t01.val[1]), vget_high_u32(t23.val[1]));
1813
+ }
1814
+ template <class A>
1815
+ XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<neon>) noexcept
1816
+ {
1817
+ assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
1818
+ (void)matrix_end;
1819
+ auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1820
+ auto t01 = vtrnq_s32(r0, r1);
1821
+ auto t23 = vtrnq_s32(r2, r3);
1822
+ matrix_begin[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0]));
1823
+ matrix_begin[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1]));
1824
+ matrix_begin[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0]));
1825
+ matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
1826
+ }
1827
+
1828
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1829
+ XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
1830
+ {
1831
+ assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
1832
+ (void)matrix_end;
1833
+ auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1834
+ matrix_begin[0] = vcombine_u64(vget_low_u64(r0), vget_low_u64(r1));
1835
+ matrix_begin[1] = vcombine_u64(vget_high_u64(r0), vget_high_u64(r1));
1836
+ }
1837
+
1838
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1839
+ XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
1840
+ {
1841
+ assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
1842
+ (void)matrix_end;
1843
+ auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1844
+ matrix_begin[0] = vcombine_s64(vget_low_s64(r0), vget_low_s64(r1));
1845
+ matrix_begin[1] = vcombine_s64(vget_high_s64(r0), vget_high_s64(r1));
1846
+ }
+
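For orientation, the 4x4 float transpose above can be read as follows (hypothetical name transpose4x4_f32): vtrnq_f32 transposes the 2x2 blocks formed by adjacent rows, and the vget_low/vget_high/vcombine shuffles then move the 2x2 blocks themselves, so rows become columns.

#include <arm_neon.h>

inline void transpose4x4_f32(float32x4_t m[4])
{
    float32x4x2_t t01 = vtrnq_f32(m[0], m[1]);   // 2x2-transpose rows 0/1
    float32x4x2_t t23 = vtrnq_f32(m[2], m[3]);   // 2x2-transpose rows 2/3
    m[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));   // column 0
    m[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));   // column 1
    m[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0])); // column 2
    m[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1])); // column 3
}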
+ /**********
1849
+ * zip_lo *
1850
+ **********/
1851
+
1852
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1853
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1854
+ {
1855
+ uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs));
1856
+ return vcombine_u8(tmp.val[0], tmp.val[1]);
1857
+ }
1858
+
1859
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1860
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1861
+ {
1862
+ int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs));
1863
+ return vcombine_s8(tmp.val[0], tmp.val[1]);
1864
+ }
1865
+
1866
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1867
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1868
+ {
1869
+ uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs));
1870
+ return vcombine_u16(tmp.val[0], tmp.val[1]);
1871
+ }
1872
+
1873
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1874
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1875
+ {
1876
+ int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs));
1877
+ return vcombine_s16(tmp.val[0], tmp.val[1]);
1878
+ }
1879
+
1880
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1881
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1882
+ {
1883
+ uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs));
1884
+ return vcombine_u32(tmp.val[0], tmp.val[1]);
1885
+ }
1886
+
1887
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1888
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1889
+ {
1890
+ int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs));
1891
+ return vcombine_s32(tmp.val[0], tmp.val[1]);
1892
+ }
1893
+
1894
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1895
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1896
+ {
1897
+ return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs));
1898
+ }
1899
+
1900
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1901
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1902
+ {
1903
+ return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs));
1904
+ }
1905
+
1906
+ template <class A>
1907
+ XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
1908
+ {
1909
+ float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs));
1910
+ return vcombine_f32(tmp.val[0], tmp.val[1]);
1911
+ }
1912
+
1913
+ /**********
1914
+ * zip_hi *
1915
+ **********/
1916
+
1917
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1918
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1919
+ {
1920
+ uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs));
1921
+ return vcombine_u8(tmp.val[0], tmp.val[1]);
1922
+ }
1923
+
1924
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1925
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1926
+ {
1927
+ int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs));
1928
+ return vcombine_s8(tmp.val[0], tmp.val[1]);
1929
+ }
1930
+
1931
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1932
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1933
+ {
1934
+ uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs));
1935
+ return vcombine_u16(tmp.val[0], tmp.val[1]);
1936
+ }
1937
+
1938
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1939
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1940
+ {
1941
+ int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs));
1942
+ return vcombine_s16(tmp.val[0], tmp.val[1]);
1943
+ }
1944
+
1945
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1946
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1947
+ {
1948
+ uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs));
1949
+ return vcombine_u32(tmp.val[0], tmp.val[1]);
1950
+ }
1951
+
1952
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1953
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1954
+ {
1955
+ int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs));
1956
+ return vcombine_s32(tmp.val[0], tmp.val[1]);
1957
+ }
1958
+
1959
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1960
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1961
+ {
1962
+ return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs));
1963
+ }
1964
+
1965
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1966
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1967
+ {
1968
+ return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs));
1969
+ }
1970
+
1971
+ template <class A>
1972
+ XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
1973
+ {
1974
+ float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs));
1975
+ return vcombine_f32(tmp.val[0], tmp.val[1]);
1976
+ }
+
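A compact picture of what zip_lo/zip_hi above produce, with a sketch for the float case (hypothetical name zip_lo_f32): for a = (a0,a1,a2,a3) and b = (b0,b1,b2,b3), zip_lo yields (a0,b0,a1,b1) and zip_hi yields (a2,b2,a3,b3).

#include <arm_neon.h>

inline float32x4_t zip_lo_f32(float32x4_t a, float32x4_t b)
{
    float32x2x2_t t = vzip_f32(vget_low_f32(a), vget_low_f32(b)); // interleave the low halves
    return vcombine_f32(t.val[0], t.val[1]);
}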
+ /****************
1979
+ * extract_pair *
1980
+ ****************/
1981
+
1982
+ namespace detail
1983
+ {
1984
+ template <class A, class T>
1985
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
1986
+ {
1987
+ assert(false && "extract_pair out of bounds");
1988
+ return batch<T, A> {};
1989
+ }
1990
+
1991
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
1992
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1993
+ {
1994
+ if (n == I)
1995
+ {
1996
+ return vextq_u8(rhs, lhs, I);
1997
+ }
1998
+ else
1999
+ {
2000
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2001
+ }
2002
+ }
2003
+
2004
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0>
2005
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2006
+ {
2007
+ if (n == I)
2008
+ {
2009
+ return vextq_s8(rhs, lhs, I);
2010
+ }
2011
+ else
2012
+ {
2013
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2014
+ }
2015
+ }
2016
+
2017
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
2018
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2019
+ {
2020
+ if (n == I)
2021
+ {
2022
+ return vextq_u16(rhs, lhs, I);
2023
+ }
2024
+ else
2025
+ {
2026
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2027
+ }
2028
+ }
2029
+
2030
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 2> = 0>
2031
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2032
+ {
2033
+ if (n == I)
2034
+ {
2035
+ return vextq_s16(rhs, lhs, I);
2036
+ }
2037
+ else
2038
+ {
2039
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2040
+ }
2041
+ }
2042
+
2043
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2044
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2045
+ {
2046
+ if (n == I)
2047
+ {
2048
+ return vextq_u32(rhs, lhs, I);
2049
+ }
2050
+ else
2051
+ {
2052
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2053
+ }
2054
+ }
2055
+
2056
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0>
2057
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2058
+ {
2059
+ if (n == I)
2060
+ {
2061
+ return vextq_s32(rhs, lhs, I);
2062
+ }
2063
+ else
2064
+ {
2065
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2066
+ }
2067
+ }
2068
+
2069
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2070
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2071
+ {
2072
+ if (n == I)
2073
+ {
2074
+ return vextq_u64(rhs, lhs, I);
2075
+ }
2076
+ else
2077
+ {
2078
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2079
+ }
2080
+ }
2081
+
2082
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0>
2083
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2084
+ {
2085
+ if (n == I)
2086
+ {
2087
+ return vextq_s64(rhs, lhs, I);
2088
+ }
2089
+ else
2090
+ {
2091
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2092
+ }
2093
+ }
2094
+
2095
+ template <class A, size_t I, size_t... Is>
2096
+ XSIMD_INLINE batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2097
+ {
2098
+ if (n == I)
2099
+ {
2100
+ return vextq_f32(rhs, lhs, I);
2101
+ }
2102
+ else
2103
+ {
2104
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2105
+ }
2106
+ }
2107
+
2108
+ template <class A, class T, size_t... Is>
2109
+ XSIMD_INLINE batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
2110
+ {
2111
+ if (n == 0)
2112
+ {
2113
+ return rhs;
2114
+ }
2115
+ else
2116
+ {
2117
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2118
+ }
2119
+ }
2120
+ }
2121
+
2122
+ template <class A, class T>
2123
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
2124
+ {
2125
+ constexpr std::size_t size = batch<T, A>::size;
2126
+ assert(n < size && "index in bounds");
2127
+ return detail::extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
2128
+ }
+
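The recursion above exists because vextq_* requires a compile-time offset. A sketch of the underlying primitive (hypothetical name extract_pair_u8): vextq_u8(lo, hi, N) returns lo[N..15] followed by hi[0..N-1], i.e. the 16 consecutive bytes starting at offset N inside the concatenation hi:lo, so the index_sequence walk simply matches the runtime n against every legal immediate.

#include <arm_neon.h>

template <int N>
inline uint8x16_t extract_pair_u8(uint8x16_t lo, uint8x16_t hi)
{
    return vextq_u8(lo, hi, N);   // N must be a constant expression in [0, 15]
}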
+ /******************
2131
+ * bitwise_lshift *
2132
+ ******************/
2133
+
2134
+ namespace detail
2135
+ {
2136
+ template <class A, class T>
2137
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
2138
+ {
2139
+ assert(false && "bitwise_lshift out of bounds");
2140
+ return batch<T, A> {};
2141
+ }
2142
+
2143
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
2144
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2145
+ {
2146
+ if (n == I)
2147
+ {
2148
+ return vshlq_n_u8(lhs, I);
2149
+ }
2150
+ else
2151
+ {
2152
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2153
+ }
2154
+ }
2155
+
2156
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
2157
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2158
+ {
2159
+ if (n == I)
2160
+ {
2161
+ return vshlq_n_s8(lhs, I);
2162
+ }
2163
+ else
2164
+ {
2165
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2166
+ }
2167
+ }
2168
+
2169
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
2170
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2171
+ {
2172
+ if (n == I)
2173
+ {
2174
+ return vshlq_n_u16(lhs, I);
2175
+ }
2176
+ else
2177
+ {
2178
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2179
+ }
2180
+ }
2181
+
2182
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
2183
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2184
+ {
2185
+ if (n == I)
2186
+ {
2187
+ return vshlq_n_s16(lhs, I);
2188
+ }
2189
+ else
2190
+ {
2191
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2192
+ }
2193
+ }
2194
+
2195
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2196
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2197
+ {
2198
+ if (n == I)
2199
+ {
2200
+ return vshlq_n_u32(lhs, I);
2201
+ }
2202
+ else
2203
+ {
2204
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2205
+ }
2206
+ }
2207
+
2208
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
2209
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2210
+ {
2211
+ if (n == I)
2212
+ {
2213
+ return vshlq_n_s32(lhs, I);
2214
+ }
2215
+ else
2216
+ {
2217
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2218
+ }
2219
+ }
2220
+
2221
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2222
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2223
+ {
2224
+ if (n == I)
2225
+ {
2226
+ return vshlq_n_u64(lhs, I);
2227
+ }
2228
+ else
2229
+ {
2230
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2231
+ }
2232
+ }
2233
+
2234
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
2235
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2236
+ {
2237
+ if (n == I)
2238
+ {
2239
+ return vshlq_n_s64(lhs, I);
2240
+ }
2241
+ else
2242
+ {
2243
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2244
+ }
2245
+ }
2246
+
2247
+ template <class A, class T, int... Is>
2248
+ XSIMD_INLINE batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
2249
+ {
2250
+ if (n == 0)
2251
+ {
2252
+ return lhs;
2253
+ }
2254
+ else
2255
+ {
2256
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2257
+ }
2258
+ }
2259
+ }
2260
+
2261
+ template <class A, class T>
2262
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
2263
+ {
2264
+ constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
2265
+ assert(0 <= n && n < size && "index in bounds");
2266
+ return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
2267
+ }
+
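The same runtime-to-immediate unrolling appears here because vshlq_n_* also wants a constant shift count. When a genuinely dynamic count is acceptable, NEON's register-shift form can be used directly, as the batch-by-batch overloads below do: it shifts left for positive lane values and right for negative ones. A sketch (hypothetical name lshift_u32):

#include <arm_neon.h>

inline uint32x4_t lshift_u32(uint32x4_t x, int n)
{
    return vshlq_u32(x, vdupq_n_s32(n));   // runtime left shift, 0 <= n < 32
}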
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2270
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2271
+ {
2272
+ return vshlq_u8(lhs, rhs);
2273
+ }
2274
+
2275
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
2276
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2277
+ {
2278
+ return vshlq_s8(lhs, rhs);
2279
+ }
2280
+
2281
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2282
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2283
+ {
2284
+ return vshlq_u16(lhs, rhs);
2285
+ }
2286
+
2287
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
2288
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2289
+ {
2290
+ return vshlq_s16(lhs, rhs);
2291
+ }
2292
+
2293
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2294
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2295
+ {
2296
+ return vshlq_u32(lhs, rhs);
2297
+ }
2298
+
2299
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
2300
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2301
+ {
2302
+ return vshlq_s32(lhs, rhs);
2303
+ }
2304
+
2305
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
2306
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2307
+ {
2308
+ return vshlq_u64(lhs, rhs);
2309
+ }
2310
+
2311
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
2312
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2313
+ {
2314
+ return vshlq_s64(lhs, rhs);
2315
+ }
2316
+
2317
+ // immediate variant
2318
+ template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2319
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
2320
+ {
2321
+ return vshlq_n_u8(x, shift);
2322
+ }
2323
+
2324
+ template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
2325
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
2326
+ {
2327
+ return vshlq_n_s8(x, shift);
2328
+ }
2329
+
2330
+ template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2331
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
2332
+ {
2333
+ return vshlq_n_u16(x, shift);
2334
+ }
2335
+
2336
+ template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
2337
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
2338
+ {
2339
+ return vshlq_n_s16(x, shift);
2340
+ }
2341
+
2342
+ template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2343
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
2344
+ {
2345
+ return vshlq_n_u32(x, shift);
2346
+ }
2347
+
2348
+ template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
2349
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
2350
+ {
2351
+ return vshlq_n_s32(x, shift);
2352
+ }
2353
+
2354
+ template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
2355
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
2356
+ {
2357
+ return vshlq_n_u64(x, shift);
2358
+ }
2359
+
2360
+ template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
2361
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
2362
+ {
2363
+ return vshlq_n_s64(x, shift);
2364
+ }
2365
+
2366
+ /******************
+ * bitwise_rshift *
+ ******************/
+
+ namespace detail
+ {
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
+ {
+ assert(false && "bitwise_rshift out of bounds");
+ return batch<T, A> {};
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_u8(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_s8(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_u16(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_s16(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_u32(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_s32(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_u64(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_s64(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int... Is>
+ XSIMD_INLINE batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
+ {
+ if (n == 0)
+ {
+ return lhs;
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+ }
+
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
+ {
+ constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
+ assert(0 <= n && n < size && "index in bounds");
+ return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u8(lhs, vnegq_s8(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s8(lhs, vnegq_s8(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u16(lhs, vnegq_s16(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s16(lhs, vnegq_s16(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u32(lhs, vnegq_s32(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s32(lhs, vnegq_s32(rhs));
+ }
+
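The per-lane bitwise_rshift overloads above reuse the vshlq_* intrinsics with negated counts, since NEON's vector shift shifts right when the count is negative. A standalone sketch of that trick, illustrative only and assuming <arm_neon.h>:

    #include <arm_neon.h>

    // Sketch: right shift expressed as a left shift by negated per-lane counts.
    uint32x4_t rshift_by_counts(uint32x4_t v, int32x4_t counts)
    {
        // vshlq_u32 shifts left for positive counts and right for negative ones,
        // so negating the counts yields a logical right shift.
        return vshlq_u32(v, vnegq_s32(counts));
    }
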
+ // immediate variant
+ template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return vshrq_n_u8(x, shift);
+ }
+
+ template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return vshrq_n_s8(x, shift);
+ }
+
+ template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return vshrq_n_u16(x, shift);
+ }
+
+ template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return vshrq_n_s16(x, shift);
+ }
+
+ template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return vshrq_n_u32(x, shift);
+ }
+
+ template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return vshrq_n_s32(x, shift);
+ }
+
+ template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return vshrq_n_u64(x, shift);
+ }
+
+ template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return vshrq_n_s64(x, shift);
+ }
+
+ // first
+ template <class A>
+ XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<neon>) noexcept
+ {
+ return vgetq_lane_f32(self, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+ {
+ return vgetq_lane_u8(val, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+ {
+ return vgetq_lane_s8(val, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+ {
+ return vgetq_lane_u16(val, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+ {
+ return vgetq_lane_s16(val, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+ {
+ return vgetq_lane_u32(val, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+ {
+ return vgetq_lane_s32(val, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+ {
+ return vgetq_lane_u64(val, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+ {
+ return vgetq_lane_s64(val, 0);
+ }
+
+ // Overloads of bitwise shifts accepting two batches of uint64/int64 are not available with ARMv7
+
+ /*******
+ * all *
+ *******/
+
+ template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg));
+ return vget_lane_u64(tmp, 0) == ~0ULL;
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
+ }
+
+ /*******
+ * any *
+ *******/
+
+ template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ uint32x2_t tmp = vqmovn_u64(arg);
+ return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0;
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
+ }
+
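Both reductions above fold the lane mask through its 64-bit view: all() ANDs the low and high halves and compares against all-ones, while any() narrows with vqmovn_u64 and checks for a non-zero result. A self-contained sketch of the all() pattern on a uint32 mask, illustrative only and assuming <arm_neon.h>:

    #include <arm_neon.h>

    // Sketch: reduce a per-lane comparison mask (all-ones or all-zeros per lane) to a single bool.
    bool all_lanes_true(uint32x4_t mask)
    {
        uint64x2_t m64 = vreinterpretq_u64_u32(mask);     // view the mask as two 64-bit lanes
        uint64x1_t folded = vand_u64(vget_low_u64(m64),
                                     vget_high_u64(m64)); // AND the two halves together
        return vget_lane_u64(folded, 0) == ~0ULL;         // true only if every bit survived
    }
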
+ /****************
+ * bitwise_cast *
+ ****************/
+
+ #define WRAP_CAST(SUFFIX, TYPE) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u8(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s8(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u16(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s16(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u32(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s32(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u64(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s64(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_f32(a); \
+ } \
+ }
+
+ WRAP_CAST(u8, uint8x16_t)
+ WRAP_CAST(s8, int8x16_t)
+ WRAP_CAST(u16, uint16x8_t)
+ WRAP_CAST(s16, int16x8_t)
+ WRAP_CAST(u32, uint32x4_t)
+ WRAP_CAST(s32, int32x4_t)
+ WRAP_CAST(u64, uint64x2_t)
+ WRAP_CAST(s64, int64x2_t)
+ WRAP_CAST(f32, float32x4_t)
+
+ #undef WRAP_CAST
+
+ namespace detail
+ {
+ template <class R, class... T>
+ struct bitwise_caster_impl
+ {
+ using container_type = std::tuple<R (*)(T)...>;
+ container_type m_func;
+
+ template <class U>
+ R apply(U rhs) const noexcept
+ {
+ using func_type = R (*)(U);
+ auto func = xsimd::detail::get<func_type>(m_func);
+ return func(rhs);
+ }
+ };
+
+ template <class R, class... T>
+ XSIMD_INLINE const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept
+ {
+ return { std::make_tuple(arg...) };
+ }
+
+ template <class... T>
+ struct type_list
+ {
+ };
+
+ template <class RTL, class TTL>
+ struct bitwise_caster;
+
+ template <class... R, class... T>
+ struct bitwise_caster<type_list<R...>, type_list<T...>>
+ {
+ using container_type = std::tuple<bitwise_caster_impl<R, T...>...>;
+ container_type m_caster;
+
+ template <class V, class U>
+ V apply(U rhs) const noexcept
+ {
+ using caster_type = bitwise_caster_impl<V, T...>;
+ auto caster = xsimd::detail::get<caster_type>(m_caster);
+ return caster.apply(rhs);
+ }
+ };
+
+ template <class... T>
+ using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>;
+
+ using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t>;
+ }
+
+ template <class A, class T, class R>
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept
+ {
+ const detail::neon_bitwise_caster caster = {
+ std::make_tuple(
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16,
+ wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64,
+ wrap::vreinterpretq_u8_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16,
+ wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64,
+ wrap::vreinterpretq_s8_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16,
+ wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64,
+ wrap::vreinterpretq_u16_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16,
+ wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64,
+ wrap::vreinterpretq_s16_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16,
+ wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64,
+ wrap::vreinterpretq_u32_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16,
+ wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64,
+ wrap::vreinterpretq_s32_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16,
+ wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64,
+ wrap::vreinterpretq_u64_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16,
+ wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64,
+ wrap::vreinterpretq_s64_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16,
+ wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64,
+ wrap::vreinterpretq_f32_f32))
+ };
+ using src_register_type = typename batch<T, A>::register_type;
+ using dst_register_type = typename batch<R, A>::register_type;
+ return caster.apply<dst_register_type>(src_register_type(arg));
+ }
+
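The bitwise_cast kernel above only selects the matching vreinterpretq_* wrapper at compile time; no value conversion is performed, the bits are simply relabelled. A small sketch of what that means for a float register, illustrative only and assuming <arm_neon.h>:

    #include <arm_neon.h>

    // Sketch: a bitwise cast keeps the bit pattern, it does not convert values.
    uint32x4_t float_bits(float32x4_t v)
    {
        // For v = 1.0f in every lane, each resulting lane holds 0x3F800000,
        // the IEEE-754 encoding of 1.0f, not the integer 1.
        return vreinterpretq_u32_f32(v);
    }
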
+ /*********
+ * isnan *
+ *********/
+
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return !(arg == arg);
+ }
+
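The isnan kernel relies on the IEEE-754 rule that NaN compares unequal to everything, including itself, so !(arg == arg) is true exactly in NaN lanes. The same idea with raw intrinsics, illustrative only and assuming <arm_neon.h>:

    #include <arm_neon.h>

    // Sketch: NaN lanes are the only lanes where v == v fails.
    uint32x4_t nan_mask(float32x4_t v)
    {
        uint32x4_t eq = vceqq_f32(v, v);  // all-ones where the lane is not NaN
        return vmvnq_u32(eq);             // invert: all-ones exactly in NaN lanes
    }
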
+ // slide_left
+ namespace detail
+ {
+ template <size_t N>
+ struct slider_left
+ {
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ const auto left = vdupq_n_u8(0);
+ const auto right = bitwise_cast<uint8_t>(x).data;
+ const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N));
+ return bitwise_cast<T>(res);
+ }
+ };
+
+ template <>
+ struct slider_left<0>
+ {
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return x;
+ }
+ };
+ } // namespace detail
+
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return detail::slider_left<N> {}(x, A {});
+ }
+
+ // slide_right
+ namespace detail
+ {
+ template <size_t N>
+ struct slider_right
+ {
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ const auto left = bitwise_cast<uint8_t>(x).data;
+ const auto right = vdupq_n_u8(0);
+ const batch<uint8_t, A> res(vextq_u8(left, right, N));
+ return bitwise_cast<T>(res);
+ }
+ };
+
+ template <>
+ struct slider_right<16>
+ {
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept
+ {
+ return batch<T, A> {};
+ }
+ };
+ } // namespace detail
+
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return detail::slider_right<N> {}(x, A {});
+ }
+
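Both sliders above are built on vextq_u8, which extracts 16 consecutive bytes from the concatenation of two registers; pairing the input with a zero register moves the whole batch by N bytes and fills the vacated bytes with zeros. A byte-level sketch, illustrative only and assuming <arm_neon.h>:

    #include <arm_neon.h>

    // Sketch: slide a 128-bit register by 4 bytes using vextq_u8 and a zero register.
    uint8x16_t slide_left_4_bytes(uint8x16_t x)
    {
        // Takes the top 4 bytes of the zero register followed by the low 12 bytes of x:
        // x is moved up by 4 byte positions, with zeros shifted in at the bottom.
        return vextq_u8(vdupq_n_u8(0), x, 16 - 4);
    }

    uint8x16_t slide_right_4_bytes(uint8x16_t x)
    {
        // Drops the low 4 bytes of x and appends 4 zero bytes at the top.
        return vextq_u8(x, vdupq_n_u8(0), 4);
    }
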
+ /****************
+ * rotate_left *
+ ****************/
+ namespace wrap
+ {
+ template <size_t N>
+ XSIMD_INLINE uint8x16_t rotate_left_u8(uint8x16_t a, uint8x16_t b) noexcept { return vextq_u8(a, b, N); }
+ template <size_t N>
+ XSIMD_INLINE int8x16_t rotate_left_s8(int8x16_t a, int8x16_t b) noexcept { return vextq_s8(a, b, N); }
+ template <size_t N>
+ XSIMD_INLINE uint16x8_t rotate_left_u16(uint16x8_t a, uint16x8_t b) noexcept { return vextq_u16(a, b, N); }
+ template <size_t N>
+ XSIMD_INLINE int16x8_t rotate_left_s16(int16x8_t a, int16x8_t b) noexcept { return vextq_s16(a, b, N); }
+ template <size_t N>
+ XSIMD_INLINE uint32x4_t rotate_left_u32(uint32x4_t a, uint32x4_t b) noexcept { return vextq_u32(a, b, N); }
+ template <size_t N>
+ XSIMD_INLINE int32x4_t rotate_left_s32(int32x4_t a, int32x4_t b) noexcept { return vextq_s32(a, b, N); }
+ template <size_t N>
+ XSIMD_INLINE uint64x2_t rotate_left_u64(uint64x2_t a, uint64x2_t b) noexcept { return vextq_u64(a, b, N); }
+ template <size_t N>
+ XSIMD_INLINE int64x2_t rotate_left_s64(int64x2_t a, int64x2_t b) noexcept { return vextq_s64(a, b, N); }
+ template <size_t N>
+ XSIMD_INLINE float32x4_t rotate_left_f32(float32x4_t a, float32x4_t b) noexcept { return vextq_f32(a, b, N); }
+ }
+
+ template <size_t N, class A, class T, detail::enable_neon_type_t<T> = 0>
+ XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& a, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ // Adding modulo to avoid warning.
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::rotate_left_u8<N>, wrap::rotate_left_s8<N>, wrap::rotate_left_u16<N % 8>, wrap::rotate_left_s16<N % 8>,
+ wrap::rotate_left_u32<N % 4>, wrap::rotate_left_s32<N % 4>, wrap::rotate_left_u64<N % 2>, wrap::rotate_left_s64<N % 2>,
+ wrap::rotate_left_f32<N % 4>)
+ };
+ return dispatcher.apply(register_type(a), register_type(a));
+ }
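
rotate_left feeds the same register to both operands of vextq_*, so the lanes that fall off one end re-enter at the other: a rotation rather than a shift. A minimal sketch, illustrative only and assuming <arm_neon.h>:

    #include <arm_neon.h>

    // Sketch: passing the same register twice to vext turns the extract into a rotation.
    uint32x4_t rotate_lanes_by_one(uint32x4_t a)
    {
        // Result lanes are {a1, a2, a3, a0}: lane 0 wraps around instead of being discarded.
        return vextq_u32(a, a, 1);
    }
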
+ }
+
+ template <typename T, class A, T... Values>
+ struct batch_constant;
+
+ namespace kernel
+ {
+ /***********
+ * swizzle *
+ ***********/
+
+ template <class A, class T, class I, I... idx>
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
+ batch_constant<I, A, idx...>,
+ requires_arch<neon>) noexcept
+ {
+ static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices");
+ std::array<T, batch<T, A>::size> data;
+ self.store_aligned(data.data());
+ return set(batch<T, A>(), A(), data[idx]...);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
+ batch_constant<uint64_t, A, V0, V1>,
+ requires_arch<neon>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 0)
+ {
+ auto lo = vget_low_u64(self);
+ return vcombine_u64(lo, lo);
+ }
+ XSIMD_IF_CONSTEXPR(V0 == 1 && V1 == 1)
+ {
+ auto hi = vget_high_u64(self);
+ return vcombine_u64(hi, hi);
+ }
+ XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 1)
+ {
+ return self;
+ }
+ else
+ {
+ return vextq_u64(self, self, 1);
+ }
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
+ batch_constant<int64_t, A, V0, V1> mask,
+ requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_s64_u64(swizzle(vreinterpretq_u64_s64(self), mask, A {}));
+ }
+
+ namespace detail
+ {
+ template <uint32_t Va, uint32_t Vb>
+ XSIMD_INLINE uint8x8_t make_mask()
+ {
+ uint8x8_t res = {
+ static_cast<uint8_t>((Va % 2) * 4 + 0),
+ static_cast<uint8_t>((Va % 2) * 4 + 1),
+ static_cast<uint8_t>((Va % 2) * 4 + 2),
+ static_cast<uint8_t>((Va % 2) * 4 + 3),
+ static_cast<uint8_t>((Vb % 2) * 4 + 0),
+ static_cast<uint8_t>((Vb % 2) * 4 + 1),
+ static_cast<uint8_t>((Vb % 2) * 4 + 2),
+ static_cast<uint8_t>((Vb % 2) * 4 + 3),
+ };
+ return res;
+ }
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3> mask,
+ requires_arch<neon>) noexcept
+ {
+ constexpr bool is_identity = detail::is_identity(mask);
+ constexpr bool is_dup_lo = detail::is_dup_lo(mask);
+ constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+
+ XSIMD_IF_CONSTEXPR(is_identity)
+ {
+ return self;
+ }
+ XSIMD_IF_CONSTEXPR(is_dup_lo)
+ {
+ XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 1)
+ {
+ return vreinterpretq_u32_u64(vdupq_lane_u64(vget_low_u64(vreinterpretq_u64_u32(self)), 0));
+ }
+ XSIMD_IF_CONSTEXPR(V0 == 1 && V1 == 0)
+ {
+ return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_low_u32(self))), 0));
+ }
+ return vdupq_n_u32(vgetq_lane_u32(self, V0));
+ }
+ XSIMD_IF_CONSTEXPR(is_dup_hi)
+ {
+ XSIMD_IF_CONSTEXPR(V0 == 2 && V1 == 3)
+ {
+ return vreinterpretq_u32_u64(vdupq_lane_u64(vget_high_u64(vreinterpretq_u64_u32(self)), 0));
+ }
+ XSIMD_IF_CONSTEXPR(V0 == 3 && V1 == 2)
+ {
+ return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_high_u32(self))), 0));
+ }
+ return vdupq_n_u32(vgetq_lane_u32(self, V0));
+ }
+ XSIMD_IF_CONSTEXPR(V0 < 2 && V1 < 2 && V2 < 2 && V3 < 2)
+ {
+ uint8x8_t low = vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u32(self)));
+ uint8x8_t mask_lo = detail::make_mask<V0, V1>();
+ uint8x8_t mask_hi = detail::make_mask<V2, V3>();
+ uint8x8_t lo = vtbl1_u8(low, mask_lo);
+ uint8x8_t hi = vtbl1_u8(low, mask_hi);
+ return vreinterpretq_u32_u8(vcombine_u8(lo, hi));
+ }
+ XSIMD_IF_CONSTEXPR(V0 >= 2 && V1 >= 2 && V2 >= 2 && V3 >= 2)
+ {
+ uint8x8_t high = vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u32(self)));
+ uint8x8_t mask_lo = detail::make_mask<V0, V1>();
+ uint8x8_t mask_hi = detail::make_mask<V2, V3>();
+ uint8x8_t lo = vtbl1_u8(high, mask_lo);
+ uint8x8_t hi = vtbl1_u8(high, mask_hi);
+ return vreinterpretq_u32_u8(vcombine_u8(lo, hi));
+ }
+
+ uint8x8_t mask_lo = detail::make_mask<V0, V1>();
+ uint8x8_t mask_hi = detail::make_mask<V2, V3>();
+
+ uint8x8_t low = vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u32(self)));
+ uint8x8_t lol = vtbl1_u8(low, mask_lo);
+ uint8x8_t loh = vtbl1_u8(low, mask_hi);
+ uint32x4_t true_br = vreinterpretq_u32_u8(vcombine_u8(lol, loh));
+
+ uint8x8_t high = vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u32(self)));
+ uint8x8_t hil = vtbl1_u8(high, mask_lo);
+ uint8x8_t hih = vtbl1_u8(high, mask_hi);
+ uint32x4_t false_br = vreinterpretq_u32_u8(vcombine_u8(hil, hih));
+
+ batch_bool_constant<uint32_t, A, (V0 < 2), (V1 < 2), (V2 < 2), (V3 < 2)> blend_mask;
+ return select(blend_mask, batch<uint32_t, A>(true_br), batch<uint32_t, A>(false_br), A {});
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
+ batch_constant<int32_t, A, V0, V1, V2, V3> mask,
+ requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_s32_u32(swizzle(vreinterpretq_u32_s32(self), mask, A {}));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3> mask,
+ requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_f32_u32(swizzle(batch<uint32_t, A>(vreinterpretq_u32_f32(self)), mask, A {}));
+ }
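
All the swizzle overloads above implement the same contract: lane i of the result is lane idx_i of the input, with the index pattern known at compile time so the uint64/uint32 specializations can pick cheap vdup/vext/vtbl sequences. A scalar reference sketch of that contract, not part of the package, useful for checking any of the specializations against:

    #include <array>
    #include <cstddef>

    // Sketch: the scalar behaviour every swizzle specialization must reproduce.
    // out[i] = in[idx[i]] for each lane i, with idx fixed at compile time in the SIMD kernels.
    template <std::size_t N>
    std::array<float, N> swizzle_reference(const std::array<float, N>& in,
                                           const std::array<std::size_t, N>& idx)
    {
        std::array<float, N> out {};
        for (std::size_t i = 0; i < N; ++i)
            out[i] = in[idx[i]];
        return out;
    }
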
+ }
+
+ }
+
+ #undef WRAP_BINARY_INT_EXCLUDING_64
+ #undef WRAP_BINARY_INT
+ #undef WRAP_BINARY_FLOAT
+ #undef WRAP_UNARY_INT_EXCLUDING_64
+ #undef WRAP_UNARY_INT
+ #undef WRAP_UNARY_FLOAT
+
+ #endif