sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
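The listing above is simply the wheel's zip directory. As a quick local cross-check, the minimal sketch below (plain Python, standard library only) enumerates the same archived paths from a downloaded copy of the wheel; the local file path is an assumption for illustration, not something shipped with sequenzo.

import zipfile

# Hypothetical local path to the downloaded wheel; a wheel is an ordinary zip archive.
WHEEL_PATH = "sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl"

with zipfile.ZipFile(WHEEL_PATH) as wheel:
    for info in wheel.infolist():
        # filename and file_size are read from the archive's central directory.
        print(f"{info.filename}  ({info.file_size} bytes)")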
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp
@@ -0,0 +1,2650 @@
1
+ /***************************************************************************
2
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3
+ * Martin Renou *
4
+ * Copyright (c) QuantStack *
5
+ * Copyright (c) Serge Guelton *
6
+ * *
7
+ * Distributed under the terms of the BSD 3-Clause License. *
8
+ * *
9
+ * The full license is in the file LICENSE, distributed with this software. *
10
+ ****************************************************************************/
11
+
12
+ #ifndef XSIMD_AVX512F_HPP
13
+ #define XSIMD_AVX512F_HPP
14
+
15
+ #include <complex>
16
+ #include <limits>
17
+ #include <type_traits>
18
+
19
+ #include "../types/xsimd_avx512f_register.hpp"
20
+
21
+ namespace xsimd
22
+ {
23
+
24
+ namespace kernel
25
+ {
26
+ using namespace types;
27
+
28
+ // fwd
29
+ template <class A, class T, class Mask>
30
+ XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, Mask const& mask, requires_arch<common>) noexcept;
31
+ template <class A, class T, class Mask>
32
+ XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, Mask const& mask, requires_arch<common>) noexcept;
33
+ template <class A, class T, size_t I>
34
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<common>) noexcept;
35
+ template <class A, class T, class ITy, ITy... Is>
36
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<ITy, A, Is...>, requires_arch<common>) noexcept;
37
+ template <class A>
38
+ XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<common>) noexcept;
39
+ template <class A>
40
+ XSIMD_INLINE void transpose(batch<uint8_t, A>* matrix_begin, batch<uint8_t, A>* matrix_end, requires_arch<common>) noexcept;
41
+
42
+ namespace detail
43
+ {
44
+ XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
45
+ {
46
+ low = _mm512_castps512_ps256(val);
47
+ high = _mm512_extractf32x8_ps(val, 1);
48
+ }
49
+ XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
50
+ {
51
+ low = _mm512_castpd512_pd256(val);
52
+ high = _mm512_extractf64x4_pd(val, 1);
53
+ }
54
+ XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
55
+ {
56
+ low = _mm512_castsi512_si256(val);
57
+ high = _mm512_extracti64x4_epi64(val, 1);
58
+ }
59
+ XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept
60
+ {
61
+ return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1);
62
+ }
63
+ XSIMD_INLINE __m512 merge_avx(__m256 low, __m256 high) noexcept
64
+ {
65
+ return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1));
66
+ }
67
+ XSIMD_INLINE __m512d merge_avx(__m256d low, __m256d high) noexcept
68
+ {
69
+ return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1);
70
+ }
71
+ template <class F>
72
+ __m512i fwd_to_avx(F f, __m512i self)
73
+ {
74
+ __m256i self_low, self_high;
75
+ split_avx512(self, self_low, self_high);
76
+ __m256i res_low = f(self_low);
77
+ __m256i res_high = f(self_high);
78
+ return merge_avx(res_low, res_high);
79
+ }
80
+ template <class F>
81
+ __m512i fwd_to_avx(F f, __m512i self, __m512i other)
82
+ {
83
+ __m256i self_low, self_high, other_low, other_high;
84
+ split_avx512(self, self_low, self_high);
85
+ split_avx512(other, other_low, other_high);
86
+ __m256i res_low = f(self_low, other_low);
87
+ __m256i res_high = f(self_high, other_high);
88
+ return merge_avx(res_low, res_high);
89
+ }
90
+ template <class F>
91
+ __m512i fwd_to_avx(F f, __m512i self, int32_t other)
92
+ {
93
+ __m256i self_low, self_high;
94
+ split_avx512(self, self_low, self_high);
95
+ __m256i res_low = f(self_low, other);
96
+ __m256i res_high = f(self_high, other);
97
+ return merge_avx(res_low, res_high);
98
+ }
99
+ }
100
+ namespace detail
101
+ {
102
+
103
+ XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept
104
+ {
105
+
106
+ static const unsigned short MortonTable256[256] = {
107
+ 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015,
108
+ 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
109
+ 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115,
110
+ 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
111
+ 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415,
112
+ 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
113
+ 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515,
114
+ 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
115
+ 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015,
116
+ 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
117
+ 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115,
118
+ 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
119
+ 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415,
120
+ 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
121
+ 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515,
122
+ 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
123
+ 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015,
124
+ 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
125
+ 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115,
126
+ 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
127
+ 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415,
128
+ 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
129
+ 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515,
130
+ 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
131
+ 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015,
132
+ 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
133
+ 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115,
134
+ 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
135
+ 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415,
136
+ 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
137
+ 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515,
138
+ 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555
139
+ };
140
+
141
+ uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF];
142
+ return z;
143
+ }
144
+
145
+ template <class A, class T, int Cmp>
146
+ XSIMD_INLINE batch_bool<T, A> compare_int_avx512f(batch<T, A> const& self, batch<T, A> const& other) noexcept
147
+ {
148
+ using register_type = typename batch_bool<T, A>::register_type;
149
+ if (std::is_signed<T>::value)
150
+ {
151
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
152
+ {
153
+ // shifting to take sign into account
154
+ uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x000000FF)) << 24,
155
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x000000FF)) << 24,
156
+ Cmp);
157
+ uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FF00)) << 16,
158
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FF00)) << 16,
159
+ Cmp);
160
+ uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x00FF0000)) << 8,
161
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x00FF0000)) << 8,
162
+ Cmp);
163
+ uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFF000000)),
164
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFF000000)),
165
+ Cmp);
166
+ uint64_t mask = 0;
167
+ for (unsigned i = 0; i < 16; ++i)
168
+ {
169
+ mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
170
+ mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
171
+ mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
172
+ mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
173
+ }
174
+ return (register_type)mask;
175
+ }
176
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
177
+ {
178
+ // shifting to take sign into account
179
+ uint16_t mask_low = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
180
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
181
+ Cmp);
182
+ uint16_t mask_high = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFFFF0000)),
183
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFFFF0000)),
184
+ Cmp);
185
+ return static_cast<register_type>(morton(mask_low, mask_high));
186
+ }
187
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
188
+ {
189
+ return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
190
+ }
191
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
192
+ {
193
+ return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
194
+ }
195
+ }
196
+ else
197
+ {
198
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
199
+ {
200
+ uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x000000FF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x000000FF)), Cmp);
201
+ uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FF00)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FF00)), Cmp);
202
+ uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x00FF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x00FF0000)), Cmp);
203
+ uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFF000000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFF000000)), Cmp);
204
+ uint64_t mask = 0;
205
+ for (unsigned i = 0; i < 16; ++i)
206
+ {
207
+ mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
208
+ mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
209
+ mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
210
+ mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
211
+ }
212
+ return (register_type)mask;
213
+ }
214
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
215
+ {
216
+ uint16_t mask_low = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FFFF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FFFF)), Cmp);
217
+ uint16_t mask_high = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFFFF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFFFF0000)), Cmp);
218
+ return static_cast<register_type>(morton(mask_low, mask_high));
219
+ }
220
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
221
+ {
222
+ return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
223
+ }
224
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
225
+ {
226
+ return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
227
+ }
228
+ }
229
+ }
230
+ }
231
+
232
+ // abs
233
+ template <class A>
234
+ XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<avx512f>) noexcept
235
+ {
236
+ __m512 self_asf = (__m512)self;
237
+ __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf);
238
+ __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi);
239
+ return *reinterpret_cast<__m512*>(&res_asi);
240
+ }
241
+ template <class A>
242
+ XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<avx512f>) noexcept
243
+ {
244
+ __m512d self_asd = (__m512d)self;
245
+ __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd);
246
+ __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
247
+ self_asi);
248
+ return *reinterpret_cast<__m512d*>(&res_asi);
249
+ }
250
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
251
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512f>) noexcept
252
+ {
253
+ if (std::is_unsigned<T>::value)
254
+ {
255
+ return self;
256
+ }
257
+
258
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
259
+ {
260
+ return detail::fwd_to_avx([](__m256i s) noexcept
261
+ { return abs(batch<T, avx2>(s)); },
262
+ self);
263
+ }
264
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
265
+ {
266
+ return detail::fwd_to_avx([](__m256i s) noexcept
267
+ { return abs(batch<T, avx2>(s)); },
268
+ self);
269
+ }
270
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
271
+ {
272
+ return _mm512_abs_epi32(self);
273
+ }
274
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
275
+ {
276
+ return _mm512_abs_epi64(self);
277
+ }
278
+ else
279
+ {
280
+ assert(false && "unsupported arch/op combination");
281
+ return {};
282
+ }
283
+ }
284
+
285
+ // add
286
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
287
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
288
+ {
289
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
290
+ {
291
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
292
+ { return add(batch<T, avx2>(s), batch<T, avx2>(o)); },
293
+ self, other);
294
+ }
295
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
296
+ {
297
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
298
+ { return add(batch<T, avx2>(s), batch<T, avx2>(o)); },
299
+ self, other);
300
+ }
301
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
302
+ {
303
+ return _mm512_add_epi32(self, other);
304
+ }
305
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
306
+ {
307
+ return _mm512_add_epi64(self, other);
308
+ }
309
+ else
310
+ {
311
+ assert(false && "unsupported arch/op combination");
312
+ return {};
313
+ }
314
+ }
315
+ template <class A>
316
+ XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
317
+ {
318
+ return _mm512_add_ps(self, other);
319
+ }
320
+ template <class A>
321
+ XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
322
+ {
323
+ return _mm512_add_pd(self, other);
324
+ }
325
+
326
+ // all
327
+ template <class A, class T>
328
+ XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
329
+ {
330
+ using register_type = typename batch_bool<T, A>::register_type;
331
+ return self.data == register_type(-1);
332
+ }
333
+
334
+ // any
335
+ template <class A, class T>
336
+ XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
337
+ {
338
+ using register_type = typename batch_bool<T, A>::register_type;
339
+ return self.data != register_type(0);
340
+ }
341
+
342
+ // batch_bool_cast
343
+ template <class A, class T_out, class T_in>
344
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512f>) noexcept
345
+ {
346
+ return self.data;
347
+ }
348
+
349
+ // bitwise_and
350
+ template <class A>
351
+ XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
352
+ {
353
+ #if defined(_MSC_VER)
354
+ return _mm512_and_ps(self, other);
355
+ #else
356
+ return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
357
+ #endif
358
+ }
359
+ template <class A>
360
+ XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
361
+ {
362
+ return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
363
+ }
364
+
365
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
366
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
367
+ {
368
+ return _mm512_and_si512(self, other);
369
+ }
370
+
371
+ template <class A, class T>
372
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
373
+ {
374
+ using register_type = typename batch_bool<T, A>::register_type;
375
+ return register_type(self.data & other.data);
376
+ }
377
+
378
+ // bitwise_andnot
379
+ template <class A>
380
+ XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
381
+ {
382
+ return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self)));
383
+ }
384
+ template <class A>
385
+ XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
386
+ {
387
+ return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self)));
388
+ }
389
+
390
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
391
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
392
+ {
393
+ return _mm512_andnot_si512(other, self);
394
+ }
395
+
396
+ template <class A, class T>
397
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
398
+ {
399
+ using register_type = typename batch_bool<T, A>::register_type;
400
+ return register_type(self.data & ~other.data);
401
+ }
402
+
403
+ // bitwise_lshift
404
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
405
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
406
+ {
407
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
408
+ {
409
+ #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
410
+ __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
411
+ #else
412
+ __m512i tmp = _mm512_slli_epi32(self, other);
413
+ #endif
414
+ return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp);
415
+ }
416
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
417
+ {
418
+ return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
419
+ { return bitwise_lshift(batch<T, avx2>(s), o, avx2 {}); },
420
+ self, other);
421
+ #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
422
+ }
423
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
424
+ {
425
+ return _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
426
+ }
427
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
428
+ {
429
+ return _mm512_sllv_epi64(self, _mm512_set1_epi64(other));
430
+ #else
431
+ }
432
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
433
+ {
434
+ return _mm512_slli_epi32(self, other);
435
+ }
436
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
437
+ {
438
+ return _mm512_slli_epi64(self, other);
439
+ #endif
440
+ }
441
+ else
442
+ {
443
+ assert(false && "unsupported arch/op combination");
444
+ return {};
445
+ }
446
+ }
447
+
448
+ // bitwise_not
449
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
450
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx512f>) noexcept
451
+ {
452
+ return _mm512_xor_si512(self, _mm512_set1_epi32(-1));
453
+ }
454
+ template <class A, class T>
455
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
456
+ {
457
+ using register_type = typename batch_bool<T, A>::register_type;
458
+ return register_type(~self.data);
459
+ }
460
+
461
+ template <class A>
462
+ XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
463
+ {
464
+ return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1)));
465
+ }
466
+ template <class A>
467
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
468
+ {
469
+ return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1)));
470
+ }
471
+
472
+ // bitwise_or
473
+ template <class A>
474
+ XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
475
+ {
476
+ return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
477
+ }
478
+ template <class A>
479
+ XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
480
+ {
481
+ return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
482
+ }
483
+
484
+ template <class A, class T>
485
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
486
+ {
487
+ using register_type = typename batch_bool<T, A>::register_type;
488
+ return register_type(self.data | other.data);
489
+ }
490
+
491
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
492
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
493
+ {
494
+ return _mm512_or_si512(self, other);
495
+ }
496
+
497
+ // bitwise_rshift
498
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
499
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
500
+ {
501
+ if (std::is_signed<T>::value)
502
+ {
503
+ #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
504
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
505
+ {
506
+ return _mm512_srav_epi32(self, _mm512_set1_epi32(other));
507
+ }
508
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
509
+ {
510
+ return _mm512_srav_epi64(self, _mm512_set1_epi64(other));
511
+ #else
512
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
513
+ {
514
+ return _mm512_srai_epi32(self, other);
515
+ }
516
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
517
+ {
518
+ return _mm512_srai_epi64(self, other);
519
+ #endif
520
+ }
521
+ else
522
+ {
523
+ return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
524
+ { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
525
+ self, other);
526
+ }
527
+ }
528
+ else
529
+ {
530
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
531
+ {
532
+ #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
533
+ __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
534
+ #else
535
+ __m512i tmp = _mm512_srli_epi32(self, other);
536
+ #endif
537
+ return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp);
538
+ #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
539
+ }
540
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
541
+ {
542
+ return _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
543
+ }
544
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
545
+ {
546
+ return _mm512_srlv_epi64(self, _mm512_set1_epi64(other));
547
+ #else
548
+ }
549
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
550
+ {
551
+ return _mm512_srli_epi32(self, other);
552
+ }
553
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
554
+ {
555
+ return _mm512_srli_epi64(self, other);
556
+ #endif
557
+ }
558
+ else
559
+ {
560
+ return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
561
+ { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
562
+ self, other);
563
+ }
564
+ }
565
+ }
566
+
567
+ // rotl
568
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
569
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
570
+ {
571
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
572
+ {
573
+ return _mm512_rolv_epi32(self, other);
574
+ }
575
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
576
+ {
577
+ return _mm512_rolv_epi64(self, other);
578
+ }
579
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
580
+ { return rotl(batch<T, avx2>(s), batch<T, avx2>(o), avx2 {}); },
581
+ self, other);
582
+ }
583
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
584
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
585
+ {
586
+ return rotl(self, batch<T, A>(other), A {});
587
+ }
588
+ template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
589
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, requires_arch<avx512f>) noexcept
590
+ {
591
+ constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
592
+ static_assert(count < bits, "Count must be less than the number of bits in T");
593
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
594
+ {
595
+ return _mm512_rol_epi32(self, count);
596
+ }
597
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
598
+ {
599
+ return _mm512_rol_epi64(self, count);
600
+ }
601
+
602
+ return detail::fwd_to_avx([](__m256i s) noexcept
603
+ { return rotl<count>(batch<T, avx2>(s), avx2 {}); },
604
+ self);
605
+ }
606
+
607
+ // rotr
608
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
609
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
610
+ {
611
+ XSIMD_IF_CONSTEXPR(sizeof(T) < 4)
612
+ {
613
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
614
+ { return rotr(batch<T, avx2>(s), batch<T, avx2>(o), avx2 {}); },
615
+ self, other);
616
+ }
617
+ XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
618
+ {
619
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
620
+ {
621
+ return _mm512_rorv_epi32(self, other);
622
+ }
623
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
624
+ {
625
+ return _mm512_rorv_epi64(self, other);
626
+ }
627
+ }
628
+ return rotr(self, other, common {});
629
+ }
630
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
631
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
632
+ {
633
+ return rotr(self, batch<T, A>(other), A {});
634
+ }
635
+
636
+ template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
637
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, requires_arch<avx512f>) noexcept
638
+ {
639
+ constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
640
+ static_assert(count < bits, "Count must be less than the number of bits in T");
641
+ XSIMD_IF_CONSTEXPR(sizeof(T) < 4)
642
+ {
643
+ return detail::fwd_to_avx([](__m256i s) noexcept
644
+ { return rotr<count>(batch<T, avx2>(s), avx2 {}); },
645
+ self);
646
+ }
647
+ XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
648
+ {
649
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
650
+ {
651
+ return _mm512_ror_epi32(self, count);
652
+ }
653
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
654
+ {
655
+ return _mm512_ror_epi64(self, count);
656
+ }
657
+ }
658
+ return rotr<count>(self, common {});
659
+ }
660
+
661
+ // bitwise_xor
662
+ template <class A>
663
+ XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
664
+ {
665
+ return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
666
+ }
667
+ template <class A>
668
+ XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
669
+ {
670
+ return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
671
+ }
672
+
673
+ template <class A, class T>
674
+ XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
675
+ {
676
+ using register_type = typename batch_bool<T, A>::register_type;
677
+ return register_type(self.data ^ other.data);
678
+ }
679
+
680
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
681
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
682
+ {
683
+ return _mm512_xor_si512(self, other);
684
+ }
685
+
686
+ // bitwise_cast
687
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
688
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
689
+ {
690
+ return _mm512_castsi512_ps(self);
691
+ }
692
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
693
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
694
+ {
695
+ return _mm512_castsi512_pd(self);
696
+ }
697
+ template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
698
+ XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx512f>) noexcept
699
+ {
700
+ return batch<Tp, A>(self.data);
701
+ }
702
+ template <class A>
703
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
704
+ {
705
+ return _mm512_castps_pd(self);
706
+ }
707
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
708
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
709
+ {
710
+ return _mm512_castps_si512(self);
711
+ }
712
+ template <class A>
713
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
714
+ {
715
+ return _mm512_castpd_ps(self);
716
+ }
717
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
718
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
719
+ {
720
+ return _mm512_castpd_si512(self);
721
+ }
722
+
723
+ // broadcast
724
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
725
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx512f>) noexcept
726
+ {
727
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
728
+ {
729
+ return _mm512_set1_epi8(val);
730
+ }
731
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
732
+ {
733
+ return _mm512_set1_epi16(val);
734
+ }
735
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
736
+ {
737
+ return _mm512_set1_epi32(val);
738
+ }
739
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
740
+ {
741
+ return _mm512_set1_epi64(val);
742
+ }
743
+ else
744
+ {
745
+ assert(false && "unsupported");
746
+ return {};
747
+ }
748
+ }
749
+ template <class A>
750
+ XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<avx512f>) noexcept
751
+ {
752
+ return _mm512_set1_ps(val);
753
+ }
754
+ template <class A>
755
+ batch<double, A> XSIMD_INLINE broadcast(double val, requires_arch<avx512f>) noexcept
756
+ {
757
+ return _mm512_set1_pd(val);
758
+ }
759
+
760
+ // ceil
761
+ template <class A>
762
+ XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx512f>) noexcept
763
+ {
764
+ return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF);
765
+ }
766
+ template <class A>
767
+ XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx512f>) noexcept
768
+ {
769
+ return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF);
770
+ }
771
+
772
+ // compress
773
+ template <class A>
774
+ XSIMD_INLINE batch<float, A> compress(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
775
+ {
776
+ return _mm512_maskz_compress_ps(mask.mask(), self);
777
+ }
778
+ template <class A>
779
+ XSIMD_INLINE batch<double, A> compress(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
780
+ {
781
+ return _mm512_maskz_compress_pd(mask.mask(), self);
782
+ }
783
+ template <class A>
784
+ XSIMD_INLINE batch<int32_t, A> compress(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
785
+ {
786
+ return _mm512_maskz_compress_epi32(mask.mask(), self);
787
+ }
788
+ template <class A>
789
+ XSIMD_INLINE batch<uint32_t, A> compress(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
790
+ {
791
+ return _mm512_maskz_compress_epi32(mask.mask(), self);
792
+ }
793
+ template <class A>
794
+ XSIMD_INLINE batch<int64_t, A> compress(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
795
+ {
796
+ return _mm512_maskz_compress_epi64(mask.mask(), self);
797
+ }
798
+ template <class A>
799
+ XSIMD_INLINE batch<uint64_t, A> compress(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
800
+ {
801
+ return _mm512_maskz_compress_epi64(mask.mask(), self);
802
+ }
803
+
804
+ // convert
805
+ namespace detail
806
+ {
807
+ template <class A>
808
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
809
+ {
810
+ return _mm512_cvtepi32_ps(self);
811
+ }
812
+
813
+ template <class A>
814
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx512f>) noexcept
815
+ {
816
+ return _mm512_cvttps_epi32(self);
817
+ }
818
+
819
+ template <class A>
820
+ XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
821
+ {
822
+ return _mm512_cvtepu32_ps(self);
823
+ }
824
+
825
+ template <class A>
826
+ XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx512f>) noexcept
827
+ {
828
+ return _mm512_cvttps_epu32(self);
829
+ }
830
+ }
831
+
832
+ namespace detail
833
+ {
834
+ // complex_low
835
+ template <class A>
836
+ XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
837
+ {
838
+ __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
839
+ return _mm512_permutex2var_ps(self.real(), idx, self.imag());
840
+ }
841
+ template <class A>
842
+ XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
843
+ {
844
+ __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11);
845
+ return _mm512_permutex2var_pd(self.real(), idx, self.imag());
846
+ }
847
+
848
+ // complex_high
849
+ template <class A>
850
+ XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
851
+ {
852
+ __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
853
+ return _mm512_permutex2var_ps(self.real(), idx, self.imag());
854
+ }
855
+ template <class A>
856
+ XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
857
+ {
858
+ __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15);
859
+ return _mm512_permutex2var_pd(self.real(), idx, self.imag());
860
+ }
861
+ }
862
+ // decr_if
863
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
864
+ XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx512f>) noexcept
865
+ {
866
+
867
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
868
+ {
869
+ return _mm512_mask_sub_epi32(self, mask.data, self, _mm512_set1_epi32(1));
870
+ }
871
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
872
+ {
873
+ return _mm512_mask_sub_epi64(self, mask.data, self, _mm512_set1_epi64(1));
874
+ }
875
+ else
876
+ {
877
+ return decr_if(self, mask, common {});
878
+ }
879
+ }
880
+
881
+ // div
882
+ template <class A>
883
+ XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
884
+ {
885
+ return _mm512_div_ps(self, other);
886
+ }
887
+ template <class A>
888
+ XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
889
+ {
890
+ return _mm512_div_pd(self, other);
891
+ }
892
+
893
+ // eq
894
+ template <class A>
895
+ XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
896
+ {
897
+ return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ);
898
+ }
899
+ template <class A>
900
+ XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
901
+ {
902
+ return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ);
903
+ }
904
+
905
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
906
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
907
+ {
908
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_EQ>(self, other);
909
+ }
910
+ template <class A, class T>
911
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
912
+ {
913
+ using register_type = typename batch_bool<T, A>::register_type;
914
+ return register_type(~self.data ^ other.data);
915
+ }
916
+
917
+ // expand
918
+ template <class A>
919
+ XSIMD_INLINE batch<float, A> expand(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
920
+ {
921
+ return _mm512_maskz_expand_ps(mask.mask(), self);
922
+ }
923
+ template <class A>
924
+ XSIMD_INLINE batch<double, A> expand(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
925
+ {
926
+ return _mm512_maskz_expand_pd(mask.mask(), self);
927
+ }
928
+ template <class A>
929
+ XSIMD_INLINE batch<int32_t, A> expand(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
930
+ {
931
+ return _mm512_maskz_expand_epi32(mask.mask(), self);
932
+ }
933
+ template <class A>
934
+ XSIMD_INLINE batch<uint32_t, A> expand(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
935
+ {
936
+ return _mm512_maskz_expand_epi32(mask.mask(), self);
937
+ }
938
+ template <class A>
939
+ XSIMD_INLINE batch<int64_t, A> expand(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
940
+ {
941
+ return _mm512_maskz_expand_epi64(mask.mask(), self);
942
+ }
943
+ template <class A>
944
+ XSIMD_INLINE batch<uint64_t, A> expand(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
945
+ {
946
+ return _mm512_maskz_expand_epi64(mask.mask(), self);
947
+ }
948
+
949
+ // floor
950
+ template <class A>
951
+ XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<avx512f>) noexcept
952
+ {
953
+ return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF);
954
+ }
955
+ template <class A>
956
+ XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<avx512f>) noexcept
957
+ {
958
+ return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF);
959
+ }
960
+
961
+ // fnma
962
+ template <class A>
963
+ XSIMD_INLINE batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
964
+ {
965
+ return _mm512_fnmadd_ps(x, y, z);
966
+ }
967
+
968
+ template <class A>
969
+ XSIMD_INLINE batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
970
+ {
971
+ return _mm512_fnmadd_pd(x, y, z);
972
+ }
973
+
974
+ // fma
975
+ template <class A>
976
+ XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
977
+ {
978
+ return _mm512_fmadd_ps(x, y, z);
979
+ }
980
+
981
+ template <class A>
982
+ XSIMD_INLINE batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
983
+ {
984
+ return _mm512_fmadd_pd(x, y, z);
985
+ }
986
+
987
+ // fms
988
+ template <class A>
989
+ XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
990
+ {
991
+ return _mm512_fmsub_ps(x, y, z);
992
+ }
993
+
994
+ template <class A>
995
+ XSIMD_INLINE batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
996
+ {
997
+ return _mm512_fmsub_pd(x, y, z);
998
+ }
999
+ // fmas
1000
+ template <class A>
1001
+ XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
1002
+ {
1003
+ return _mm512_fmaddsub_ps(x, y, z);
1004
+ }
1005
+
1006
+ template <class A>
1007
+ XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
1008
+ {
1009
+ return _mm512_fmaddsub_pd(x, y, z);
1010
+ }
1011
+
1012
+ // from bool
1013
+ template <class A, class T>
1014
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
1015
+ {
1016
+ return select(self, batch<T, A>(1), batch<T, A>(0));
1017
+ }
1018
+
1019
+ // from_mask
1020
+ template <class T, class A>
1021
+ XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512f>) noexcept
1022
+ {
1023
+ return static_cast<typename batch_bool<T, A>::register_type>(mask);
1024
+ }
1025
+
1026
+ // gather
1027
+ template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
1028
+ XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
1029
+ kernel::requires_arch<avx512f>) noexcept
1030
+ {
1031
+ return _mm512_i32gather_epi32(index, static_cast<const void*>(src), sizeof(T));
1032
+ }
1033
+
1034
+ template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
1035
+ XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
1036
+ kernel::requires_arch<avx512f>) noexcept
1037
+ {
1038
+ return _mm512_i64gather_epi64(index, static_cast<const void*>(src), sizeof(T));
1039
+ }
1040
+
1041
+ template <class A, class U, detail::enable_sized_integral_t<U, 4> = 0>
1042
+ XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
1043
+ batch<U, A> const& index,
1044
+ kernel::requires_arch<avx512f>) noexcept
1045
+ {
1046
+ return _mm512_i32gather_ps(index, src, sizeof(float));
1047
+ }
1048
+
1049
+ template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
1050
+ XSIMD_INLINE batch<double, A>
1051
+ gather(batch<double, A> const&, double const* src, batch<U, A> const& index,
1052
+ kernel::requires_arch<avx512f>) noexcept
1053
+ {
1054
+ return _mm512_i64gather_pd(index, src, sizeof(double));
1055
+ }
1056
+
1057
+ // gather: handmade conversions
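+ // The two overloads below gather 16 doubles through the low and high halves of the 32-bit index vector,
+ // narrow each group of eight doubles to float / int32, and merge the two 256-bit halves into one 512-bit result.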
1058
+ template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
1059
+ XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, double const* src,
1060
+ batch<V, A> const& index,
1061
+ requires_arch<avx512f>) noexcept
1062
+ {
1063
+ const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
1064
+ const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
1065
+ return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data));
1066
+ }
1067
+
1068
+ template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
1069
+ XSIMD_INLINE batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
1070
+ batch<V, A> const& index,
1071
+ requires_arch<avx512f>) noexcept
1072
+ {
1073
+ const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
1074
+ const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
1075
+ return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data));
1076
+ }
1077
+
1078
+ // ge
1079
+ template <class A>
1080
+ XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1081
+ {
1082
+ return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ);
1083
+ }
1084
+ template <class A>
1085
+ XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1086
+ {
1087
+ return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ);
1088
+ }
1089
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1090
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1091
+ {
1092
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_GE>(self, other);
1093
+ }
1094
+
1095
+ // gt
1096
+ template <class A>
1097
+ XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1098
+ {
1099
+ return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ);
1100
+ }
1101
+ template <class A>
1102
+ XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1103
+ {
1104
+ return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ);
1105
+ }
1106
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1107
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1108
+ {
1109
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_GT>(self, other);
1110
+ }
1111
+
1112
+ // haddp
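+ // haddp reduces a group of rows at once: element i of the result is the horizontal sum of all lanes of row[i]
+ // (16 float rows here, 8 double rows in the overload below).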
1113
+ template <class A>
1114
+ XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512f>) noexcept
1115
+ {
1116
+ // The following folds over the vector once:
1117
+ // tmp1 = [a0..8, b0..8]
1118
+ // tmp2 = [a8..f, b8..f]
1119
+ #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
1120
+ batch<float, avx512f> res##I; \
1121
+ { \
1122
+ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
1123
+ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
1124
+ res##I = _mm512_add_ps(tmp1, tmp2); \
1125
+ }
1126
+
1127
+ XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
1128
+ XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
1129
+ XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
1130
+ XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
1131
+ XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
1132
+ XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
1133
+ XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
1134
+ XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
1135
+
1136
+ #undef XSIMD_AVX512_HADDP_STEP1
1137
+
1138
+ // The following folds the rows and shuffles them so that hadd_ps produces the correct result
1139
+ // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
1140
+ // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
1141
+ // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
1142
+ #define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
1143
+ batch<float, avx2> halfx##I; \
1144
+ { \
1145
+ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
1146
+ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
1147
+ \
1148
+ auto resx1 = _mm512_add_ps(tmp1, tmp2); \
1149
+ \
1150
+ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
1151
+ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
1152
+ \
1153
+ auto resx2 = _mm512_add_ps(tmp3, tmp4); \
1154
+ \
1155
+ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
1156
+ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
1157
+ \
1158
+ auto resx3 = _mm512_add_ps(tmp5, tmp6); \
1159
+ \
1160
+ halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1), \
1161
+ _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \
1162
+ }
1163
+
1164
+ XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
1165
+ XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
1166
+
1167
+ #undef XSIMD_AVX512_HADDP_STEP2
1168
+
1169
+ auto concat = _mm512_castps256_ps512(halfx0);
1170
+ concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1));
1171
+ return concat;
1172
+ }
1173
+
1174
+ template <class A>
1175
+ XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx512f>) noexcept
1176
+ {
1177
+ #define step1(I, a, b) \
1178
+ batch<double, avx512f> res##I; \
1179
+ { \
1180
+ auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
1181
+ auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
1182
+ res##I = _mm512_add_pd(tmp1, tmp2); \
1183
+ }
1184
+
1185
+ step1(1, row[0], row[2]);
1186
+ step1(2, row[4], row[6]);
1187
+ step1(3, row[1], row[3]);
1188
+ step1(4, row[5], row[7]);
1189
+
1190
+ #undef step1
1191
+
1192
+ auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0));
1193
+ auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1));
1194
+
1195
+ auto resx1 = _mm512_add_pd(tmp5, tmp6);
1196
+
1197
+ auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0));
1198
+ auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1));
1199
+
1200
+ auto resx2 = _mm512_add_pd(tmp7, tmp8);
1201
+
1202
+ auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000);
1203
+ auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111);
1204
+
1205
+ return _mm512_add_pd(tmpx, tmpy);
1206
+ }
1207
+
1208
+ // incr_if
1209
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1210
+ XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx512f>) noexcept
1211
+ {
1212
+
1213
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1214
+ {
1215
+ return _mm512_mask_add_epi32(self, mask.data, self, _mm512_set1_epi32(1));
1216
+ }
1217
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1218
+ {
1219
+ return _mm512_mask_add_epi64(self, mask.data, self, _mm512_set1_epi64(1));
1220
+ }
1221
+ else
1222
+ {
1223
+ return incr_if(self, mask, common {});
1224
+ }
1225
+ }
1226
+
1227
+ // insert
1228
+ template <class A, size_t I>
1229
+ XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512f>) noexcept
1230
+ {
1231
+
1232
+ int32_t tmp = bit_cast<int32_t>(val);
1233
+ return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_castps_si512(self), __mmask16(1 << (I & 15)), tmp));
1234
+ }
1235
+
1236
+ template <class A, size_t I>
1237
+ XSIMD_INLINE batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<avx512f>) noexcept
1238
+ {
1239
+ int64_t tmp = bit_cast<int64_t>(val);
1240
+ return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_castpd_si512(self), __mmask8(1 << (I & 7)), tmp));
1241
+ }
1242
+ template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1243
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx512f>) noexcept
1244
+ {
1245
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1246
+ {
1247
+ return _mm512_mask_set1_epi32(self, __mmask16(1 << (I & 15)), val);
1248
+ }
1249
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1250
+ {
1251
+ return _mm512_mask_set1_epi64(self, __mmask8(1 << (I & 7)), val);
1252
+ }
1253
+ else
1254
+ {
1255
+ return insert(self, val, pos, common {});
1256
+ }
1257
+ }
1258
+
1259
+ // isnan
1260
+ template <class A>
1261
+ XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512f>) noexcept
1262
+ {
1263
+ return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q);
1264
+ }
1265
+ template <class A>
1266
+ XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512f>) noexcept
1267
+ {
1268
+ return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q);
1269
+ }
1270
+
1271
+ // ldexp
1272
+ template <class A>
1273
+ XSIMD_INLINE batch<float, A> ldexp(const batch<float, A>& self, const batch<as_integer_t<float>, A>& other, requires_arch<avx512f>) noexcept
1274
+ {
1275
+ return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other));
1276
+ }
1277
+
1278
+ template <class A>
1279
+ XSIMD_INLINE batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512f>) noexcept
1280
+ {
1281
+ // FIXME: potential data loss here when converting other elements to
1282
+ // int32 before converting them back to double.
1283
+ __m512d adjusted_index = _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(other));
1284
+ return _mm512_scalef_pd(self, adjusted_index);
1285
+ }
1286
+
1287
+ // le
1288
+ template <class A>
1289
+ XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1290
+ {
1291
+ return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ);
1292
+ }
1293
+ template <class A>
1294
+ XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1295
+ {
1296
+ return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ);
1297
+ }
1298
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1299
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1300
+ {
1301
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_LE>(self, other);
1302
+ }
1303
+
1304
+ namespace detail
1305
+ {
1306
+ // Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array
1307
+ // Generate a bitset from an array of booleans.
1308
+ XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[8])
1309
+ {
1310
+ uint64_t data;
1311
+ memcpy(&data, unpacked, sizeof(uint64_t));
1312
+
1313
+ const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000);
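+ // Multiplying by this constant (== 0x0102040810204080) lifts the low bit of byte i up to bit 56 + i of the
+ // product; with strictly 0/1 input bytes no other term reaches the top byte, so the shift below extracts the packed mask.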
1314
+
1315
+ unsigned char res = ((data * magic) >> 56) & 0xFF;
1316
+ return res;
1317
+ }
1318
+ }
1319
+
1320
+ // load mask
1321
+ template <class A, class T>
1322
+ XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512f>) noexcept
1323
+ {
1324
+ using register_type = typename batch_bool<T, A>::register_type;
1325
+ constexpr auto size = batch_bool<T, A>::size;
1326
+ constexpr auto iter = size / 8;
1327
+ static_assert((size % 8) == 0, "incorrect size of bool batch");
1328
+ register_type mask = 0;
1329
+ for (std::size_t i = 0; i < iter; ++i)
1330
+ {
1331
+ unsigned char block = detail::tobitset((unsigned char*)mem + i * 8);
1332
+ mask |= (register_type(block) << (i * 8));
1333
+ }
1334
+ return mask;
1335
+ }
1336
+
1337
+ // load_aligned
1338
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1339
+ XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
1340
+ {
1341
+ return _mm512_load_si512((__m512i const*)mem);
1342
+ }
1343
+ template <class A>
1344
+ XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
1345
+ {
1346
+ return _mm512_load_ps(mem);
1347
+ }
1348
+ template <class A>
1349
+ XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
1350
+ {
1351
+ return _mm512_load_pd(mem);
1352
+ }
1353
+
1354
+ // load_complex
1355
+ namespace detail
1356
+ {
1357
+ template <class A>
1358
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx512f>) noexcept
1359
+ {
1360
+ __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
1361
+ __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
1362
+ auto real = _mm512_permutex2var_ps(hi, real_idx, lo);
1363
+ auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo);
1364
+ return { real, imag };
1365
+ }
1366
+ template <class A>
1367
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx512f>) noexcept
1368
+ {
1369
+ __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14);
1370
+ __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15);
1371
+ auto real = _mm512_permutex2var_pd(hi, real_idx, lo);
1372
+ auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo);
1373
+ return { real, imag };
1374
+ }
1375
+ }
1376
+
1377
+ // load_unaligned
1378
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1379
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
1380
+ {
1381
+ return _mm512_loadu_si512((__m512i const*)mem);
1382
+ }
1383
+ template <class A>
1384
+ XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
1385
+ {
1386
+ return _mm512_loadu_ps(mem);
1387
+ }
1388
+ template <class A>
1389
+ XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
1390
+ {
1391
+ return _mm512_loadu_pd(mem);
1392
+ }
1393
+
1394
+ // lt
1395
+ template <class A>
1396
+ XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1397
+ {
1398
+ return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ);
1399
+ }
1400
+ template <class A>
1401
+ XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1402
+ {
1403
+ return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ);
1404
+ }
1405
+
1406
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1407
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1408
+ {
1409
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_LT>(self, other);
1410
+ }
1411
+
1412
+ // mask
1413
+ template <class A, class T>
1414
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
1415
+ {
1416
+ return self.data;
1417
+ }
1418
+
1419
+ // max
1420
+ template <class A>
1421
+ XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1422
+ {
1423
+ return _mm512_max_ps(other, self);
1424
+ }
1425
+ template <class A>
1426
+ XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1427
+ {
1428
+ return _mm512_max_pd(other, self);
1429
+ }
1430
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1431
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1432
+ {
1433
+ if (std::is_signed<T>::value)
1434
+ {
1435
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1436
+ {
1437
+ return _mm512_max_epi32(self, other);
1438
+ }
1439
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1440
+ {
1441
+ return _mm512_max_epi64(self, other);
1442
+ }
1443
+ else
1444
+ {
1445
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1446
+ { return max(batch<T, avx2>(s), batch<T, avx2>(o)); },
1447
+ self, other);
1448
+ }
1449
+ }
1450
+ else
1451
+ {
1452
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1453
+ {
1454
+ return _mm512_max_epu32(self, other);
1455
+ }
1456
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1457
+ {
1458
+ return _mm512_max_epu64(self, other);
1459
+ }
1460
+ else
1461
+ {
1462
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1463
+ { return max(batch<T, avx2>(s), batch<T, avx2>(o)); },
1464
+ self, other);
1465
+ }
1466
+ }
1467
+ }
1468
+
1469
+ // min
1470
+ template <class A>
1471
+ XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1472
+ {
1473
+ return _mm512_min_ps(other, self);
1474
+ }
1475
+ template <class A>
1476
+ XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1477
+ {
1478
+ return _mm512_min_pd(other, self);
1479
+ }
1480
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1481
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1482
+ {
1483
+ if (std::is_signed<T>::value)
1484
+ {
1485
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1486
+ {
1487
+ return _mm512_min_epi32(self, other);
1488
+ }
1489
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1490
+ {
1491
+ return _mm512_min_epi64(self, other);
1492
+ }
1493
+ else
1494
+ {
1495
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1496
+ { return min(batch<T, avx2>(s), batch<T, avx2>(o)); },
1497
+ self, other);
1498
+ }
1499
+ }
1500
+ else
1501
+ {
1502
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1503
+ {
1504
+ return _mm512_min_epu32(self, other);
1505
+ }
1506
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1507
+ {
1508
+ return _mm512_min_epu64(self, other);
1509
+ }
1510
+ else
1511
+ {
1512
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1513
+ { return min(batch<T, avx2>(s), batch<T, avx2>(o)); },
1514
+ self, other);
1515
+ }
1516
+ }
1517
+ }
1518
+
1519
+ // mul
1520
+ template <class A>
1521
+ XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1522
+ {
1523
+ return _mm512_mul_ps(self, other);
1524
+ }
1525
+ template <class A>
1526
+ XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1527
+ {
1528
+ return _mm512_mul_pd(self, other);
1529
+ }
1530
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1531
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1532
+ {
1533
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1534
+ {
1535
+ return _mm512_mullo_epi32(self, other);
1536
+ }
1537
+ else
1538
+ {
1539
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1540
+ { return mul(batch<T, avx2>(s), batch<T, avx2>(o)); },
1541
+ self, other);
1542
+ }
1543
+ }
1544
+
1545
+ // nearbyint
1546
+ template <class A>
1547
+ XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
1548
+ {
1549
+ return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
1550
+ }
1551
+ template <class A>
1552
+ XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx512f>) noexcept
1553
+ {
1554
+ return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
1555
+ }
1556
+
1557
+ // nearbyint_as_int
1558
+ template <class A>
1559
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
1560
+ requires_arch<avx512f>) noexcept
1561
+ {
1562
+ return _mm512_cvtps_epi32(self);
1563
+ }
1564
+
1565
+ // neg
1566
+ template <class A, class T>
1567
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1568
+ {
1569
+ return 0 - self;
1570
+ }
1571
+
1572
+ // neq
1573
+ template <class A>
1574
+ XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1575
+ {
1576
+ return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ);
1577
+ }
1578
+ template <class A>
1579
+ XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1580
+ {
1581
+ return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ);
1582
+ }
1583
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1584
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1585
+ {
1586
+ return ~(self == other);
1587
+ }
1588
+
1589
+ template <class A, class T>
1590
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
1591
+ {
1592
+ using register_type = typename batch_bool<T, A>::register_type;
1593
+ return register_type(self.data ^ other.data);
1594
+ }
1595
+
1596
+ // reciprocal
1597
+ template <class A>
1598
+ XSIMD_INLINE batch<float, A>
1599
+ reciprocal(batch<float, A> const& self,
1600
+ kernel::requires_arch<avx512f>) noexcept
1601
+ {
1602
+ return _mm512_rcp14_ps(self);
1603
+ }
1604
+
1605
+ template <class A>
1606
+ XSIMD_INLINE batch<double, A>
1607
+ reciprocal(batch<double, A> const& self,
1608
+ kernel::requires_arch<avx512f>) noexcept
1609
+ {
1610
+ return _mm512_rcp14_pd(self);
1611
+ }
1612
+
1613
+ // reduce_add
1614
+ template <class A>
1615
+ XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
1616
+ {
1617
+ return _mm512_reduce_add_ps(rhs);
1618
+ }
1619
+ template <class A>
1620
+ XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
1621
+ {
1622
+ return _mm512_reduce_add_pd(rhs);
1623
+ }
1624
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1625
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1626
+ {
1627
+ __m256i low, high;
1628
+ detail::split_avx512(self, low, high);
1629
+ batch<T, avx2> blow(low), bhigh(high);
1630
+ return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
1631
+ }
1632
+
1633
+ // reduce_max
1634
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
1635
+ XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1636
+ {
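+ // Bring the high half of the register down, combine with an element-wise max, then finish the reduction
+ // on the low 256 bits with the avx2 kernel.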
1637
+ constexpr batch_constant<uint64_t, A, 4, 5, 6, 7, 0, 0, 0, 0> mask;
1638
+ batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
1639
+ batch<T, A> acc = max(self, step);
1640
+ __m256i low = _mm512_castsi512_si256(acc);
1641
+ return reduce_max(batch<T, avx2>(low));
1642
+ }
1643
+
1644
+ // reduce_min
1645
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
1646
+ XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1647
+ {
1648
+ constexpr batch_constant<uint64_t, A, 4, 5, 6, 7, 0, 0, 0, 0> mask;
1649
+ batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
1650
+ batch<T, A> acc = min(self, step);
1651
+ __m256i low = _mm512_castsi512_si256(acc);
1652
+ return reduce_min(batch<T, avx2>(low));
1653
+ }
1654
+
1655
+ // reduce_mul
1656
+ template <class A>
1657
+ XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
1658
+ {
1659
+ return _mm512_reduce_mul_ps(rhs);
1660
+ }
1661
+ template <class A>
1662
+ XSIMD_INLINE double reduce_mul(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
1663
+ {
1664
+ return _mm512_reduce_mul_pd(rhs);
1665
+ }
1666
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1667
+ XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1668
+ {
1669
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1670
+ {
1671
+ return _mm512_reduce_mul_epi32(self);
1672
+ }
1673
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1674
+ {
1675
+ return _mm512_reduce_mul_epi64(self);
1676
+ }
1677
+ else
1678
+ {
1679
+ __m256i low, high;
1680
+ detail::split_avx512(self, low, high);
1681
+ batch<T, avx2> blow(low), bhigh(high);
1682
+ return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
1683
+ }
1684
+ }
1685
+
1686
+ // rsqrt
1687
+ template <class A>
1688
+ XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
1689
+ {
1690
+ return _mm512_rsqrt14_ps(val);
1691
+ }
1692
+ template <class A>
1693
+ XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
1694
+ {
1695
+ return _mm512_rsqrt14_pd(val);
1696
+ }
1697
+
1698
+ // sadd
1699
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1700
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1701
+ {
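+ // No native saturating add at this width, so clamp one operand against the remaining headroom before adding:
+ // for signed types the clamp direction follows the sign of other, for unsigned types we add at most the distance to the maximum.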
1702
+ if (std::is_signed<T>::value)
1703
+ {
1704
+ auto mask = other < 0;
1705
+ auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
1706
+ auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
1707
+ return other + select(mask, self_neg_branch, self_pos_branch);
1708
+ }
1709
+ else
1710
+ {
1711
+ const auto diffmax = std::numeric_limits<T>::max() - self;
1712
+ const auto mindiff = min(diffmax, other);
1713
+ return self + mindiff;
1714
+ }
1715
+ }
1716
+
1717
+ // scatter
1718
+ template <class A, class T,
1719
+ class = typename std::enable_if<std::is_same<uint32_t, T>::value || std::is_same<int32_t, T>::value, void>::type>
1720
+ XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst,
1721
+ batch<int32_t, A> const& index,
1722
+ kernel::requires_arch<avx512f>) noexcept
1723
+ {
1724
+ _mm512_i32scatter_epi32(dst, index, src, sizeof(T));
1725
+ }
1726
+
1727
+ template <class A, class T,
1728
+ class = typename std::enable_if<std::is_same<uint64_t, T>::value || std::is_same<int64_t, T>::value, void>::type>
1729
+ XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst,
1730
+ batch<int64_t, A> const& index,
1731
+ kernel::requires_arch<avx512f>) noexcept
1732
+ {
1733
+ _mm512_i64scatter_epi64(dst, index, src, sizeof(T));
1734
+ }
1735
+
1736
+ template <class A>
1737
+ XSIMD_INLINE void scatter(batch<float, A> const& src, float* dst,
1738
+ batch<int32_t, A> const& index,
1739
+ kernel::requires_arch<avx512f>) noexcept
1740
+ {
1741
+ _mm512_i32scatter_ps(dst, index, src, sizeof(float));
1742
+ }
1743
+
1744
+ template <class A>
1745
+ XSIMD_INLINE void scatter(batch<double, A> const& src, double* dst,
1746
+ batch<int64_t, A> const& index,
1747
+ kernel::requires_arch<avx512f>) noexcept
1748
+ {
1749
+ _mm512_i64scatter_pd(dst, index, src, sizeof(double));
1750
+ }
1751
+
1752
+ // select
1753
+ template <class A>
1754
+ XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512f>) noexcept
1755
+ {
1756
+ return _mm512_mask_blend_ps(cond, false_br, true_br);
1757
+ }
1758
+ template <class A>
1759
+ XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512f>) noexcept
1760
+ {
1761
+ return _mm512_mask_blend_pd(cond, false_br, true_br);
1762
+ }
1763
+
1764
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1765
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
1766
+ {
1767
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1768
+ {
1769
+ alignas(avx2::alignment()) uint8_t buffer[64];
1770
+ // FIXME: ultra inefficient
1771
+ for (int i = 0; i < 64; ++i)
1772
+ buffer[i] = cond.data & (1ull << i) ? 0xFF : 0;
1773
+ __m256i cond_low = batch<uint8_t, avx2>::load_aligned(&buffer[0]);
1774
+ __m256i cond_hi = batch<uint8_t, avx2>::load_aligned(&buffer[32]);
1775
+
1776
+ __m256i true_low, true_hi;
1777
+ detail::split_avx512(true_br, true_low, true_hi);
1778
+
1779
+ __m256i false_low, false_hi;
1780
+ detail::split_avx512(false_br, false_low, false_hi);
1781
+
1782
+ __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
1783
+ __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
1784
+ return detail::merge_avx(res_low, res_hi);
1785
+ }
1786
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1787
+ {
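+ // Expand the 32-bit condition mask into two 256-bit vectors of 16-bit lanes (all-ones where the corresponding
+ // mask bit is set), then split the operands and reuse the avx2 select on each half.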
1788
+ __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0));
1789
+ __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0));
1790
+
1791
+ __m256i true_low, true_hi;
1792
+ detail::split_avx512(true_br, true_low, true_hi);
1793
+
1794
+ __m256i false_low, false_hi;
1795
+ detail::split_avx512(false_br, false_low, false_hi);
1796
+
1797
+ __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
1798
+ __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
1799
+ return detail::merge_avx(res_low, res_hi);
1800
+ }
1801
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1802
+ {
1803
+ return _mm512_mask_blend_epi32(cond, false_br, true_br);
1804
+ }
1805
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1806
+ {
1807
+ return _mm512_mask_blend_epi64(cond, false_br, true_br);
1808
+ }
1809
+ else
1810
+ {
1811
+ assert(false && "unsupported arch/type combination");
1812
+ return {};
1813
+ }
1814
+ }
1815
+
1816
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1817
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
1818
+ {
1819
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512f {});
1820
+ }
1821
+
1822
+ namespace detail
1823
+ {
1824
+ template <class T>
1825
+ using enable_signed_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value,
1826
+ int>::type;
1827
+
1828
+ template <class T>
1829
+ using enable_unsigned_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_unsigned<T>::value,
1830
+ int>::type;
1831
+ }
1832
+
1833
+ // set
1834
+ template <class A>
1835
+ XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<avx512f>, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept
1836
+ {
1837
+ return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1838
+ }
1839
+
1840
+ template <class A>
1841
+ XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<avx512f>, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept
1842
+ {
1843
+ return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7);
1844
+ }
1845
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1846
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
1847
+ {
1848
+ return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0);
1849
+ }
1850
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1851
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1852
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
1853
+ {
1854
+ return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1855
+ }
1856
+ template <class A, class T, detail::enable_signed_integer_t<T> = 0>
1857
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1858
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1859
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1860
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
1861
+ {
1862
+ #if defined(__clang__) || __GNUC__
1863
+ return __extension__(__m512i)(__v32hi) {
1864
+ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1865
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
1866
+ };
1867
+ #else
1868
+ return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1869
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1870
+ #endif
1871
+ }
1872
+
1873
+ template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
1874
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1875
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1876
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1877
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
1878
+ {
1879
+ #if defined(__clang__) || __GNUC__
1880
+ return __extension__(__m512i)(__v32hu) {
1881
+ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1882
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
1883
+ };
1884
+ #else
1885
+ return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1886
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1887
+ #endif
1888
+ }
1889
+
1890
+ template <class A, class T, detail::enable_signed_integer_t<T> = 0>
1891
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1892
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1893
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1894
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
1895
+ T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
1896
+ T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
1897
+ T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
1898
+ T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
1899
+ {
1900
+
1901
+ #if defined(__clang__) || __GNUC__
1902
+ return __extension__(__m512i)(__v64qi) {
1903
+ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1904
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
1905
+ v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
1906
+ v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
1907
+ };
1908
+ #else
1909
+ return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
1910
+ v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
1911
+ v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1912
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1913
+ #endif
1914
+ }
1915
+ template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
1916
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1917
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1918
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1919
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
1920
+ T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
1921
+ T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
1922
+ T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
1923
+ T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
1924
+ {
1925
+
1926
+ #if defined(__clang__) || __GNUC__
1927
+ return __extension__(__m512i)(__v64qu) {
1928
+ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1929
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
1930
+ v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
1931
+ v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
1932
+ };
1933
+ #else
1934
+ return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
1935
+ v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
1936
+ v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1937
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1938
+ #endif
1939
+ }
1940
+
1941
+ template <class A, class T, class... Values>
1942
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512f>, Values... values) noexcept
1943
+ {
1944
+ static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
1945
+ using register_type = typename batch_bool<T, A>::register_type;
1946
+ register_type r = 0;
1947
+ unsigned shift = 0;
1948
+ (void)std::initializer_list<register_type> { (r |= register_type(values ? 1 : 0) << (shift++))... };
1949
+ return r;
1950
+ }
1951
+
1952
+ // shuffle
1953
+ template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7, ITy I8, ITy I9, ITy I10, ITy I11, ITy I12, ITy I13, ITy I14, ITy I15>
1954
+ XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y,
1955
+ batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask,
1956
+ requires_arch<avx512f>) noexcept
1957
+ {
1958
+ constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6);
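+ // smask packs the low two bits of I0..I3 into the immediate expected by _mm512_shuffle_ps; the checks below
+ // only take the intrinsic fast path when the same pattern repeats in every 128-bit lane.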
1959
+
1960
+ // shuffle within lane
1961
+ if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I0 < 4 && I1 < 4 && I2 >= 16 && I2 < 20 && I3 >= 16 && I3 < 20)
1962
+ return _mm512_shuffle_ps(x, y, smask);
1963
+
1964
+ // shuffle within opposite lane
1965
+ if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I2 < 4 && I3 < 4 && I0 >= 16 && I0 < 20 && I1 >= 16 && I1 < 20)
1966
+ return _mm512_shuffle_ps(y, x, smask);
1967
+
1968
+ return shuffle(x, y, mask, common {});
1969
+ }
1970
+
1971
+ template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
1972
+ XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept
1973
+ {
1974
+ constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7);
1975
+ // shuffle within lane
1976
+ if (I0 < 2 && I1 >= 8 && I1 < 10 && I2 >= 2 && I2 < 4 && I3 >= 10 && I3 < 12 && I4 >= 4 && I4 < 6 && I5 >= 12 && I5 < 14 && I6 >= 6 && I6 < 8 && I7 >= 14)
1977
+ return _mm512_shuffle_pd(x, y, smask);
1978
+
1979
+ // shuffle within opposite lane
1980
+ if (I1 < 2 && I0 >= 8 && I0 < 10 && I3 >= 2 && I3 < 4 && I2 >= 10 && I2 < 12 && I5 >= 4 && I5 < 6 && I4 >= 12 && I4 < 14 && I7 >= 6 && I7 < 8 && I6 >= 14)
1981
+ return _mm512_shuffle_pd(y, x, smask);
1982
+
1983
+ return shuffle(x, y, mask, common {});
1984
+ }
1985
+
1986
+ // slide_left
1987
+ namespace detail
1988
+ {
1989
+ template <size_t N>
1990
+ struct make_slide_left_pattern
1991
+ {
1992
+ static constexpr size_t get(size_t i, size_t)
1993
+ {
1994
+ return i >= N ? i - N : 0;
1995
+ }
1996
+ };
1997
+
1998
+ template <size_t N, class A, class T>
1999
+ XSIMD_INLINE batch<T, A> slide_left_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
2000
+ {
2001
+ static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
2002
+
2003
+ if (N == 0)
2004
+ {
2005
+ return x;
2006
+ }
2007
+ if (N >= 64)
2008
+ {
2009
+ return batch<T, A>(T(0));
2010
+ }
2011
+
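+ // Slide in whole 32-bit lanes: the mask zeroes the lanes vacated by the slide; when N is a multiple of
+ // 16 bytes whole 128-bit blocks are moved with shuffle_i32x4, otherwise a full cross-lane permute is used.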
2012
+ __mmask16 mask = uint16_t(0xFFFFu << (N / 4));
2013
+
2014
+ if ((N & 15) == 0)
2015
+ {
2016
+ const uint8_t imm8 = uint8_t(0xe4 << (2 * (N / 16)));
2017
+ return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
2018
+ }
2019
+
2020
+ auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_left_pattern<N / 4>, A>();
2021
+ return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
2022
+ }
2023
+ }
2024
+
2025
+ template <size_t N, class A, class T>
2026
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512f>) noexcept
2027
+ {
2028
+ constexpr size_t NN = N & ~3;
2029
+ if (N == NN || NN >= 64)
2030
+ {
2031
+ // Call fast path
2032
+ return detail::slide_left_aligned_u32<NN>(x, A {});
2033
+ }
2034
+
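+ // General case: slide by the dword-aligned part, then shift each 32-bit lane by the remaining byte count
+ // and OR in the bits that cross a lane boundary.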
2035
+ __m512i xl = detail::slide_left_aligned_u32<NN, A, T>(_mm512_slli_epi32(x, 8 * (N - NN)), A {});
2036
+ __m512i xr = detail::slide_left_aligned_u32<NN + 4, A, T>(_mm512_srli_epi32(x, 32 - 8 * (N - NN)), A {});
2037
+ return _mm512_or_epi32(xl, xr);
2038
+ }
2039
+
2040
+ // slide_right
2041
+ namespace detail
2042
+ {
2043
+ template <size_t N>
2044
+ struct make_slide_right_pattern
2045
+ {
2046
+ static constexpr size_t get(size_t i, size_t n)
2047
+ {
2048
+ return i < (n - N) ? i + N : 0;
2049
+ }
2050
+ };
2051
+
2052
+ template <size_t N, class A, class T>
2053
+ XSIMD_INLINE batch<T, A> slide_right_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
2054
+ {
2055
+ static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
2056
+
2057
+ if (N == 0)
2058
+ {
2059
+ return x;
2060
+ }
2061
+ if (N >= 64)
2062
+ {
2063
+ return batch<T, A>(T(0));
2064
+ }
2065
+
2066
+ __mmask16 mask = 0xFFFFu >> (N / 4);
2067
+
2068
+ if ((N & 15) == 0)
2069
+ {
2070
+ const uint8_t imm8 = 0xe4 >> (2 * (N / 16));
2071
+ return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
2072
+ }
2073
+
2074
+ auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_right_pattern<N / 4>, A>();
2075
+ return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
2076
+ }
2077
+ }
2078
+ template <size_t N, class A, class T>
2079
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512f>) noexcept
2080
+ {
2081
+ constexpr size_t NN = N & ~3;
2082
+ if (N == NN || NN >= 64)
2083
+ {
2084
+ // Call fast path
2085
+ return detail::slide_right_aligned_u32<NN>(x, A {});
2086
+ }
2087
+
2088
+ __m512i xl = detail::slide_right_aligned_u32<NN + 4, A, T>(_mm512_slli_epi32(x, 32 - 8 * (N - NN)), A {});
2089
+ __m512i xr = detail::slide_right_aligned_u32<NN, A, T>(_mm512_srli_epi32(x, 8 * (N - NN)), A {});
2090
+ return _mm512_or_epi32(xl, xr);
2091
+ }
2092
+
2093
+ // sqrt
2094
+ template <class A>
2095
+ XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
2096
+ {
2097
+ return _mm512_sqrt_ps(val);
2098
+ }
2099
+ template <class A>
2100
+ XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
2101
+ {
2102
+ return _mm512_sqrt_pd(val);
2103
+ }
2104
+
2105
+ // ssub
2106
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
2107
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
2108
+ {
2109
+ if (std::is_signed<T>::value)
2110
+ {
2111
+ return sadd(self, -other);
2112
+ }
2113
+ else
2114
+ {
2115
+ const auto diff = min(self, other);
2116
+ return self - diff;
2117
+ }
2118
+ }
2119
+
2120
+ // store
2121
+ template <class T, class A>
2122
+ XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512f>) noexcept
2123
+ {
2124
+ using register_type = typename batch_bool<T, A>::register_type;
+ constexpr auto size = batch_bool<T, A>::size;
+ for (std::size_t i = 0; i < size; ++i)
+ mem[i] = self.data & (register_type(1) << i);
+ }
+
+ // store_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_store_si512((__m512i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_store_si512((__m512i*)mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_store_ps(mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_store_pd(mem, self);
+ }
+
+ // store_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_storeu_si512((__m512i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_storeu_si512((__m512i*)mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_storeu_ps(mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_storeu_pd(mem, self);
+ }
+
+ // sub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
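+ // AVX512F has no 8/16-bit integer arithmetic (that needs AVX512BW), so forward those cases to the two AVX2 halves.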
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_sub_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_sub_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_sub_ps(self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_sub_pd(self, other);
+ }
+
+ // swizzle (dynamic version)
+ template <class A>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_permutexvar_ps(mask, self);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_permutexvar_pd(mask, self);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_permutexvar_epi64(mask, self);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
+ {
+ return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512f {}));
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_permutexvar_epi32(mask, self);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
+ {
+ return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+ uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask,
+ requires_arch<avx512f>) noexcept
+ {
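+ // Fast paths: an identity mask is a no-op, and a mask that stays within 128-bit lanes with the same pattern in every lane maps to a single in-lane permute.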
+ XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+ {
+ return self;
+ }
+ XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+ {
+ constexpr int imm0 = detail::mod_shuffle(V0, V1, V2, V3);
+ constexpr int imm1 = detail::mod_shuffle(V4, V5, V6, V7);
+ constexpr int imm2 = detail::mod_shuffle(V8, V9, V10, V11);
+ constexpr int imm3 = detail::mod_shuffle(V12, V13, V14, V15);
+ XSIMD_IF_CONSTEXPR(imm0 == imm1 && imm0 == imm2 && imm0 == imm3)
+ {
+ return _mm512_permute_ps(self, imm0);
+ }
+ }
+ return swizzle(self, mask.as_batch(), avx512f {});
+ }
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, uint64_t V4, uint64_t V5, uint64_t V6, uint64_t V7>
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
+ batch_constant<uint64_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask,
+ requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+ {
+ return self;
+ }
+ XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+ {
+ constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3) | ((V4 & 1) << 4) | ((V5 & 1) << 5) | ((V6 & 1) << 6) | ((V7 & 1) << 7);
+ return _mm512_permute_pd(self, imm);
+ }
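+ // When the mask duplicates a single 256-bit half across the register, swizzle that half with the AVX2 kernel and broadcast the result.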
+ constexpr bool dup_lo = detail::is_dup_lo(mask);
+ constexpr bool dup_hi = detail::is_dup_hi(mask);
+ XSIMD_IF_CONSTEXPR(dup_lo || dup_hi)
+ {
+ const batch<double, avx2> half = _mm512_extractf64x4_pd(self, dup_lo ? 0 : 1);
+ constexpr typename std::conditional<dup_lo, batch_constant<uint64_t, avx2, V0 % 4, V1 % 4, V2 % 4, V3 % 4>,
+ batch_constant<uint64_t, avx2, V4 % 4, V5 % 4, V6 % 4, V7 % 4>>::type half_mask {};
+ return _mm512_broadcast_f64x4(swizzle(half, half_mask, avx2 {}));
+ }
+ // General case
+ return swizzle(self, mask.as_batch(), avx512f {});
+ }
+
+ template <class A, uint64_t... Vs>
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return swizzle(self, mask.as_batch(), avx512f {});
+ }
+
+ template <class A, uint64_t... Vs>
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return swizzle(self, mask.as_batch(), avx512f {});
+ }
+
+ template <class A, uint32_t... Vs>
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return swizzle(self, mask.as_batch(), avx512f {});
+ }
+
+ template <class A, uint32_t... Vs>
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return swizzle(self, mask.as_batch(), avx512f {});
+ }
+
+ namespace detail
+ {
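+ // A 16-bit swizzle whose mask moves indices in aligned (2k, 2k + 1) pairs can be folded into a single 32-bit permute; these helpers detect that shape and build the folded mask.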
+ template <class T, class A, T... Idx>
+ struct is_pair_of_contiguous_indices;
+
+ template <class T, class A>
+ struct is_pair_of_contiguous_indices<T, A> : std::true_type
+ {
+ };
+
+ template <class T, class A, T Idx0, T Idx1, T... Idx>
+ struct is_pair_of_contiguous_indices<T, A, Idx0, Idx1, Idx...> : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices<T, A, Idx...>, std::false_type>::type
+ {
+ };
+
+ template <class A, uint16_t I0, uint16_t I1, uint16_t I2, uint16_t I3, uint16_t I4, uint16_t I5, uint16_t I6, uint16_t I7,
+ uint16_t I8, uint16_t I9, uint16_t I10, uint16_t I11, uint16_t I12, uint16_t I13, uint16_t I14, uint16_t I15,
+ uint16_t I16, uint16_t I17, uint16_t I18, uint16_t I19, uint16_t I20, uint16_t I21, uint16_t I22, uint16_t I23,
+ uint16_t I24, uint16_t I25, uint16_t I26, uint16_t I27, uint16_t I28, uint16_t I29, uint16_t I30, uint16_t I31>
+ struct fold_batch_constant
+ {
+ using type = batch_constant<uint32_t, A, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
+ I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
+ };
+
+ }
+
+ template <class A, uint16_t... Idx, class _ = typename std::enable_if<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value, void>::type>
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
+ {
+ constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
+ return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<uint16_t, A>
+ swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
+ {
+ // FIXME: this sequence is very inefficient, but it's here to catch
+ // a pattern generated by detail::reduce from xsimd_common_math.hpp.
+ // The whole pattern is actually decently folded by GCC and Clang,
+ // so bear with it.
+ constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
+ auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+
+ alignas(A::alignment()) uint16_t buffer[32];
+ _mm512_store_si512((__m512i*)&buffer[0], tmp);
+ buffer[0] = buffer[1];
+ return _mm512_load_si512(&buffer[0]);
+ }
+
+ template <class A, uint16_t... Vs>
+ XSIMD_INLINE batch<int16_t, A>
+ swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {}));
+ }
+
+ // transpose
+ template <class A>
+ XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<avx512f>) noexcept
+ {
+ assert((matrix_end - matrix_begin == batch<uint16_t, A>::size) && "correctly sized matrix");
+ (void)matrix_end;
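+ // View the 32x32 uint16 matrix as four 16x16 blocks, transpose each block with the AVX2 kernel, then merge the 256-bit halves back into 512-bit rows.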
+ batch<uint16_t, avx2> tmp_lo0[16];
+ for (int i = 0; i < 16; ++i)
+ tmp_lo0[i] = _mm512_castsi512_si256(matrix_begin[i]);
+ transpose(tmp_lo0 + 0, tmp_lo0 + 16, avx2 {});
+
+ batch<uint16_t, avx2> tmp_hi0[16];
+ for (int i = 0; i < 16; ++i)
+ tmp_hi0[i] = _mm512_castsi512_si256(matrix_begin[16 + i]);
+ transpose(tmp_hi0 + 0, tmp_hi0 + 16, avx2 {});
+
+ batch<uint16_t, avx2> tmp_lo1[16];
+ for (int i = 0; i < 16; ++i)
+ tmp_lo1[i] = _mm512_extracti64x4_epi64(matrix_begin[i], 1);
+ transpose(tmp_lo1 + 0, tmp_lo1 + 16, avx2 {});
+
+ batch<uint16_t, avx2> tmp_hi1[16];
+ for (int i = 0; i < 16; ++i)
+ tmp_hi1[i] = _mm512_extracti64x4_epi64(matrix_begin[16 + i], 1);
+ transpose(tmp_hi1 + 0, tmp_hi1 + 16, avx2 {});
+
+ for (int i = 0; i < 16; ++i)
+ matrix_begin[i] = detail::merge_avx(tmp_lo0[i], tmp_hi0[i]);
+ for (int i = 0; i < 16; ++i)
+ matrix_begin[i + 16] = detail::merge_avx(tmp_lo1[i], tmp_hi1[i]);
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<int16_t, A>* matrix_begin, batch<int16_t, A>* matrix_end, requires_arch<avx512f>) noexcept
+ {
+ return transpose(reinterpret_cast<batch<uint16_t, A>*>(matrix_begin), reinterpret_cast<batch<uint16_t, A>*>(matrix_end), A {});
+ }
+
+ template <class A>
+ XSIMD_INLINE void transpose(batch<uint8_t, A>* matrix_begin, batch<uint8_t, A>* matrix_end, requires_arch<avx512f>) noexcept
+ {
+ assert((matrix_end - matrix_begin == batch<uint8_t, A>::size) && "correctly sized matrix");
+ (void)matrix_end;
+ batch<uint8_t, avx2> tmp_lo0[32];
+ for (int i = 0; i < 32; ++i)
+ tmp_lo0[i] = _mm512_castsi512_si256(matrix_begin[i]);
+ transpose(tmp_lo0 + 0, tmp_lo0 + 32, avx2 {});
+
+ batch<uint8_t, avx2> tmp_hi0[32];
+ for (int i = 0; i < 32; ++i)
+ tmp_hi0[i] = _mm512_castsi512_si256(matrix_begin[32 + i]);
+ transpose(tmp_hi0 + 0, tmp_hi0 + 32, avx2 {});
+
+ batch<uint8_t, avx2> tmp_lo1[32];
+ for (int i = 0; i < 32; ++i)
+ tmp_lo1[i] = _mm512_extracti64x4_epi64(matrix_begin[i], 1);
+ transpose(tmp_lo1 + 0, tmp_lo1 + 32, avx2 {});
+
+ batch<uint8_t, avx2> tmp_hi1[32];
+ for (int i = 0; i < 32; ++i)
+ tmp_hi1[i] = _mm512_extracti64x4_epi64(matrix_begin[32 + i], 1);
+ transpose(tmp_hi1 + 0, tmp_hi1 + 32, avx2 {});
+
+ for (int i = 0; i < 32; ++i)
+ matrix_begin[i] = detail::merge_avx(tmp_lo0[i], tmp_hi0[i]);
+ for (int i = 0; i < 32; ++i)
+ matrix_begin[i + 32] = detail::merge_avx(tmp_lo1[i], tmp_hi1[i]);
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<int8_t, A>* matrix_begin, batch<int8_t, A>* matrix_end, requires_arch<avx512f>) noexcept
+ {
+ return transpose(reinterpret_cast<batch<uint8_t, A>*>(matrix_begin), reinterpret_cast<batch<uint8_t, A>*>(matrix_end), A {});
+ }
+
+ // trunc
+ template <class A>
+ XSIMD_INLINE batch<float, A>
+ trunc(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A>
+ trunc(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
+ }
+
+ // zip_hi
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A>
+ zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ __m512i lo, hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ assert(false && "not implemented yet");
+ return {};
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ assert(false && "not implemented yet");
+ return {};
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ lo = _mm512_unpacklo_epi32(self, other);
+ hi = _mm512_unpackhi_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ lo = _mm512_unpacklo_epi64(self, other);
+ hi = _mm512_unpackhi_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
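+ // The unpack intrinsics interleave within each 128-bit lane; rearrange the lanes to assemble the interleave of the upper halves.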
+ return _mm512_inserti32x4(
+ _mm512_inserti32x4(
+ _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
+ _mm512_extracti32x4_epi32(lo, 3),
+ 2),
+ _mm512_extracti32x4_epi32(hi, 2),
+ 1);
+ }
+ template <class A>
+ XSIMD_INLINE batch<float, A>
+ zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ auto lo = _mm512_unpacklo_ps(self, other);
+ auto hi = _mm512_unpackhi_ps(self, other);
+ return _mm512_insertf32x4(
+ _mm512_insertf32x4(
+ _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0),
+ _mm512_extractf32x4_ps(lo, 3),
+ 2),
+ _mm512_extractf32x4_ps(hi, 2),
+ 1);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A>
+ zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
+ auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
+ return _mm512_castps_pd(_mm512_insertf32x4(
+ _mm512_insertf32x4(
+ _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0),
+ _mm512_extractf32x4_ps(lo, 3),
+ 2),
+ _mm512_extractf32x4_ps(hi, 2),
+ 1));
+ }
+
+ // zip_lo
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A>
+ zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ __m512i lo, hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ assert(false && "not implemented yet");
+ return {};
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ assert(false && "not implemented yet");
+ return {};
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ lo = _mm512_unpacklo_epi32(self, other);
+ hi = _mm512_unpackhi_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ lo = _mm512_unpacklo_epi64(self, other);
+ hi = _mm512_unpackhi_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
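+ // Same 128-bit lane fix-up as zip_hi, assembled from the lower halves instead.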
+ return _mm512_inserti32x4(
+ _mm512_inserti32x4(
+ _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
+ _mm512_extracti32x4_epi32(hi, 1),
+ 3),
+ _mm512_extracti32x4_epi32(lo, 1),
+ 2);
+ }
+ template <class A>
+ XSIMD_INLINE batch<float, A>
+ zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ auto lo = _mm512_unpacklo_ps(self, other);
+ auto hi = _mm512_unpackhi_ps(self, other);
+ return _mm512_insertf32x4(
+ _mm512_insertf32x4(
+ _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1),
+ _mm512_extractf32x4_ps(hi, 1),
+ 3),
+ _mm512_extractf32x4_ps(lo, 1),
+ 2);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A>
+ zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
+ auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
+ return _mm512_castps_pd(_mm512_insertf32x4(
+ _mm512_insertf32x4(
+ _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1),
+ _mm512_extractf32x4_ps(hi, 1),
+ 3),
+ _mm512_extractf32x4_ps(lo, 1),
+ 2));
+ }
+
+ // first
+ template <class A>
+ XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvtss_f32(self);
+ }
+
+ template <class A>
+ XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvtsd_f64(self);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
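+ // _mm512_cvtsi512_si32 reads the low 32 bits; mask down to the element width for 8- and 16-bit types and go through the SSE register for 64-bit.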
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFFFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return static_cast<T>(_mm512_cvtsi512_si32(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ batch<T, sse4_2> low = _mm512_castsi512_si128(self);
+ return first(low, sse4_2 {});
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ }
+ }
+
+ #endif