sequenzo-0.1.24-cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic.

Files changed (264)
  1. _sequenzo_fastcluster.cpython-311-darwin.so +0 -0
  2. sequenzo/__init__.py +240 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +474 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +20 -0
  30. sequenzo/data_preprocessing/helpers.py +256 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/mvad.csv +713 -0
  44. sequenzo/datasets/pairfam_family.csv +1867 -0
  45. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  46. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  47. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  48. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  49. sequenzo/define_sequence_data.py +609 -0
  50. sequenzo/dissimilarity_measures/__init__.py +31 -0
  51. sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
  52. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  53. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  54. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  55. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  56. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  57. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  58. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  59. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  60. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  61. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  62. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  63. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  214. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  215. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  216. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
  217. sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
  218. sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
  219. sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
  220. sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
  221. sequenzo/multidomain/__init__.py +23 -0
  222. sequenzo/multidomain/association_between_domains.py +311 -0
  223. sequenzo/multidomain/cat.py +431 -0
  224. sequenzo/multidomain/combt.py +519 -0
  225. sequenzo/multidomain/dat.py +89 -0
  226. sequenzo/multidomain/idcd.py +139 -0
  227. sequenzo/multidomain/linked_polyad.py +292 -0
  228. sequenzo/openmp_setup.py +233 -0
  229. sequenzo/prefix_tree/__init__.py +43 -0
  230. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  231. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  232. sequenzo/prefix_tree/utils.py +54 -0
  233. sequenzo/sequence_characteristics/__init__.py +40 -0
  234. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  235. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  236. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  237. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  238. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  239. sequenzo/sequence_characteristics/turbulence.py +155 -0
  240. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  241. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  242. sequenzo/suffix_tree/__init__.py +48 -0
  243. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  244. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  245. sequenzo/suffix_tree/utils.py +56 -0
  246. sequenzo/visualization/__init__.py +29 -0
  247. sequenzo/visualization/plot_mean_time.py +194 -0
  248. sequenzo/visualization/plot_modal_state.py +276 -0
  249. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  250. sequenzo/visualization/plot_relative_frequency.py +404 -0
  251. sequenzo/visualization/plot_sequence_index.py +951 -0
  252. sequenzo/visualization/plot_single_medoid.py +153 -0
  253. sequenzo/visualization/plot_state_distribution.py +627 -0
  254. sequenzo/visualization/plot_transition_matrix.py +190 -0
  255. sequenzo/visualization/utils/__init__.py +23 -0
  256. sequenzo/visualization/utils/utils.py +310 -0
  257. sequenzo/with_event_history_analysis/__init__.py +35 -0
  258. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  259. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  260. sequenzo-0.1.24.dist-info/METADATA +255 -0
  261. sequenzo-0.1.24.dist-info/RECORD +264 -0
  262. sequenzo-0.1.24.dist-info/WHEEL +5 -0
  263. sequenzo-0.1.24.dist-info/licenses/LICENSE +28 -0
  264. sequenzo-0.1.24.dist-info/top_level.txt +2 -0
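
The listing above shows that the wheel ships prebuilt CPython 3.11 extension modules (the *.cpython-311-darwin.so entries) next to their C++ sources, including a vendored copy of xsimd under sequenzo/dissimilarity_measures/src/xsimd/. As a minimal sketch of how a reviewer might cross-check a listing like this locally, the Python snippet below (not part of the sequenzo package; it only assumes the wheel named at the top of this page has been downloaded, for example with "pip download sequenzo==0.1.24 --no-deps") enumerates the wheel's contents with the standard-library zipfile module and tallies entries by suffix.

# Minimal sketch (not part of sequenzo): inspect a downloaded wheel locally.
# Assumes the wheel named at the top of this page is in the current directory.
from collections import Counter
from pathlib import Path
from zipfile import ZipFile

wheel_path = Path("sequenzo-0.1.24-cp311-cp311-macosx_10_9_x86_64.whl")

with ZipFile(wheel_path) as wheel:
    names = wheel.namelist()

# Tally entries by file suffix (.py, .so, .cpp, .csv, ...) to mirror the listing above.
by_suffix = Counter(Path(name).suffix or "<none>" for name in names)
print(f"{len(names)} entries in {wheel_path.name}")
for suffix, count in by_suffix.most_common():
    print(f"  {suffix:12s} {count}")

# Compiled CPython 3.11 extension modules correspond to the .so entries in the listing.
for name in names:
    if name.endswith(".cpython-311-darwin.so"):
        print("extension module:", name)

Comparing the suffix counts and the extension-module list produced by this sketch against the 264 files shown here gives one quick way to sanity-check that a locally downloaded wheel matches this diff.
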
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp
@@ -0,0 +1,1924 @@
1
+ /***************************************************************************
2
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3
+ * Martin Renou *
4
+ * Copyright (c) QuantStack *
5
+ * Copyright (c) Serge Guelton *
6
+ * Copyright (c) Marco Barbone *
7
+ * *
8
+ * Distributed under the terms of the BSD 3-Clause License. *
9
+ * *
10
+ * The full license is in the file LICENSE, distributed with this software. *
11
+ ****************************************************************************/
12
+
13
+ #ifndef XSIMD_AVX_HPP
14
+ #define XSIMD_AVX_HPP
15
+
16
+ #include <complex>
17
+ #include <limits>
18
+ #include <type_traits>
19
+
20
+ #include "../types/xsimd_avx_register.hpp"
21
+
22
+ namespace xsimd
23
+ {
24
+ namespace kernel
25
+ {
26
+ using namespace types;
27
+
28
+ // fwd
29
+ template <class A, class T, size_t I>
30
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<common>) noexcept;
31
+
32
+ template <class A>
33
+ XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<common>) noexcept;
34
+ template <class A>
35
+ XSIMD_INLINE void transpose(batch<uint8_t, A>* matrix_begin, batch<uint8_t, A>* matrix_end, requires_arch<common>) noexcept;
36
+
37
+ namespace detail
38
+ {
39
+ XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
40
+ {
41
+ low = _mm256_castsi256_si128(val);
42
+ high = _mm256_extractf128_si256(val, 1);
43
+ }
44
+ XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
45
+ {
46
+ low = _mm256_castps256_ps128(val);
47
+ high = _mm256_extractf128_ps(val, 1);
48
+ }
49
+ XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
50
+ {
51
+ low = _mm256_castpd256_pd128(val);
52
+ high = _mm256_extractf128_pd(val, 1);
53
+ }
54
+ XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
55
+ {
56
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1);
57
+ }
58
+ XSIMD_INLINE __m256 merge_sse(__m128 low, __m128 high) noexcept
59
+ {
60
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1);
61
+ }
62
+ XSIMD_INLINE __m256d merge_sse(__m128d low, __m128d high) noexcept
63
+ {
64
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
65
+ }
66
+ template <class F>
67
+ XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
68
+ {
69
+ __m128i self_low, self_high;
70
+ split_avx(self, self_low, self_high);
71
+ __m128i res_low = f(self_low);
72
+ __m128i res_high = f(self_high);
73
+ return merge_sse(res_low, res_high);
74
+ }
75
+ template <class F>
76
+ XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
77
+ {
78
+ __m128i self_low, self_high, other_low, other_high;
79
+ split_avx(self, self_low, self_high);
80
+ split_avx(other, other_low, other_high);
81
+ __m128i res_low = f(self_low, other_low);
82
+ __m128i res_high = f(self_high, other_high);
83
+ return merge_sse(res_low, res_high);
84
+ }
85
+ template <class F>
86
+ XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
87
+ {
88
+ __m128i self_low, self_high;
89
+ split_avx(self, self_low, self_high);
90
+ __m128i res_low = f(self_low, other);
91
+ __m128i res_high = f(self_high, other);
92
+ return merge_sse(res_low, res_high);
93
+ }
94
+ }
95
+
96
+ // abs
97
+ template <class A>
98
+ XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<avx>) noexcept
99
+ {
100
+ __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
101
+ return _mm256_andnot_ps(sign_mask, self);
102
+ }
103
+ template <class A>
104
+ XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<avx>) noexcept
105
+ {
106
+ __m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31
107
+ return _mm256_andnot_pd(sign_mask, self);
108
+ }
109
+
110
+ // add
111
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
112
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
113
+ {
114
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
115
+ { return add(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
116
+ self, other);
117
+ }
118
+ template <class A>
119
+ XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
120
+ {
121
+ return _mm256_add_ps(self, other);
122
+ }
123
+ template <class A>
124
+ XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
125
+ {
126
+ return _mm256_add_pd(self, other);
127
+ }
128
+
129
+ // all
130
+ template <class A>
131
+ XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
132
+ {
133
+ return _mm256_testc_ps(self, batch_bool<float, A>(true)) != 0;
134
+ }
135
+ template <class A>
136
+ XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
137
+ {
138
+ return _mm256_testc_pd(self, batch_bool<double, A>(true)) != 0;
139
+ }
140
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
141
+ XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
142
+ {
143
+ return _mm256_testc_si256(self, batch_bool<T, A>(true)) != 0;
144
+ }
145
+
146
+ // any
147
+ template <class A>
148
+ XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
149
+ {
150
+ return !_mm256_testz_ps(self, self);
151
+ }
152
+ template <class A>
153
+ XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
154
+ {
155
+ return !_mm256_testz_pd(self, self);
156
+ }
157
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
158
+ XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
159
+ {
160
+ return !_mm256_testz_si256(self, self);
161
+ }
162
+
163
+ // batch_bool_cast
164
+ template <class A, class T_out, class T_in>
165
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
166
+ {
167
+ return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
168
+ }
169
+
170
+ // bitwise_and
171
+ template <class A>
172
+ XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
173
+ {
174
+ return _mm256_and_ps(self, other);
175
+ }
176
+ template <class A>
177
+ XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
178
+ {
179
+ return _mm256_and_pd(self, other);
180
+ }
181
+
182
+ template <class A>
183
+ XSIMD_INLINE batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
184
+ {
185
+ return _mm256_and_ps(self, other);
186
+ }
187
+ template <class A>
188
+ XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
189
+ {
190
+ return _mm256_and_pd(self, other);
191
+ }
192
+
193
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
194
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
195
+ {
196
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
197
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
198
+ self, other);
199
+ }
200
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
201
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
202
+ {
203
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
204
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
205
+ self, other);
206
+ }
207
+
208
+ // bitwise_andnot
209
+ template <class A>
210
+ XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
211
+ {
212
+ return _mm256_andnot_ps(other, self);
213
+ }
214
+ template <class A>
215
+ XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
216
+ {
217
+ return _mm256_andnot_pd(other, self);
218
+ }
219
+
220
+ template <class A>
221
+ XSIMD_INLINE batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
222
+ {
223
+ return _mm256_andnot_ps(other, self);
224
+ }
225
+ template <class A>
226
+ XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
227
+ {
228
+ return _mm256_andnot_pd(other, self);
229
+ }
230
+
231
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
232
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
233
+ {
234
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
235
+ { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
236
+ self, other);
237
+ }
238
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
239
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
240
+ {
241
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
242
+ { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
243
+ self, other);
244
+ }
245
+
246
+ // bitwise_lshift
247
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
248
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
249
+ {
250
+ return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
251
+ { return bitwise_lshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
252
+ self, other);
253
+ }
254
+
255
+ // bitwise_not
256
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
257
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx>) noexcept
258
+ {
259
+ return detail::fwd_to_sse([](__m128i s) noexcept
260
+ { return bitwise_not(batch<T, sse4_2>(s), sse4_2 {}); },
261
+ self);
262
+ }
263
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
264
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
265
+ {
266
+ return detail::fwd_to_sse([](__m128i s) noexcept
267
+ { return bitwise_not(batch_bool<T, sse4_2>(s), sse4_2 {}); },
268
+ self);
269
+ }
270
+
271
+ // bitwise_or
272
+ template <class A>
273
+ XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
274
+ {
275
+ return _mm256_or_ps(self, other);
276
+ }
277
+ template <class A>
278
+ XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
279
+ {
280
+ return _mm256_or_pd(self, other);
281
+ }
282
+ template <class A>
283
+ XSIMD_INLINE batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
284
+ {
285
+ return _mm256_or_ps(self, other);
286
+ }
287
+ template <class A>
288
+ XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
289
+ {
290
+ return _mm256_or_pd(self, other);
291
+ }
292
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
293
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
294
+ {
295
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
296
+ { return bitwise_or(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
297
+ self, other);
298
+ }
299
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
300
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
301
+ {
302
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
303
+ { return bitwise_or(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); },
304
+ self, other);
305
+ }
306
+
307
+ // bitwise_rshift
308
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
309
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
310
+ {
311
+ return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
312
+ { return bitwise_rshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
313
+ self, other);
314
+ }
315
+
316
+ // bitwise_xor
317
+ template <class A>
318
+ XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
319
+ {
320
+ return _mm256_xor_ps(self, other);
321
+ }
322
+ template <class A>
323
+ XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
324
+ {
325
+ return _mm256_xor_pd(self, other);
326
+ }
327
+ template <class A>
328
+ XSIMD_INLINE batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
329
+ {
330
+ return _mm256_xor_ps(self, other);
331
+ }
332
+ template <class A>
333
+ XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
334
+ {
335
+ return _mm256_xor_pd(self, other);
336
+ }
337
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
338
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
339
+ {
340
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
341
+ { return bitwise_xor(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
342
+ self, other);
343
+ }
344
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
345
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
346
+ {
347
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
348
+ { return bitwise_xor(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2 {}); },
349
+ self, other);
350
+ }
351
+
352
+ // bitwise_cast
353
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
354
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
355
+ {
356
+ return _mm256_castsi256_ps(self);
357
+ }
358
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
359
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
360
+ {
361
+ return _mm256_castsi256_pd(self);
362
+ }
363
+ template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
364
+ XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx>) noexcept
365
+ {
366
+ return batch<Tp, A>(self.data);
367
+ }
368
+ template <class A>
369
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
370
+ {
371
+ return _mm256_castps_pd(self);
372
+ }
373
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
374
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
375
+ {
376
+ return _mm256_castps_si256(self);
377
+ }
378
+ template <class A>
379
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
380
+ {
381
+ return _mm256_castpd_ps(self);
382
+ }
383
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
384
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
385
+ {
386
+ return _mm256_castpd_si256(self);
387
+ }
388
+
389
+ // bitwise_not
390
+ template <class A>
391
+ XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx>) noexcept
392
+ {
393
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
394
+ }
395
+ template <class A>
396
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx>) noexcept
397
+ {
398
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
399
+ }
400
+ template <class A>
401
+ XSIMD_INLINE batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
402
+ {
403
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
404
+ }
405
+ template <class A>
406
+ XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
407
+ {
408
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
409
+ }
410
+
411
+ // broadcast
412
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
413
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx>) noexcept
414
+ {
415
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
416
+ {
417
+ return _mm256_set1_epi8(val);
418
+ }
419
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
420
+ {
421
+ return _mm256_set1_epi16(val);
422
+ }
423
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
424
+ {
425
+ return _mm256_set1_epi32(val);
426
+ }
427
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
428
+ {
429
+ return _mm256_set1_epi64x(val);
430
+ }
431
+ else
432
+ {
433
+ assert(false && "unsupported");
434
+ return {};
435
+ }
436
+ }
437
+ template <class A>
438
+ XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<avx>) noexcept
439
+ {
440
+ return _mm256_set1_ps(val);
441
+ }
442
+ template <class A>
443
+ XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<avx>) noexcept
444
+ {
445
+ return _mm256_set1_pd(val);
446
+ }
447
+
448
+ // ceil
449
+ template <class A>
450
+ XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx>) noexcept
451
+ {
452
+ return _mm256_ceil_ps(self);
453
+ }
454
+ template <class A>
455
+ XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx>) noexcept
456
+ {
457
+ return _mm256_ceil_pd(self);
458
+ }
459
+
460
+ namespace detail
461
+ {
462
+ // On clang, _mm256_extractf128_ps is built upon build_shufflevector
463
+ // which require index parameter to be a constant
464
+ template <int index, class B>
465
+ XSIMD_INLINE B get_half_complex_f(const B& real, const B& imag) noexcept
466
+ {
467
+ __m128 tmp0 = _mm256_extractf128_ps(real, index);
468
+ __m128 tmp1 = _mm256_extractf128_ps(imag, index);
469
+ __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1);
470
+ tmp0 = _mm_unpacklo_ps(tmp0, tmp1);
471
+ __m256 res = real;
472
+ res = _mm256_insertf128_ps(res, tmp0, 0);
473
+ res = _mm256_insertf128_ps(res, tmp2, 1);
474
+ return res;
475
+ }
476
+ template <int index, class B>
477
+ XSIMD_INLINE B get_half_complex_d(const B& real, const B& imag) noexcept
478
+ {
479
+ __m128d tmp0 = _mm256_extractf128_pd(real, index);
480
+ __m128d tmp1 = _mm256_extractf128_pd(imag, index);
481
+ __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1);
482
+ tmp0 = _mm_unpacklo_pd(tmp0, tmp1);
483
+ __m256d res = real;
484
+ res = _mm256_insertf128_pd(res, tmp0, 0);
485
+ res = _mm256_insertf128_pd(res, tmp2, 1);
486
+ return res;
487
+ }
488
+
489
+ // complex_low
490
+ template <class A>
491
+ XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
492
+ {
493
+ return get_half_complex_f<0>(self.real(), self.imag());
494
+ }
495
+ template <class A>
496
+ XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
497
+ {
498
+ return get_half_complex_d<0>(self.real(), self.imag());
499
+ }
500
+
501
+ // complex_high
502
+ template <class A>
503
+ XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
504
+ {
505
+ return get_half_complex_f<1>(self.real(), self.imag());
506
+ }
507
+ template <class A>
508
+ XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
509
+ {
510
+ return get_half_complex_d<1>(self.real(), self.imag());
511
+ }
512
+ }
513
+
514
+ // fast_cast
515
+ namespace detail
516
+ {
517
+ template <class A>
518
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
519
+ {
520
+ return _mm256_cvtepi32_ps(self);
521
+ }
522
+
523
+ template <class A>
524
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
525
+ {
526
+ return _mm256_cvttps_epi32(self);
527
+ }
528
+ }
529
+
530
+ // decr_if
531
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
532
+ XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
533
+ {
534
+ return self + batch<T, A>(mask.data);
535
+ }
536
+
537
+ // div
538
+ template <class A>
539
+ XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
540
+ {
541
+ return _mm256_div_ps(self, other);
542
+ }
543
+ template <class A>
544
+ XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
545
+ {
546
+ return _mm256_div_pd(self, other);
547
+ }
548
+
549
+ // eq
550
+ template <class A>
551
+ XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
552
+ {
553
+ return _mm256_cmp_ps(self, other, _CMP_EQ_OQ);
554
+ }
555
+ template <class A>
556
+ XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
557
+ {
558
+ return _mm256_cmp_pd(self, other, _CMP_EQ_OQ);
559
+ }
560
+ template <class A>
561
+ XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
562
+ {
563
+ return ~(self != other);
564
+ }
565
+ template <class A>
566
+ XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
567
+ {
568
+ return ~(self != other);
569
+ }
570
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
571
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
572
+ {
573
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
574
+ { return eq(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
575
+ self, other);
576
+ }
577
+
578
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
579
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
580
+ {
581
+ return ~(self != other);
582
+ }
583
+
584
+ // floor
585
+ template <class A>
586
+ XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<avx>) noexcept
587
+ {
588
+ return _mm256_floor_ps(self);
589
+ }
590
+ template <class A>
591
+ XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<avx>) noexcept
592
+ {
593
+ return _mm256_floor_pd(self);
594
+ }
595
+
596
+ // from_mask
597
+ template <class A>
598
+ XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<avx>) noexcept
599
+ {
600
+ alignas(A::alignment()) static const uint64_t lut32[] = {
601
+ 0x0000000000000000ul,
602
+ 0x00000000FFFFFFFFul,
603
+ 0xFFFFFFFF00000000ul,
604
+ 0xFFFFFFFFFFFFFFFFul,
605
+ };
606
+ assert(!(mask & ~0xFFul) && "inbound mask");
607
+ return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6]));
608
+ }
609
+ template <class A>
610
+ XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<avx>) noexcept
611
+ {
612
+ alignas(A::alignment()) static const uint64_t lut64[][4] = {
613
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
614
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
615
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
616
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
617
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
618
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
619
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
620
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
621
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
622
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
623
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
624
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
625
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
626
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
627
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
628
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
629
+ };
630
+ assert(!(mask & ~0xFul) && "inbound mask");
631
+ return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask]));
632
+ }
633
+ template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
634
+ XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx>) noexcept
635
+ {
636
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
637
+ {
638
+ alignas(A::alignment()) static const uint32_t lut32[] = {
639
+ 0x00000000,
640
+ 0x000000FF,
641
+ 0x0000FF00,
642
+ 0x0000FFFF,
643
+ 0x00FF0000,
644
+ 0x00FF00FF,
645
+ 0x00FFFF00,
646
+ 0x00FFFFFF,
647
+ 0xFF000000,
648
+ 0xFF0000FF,
649
+ 0xFF00FF00,
650
+ 0xFF00FFFF,
651
+ 0xFFFF0000,
652
+ 0xFFFF00FF,
653
+ 0xFFFFFF00,
654
+ 0xFFFFFFFF,
655
+ };
656
+ assert(!(mask & ~0xFFFFFFFFul) && "inbound mask");
657
+ return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF],
658
+ lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF],
659
+ lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF],
660
+ lut32[(mask >> 24) & 0xF], lut32[mask >> 28]);
661
+ }
662
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
663
+ {
664
+ alignas(A::alignment()) static const uint64_t lut64[] = {
665
+ 0x0000000000000000ul,
666
+ 0x000000000000FFFFul,
667
+ 0x00000000FFFF0000ul,
668
+ 0x00000000FFFFFFFFul,
669
+ 0x0000FFFF00000000ul,
670
+ 0x0000FFFF0000FFFFul,
671
+ 0x0000FFFFFFFF0000ul,
672
+ 0x0000FFFFFFFFFFFFul,
673
+ 0xFFFF000000000000ul,
674
+ 0xFFFF00000000FFFFul,
675
+ 0xFFFF0000FFFF0000ul,
676
+ 0xFFFF0000FFFFFFFFul,
677
+ 0xFFFFFFFF00000000ul,
678
+ 0xFFFFFFFF0000FFFFul,
679
+ 0xFFFFFFFFFFFF0000ul,
680
+ 0xFFFFFFFFFFFFFFFFul,
681
+ };
682
+ assert(!(mask & ~0xFFFFul) && "inbound mask");
683
+ return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]);
684
+ }
685
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
686
+ {
687
+ return _mm256_castps_si256(from_mask(batch_bool<float, A> {}, mask, avx {}));
688
+ }
689
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
690
+ {
691
+ return _mm256_castpd_si256(from_mask(batch_bool<double, A> {}, mask, avx {}));
692
+ }
693
+ }
694
+
695
+ // haddp
696
+ template <class A>
697
+ XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
698
+ {
699
+ // row = (a,b,c,d,e,f,g,h)
700
+ // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7)
701
+ __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]);
702
+ // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7)
703
+ __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]);
704
+ // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
705
+ // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7)
706
+ tmp1 = _mm256_hadd_ps(tmp0, tmp1);
707
+ // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7)
708
+ tmp0 = _mm256_hadd_ps(row[4], row[5]);
709
+ // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7)
710
+ __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]);
711
+ // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3,
712
+ // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
713
+ tmp2 = _mm256_hadd_ps(tmp0, tmp2);
714
+ // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
715
+ // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
716
+ tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000);
717
+ // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7,
718
+ // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3)
719
+ tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21);
720
+ return _mm256_add_ps(tmp0, tmp1);
721
+ }
722
+ template <class A>
723
+ XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx>) noexcept
724
+ {
725
+ // row = (a,b,c,d)
726
+ // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3)
727
+ __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]);
728
+ // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3)
729
+ __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]);
730
+ // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3)
731
+ __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100);
732
+ // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3)
733
+ tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21);
734
+ return _mm256_add_pd(tmp1, tmp2);
735
+ }
736
+
737
+ // incr_if
738
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
739
+ XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
740
+ {
741
+ return self - batch<T, A>(mask.data);
742
+ }
743
+
744
+ // insert
745
+ template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
746
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
747
+ {
748
+ #if !defined(_MSC_VER) || _MSC_VER > 1900
749
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
750
+ {
751
+ return _mm256_insert_epi8(self, val, I);
752
+ }
753
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
754
+ {
755
+ return _mm256_insert_epi16(self, val, I);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_insert_epi32(self, val, I);
+ }
+ else
+ {
+ return insert(self, val, pos, common {});
+ }
+ #endif
+ return insert(self, val, pos, common {});
+ }
+
+ // isnan
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, self, _CMP_UNORD_Q);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, self, _CMP_UNORD_Q);
+ }
+
+ // le
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_LE_OQ);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_LE_OQ);
+ }
+
+ // load_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_si256((__m256i const*)mem);
+ }
+ template <class A>
+ XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_ps(mem);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_pd(mem);
+ }
+
+ namespace detail
+ {
+ // load_complex
+ template <class A>
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ __m128 tmp0 = _mm256_extractf128_ps(hi, 0);
+ __m128 tmp1 = _mm256_extractf128_ps(hi, 1);
+ __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
+ __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
+ batch_type real = _mm256_castps128_ps256(tmp_real);
+ batch_type imag = _mm256_castps128_ps256(tmp_imag);
+
+ tmp0 = _mm256_extractf128_ps(lo, 0);
+ tmp1 = _mm256_extractf128_ps(lo, 1);
+ tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
+ tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
+ real = _mm256_insertf128_ps(real, tmp_real, 1);
+ imag = _mm256_insertf128_ps(imag, tmp_imag, 1);
+ return { real, imag };
+ }
+ template <class A>
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ __m128d tmp0 = _mm256_extractf128_pd(hi, 0);
+ __m128d tmp1 = _mm256_extractf128_pd(hi, 1);
+ batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1));
+ batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1));
+
+ tmp0 = _mm256_extractf128_pd(lo, 0);
+ tmp1 = _mm256_extractf128_pd(lo, 1);
+ __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1);
+ __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1);
+ real = _mm256_blend_pd(real, re_tmp1, 12);
+ imag = _mm256_blend_pd(imag, im_tmp1, 12);
+ return { real, imag };
+ }
+ }
+
+ // load_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_si256((__m256i const*)mem);
+ }
+ template <class A>
+ XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_ps(mem);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_pd(mem);
+ }
+
+ // lt
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_LT_OQ);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_LT_OQ);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return lt(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // mask
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
+ __m128i self_low, self_high;
+ detail::split_avx(self, self_low, self_high);
+ return mask(batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask(batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof(T))));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_movemask_ps(_mm256_castsi256_ps(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_movemask_pd(_mm256_castsi256_pd(self));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_movemask_ps(self);
+ }
+
+ template <class A>
+ XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_movemask_pd(self);
+ }
+
+ // max
+ template <class A>
+ XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_max_ps(other, self);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_max_pd(other, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return select(self > other, self, other);
+ }
+
+ // min
+ template <class A>
+ XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_min_ps(other, self);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_min_pd(other, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return select(self <= other, self, other);
+ }
+
+ // mul
+ template <class A>
+ XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_mul_ps(self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_mul_pd(self, other);
+ }
+
+ // nearbyint
+ template <class A>
+ XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtps_epi32(self);
+ }
+
+ // neg
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return 0 - self;
+ }
+ template <class A>
+ XSIMD_INLINE batch<float, A> neg(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000)));
+ }
+
+ // neq
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self == other);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data)));
+ }
+
+ // reciprocal
+ template <class A>
+ XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self,
+ kernel::requires_arch<avx>) noexcept
+ {
+ return _mm256_rcp_ps(self);
+ }
+
+ // reduce_add
+ template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ typename batch<T, sse4_2>::register_type low, high;
+ detail::split_avx(self, low, high);
+ batch<T, sse4_2> blow(low), bhigh(high);
+ return reduce_add(blow + bhigh);
+ }
+
+ // reduce_max
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(1, 0);
+ batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
+ batch<T, A> acc = max(self, step);
+ __m128i low = _mm256_castsi256_si128(acc);
+ return reduce_max(batch<T, sse4_2>(low));
+ }
+
+ // reduce_min
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(1, 0);
+ batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
+ batch<T, A> acc = min(self, step);
+ __m128i low = _mm256_castsi256_si128(acc);
+ return reduce_min(batch<T, sse4_2>(low));
+ }
+
+ // reduce_mul
+ template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+ XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ typename batch<T, sse4_2>::register_type low, high;
+ detail::split_avx(self, low, high);
+ batch<T, sse4_2> blow(low), bhigh(high);
+ return reduce_mul(blow * bhigh);
+ }
+
+ // rsqrt
+ template <class A>
+ XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_rsqrt_ps(val);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val)));
+ }
+
+ // sadd
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ auto mask = (other >> (8 * sizeof(T) - 1));
+ auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
+ auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
+ return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
+ }
+ else
+ {
+ const auto diffmax = std::numeric_limits<T>::max() - self;
+ const auto mindiff = min(diffmax, other);
+ return self + mindiff;
+ }
+ }
+
+ // select
+ template <class A>
+ XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return _mm256_blendv_ps(false_br, true_br, cond);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return _mm256_blendv_pd(false_br, true_br, cond);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ __m128i cond_low, cond_hi;
+ detail::split_avx(cond, cond_low, cond_hi);
+
+ __m128i true_low, true_hi;
+ detail::split_avx(true_br, true_low, true_hi);
+
+ __m128i false_low, false_hi;
+ detail::split_avx(false_br, false_low, false_hi);
+
+ __m128i res_low = select(batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
+ __m128i res_hi = select(batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});
+ return detail::merge_sse(res_low, res_hi);
+ }
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
+ }
+
+ template <class A, bool... Values>
+ XSIMD_INLINE batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = batch_bool_constant<float, A, Values...>::mask();
+ return _mm256_blend_ps(false_br, true_br, mask);
+ }
+
+ template <class A, bool... Values>
+ XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = batch_bool_constant<double, A, Values...>::mask();
+ return _mm256_blend_pd(false_br, true_br, mask);
+ }
+
+ // set
+ template <class A, class... Values>
+ XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
+ return _mm256_setr_ps(values...);
+ }
+
+ template <class A, class... Values>
+ XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
+ return _mm256_setr_pd(values...);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3) noexcept
+ {
+ return _mm256_set_epi64x(v3, v2, v1, v0);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ {
+ return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ {
+ return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+ {
+ return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+ }
+
+ template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
+ }
+
+ template <class A, class... Values>
+ XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
+ return _mm256_castsi256_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ template <class A, class... Values>
+ XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
+ return _mm256_castsi256_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ // shuffle
+ template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
+ XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
+ {
+ constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
+ // shuffle within lane
+ if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12)
+ return _mm256_shuffle_ps(x, y, smask);
+
+ // shuffle within opposite lane
+ if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I2 < 4 && I3 < 4 && I0 >= 8 && I0 < 12 && I1 >= 8 && I1 < 12)
+ return _mm256_shuffle_ps(y, x, smask);
+
+ return shuffle(x, y, mask, common {});
+ }
+
+ template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
+ XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
+ {
+ constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
+ // shuffle within lane
+ if (I0 < 2 && I1 >= 4 && I1 < 6 && I2 >= 2 && I2 < 4 && I3 >= 6)
+ return _mm256_shuffle_pd(x, y, smask);
+
+ // shuffle within opposite lane
+ if (I1 < 2 && I0 >= 4 && I0 < 6 && I3 >= 2 && I3 < 4 && I2 >= 6)
+ return _mm256_shuffle_pd(y, x, smask);
+
+ return shuffle(x, y, mask, common {});
+ }
+
+ // slide_left
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ __m128i low = _mm256_castsi256_si128(x);
+ auto y = _mm_slli_si128(low, M);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, y, 1);
+ }
+ if (BitCount == 128)
+ {
+ __m128i low = _mm256_castsi256_si128(x);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, low, 1);
+ }
+ // shifting by [0, 128[ bits
+ constexpr unsigned M = BitCount / 8;
+
+ __m128i low = _mm256_castsi256_si128(x);
+ auto ylow = _mm_slli_si128(low, M);
+ auto zlow = _mm_srli_si128(low, 16 - M);
+
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ auto yhigh = _mm_slli_si128(high, M);
+
+ __m256i res = _mm256_castsi128_si256(ylow);
+ return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1);
+ }
+
+ // slide_right
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ __m128i y = _mm_srli_si128(high, M);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, y, 0);
+ }
+ if (BitCount == 128)
+ {
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ return _mm256_castsi128_si256(high);
+ }
+ // shifting by [0, 128[ bits
+ constexpr unsigned M = BitCount / 8;
+
+ __m128i low = _mm256_castsi256_si128(x);
+ auto ylow = _mm_srli_si128(low, M);
+
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ auto yhigh = _mm_srli_si128(high, M);
+ auto zhigh = _mm_slli_si128(high, 16 - M);
+
+ __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh));
+ return _mm256_insertf128_si256(res, yhigh, 1);
+ }
+
+ // sqrt
+ template <class A>
+ XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_sqrt_ps(val);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_sqrt_pd(val);
+ }
+
+ // ssub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ return sadd(self, -other);
+ }
+ else
+ {
+ const auto diff = min(self, other);
+ return self - diff;
+ }
+ }
+
+ // store_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_si256((__m256i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_si256((__m256i*)mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_ps(mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_pd(mem, self);
+ }
+
+ // store_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_si256((__m256i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_si256((__m256i*)mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_ps(mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_pd(mem, self);
+ }
+
+ // sub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return sub(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_sub_ps(self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_sub_pd(self, other);
+ }
+
+ // swizzle (dynamic mask)
+ template <class A>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
+ {
+ // duplicate low and high part of input
+ // Duplicate lanes separately
+ // 1) duplicate low and high lanes
+ __m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+ __m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
+
+ // normalize mask
+ batch<uint32_t, A> half_mask = mask % 4;
+
+ // permute within each lane
+ __m256 r0 = _mm256_permutevar_ps(lo, half_mask);
+ __m256 r1 = _mm256_permutevar_ps(hi, half_mask);
+
+ batch_bool<uint32_t, A> blend_mask = mask >= 4;
+ return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
+ {
+ // duplicate low and high part of input
+ __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+ __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+
+ // normalize mask
+ batch<uint64_t, A> half_mask = -(mask & 1);
+
+ // permute within each lane
+ __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
+ __m256d r1 = _mm256_permutevar_pd(hi, half_mask);
+
+ // mask to choose the right lane
+ batch_bool<uint64_t, A> blend_mask = mask >= 2;
+
+ // blend the two permutes
+ return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
+ }
+
+ template <class A, typename T, detail::enable_sized_integral_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint32_t, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<float>(self), mask));
+ }
+
+ template <class A, typename T, detail::enable_sized_integral_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A>
+ swizzle(batch<T, A> const& self, batch<uint64_t, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<double>(self), mask));
+ }
+
+ // swizzle (constant mask)
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx>) noexcept
+ {
+ constexpr bool is_identity = detail::is_identity(mask);
+ constexpr bool is_dup_low = detail::is_dup_lo(mask);
+ constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+ constexpr bool is_dup = is_dup_low || is_dup_hi;
+ XSIMD_IF_CONSTEXPR(is_identity)
+ {
+ return self;
+ }
+ XSIMD_IF_CONSTEXPR(is_dup)
+ {
+ constexpr auto control = is_dup_low ? 0x00 : 0x11;
+ constexpr auto is_dup_identity = is_dup_low ? detail::is_identity<uint32_t, V0, V1, V2, V3>() : detail::is_identity<int64_t, V4 - 4, V5 - 4, V6 - 4, V7 - 4>();
+ auto split = _mm256_permute2f128_ps(self, self, control);
+ XSIMD_IF_CONSTEXPR(!is_dup_identity)
+ {
+ constexpr auto shuffle_mask = is_dup_low ? detail::mod_shuffle(V0, V1, V2, V3) : detail::mod_shuffle(V4 - 4, V5 - 4, V6 - 4, V7 - 4);
+ split = _mm256_permute_ps(split, shuffle_mask);
+ }
+ return split;
+ }
+ // Duplicate lanes separately
+ // 1) duplicate low and high lanes
+ __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+ __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
+
+ // 2) build lane-local index vector (each element = source_index & 3)
+ constexpr batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+
+ __m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
+ __m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch()); // pick from high lane
+
+ constexpr batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};
+
+ return _mm256_blend_ps(r0, r1, lane_mask.mask());
+ }
+
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
+ {
+ // cannot use detail::mod_shuffle as the mod and shift are different in this case
+ constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+ XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+ XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+ {
+ return _mm256_permute_pd(self, imm);
+ }
+ // duplicate low and high part of input
+ __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+ __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+
+ // permute within each lane
+ __m256d r0 = _mm256_permute_pd(lo, imm);
+ __m256d r1 = _mm256_permute_pd(hi, imm);
+
+ // mask to choose the right lane
+ constexpr batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+
+ // blend the two permutes
+ return _mm256_blend_pd(r0, r1, blend_mask.mask());
+ }
+ template <class A,
+ typename T,
+ uint32_t V0,
+ uint32_t V1,
+ uint32_t V2,
+ uint32_t V3,
+ uint32_t V4,
+ uint32_t V5,
+ uint32_t V6,
+ uint32_t V7,
+ detail::enable_sized_integral_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
+ batch_constant<uint32_t, A,
+ V0,
+ V1,
+ V2,
+ V3,
+ V4,
+ V5,
+ V6,
+ V7> const& mask,
+ requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<float>(self), mask));
+ }
+
+ template <class A,
+ typename T,
+ uint64_t V0,
+ uint64_t V1,
+ uint64_t V2,
+ uint64_t V3,
+ detail::enable_sized_integral_t<T, 8> = 0>
+ XSIMD_INLINE batch<T, A>
+ swizzle(batch<T, A> const& self,
+ batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
+ requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<double>(self), mask));
+ }
+ // transpose
+ template <class A>
+ XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
+ (void)matrix_end;
+ // See
+ // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2
+ auto r0 = matrix_begin[0], r1 = matrix_begin[1],
+ r2 = matrix_begin[2], r3 = matrix_begin[3],
+ r4 = matrix_begin[4], r5 = matrix_begin[5],
+ r6 = matrix_begin[6], r7 = matrix_begin[7];
+
+ auto t0 = _mm256_unpacklo_ps(r0, r1);
+ auto t1 = _mm256_unpackhi_ps(r0, r1);
+ auto t2 = _mm256_unpacklo_ps(r2, r3);
+ auto t3 = _mm256_unpackhi_ps(r2, r3);
+ auto t4 = _mm256_unpacklo_ps(r4, r5);
+ auto t5 = _mm256_unpackhi_ps(r4, r5);
+ auto t6 = _mm256_unpacklo_ps(r6, r7);
+ auto t7 = _mm256_unpackhi_ps(r6, r7);
+
+ r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
+ r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
+ r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
+ r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
+ r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
+ r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
+ r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
+ r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));
+
+ matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20);
+ matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20);
+ matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20);
+ matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20);
+ matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31);
+ matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31);
+ matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31);
+ matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31);
+ }
+
+ template <class A>
+ XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
+ }
+
+ template <class A>
+ XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
+ (void)matrix_end;
+ auto r0 = matrix_begin[0], r1 = matrix_begin[1],
+ r2 = matrix_begin[2], r3 = matrix_begin[3];
+
+ auto t0 = _mm256_unpacklo_pd(r0, r1); // r00 r10 r01 r11
+ auto t1 = _mm256_unpackhi_pd(r0, r1); // r02 r12 r03 r13
+ auto t2 = _mm256_unpacklo_pd(r2, r3); // r20 r30 r21 r31
+ auto t3 = _mm256_unpackhi_pd(r2, r3); // r22 r32 r23 r33
+
+ matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20);
+ matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20);
+ matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31);
+ matrix_begin[3] = _mm256_permute2f128_pd(t1, t3, 0x31);
+ }
+
+ template <class A>
+ XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
+ }
+
+ template <class A>
+ XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ assert((matrix_end - matrix_begin == batch<uint16_t, A>::size) && "correctly sized matrix");
+ (void)matrix_end;
+ batch<uint16_t, sse4_2> tmp_lo0[8];
+ for (int i = 0; i < 8; ++i)
+ tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]);
+ transpose(tmp_lo0 + 0, tmp_lo0 + 8, sse4_2 {});
+
+ batch<uint16_t, sse4_2> tmp_hi0[8];
+ for (int i = 0; i < 8; ++i)
+ tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[8 + i]);
+ transpose(tmp_hi0 + 0, tmp_hi0 + 8, sse4_2 {});
+
+ batch<uint16_t, sse4_2> tmp_lo1[8];
+ for (int i = 0; i < 8; ++i)
+ tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1);
+ transpose(tmp_lo1 + 0, tmp_lo1 + 8, sse4_2 {});
+
+ batch<uint16_t, sse4_2> tmp_hi1[8];
+ for (int i = 0; i < 8; ++i)
+ tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[8 + i], 1);
+ transpose(tmp_hi1 + 0, tmp_hi1 + 8, sse4_2 {});
+
+ for (int i = 0; i < 8; ++i)
+ matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]);
+ for (int i = 0; i < 8; ++i)
+ matrix_begin[i + 8] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]);
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<int16_t, A>* matrix_begin, batch<int16_t, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ return transpose(reinterpret_cast<batch<uint16_t, A>*>(matrix_begin), reinterpret_cast<batch<uint16_t, A>*>(matrix_end), A {});
+ }
+
+ template <class A>
+ XSIMD_INLINE void transpose(batch<uint8_t, A>* matrix_begin, batch<uint8_t, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ assert((matrix_end - matrix_begin == batch<uint8_t, A>::size) && "correctly sized matrix");
+ (void)matrix_end;
+ batch<uint8_t, sse4_2> tmp_lo0[16];
+ for (int i = 0; i < 16; ++i)
+ tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]);
+ transpose(tmp_lo0 + 0, tmp_lo0 + 16, sse4_2 {});
+
+ batch<uint8_t, sse4_2> tmp_hi0[16];
+ for (int i = 0; i < 16; ++i)
+ tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[16 + i]);
+ transpose(tmp_hi0 + 0, tmp_hi0 + 16, sse4_2 {});
+
+ batch<uint8_t, sse4_2> tmp_lo1[16];
+ for (int i = 0; i < 16; ++i)
+ tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1);
+ transpose(tmp_lo1 + 0, tmp_lo1 + 16, sse4_2 {});
+
+ batch<uint8_t, sse4_2> tmp_hi1[16];
+ for (int i = 0; i < 16; ++i)
+ tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[16 + i], 1);
+ transpose(tmp_hi1 + 0, tmp_hi1 + 16, sse4_2 {});
+
+ for (int i = 0; i < 16; ++i)
+ matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]);
+ for (int i = 0; i < 16; ++i)
+ matrix_begin[i + 16] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]);
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<int8_t, A>* matrix_begin, batch<int8_t, A>* matrix_end, requires_arch<avx>) noexcept
+ {
+ return transpose(reinterpret_cast<batch<uint8_t, A>*>(matrix_begin), reinterpret_cast<batch<uint8_t, A>*>(matrix_end), A {});
+ }
+
+ // trunc
+ template <class A>
+ XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_ps(self, _MM_FROUND_TO_ZERO);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_pd(self, _MM_FROUND_TO_ZERO);
+ }
+
+ // zip_hi
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
+ // extract high word
+ __m128i self_hi = _mm256_extractf128_si256(self, 1);
+ __m128i other_hi = _mm256_extractf128_si256(other, 1);
+
+ // interleave
+ __m128i res_lo, res_hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ res_lo = _mm_unpacklo_epi8(self_hi, other_hi);
+ res_hi = _mm_unpackhi_epi8(self_hi, other_hi);
+ }
+ else
+ {
+ res_lo = _mm_unpacklo_epi16(self_hi, other_hi);
+ res_hi = _mm_unpackhi_epi16(self_hi, other_hi);
+ }
+
+ // fuse
+ return _mm256_castps_si256(
+ _mm256_insertf128_ps(
+ _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
+ _mm_castsi128_ps(res_hi),
+ 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_ps(self, other);
+ auto hi = _mm256_unpackhi_ps(self, other);
+ return _mm256_permute2f128_ps(lo, hi, 0x31);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_pd(self, other);
+ auto hi = _mm256_unpackhi_pd(self, other);
+ return _mm256_permute2f128_pd(lo, hi, 0x31);
+ }
+
+ // zip_lo
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
+ // extract low word
+ __m128i self_lo = _mm256_extractf128_si256(self, 0);
+ __m128i other_lo = _mm256_extractf128_si256(other, 0);
+
+ // interleave
+ __m128i res_lo, res_hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ res_lo = _mm_unpacklo_epi8(self_lo, other_lo);
+ res_hi = _mm_unpackhi_epi8(self_lo, other_lo);
+ }
+ else
+ {
+ res_lo = _mm_unpacklo_epi16(self_lo, other_lo);
+ res_hi = _mm_unpackhi_epi16(self_lo, other_lo);
+ }
+
+ // fuse
+ return _mm256_castps_si256(
+ _mm256_insertf128_ps(
+ _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
+ _mm_castsi128_ps(res_hi),
+ 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_ps(self, other);
+ auto hi = _mm256_unpackhi_ps(self, other);
+ return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_pd(self, other);
+ auto hi = _mm256_unpackhi_pd(self, other);
+ return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
+ }
+
+ // first
+ template <class A>
+ XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtss_f32(self);
+ }
+
+ template <class A>
+ XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtsd_f64(self);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFFFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return static_cast<T>(_mm256_cvtsi256_si32(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ batch<T, sse4_2> low = _mm256_castsi256_si128(self);
+ return first(low, sse4_2 {});
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ }
+ }
+
+ #endif
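
Note on the kernels above: they are not called directly by user code; they sit behind xsimd's public batch API, which is what the compiled C++ extensions in this package build on. The sketch below is illustrative only (the file name and the assumption that the vendored xsimd headers are on the include path are ours, not part of the package) and shows how an ordinary batch expression reaches these requires_arch<avx> overloads when AVX is the best extension selected at compile time.

    // avx_demo.cpp -- illustrative sketch, not part of the package.
    // Compile with an AVX-capable compiler flag (e.g. -mavx) so the
    // avx kernels above are the ones selected.
    #include <xsimd/xsimd.hpp>
    #include <iostream>

    int main()
    {
        using b8f = xsimd::batch<float, xsimd::avx>; // 8 floats per 256-bit register

        alignas(32) float in[8] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
        alignas(32) float out[8];

        b8f x = b8f::load_aligned(in);   // dispatches to load_aligned(..., requires_arch<avx>)
        b8f y = xsimd::sqrt(x) + 1.5f;   // sqrt and add kernels, scalar broadcast on the right
        y.store_aligned(out);            // store_aligned(..., requires_arch<avx>)

        std::cout << xsimd::reduce_add(y) << '\n'; // reduce_add(..., requires_arch<avx>)
        return 0;
    }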