sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
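
The single hunk reproduced below is the new file sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp (entry 111 in the list above, +2024 -0); only its first 981 added lines appear in this excerpt.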
@@ -0,0 +1,2024 @@
1
+ /***************************************************************************
2
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3
+ * Martin Renou *
4
+ * Copyright (c) QuantStack *
5
+ * Copyright (c) Serge Guelton *
6
+ * *
7
+ * Distributed under the terms of the BSD 3-Clause License. *
8
+ * *
9
+ * The full license is in the file LICENSE, distributed with this software. *
10
+ ****************************************************************************/
11
+
12
+ #ifndef XSIMD_SSE2_HPP
13
+ #define XSIMD_SSE2_HPP
14
+
15
+ #include <complex>
16
+ #include <limits>
17
+ #include <type_traits>
18
+
19
+ #include "../types/xsimd_sse2_register.hpp"
20
+
21
+ namespace xsimd
22
+ {
23
+ template <typename T, class A, bool... Values>
24
+ struct batch_bool_constant;
25
+
26
+ template <class T_out, class T_in, class A>
27
+ XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
28
+
29
+ template <typename T, class A, T... Values>
30
+ struct batch_constant;
31
+
32
+ namespace kernel
33
+ {
34
+ using namespace types;
35
+
36
+ namespace detail
37
+ {
38
+ constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
39
+ {
40
+ return (z << 6) | (y << 4) | (x << 2) | w;
41
+ }
42
+ constexpr uint32_t shuffle(uint32_t x, uint32_t y)
43
+ {
44
+ return (y << 1) | x;
45
+ }
46
+
47
+ constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
48
+ {
49
+ return shuffle(w % 4, x % 4, y % 4, z % 4);
50
+ }
51
+
52
+ constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x)
53
+ {
54
+ return shuffle(w % 2, x % 2);
55
+ }
56
+ }
57
+
58
+ // fwd
59
+ template <class A, class T, size_t I>
60
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<common>) noexcept;
61
+ template <class A, typename T, typename ITy, ITy... Indices>
62
+ XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<common>) noexcept;
63
+ template <class A, class T>
64
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<common>) noexcept;
65
+ template <class A, class T>
66
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const&, batch<T, A> const&, requires_arch<common>) noexcept;
67
+
68
+ // abs
69
+ template <class A>
70
+ XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<sse2>) noexcept
71
+ {
72
+ __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f = 1 << 31
73
+ return _mm_andnot_pd(sign_mask, self);
74
+ }
75
+ template <class A>
76
+ XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<sse2>) noexcept
77
+ {
78
+ __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
79
+ return _mm_andnot_ps(sign_mask, self);
80
+ }
81
+
82
+ // add
83
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
84
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
85
+ {
86
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
87
+ {
88
+ return _mm_add_epi8(self, other);
89
+ }
90
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
91
+ {
92
+ return _mm_add_epi16(self, other);
93
+ }
94
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
95
+ {
96
+ return _mm_add_epi32(self, other);
97
+ }
98
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
99
+ {
100
+ return _mm_add_epi64(self, other);
101
+ }
102
+ else
103
+ {
104
+ assert(false && "unsupported arch/op combination");
105
+ return {};
106
+ }
107
+ }
108
+
109
+ template <class A>
110
+ XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
111
+ {
112
+ return _mm_add_ps(self, other);
113
+ }
114
+
115
+ template <class A>
116
+ XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
117
+ {
118
+ return _mm_add_pd(self, other);
119
+ }
120
+
121
+ // all
122
+ template <class A>
123
+ XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
124
+ {
125
+ return _mm_movemask_ps(self) == 0x0F;
126
+ }
127
+ template <class A>
128
+ XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
129
+ {
130
+ return _mm_movemask_pd(self) == 0x03;
131
+ }
132
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
133
+ XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
134
+ {
135
+ return _mm_movemask_epi8(self) == 0xFFFF;
136
+ }
137
+
138
+ // any
139
+ template <class A>
140
+ XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
141
+ {
142
+ return _mm_movemask_ps(self) != 0;
143
+ }
144
+ template <class A>
145
+ XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
146
+ {
147
+ return _mm_movemask_pd(self) != 0;
148
+ }
149
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
150
+ XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
151
+ {
152
+ return _mm_movemask_epi8(self) != 0;
153
+ }
154
+
155
+ // avgr
156
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
157
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
158
+ {
159
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
160
+ {
161
+ return _mm_avg_epu8(self, other);
162
+ }
163
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
164
+ {
165
+ return _mm_avg_epu16(self, other);
166
+ }
167
+ else
168
+ {
169
+ return avgr(self, other, common {});
170
+ }
171
+ }
172
+
173
+ // avg
174
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
175
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
176
+ {
177
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
178
+ {
179
+ auto adj = ((self ^ other) << 7) >> 7;
180
+ return avgr(self, other, A {}) - adj;
181
+ }
182
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
183
+ {
184
+ auto adj = ((self ^ other) << 15) >> 15;
185
+ return avgr(self, other, A {}) - adj;
186
+ }
187
+ else
188
+ {
189
+ return avg(self, other, common {});
190
+ }
191
+ }
192
+
193
+ // batch_bool_cast
194
+ template <class A, class T_out, class T_in>
195
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
196
+ {
197
+ return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
198
+ }
199
+
200
+ // bitwise_and
201
+ template <class A>
202
+ XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
203
+ {
204
+ return _mm_and_ps(self, other);
205
+ }
206
+ template <class A>
207
+ XSIMD_INLINE batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
208
+ {
209
+ return _mm_and_ps(self, other);
210
+ }
211
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
212
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
213
+ {
214
+ return _mm_and_si128(self, other);
215
+ }
216
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
217
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
218
+ {
219
+ return _mm_and_si128(self, other);
220
+ }
221
+
222
+ template <class A>
223
+ batch<double, A> XSIMD_INLINE bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
224
+ {
225
+ return _mm_and_pd(self, other);
226
+ }
227
+
228
+ template <class A>
229
+ XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
230
+ {
231
+ return _mm_and_pd(self, other);
232
+ }
233
+
234
+ // bitwise_andnot
235
+ template <class A>
236
+ XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
237
+ {
238
+ return _mm_andnot_ps(other, self);
239
+ }
240
+
241
+ template <class A>
242
+ XSIMD_INLINE batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
243
+ {
244
+ return _mm_andnot_ps(other, self);
245
+ }
246
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
247
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
248
+ {
249
+ return _mm_andnot_si128(other, self);
250
+ }
251
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
252
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
253
+ {
254
+ return _mm_andnot_si128(other, self);
255
+ }
256
+
257
+ template <class A>
258
+ XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
259
+ {
260
+ return _mm_andnot_pd(other, self);
261
+ }
262
+
263
+ template <class A>
264
+ XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
265
+ {
266
+ return _mm_andnot_pd(other, self);
267
+ }
268
+
269
+ // bitwise_lshift
270
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
271
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
272
+ {
273
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
274
+ {
275
+ return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other));
276
+ }
277
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
278
+ {
279
+ return _mm_slli_epi16(self, other);
280
+ }
281
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
282
+ {
283
+ return _mm_slli_epi32(self, other);
284
+ }
285
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
286
+ {
287
+ return _mm_slli_epi64(self, other);
288
+ }
289
+ else
290
+ {
291
+ assert(false && "unsupported arch/op combination");
292
+ return {};
293
+ }
294
+ }
295
+ template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
296
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, requires_arch<sse2>) noexcept
297
+ {
298
+ constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
299
+ static_assert(shift < bits, "Count must be less than the number of bits in T");
300
+ XSIMD_IF_CONSTEXPR(shift == 0)
301
+ {
302
+ return self;
303
+ }
304
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
305
+ {
306
+ // 8-bit left shift via 16-bit shift + mask
307
+ __m128i shifted = _mm_slli_epi16(self, static_cast<int>(shift));
308
+ __m128i mask = _mm_set1_epi8(static_cast<char>(0xFF << shift));
309
+ return _mm_and_si128(shifted, mask);
310
+ }
311
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
312
+ {
313
+ return _mm_slli_epi16(self, static_cast<int>(shift));
314
+ }
315
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
316
+ {
317
+ return _mm_slli_epi32(self, static_cast<int>(shift));
318
+ }
319
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
320
+ {
321
+ return _mm_slli_epi64(self, static_cast<int>(shift));
322
+ }
323
+ return bitwise_lshift<shift>(self, common {});
324
+ }
325
+
326
+ // bitwise_not
327
+ template <class A>
328
+ XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept
329
+ {
330
+ return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
331
+ }
332
+ template <class A>
333
+ XSIMD_INLINE batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
334
+ {
335
+ return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
336
+ }
337
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
338
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<sse2>) noexcept
339
+ {
340
+ return _mm_xor_si128(self, _mm_set1_epi32(-1));
341
+ }
342
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
343
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
344
+ {
345
+ return _mm_xor_si128(self, _mm_set1_epi32(-1));
346
+ }
347
+ template <class A>
348
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<sse2>) noexcept
349
+ {
350
+ return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
351
+ }
352
+ template <class A>
353
+ XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
354
+ {
355
+ return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
356
+ }
357
+
358
+ // bitwise_or
359
+ template <class A>
360
+ XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
361
+ {
362
+ return _mm_or_ps(self, other);
363
+ }
364
+ template <class A>
365
+ XSIMD_INLINE batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
366
+ {
367
+ return _mm_or_ps(self, other);
368
+ }
369
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
370
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
371
+ {
372
+ return _mm_or_si128(self, other);
373
+ }
374
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
375
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
376
+ {
377
+ return _mm_or_si128(self, other);
378
+ }
379
+
380
+ template <class A>
381
+ XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
382
+ {
383
+ return _mm_or_pd(self, other);
384
+ }
385
+
386
+ template <class A>
387
+ XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
388
+ {
389
+ return _mm_or_pd(self, other);
390
+ }
391
+
392
+ // bitwise_rshift
393
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
394
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
395
+ {
396
+ if (std::is_signed<T>::value)
397
+ {
398
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
399
+ {
400
+ __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF);
401
+ __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self);
402
+ __m128i res = _mm_srai_epi16(self, other);
403
+ return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res));
404
+ }
405
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
406
+ {
407
+ return _mm_srai_epi16(self, other);
408
+ }
409
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
410
+ {
411
+ return _mm_srai_epi32(self, other);
412
+ }
413
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
414
+ {
415
+ // from https://github.com/samyvilar/vect/blob/master/vect_128.h
416
+ return _mm_or_si128(
417
+ _mm_srli_epi64(self, other),
418
+ _mm_slli_epi64(
419
+ _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32),
420
+ 64 - other));
421
+ }
422
+ else
423
+ {
424
+ assert(false && "unsupported arch/op combination");
425
+ return {};
426
+ }
427
+ }
428
+ else
429
+ {
430
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
431
+ {
432
+ return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other));
433
+ }
434
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
435
+ {
436
+ return _mm_srli_epi16(self, other);
437
+ }
438
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
439
+ {
440
+ return _mm_srli_epi32(self, other);
441
+ }
442
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
443
+ {
444
+ return _mm_srli_epi64(self, other);
445
+ }
446
+ else
447
+ {
448
+ assert(false && "unsupported arch/op combination");
449
+ return {};
450
+ }
451
+ }
452
+ }
453
+ template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
454
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, requires_arch<sse2>) noexcept
455
+ {
456
+ constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
457
+ static_assert(shift < bits,
458
+ "Shift must be less than the number of value bits in the type");
459
+
460
+ XSIMD_IF_CONSTEXPR(shift == 0)
461
+ {
462
+ return self;
463
+ }
464
+
465
+ XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
466
+ {
467
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
468
+ {
469
+ // 8-bit arithmetic right shift via 16-bit shift + sign-extension handling.
470
+ __m128i shifted = _mm_srai_epi16(self, static_cast<int>(shift));
471
+ __m128i sign_mask = _mm_set1_epi16(static_cast<short>(0xFF00 >> shift));
472
+ __m128i cmp_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self);
473
+ return _mm_or_si128(_mm_and_si128(sign_mask, cmp_negative),
474
+ _mm_andnot_si128(sign_mask, shifted));
475
+ }
476
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
477
+ {
478
+ return _mm_srai_epi16(self, static_cast<int>(shift));
479
+ }
480
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
481
+ {
482
+ return _mm_srai_epi32(self, static_cast<int>(shift));
483
+ }
484
+ // No 64-bit arithmetic right shift in SSE2; fall back
485
+ return bitwise_rshift<shift>(self, common {});
486
+ }
487
+ else // unsigned / logical right shift
488
+ {
489
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
490
+ {
491
+ // Emulate byte-wise logical right shift using 16-bit shifts + per-byte mask.
492
+ __m128i s16 = _mm_srli_epi16(self, static_cast<int>(shift));
493
+ __m128i mask = _mm_set1_epi8(static_cast<char>(0xFFu >> shift));
494
+ return _mm_and_si128(s16, mask);
495
+ }
496
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
497
+ {
498
+ return _mm_srli_epi16(self, static_cast<int>(shift));
499
+ }
500
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
501
+ {
502
+ return _mm_srli_epi32(self, static_cast<int>(shift));
503
+ }
504
+ else // sizeof(T) == 8
505
+ {
506
+ return _mm_srli_epi64(self, static_cast<int>(shift));
507
+ }
508
+ }
509
+ }
510
+
511
+ // bitwise_xor
512
+ template <class A>
513
+ XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
514
+ {
515
+ return _mm_xor_ps(self, other);
516
+ }
517
+ template <class A>
518
+ XSIMD_INLINE batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
519
+ {
520
+ return _mm_xor_ps(self, other);
521
+ }
522
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
523
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
524
+ {
525
+ return _mm_xor_si128(self, other);
526
+ }
527
+ template <class A>
528
+ XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
529
+ {
530
+ return _mm_xor_pd(self, other);
531
+ }
532
+ template <class A>
533
+ XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
534
+ {
535
+ return _mm_xor_pd(self, other);
536
+ }
537
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
538
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
539
+ {
540
+ return _mm_xor_si128(self, other);
541
+ }
542
+
543
+ // bitwise_cast
544
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
545
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
546
+ {
547
+ return _mm_castsi128_ps(self);
548
+ }
549
+ template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
550
+ XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<sse2>) noexcept
551
+ {
552
+ return batch<Tp, A>(self.data);
553
+ }
554
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
555
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
556
+ {
557
+ return _mm_castps_si128(self);
558
+ }
559
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
560
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
561
+ {
562
+ return _mm_castsi128_pd(self);
563
+ }
564
+ template <class A>
565
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
566
+ {
567
+ return _mm_castps_pd(self);
568
+ }
569
+ template <class A>
570
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
571
+ {
572
+ return _mm_castpd_ps(self);
573
+ }
574
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
575
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
576
+ {
577
+ return _mm_castpd_si128(self);
578
+ }
579
+
580
+ // broadcast
581
+ template <class A>
582
+ batch<float, A> XSIMD_INLINE broadcast(float val, requires_arch<sse2>) noexcept
583
+ {
584
+ return _mm_set1_ps(val);
585
+ }
586
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
587
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<sse2>) noexcept
588
+ {
589
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
590
+ {
591
+ return _mm_set1_epi8(val);
592
+ }
593
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
594
+ {
595
+ return _mm_set1_epi16(val);
596
+ }
597
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
598
+ {
599
+ return _mm_set1_epi32(val);
600
+ }
601
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
602
+ {
603
+ return _mm_set1_epi64x(val);
604
+ }
605
+ else
606
+ {
607
+ assert(false && "unsupported arch/op combination");
608
+ return {};
609
+ }
610
+ }
611
+ template <class A>
612
+ XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<sse2>) noexcept
613
+ {
614
+ return _mm_set1_pd(val);
615
+ }
616
+
617
+ // store_complex
618
+ namespace detail
619
+ {
620
+ // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned
621
+ // complex_low
622
+ template <class A>
623
+ XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
624
+ {
625
+ return _mm_unpacklo_ps(self.real(), self.imag());
626
+ }
627
+ // complex_high
628
+ template <class A>
629
+ XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
630
+ {
631
+ return _mm_unpackhi_ps(self.real(), self.imag());
632
+ }
633
+ template <class A>
634
+ XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
635
+ {
636
+ return _mm_unpacklo_pd(self.real(), self.imag());
637
+ }
638
+ template <class A>
639
+ XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
640
+ {
641
+ return _mm_unpackhi_pd(self.real(), self.imag());
642
+ }
643
+ }
644
+
645
+ // decr_if
646
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
647
+ XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
648
+ {
649
+ return self + batch<T, A>(mask.data);
650
+ }
651
+
652
+ // div
653
+ template <class A>
654
+ XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
655
+ {
656
+ return _mm_div_ps(self, other);
657
+ }
658
+ template <class A>
659
+ XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
660
+ {
661
+ return _mm_div_pd(self, other);
662
+ }
663
+
664
+ // fast_cast
665
+ namespace detail
666
+ {
667
+ template <class A>
668
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
669
+ {
670
+ return _mm_cvtepi32_ps(self);
671
+ }
672
+
673
+ template <class A>
674
+ XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
675
+ {
676
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
677
+ // adapted to sse2
678
+ __m128i xH = _mm_srli_epi64(x, 32);
679
+ xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
680
+ __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
681
+ __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
682
+ __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
683
+ return _mm_add_pd(f, _mm_castsi128_pd(xL));
684
+ }
685
+
686
+ template <class A>
687
+ XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
688
+ {
689
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
690
+ // adapted to sse2
691
+ __m128i xH = _mm_srai_epi32(x, 16);
692
+ xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
693
+ xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
694
+ __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
695
+ __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
696
+ __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
697
+ return _mm_add_pd(f, _mm_castsi128_pd(xL));
698
+ }
699
+
700
+ template <class A>
701
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<sse2>) noexcept
702
+ {
703
+ return _mm_cvttps_epi32(self);
704
+ }
705
+ }
706
+
707
+ // eq
708
+ template <class A>
709
+ XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
710
+ {
711
+ return _mm_cmpeq_ps(self, other);
712
+ }
713
+ template <class A>
714
+ XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
715
+ {
716
+ return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other)));
717
+ }
718
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
719
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
720
+ {
721
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
722
+ {
723
+ return _mm_cmpeq_epi8(self, other);
724
+ }
725
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
726
+ {
727
+ return _mm_cmpeq_epi16(self, other);
728
+ }
729
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
730
+ {
731
+ return _mm_cmpeq_epi32(self, other);
732
+ }
733
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
734
+ {
735
+ __m128i tmp1 = _mm_cmpeq_epi32(self, other);
736
+ __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1);
737
+ __m128i tmp3 = _mm_and_si128(tmp1, tmp2);
738
+ __m128i tmp4 = _mm_srai_epi32(tmp3, 31);
739
+ return _mm_shuffle_epi32(tmp4, 0xF5);
740
+ }
741
+ else
742
+ {
743
+ assert(false && "unsupported arch/op combination");
744
+ return {};
745
+ }
746
+ }
747
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
748
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
749
+ {
750
+ return ~(self != other);
751
+ }
752
+ template <class A>
753
+ XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
754
+ {
755
+ return _mm_cmpeq_pd(self, other);
756
+ }
757
+ template <class A>
758
+ XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
759
+ {
760
+ return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
761
+ }
762
+
763
+ // first
764
+ template <class A>
765
+ XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<sse2>) noexcept
766
+ {
767
+ return _mm_cvtss_f32(self);
768
+ }
769
+
770
+ template <class A>
771
+ XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<sse2>) noexcept
772
+ {
773
+ return _mm_cvtsd_f64(self);
774
+ }
775
+
776
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
777
+ XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFFFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return static_cast<T>(_mm_cvtsi128_si32(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ #if defined(__x86_64__)
+ return static_cast<T>(_mm_cvtsi128_si64(self));
+ #else
+ __m128i m;
+ _mm_storel_epi64(&m, self);
+ int64_t i;
+ std::memcpy(&i, &m, sizeof(i));
+ return i;
+ #endif
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ // from_mask
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+ {
+ alignas(A::alignment()) static const uint32_t lut[][4] = {
+ { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+ { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+ { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+ { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+ { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+ { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ };
+ assert(!(mask & ~0xFul) && "inbound mask");
+ return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask]));
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut[][4] = {
+ { 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ };
+ assert(!(mask & ~0x3ul) && "inbound mask");
+ return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask]));
+ }
+ template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut64[] = {
+ 0x0000000000000000,
+ 0x000000000000FFFF,
+ 0x00000000FFFF0000,
+ 0x00000000FFFFFFFF,
+ 0x0000FFFF00000000,
+ 0x0000FFFF0000FFFF,
+ 0x0000FFFFFFFF0000,
+ 0x0000FFFFFFFFFFFF,
+ 0xFFFF000000000000,
+ 0xFFFF00000000FFFF,
+ 0xFFFF0000FFFF0000,
+ 0xFFFF0000FFFFFFFF,
+ 0xFFFFFFFF00000000,
+ 0xFFFFFFFF0000FFFF,
+ 0xFFFFFFFFFFFF0000,
+ 0xFFFFFFFFFFFFFFFF,
+ };
+ alignas(A::alignment()) static const uint32_t lut32[] = {
+ 0x00000000,
+ 0x000000FF,
+ 0x0000FF00,
+ 0x0000FFFF,
+ 0x00FF0000,
+ 0x00FF00FF,
+ 0x00FFFF00,
+ 0x00FFFFFF,
+ 0xFF000000,
+ 0xFF0000FF,
+ 0xFF00FF00,
+ 0xFF00FFFF,
+ 0xFFFF0000,
+ 0xFFFF00FF,
+ 0xFFFFFF00,
+ 0xFFFFFFFF,
+ };
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ assert(!(mask & ~0xFFFF) && "inbound mask");
+ return _mm_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ assert(!(mask & ~0xFF) && "inbound mask");
+ return _mm_set_epi64x(lut64[mask >> 4], lut64[mask & 0xF]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_castps_si128(from_mask(batch_bool<float, A> {}, mask, sse2 {}));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_castpd_si128(from_mask(batch_bool<double, A> {}, mask, sse2 {}));
+ }
+ }
+
+ // ge
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpge_ps(self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpge_pd(self, other);
+ }
+
+ // gt
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpgt_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_cmpgt_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_cmpgt_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_cmpgt_epi32(self, other);
+ }
+ else
+ {
+ return gt(self, other, common {});
+ }
+ }
+ else
+ {
+ return gt(self, other, common {});
+ }
+ }
+
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpgt_pd(self, other);
+ }
+
+ // haddp
+ template <class A>
+ XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse2>) noexcept
+ {
+ __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
+ __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
+ __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
+ tmp0 = _mm_add_ps(tmp0, tmp1);
+ tmp1 = _mm_unpacklo_ps(row[2], row[3]);
+ tmp1 = _mm_add_ps(tmp1, tmp2);
+ tmp2 = _mm_movehl_ps(tmp1, tmp0);
+ tmp0 = _mm_movelh_ps(tmp0, tmp1);
+ return _mm_add_ps(tmp0, tmp2);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse2>) noexcept
+ {
+ return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]),
+ _mm_unpackhi_pd(row[0], row[1]));
+ }
+
+ // incr_if
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
+ {
+ return self - batch<T, A>(mask.data);
+ }
+
+ // insert
+ template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_insert_epi16(self, val, I);
+ }
+ else
+ {
+ return insert(self, val, pos, common {});
+ }
+ }
+
+ // isnan
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpunord_ps(self, self);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpunord_pd(self, self);
+ }
+
+ // load_aligned
+ template <class A>
+ XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
+ {
+ return _mm_load_ps(mem);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
+ {
+ return _mm_load_si128((__m128i const*)mem);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
+ {
+ return _mm_load_pd(mem);
+ }
+
+ // load_unaligned
+ template <class A>
+ XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
+ {
+ return _mm_loadu_ps(mem);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
+ {
+ return _mm_loadu_si128((__m128i const*)mem);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
+ {
+ return _mm_loadu_pd(mem);
+ }
+
+ // load_complex
+ namespace detail
+ {
+ // Redefine these methods in the SSE-based archs if required
+ template <class A>
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<sse2>) noexcept
+ {
+ return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) };
+ }
+ template <class A>
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<sse2>) noexcept
+ {
+ return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) };
+ }
+ }
+
+ // le
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmple_ps(self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmple_pd(self, other);
+ }
+
+ // lt
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmplt_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_cmplt_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_cmplt_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_cmplt_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ __m128i tmp1 = _mm_sub_epi64(self, other);
+ __m128i tmp2 = _mm_xor_si128(self, other);
+ __m128i tmp3 = _mm_andnot_si128(other, self);
+ __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
+ __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
+ __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
+ return _mm_shuffle_epi32(tmp6, 0xF5);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
+ auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
+ __m128i tmp1 = _mm_sub_epi64(xself, xother);
+ __m128i tmp2 = _mm_xor_si128(xself, xother);
+ __m128i tmp3 = _mm_andnot_si128(xother, xself);
+ __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
+ __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
+ __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
+ return _mm_shuffle_epi32(tmp6, 0xF5);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ }
+
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmplt_pd(self, other);
+ }
+
+ /* compression table to turn 0b10 into 0b1,
+ * 0b100010 into 0b101 etc
+ */
+ namespace detail
+ {
+ XSIMD_INLINE int mask_lut(uint64_t mask)
+ {
+ // clang-format off
+ static const int mask_lut[256] = {
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x4, 0x0, 0x5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x0, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x8, 0x0, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0xA, 0x0, 0xB, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0xC, 0x0, 0xD, 0x0, 0x0, 0x0, 0x0, 0x0, 0xE, 0x0, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ };
+ // clang-format on
+ return mask_lut[mask & 0xAA];
+ }
+ }
+
+ // mask
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_movemask_epi8(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ uint64_t mask8 = _mm_movemask_epi8(self);
+ return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_movemask_ps(_mm_castsi128_ps(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_movemask_pd(_mm_castsi128_pd(self));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_ps(self);
+ }
+
+ template <class A>
+ XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_pd(self);
+ }
+
+ // max
+ template <class A>
+ XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_max_ps(other, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return select(self > other, self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_max_pd(other, self);
+ }
+
+ // min
+ template <class A>
+ XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_min_ps(other, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return select(self <= other, self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_min_pd(other, self);
+ }
+
+ // mul
+ template <class A>
+ XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_mul_ps(self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_mul_pd(self, other);
+ }
+
+ // mul
+ template <class A>
+ XSIMD_INLINE batch<int16_t, A> mul(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_mullo_epi16(self, other);
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<sse2>) noexcept
+ {
+ return _mm_cvtps_epi32(self);
+ }
+
+ // neg
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return 0 - self;
+ }
+ template <class A>
+ XSIMD_INLINE batch<float, A> neg(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_pd(
+ self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000)));
+ }
+
+ // neq
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpneq_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return ~(self == other);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data)));
+ }
+
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpneq_pd(self, other);
+ }
+ template <class A>
+ XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_pd(self, other);
+ }
+
+ // reciprocal
+ template <class A>
+ XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self,
+ kernel::requires_arch<sse2>)
+ {
+ return _mm_rcp_ps(self);
+ }
+
+ // reduce_add
+ template <class A>
+ XSIMD_INLINE float reduce_add(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self));
+ __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
+ return _mm_cvtss_f32(tmp1);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
+ __m128i tmp2 = _mm_add_epi32(self, tmp1);
+ __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
+ __m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
+ return _mm_cvtsi128_si32(tmp4);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
+ __m128i tmp2 = _mm_add_epi64(self, tmp1);
+ #if defined(__x86_64__)
+ return _mm_cvtsi128_si64(tmp2);
+ #else
+ __m128i m;
+ _mm_storel_epi64(&m, tmp2);
+ int64_t i;
+ std::memcpy(&i, &m, sizeof(i));
+ return i;
+ #endif
+ }
+ else
+ {
+ return reduce_add(self, common {});
+ }
+ }
+
+ template <class A>
+ XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
+ }
+
+ // reduce_max
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
+ batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
+ batch<T, A> acc0 = max(self, step0);
+
+ constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
+ batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
+ batch<T, A> acc1 = max(acc0, step1);
+
+ constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
+ batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
+ batch<T, A> acc2 = max(acc1, step2);
+ if (sizeof(T) == 2)
+ return first(acc2, A {});
+ batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+ batch<T, A> acc3 = max(acc2, step3);
+ return first(acc3, A {});
+ }
+
+ // reduce_min
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
+ batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
+ batch<T, A> acc0 = min(self, step0);
+
+ constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
+ batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
+ batch<T, A> acc1 = min(acc0, step1);
+
+ constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
+ batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
+ batch<T, A> acc2 = min(acc1, step2);
+ if (sizeof(T) == 2)
+ return first(acc2, A {});
+ batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+ batch<T, A> acc3 = min(acc2, step3);
+ return first(acc3, A {});
+ }
+
+ // reduce_mul
+ template <class A>
+ XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ __m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+ __m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
+ return _mm_cvtss_f32(tmp1);
+ }
+
+ template <class A>
+ XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self)));
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
+ tmp1 = tmp1 * self;
+ batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
+ tmp2 = tmp2 * tmp1;
+ return _mm_cvtsi128_si32(tmp2);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
+ auto tmp2 = tmp1 * self;
+ #if defined(__x86_64__)
+ return _mm_cvtsi128_si64(tmp2);
+ #else
+ __m128i m;
+ _mm_storel_epi64(&m, tmp2);
+ int64_t i;
+ std::memcpy(&i, &m, sizeof(i));
+ return i;
+ #endif
+ }
+ else
+ {
+ return reduce_mul(self, common {});
+ }
+ }
+
+ // rsqrt
+ template <class A>
+ XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
+ {
+ return _mm_rsqrt_ps(val);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
+ {
+ return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val)));
+ }
+
+ // select
+ template <class A>
+ XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br));
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br));
+ }
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
+ {
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {});
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br));
+ }
+
+ // shuffle
+ template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
+ XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
+ // shuffle within lane
+ if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4)
+ return _mm_shuffle_ps(x, y, smask);
+
+ // shuffle within opposite lane
+ if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4)
+ return _mm_shuffle_ps(y, x, smask);
+ return shuffle(x, y, mask, common {});
+ }
+
+ template <class A, class ITy, ITy I0, ITy I1>
+ XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t smask = detail::mod_shuffle(I0, I1);
+ // shuffle within lane
+ if (I0 < 2 && I1 >= 2)
+ return _mm_shuffle_pd(x, y, smask);
+
+ // shuffle within opposite lane
+ if (I0 >= 2 && I1 < 2)
+ return _mm_shuffle_pd(y, x, smask);
+ return shuffle(x, y, mask, common {});
+ }
+
+ // sqrt
+ template <class A>
+ XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
+ {
+ return _mm_sqrt_ps(val);
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
+ {
+ return _mm_sqrt_pd(val);
+ }
+
+ // slide_left
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<sse2>) noexcept
+ {
+ return _mm_slli_si128(x, N);
+ }
+
+ // slide_right
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<sse2>) noexcept
+ {
+ return _mm_srli_si128(x, N);
+ }
+
+ // sadd
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_adds_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_adds_epi16(self, other);
+ }
+ else
+ {
+ return sadd(self, other, common {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_adds_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_adds_epu16(self, other);
+ }
+ else
+ {
+ return sadd(self, other, common {});
+ }
+ }
+ }
+
+ // set
+ template <class A, class... Values>
+ XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
+ return _mm_setr_ps(values...);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1) noexcept
+ {
+ return _mm_set_epi64x(v1, v0);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3) noexcept
+ {
+ return _mm_setr_epi32(v0, v1, v2, v3);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ {
+ return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ {
+ return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+
+ template <class A, class... Values>
+ XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
+ return _mm_setr_pd(values...);
+ }
+
+ template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
+ }
+
+ template <class A, class... Values>
+ XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
+ return _mm_castsi128_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ template <class A, class... Values>
+ XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
+ return _mm_castsi128_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ // ssub
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_subs_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_subs_epi16(self, other);
+ }
+ else
+ {
+ return ssub(self, other, common {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_subs_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_subs_epu16(self, other);
+ }
+ else
+ {
+ return ssub(self, other, common {});
+ }
+ }
+ }
+
+ // store_aligned
+ template <class A>
+ XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_store_ps(mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_store_si128((__m128i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_store_si128((__m128i*)mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_store_pd(mem, self);
+ }
+
+ // store_unaligned
+ template <class A>
+ XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_storeu_ps(mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_storeu_si128((__m128i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_storeu_si128((__m128i*)mem, self);
+ }
+ template <class A>
+ XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_storeu_pd(mem, self);
+ }
+
+ // sub
+ template <class A>
+ XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_sub_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_sub_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_sub_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_sub_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_sub_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_sub_pd(self, other);
+ }
+
+ // swizzle
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
+ return _mm_shuffle_ps(self, self, index);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t index = detail::shuffle(V0, V1);
+ return _mm_shuffle_pd(self, self, index);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
+ return _mm_shuffle_epi32(self, index);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<sse2>) noexcept
+ {
+ return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {}));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
+ return _mm_shuffle_epi32(self, index);
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
+ {
+ return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
+ }
+
+ template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
+ {
+ constexpr bool is_identity = detail::is_identity(mask);
+ constexpr bool is_dup_lo = detail::is_dup_lo(mask);
+ constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+
+ XSIMD_IF_CONSTEXPR(is_identity)
+ {
+ return self;
+ }
+ XSIMD_IF_CONSTEXPR(is_dup_lo)
+ {
+ // permute the low half
+ constexpr int imm = detail::mod_shuffle(V0, V1, V2, V3);
+ const auto lo = _mm_shufflelo_epi16(self, imm);
+ // broadcast that 64-bit low half into both halves
+ const auto lo_all = _mm_unpacklo_epi64(lo, lo);
+ return lo_all;
+ }
+ XSIMD_IF_CONSTEXPR(is_dup_hi)
+ {
+ // permute the high half
+ constexpr int imm = detail::mod_shuffle(V4, V5, V6, V7);
+ const auto hi = _mm_shufflehi_epi16(self, imm);
+ // broadcast that 64-bit high half into both halves
+ const auto hi_all = _mm_unpackhi_epi64(hi, hi);
+ return hi_all;
+ }
+ // Only pick elements from the low lane
+ XSIMD_IF_CONSTEXPR((V0 < 4) && (V1 < 4) && (V2 < 4) && (V3 < 4) && (V4 < 4) && (V5 < 4) && (V6 < 4) && (V7 < 4))
+ {
+ // permute within each sub lane
+ constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
+ constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
+ __m128i lol = _mm_shufflelo_epi16(self, mask_lo);
+ __m128i loh = _mm_shufflelo_epi16(self, mask_hi);
+
+ // generate temporary lanes
+ return _mm_unpacklo_epi64(lol, loh);
+ }
+ // Only pick elements from the high lane
+ XSIMD_IF_CONSTEXPR((V0 >= 4) && (V1 >= 4) && (V2 >= 4) && (V3 >= 4) && (V4 >= 4) && (V5 >= 4) && (V6 >= 4) && (V7 >= 4))
+ {
+ // permute within each sub lane
+ constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
+ constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
+ __m128i hil = _mm_shufflehi_epi16(self, mask_lo);
+ __m128i hih = _mm_shufflehi_epi16(self, mask_hi);
+
+ // generate temporary lanes
+ return _mm_unpackhi_epi64(hil, hih);
+ }
+
+ // Generic case
+
+ // permute within each sub lane
+ constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
+ constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
+ __m128i lol = _mm_shufflelo_epi16(self, mask_lo);
+ __m128i loh = _mm_shufflelo_epi16(self, mask_hi);
+ __m128i hil = _mm_shufflehi_epi16(self, mask_lo);
+ __m128i hih = _mm_shufflehi_epi16(self, mask_hi);
+
+ // generate temporary lanes
+ __m128i lo = _mm_unpacklo_epi64(lol, loh);
+ __m128i hi = _mm_unpackhi_epi64(hil, hih);
+
+ // mask to choose the right lane
+ batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;
+
+ // blend the two permutes
+ return select(blend_mask, batch<uint16_t, A>(lo), batch<uint16_t, A>(hi));
+ }
+
+ template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+ XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
+ {
+ return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, sse2 {}));
+ }
+
+ // transpose
+ template <class A>
+ XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
+ {
+ assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
+ (void)matrix_end;
+ auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
+ _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+ matrix_begin[0] = r0;
+ matrix_begin[1] = r1;
+ matrix_begin[2] = r2;
+ matrix_begin[3] = r3;
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
+ {
+ transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
+ {
+ transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
+ }
+
+ template <class A>
+ XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<sse2>) noexcept
+ {
+ assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
+ (void)matrix_end;
+ auto r0 = matrix_begin[0], r1 = matrix_begin[1];
+ matrix_begin[0] = _mm_unpacklo_pd(r0, r1);
+ matrix_begin[1] = _mm_unpackhi_pd(r0, r1);
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<sse2>) noexcept
+ {
+ transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
+ }
+ template <class A>
+ XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<sse2>) noexcept
+ {
+ transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
+ }
+
+ // zip_hi
+ template <class A>
+ XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpackhi_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_unpackhi_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_unpackhi_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_unpackhi_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_unpackhi_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpackhi_pd(self, other);
+ }
+
+ // zip_lo
+ template <class A>
+ XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpacklo_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_unpacklo_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_unpacklo_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_unpacklo_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_unpacklo_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpacklo_pd(self, other);
+ }
+ }
+ }
+
+ #endif
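The kernels in this header are not called directly; xsimd selects them through its arch-tag dispatch when a batch is instantiated for SSE2. A minimal sketch of how the specializations above get exercised (an illustration only, not part of the packaged file; it assumes the bundled xsimd headers are on the include path as <xsimd/xsimd.hpp> and that SSE2 is available):

#include <xsimd/xsimd.hpp>
#include <iostream>

int main()
{
    // Two 4-lane single-precision batches with the sse2 arch tag; the calls
    // below resolve to the load_aligned, max and reduce_add kernels shown above.
    alignas(16) float a[] = { 1.f, 2.f, 3.f, 4.f };
    alignas(16) float b[] = { 4.f, 3.f, 2.f, 1.f };
    auto va = xsimd::batch<float, xsimd::sse2>::load_aligned(a);
    auto vb = xsimd::batch<float, xsimd::sse2>::load_aligned(b);
    auto vmax = xsimd::max(va, vb);                // _mm_max_ps under the hood
    std::cout << xsimd::reduce_add(vmax) << '\n';  // horizontal add; prints 14
}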