sequenzo 0.1.21__cp312-cp312-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (260) hide show
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-312-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-312-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-312-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-312-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-312-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-312-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-312-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-312-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,241 @@
1
+ """
2
+ @Author : 李欣怡
3
+ @File : get_substitution_cost_matrix.py
4
+ @Time : 2024/11/11 12:00
5
+ @Desc : Compute substitution costs and substitution-cost/proximity matrix
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+
11
+ from .utils.get_sm_trate_substitution_cost_matrix import get_sm_trate_substitution_cost_matrix
12
+ from sequenzo.define_sequence_data import SequenceData
13
+ from sequenzo.sequence_characteristics.overall_cross_sectional_entropy import get_cross_sectional_entropy
14
+ from .get_distance_matrix import with_missing_warned
15
+
16
def get_substitution_cost_matrix(seqdata, method, cval=None, miss_cost=None, time_varying=False,
                                 weighted=True, transition="both", lag=1, miss_cost_fixed=None,
                                 **kwargs):
    """
    Compute substitution costs and an indel cost for a ``SequenceData`` object.

    Parameters
    ----------
    seqdata : SequenceData
        The sequence object; anything else raises ``ValueError``.
    method : str
        One of "CONSTANT", "TRATE", "INDELS", "INDELSLOG".
    cval : number, optional
        Constant substitution cost ("CONSTANT") or the base value from which
        transition rates are subtracted ("TRATE"). Defaults to 4 when
        ``time_varying`` is true, ``method == "TRATE"`` and
        ``transition == "both"``; otherwise defaults to 2.
    miss_cost : number, optional
        Cost written into the last row/column of the matrix when the data
        contain missing values and ``miss_cost_fixed`` is true. Defaults to ``cval``.
    time_varying : bool
        If true, a 3D array with one cost matrix per time point is produced.
    weighted : bool
        Forwarded to the transition-rate helper (used by "TRATE" only).
    transition : str
        One of "previous", "next", "both"; only used by time-varying "TRATE".
    lag : int
        Forwarded to the transition-rate helper (used by "TRATE" only).
    miss_cost_fixed : bool, optional
        Defaults to False for "INDELS"/"INDELSLOG" and True otherwise.
    **kwargs
        Only ``with_missing`` is inspected, and only to print a removal notice.

    Returns
    -------
    dict
        ``'indel'``: 1 for "CONSTANT"; ``0.5 * max(costs)`` for "TRATE";
        per-state values for "INDELS"/"INDELSLOG" (with a leading 0 entry when
        not time varying).
        ``'sm'``: a float ``pandas.DataFrame`` labelled ``["null"] + states``
        when not time varying, otherwise a ``(time, alphsize, alphsize)`` ndarray.

    Raises
    ------
    ValueError
        If ``seqdata`` is not a ``SequenceData`` or ``method``/``transition``
        is not one of the accepted values.
    """
    if 'with_missing' in kwargs and not with_missing_warned:
        print("[!] 'with_missing' has been removed and is ignored.")
        print("    Missing values are always included by default, consistent with TraMineR.")

    # ================
    # Check Parameters
    # ================
    if not isinstance(seqdata, SequenceData):
        raise ValueError(" [!] data is NOT a sequence object, see SequenceData function to create one.")

    metlist = ["CONSTANT", "TRATE", "INDELS", "INDELSLOG"]
    if method not in metlist:
        raise ValueError(f" [!] method must be one of: {', '.join(metlist)}.")

    transitionlist = ["previous", "next", "both"]
    if transition not in transitionlist:
        raise ValueError(f" [!] transition must be one of: {', '.join(transitionlist)}.")

    return_result = {"indel": 1}

    cval4cond = time_varying and method == "TRATE" and transition == "both"
    if cval is None:
        cval = 4 if cval4cond else 2
    if miss_cost is None:
        miss_cost = cval
    if miss_cost_fixed is None:
        miss_cost_fixed = False if method in ["INDELS", "INDELSLOG"] else True

    states = seqdata.states.copy()
    # One extra slot: index 0 is the "null" entry that is prepended to the labels below.
    alphsize = len(states) + 1

    # ==================
    # Process "CONSTANT"
    # ==================
    if method == "CONSTANT":
        # cval can no longer be None here (it was defaulted above), so no check is needed.
        # dtype=float is essential: with an integer cval np.full would build an
        # integer array and a fractional miss_cost assigned further down would
        # be silently truncated.
        if time_varying:
            time = seqdata.seqdata.shape[1]

            print(
                f"  - Creating {alphsize}x{alphsize}x{time} time varying substitution-cost matrix using {cval} as constant value.")
            costs = np.full((time, alphsize, alphsize), cval, dtype=float)

            for i in range(time):
                np.fill_diagonal(costs[i, :, :], 0)  # Set diagonal to 0 in each time slice
        else:
            print(f"  - Creating {alphsize}x{alphsize} substitution-cost matrix using {cval} as constant value")
            costs = np.full((alphsize, alphsize), cval, dtype=float)
            np.fill_diagonal(costs, 0)  # Set diagonal to 0

    # ===============
    # Process "TRATE"
    # ===============
    if method == "TRATE":
        print("[>] Transition-based substitution-cost matrix (TRATE) initiated...")
        # States are mapped through str() because they may be numbers (e.g. when called from CLARA).
        print(f"  - Computing transition probabilities for: [{', '.join(map(str, seqdata.states))}]")

        if time_varying:
            tr = get_sm_trate_substitution_cost_matrix(seqdata, time_varying=True, weighted=weighted, lag=lag)

            tmat = tr.shape[1]  # Number of states (tr is 3D; its first dimension is time)
            time = seqdata.seqdata.shape[1]  # Total number of time points
            costs = np.zeros((time, alphsize, alphsize))

            # Cost helpers: subtract the transition rates around time t from cval.
            def tratecostBoth(trate, t, state1, state2, debut, fin):
                cost = 0
                if not debut:
                    # not the first time point: use the transition into t
                    cost -= trate[t - 1, state1, state2] + trate[t - 1, state2, state1]
                if not fin:
                    # not the last time point: use the transition out of t
                    cost -= trate[t, state1, state2] + trate[t, state2, state1]
                # At either boundary only one side contributed, so it is counted twice.
                return cost + cval if not debut and not fin else cval + 2 * cost

            def tratecostPrevious(trate, t, state1, state2, debut, fin):
                cost = 0
                if not debut:
                    cost -= trate[t - 1, state1, state2] + trate[t - 1, state2, state1]
                return cval + cost

            def tratecostNext(trate, t, state1, state2, debut, fin):
                cost = 0
                if not fin:
                    cost -= trate[t, state1, state2] + trate[t, state2, state1]
                return cval + cost

            if transition == "previous":
                tratecost = tratecostPrevious
            elif transition == "next":
                tratecost = tratecostNext
            else:
                tratecost = tratecostBoth

            # NOTE(review): this loop starts at index 0 whereas the non-time-varying
            # branch below starts at 1 - confirm which is intended for the "null" slot.
            for t in range(time):
                for i in range(tmat - 1):
                    for j in range(i + 1, tmat):
                        cost = max(0, tratecost(tr, t, i, j, debut=(t == 0), fin=(t == time - 1)))
                        costs[t, i, j] = cost
                        costs[t, j, i] = cost

        else:
            tr = get_sm_trate_substitution_cost_matrix(seqdata, time_varying=False, weighted=weighted, lag=lag)

            tmat = tr.shape[0]
            costs = np.zeros((alphsize, alphsize))

            for i in range(1, tmat - 1):
                for j in range(i + 1, tmat):
                    cost = cval - tr[i, j] - tr[j, i]
                    costs[i, j] = cost
                    costs[j, i] = cost

        indel = 0.5 * np.max(costs)

        return_result['indel'] = indel

    # ================================
    # Process "INDELS" and "INDELSLOG"
    # ================================
    if method in ["INDELS", "INDELSLOG"]:
        if time_varying:
            indels = get_cross_sectional_entropy(seqdata, return_format="dict")['Frequencies']
        else:
            ww = seqdata.weights
            if ww is None:
                ww = np.ones(seqdata.seqdata.shape[0])

            # Row-major flattening keeps every sequence's cells contiguous, which
            # is the layout np.repeat(ww, n_columns) produces. (A column-major
            # flatten here would pair cells with the wrong sequence's weight
            # whenever the weights are not all equal.)
            flat_seq = seqdata.values.flatten()
            weights_rep = np.repeat(ww, seqdata.seqdata.shape[1])
            df = pd.DataFrame({'state': flat_seq, 'weight': weights_rep})
            weighted_counts = df.groupby('state')['weight'].sum()

            weighted_prob = weighted_counts / weighted_counts.sum()
            states_num = range(1, len(seqdata.states) + 1)
            indels = np.array([weighted_prob.get(s, 0) for s in states_num])

        indels[np.isnan(indels)] = 1
        if method == "INDELSLOG":
            indels = np.log(2 / (1 + indels))
        else:
            indels = 1 / indels
            indels[np.isinf(indels)] = 1e15  # cap infinities to avoid cast warnings

        if time_varying:
            return_result['indel'] = indels
        else:
            return_result['indel'] = np.insert(indels, 0, 0)  # leading 0 for the extra index-0 slot

        if time_varying:
            time = seqdata.seqdata.shape[1]

            print(
                f"  - Creating {alphsize}x{alphsize}x{time} time varying substitution-cost matrix using {cval} as constant value.")
            costs = np.full((time, alphsize, alphsize), 0.0)

            for t in range(time):
                for i in range(1, alphsize):
                    for j in range(1, alphsize):
                        if i != j:
                            val = indels.iloc[i - 1, t] + indels.iloc[j - 1, t]
                            costs[t, i, j] = np.clip(val, -1e15, 1e15)  # avoid cast warnings

        else:
            costs = np.full((alphsize, alphsize), 0.0)
            for i in range(1, alphsize):
                for j in range(1, alphsize):
                    if i != j:
                        costs[i, j] = indels[i - 1] + indels[j - 1]
            costs[np.isinf(costs)] = 1e15  # avoid cast warnings

    # =================================
    # Process the Cost of Missing Value
    # =================================
    if seqdata.ismissing and miss_cost_fixed:
        # The last row/column is overwritten, leaving its diagonal cell untouched.
        if time_varying:
            costs[:, alphsize - 1, :alphsize - 1] = miss_cost
            costs[:, :alphsize - 1, alphsize - 1] = miss_cost
        else:
            costs[alphsize - 1, :alphsize - 1] = miss_cost
            costs[:alphsize - 1, alphsize - 1] = miss_cost

    # ===============================
    # Setting Rows and Columns Labels
    # ===============================
    if not time_varying:  # 2D: label rows/columns; the 3D array is returned unlabelled
        states.insert(0, "null")
        costs = pd.DataFrame(costs, index=states, columns=states, dtype=float)

    return_result['sm'] = costs

    return return_result
220
+
221
+
222
+ # Define seqsubm as an alias for backward compatibility
223
def seqsubm(*args, **kwargs):
    """Backward-compatible alias: forward everything to
    ``get_substitution_cost_matrix`` and return only its ``'sm'`` entry."""
    full_result = get_substitution_cost_matrix(*args, **kwargs)
    return full_result['sm']
225
+
226
+
227
if __name__ == "__main__":
    # Manual smoke run: load a local CSV, build a SequenceData object and
    # compute a constant-cost substitution matrix from it.
    raw_table = pd.read_csv('D:/country_co2_emissions_missing.csv')

    # Every column after the first is treated as a time point.
    time_columns = list(raw_table.columns)[1:]
    state_labels = ['Very Low', 'Low', 'Middle', 'High', 'Very High']

    sequence_data = SequenceData(raw_table, time=time_columns, id_col="country", states=state_labels)

    constant_kwargs = dict(method="CONSTANT", cval=2, time_varying=False)
    sm = get_substitution_cost_matrix(sequence_data, **constant_kwargs)

    print("===============")
@@ -0,0 +1,148 @@
1
+ #include <pybind11/pybind11.h>
2
+ #include <pybind11/numpy.h>
3
+ #include <vector>
4
+ #include <cmath>
5
+ #include <iostream>
6
+ #include "utils.h"
7
+ #include "dp_utils.h"
8
+ #ifdef _OPENMP
9
+ #include <omp.h>
10
+ #endif
11
+ #include <xsimd/xsimd.hpp>
12
+
13
+ namespace py = pybind11;
14
+
15
+ class DHDdistance{
16
+ public:
17
+ DHDdistance(py::array_t<int> sequences, py::array_t<double> sm, int norm, double maxdist, py::array_t<int> refseqS)
18
+ : norm(norm), maxdist(maxdist){
19
+ py::print("[>] Starting (Dynamic) Hamming Distance(DHD/HAM)...");
20
+ std::cout << std::flush;
21
+
22
+ try{
23
+ this->sequences = sequences;
24
+ this->sm = sm;
25
+
26
+ auto seq_shape = sequences.shape();
27
+ nseq = seq_shape[0];
28
+ len = seq_shape[1];
29
+
30
+ dist_matrix = py::array_t<double>({nseq, nseq});
31
+
32
+ // about reference sequences :
33
+ nans = nseq;
34
+
35
+ rseq1 = refseqS.at(0);
36
+ rseq2 = refseqS.at(1);
37
+ if(rseq1 < rseq2){
38
+ nseq = rseq1;
39
+ nans = nseq * (rseq2 - rseq1);
40
+ }else{
41
+ rseq1 = rseq1 - 1;
42
+ }
43
+ refdist_matrix = py::array_t<double>({nseq, (rseq2-rseq1)});
44
+ } catch (const std::exception& e){
45
+ py::print("Error in constructor: ", e.what());
46
+ throw ;
47
+ }
48
+ }
49
+
50
+ double compute_distance(int is, int js) {
51
+ try {
52
+ int m = len;
53
+ int n = len;
54
+ int minimum = m;
55
+ if(n < m) minimum = n;
56
+ double cost = 0;
57
+
58
+ auto ptr_sm = sm.unchecked<3>();
59
+ auto ptr_seq = sequences.unchecked<2>();
60
+
61
+ // 使用 SIMD 批量处理
62
+ const int simd_width = xsimd::batch<double>::size;
63
+ int i = 0;
64
+
65
+ for(; i + simd_width <= minimum; i += simd_width) {
66
+ alignas(32) int seq_is[simd_width];
67
+ alignas(32) int seq_js[simd_width];
68
+ alignas(32) double tmp[simd_width];
69
+
70
+ // 加载序列
71
+ for(int j = 0; j < simd_width; j++) {
72
+ seq_is[j] = ptr_seq(is, i + j);
73
+ seq_js[j] = ptr_seq(js, i + j);
74
+ }
75
+ xsimd::batch<int> batch_seq_is = xsimd::load_unaligned(seq_is);
76
+ xsimd::batch<int> batch_seq_js = xsimd::load_unaligned(seq_js);
77
+
78
+ // 比较是否相等
79
+ auto equal_mask = (batch_seq_is == batch_seq_js);
80
+ for(int j = 0; j < simd_width; j++) {
81
+ tmp[j] = equal_mask.get(j) ? 0.0 : ptr_sm(i + j, seq_is[j], seq_js[j]);
82
+ }
83
+
84
+ xsimd::batch<double> costs = xsimd::load_unaligned(tmp);
85
+ cost += xsimd::reduce_add(costs);
86
+ }
87
+
88
+ // 处理尾部:用 SIMD 填充无效数据
89
+ for(; i < minimum; i += simd_width) {
90
+ alignas(32) double tmp[simd_width];
91
+ int bound = std::min(simd_width, minimum - i);
92
+ for(int j = 0; j < simd_width; j++) {
93
+ tmp[j] = (j < bound) ? ptr_sm(i + j, ptr_seq(is, i + j), ptr_seq(js, i + j)) : 0.0;
94
+ }
95
+ xsimd::batch<double> costs = xsimd::load_unaligned(tmp);
96
+ cost += xsimd::reduce_add(costs);
97
+ }
98
+
99
+ return normalize_distance(cost, maxdist, maxdist, maxdist, norm);
100
+ } catch (const std::exception& e) {
101
+ py::print("Error in compute_distance: ", e.what());
102
+ throw;
103
+ }
104
+ }
105
+
106
+ py::array_t<double> compute_all_distances() {
107
+ try {
108
+ return dp_utils::compute_all_distances_simple(
109
+ nseq,
110
+ dist_matrix,
111
+ [this](int i, int j){ return this->compute_distance(i, j); }
112
+ );
113
+ } catch (const std::exception& e) {
114
+ py::print("Error in compute_all_distances: ", e.what());
115
+ throw;
116
+ }
117
+ }
118
+
119
+ py::array_t<double> compute_refseq_distances() {
120
+ try {
121
+ return dp_utils::compute_refseq_distances_simple(
122
+ nseq,
123
+ rseq1,
124
+ rseq2,
125
+ refdist_matrix,
126
+ [this](int is, int rseq){ return this->compute_distance(is, rseq); }
127
+ );
128
+ } catch (const std::exception& e) {
129
+ py::print("Error in compute_all_distances: ", e.what());
130
+ throw;
131
+ }
132
+ }
133
+
134
+ private:
135
+ py::array_t<int> sequences;
136
+ py::array_t<double> sm;
137
+ int norm;
138
+ int nseq;
139
+ int len;
140
+ py::array_t<double> dist_matrix;
141
+ double maxdist;
142
+
143
+ py::array_t<int> refseqS;
144
+ int nans = -1;
145
+ int rseq1 = -1;
146
+ int rseq2 = -1;
147
+ py::array_t<double> refdist_matrix;
148
+ };
@@ -0,0 +1,114 @@
1
+ #include <pybind11/pybind11.h>
2
+ #include <pybind11/numpy.h>
3
+ #include <vector>
4
+ #include <iostream>
5
+ #include "utils.h"
6
+ #include "dp_utils.h"
7
+
8
+ namespace py = pybind11;
9
+
10
+ class LCPdistance{
11
+ public:
12
+ LCPdistance(py::array_t<int> sequences, int norm, int sign, py::array_t<int> refseqS)
13
+ : norm(norm), sign(sign){
14
+ py::print("[>] Starting (Reverse) Longest Common Prefix(LCP/RLCP)...");
15
+ std::cout << std::flush;
16
+
17
+ try{
18
+ this->sequences = sequences;
19
+
20
+ auto seq_shape = sequences.shape();
21
+ nseq = seq_shape[0];
22
+ len = seq_shape[1];
23
+
24
+ dist_matrix = py::array_t<double>({nseq, nseq});
25
+
26
+ // about reference sequences :
27
+ nans = nseq;
28
+
29
+ rseq1 = refseqS.at(0);
30
+ rseq2 = refseqS.at(1);
31
+ if(rseq1 < rseq2){
32
+ nseq = rseq1;
33
+ nans = nseq * (rseq2 - rseq1);
34
+ }else{
35
+ rseq1 = rseq1 - 1;
36
+ }
37
+ refdist_matrix = py::array_t<double>({nseq, (rseq2-rseq1)});
38
+ } catch (const std::exception& e){
39
+ py::print("Error in constructor: ", e.what());
40
+ throw ;
41
+ }
42
+ }
43
+
44
+ double compute_distance(int is, int js) {
45
+ try {
46
+ int m = len;
47
+ int n = len;
48
+ int minimum = m;
49
+ if(n < m) minimum = n;
50
+
51
+ int length = 0;
52
+ auto ptr_seq = sequences.unchecked<2>();
53
+
54
+ if(sign > 0){
55
+ while(ptr_seq(is, length) == ptr_seq(js, length) && length < minimum){
56
+ length ++;
57
+ }
58
+ } else{
59
+ length = 1;
60
+ while(ptr_seq(is, (m - length)) == ptr_seq(js, (n - length)) && length <= minimum){
61
+ length ++;
62
+ }
63
+ length --;
64
+ }
65
+
66
+ return normalize_distance(n+m-2.0*length, n+m, m, n, norm);
67
+ } catch (const std::exception& e) {
68
+ py::print("Error in compute_distance: ", e.what());
69
+ throw;
70
+ }
71
+ }
72
+
73
+ py::array_t<double> compute_all_distances() {
74
+ try {
75
+ return dp_utils::compute_all_distances_simple(
76
+ nseq,
77
+ dist_matrix,
78
+ [this](int i, int j){ return this->compute_distance(i, j); }
79
+ );
80
+ } catch (const std::exception& e) {
81
+ py::print("Error in compute_all_distances: ", e.what());
82
+ throw;
83
+ }
84
+ }
85
+
86
+ py::array_t<double> compute_refseq_distances() {
87
+ try {
88
+ return dp_utils::compute_refseq_distances_simple(
89
+ nseq,
90
+ rseq1,
91
+ rseq2,
92
+ refdist_matrix,
93
+ [this](int is, int rseq){ return this->compute_distance(is, rseq); }
94
+ );
95
+ } catch (const std::exception& e) {
96
+ py::print("Error in compute_all_distances: ", e.what());
97
+ throw;
98
+ }
99
+ }
100
+
101
+ private:
102
+ py::array_t<int> sequences;
103
+ int norm;
104
+ int nseq;
105
+ int len;
106
+ int sign;
107
+ py::array_t<double> dist_matrix;
108
+
109
+ py::array_t<int> refseqS;
110
+ int nans = -1;
111
+ int rseq1 = -1;
112
+ int rseq2 = -1;
113
+ py::array_t<double> refdist_matrix;
114
+ };