sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,762 @@
1
+ """
2
+ @Author : Xinyi Li 李欣怡
3
+ @File : get_distance_matrix.py
4
+ @Time : 2024/11/10 19:55
5
+ @Desc : Computes pairwise dissimilarities between sequences or dissimilarity from a reference sequence.
6
+ Several dissimilarity measures can be chosen,
7
+ including optimal matching (OM) and many of its variants, distance based on the count of common attributes,
8
+ and distances between state distributions within sequences.
9
+
10
+ :params
11
+ seqdata : State sequence object of class stslist
12
+ method : String. The dissimilarity measure to use.
13
+ It can be "OM", "OMloc", "OMslen", "OMspell", "OMstran", "HAM", "DHD",
14
+ "CHI2", "EUCLID", "LCS", "LCP", "RLCP", "LCPspell", "RLCPspell",
15
+ "NMS", "NMSMST", "SVRspell", or "TWED".
16
+ refseq : Default: None. The baseline sequence to compute the distances from.
17
+ (1)When an integer, the index of a sequence in seqdata or 0 for the most frequent sequence.
18
+ (2)When a state sequence object, it must contain a single sequence and have the same alphabet as seqdata.
19
+ (3)When a list, it must be a list of two sets of indexes of seqdata rows.
20
+ norm : Default: "none". The normalization to use when method is one of
21
+ {"OM", "OMloc", "OMslen", "OMspell", "OMstran", "TWED", "HAM", "DHD",
22
+ "LCS", "LCP", "RLCP", "LCPspell", "RLCPspell", "CHI2", "EUCLID"}.
23
+ (1)It can be "none", "auto", or,
24
+ except for "CHI2" and "EUCLID", "maxlength", "gmean", "maxdist", or "YujianBo".
25
+ (2)"auto" is equivalent to
26
+ 1) "maxlength" when method is one of "OM", "HAM", or "DHD",
27
+ 2) "gmean" when method is one of "LCS", "LCP", "RLCP", "LCPspell", or "RLCPspell",
28
+ 3) "YujianBo" when method is one of "OMloc", "OMslen", "OMspell", "OMstran", or "TWED".
29
+ indel : Insertion/deletion cost(s).
30
+ Applies when method is one of "OM", "OMslen", "OMspell", or "OMstran".
31
+ (1)The single state-independent insertion/deletion cost when a double.
32
+ (2)The state-dependent insertion/deletion costs when a vector of doubles.
33
+ The vector should contain an indel cost by state in the order of the alphabet.
34
+ (3)When "auto", the indel is set as max(sm)/2 when sm is a matrix
35
+ and is computed by means of seqcost when sm is a string specifying a cost method.
36
+ sm : Substitution costs. Default: None.
37
+ (1)The substitution-cost matrix when a matrix
38
+ and method is one of "OM", "OMloc", "OMslen", "OMspell", "OMstran", "HAM", or "TWED".
39
+ (2)The series of the substitution-cost matrices when an array and method = "DHD".
40
+ They are grouped in a 3-dimensional array with the third index referring to the position in the sequence.
41
+ (3)One of the strings "CONSTANT", "INDELS", "INDELSLOG", or "TRATE".
42
+ Designates a seqcost method to build sm. "CONSTANT" is not relevant for "DHD".
43
+ sm is mandatory when method is one of "OM", "OMloc", "OMslen", "OMspell", "OMstran", or "TWED".
44
+ sm is autogenerated when method is one of "HAM" or "DHD" and sm is None.
45
+ full_matrix : Default: True. When refseq is None, if True, the full distance matrix is returned,
46
+ if False, an object of class dist is returned,
47
+ that is, a vector containing only values from the lower triangle of the distance matrix.
48
+ Objects of class dist are smaller and can be passed directly as arguments to most clustering functions.
49
+ tpow : Default: 1.0.
50
+ The exponential weight of spell length when method is one of "OMspell", "NMSMST", or "SVRspell".
51
+ expcost : Default: 0.5. The cost of spell length transformation when method is one of "OMloc", "OMspell", "LCPspell", or "RLCPspell".
52
+ It must be positive. The exact interpretation is distance-dependent.
53
+ weighted : Default: True. When method is "CHI2" or when sm is a string (method),
54
+ should the distributions of the states account for the sequence weights in seqdata?
55
+ check_max_size : Logical. Default: True. Should get_distance_matrix (seqdist in TraMineR) stop when the maximum allowed number of unique sequences is exceeded?
56
+ """
57
+ import gc
58
+ import time
59
+ import warnings
60
+
61
+ from scipy.spatial.distance import pdist, squareform
62
+ import numpy as np
63
+ import pandas as pd
64
+
65
+ from sequenzo.define_sequence_data import SequenceData
66
+
67
+ with_missing_warned = False
68
+
69
+ def get_distance_matrix(seqdata=None, method=None, refseq=None, norm="none", indel="auto", sm=None, full_matrix=True,
70
+ tpow=1.0, expcost=0.5, weighted=True, check_max_size=True, opts=None, **kwargs):
71
+
72
+ from .utils.seqconc import seqconc
73
+ from .utils.seqdss import seqdss
74
+ from .utils.seqdur import seqdur
75
+ from .utils.seqlength import seqlength
76
+ from . import get_substitution_cost_matrix
77
+
78
+ # Lazily import the c_code module to avoid circular dependencies during installation
79
+ from .__init__ import _import_c_code
80
+ c_code = _import_c_code()
81
+
82
+ gc.collect() # garbage collection
83
+
84
+ if opts is not None:
85
+ seqdata = opts.get('seqdata')
86
+ method = opts.get('method')
87
+ refseq = opts.get('refseq')
88
+ norm = opts.get('norm') or "none"
89
+ indel = opts.get('indel') or "auto"
90
+ sm = opts.get('sm')
91
+ full_matrix = opts.get('full_matrix') or True
92
+ tpow = opts.get('tpow') or 1.0
93
+ expcost = opts.get('expcost') or 0.5
94
+ weighted = opts.get('weighted') or True
95
+ check_max_size = opts.get('check_max_size') or True
96
+
97
+ if 'with_missing' in kwargs:
98
+ print("[!] 'with_missing' has been removed and is ignored.")
99
+ print(" Missing values are always included by default, consistent with TraMineR.")
100
+
101
+ with_missing_warned = True
102
+
103
+ # ======================================
104
+ # Check Arguments With Deprecated Values
105
+ # ======================================
106
+ # the version in 2017
107
+ # check method
108
+ deprecated_methods = ["OMopt", "LCSopt"]
109
+ if method in deprecated_methods:
110
+ print(f"[!] Warning: {method} is deprecated.\n")
111
+
112
+ if method == "OMopt":
113
+ method = "OM"
114
+ print(f"[!] 'method' is set to \"OM\" which is equivalent.")
115
+ elif method == "LCSopt":
116
+ method = "LCS"
117
+ print(f"[!] 'method' is set to \"LCS\" which is equivalent.")
118
+
119
+ # check norm
120
+ if isinstance(norm, bool):
121
+ norm = "auto" if norm else "none"
122
+ print("[!] Warning: 'norm' has a deprecated value, TRUE changed into 'auto', FALSE into 'none'.\n")
123
+
124
+ # ===========================================
125
+ # Check For Arguments That Need To Be Defined
126
+ # ===========================================
127
+ # Check if the method parameter is missing
128
+ if seqdata is None:
129
+ raise ValueError("[!] The 'seqdata' parameter is missing.")
130
+ if method is None:
131
+ raise ValueError("[!] The 'method' parameter is missing.")
132
+
133
+ # ====================
134
+ # Check Argument Types
135
+ # ====================
136
+ if not isinstance(seqdata, SequenceData):
137
+ raise ValueError("[!] 'seqdata' must be a state sequence object created with SequenceData")
138
+
139
+ nseqs = seqdata.seqdata.shape[0]
140
+ nstates = len(seqdata.states)
141
+ seqs_dlens = np.unique(seqlength(seqdata))
142
+
143
+ # check method
144
+ om_methods = ["OM", "OMspell"]
145
+ methods = om_methods + ["HAM", "DHD", "LCP", "RLCP", "LCPspell", "RLCPspell"]
146
+
147
+ if method not in methods:
148
+ raise ValueError(f"[!] Invalid 'method': {method}. Expected one of {methods}")
149
+
150
+ # check refseq
151
+ if refseq is not None:
152
+ # if list of two sets of indexes, we will compute pairwise distances between the two sets
153
+ if isinstance(refseq, list) and len(refseq) > 1:
154
+ if len(refseq) > 2:
155
+ print("[!] Warning: Only first two elements of the 'refseq' list are used.\n")
156
+
157
+ for i, ref in enumerate(refseq[:2]):
158
+ if any(not isinstance(x, int) or x < 0 for x in ref):
159
+ raise ValueError(
160
+ "[x] When 'refseq' is a list, it must contain two sets of indexes with positive integer values.")
161
+
162
+ if max(ref, default=-1) > nseqs:
163
+ raise ValueError("[x] Some indexes in 'refseq' are out of range.")
164
+
165
+ refseq_type = "sets"
166
+
167
+ else:
168
+ raise ValueError("[!] Invalid 'refseq' value.")
169
+
170
+ else:
171
+ refseq_type = "none"
172
+
173
+ # check for empty sequences
174
+ sdur = seqdur(seqdata)
175
+ emptyseq = np.where(np.isnan(sdur[:, 0]))[0]
176
+
177
+ if len(emptyseq) > 0:
178
+ if method == "OMloc":
179
+ raise ValueError(f"[!] Error: empty sequences in method 'OMloc': {emptyseq}.")
180
+ else:
181
+ print(f"[!] Warning: empty sequences {emptyseq}.\n")
182
+
183
+ print(f"[>] Processing {nseqs} sequences with {nstates} unique states.")
184
+
185
+ # check norm
186
+ norms = ["auto", "none", "maxlength", "gmean", "maxdist", "YujianBo"]
187
+ if norm not in norms:
188
+ raise ValueError(f"[!] 'norm' should be in {norms}.")
189
+
190
+ # check indel
191
+ # indel_type: "number", "vector", "auto"
192
+ # must be after including missing values as an additional state (nstates)
193
+ # all but NMS, NMSMST, SVRspell
194
+ if isinstance(indel, (int, float)):
195
+ indel_type = "number"
196
+ elif isinstance(indel, (np.ndarray, list)) and np.issubdtype(indel.dtype, np.number):
197
+ if len(indel) != nstates:
198
+ raise ValueError("[!] When a vector, 'indel' must contain a cost for each state.")
199
+ indel_type = "vector"
200
+ elif indel == "auto":
201
+ indel_type = "auto"
202
+ else:
203
+ raise ValueError("[!] indel")
204
+
205
+ # check sm
206
+ # Must be after sanity checks on 'indel'
207
+ # Add here new seqcost() method names
208
+ # sm.type:
209
+ # "none" :
210
+ # "matrix" : "OM", "OMloc", "OMslen", "OMspell", "OMstran", "HAM", "DHD" or "TWED".
211
+ # "method" : "TRATE", "CONSTANT", "INDELS", "INDELSLOG"
212
+ sm_methods = ["TRATE", "CONSTANT", "INDELS", "INDELSLOG"]
213
+
214
+ if sm is not None:
215
+ if isinstance(sm, np.ndarray) and (sm.ndim == 2 or sm.ndim == 3):
216
+ sm_type = "matrix"
217
+ elif isinstance(sm, np.ndarray) and sm.ndim == 1:
218
+ sm_type = "array"
219
+ elif isinstance(sm, str):
220
+ sm = sm.upper()
221
+ if sm not in sm_methods:
222
+ raise ValueError(f"[!] Invalid 'sm' value, must be one of {sm_methods}.")
223
+ if method == "OM" and (sm == "INDELSLOG" or sm == "INDELS"):
224
+ raise ValueError(f"[!] 'sm = \"{sm}\"' is not relevant for OM now, consider TRATE or CONSTANT instead.")
225
+ sm_type = "method"
226
+ else:
227
+ raise ValueError("[!] 'sm' must be of a valid type (matrix, array, method).")
228
+ else:
229
+ sm_type = "none"
230
+
231
+ # ===================================
232
+ # Check Arguments Not Yet Implemented
233
+ # ===================================
234
+ # norm: all but SVRspell, NMS, NMSMST
235
+ if norm != "none" and method not in ["OM", "OMspell", "HAM", "DHD", "LCP", "RLCP", "LCPspell", "RLCPspell"]:
236
+ raise ValueError(f"[x] norm is not matched with {method}.")
237
+
238
+ # ===============================
239
+ # Check Method Specific Arguments
240
+ # ===============================
241
+ # 1. OMspell, LCPspell, RLCPspell
242
+ if method in ["OMspell"] and expcost < 0:
243
+ raise ValueError("[x] 'expcost' must be positive.")
244
+ if method in ["LCPspell", "RLCPspell"] and expcost < 0:
245
+ raise ValueError("[x] 'expcost' must be non-negative for LCPspell/RLCPspell (use 0 to ignore duration).")
246
+
247
+ # 2. DHD
248
+ elif method == "DHD":
249
+ if sm_type == "method" and sm == "CONSTANT":
250
+ raise ValueError("[!] 'sm = \"CONSTANT\"' is not relevant for DHD, consider HAM instead.")
251
+
252
+ # 3. HAM, DHD
253
+ if method in ["HAM", "DHD"]:
254
+ if seqs_dlens.shape[0] > 1:
255
+ raise ValueError(f"[x] {method} is not defined for sequences of different length.")
256
+
257
+ # ==============
258
+ # Configure Norm
259
+ # ==============
260
+ if norm == "auto":
261
+ if method in ["OM", "HAM", "DHD"]:
262
+ norm = "maxlength"
263
+ elif method in ["LCP", "RLCP", "LCPspell", "RLCPspell"]:
264
+ norm = "gmean"
265
+ elif method in ["OMspell"]:
266
+ norm = "YujianBo"
267
+ else:
268
+ raise ValueError(f"[!] No known normalization method to select automatically for {method}.")
269
+
270
+ # ======================
271
+ # Configure sm and indel
272
+ # ======================
273
+
274
+ if indel_type == "auto" and sm_type == "matrix":
275
+ indel = np.max(sm) / 2
276
+ indel_type = "number"
277
+
278
+ # OM, OMspell, HAM, DHD
279
+ if method in om_methods + ["HAM", "DHD"]:
280
+ if sm_type == "matrix":
281
+ if method in om_methods + ["TWED"]:
282
+ # TODO : checkcost()
283
+ # Add a NaN column at the beginning and a NaN row at the top
284
+ # This ensures that indexing starts from 1
285
+ nan_col = np.full((sm.shape[0], 1), np.nan)
286
+ sm = np.hstack([nan_col, sm])
287
+ nan_row = np.full((1, sm.shape[1]), np.nan)
288
+ sm = np.vstack([nan_row, sm])
289
+ pass
290
+
291
+ elif method == "HAM":
292
+ # TODO : checkcost()
293
+ nan_col = np.full((sm.shape[0], 1), np.nan)
294
+ sm = np.hstack([nan_col, sm])
295
+ nan_row = np.full((1, sm.shape[1]), np.nan)
296
+ sm = np.vstack([nan_row, sm])
297
+ pass
298
+
299
+ else:
300
+ raise ValueError(f"[x] No known 'sm' check for {method}.")
301
+
302
+ elif sm_type == "array":
303
+ if method == "DHD":
304
+ # TODO : checkcost()
305
+ pass
306
+ else:
307
+ raise ValueError(f"[x] 'sm' as an array is not relevant for {method}.")
308
+
309
+ elif sm_type == "method":
310
+ tv = False
311
+ cost = None
312
+ if sm in ["INDELS", "INDELSLOG"]:
313
+ if method == "DHD":
314
+ tv = True
315
+ elif sm == "TRATE":
316
+ if method == "OM":
317
+ cost = 2
318
+ elif method == "HAM":
319
+ cost = 2
320
+ elif method == "DHD":
321
+ cost = 4
322
+ tv = True
323
+ elif sm == "CONSTANT":
324
+ if method == "HAM":
325
+ cost = 1
326
+ else:
327
+ cost = 2
328
+
329
+ sm = get_substitution_cost_matrix(seqdata,
330
+ method=sm,
331
+ cval=cost,
332
+ miss_cost=cost,
333
+ time_varying=tv,
334
+ weighted=weighted)
335
+
336
+ if indel_type == "auto":
337
+ indel = sm['indel']
338
+ indel_type = "vector" if getElementsNumber(indel) > 1 else "number"
339
+
340
+ print(f"[>] generated an indel of type {indel_type}\n")
341
+
342
+ sm = sm['sm']
343
+
344
+ del cost, tv
345
+
346
+ else:
347
+ if method == "HAM":
348
+ print("[>] Creating a 'sm' with a single substitution cost of 1.\n")
349
+ sm = get_substitution_cost_matrix(seqdata,
350
+ method="CONSTANT",
351
+ cval=1,
352
+ miss_cost=1)
353
+ if indel_type == "auto":
354
+ indel = sm['indel']
355
+ indel_type = "vector" if getElementsNumber(indel) > 1 else "number"
356
+
357
+ sm = sm['sm']
358
+
359
+ elif method == "DHD":
360
+ print("[>] Creating a 'sm' with the costs derived from the transition rates.\n")
361
+ sm = get_substitution_cost_matrix(seqdata,
362
+ method="TRATE",
363
+ cval=4, miss_cost=4, time_varying=True,
364
+ weighted=weighted)
365
+
366
+ if indel_type == "auto":
367
+ indel = sm['indel']
368
+ indel_type = "vector" if getElementsNumber(indel) > 1 else "number"
369
+
370
+ sm = sm['sm']
371
+
372
+ else:
373
+ raise ValueError("[x] 'sm' is missing.")
374
+
375
+ elif method not in ["CHI2", "EUCLID", "LCP", "RLCP", "LCPspell", "RLCPspell", "NMS", "NMSMST", "SVRspell"]:
376
+ raise ValueError(f"[x] No known 'sm' preparation for {method}.")
377
+
378
+ # ===========================
379
+ # Pre-Process Data (Part 1/2)
380
+ # ===========================
381
+ seqdata_num = seqdata.values # it's numpy
382
+
383
+ if refseq_type == "sets":
384
+ dseqs_num1 = np.unique(seqdata_num[refseq[0], :], axis=0)
385
+ nunique1 = len(dseqs_num1)
386
+ dseqs_num2 = np.unique(seqdata_num[refseq[1], :], axis=0)
387
+ nunique2 = len(dseqs_num2)
388
+
389
+ dseqs_num = np.vstack((dseqs_num1, dseqs_num2))
390
+
391
+ else:
392
+ dseqs_num = np.unique(seqdata_num, axis=0)
393
+
394
+ # Check that dseqs_num does not exceed the max allowed number
395
+ # if check_max_size:
396
+ # max_allowed_seq = np.floor(np.sqrt(np.iinfo(np.int32).max)) if refseq_type == "none" else np.iinfo(np.int32).max - 1
397
+ #
398
+ # if refseq_type == "sets":
399
+ # if (np.sqrt(nunique1) * np.sqrt(nunique2)) > max_allowed_seq:
400
+ # raise ValueError(f"[!] Number of {nunique1} and {nunique2} unique sequences too large for max allowed distances {max_allowed_seq}.")
401
+ # else:
402
+ # if len(dseqs_num) > max_allowed_seq:
403
+ # raise ValueError(f"[!] {len(dseqs_num)} unique sequences exceeds max allowed of {max_allowed_seq}.")
404
+
405
+ # =========================
406
+ # Handle Reference Sequence
407
+ # =========================
408
+ if refseq_type == "sets":
409
+ conc1 = seqconc(data=seqdata_num[refseq[0], :])
410
+ conc2 = seqconc(data=dseqs_num1)
411
+ # Find the position of each element in conc1 within conc2
412
+ index_map = {value: idx for idx, value in enumerate(conc2)}
413
+ seqdata_didxs1 = np.array([index_map[element] for element in conc1])
414
+
415
+ conc3 = seqconc(data=seqdata_num[refseq[1], :])
416
+ conc4 = seqconc(data=dseqs_num2)
417
+ # Find the position of each element in conc3 within conc4
418
+ index_map = {value: idx for idx, value in enumerate(conc4)}
419
+ seqdata_didxs2 = np.array([index_map[element] for element in conc3])
420
+
421
+ else:
422
+ seqdata_series = seqconc(data=seqdata_num)
423
+ dseqs_series = seqconc(data=dseqs_num)
424
+
425
+ index_map = {value: idx for idx, value in enumerate(dseqs_series)}
426
+ seqdata_didxs = np.array([index_map[element] for element in seqdata_series])
427
+
428
+ if refseq_type != "none":
429
+ if refseq_type == "sets":
430
+ if method in ["OMstran"]:
431
+ refseq_id = refseq
432
+ else:
433
+ refseq_id = [nunique1, nunique1 + nunique2]
434
+
435
+ else:
436
+ raise ValueError(f"[!] Unknown refseq type: {refseq_type}.")
437
+
438
+ if refseq_type == "sets":
439
+ print(f"[>] Pairwise measures between two subsets of sequences of sizes {len(refseq[0])} and {len(refseq[1])}")
440
+
441
+ # ==============================
442
+ # Compute Method-Specific Values
443
+ # ==============================
444
+ if method in ["OMspell"]:
445
+ if indel_type == "number":
446
+ indellist = np.repeat(indel, nstates + 1)
447
+ indel_type = "vector"
448
+ elif indel_type == "vector":
449
+ indellist = indel
450
+
451
+ indel = np.max(indellist)
452
+
453
+ # OM method: convert vector indel to scalar if needed
454
+ # OMdistance C++ code only accepts scalar indel, not state-dependent
455
+ # Following TraMineR's behavior: when indel.type == "vector", use max(indel)
456
+ # See TraMineR seqdist.R line 696: params[["indel"]] <- max(indel)
457
+ elif method == "OM" and indel_type == "vector":
458
+ if isinstance(indel, np.ndarray):
459
+ # Use max(indel) to match TraMineR's behavior
460
+ indel = float(np.max(indel))
461
+ indel_type = "number"
462
+ elif isinstance(indel, list):
463
+ indel_array = np.array(indel)
464
+ indel = float(np.max(indel_array))
465
+ indel_type = "number"
466
+
467
+ # OMspell
468
+ # Redefined dseqs.num
469
+ if method in ["OMspell", "LCPspell", "RLCPspell", "NMSMST", "SVRspell"]:
470
+ dseqs_dur = seqdur(seqdata) ** tpow # Do not use dseqs.num
471
+
472
+ # The position of the first occurrence of the deduplicated data (conc1) in the original data (conc2)
473
+ conc1 = seqconc(data=dseqs_num)
474
+ conc2 = seqconc(data=seqdata_num)
475
+ index_map = {value: idx for idx, value in enumerate(conc2)}
476
+ dseqs_oidxs = np.array([index_map[element] for element in conc1])
477
+
478
+ # Can't sort! Otherwise, the actual sequence compared will not be the expected sequence
479
+
480
+ # Get duration
481
+ c = 1 if method == "OMspell" else 0
482
+ dseqs_dur = dseqs_dur[dseqs_oidxs, :] - c
483
+
484
+ # Get DSS
485
+ seqdata_dss = seqdss(seqdata)
486
+ dseqs_num = seqdata_dss[dseqs_oidxs, :]
487
+
488
+ if method in ["OMspell", "LCPspell", "RLCPspell"]:
489
+ _seqlength = seqlength(dseqs_num)
490
+ if method == "LCPspell":
491
+ sign = 1
492
+ elif method == "RLCPspell":
493
+ sign = -1
494
+
495
+ del dseqs_oidxs
496
+ del c
497
+ del seqdata_dss
498
+
499
+ # HAM, DHD
500
+ elif method in ["HAM", "DHD"]:
501
+ if method == "HAM":
502
+ # sm_type = "array" # Not used. Should be here if it changes.
503
+ sm = adaptSmForHAM(sm, nstates, seqdata.seqdata.shape[1])
504
+
505
+ # Maximum possible cost of the Hamming distance
506
+ max_cost = 0
507
+ for i in range(np.max(seqs_dlens)): # seqs_dlens has here only one value
508
+ max_cost += np.max(sm[i, :, :])
509
+
510
+ # LCP
511
+ elif method == "LCP":
512
+ sign = 1
513
+
514
+ # RLCP
515
+ elif method == "RLCP":
516
+ sign = -1
517
+
518
+ # LCPspell (spell-based LCP, forward)
519
+ elif method == "LCPspell":
520
+ sign = 1
521
+
522
+ # RLCPspell (spell-based LCP, reverse)
523
+ elif method == "RLCPspell":
524
+ sign = -1
525
+
526
+ del index_map
527
+ del seqdata_num
528
+
529
+ # ===========================
530
+ # Pre-Process Data (part 2/2)
531
+ # ===========================
532
+ # Modified dseqs.num for OMspell
533
+ ndn = dseqs_num.shape[0]
534
+ incl_refseq = " (including refseq)" if refseq_type == "sequence" else ""
535
+ seq_or_spell = "spell sequences" if method in ["OMspell", "LCPspell", "RLCPspell"] else "sequences"
536
+ print(f"[>] Identified {ndn} unique {seq_or_spell}{incl_refseq}.")
537
+ del ndn
538
+ del seq_or_spell
539
+
540
+ # =================
541
+ # Compute Distances
542
+ # =================
543
+ norm_num = norms[1:].index(norm)
544
+ if isinstance(sm, pd.DataFrame):
545
+ sm = sm.values
546
+ lengths = seqlength(dseqs_num)
547
+
548
+ # C++ already guarantees that invalid values will not be accessed
549
+ warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered in cast")
550
+
551
+ if refseq_type != "none":
552
+ if len(refseq_id) == 1:
553
+ refseq_id = [refseq_id, refseq_id]
554
+
555
+ refseq_id = np.array(refseq_id, dtype=int)
556
+
557
+ if method == "OM":
558
+ om = c_code.OMdistance(dseqs_num,
559
+ sm,
560
+ indel,
561
+ norm_num,
562
+ lengths,
563
+ refseq_id)
564
+ dist_matrix = om.compute_refseq_distances()
565
+
566
+ elif method == "OMspell":
567
+ om = c_code.OMspellDistance(dseqs_num,
568
+ sm,
569
+ indel,
570
+ norm_num,
571
+ refseq_id,
572
+ expcost,
573
+ dseqs_dur,
574
+ indellist.astype(np.float64),
575
+ _seqlength)
576
+ dist_matrix = om.compute_refseq_distances()
577
+
578
+ elif method == "HAM" or method == "DHD":
579
+ DHD = c_code.DHDdistance(dseqs_num,
580
+ sm,
581
+ norm_num,
582
+ max_cost,
583
+ refseq_id)
584
+ dist_matrix = DHD.compute_refseq_distances()
585
+
586
+ elif method == "LCP" or method == "RLCP":
587
+ LCP = c_code.LCPdistance(dseqs_num,
588
+ norm_num,
589
+ sign,
590
+ refseq_id)
591
+ dist_matrix = LCP.compute_all_distances()
592
+
593
+ elif method == "LCPspell" or method == "RLCPspell":
594
+ LCPspell = c_code.LCPspellDistance(dseqs_num,
595
+ dseqs_dur,
596
+ _seqlength,
597
+ norm_num,
598
+ sign,
599
+ refseq_id,
600
+ expcost)
601
+ dist_matrix = LCPspell.compute_refseq_distances()
602
+
603
+ dist_matrix = dist_matrix[seqdata_didxs1[:, None], seqdata_didxs2[None, :]]
604
+
605
+ dist_matrix = pd.DataFrame(dist_matrix, index=seqdata.ids[refseq[0]], columns=seqdata.ids[refseq[1]])
606
+
607
+ else:
608
+ refseq_id = np.array([-1, -1])
609
+
610
+ if method == "OM":
611
+ om = c_code.OMdistance(dseqs_num,
612
+ sm,
613
+ indel,
614
+ norm_num,
615
+ lengths,
616
+ refseq_id)
617
+ dist_matrix = om.compute_all_distances()
618
+
619
+ elif method == "OMspell":
620
+ om = c_code.OMspellDistance(dseqs_num,
621
+ sm,
622
+ indel,
623
+ norm_num,
624
+ refseq_id,
625
+ expcost,
626
+ dseqs_dur,
627
+ indellist,
628
+ _seqlength)
629
+ dist_matrix = om.compute_all_distances()
630
+
631
+ elif method == "HAM" or method == "DHD":
632
+ DHD = c_code.DHDdistance(dseqs_num,
633
+ sm,
634
+ norm_num,
635
+ max_cost,
636
+ refseq_id)
637
+ dist_matrix = DHD.compute_all_distances()
638
+
639
+ elif method == "LCP" or method == "RLCP":
640
+ LCP = c_code.LCPdistance(dseqs_num,
641
+ norm_num,
642
+ sign,
643
+ refseq_id)
644
+ dist_matrix = LCP.compute_all_distances()
645
+
646
+ elif method == "LCPspell" or method == "RLCPspell":
647
+ LCPspell = c_code.LCPspellDistance(dseqs_num,
648
+ dseqs_dur,
649
+ _seqlength,
650
+ norm_num,
651
+ sign,
652
+ refseq_id,
653
+ expcost)
654
+ dist_matrix = LCPspell.compute_all_distances()
655
+
656
+ _matrix = c_code.dist2matrix(nseqs, seqdata_didxs, dist_matrix)
657
+ _dist2matrix = _matrix.padding_matrix()
658
+
659
+ if full_matrix == True and refseq == None:
660
+ dist_matrix = pd.DataFrame(_dist2matrix, index=seqdata.ids, columns=seqdata.ids)
661
+
662
+ elif full_matrix == False and refseq != None:
663
+ print("[!] Sequenzo returned a full distance matrix because 'refseq' is not None. This is same as TraMineR.")
664
+
665
+ elif full_matrix == False and refseq == None:
666
+ dist_matrix = squareform(_dist2matrix)
667
+
668
+ print("[>] Computed Successfully.")
669
+ return dist_matrix
670
+
671
+
672
+
673
def adaptSmForHAM(sm, nstates, ncols):
    """Replicate a substitution-cost matrix along a new leading axis.

    Parameters
    ----------
    sm : array-like
        Substitution-cost matrix. For a 2-D input of shape ``(r, c)`` the
        result has shape ``(ncols, r, c)``.
    nstates : int
        Accepted for interface compatibility; not used by this function.
    ncols : int
        Number of copies to stack (the caller passes the number of
        sequence columns).

    Returns
    -------
    numpy.ndarray
        ``sm`` tiled ``ncols`` times along the first axis.
    """
    repetitions = (ncols, 1, 1)
    return np.tile(sm, repetitions)
676
+
677
def getElementsNumber(x):
    """Return how many elements ``x`` holds.

    * ``pandas.DataFrame`` -> number of columns (``x.shape[1]``)
    * ``numpy.ndarray``    -> total number of entries (``x.size``)
    * ``list`` / ``tuple`` -> ``len(x)``
    * anything else        -> ``1`` (treated as a single value)
    """
    if isinstance(x, pd.DataFrame):
        return x.shape[1]
    if isinstance(x, np.ndarray):
        return x.size
    if isinstance(x, (list, tuple)):
        return len(x)
    # Scalars, strings and any other object count as one element.
    return 1
684
+
685
+
686
if __name__ == '__main__':
    # Ad-hoc driver: load one local CSV, build a SequenceData object, compute
    # a distance matrix and print the elapsed wall-clock time plus the result.
    # The commented-out sections below are alternative data sets / settings
    # kept for manual experiments.
    from sequenzo import *

    start_time = time.time()

    # tracemalloc.start()

    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/files/sampled_data_sets/broad_data/sampled_30000_data.csv")
    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/files/orignal data/detailed_sequence_10_work_years_df.csv")

    # ---------------------------------------------------------------
    # Experiment: Sohee
    # ---------------------------------------------------------------
    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
    # time_list = list(df.columns)[1:133]
    # states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
    # # states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
    # labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
    # sequence_data = SequenceData(df, time=time_list, states=states, labels=labels, id_col="PID")
    # om = get_distance_matrix(sequence_data, method="OM", sm="TRATE", indel="auto")

    # om.to_csv("D:/college/research/QiQi/sequenzo/files/sequenzo_Sohee_string_OM_TRATE.csv", index=True)

    # ---------------------------------------------------------------
    # Experiment: kass
    # ---------------------------------------------------------------
    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
    # time_list = list(df.columns)[1:]
    # states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
    #           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
    # sequence_data = SequenceData(df, time=time_list, time_type="year", states=states, id_col="COUNTRY")
    # om = get_distance_matrix(sequence_data, method="RLCP", sm="TRATE", indel="auto")

    # ---------------------------------------------------------------
    # Active run: CO2 emissions data, OMspell with TRATE costs
    # ---------------------------------------------------------------
    co2_frame = pd.read_csv("D:/country_co2_emissions_missing.csv")
    # Every column after the first is passed as a time point.
    time_columns = list(co2_frame.columns)[1:]
    co2_states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
    co2_sequences = SequenceData(co2_frame, time=time_columns, id_col="country", states=co2_states)
    om = get_distance_matrix(co2_sequences, method="OMspell", sm="TRATE", indel="auto")

    # ---------------------------------------------------------------
    # Experiment: detailed
    # ---------------------------------------------------------------
    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
    # _time = list(df.columns)[4:]
    # states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
    #                              time_type="age", time=_time, id_col="worker_id", states=states)
    # # refseq = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [142, 85]]
    # om = get_distance_matrix(sequence_data, method="OM", sm="TRATE", indel="auto")

    # ---------------------------------------------------------------
    # Experiment: broad
    # ---------------------------------------------------------------
    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
    # _time = list(df.columns)[4:]
    # states = ['Non-computing', 'Non-technical computing', 'Technical computing']
    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
    #                              time_type="age", time=_time, id_col="worker_id", states=states)
    # om = get_distance_matrix(sequence_data, method="DHD", sm="TRATE", indel="auto")

    # refseq = [[0, 1, 2], [99, 100]]
    # print(om)

    # snapshot = tracemalloc.take_snapshot()
    # top_stats = snapshot.statistics('lineno')
    # for stat in top_stats[:10]:
    #     print(stat)

    # Report timing first, then the computed distances.
    print("================")
    end_time = time.time()
    print(f"[>] Total time: {end_time - start_time:.2f} seconds")
    print(om)