sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,597 @@
1
+ """
2
+ @Author : Xinyi Li 李欣怡, Yuqi Liang 梁彧祺
3
+ @File : cat.py
4
+ @Time : 2025/4/8 09:06
5
+ @Desc : Build multidomain (MD) sequences of combined individual domain states (expanded alphabet),
6
+ derive multidomain indel and substitution costs from domain costs by means of an additive trick (CAT),
7
+ and compute OM pairwise distances using CAT costs.
8
+ """
9
+ import numpy as np
10
+ import pandas as pd
11
+ from typing import List, Union, Optional
12
+ import contextlib
13
+ import io
14
+
15
+ from sequenzo.define_sequence_data import SequenceData
16
+ from sequenzo.dissimilarity_measures.utils import seqlength
17
+ from sequenzo.dissimilarity_measures import get_distance_matrix, get_substitution_cost_matrix
18
+
19
+
20
+ def compute_cat_distance_matrix(channels: List[SequenceData],
21
+ method: Optional[str] = None,
22
+ norm: str = "none",
23
+ indel: Union[float, np.ndarray, List[Union[float, List[float]]]] = "auto",
24
+ sm: Optional[Union[List[str], List[np.ndarray]]] = None,
25
+ with_missing: Optional[Union[bool, List[bool]]] = None,
26
+ full_matrix: bool = True,
27
+ link: str = "sum",
28
+ cval: float = 2,
29
+ miss_cost: float = 2,
30
+ cweight: Optional[List[float]] = None,
31
+ what: str = "MDseq",
32
+ ch_sep: str = "+"):
33
+ """
34
+ Multidomain sequence analysis. Depending on `what`, you can get:
35
+ - multi-domain sequences ('MDseq')
36
+ - multi-domain substitution and indel costs ('cost')
37
+ - multi-domain distance_matrix ('diss')
38
+
39
+ :param channels: A list of domain state sequence objects (SequenceData), one per domain
40
+ :param method: Dissimilarity measure between sequences.
41
+ :param norm: The normalization method to use. Ignored if what is not "diss".
42
+ :param indel: An insertion/deletion cost or a vector of state dependent indel costs for each domain.
43
+ :param sm: A list with a substitution-cost matrix for each domain,
44
+ or a list of method names for generating the domain substitution costs
45
+ :param with_missing: Whether to consider missing values
46
+ :param full_matrix: the full distance matrix between MD sequences is returned.
47
+ :param link: Method to compute the "link" between domains.
48
+ :param cval: Domain substitution cost for "CONSTANT" matrix, for seqcost
49
+ :param miss_cost: Cost to substitute missing values at domain level, for seqcost
50
+ :param cweight: A vector of domain weights.
51
+ :param what: What output should be returned?
52
+ :param ch_sep: Separator used for building state names of the expanded alphabet.
53
+ """
54
+
55
+ # ==================
56
+ # Checking Arguments
57
+ # ==================
58
+ if what == "sm":
59
+ print("[!] what='sm' deprecated! Use what='cost' instead.")
60
+ what = "cost"
61
+ elif what == "seqmc":
62
+ print("[!] what='seqmc' deprecated! Use what='MDseq' instead.")
63
+ what = "MDseq"
64
+
65
+ valid_whats = ["MDseq", "cost", "diss"]
66
+ if what not in valid_whats:
67
+ raise ValueError(f"[!] 'what' should be one of {valid_whats}.")
68
+
69
+ if what == "diss" and not method:
70
+ raise ValueError("[!] A valid 'method' must be provided when what = 'diss'.")
71
+ if what == "cost" and sm is None:
72
+ raise ValueError("[!] 'sm' cannot be NULL when what = 'cost'.")
73
+
74
+ nchannels = len(channels)
75
+ if nchannels < 2:
76
+ raise ValueError("[!] Please specify at least two domains.")
77
+
78
+ # Check cweight
79
+ if cweight is None:
80
+ cweight = np.repeat(1.0, nchannels)
81
+
82
+ # If time varying sm are provided, all sm must be 3-dimensional
83
+ timeVarying = False
84
+ if isinstance(sm, list) and isinstance(sm[0], np.ndarray):
85
+ ndims = [arr.ndim for arr in sm]
86
+ if any(d == 3 for d in ndims) and not all(d == 3 for d in ndims):
87
+ raise ValueError("[!] One sm is 3-dimensional and some are not.")
88
+
89
+ if ndims[0] == 3:
90
+ timeVarying = True
91
+
92
+ # Check indel
93
+ # Convert all elements in indel(list) to list
94
+ if isinstance(indel, (float, int)):
95
+ indel = [indel] * nchannels
96
+ indel = [[x] for x in indel]
97
+
98
+ if isinstance(indel, np.ndarray):
99
+ indel = [[x] for x in indel.tolist()]
100
+
101
+ if isinstance(indel, list) and isinstance(indel[0], (float, int)):
102
+ indel = [[x] for x in indel]
103
+
104
+ if len(indel) > 1 and any(indel == "auto" for indel in indel):
105
+ raise ValueError("[!] 'auto' not allowed in vector or list indel.")
106
+
107
+ if isinstance(indel, list) and len(indel) == 1:
108
+ raise ValueError("[!] When a list or vector, indel must be of length equal to number of domains.")
109
+
110
+ if isinstance(indel, list) and len(indel) != nchannels:
111
+ raise ValueError("[!] When a list or vector, indel must be of length equal to number of domains.")
112
+
113
+ # Check missing
114
+ has_miss = np.repeat(False, nchannels)
115
+
116
+ for i in range(nchannels):
117
+ channel = channels[i]
118
+ alphabet = channel.states
119
+
120
+ # Check separator
121
+ if any(ch_sep in str(s) for s in alphabet):
122
+ raise ValueError(f"[!] 'ch.sep' symbol ({ch_sep}) occurs in alphabet of at least one channel.")
123
+
124
+ has_miss[i] = channel.ismissing
125
+ if with_missing is not None and has_miss[i] != with_missing[i]:
126
+ with_missing[i] = has_miss[i]
127
+ print(f"[!] Bad with.missing value for domain {i + 1}. I set it as {has_miss[i]}.")
128
+
129
+ if with_missing is None:
130
+ with_missing = has_miss
131
+
132
+ if isinstance(with_missing, bool) or len(with_missing) == 1:
133
+ with_missing = np.repeat(with_missing, nchannels)
134
+
135
+ if len(with_missing) > 1 and len(with_missing) != nchannels:
136
+ raise ValueError("[!] When a vector, with.missing must be of length equal to number of domains.")
137
+
138
+ # Check number of sequences for each channel
139
+ first_len = channels[0].seqdata.shape[0]
140
+ if not all(channel.seqdata.shape[0] == first_len for channel in channels):
141
+ raise ValueError("[!] sequence objects have different numbers of rows.")
142
+
143
+ numseq = first_len
144
+
145
+ print(f"[>] {nchannels} domains with {numseq} sequences.")
146
+ # Actually LCP and RLCP are not included
147
+
148
+ # Check what : method, sm
149
+ if what == "diss":
150
+ metlist = ["OM", "LCS", "DHD", "HAM"]
151
+
152
+ if method not in metlist:
153
+ raise ValueError(f"[!] 'method' should be one of {metlist}.")
154
+ if not isinstance(sm, list):
155
+ raise ValueError(f"[!] 'sm' should be a list.")
156
+
157
+ if method == "LCS":
158
+ method = "OM"
159
+ sm = "CONSTANT"
160
+ indel = list(np.repeat(indel, nchannels))
161
+ cval = 2
162
+ miss_cost = 2
163
+
164
+ timeVarying = method == "DHD"
165
+
166
+ if sm is None:
167
+ costmethod = "CONSTANT"
168
+ if method == "DHD":
169
+ costmethod = "TRATE"
170
+ sm = list(np.repeat(costmethod, nchannels))
171
+
172
+ if len(sm) == 1 and sm[0] in ["CONSTANT", "TRATE", "INDELS", "INDELSLOG"]:
173
+ sm = list(np.repeat(sm, nchannels))
174
+
175
+ # Checking correct numbers of info per channel
176
+ if what != "MDseq":
177
+ if len(sm) != nchannels or len(cweight) != nchannels:
178
+ raise ValueError("[!] You must supply one weight, one substitution matrix, and one indel per domain.\n"
179
+ " Hint: The length of `sm` or `cweight` does not match the number of domains.")
180
+
181
+ # Checking that all channels have the same length
182
+ slength1 = seqlength(channels[1])
183
+ for i in range(1, nchannels):
184
+ if not np.array_equal(slength1, seqlength(channels[i])):
185
+ print("[!] Cases with sequences of different length across domains.")
186
+ break
187
+
188
+ substmat_list = [] # subsitution matrix
189
+ indel_list = [] # indels per channel
190
+ alphabet_list = [] # alphabet for each channel
191
+ alphsize_list = [] # alphabet size per channel
192
+ maxlength_list = np.zeros(nchannels) # seqlenth of each channels
193
+
194
+ # Storing number of columns and cnames
195
+ for i in range(nchannels):
196
+ maxlength_list[i] = channels[i].seqdata.shape[1]
197
+ max_index = np.argmax(maxlength_list)
198
+ md_cnames = channels[max_index].seqdata.columns
199
+
200
+ print("[>] Building MD sequences of combined states.")
201
+
202
+ # ================================
203
+ # Building the New Sequence Object
204
+ # ================================
205
+ maxlength = int(np.max(maxlength_list))
206
+ newseqdata = np.full((numseq, maxlength), "", dtype='U256')
207
+
208
+ for i in range(nchannels):
209
+ seqchan = channels[i].values.copy()
210
+ # Convert numeric codes back to state names using inverse mapping
211
+ inverse_mapping = channels[i].inverse_state_mapping
212
+
213
+ # Handle missing values: if "Missing" is in states, it has a normal mapping
214
+ # If not, missing values (NaN) map to len(states) as the default
215
+ # We need to ensure this code maps to the actual missing state name
216
+ missing_code = len(channels[i].states)
217
+ if channels[i].ismissing and missing_code not in inverse_mapping:
218
+ # Find the missing state name (could be "Missing" or np.nan)
219
+ missing_state = None
220
+ for s in channels[i].states:
221
+ if pd.isna(s) or (isinstance(s, str) and s.lower() == "missing"):
222
+ missing_state = s
223
+ break
224
+ if missing_state is not None:
225
+ # "Missing" is in states, so it should already be in inverse_mapping
226
+ # But if it's not, add it
227
+ if missing_state not in inverse_mapping.values():
228
+ # Find what code "Missing" maps to
229
+ for code, state in inverse_mapping.items():
230
+ if state == missing_state:
231
+ break
232
+ else:
233
+ # "Missing" not found in mapping, add missing_code -> missing_state
234
+ inverse_mapping[missing_code] = missing_state
235
+
236
+ for j in range(maxlength):
237
+ if j < maxlength_list[i]:
238
+ # Convert numeric codes to state names
239
+ # Codes are already integers from .values, but convert to int to be safe
240
+ def code_to_state(code):
241
+ code_int = int(code)
242
+ # Use inverse mapping to get state name
243
+ state_name = inverse_mapping.get(code_int)
244
+ if state_name is None:
245
+ # Code not found in mapping - this shouldn't happen with valid data
246
+ # But handle it gracefully by returning the code as string
247
+ # This will help identify data issues
248
+ return str(code_int)
249
+ # Convert state name to string (handles np.nan case)
250
+ if pd.isna(state_name):
251
+ return "Missing"
252
+ return str(state_name)
253
+
254
+ newcol = np.array([code_to_state(code) for code in seqchan[:, j]], dtype='U256')
255
+
256
+ # TraMineR default missing value is legal, and we already do this.
257
+ # newseqdataNA[,j] <- newseqdataNA[,j] & newCol == void
258
+
259
+ # SequenceData has no attributes void, so we default fill with missing value (np.nan)
260
+ # if (fill.with.miss == TRUE & has.miss[i] & any(newCol == void)) {
261
+ # newCol[newCol == void] < - nr
262
+ # }
263
+
264
+ else:
265
+ newcol = np.repeat("", numseq)
266
+
267
+ if i > 0:
268
+ newseqdata[:, j] = np.char.add(np.char.add(newseqdata[:, j], ch_sep), newcol)
269
+ else:
270
+ newseqdata[:, j] = newcol
271
+
272
+ # Get unique states in order of first appearance (like R's seqdef)
273
+ # np.unique sorts, but we need to preserve order of first appearance to match TraMineR
274
+ # TraMineR's seqdef uses the order of first appearance in the data
275
+ # Exclude empty strings (void) and NaN values, matching R's behavior
276
+ seen = set()
277
+ states_space = []
278
+ for i in range(numseq):
279
+ for j in range(maxlength):
280
+ val = newseqdata[i, j]
281
+ # Exclude empty strings (void) and NaN values
282
+ if val and val.strip() and not pd.isna(val) and val not in seen:
283
+ seen.add(val)
284
+ states_space.append(val)
285
+
286
+ print(" - OK.")
287
+
288
+ if what == "MDseq":
289
+ return newseqdata
290
+ else:
291
+ # ==================================================
292
+ # Building Substitution Matrix and Indel Per Channel
293
+ # ==================================================
294
+ for i in range(nchannels):
295
+ channel = channels[i]
296
+
297
+ if not isinstance(channel, SequenceData):
298
+ raise ValueError("[!] Channel ", i,
299
+ " is not a state sequence object, use 'seqdef' function to create one.")
300
+
301
+ # Use the actual states from the channel (like TraMineR uses attr(channels[[i]],"alphabet"))
302
+ # TraMineR: alphabet_list[[i]] <- attr(channels[[i]],"alphabet")
303
+ # Important: We need to preserve the exact order of channel.states for proper indexing
304
+ # Convert states to strings to match what's in MD sequences
305
+ # Store original states list for reference (before adding missing)
306
+ original_states = channel.states.copy()
307
+ states = [str(s) if not pd.isna(s) else "Missing" for s in original_states]
308
+
309
+ alphabet_list.append(states)
310
+ alphsize_list.append(len(states))
311
+
312
+ # Pre-process indel
313
+ if indel != "auto" and len(indel[i]) == 1:
314
+ indel[i] = np.repeat(indel[i], alphsize_list[i])
315
+
316
+ # Substitution matrix generation method is given
317
+ if isinstance(sm[i], str):
318
+ print(f"[>] Computing substitution cost matrix for domain {i}.")
319
+
320
+ with contextlib.redirect_stdout(io.StringIO()):
321
+ costs = get_substitution_cost_matrix(channel, sm[i],
322
+ time_varying=timeVarying,
323
+ cval=cval,
324
+ miss_cost=miss_cost)
325
+ sm_matrix = costs['sm']
326
+ substmat_list.append(sm_matrix)
327
+
328
+ if "auto" == indel:
329
+ # costs['indel'] may include "null" at index 0 for some methods, but we only need state indels
330
+ # Extract state indels (skip index 0 which is "null" if present)
331
+ indel_val = costs['indel']
332
+ if isinstance(indel_val, np.ndarray) and len(indel_val) > alphsize_list[i]:
333
+ # Array has "null" at index 0, extract only state indels
334
+ state_indel = indel_val[1:]
335
+ elif np.isscalar(indel_val):
336
+ # Scalar indel, use as-is
337
+ state_indel = indel_val
338
+ else:
339
+ # Array with correct length (no "null" entry)
340
+ state_indel = indel_val
341
+
342
+ # If it's a scalar or single-element array, repeat it for all states
343
+ if np.isscalar(state_indel) or (isinstance(state_indel, np.ndarray) and state_indel.size == 1):
344
+ indel_list.append(np.repeat(state_indel if np.isscalar(state_indel) else state_indel[0], alphsize_list[i]))
345
+ else:
346
+ # Already an array with correct length
347
+ indel_list.append(state_indel)
348
+ else:
349
+ indel_list.append(indel[i])
350
+
351
+ else: # Provided sm
352
+ substmat_list.append(sm[i])
353
+
354
+ if "auto" == indel:
355
+ indel_list.append(np.repeat(np.max(sm[i]) / 2, alphsize_list[i]))
356
+ else:
357
+ indel_list.append(indel[i])
358
+
359
+ # Multiply by channel weight
360
+ substmat_list[i] = cweight[i] * substmat_list[i]
361
+
362
+ if "auto" == indel:
363
+ indel = indel_list
364
+
365
+ # =============================================
366
+ # Building the New CAT Substitution Cost Matrix
367
+ # =============================================
368
+ print("[>] Computing MD substitution and indel costs with additive trick.")
369
+
370
+ # Build new substitution matrix and new alphabet
371
+ alphabet = states_space
372
+ alphabet_size = len(alphabet)
373
+ newindel = None
374
+
375
+ # Recomputing the substitution matrix
376
+ if not timeVarying:
377
+ newsm = np.zeros((alphabet_size, alphabet_size))
378
+ newindel = np.zeros(alphabet_size)
379
+
380
+ # To reduce redundancy, we simply merged the code for retrieving sm and indel
381
+ statelisti = alphabet[alphabet_size - 1].split(ch_sep)
382
+ for i in range(nchannels):
383
+ state = statelisti[i]
384
+ ipos = alphabet_list[i].index(state)
385
+
386
+ newindel[alphabet_size - 1] += indel[i][ipos] * cweight[i]
387
+
388
+ for i in range(alphabet_size - 1):
389
+ statelisti = alphabet[i].split(ch_sep)
390
+
391
+ for chan in range(nchannels):
392
+ state = statelisti[chan]
393
+ ipos = alphabet_list[chan].index(state)
394
+
395
+ newindel[i] += indel[chan][ipos] * cweight[chan]
396
+
397
+ for j in range(i + 1, alphabet_size):
398
+ cost = 0
399
+ statelistj = alphabet[j].split(ch_sep)
400
+
401
+ for chan in range(nchannels):
402
+ state_i = statelisti[chan] # State string from MD sequence (e.g., "1", "2")
403
+ state_j = statelistj[chan] # State string from MD sequence
404
+
405
+ if isinstance(substmat_list[chan], pd.DataFrame):
406
+ state_i_str = str(state_i)
407
+ state_j_str = str(state_j)
408
+ if state_i_str not in substmat_list[chan].index or state_j_str not in substmat_list[chan].columns:
409
+ raise ValueError(f"State {state_i_str} or {state_j_str} not found in substitution matrix for channel {chan}. "
410
+ f"Available indices: {list(substmat_list[chan].index)}")
411
+ cost += substmat_list[chan].loc[state_i_str, state_j_str]
412
+ else:
413
+ # numpy array doesn't have "null" row/column, use index directly
414
+ # Get 0-based index in alphabet_list
415
+ ipos_base = alphabet_list[chan].index(state_i)
416
+ jpos_base = alphabet_list[chan].index(state_j)
417
+ cost += substmat_list[chan][ipos_base, jpos_base]
418
+
419
+ newsm[i, j] = cost
420
+ newsm[j, i] = cost
421
+
422
+ else:
423
+ # Recomputing time varying substitution
424
+ newsm = np.zeros((maxlength, alphabet_size, alphabet_size))
425
+
426
+ for t in range(maxlength):
427
+ for i in range(alphabet_size - 1):
428
+ statelisti = alphabet[i].split(ch_sep)
429
+
430
+ for j in range(i + 1, alphabet_size):
431
+ cost = 0
432
+ statelistj = alphabet[j].split(ch_sep)
433
+
434
+ for chan in range(nchannels):
435
+ # For time-varying matrices, there is no "null" row/column
436
+ # TraMineR: ipos <- match(statelisti[chan], alphabet_list[[chan]])
437
+ # cost <- cost + substmat_list[[chan]][ipos, jpos, t]
438
+ # match() returns 1-based index in R, but we use 0-based index in Python
439
+ state_i = statelisti[chan]
440
+ state_j = statelistj[chan]
441
+ ipos = alphabet_list[chan].index(state_i)
442
+ jpos = alphabet_list[chan].index(state_j)
443
+
444
+ # For time-varying, substmat_list[chan] is a 3D numpy array: (time, states, states)
445
+ # No "null" row/column, so use index directly
446
+ if isinstance(substmat_list[chan], np.ndarray) and substmat_list[chan].ndim == 3:
447
+ cost += substmat_list[chan][t, ipos, jpos]
448
+ else:
449
+ # Fallback for DataFrame (shouldn't happen for time-varying, but just in case)
450
+ # DataFrame has no "null" row/column after removal, use .loc with state labels
451
+ if isinstance(substmat_list[chan], pd.DataFrame):
452
+ cost += substmat_list[chan].loc[state_i, state_j]
453
+ else:
454
+ cost += substmat_list[chan][t, ipos, jpos]
455
+
456
+ newsm[t, i, j] = cost
457
+ newsm[t, j, i] = cost
458
+
459
+ print(" - OK.")
460
+
461
+ # Indel as sum
462
+ # When newindel is None and indel is not state-dependent (simple vector), compute sum
463
+ # TraMineR: if (is.null(newindel) & !is.list(indel_list)) newindel <- sum(indel*cweight)
464
+ if newindel is None:
465
+ # Check if indel is state-dependent (any element has length > 1)
466
+ is_state_dependent = False
467
+ if isinstance(indel, list) and len(indel) > 0:
468
+ # Check if any indel[i] has more than one element (state-dependent)
469
+ for ind in indel:
470
+ if isinstance(ind, (list, np.ndarray)) and len(ind) > 1:
471
+ is_state_dependent = True
472
+ break
473
+
474
+ if is_state_dependent:
475
+ # State-dependent indel: should have been computed above
476
+ # If we reach here, it means we have state-dependent indels but didn't compute newindel
477
+ # This shouldn't happen, but fallback to computing it
478
+ newindel = np.zeros(alphabet_size)
479
+ for i in range(alphabet_size):
480
+ statelisti = alphabet[i].split(ch_sep)
481
+ for chan in range(nchannels):
482
+ state = statelisti[chan]
483
+ ipos = alphabet_list[chan].index(state)
484
+ indel_val = indel[chan][ipos] if isinstance(indel[chan], (list, np.ndarray)) else indel[chan]
485
+ newindel[i] += indel_val * cweight[chan]
486
+ else:
487
+ # Simple vector: sum(indel * cweight) like TraMineR
488
+ # Extract single values from each channel's indel
489
+ indel_values = []
490
+ for ind in indel:
491
+ if isinstance(ind, (list, np.ndarray)):
492
+ indel_values.append(ind[0] if len(ind) > 0 else 1.0)
493
+ else:
494
+ indel_values.append(ind)
495
+ newindel = np.sum(np.array(indel_values) * np.array(cweight))
496
+
497
+ # If we want the mean of cost
498
+ if link == "mean":
499
+ newindel = newindel / np.sum(cweight)
500
+ newsm = newsm / np.sum(cweight)
501
+
502
+ if what == "cost":
503
+ return {
504
+ "sm": newsm,
505
+ "indel": newindel,
506
+ "alphabet": alphabet,
507
+ "cweight": cweight
508
+ }
509
+
510
+ if what == "diss":
511
+ if np.any(np.isnan(newsm)) or np.any(np.isnan(newindel)):
512
+ raise ValueError("NA values found in substitution or indel costs. Cannot compute MD distances.")
513
+
514
+ print("[>] Computing MD distances using additive trick.")
515
+
516
+ newseqdata_df = pd.DataFrame(newseqdata, columns=md_cnames)
517
+ newseqdata_df.insert(0, channels[0].id_col, channels[0].ids)
518
+
519
+ # Reconstruct multi-domain labels for composite states
520
+ domain_labels = [channel.labels for channel in channels] # e.g., [["At home", "Left home"], ["No child", "Child"]]
521
+
522
+ md_labels = []
523
+ for md_state in states_space:
524
+ parts = md_state.split(ch_sep) # e.g., ["0", "1"]
525
+ if len(parts) != len(domain_labels):
526
+ md_labels.append(md_state) # fallback if structure doesn't match
527
+ else:
528
+ label_parts = []
529
+ for val, dom_lab in zip(parts, domain_labels):
530
+ try:
531
+ label_parts.append(dom_lab[int(val)])
532
+ except (ValueError, IndexError):
533
+ label_parts.append(str(val)) # fallback if unexpected value
534
+ md_labels.append(" + ".join(label_parts))
535
+
536
+ with contextlib.redirect_stdout(io.StringIO()):
537
+ newseqdata_seq = SequenceData(newseqdata_df,
538
+ time=md_cnames,
539
+ states=states_space,
540
+ labels=md_labels,
541
+ id_col=channels[0].id_col)
542
+
543
+ # Pass newindel as-is (can be scalar or vector depending on state-dependency)
544
+ # TraMineR passes the full newindel vector/scalar to seqdist
545
+ temp_newsm = pd.DataFrame(newsm, index=alphabet, columns=alphabet)
546
+ with contextlib.redirect_stdout(io.StringIO()):
547
+ diss_matrix = get_distance_matrix(newseqdata_seq,
548
+ method=method,
549
+ norm=norm,
550
+ indel=newindel,
551
+ sm=newsm,
552
+ full_matrix=full_matrix)
553
+ print(" - OK.")
554
+
555
+ diss_matrix = pd.DataFrame(diss_matrix, index=channels[0].ids, columns=channels[0].ids)
556
+ return diss_matrix
557
+
558
+
559
if __name__ == "__main__":
    # Manual smoke run: load two CSV files from a developer-local folder, turn
    # each into a SequenceData domain, and print the multidomain distance
    # matrix returned by compute_cat_distance_matrix(what="diss").
    import os

    root = "/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data"

    # One entry per domain: (CSV file name, ordered state list).
    # The same list object is passed as both `states` and `labels`.
    domain_specs = [
        (
            "country_co2_emissions_Without_missing_values.csv",
            ['Very Low', 'Low', 'Middle', 'High', 'Very High'],
        ),
        (
            "country_co2_emissions_global_deciles_Without_missing_values.csv",
            ['D1 (Very Low)', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10 (Very High)'],
        ),
    ]

    # Resolve every path, then read every file, before building any sequence
    # object (same ordering of work as a straight-line script).
    csv_paths = [os.path.join(root, csv_name) for csv_name, _ in domain_specs]
    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]

    # Every column after the first one is used as a time point
    # (presumably the first column is the "country" id — verify against the data).
    time_lists = [list(frame.columns)[1:] for frame in frames]

    sequence_list = [
        SequenceData(frame,
                     time=time_columns,
                     id_col="country",
                     states=domain_states,
                     labels=domain_states)
        for frame, time_columns, (_, domain_states) in zip(frames, time_lists, domain_specs)
    ]

    md_distances = compute_cat_distance_matrix(
        channels=sequence_list,
        method="OM",
        sm=["CONSTANT", "TRATE"],
        indel=[2, 10],
        what="diss",
    )
    print(md_distances)

    # To persist the result instead of only printing it:
    # out_path = os.path.join(root, "CO2_MD_python_result_OM_TRATE_diss.csv")
    # md_distances.to_csv(out_path, index=False)