sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,519 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : combt.py
4
+ @Time : 15/04/2025 21:30
5
+ @Desc : Modular utility functions for CombT (Combined Typology) strategy.
6
+ Split into reusable components to give users control over distance calculation, clustering, and label merging.
7
+ """
8
+ from collections import Counter
9
+ from typing import List
10
+ import numpy as np
11
+ import pandas as pd
12
+ import matplotlib
13
+ import os
14
+
15
+ # Set to Agg backend if no DISPLAY environment variable (server/terminal)
16
+ # if os.environ.get("DISPLAY", "") == "":
17
+ # matplotlib.use("Agg")
18
+ import matplotlib.pyplot as plt
19
+ import seaborn as sns
20
+ from sklearn.metrics import silhouette_score
21
+
22
+ from sequenzo.dissimilarity_measures import get_distance_matrix
23
+ from sequenzo.clustering.hierarchical_clustering import Cluster, ClusterQuality
24
+ from sequenzo.visualization.utils.utils import save_and_show_results
25
+
26
+
27
def _compute_domain_distances(sequence_objects, method_params) -> List[np.ndarray]:
    """
    Build one dissimilarity matrix per domain.

    Every parameter set is validated before any distance computation starts,
    so a malformed entry fails fast instead of after earlier domains have
    already been processed.

    Parameters:
    - sequence_objects: List of sequence data objects, one per domain
    - method_params: List of keyword-argument dictionaries forwarded to
      get_distance_matrix, one per domain; each must contain the key "method"

    Returns:
    - List with the result of get_distance_matrix for each domain, in input order

    Raises:
    - ValueError: if method_params is None, its length differs from the number
      of domains, or any entry lacks the "method" key
    """
    # Guard clauses: the two mismatch situations share the same message.
    if method_params is None:
        raise ValueError("[CombT] Number of method_params must match number of domains.")
    if len(method_params) != len(sequence_objects):
        raise ValueError("[CombT] Number of method_params must match number of domains.")

    for position, domain_params in enumerate(method_params):
        if "method" not in domain_params:
            raise ValueError(f"[CombT] Required parameter 'method' missing in method_params[{position}]")

    # Each domain is paired with its own parameter dictionary.
    return [
        get_distance_matrix(seqdata=domain_seq, **domain_params)
        for domain_seq, domain_params in zip(sequence_objects, method_params)
    ]
51
+
52
+
53
def _assemble_combined_typology(cluster_labels: List[np.ndarray], ids: np.ndarray, sep: str = "+") -> pd.Series:
    """
    Assemble the combined typology from individual domain cluster labels.

    For every sequence, the string forms of its per-domain cluster labels are
    joined with `sep` (e.g. labels 1 and 3 become "1+3").

    Parameters:
    - cluster_labels: List of cluster label arrays, one per domain
    - ids: Array of sequence IDs
    - sep: Separator to use for combining label strings (default: "+")

    Returns:
    - Pandas Series named "CombT" containing the combined typology labels with IDs as index

    Raises:
    - ValueError: if cluster_labels is empty or its arrays differ in length
    """
    # Explicit checks instead of `assert`: assertions are stripped under `python -O`,
    # which would let mismatched inputs through silently.
    if len(cluster_labels) == 0:
        raise ValueError("[CombT] At least one cluster label array is required.")

    n = len(cluster_labels[0])
    if any(len(cl) != n for cl in cluster_labels):
        raise ValueError("[CombT] Cluster label arrays must have the same length.")

    # zip(*...) walks the domains in lockstep by position, so the result does not
    # depend on how each label container implements integer indexing.
    combined = [sep.join(str(label) for label in row) for row in zip(*cluster_labels)]
    return pd.Series(combined, index=ids, name="CombT")
69
+
70
+
71
def _get_combt_membership_table(ids: np.ndarray,
                                cluster_labels: List[np.ndarray],
                                combined_typology: pd.Series,
                                domain_names: List[str] = None) -> pd.DataFrame:
    """
    Build the per-ID membership table: one cluster column per domain plus the combined typology.

    Parameters:
    - ids: Array of sequence IDs, used as the table index
    - cluster_labels: List of cluster label arrays, one per domain
    - combined_typology: Combined typology labels
    - domain_names: Optional list of domain names; when missing or empty the
      names Domain_1, Domain_2, ... are generated

    Returns:
    - DataFrame with one "<name>_Cluster" column per domain followed by a "CombT" column
    """
    # Fall back to generated names when none (or an empty list) were supplied.
    if not domain_names:
        domain_names = [f"Domain_{position}" for position in range(1, len(cluster_labels) + 1)]

    table = pd.DataFrame(index=ids)
    for domain_label, memberships in zip(domain_names, cluster_labels):
        column = f"{domain_label}_Cluster"
        table[column] = memberships

    table["CombT"] = combined_typology
    return table
95
+
96
+
97
def get_interactive_combined_typology(domains, method_params, domain_names=None, norm="zscore",
                                      interactive=True, predefined_clusters=None):
    """
    Interactive or automated interface for the CombT workflow.

    Computes one distance matrix per domain, clusters every domain, joins the
    per-domain cluster labels into a combined typology and writes the results
    to the current working directory.

    Parameters:
    - domains: List of sequence data objects, one per domain. Each must expose an
      `ids` attribute; the ids of the first domain index the output table.
    - method_params: List of parameter dictionaries for distance computation, one per domain
    - domain_names: Optional list of domain names (default: Domain_1, Domain_2, etc.)
    - norm: Forwarded to ClusterQuality.plot_combined_scores (default: "zscore")
    - interactive: If True, the number of clusters per domain is always requested via input()
    - predefined_clusters: List with the number of clusters per domain. Only used when
      interactive is False. NOTE: if interactive is False and this is None, the
      function still falls back to prompting via input().

    Returns:
    - Tuple (diss_matrices, membership_df): the list of per-domain distance matrices
      and the membership DataFrame (domain cluster columns plus "CombT")

    Raises:
    - ValueError: if method_params is invalid, or predefined_clusters does not
      have one entry per domain
    - EOFError: if a cluster count is requested but standard input is closed

    Side effects:
    - Temporarily switches matplotlib to the Agg backend and restores the previous one on exit
    - Writes "combt_membership_table.csv", "freq_table.csv",
      "Frequency of Combined Typologies.png" and, in prompting mode,
      "Cluster Quality - <domain>.png" for every domain
    - Prints progress information to stdout
    """
    # Force the non-GUI Agg backend for the duration of the call so figures are
    # only written to disk; the previous backend is restored in `finally`.
    import matplotlib
    original_backend = matplotlib.get_backend()
    matplotlib.use('Agg')

    try:
        diss_matrices = _compute_domain_distances(domains, method_params)

        cluster_labels = []
        ids = domains[0].ids

        # Use domain_names if provided, otherwise generate default names
        domain_names = domain_names or [f"Domain_{i + 1}" for i in range(len(domains))]

        # Check if predefined clusters are provided for non-interactive mode
        if not interactive and predefined_clusters is not None:
            if len(predefined_clusters) != len(domains):
                raise ValueError("[CombT] Number of predefined clusters must match number of domains.")

            for i, (diss, seq) in enumerate(zip(diss_matrices, domains)):
                print(f"\n[>] Processing domain: {domain_names[i]}")
                clus = Cluster(matrix=diss, entity_ids=seq.ids)
                k = predefined_clusters[i]
                print(f"[>] Using predefined number of clusters for domain '{domain_names[i]}': {k}")
                labels = clus.get_cluster_labels(num_clusters=k)
                cluster_labels.append(labels)

        else:  # Interactive mode
            for i, (diss, seq) in enumerate(zip(diss_matrices, domains)):
                print(f"\n[>] Processing domain: {domain_names[i]}")
                clus = Cluster(matrix=diss, entity_ids=seq.ids)
                quality = ClusterQuality(clus)
                quality.compute_cluster_quality_scores()

                # The quality plot is best-effort: a failure is reported but does
                # not stop the workflow.
                fig = None
                try:
                    # Ask for the figure object without displaying it, then save it directly.
                    fig = quality.plot_combined_scores(norm=norm,
                                                       title=f"Cluster Quality - {domain_names[i]}",
                                                       show=False)
                    fig.savefig(f"Cluster Quality - {domain_names[i]}.png", dpi=200)

                    print(
                        f"[>] Cluster Quality - {domain_names[i]}.png has been saved. Please check it and then come back.\n")

                except Exception as e:
                    print(f"[!] Warning: Could not create or save plot: {e}")
                    print("[>] Continuing without visualization...")
                finally:
                    # Close the figure even when saving failed, so it is not leaked.
                    if fig is not None:
                        plt.close(fig)

                while True:
                    try:
                        k = int(input(f"[?] Enter number of clusters for domain '{domain_names[i]}': "))
                        labels = clus.get_cluster_labels(num_clusters=k)
                        cluster_labels.append(labels)
                        break
                    except EOFError:
                        # stdin is closed (e.g. batch job): re-prompting can never
                        # succeed, so propagate instead of looping forever.
                        raise
                    except Exception as e:
                        print(f"[!] Invalid input: {e}. Please try again.")

        combt_series = _assemble_combined_typology(cluster_labels, ids=ids)
        membership_df = _get_combt_membership_table(ids, cluster_labels, combt_series, domain_names)

        # Same flattened view is used for the preview and the CSV export.
        export_df = membership_df.reset_index().rename(columns={"index": "id"})

        print("\n[>] Combined Typology Membership Table Preview:")
        print(export_df.head())

        export_df.to_csv("combt_membership_table.csv", index=False)
        print("\n[>] combt_membership_table.csv has been saved.")

        # Output frequency and proportion table
        freq_table = membership_df["CombT"].value_counts().reset_index()
        freq_table.columns = ["CombT", "Frequency"]
        freq_table["Proportion (%)"] = (freq_table["Frequency"] / freq_table["Frequency"].sum() * 100).round(2)

        print("\n[>] CombT Frequency Table:")
        print(freq_table)
        freq_table.to_csv("freq_table.csv", index=False)
        print("\n[>] freq_table.csv has been saved.")

        # Optional bar plot - with error handling
        bar_fig = None
        try:
            bar_fig = plt.figure(figsize=(10, 5))
            sns.barplot(data=freq_table, x="CombT", y="Proportion (%)", color="skyblue")
            plt.xticks(rotation=45, ha="right")
            plt.title("Frequency of Combined Typologies")
            plt.xlabel("CombT")
            # The bars show the proportion column, so the axis is labelled accordingly.
            plt.ylabel("Proportion (%)")
            plt.tight_layout()

            # Save the figure directly without displaying it.
            plt.savefig("Frequency of Combined Typologies.png", dpi=200)

            print("\n[>] Frequency of Combined Typologies.png has been saved.")
        except Exception as e:
            print(f"[!] Warning: Could not create frequency plot: {e}")
        finally:
            # Close the figure even when plotting or saving failed.
            if bar_fig is not None:
                plt.close(bar_fig)

        return diss_matrices, membership_df

    finally:
        # Always restore the backend that was active before the call.
        matplotlib.use(original_backend)
204
+
205
+
206
def _compute_silhouette_score(diss_matrix, labels):
    """
    Silhouette score for a clustering described by a precomputed dissimilarity matrix.

    Parameters:
    - diss_matrix: Square distance matrix, passed to silhouette_score with metric='precomputed'
    - labels: Cluster labels

    Returns:
    - The value returned by sklearn.metrics.silhouette_score
    """
    # The matrix already holds pairwise distances, so no metric is recomputed.
    score = silhouette_score(diss_matrix, labels, metric='precomputed')
    return score
218
+
219
+
220
def merge_sparse_combt_types(distance_matrix,
                             labels,
                             min_size=30,  # For sample size about 2,000
                             asw_threshold=0.5,  # Silhouette score threshold
                             verbose=True,
                             print_merge_details=True,
                             visualize_process=True,
                             visualization_path="merge_progress.png"):
    """
    Merge sparse CombT labels based on silhouette score threshold strategy.

    On every pass the groups with fewer than ``min_size`` members are listed.
    For the first of them that has an admissible merge, the group is folded
    into the other group giving the highest average silhouette width (ASW),
    provided that ASW is still at least ``asw_threshold``. The process stops
    when no small group remains or when no admissible merge exists. Nothing
    is merged when the initial ASW is already below ``asw_threshold``.

    Parameters:
    - distance_matrix: np.ndarray or pd.DataFrame, full square dissimilarity matrix.
    - labels: array-like of original CombT string labels.
    - min_size: int, minimum samples per allowed group.
    - asw_threshold: float, minimum silhouette score threshold to accept a merge.
    - verbose: bool, print steps or not.
    - print_merge_details: bool, whether to print detailed merge history at the end.
    - visualize_process: bool, whether to create a visualization of the merge process.
    - visualization_path: str, file path to save the visualization (if visualize_process=True).

    Returns:
    - new_labels: numpy array of updated labels after merging.
    - merge_info: dict with keys "merge_history", "initial_silhouette",
      "final_silhouette", "initial_cluster_count", "final_cluster_count",
      "small_clusters_merged" and "total_merges".

    Raises:
    - TypeError: distance_matrix is neither a numpy array nor a DataFrame.
    - ValueError: distance_matrix is not a 2-D square matrix, the number of
      labels does not match it, or fewer than two distinct labels are given.
    """
    # Parameter validation
    if not isinstance(distance_matrix, (np.ndarray, pd.DataFrame)):
        raise TypeError("distance_matrix must be numpy.ndarray or pandas.DataFrame")

    if isinstance(distance_matrix, pd.DataFrame):
        distance_matrix = distance_matrix.values

    # Check the rank first: a 1-D array has no shape[1] and would otherwise
    # surface as an IndexError instead of the documented ValueError.
    if distance_matrix.ndim != 2 or distance_matrix.shape[0] != distance_matrix.shape[1]:
        raise ValueError("distance_matrix must be square (n x n)")

    labels = np.array(labels)
    if len(labels) != distance_matrix.shape[0]:
        raise ValueError(
            f"Length of labels ({len(labels)}) does not match distance matrix dimensions ({distance_matrix.shape[0]})")

    label_counts = pd.Series(labels).value_counts()
    original_labels = sorted(label_counts.index.tolist())

    # A silhouette needs at least two groups; fail early with a clear message.
    if len(original_labels) < 2:
        raise ValueError(
            f"labels must contain at least 2 distinct CombT types to compute a silhouette score "
            f"(got {len(original_labels)})")

    # Track merge history and quality metrics
    merge_info = {
        "merge_history": [],
        "initial_silhouette": None,
        "final_silhouette": None,
        "initial_cluster_count": len(original_labels),
        "final_cluster_count": None,
        "small_clusters_merged": 0,
        "total_merges": 0
    }

    # Work on short surrogate labels ("C1", "C2", ...) and map back at the end
    label_map = {label: f"C{i + 1}" for i, label in enumerate(original_labels)}
    numeric_labels = np.array([label_map[l] for l in labels])
    reverse_map = {v: k for k, v in label_map.items()}

    current_score = _compute_silhouette_score(distance_matrix, numeric_labels)
    merge_info["initial_silhouette"] = current_score

    if verbose:
        print(f"[>] Initial clusters: {len(np.unique(numeric_labels))}, Initial ASW: {current_score:.4f}")
        print(f"[>] Beginning merge process with min_size={min_size} and ASW threshold={asw_threshold}")

    total_merges = 0
    iterations = 0

    # Main merging loop: at most one merge is executed per pass
    while current_score >= asw_threshold:
        iterations += 1
        if verbose and iterations % 10 == 0:
            print(f"[>] Iteration {iterations}, current clusters: {len(np.unique(numeric_labels))}")

        counts = Counter(numeric_labels)
        small_clusters = [lab for lab, cnt in counts.items() if cnt < min_size]

        # Exit if no small clusters remain
        if not small_clusters:
            break

        # The label set only changes when a merge is executed, and a merge
        # ends the pass, so it can be computed once per pass.
        current_labels = np.unique(numeric_labels)

        merged = False
        for small in small_clusters:
            best_score = -np.inf
            best_target = None

            # Find best merge target
            for target in current_labels:
                if target == small:
                    continue
                temp_labels = numeric_labels.copy()
                temp_labels[temp_labels == small] = target
                try:
                    score = _compute_silhouette_score(distance_matrix, temp_labels)
                except Exception as e:
                    # Best effort: a candidate that cannot be scored (e.g. the
                    # merge would leave a single group) is simply skipped.
                    if verbose:
                        print(f"[!] Error computing silhouette for merge {small} -> {target}: {e}")
                    continue
                if score > best_score:
                    best_score = score
                    best_target = target

            # Execute merge if it maintains quality threshold
            if best_score >= asw_threshold:
                old_count = counts[small]
                numeric_labels[numeric_labels == small] = best_target
                current_score = best_score

                # Record merge details
                merge_info["merge_history"].append({
                    "iteration": iterations,
                    "source": reverse_map[small],
                    "target": reverse_map[best_target],
                    "source_size": old_count,
                    "new_asw": best_score
                })
                merge_info["small_clusters_merged"] += 1
                total_merges += 1

                if verbose:
                    print(
                        f"[+] Merged {small} ({reverse_map[small]}, size={old_count}) -> {best_target} ({reverse_map[best_target]}) | New ASW: {current_score:.4f}")

                merged = True
                break

        # Exit if no suitable merges found
        if not merged:
            if verbose:
                print(f"[!] No suitable merges found that maintain ASW >= {asw_threshold}")
            break

    # Convert back to original label format
    new_combined = [reverse_map[l] for l in numeric_labels]

    original_cluster_count = len(original_labels)
    final_cluster_count = len(set(new_combined))

    # Update final metrics
    merge_info["final_silhouette"] = current_score
    merge_info["final_cluster_count"] = final_cluster_count
    merge_info["total_merges"] = total_merges

    if verbose:
        print(f"\n[>] CombT clusters before merging: {original_cluster_count}")
        print(f"[>] CombT clusters after merging: {final_cluster_count}")
        print(f"[>] Total merges performed: {total_merges}")
        print(f"[>] Final ASW: {current_score:.4f}")

    # Print merge history details if requested
    if verbose and print_merge_details and merge_info["merge_history"]:
        print("\n[>] Merge History Details:")
        for i, merge in enumerate(merge_info["merge_history"]):
            print(
                f"  Merge {i + 1}: {merge['source']} (size={merge['source_size']}) -> {merge['target']} | ASW: {merge['new_asw']:.4f}")

    # Visualize merge process if requested
    if visualize_process and merge_info["merge_history"]:
        try:
            _plot_merge_progress(merge_info, save_as=visualization_path)
            if verbose:
                print(f"\n[>] Merge process visualization saved to: {visualization_path}")
        except Exception as e:
            if verbose:
                print(f"[!] Warning: Could not create merge visualization: {e}")

    return np.array(new_combined), merge_info
396
+
397
+
398
+ def _plot_merge_progress(merge_info, save_as=None):
399
+ """
400
+ Internal function to visualize the progress of the cluster merging process.
401
+
402
+ Parameters:
403
+ - merge_info: dict, merge information returned by merge_sparse_combt_types
404
+ - save_as: str, filename to save the plot
405
+ """
406
+ if not merge_info["merge_history"]:
407
+ print("No merges were performed.")
408
+ return
409
+
410
+ # 保存当前matplotlib后端并临时切换到Agg
411
+ import matplotlib
412
+ current_backend = matplotlib.get_backend()
413
+
414
+ try:
415
+ # 使用上下文管理器临时更改后端
416
+ with plt.rc_context({'backend': 'Agg'}):
417
+ # Extract data
418
+ iterations = [m["iteration"] for m in merge_info["merge_history"]]
419
+ asw_scores = [m["new_asw"] for m in merge_info["merge_history"]]
420
+
421
+ # Calculate cluster counts at each iteration
422
+ clusters = [merge_info["initial_cluster_count"]]
423
+ for i in range(len(iterations)):
424
+ clusters.append(clusters[-1] - 1) # Each merge reduces clusters by 1
425
+
426
+ # Create figure
427
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
428
+
429
+ # ASW scores plot
430
+ ax1.plot(iterations, asw_scores, 'o-', color='blue')
431
+ ax1.set_ylabel('Silhouette Score (ASW)')
432
+ ax1.set_title('Merge Process Progress')
433
+ ax1.grid(True, linestyle='--', alpha=0.7)
434
+
435
+ # Cluster count plot
436
+ ax2.plot(iterations, clusters[1:], 'o-', color='green')
437
+ ax2.set_xlabel('Merge Iteration')
438
+ ax2.set_ylabel('Number of Clusters')
439
+ ax2.grid(True, linestyle='--', alpha=0.7)
440
+
441
+ # Add reference lines
442
+ ax1.axhline(y=merge_info["initial_silhouette"], color='blue', linestyle='--', alpha=0.5,
443
+ label=f'Initial ASW: {merge_info["initial_silhouette"]:.4f}')
444
+ ax1.axhline(y=merge_info["final_silhouette"], color='red', linestyle='--', alpha=0.5,
445
+ label=f'Final ASW: {merge_info["final_silhouette"]:.4f}')
446
+ ax1.legend()
447
+
448
+ ax2.axhline(y=merge_info["initial_cluster_count"], color='green', linestyle='--', alpha=0.5,
449
+ label=f'Initial clusters: {merge_info["initial_cluster_count"]}')
450
+ ax2.axhline(y=merge_info["final_cluster_count"], color='red', linestyle='--', alpha=0.5,
451
+ label=f'Final clusters: {merge_info["final_cluster_count"]}')
452
+ ax2.legend()
453
+
454
+ plt.tight_layout()
455
+
456
+ if save_as:
457
+ plt.savefig(save_as, dpi=200)
458
+
459
+ plt.close() # 确保关闭图形
460
+
461
+ except Exception as e:
462
+ print(f"[!] Error creating merge progress plot: {e}")
463
+
464
+ finally:
465
+ # 恢复原始后端
466
+ matplotlib.use(current_backend)
467
+
468
+
469
if __name__ == '__main__':
    from sequenzo import *

    # Demo: CombT typology on the three biofam domains, then merge sparse types.
    left_df = load_dataset('biofam_left_domain')
    children_df = load_dataset('biofam_child_domain')
    married_df = load_dataset('biofam_married_domain')

    # Age columns are taken from the child domain and reused for every domain
    time_cols = [name for name in children_df.columns if name.startswith("age_")]

    # NOTE: The order of domains is critical - it must match domain_names below
    domain_specs = [
        (left_df, ["At home", "Left home"]),
        (children_df, ["No child", "Child"]),
        (married_df, ["Not married", "Married"]),
    ]
    domains = [
        SequenceData(frame, time_type="age", time=time_cols, states=[0, 1],
                     labels=state_labels, id_col="id")
        for frame, state_labels in domain_specs
    ]

    # One dissimilarity configuration per domain, in the same order as domains
    method_params = [
        {"method": "OM", "sm": "TRATE", "indel": "auto"},
        {"method": "OM", "sm": "CONSTANT", "indel": "auto"},
        {"method": "OM", "sm": "CONSTANT", "indel": 1},
    ]

    domain_names = ["Left", "Child", "Married"]
    diss_matrices, membership_df = get_interactive_combined_typology(
        domains, method_params, domain_names=domain_names)

    dat_matrix = compute_dat_distance_matrix(domains, method_params=method_params)

    # The combined typology is the labelling to be merged
    labels = membership_df["CombT"].values

    # Check the CombT proportions before deciding on min_size.
    # print_merge_details / visualize_process are optional and default to True.
    merge_options = {
        "min_size": 50,
        "asw_threshold": 0.5,
        "verbose": True,
        "print_merge_details": True,
        "visualize_process": True,
        "visualization_path": "merge_progress_combt.png",
    }
    merged_labels, merge_info = merge_sparse_combt_types(
        distance_matrix=dat_matrix, labels=labels, **merge_options)

    # Attach the merged typology next to the original one
    membership_df["CombT_Merged"] = merged_labels

    # Save results with the index exposed as an "id" column
    export_df = membership_df.reset_index()
    export_df = export_df.rename(columns={"index": "id"})
    export_df.to_csv("combt_membership_table.csv", index=False)
    print("\n[>] combt_membership_table.csv has been saved.")
@@ -0,0 +1,81 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : dat.py
4
+ @Time : 15/04/2025 17:28
5
+ @Desc : DAT (Distance Additive Trick) strategy with customizable dissimilarity parameters per domain.
6
+ """
7
+ from typing import List, Dict
8
+ import pandas as pd
9
+ import numpy as np
10
+ from sequenzo.define_sequence_data import SequenceData
11
+ from sequenzo.dissimilarity_measures import get_distance_matrix
12
+
13
+
14
def compute_dat_distance_matrix(
        sequence_objects: list,
        method_params: List[Dict] = None
) -> np.ndarray:
    """
    Compute Distance Additive Trick (DAT) distance matrix.
    This sums distance matrices from multiple SequenceData domains, each with its own method config.

    Parameters:
    - sequence_objects: Non-empty list of SequenceData instances
    - method_params: List of dicts for each domain, e.g.:
        [{"method": "OM", "sm": "TRATE", "indel": "auto"}, ...]
      Each dict will be passed directly into get_distance_matrix as kwargs.

    Returns:
    - The element-wise sum of the per-domain results of get_distance_matrix.
      NOTE(review): the concrete type is whatever get_distance_matrix
      returns - confirm it matches the np.ndarray annotation.

    Raises:
    - ValueError: method_params is missing, its length differs from
      sequence_objects, or no domain was provided.
    """
    if method_params is None or len(method_params) != len(sequence_objects):
        raise ValueError(
            f"[DAT] Please provide a list of dissimilarity measures parameters for each domain.\n"
            f"Expected {len(sequence_objects)} dicts in method_params, but got: {method_params}.\n"
            f"For instance, if you have two domains, try something like this:\n\n"
            f"    method_params = [\n"
            f"        {{'method': 'OM', 'sm': 'CONSTANT', 'indel': 1}},\n"
            f"        {{'method': 'HAM'}}  # if using Hamming, no 'sm' or 'indel' needed\n"
            f"    ]\n\n"
            f"Each dict will be passed directly into get_distance_matrix(seqdata=..., **params)."
        )

    # Two empty lists pass the length check above; summing nothing would
    # return the integer 0 instead of a matrix, so reject it explicitly.
    if not sequence_objects:
        raise ValueError("[DAT] At least one SequenceData domain is required to compute a DAT distance matrix.")

    # Accumulate a running total so only one per-domain matrix is held at a time
    dat_matrix = None
    for seq, params in zip(sequence_objects, method_params):
        dist = get_distance_matrix(seqdata=seq, **params)
        dat_matrix = dist if dat_matrix is None else dat_matrix + dist

    return dat_matrix
51
+
52
+
53
if __name__ == '__main__':

    from sequenzo import *

    # Demo: build one SequenceData per biofam domain and sum their distances.
    left_df = load_dataset('biofam_left_domain')
    children_df = load_dataset('biofam_child_domain')
    married_df = load_dataset('biofam_married_domain')

    # Age columns are taken from the child domain and reused for every domain
    time_cols = [name for name in children_df.columns if name.startswith("age_")]

    domain_specs = [
        (left_df, ["At home", "Left home"]),
        (children_df, ["No child", "Child"]),
        (married_df, ["Not married", "Married"]),
    ]
    domains_seq_list = [
        SequenceData(data=frame, time=time_cols, states=[0, 1], labels=state_labels)
        for frame, state_labels in domain_specs
    ]

    # One configuration per domain, in the same order as domains_seq_list.
    # An alternative for the third domain is OM with sm="CONSTANT", indel=1.
    domain_params = [
        {"method": "OM", "sm": "TRATE", "indel": "auto"},
        {"method": "OM", "sm": "CONSTANT", "indel": "auto"},
        {"method": "DHD"},
    ]

    dat_matrix = compute_dat_distance_matrix(domains_seq_list, method_params=domain_params)

    print(dat_matrix)