sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
sequenzo/seqhmm/hmm.py ADDED
@@ -0,0 +1,291 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : hmm.py
4
+ @Time : 2025-11-13 16:20
5
+ @Desc : Base HMM class for Sequenzo
6
+
7
+ This module provides the HMM class that wraps hmmlearn's CategoricalHMM
8
+ and adapts it for use with Sequenzo's SequenceData format.
9
+ """
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ from typing import Optional, List, Dict, Union
14
+ from hmmlearn.hmm import CategoricalHMM
15
+ from sequenzo.define_sequence_data import SequenceData
16
+ from .utils import (
17
+ sequence_data_to_hmmlearn_format,
18
+ int_to_state_mapping,
19
+ state_to_int_mapping
20
+ )
21
+ from .multichannel_utils import prepare_multichannel_data
22
+
23
+
24
class HMM:
    """
    Hidden Markov Model for sequence analysis.

    This class wraps hmmlearn's CategoricalHMM and provides a Sequenzo-friendly
    interface that works with SequenceData objects.

    Attributes:
        observations: SequenceData object containing the observed sequences
            (the first channel when several channels are supplied)
        channels: List of channels as returned by prepare_multichannel_data
        n_states: Number of hidden states
        n_symbols: Number of observed symbols (alphabet size); an int for
            single-channel data, a list with one size per channel otherwise
        alphabet: List of observed state symbols (of the first channel)
        alphabets: Alphabets of all channels
        state_names: Optional names for hidden states
        channel_names: Optional names for channels (for multichannel data)
        length_of_sequences: Maximum sequence length
        sequence_lengths: Array of individual sequence lengths
        n_sequences: Number of sequences
        n_channels: Number of channels

        # Model parameters (user supplied before fitting, estimates afterwards)
        initial_probs: Initial state probabilities
        transition_probs: Transition probability matrix
        emission_probs: Emission probability matrix

        # Fitting results (None until fit() has run)
        log_likelihood: Log-likelihood of the observations under the fitted model
        n_iter: Number of EM iterations reported by hmmlearn's monitor
        converged: Convergence flag reported by hmmlearn's monitor

        # hmmlearn model
        _hmm_model: Internal hmmlearn CategoricalHMM model; None for
            multichannel data, which hmmlearn does not handle directly
    """

    # (attribute on this object, attribute on the hmmlearn model, init_params letter)
    # 's' = startprob, 't' = transmat, 'e' = emissionprob
    _PARAMETER_SPECS = (
        ("initial_probs", "startprob_", "s"),
        ("transition_probs", "transmat_", "t"),
        ("emission_probs", "emissionprob_", "e"),
    )

    def __init__(
        self,
        observations: Union["SequenceData", List["SequenceData"]],
        n_states: int,
        initial_probs: Optional[np.ndarray] = None,
        transition_probs: Optional[np.ndarray] = None,
        emission_probs: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
        state_names: Optional[List[str]] = None,
        channel_names: Optional[List[str]] = None,
        random_state: Optional[int] = None
    ):
        """
        Initialize an HMM model.

        Args:
            observations: SequenceData object or list of SequenceData objects (for multichannel)
            n_states: Number of hidden states
            initial_probs: Optional initial state probabilities (n_states,)
            transition_probs: Optional transition matrix (n_states x n_states)
            emission_probs: Optional emission matrix (n_states x n_symbols) or
                list of matrices (one per channel for multichannel)
            state_names: Optional names for hidden states
            channel_names: Optional names for channels
            random_state: Random seed for initialization

        Raises:
            ValueError: If multichannel data is given together with an
                emission_probs list whose length differs from the number of channels.
        """
        # Handle multichannel data
        channels, channel_names_list, alphabets = prepare_multichannel_data(observations)
        self.channels = channels
        self.n_channels = len(channels)

        # The first channel is the primary one in both the single-channel and
        # the multichannel case (kept for backward compatibility).
        self.observations = channels[0]
        self.alphabet = alphabets[0]

        self.alphabets = alphabets
        symbol_counts = [len(alph) for alph in alphabets]
        # Single-channel models expose a plain int instead of a one-element list
        self.n_symbols = symbol_counts[0] if self.n_channels == 1 else symbol_counts

        self.n_states = n_states

        # Store metadata
        self.state_names = state_names or [f"State {i+1}" for i in range(n_states)]
        self.channel_names = channel_names or channel_names_list

        # Get sequence information (use first channel for sequence info)
        self.sequence_lengths = np.array([len(seq) for seq in channels[0].sequences])
        self.length_of_sequences = int(self.sequence_lengths.max())
        self.n_sequences = len(channels[0].sequences)

        # Create mappings
        self._int_to_state = int_to_state_mapping(self.alphabet)
        self._state_to_int = state_to_int_mapping(self.alphabet)

        # Store parameters (will be updated after fitting)
        self.initial_probs = initial_probs
        self.transition_probs = transition_probs
        self.emission_probs = emission_probs

        if self.n_channels == 1:
            # Single channel: delegate to hmmlearn
            self._hmm_model = CategoricalHMM(
                n_components=n_states,
                n_features=self.n_symbols,
                random_state=random_state,
                n_iter=100,  # Default max iterations
                tol=1e-2,  # Default tolerance
                verbose=False
            )
            # Hand any user supplied parameters to hmmlearn right away so the
            # model can be used for prediction/scoring without calling fit().
            self._apply_custom_parameters()
        else:
            # Multichannel: hmmlearn doesn't support this directly, fit() uses
            # the custom EM implementation instead.
            self._hmm_model = None
            if isinstance(emission_probs, list) and len(emission_probs) != self.n_channels:
                raise ValueError(
                    f"emission_probs list length ({len(emission_probs)}) must equal n_channels ({self.n_channels})"
                )

        # Fitting results
        self.log_likelihood = None
        self.n_iter = None
        self.converged = None

    def _apply_custom_parameters(self) -> None:
        """
        Copy the stored probabilities into the hmmlearn model.

        For every parameter that is not None, the matching hmmlearn attribute
        receives an independent array copy and the corresponding letter is
        removed from ``init_params`` so hmmlearn does not re-initialize that
        parameter during fit. ``np.array`` is used for the copy so lists and
        tuples are accepted as well as arrays.
        """
        model = self._hmm_model
        for own_attr, model_attr, letter in self._PARAMETER_SPECS:
            value = getattr(self, own_attr)
            if value is None:
                continue
            setattr(model, model_attr, np.array(value))
            model.init_params = model.init_params.replace(letter, '')

    def _require_hmmlearn_model(self, operation: str):
        """
        Return the hmmlearn model or fail with an explicit message.

        Args:
            operation: Name of the public method being executed (used in the message)

        Raises:
            NotImplementedError: If no hmmlearn model exists, which is the case
                for multichannel data.
        """
        if self._hmm_model is None:
            raise NotImplementedError(
                f"HMM.{operation}() is not available for multichannel models "
                f"(n_channels={self.n_channels}): there is no hmmlearn model to delegate to."
            )
        return self._hmm_model

    def fit(
        self,
        n_iter: int = 100,
        tol: float = 1e-2,
        verbose: bool = False
    ) -> 'HMM':
        """
        Fit the HMM model to the observations using EM algorithm.

        For single-channel data, uses hmmlearn's EM algorithm.
        For multichannel data, uses custom multichannel EM algorithm.

        Args:
            n_iter: Maximum number of EM iterations
            tol: Convergence tolerance
            verbose: Whether to print progress

        Returns:
            self: Returns self for method chaining
        """
        if self.n_channels != 1:
            # Multichannel: use custom EM algorithm
            from .multichannel_em import fit_multichannel_hmm
            fit_multichannel_hmm(self, n_iter=n_iter, tol=tol, verbose=verbose)
            return self

        # Single channel: use hmmlearn
        X, lengths = sequence_data_to_hmmlearn_format(self.observations)

        # Re-apply the current parameters; they may have been replaced on this
        # object since construction.
        self._apply_custom_parameters()

        # Update hmmlearn model parameters
        model = self._hmm_model
        model.n_iter = n_iter
        model.tol = tol
        model.verbose = verbose

        # Fit the model, suppressing warnings about init_params
        import warnings
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', message='.*init_params.*')
            warnings.filterwarnings('ignore', message='.*overwritten during initialization.*')
            model.fit(X, lengths)

        # Extract fitted parameters
        self.initial_probs = model.startprob_.copy()
        self.transition_probs = model.transmat_.copy()
        self.emission_probs = model.emissionprob_.copy()

        # Store fitting results
        self.log_likelihood = model.score(X, lengths)
        self.n_iter = model.monitor_.iter
        self.converged = model.monitor_.converged

        return self

    def predict(self, sequences: Optional["SequenceData"] = None) -> np.ndarray:
        """
        Predict the most likely hidden state sequence using Viterbi algorithm.

        Args:
            sequences: Optional SequenceData to predict (uses self.observations if None)

        Returns:
            numpy array: Predicted hidden states for each sequence

        Raises:
            NotImplementedError: If the model is multichannel (no hmmlearn model)
        """
        model = self._require_hmmlearn_model("predict")
        if sequences is None:
            sequences = self.observations

        X, lengths = sequence_data_to_hmmlearn_format(sequences)
        return model.predict(X, lengths)

    def predict_proba(self, sequences: Optional["SequenceData"] = None) -> np.ndarray:
        """
        Compute posterior probabilities of hidden states.

        Args:
            sequences: Optional SequenceData (uses self.observations if None)

        Returns:
            numpy array: Posterior probabilities for each time point

        Raises:
            NotImplementedError: If the model is multichannel (no hmmlearn model)
        """
        model = self._require_hmmlearn_model("predict_proba")
        if sequences is None:
            sequences = self.observations

        X, lengths = sequence_data_to_hmmlearn_format(sequences)
        return model.predict_proba(X, lengths)

    def score(self, sequences: Optional["SequenceData"] = None) -> float:
        """
        Compute the log-likelihood of sequences under the model.

        Args:
            sequences: Optional SequenceData (uses self.observations if None)

        Returns:
            float: Log-likelihood

        Raises:
            NotImplementedError: If the model is multichannel (no hmmlearn model)
        """
        model = self._require_hmmlearn_model("score")
        if sequences is None:
            sequences = self.observations

        X, lengths = sequence_data_to_hmmlearn_format(sequences)
        return model.score(X, lengths)

    def __repr__(self) -> str:
        """String representation of the HMM."""
        status = "fitted" if self.log_likelihood is not None else "unfitted"
        return (f"HMM(n_states={self.n_states}, n_symbols={self.n_symbols}, "
                f"n_sequences={self.n_sequences}, status='{status}')")
@@ -0,0 +1,314 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : mhmm.py
4
+ @Time : 2025-11-22 08:47
5
+ @Desc : Mixture Hidden Markov Model (MHMM) for Sequenzo
6
+
7
+ A Mixture HMM consists of multiple HMM submodels, where each submodel represents
8
+ a cluster or type. The model assigns each sequence to one of these clusters with
9
+ certain probabilities.
10
+
11
+ This is similar to seqHMM's mhmm class in R.
12
+ """
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ from typing import Optional, List, Dict, Union
17
+ from sequenzo.define_sequence_data import SequenceData
18
+ from .hmm import HMM
19
+ from .utils import (
20
+ sequence_data_to_hmmlearn_format,
21
+ create_initial_probs,
22
+ create_transition_probs,
23
+ create_emission_probs
24
+ )
25
+
26
+
27
class MHMM:
    """
    Mixture Hidden Markov Model for sequence analysis.

    A Mixture HMM consists of multiple HMM submodels (clusters). Each sequence
    belongs to one of these clusters with certain probabilities. The model
    estimates both the cluster membership probabilities and the parameters
    of each HMM submodel.

    Attributes:
        observations: SequenceData object containing the observed sequences
        n_clusters: Number of clusters (submodels)
        n_states: List with the number of hidden states of each cluster
        clusters: List of HMM objects, one for each cluster
        cluster_probs: Mixture probabilities (probability of each cluster)
        coefficients: Optional regression coefficients for covariates
        X: Optional covariate matrix
        cluster_names: Names for clusters (defaults to "Cluster 1", ...)
        state_names: Optional names for hidden states (per cluster)
        channel_names: Names for channels (defaults to ["Channel 1"])

        # Model parameters (after fitting)
        log_likelihood: Log-likelihood of the fitted model
        n_iter: Number of EM iterations performed
        converged: Whether the EM algorithm converged
        responsibilities: Posterior cluster probabilities, one row per
            sequence and one column per cluster
    """

    def __init__(
        self,
        observations: SequenceData,
        n_clusters: int,
        n_states: Union[int, List[int]],
        clusters: Optional[List[HMM]] = None,
        cluster_probs: Optional[np.ndarray] = None,
        coefficients: Optional[np.ndarray] = None,
        X: Optional[np.ndarray] = None,
        cluster_names: Optional[List[str]] = None,
        state_names: Optional[List[List[str]]] = None,
        channel_names: Optional[List[str]] = None,
        random_state: Optional[int] = None
    ):
        """
        Initialize a Mixture HMM model.

        Args:
            observations: SequenceData object containing the sequences
            n_clusters: Number of clusters (submodels), at least 1
            n_states: Number of hidden states per cluster. Can be:
                - int: Same number of states for all clusters
                - List[int]: Different number of states for each cluster
            clusters: Optional list of pre-built HMM objects for each cluster
            cluster_probs: Optional initial cluster probabilities (n_clusters,)
            coefficients: Optional regression coefficients for covariates
            X: Optional covariate matrix (n_sequences x n_covariates)
            cluster_names: Optional names for clusters
            state_names: Optional names for hidden states (list of lists)
            channel_names: Optional names for channels
            random_state: Random seed for initialization

        Raises:
            ValueError: If n_clusters is smaller than 1, if the lengths of
                n_states, clusters or cluster_probs do not match n_clusters,
                or if cluster_probs has negative entries or does not sum to 1.
        """
        if n_clusters < 1:
            raise ValueError(f"n_clusters must be at least 1, got {n_clusters}")

        self.observations = observations
        self.n_clusters = n_clusters
        self.alphabet = observations.alphabet
        self.n_symbols = len(self.alphabet)
        self.n_sequences = len(observations.sequences)

        # Handle n_states: a scalar (Python or NumPy integer) is broadcast
        # to every cluster.
        if isinstance(n_states, (int, np.integer)):
            n_states = [int(n_states)] * n_clusters
        self.n_states = n_states

        # Validate n_states length
        if len(n_states) != n_clusters:
            raise ValueError(
                f"n_states length ({len(n_states)}) must equal n_clusters ({n_clusters})"
            )

        # Set names
        self.cluster_names = cluster_names or [f"Cluster {i+1}" for i in range(n_clusters)]
        self.channel_names = channel_names or ["Channel 1"]
        self.n_channels = len(self.channel_names)
        # Kept so the attribute promised in the class docstring exists.
        self.state_names = state_names

        # Initialize clusters (HMM submodels)
        if clusters is None:
            self.clusters = []
            for k in range(n_clusters):
                # Get state names for this cluster (None when not supplied
                # or when the list is shorter than n_clusters).
                cluster_state_names = None
                if state_names is not None and k < len(state_names):
                    cluster_state_names = state_names[k]

                # Create HMM for this cluster
                self.clusters.append(
                    HMM(
                        observations=observations,
                        n_states=n_states[k],
                        state_names=cluster_state_names,
                        channel_names=channel_names,
                        random_state=random_state
                    )
                )
        else:
            if len(clusters) != n_clusters:
                raise ValueError(
                    f"Number of clusters ({len(clusters)}) must equal n_clusters ({n_clusters})"
                )
            self.clusters = clusters

        # Initialize cluster probabilities
        if cluster_probs is None:
            self.cluster_probs = np.ones(n_clusters) / n_clusters  # Uniform
        else:
            probs = np.array(cluster_probs, dtype=float)
            if len(probs) != n_clusters:
                raise ValueError(
                    f"cluster_probs length ({len(probs)}) must equal n_clusters ({n_clusters})"
                )
            # A negative weight would turn into NaN once its log is taken.
            if np.any(probs < 0):
                raise ValueError("cluster_probs must be non-negative")
            if not np.isclose(np.sum(probs), 1.0):
                raise ValueError("cluster_probs must sum to 1.0")
            self.cluster_probs = probs

        # Covariates (for future extension)
        self.coefficients = coefficients
        self.X = X
        self.n_covariates = X.shape[1] if X is not None else 0

        # Fitting results
        self.log_likelihood = None
        self.n_iter = None
        self.converged = None

        # Store responsibilities (posterior cluster probabilities) after fitting
        self.responsibilities = None

    def _joint_log_likelihoods(self, X, lengths) -> np.ndarray:
        """Return log P(sequence | cluster) + log P(cluster) as an (n_sequences, n_clusters) array."""
        lengths = np.asarray(lengths)
        n_sequences = len(lengths)

        # Row offsets of every sequence inside the concatenated X, computed
        # once instead of re-summing the length prefix for each sequence.
        ends = np.cumsum(lengths)
        starts = ends - lengths

        log_joint = np.zeros((n_sequences, self.n_clusters))
        for k in range(self.n_clusters):
            model = self.clusters[k]._hmm_model
            for seq_idx in range(n_sequences):
                seq_X = X[starts[seq_idx]:ends[seq_idx]]
                seq_lengths = np.array([lengths[seq_idx]])
                log_joint[seq_idx, k] = model.score(seq_X, seq_lengths)

        # Add log of cluster probabilities (small epsilon avoids log(0)).
        log_joint += np.log(self.cluster_probs + 1e-10)[np.newaxis, :]
        return log_joint

    @staticmethod
    def _posterior_from_log_joint(log_joint: np.ndarray) -> tuple:
        """Return (responsibilities, per-sequence log-likelihood) via log-sum-exp."""
        # Subtracting the row maximum keeps exp() from underflowing to zero.
        row_max = np.max(log_joint, axis=1, keepdims=True)
        shifted = np.exp(log_joint - row_max)
        row_sum = np.sum(shifted, axis=1, keepdims=True)

        responsibilities = shifted / row_sum
        seq_log_likelihood = row_max[:, 0] + np.log(row_sum[:, 0])
        return responsibilities, seq_log_likelihood

    def fit(
        self,
        n_iter: int = 100,
        tol: float = 1e-2,
        verbose: bool = False
    ) -> 'MHMM':
        """
        Fit the Mixture HMM model using EM algorithm.

        The EM algorithm alternates between:
        1. E-step: Compute responsibilities (posterior cluster probabilities)
        2. M-step: Update cluster probabilities and HMM parameters

        NOTE(review): the M-step refits every cluster's HMM on all sequences
        and does not weight them by responsibility, so the submodels are not
        specialised by cluster membership here. A weighted update needs
        support from ``HMM.fit`` -- confirm and revisit.

        Args:
            n_iter: Maximum number of EM iterations (at least 1)
            tol: Convergence tolerance on the absolute change of the overall
                log-likelihood between two iterations; also forwarded to the
                per-cluster HMM fits
            verbose: Whether to print progress

        Returns:
            self: Returns self for method chaining

        Raises:
            ValueError: If n_iter is smaller than 1.
        """
        # Imported once per call instead of on every loop pass.
        import warnings

        if n_iter < 1:
            raise ValueError(f"n_iter must be at least 1, got {n_iter}")

        # Convert SequenceData to hmmlearn format
        X, lengths = sequence_data_to_hmmlearn_format(self.observations)

        # Reset so a previous successful fit cannot leave a stale True behind.
        self.converged = False
        prev_log_likelihood = -np.inf
        log_likelihood = -np.inf

        # EM algorithm
        for iteration in range(n_iter):
            # Make sure every submodel has been fitted at least once before
            # it is scored. Warnings about init_params are suppressed.
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', message='.*init_params.*')
                for k in range(self.n_clusters):
                    if self.clusters[k].log_likelihood is None:
                        self.clusters[k].fit(n_iter=10, tol=tol, verbose=False)

            # E-step: responsibility = P(cluster | sequence)
            #       = P(sequence | cluster) * P(cluster) / P(sequence)
            log_joint = self._joint_log_likelihoods(X, lengths)
            responsibilities, seq_log_likelihood = self._posterior_from_log_joint(log_joint)
            self.responsibilities = responsibilities

            # M-step: Update cluster probabilities
            self.cluster_probs = np.mean(responsibilities, axis=0)

            # M-step: refit each cluster's HMM (see NOTE(review) above).
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', message='.*init_params.*')
                for k in range(self.n_clusters):
                    self.clusters[k].fit(n_iter=10, tol=tol, verbose=False)

            # Overall log-likelihood:
            # log P(data) = sum_seq log( sum_cluster P(seq | cluster) * P(cluster) )
            # The inner sum is taken in log space; a plain exp() of typical
            # sequence log-likelihoods underflows to zero.
            log_likelihood = float(np.sum(seq_log_likelihood))

            if verbose:
                print(f"Iteration {iteration + 1}: log-likelihood = {log_likelihood:.4f}")

            # Check convergence
            if iteration > 0:
                change = log_likelihood - prev_log_likelihood
                if abs(change) < tol:
                    self.converged = True
                    if verbose:
                        print(f"Converged at iteration {iteration + 1}")
                    break

            prev_log_likelihood = log_likelihood

        # Keep the most recent value, including the one that triggered the break.
        self.log_likelihood = log_likelihood
        self.n_iter = iteration + 1

        if not self.converged and verbose:
            print(f"Did not converge after {n_iter} iterations")

        return self

    def predict_cluster(self, sequences: Optional[SequenceData] = None) -> np.ndarray:
        """
        Predict the most likely cluster for each sequence.

        Args:
            sequences: Optional SequenceData (uses self.observations if None)

        Returns:
            numpy array: Predicted cluster index for each sequence

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self.responsibilities is None:
            raise ValueError("Model must be fitted before prediction. Use fit() first.")

        if sequences is None:
            # Reuse the responsibilities stored by fit().
            return np.argmax(self.responsibilities, axis=1)

        # Compute responsibilities for new sequences
        X, lengths = sequence_data_to_hmmlearn_format(sequences)
        log_joint = self._joint_log_likelihoods(X, lengths)
        responsibilities, _ = self._posterior_from_log_joint(log_joint)
        return np.argmax(responsibilities, axis=1)

    def __repr__(self) -> str:
        """String representation of the MHMM."""
        status = "fitted" if self.log_likelihood is not None else "unfitted"
        return (f"MHMM(n_clusters={self.n_clusters}, n_states={self.n_states}, "
                f"n_sequences={self.n_sequences}, status='{status}')")