sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,238 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : model_comparison.py
4
+ @Time : 2025-10-08 14:32
5
+ @Desc : Model comparison functions (AIC, BIC) for HMM models
6
+
7
+ This module provides functions for computing AIC and BIC to compare different
8
+ HMM models, similar to seqHMM's logLik() and summary() functions in R.
9
+ """
10
+
11
+ import numpy as np
12
+ from typing import Optional
13
+ from .hmm import HMM
14
+ from .mhmm import MHMM
15
+ from .nhmm import NHMM
16
+
17
+
18
def compute_n_parameters(model) -> int:
    """
    Compute the number of free parameters in a model.

    This is used for computing AIC and BIC. The number of parameters
    (degrees of freedom) is the number of estimable parameters in the model.

    Single-channel models expose ``n_symbols`` as one integer; multichannel
    models expose one entry per channel. Both forms are supported: emission
    parameters are counted channel by channel and summed.

    Args:
        model: HMM, MHMM, or NHMM model object

    Returns:
        int: Number of free parameters

    Raises:
        ValueError: If ``model`` is not an HMM, MHMM, or NHMM instance.
    """
    if isinstance(model, HMM):
        return _count_hmm_parameters(model.n_states, model.n_symbols)

    if isinstance(model, MHMM):
        n_clusters = model.n_clusters

        # Cluster probabilities sum to 1, so one of them is not free.
        n_cluster = n_clusters - 1

        # Every cluster is parameterised like a basic HMM.
        n_per_cluster = sum(
            _count_hmm_parameters(
                model.clusters[k].n_states, model.clusters[k].n_symbols
            )
            for k in range(n_clusters)
        )

        # Covariate coefficients (if any): n_covariates * (n_clusters - 1).
        # The coefficients of one cluster are fixed at zero, so only the
        # remaining clusters contribute. Deriving n_covariates from the total
        # size keeps the count correct whichever axis indexes the clusters.
        n_coefs = 0
        if model.coefficients is not None and n_clusters > 0:
            n_covariates = np.asarray(model.coefficients).size // n_clusters
            n_coefs = n_covariates * (n_clusters - 1)

        return int(n_cluster + n_per_cluster + n_coefs)

    if isinstance(model, NHMM):
        # For Non-homogeneous HMM:
        # - eta_pi: n_covariates * n_states
        # - eta_A: n_covariates * n_states * n_states
        # - eta_B: n_covariates * n_states * n_symbols (summed over channels)
        # Constraints are deliberately not subtracted here; the original
        # implementation notes that the softmax parameterisation handles them.
        total_symbols = sum(_symbols_per_channel(model.n_symbols))
        n_pi = model.n_covariates * model.n_states
        n_A = model.n_covariates * model.n_states * model.n_states
        n_B = model.n_covariates * model.n_states * total_symbols
        return int(n_pi + n_A + n_B)

    raise ValueError(f"Unknown model type: {type(model)}")


def _symbols_per_channel(n_symbols) -> list:
    """Return ``n_symbols`` as a list with one integer per channel."""
    # np.atleast_1d turns a scalar into a length-1 array and leaves a
    # per-channel sequence untouched, so both layouts share one code path.
    return [int(s) for s in np.atleast_1d(n_symbols)]


def _count_hmm_parameters(n_states, n_symbols) -> int:
    """Count the free parameters of a homogeneous HMM with the given sizes."""
    # Initial probabilities: n_states - 1 (they sum to 1).
    n_init = n_states - 1
    # Transition probabilities: each of the n_states rows sums to 1.
    n_trans = n_states * (n_states - 1)
    # Emission probabilities: each row sums to 1 within every channel.
    n_emiss = sum(n_states * (s - 1) for s in _symbols_per_channel(n_symbols))
    return int(n_init + n_trans + n_emiss)
77
+
78
+
79
def compute_n_observations(model) -> int:
    """
    Compute the number of observations in a model.

    A fully observed time point of a single sequence amounts to one
    observation, regardless of how many channels the model has.

    Args:
        model: HMM, MHMM, or NHMM model object

    Returns:
        int: Number of observations

    Raises:
        ValueError: If ``model`` is not an HMM, MHMM, or NHMM instance.
    """
    if not isinstance(model, (HMM, MHMM, NHMM)):
        raise ValueError(f"Unknown model type: {type(model)}")

    # NOTE(review): `sequence_lengths` appears to hold one length per sequence
    # that is shared by all channels (the multichannel EM fitting code reads
    # it that way), so its sum already counts every time point exactly once.
    # Dividing that sum by n_channels, as was done previously, undercounts the
    # observations of multichannel models -- confirm against the model classes.
    return int(sum(model.sequence_lengths))
101
+
102
+
103
def aic(model, log_likelihood: Optional[float] = None) -> float:
    """
    Return the Akaike Information Criterion (AIC) of a fitted model.

    The criterion is defined as

        AIC = -2 * log-likelihood + 2 * n_parameters

    and rewards goodness of fit while penalising model size, so a smaller
    value points to a preferable model. It mirrors what seqHMM users obtain
    in R through stats::AIC(logLik(model)).

    Args:
        model: Fitted HMM, MHMM, or NHMM model object
        log_likelihood: Log-likelihood to use. When omitted (None), the value
            stored in model.log_likelihood is used instead.

    Returns:
        float: AIC value

    Raises:
        ValueError: If no log-likelihood is passed and the model has none,
            i.e. it has not been fitted yet.

    Examples:
        >>> from sequenzo.seqhmm import build_hmm, fit_model, aic
        >>>
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm = fit_model(hmm)
        >>> aic_value = aic(hmm)
        >>> print(f"AIC: {aic_value:.2f}")
    """
    ll = log_likelihood
    if ll is None:
        # Fall back to the value the fitting routine stored on the model.
        ll = model.log_likelihood
        if ll is None:
            raise ValueError("Model must be fitted before computing AIC. Use fit_model() first.")

    complexity_penalty = 2 * compute_n_parameters(model)
    return -2 * ll + complexity_penalty
137
+
138
+
139
def bic(model, log_likelihood: Optional[float] = None) -> float:
    """
    Compute Bayesian Information Criterion (BIC) for a model.

    BIC = -2 * log-likelihood + log(n_observations) * n_parameters

    Lower BIC values indicate better models. BIC penalizes complexity more
    than AIC, especially for large datasets.

    This is similar to seqHMM's BIC computation via stats::BIC(logLik(model)).

    Args:
        model: Fitted HMM, MHMM, or NHMM model object
        log_likelihood: Optional log-likelihood value. If None, uses model.log_likelihood

    Returns:
        float: BIC value

    Raises:
        ValueError: If the model has not been fitted (no log-likelihood is
            available), or if the model contains no observations.

    Examples:
        >>> from sequenzo.seqhmm import build_hmm, fit_model, bic
        >>>
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm = fit_model(hmm)
        >>> bic_value = bic(hmm)
        >>> print(f"BIC: {bic_value:.2f}")
    """
    if log_likelihood is None:
        if model.log_likelihood is None:
            raise ValueError("Model must be fitted before computing BIC. Use fit_model() first.")
        log_likelihood = model.log_likelihood

    n_params = compute_n_parameters(model)
    n_obs = compute_n_observations(model)

    # log(0) is -inf, which would turn the penalty into -inf and make a model
    # without data look like the best possible one. Fail loudly instead.
    if n_obs <= 0:
        raise ValueError("BIC is undefined for a model with no observations.")

    return -2 * log_likelihood + np.log(n_obs) * n_params
175
+
176
+
177
def compare_models(models: list, criterion: str = 'BIC') -> dict:
    """
    Compare multiple models using AIC or BIC.

    This function computes AIC or BIC for multiple models and returns
    a comparison table, similar to comparing models in seqHMM.

    Models are labelled "Model 1", "Model 2", ... in the order they were
    passed in; the returned table is sorted from best (lowest criterion
    value) to worst.

    Args:
        models: List of fitted model objects (HMM, MHMM, or NHMM)
        criterion: Criterion to use ('AIC' or 'BIC'). Default is 'BIC'.

    Returns:
        dict: Dictionary with three keys:
            - 'criterion': the criterion name that was used
            - 'models': list of per-model dicts holding 'model',
              'log_likelihood', 'n_parameters', 'n_observations' and the
              criterion value (stored under the criterion name)
            - 'best_model': label of the model with the lowest criterion value

    Raises:
        ValueError: If `criterion` is not 'AIC' or 'BIC', if `models` is
            empty, or if any model has not been fitted.

    Examples:
        >>> from sequenzo.seqhmm import build_hmm, fit_model, compare_models
        >>>
        >>> # Fit models with different numbers of states
        >>> hmm3 = build_hmm(seq, n_states=3, random_state=42)
        >>> hmm4 = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm5 = build_hmm(seq, n_states=5, random_state=42)
        >>>
        >>> hmm3 = fit_model(hmm3)
        >>> hmm4 = fit_model(hmm4)
        >>> hmm5 = fit_model(hmm5)
        >>>
        >>> # Compare models
        >>> comparison = compare_models([hmm3, hmm4, hmm5], criterion='BIC')
        >>> print(comparison)
    """
    if criterion not in ('AIC', 'BIC'):
        raise ValueError("criterion must be 'AIC' or 'BIC'")

    # Materialise once so that any iterable is accepted and can be checked
    # for emptiness before the best model is looked up.
    models = list(models)
    if not models:
        raise ValueError("At least one model is required for comparison.")

    score = aic if criterion == 'AIC' else bic

    results = []
    for position, model in enumerate(models, start=1):
        label = f"Model {position}"
        if model.log_likelihood is None:
            # Use the same 1-based label as the result table so the message
            # points at the model the caller will recognise.
            raise ValueError(f"{label} must be fitted before comparison.")

        results.append({
            'model': label,
            'log_likelihood': model.log_likelihood,
            'n_parameters': compute_n_parameters(model),
            'n_observations': compute_n_observations(model),
            criterion: score(model),
        })

    # Sort by criterion value (lower is better)
    results.sort(key=lambda row: row[criterion])

    return {
        'criterion': criterion,
        'models': results,
        'best_model': results[0]['model'],
    }
@@ -0,0 +1,282 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : multichannel_em.py
4
+ @Time : 2025-11-08 13:52
5
+ @Desc : EM algorithm for multichannel HMM
6
+
7
+ This module provides the EM algorithm implementation for multichannel HMM,
8
+ where each sequence has multiple parallel channels (e.g., marriage, children, residence).
9
+ """
10
+
11
+ import numpy as np
12
+ from typing import List
13
+ from .hmm import HMM
14
+ from .multichannel_utils import multichannel_to_hmmlearn_format, compute_multichannel_emission_prob
15
+ from .utils import sequence_data_to_hmmlearn_format, state_to_int_mapping
16
+
17
+
18
def fit_multichannel_hmm(
    model: HMM,
    n_iter: int = 100,
    tol: float = 1e-2,
    verbose: bool = False
) -> HMM:
    """
    Fit a multichannel HMM using the EM (Baum-Welch) algorithm.

    For multichannel HMM, the emission probability of a time step is the
    product of the per-channel emission probabilities (channels are assumed
    conditionally independent given the hidden state).

    This is similar to seqHMM's multichannel HMM fitting in R.

    The model is updated in place: ``initial_probs``, ``transition_probs``,
    ``emission_probs``, ``log_likelihood``, ``n_iter`` and ``converged`` are
    (re)assigned. Missing parameters are initialised first: uniform initial
    and transition probabilities, random row-normalised emissions.

    Args:
        model: HMM model object with multichannel data. Reads ``n_channels``,
            ``n_states``, ``channels``, ``sequence_lengths`` and ``n_symbols``.
        n_iter: Maximum number of EM iterations (must be >= 1)
        tol: Convergence tolerance on the absolute change in total
            log-likelihood between consecutive iterations
        verbose: Whether to print progress

    Returns:
        HMM: The same model object, fitted. ``log_likelihood`` is the total
        log-likelihood evaluated in the last E-step, i.e. under the
        parameters in effect before the final M-step.

    Raises:
        ValueError: If ``n_iter`` is smaller than 1, or if the model holds no
            sequences or a sequence of length zero.
    """
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1")

    # Small constant added before multiplying/normalising posteriors so that
    # exact zeros cannot wipe out a state (kept from the original log-space
    # formulation, which used log(x + 1e-10)).
    eps = 1e-10

    n_channels = model.n_channels
    n_states = model.n_states
    channels = model.channels

    # Sequence lengths (same for all channels)
    lengths = np.asarray(model.sequence_lengths, dtype=int)
    n_sequences = len(lengths)
    if n_sequences == 0 or np.any(lengths < 1):
        raise ValueError(
            "model must contain at least one sequence and no empty sequences"
        )

    # Initialize parameters if not provided
    if model.initial_probs is None:
        model.initial_probs = np.ones(n_states) / n_states

    if model.transition_probs is None:
        model.transition_probs = np.ones((n_states, n_states)) / n_states

    if model.emission_probs is None or not isinstance(model.emission_probs, list):
        # Random, row-normalised emission matrix for each channel
        model.emission_probs = []
        for ch in range(n_channels):
            emission_ch = np.random.rand(n_states, model.n_symbols[ch])
            emission_ch = emission_ch / emission_ch.sum(axis=1, keepdims=True)
            model.emission_probs.append(emission_ch)

    # Convert channels to the concatenated observation arrays and slice them
    # per sequence once; the slices do not change between EM iterations.
    # NOTE(review): column 0 of each array is used to index emission matrix
    # columns, so it is assumed to hold integer symbol codes - confirm
    # against sequence_data_to_hmmlearn_format.
    X_list = [
        sequence_data_to_hmmlearn_format(channels[ch])[0]
        for ch in range(n_channels)
    ]
    ends = np.cumsum(lengths)
    starts = ends - lengths
    obs_by_seq = [
        [X_ch[start:end, 0] for X_ch in X_list]
        for start, end in zip(starts, ends)
    ]

    # Reset so a stale flag from an earlier fit cannot leak through.
    model.converged = False
    prev_log_likelihood = -np.inf
    total_log_lik = -np.inf

    for iteration in range(n_iter):
        # Snapshot of the current parameters; the whole E-step uses these.
        pi = np.asarray(model.initial_probs, dtype=float)
        A = np.asarray(model.transition_probs, dtype=float)
        B_list = model.emission_probs

        # Sufficient statistics accumulated over all sequences
        total_log_lik = 0.0
        gamma_0 = np.zeros(n_states)
        xi_sum = np.zeros((n_states, n_states))
        # Stored as (n_symbols, n_states) so rows can be indexed by symbol.
        emission_counts = [
            np.zeros((model.n_symbols[ch], n_states))
            for ch in range(n_channels)
        ]

        for seq_length, obs_list in zip(lengths, obs_by_seq):
            # Joint emission likelihood: emis[i, t] = prod_ch B_ch[i, obs_ch[t]]
            emis = np.ones((n_states, seq_length))
            for ch in range(n_channels):
                emis *= B_list[ch][:, obs_list[ch]]

            # ---- Forward pass (scaled to prevent underflow) ----
            alpha = np.zeros((n_states, seq_length))
            alpha[:, 0] = pi * emis[:, 0]
            scale = alpha[:, 0].sum()
            alpha[:, 0] /= scale
            log_scale = np.log(scale)

            for t in range(1, seq_length):
                # alpha[j, t] = sum_i(alpha[i, t-1] * A[i, j]) * B[j, obs[t]]
                alpha[:, t] = (alpha[:, t - 1] @ A) * emis[:, t]
                scale = alpha[:, t].sum()
                alpha[:, t] /= scale
                log_scale += np.log(scale)

            # The log of the scaling factors sums to the sequence log-likelihood.
            total_log_lik += log_scale

            # ---- Backward pass ----
            beta = np.ones((n_states, seq_length))
            for t in range(seq_length - 2, -1, -1):
                # beta[i, t] = sum_j(A[i, j] * B[j, obs[t+1]] * beta[j, t+1])
                beta[:, t] = A @ (emis[:, t + 1] * beta[:, t + 1])
                # Any per-time-step rescaling is fine: gamma and xi are
                # re-normalised per time step below.
                beta[:, t] /= beta[:, t].sum()

            alpha_s = alpha + eps
            beta_s = beta + eps

            # Gamma: posterior probability of state i at time t
            gamma = alpha_s * beta_s
            gamma /= gamma.sum(axis=0, keepdims=True)
            gamma_0 += gamma[:, 0]

            # Xi: joint posterior of state i at t and state j at t+1,
            # normalised once per time step over all (i, j) pairs.
            if seq_length > 1:
                future = (emis[:, 1:] + eps) * beta_s[:, 1:]
                xi = np.einsum('it,ij,jt->tij', alpha_s[:, :-1], A + eps, future)
                xi /= xi.sum(axis=(1, 2), keepdims=True)
                xi_sum += xi.sum(axis=0)

            # Expected emission counts: counts[symbol, i] += gamma[i, t]
            for ch in range(n_channels):
                np.add.at(emission_counts[ch], obs_list[ch], gamma.T)

        # ---- M-step ----
        model.initial_probs = gamma_0 / n_sequences

        # Normalise each row by the expected number of transitions *out of*
        # that state. Dividing by gamma summed over every time step (including
        # the last, which has no outgoing transition) would leave rows that
        # sum to less than one.
        row_totals = xi_sum.sum(axis=1)
        new_A = np.full((n_states, n_states), 1.0 / n_states)
        has_mass = row_totals > 0
        new_A[has_mass] = xi_sum[has_mass] / row_totals[has_mass, None]
        model.transition_probs = new_A

        for ch in range(n_channels):
            counts = emission_counts[ch].T  # (n_states, n_symbols)
            state_mass = counts.sum(axis=1)
            new_B = np.full(counts.shape, 1.0 / model.n_symbols[ch])
            seen = state_mass > 0
            new_B[seen] = counts[seen] / state_mass[seen, None]
            model.emission_probs[ch] = new_B

        # Check convergence
        if iteration > 0 and abs(total_log_lik - prev_log_likelihood) < tol:
            model.converged = True
            if verbose:
                print(f"Converged at iteration {iteration + 1}")
            break

        prev_log_likelihood = total_log_lik

        if verbose and (iteration + 1) % 10 == 0:
            print(f"Iteration {iteration + 1}: log-likelihood = {total_log_lik:.4f}")

    # Report the most recently computed log-likelihood, also when the loop
    # stopped early on convergence.
    model.log_likelihood = total_log_lik
    model.n_iter = iteration + 1

    if not model.converged and verbose:
        print(f"Did not converge after {n_iter} iterations")

    return model
@@ -0,0 +1,138 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : multichannel_utils.py
4
+ @Time : 2025-11-05 11:26
5
+ @Desc : Utility functions for multichannel HMM support
6
+
7
+ This module provides helper functions for handling multichannel sequence data,
8
+ where each subject has multiple parallel sequences (channels).
9
+ """
10
+
11
+ import numpy as np
12
+ from typing import List, Union, Tuple
13
+ from sequenzo.define_sequence_data import SequenceData
14
+
15
+
16
def prepare_multichannel_data(
    observations: Union[SequenceData, List[SequenceData]]
) -> Tuple[List[SequenceData], List[str], List[List[str]]]:
    """
    Prepare multichannel data for HMM.

    This function handles both single-channel (SequenceData) and
    multichannel (List[SequenceData]) inputs and normalises them to the
    same three-part structure.

    Args:
        observations: Either a single SequenceData or a list of SequenceData objects

    Returns:
        tuple: (channels, channel_names, alphabets) where:
            - channels: List of SequenceData objects (one per channel); for
              list input this is the list that was passed in
            - channel_names: Generated names "Channel 1", "Channel 2", ...
            - alphabets: List of alphabets (one per channel)

    Raises:
        ValueError: If ``observations`` is neither a SequenceData nor a list,
            if the list is empty, if any element is not a SequenceData, or
            if the channels do not all hold the same number of sequences.
    """
    if isinstance(observations, SequenceData):
        # Single channel
        return [observations], ["Channel 1"], [observations.alphabet]

    if not isinstance(observations, list):
        raise ValueError(
            f"observations must be SequenceData or List[SequenceData], got {type(observations)}"
        )

    # Multichannel
    if len(observations) == 0:
        raise ValueError("observations list cannot be empty")

    # Check element types before touching any attribute, so a wrong first
    # element yields the intended ValueError rather than an AttributeError.
    for i, obs in enumerate(observations):
        if not isinstance(obs, SequenceData):
            raise ValueError(f"observations[{i}] must be a SequenceData object")

    # Validate all channels have same number of sequences
    n_sequences = len(observations[0].sequences)
    for i, obs in enumerate(observations):
        if len(obs.sequences) != n_sequences:
            raise ValueError(
                f"All channels must have the same number of sequences. "
                f"Channel 0 has {n_sequences}, channel {i} has {len(obs.sequences)}"
            )

    # Get channel names and alphabets
    channel_names = [f"Channel {i+1}" for i in range(len(observations))]
    alphabets = [obs.alphabet for obs in observations]

    return observations, channel_names, alphabets
64
+
65
+
66
def multichannel_to_hmmlearn_format(
    channels: List[SequenceData]
) -> Tuple[List[np.ndarray], np.ndarray]:
    """
    Convert multichannel SequenceData to per-channel observation arrays.

    Each channel is converted independently with
    ``sequence_data_to_hmmlearn_format``; the resulting sequence lengths must
    be identical across channels. The arrays are returned as a list (one per
    channel) rather than merged, leaving the combination of channels to the
    caller.

    Args:
        channels: List of SequenceData objects (one per channel)

    Returns:
        tuple: (X_list, lengths) where:
            - X_list: List of observation arrays (one per channel)
            - lengths: Array of sequence lengths (same for all channels)

    Raises:
        ValueError: If ``channels`` is empty or the channels do not share the
            same sequence lengths.
    """
    if len(channels) == 0:
        raise ValueError("channels list cannot be empty")

    # Imported here rather than at module level, as in the original code.
    from .utils import sequence_data_to_hmmlearn_format

    X_list = []
    lengths_list = []

    for channel in channels:
        X, channel_lengths = sequence_data_to_hmmlearn_format(channel)
        X_list.append(X)
        lengths_list.append(channel_lengths)

    # Validate all channels have same lengths (channel 0 is the reference)
    lengths = lengths_list[0]
    for i, other_lengths in enumerate(lengths_list[1:], 1):
        if not np.array_equal(lengths, other_lengths):
            raise ValueError(
                f"All channels must have the same sequence lengths. "
                f"Channel 0 and channel {i} differ."
            )

    return X_list, lengths
108
+
109
+
110
def compute_multichannel_emission_prob(
    emission_probs: List[np.ndarray],
    observations: List[np.ndarray],
    n_states: int
) -> np.ndarray:
    """
    Compute emission probability for multichannel observations.

    For multichannel HMM, the emission probability is the product of
    emission probabilities across all channels (assuming independence).

    P(obs | state) = product over channels: P(obs_channel | state)

    Args:
        emission_probs: List of emission probability matrices, one per
            channel; column ``k`` of a matrix is selected for symbol ``k``
        observations: List of observed symbols (one per channel) at current time
        n_states: Number of hidden states

    Returns:
        numpy array: Emission probabilities (n_states,) for current observations

    Raises:
        ValueError: If the number of observations differs from the number of
            emission matrices.
    """
    n_channels = len(emission_probs)
    if len(observations) != n_channels:
        # Surplus observations used to be ignored silently; reject instead.
        raise ValueError(
            f"Expected one observation per channel ({n_channels}), "
            f"got {len(observations)}"
        )

    emission = np.ones(n_states)

    # Multiply probabilities across channels
    for channel_probs, symbol in zip(emission_probs, observations):
        emission *= channel_probs[:, symbol]

    return emission