sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,411 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : bootstrap.py
4
+ @Time : 2025-10-05 08:15
5
+ @Desc : Bootstrap confidence intervals for HMM model coefficients
6
+
7
+ This module provides functions for computing bootstrap confidence intervals
8
+ for model parameters, similar to seqHMM's bootstrap_coefs() function in R.
9
+ """
10
+
11
+ import numpy as np
12
+ from typing import Optional, List, Dict, Callable, Union
13
+ from .hmm import HMM
14
+ from .mhmm import MHMM
15
+ from .nhmm import NHMM
16
+ from sequenzo.define_sequence_data import SequenceData
17
+
18
+ # Try to import tqdm for progress bar, but make it optional
19
+ try:
20
+ from tqdm import tqdm
21
+ HAS_TQDM = True
22
+ except ImportError:
23
+ HAS_TQDM = False
24
+
25
+
26
def bootstrap_model(
    model: Union[HMM, MHMM, NHMM],
    n_sim: int = 100,
    method: str = 'nonparametric',
    random_state: Optional[int] = None,
    verbose: bool = True,
    n_jobs: int = 1
) -> dict:
    """
    Bootstrap sampling for HMM model coefficients.

    For each bootstrap replicate the sequences of the fitted model are
    resampled with replacement, a new model of the same type is built and
    refitted, and its parameters are stored. This is similar to seqHMM's
    bootstrap_coefs() function in R.

    Args:
        model: Fitted HMM, MHMM, or NHMM model object
        n_sim: Number of bootstrap samples (must be >= 1). Default is 100.
        method: Bootstrap method. Options:
            - 'nonparametric': Resample sequences with replacement (default)
            - 'parametric': Not yet implemented (rejected with ValueError)
        random_state: Random seed for reproducibility
        verbose: Whether to show a progress bar (or a plain message when
            tqdm is not installed) and to print per-replicate failures
        n_jobs: Number of parallel jobs (not yet implemented; the argument
            is accepted but ignored)

    Returns:
        dict: Dictionary containing:
            - 'bootstrap_samples': List of bootstrap parameter estimates
            - 'original_model': Original model object
            - 'n_sim': Number of bootstrap samples requested
            - 'n_successful': Number of replicates that were fitted
              successfully (may be smaller than 'n_sim')
            - 'method': Bootstrap method used
            - 'summary': Summary statistics computed by
              _compute_bootstrap_summary()

    Raises:
        ValueError: If the model has not been fitted, if `method` is not
            supported, if `n_sim` is not positive, or if every bootstrap
            replicate failed.

    Examples:
        >>> from sequenzo.seqhmm import build_hmm, fit_model, bootstrap_model
        >>>
        >>> # Fit model
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm = fit_model(hmm)
        >>>
        >>> # Bootstrap
        >>> boot_results = bootstrap_model(hmm, n_sim=100, verbose=True)
        >>>
        >>> # Get confidence intervals
        >>> ci = boot_results['summary']['ci_95']
        >>> print(f"95% CI for initial_probs: {ci['initial_probs']}")
    """
    if model.log_likelihood is None:
        raise ValueError("Model must be fitted before bootstrapping. Use fit_model() first.")

    # Validate the request before doing any work. Checking `method` only
    # inside the loop meant that n_sim <= 0 skipped the check entirely and
    # ended in the misleading "All bootstrap samples failed" error.
    if method != 'nonparametric':
        raise ValueError(f"Unknown bootstrap method: {method}")
    if n_sim < 1:
        raise ValueError(f"n_sim must be a positive integer, got {n_sim}")

    rng = np.random.RandomState(random_state)
    n_sequences = model.n_sequences

    # Parameter dictionaries of the replicates that could be fitted
    bootstrap_samples = []

    # Progress reporting: tqdm when available, otherwise a single message
    if verbose and HAS_TQDM:
        iterator = tqdm(range(n_sim), desc="Bootstrap sampling")
    else:
        iterator = range(n_sim)
        if verbose:
            print(f"Running {n_sim} bootstrap samples...")

    for b in iterator:
        # Resample sequences with replacement
        bootstrap_indices = rng.choice(n_sequences, size=n_sequences, replace=True)
        bootstrap_obs = _resample_sequences(model.observations, bootstrap_indices)

        # Best effort by design: a replicate that cannot be built or fitted
        # is skipped so that one degenerate resample does not abort the run.
        try:
            bootstrap_model_obj = _create_bootstrap_model(model, bootstrap_obs)
            bootstrap_model_obj = _fit_bootstrap_model(bootstrap_model_obj)
            bootstrap_samples.append(_extract_parameters(bootstrap_model_obj))
        except Exception as e:
            if verbose:
                print(f"Warning: Bootstrap sample {b+1} failed: {e}")
            continue

    if not bootstrap_samples:
        raise ValueError("All bootstrap samples failed. Check model fitting.")

    # Compute summary statistics
    summary = _compute_bootstrap_summary(bootstrap_samples, model)

    return {
        'bootstrap_samples': bootstrap_samples,
        'original_model': model,
        'n_sim': n_sim,
        'n_successful': len(bootstrap_samples),
        'method': method,
        'summary': summary
    }
130
+
131
+
132
def _resample_sequences(observations: SequenceData, indices: np.ndarray) -> SequenceData:
    """
    Resample sequences based on bootstrap indices.

    Args:
        observations: Original SequenceData object
        indices: Bootstrap indices (positional row numbers of the sequences
            to include; repeats are allowed)

    Returns:
        SequenceData: New SequenceData object built from the resampled rows,
        reusing the states and labels of `observations` and no id column
    """
    # Select rows by position (duplicates allowed) and renumber them so the
    # resampled frame gets a fresh 0..n-1 index.
    resampled_df = observations.to_dataframe().iloc[indices].reset_index(drop=True)

    # Take the time columns from `observations.values` when it exposes
    # column labels. The original only checked that `values` existed and
    # then read `.columns` unconditionally, so the fallback below could not
    # be reached for a `values` object without columns (AttributeError).
    columns = getattr(getattr(observations, 'values', None), 'columns', None)
    if columns is not None:
        time_cols = columns.tolist()
    else:
        # Infer 1-based time labels from the longest sequence
        max_length = max(len(seq) for seq in observations.sequences)
        time_cols = list(range(1, max_length + 1))

    return SequenceData(
        resampled_df,
        time=time_cols,
        states=observations.states,
        labels=observations.labels,
        id_col=None
    )
175
+
176
+
177
def _create_bootstrap_model(
    original_model: Union[HMM, MHMM, NHMM],
    bootstrap_obs: SequenceData
) -> Union[HMM, MHMM, NHMM]:
    """
    Build an unfitted model of the same kind as `original_model` on new data.

    Copies of the original model's parameters are handed to the matching
    builder together with the resampled observations; `random_state=None`
    is passed to every builder.

    Args:
        original_model: Original fitted model
        bootstrap_obs: Bootstrap resampled observations

    Returns:
        New model object with same structure as original

    Raises:
        ValueError: If `original_model` is not an HMM, MHMM or NHMM.
    """
    if isinstance(original_model, HMM):
        from .build_hmm import build_hmm

        return build_hmm(
            bootstrap_obs,
            n_states=original_model.n_states,
            initial_probs=original_model.initial_probs.copy(),
            transition_probs=original_model.transition_probs.copy(),
            emission_probs=original_model.emission_probs.copy(),
            state_names=original_model.state_names,
            random_state=None
        )

    if isinstance(original_model, MHMM):
        from .build_mhmm import build_mhmm

        # Collect per-cluster parameters (arrays are copied, names shared)
        clusters = original_model.clusters
        init_per_cluster = [cl.initial_probs.copy() for cl in clusters]
        trans_per_cluster = [cl.transition_probs.copy() for cl in clusters]
        emiss_per_cluster = [cl.emission_probs.copy() for cl in clusters]
        names_per_cluster = [cl.state_names for cl in clusters]

        return build_mhmm(
            bootstrap_obs,
            n_clusters=original_model.n_clusters,
            n_states=[cl.n_states for cl in original_model.clusters],
            initial_probs=init_per_cluster,
            transition_probs=trans_per_cluster,
            emission_probs=emiss_per_cluster,
            cluster_probs=original_model.cluster_probs.copy(),
            cluster_names=original_model.cluster_names,
            state_names=names_per_cluster,
            random_state=None
        )

    if isinstance(original_model, NHMM):
        from .build_nhmm import build_nhmm

        # NOTE(review): X is passed through unchanged, i.e. it is not
        # reordered to follow the resampled sequences - confirm that this
        # is intended for covariates that vary by sequence.
        return build_nhmm(
            bootstrap_obs,
            n_states=original_model.n_states,
            X=original_model.X,
            eta_pi=original_model.eta_pi.copy(),
            eta_A=original_model.eta_A.copy(),
            eta_B=original_model.eta_B.copy(),
            state_names=original_model.state_names,
            random_state=None
        )

    raise ValueError(f"Unknown model type: {type(original_model)}")
240
+
241
+
242
def _fit_bootstrap_model(model: Union[HMM, MHMM, NHMM]) -> Union[HMM, MHMM, NHMM]:
    """
    Refit one bootstrap replicate with the fitter matching its model type.

    Every fitter is called with `n_iter=50` and `verbose=False`; the
    tolerance is 1e-2 for HMM and MHMM and 1e-3 for NHMM.

    Args:
        model: Bootstrap model object

    Returns:
        Fitted model

    Raises:
        ValueError: If `model` is not an HMM, MHMM or NHMM.
    """
    # Only the fitter that is actually needed gets imported
    if isinstance(model, HMM):
        from .fit_model import fit_model as fitter
        tolerance = 1e-2
    elif isinstance(model, MHMM):
        from .fit_mhmm import fit_mhmm as fitter
        tolerance = 1e-2
    elif isinstance(model, NHMM):
        from .fit_nhmm import fit_nhmm as fitter
        tolerance = 1e-3
    else:
        raise ValueError(f"Unknown model type: {type(model)}")

    return fitter(model, n_iter=50, tol=tolerance, verbose=False)
266
+
267
+
268
def _extract_parameters(model: Union[HMM, MHMM, NHMM]) -> dict:
    """
    Collect copies of the estimated parameters of a fitted model.

    Args:
        model: Fitted model object (HMM, MHMM or NHMM)

    Returns:
        dict: Parameter copies keyed by name.
            - HMM: 'initial_probs', 'transition_probs', 'emission_probs'
            - MHMM: 'cluster_probs', 'clusters' (one dict with the three HMM
              keys per cluster) and 'coefficients' (None when the model has
              no coefficients)
            - NHMM: 'eta_pi', 'eta_A', 'eta_B'

    Raises:
        ValueError: If ``model`` is not an HMM, MHMM or NHMM instance
    """
    def _copied(source, names):
        # Call .copy() on each attribute so the result is detached from the model.
        return {name: getattr(source, name).copy() for name in names}

    hmm_fields = ('initial_probs', 'transition_probs', 'emission_probs')

    if isinstance(model, HMM):
        return _copied(model, hmm_fields)

    if isinstance(model, MHMM):
        params = {'cluster_probs': model.cluster_probs.copy()}
        params['clusters'] = [_copied(cluster, hmm_fields) for cluster in model.clusters]
        coefficients = model.coefficients
        params['coefficients'] = None if coefficients is None else coefficients.copy()
        return params

    if isinstance(model, NHMM):
        return _copied(model, ('eta_pi', 'eta_A', 'eta_B'))

    raise ValueError(f"Unknown model type: {type(model)}")
308
+
309
+
310
def _compute_bootstrap_summary(
    bootstrap_samples: List[dict],
    original_model: Union[HMM, MHMM, NHMM]
) -> dict:
    """
    Summarise bootstrap parameter draws element-wise.

    Args:
        bootstrap_samples: List of parameter dictionaries, one per bootstrap
            replicate
        original_model: Original fitted model; only its type (and, for MHMM,
            its ``n_clusters``) is used here

    Returns:
        dict: For every summarised parameter a dictionary with
            - 'mean': mean over replicates
            - 'std': standard deviation over replicates (np.std defaults)
            - 'ci_95': 2.5th and 97.5th percentiles over replicates
        For MHMM the result holds 'cluster_probs' plus a 'clusters' list with
        one such summary per cluster. An unrecognised model type yields an
        empty dict.
    """
    def _describe(draws):
        # Stack the replicates along a new leading axis and reduce over it.
        stacked = np.array(draws)
        return {
            'mean': np.mean(stacked, axis=0),
            'std': np.std(stacked, axis=0),
            'ci_95': np.percentile(stacked, [2.5, 97.5], axis=0)
        }

    hmm_fields = ('initial_probs', 'transition_probs', 'emission_probs')

    if isinstance(original_model, HMM):
        return {
            field: _describe([sample[field] for sample in bootstrap_samples])
            for field in hmm_fields
        }

    if isinstance(original_model, MHMM):
        # Mixture weights first, then the per-cluster HMM parameters.
        result = {
            'cluster_probs': _describe(
                [sample['cluster_probs'] for sample in bootstrap_samples]
            )
        }
        result['clusters'] = [
            {
                field: _describe(
                    [sample['clusters'][idx][field] for sample in bootstrap_samples]
                )
                for field in hmm_fields
            }
            for idx in range(original_model.n_clusters)
        ]
        return result

    if isinstance(original_model, NHMM):
        return {
            field: _describe([sample[field] for sample in bootstrap_samples])
            for field in ('eta_pi', 'eta_A', 'eta_B')
        }

    return {}
@@ -0,0 +1,142 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : build_hmm.py
4
+ @Time : 2025-11-10 09:05
5
+ @Desc : Build HMM models from SequenceData
6
+
7
+ This module provides the build_hmm function, which creates HMM model objects
8
+ similar to seqHMM's build_hmm() function in R.
9
+ """
10
+
11
+ import numpy as np
12
+ from typing import Optional, List, Union
13
+ from sequenzo.define_sequence_data import SequenceData
14
+ from .multichannel_utils import prepare_multichannel_data
15
+ from .hmm import HMM
16
+ from .utils import (
17
+ create_initial_probs,
18
+ create_transition_probs,
19
+ create_emission_probs
20
+ )
21
+
22
+
23
def build_hmm(
    observations: Union[SequenceData, List[SequenceData]],
    n_states: Optional[int] = None,
    initial_probs: Optional[np.ndarray] = None,
    transition_probs: Optional[np.ndarray] = None,
    emission_probs: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
    state_names: Optional[List[str]] = None,
    channel_names: Optional[List[str]] = None,
    random_state: Optional[int] = None
) -> HMM:
    """
    Assemble an (unfitted) Hidden Markov Model object.

    Missing parameters are filled in, all parameter shapes are checked against
    ``n_states`` and the alphabet size, and the result is wrapped in an HMM.
    Modelled after seqHMM's build_hmm() function in R.

    Args:
        observations: Sequences to model. The alphabet size is read from
            ``observations.alphabet``.
            NOTE(review): the annotation also allows a list of SequenceData
            (multichannel), but a plain list has no ``alphabet`` attribute --
            confirm whether list input is meant to work here.
        n_states: Number of hidden states. When None it is inferred from the
            first parameter that is supplied, checked in the order
            initial_probs, transition_probs, emission_probs.
        initial_probs: Initial state probabilities (n_states,).
            If None, a uniform vector is created.
        transition_probs: Transition probability matrix (n_states x n_states).
            If None, it is randomly initialized.
        emission_probs: Emission probability matrix (n_states x n_symbols).
            If None, it is randomly initialized.
        state_names: Optional names for hidden states, forwarded to HMM.
        channel_names: Optional names for channels, forwarded to HMM.
        random_state: Seed used for the random transition/emission
            initialization; also forwarded to HMM.

    Returns:
        HMM: An HMM model object (not yet fitted)

    Raises:
        ValueError: If n_states cannot be determined, or if a parameter's
            shape does not match n_states / the alphabet size.

    Examples:
        >>> from sequenzo import SequenceData, load_dataset
        >>> from sequenzo.seqhmm import build_hmm
        >>>
        >>> df = load_dataset('mvad')
        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
        >>>
        >>> # 4 hidden states, generated starting values
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>>
        >>> # Fully user-specified starting values
        >>> init_probs = np.array([0.3, 0.3, 0.2, 0.2])
        >>> trans_probs = np.array([[0.8, 0.1, 0.05, 0.05],
        ...                         [0.05, 0.8, 0.1, 0.05],
        ...                         [0.05, 0.05, 0.8, 0.1],
        ...                         [0.05, 0.05, 0.1, 0.8]])
        >>> emission_probs = np.random.rand(4, 6)  # 4 states, 6 symbols
        >>> emission_probs = emission_probs / emission_probs.sum(axis=1, keepdims=True)
        >>> hmm = build_hmm(seq, initial_probs=init_probs,
        ...                 transition_probs=trans_probs,
        ...                 emission_probs=emission_probs)
    """
    # Infer the state count from the first parameter the caller supplied.
    if n_states is None:
        size_sources = (
            (initial_probs, len),
            (transition_probs, lambda matrix: matrix.shape[0]),
            (emission_probs, lambda matrix: matrix.shape[0]),
        )
        for supplied, state_count in size_sources:
            if supplied is not None:
                n_states = state_count(supplied)
                break
        else:
            raise ValueError(
                "n_states must be provided if initial_probs, transition_probs, "
                "and emission_probs are all None"
            )

    # Number of observable symbols
    n_symbols = len(observations.alphabet)

    # Fill in whatever was not supplied
    if initial_probs is None:
        initial_probs = create_initial_probs(n_states, method='uniform')
    if transition_probs is None:
        transition_probs = create_transition_probs(
            n_states, method='random', random_state=random_state
        )
    if emission_probs is None:
        emission_probs = create_emission_probs(
            n_states, n_symbols, method='random', random_state=random_state
        )

    # Shape checks (these also cover the generated defaults)
    n_initial = len(initial_probs)
    if n_initial != n_states:
        raise ValueError(
            f"initial_probs length ({len(initial_probs)}) must equal n_states ({n_states})"
        )

    transition_shape = transition_probs.shape
    if transition_shape != (n_states, n_states):
        raise ValueError(
            f"transition_probs shape ({transition_probs.shape}) must be ({n_states}, {n_states})"
        )

    emission_shape = emission_probs.shape
    if emission_shape != (n_states, n_symbols):
        raise ValueError(
            f"emission_probs shape ({emission_probs.shape}) must be ({n_states}, {n_symbols})"
        )

    return HMM(
        observations=observations,
        n_states=n_states,
        initial_probs=initial_probs,
        transition_probs=transition_probs,
        emission_probs=emission_probs,
        state_names=state_names,
        channel_names=channel_names,
        random_state=random_state
    )
@@ -0,0 +1,136 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : build_mhmm.py
4
+ @Time : 2025-11-21 10:55
5
+ @Desc : Build Mixture HMM models from SequenceData
6
+
7
+ This module provides the build_mhmm function, which creates Mixture HMM model objects
8
+ similar to seqHMM's build_mhmm() function in R.
9
+ """
10
+
11
+ import numpy as np
12
+ from typing import Optional, List, Union
13
+ from sequenzo.define_sequence_data import SequenceData
14
+ from .mhmm import MHMM
15
+ from .hmm import HMM
16
+ from .utils import (
17
+ create_initial_probs,
18
+ create_transition_probs,
19
+ create_emission_probs
20
+ )
21
+
22
+
23
def build_mhmm(
    observations: SequenceData,
    n_clusters: int,
    n_states: Union[int, List[int]],
    initial_probs: Optional[List[np.ndarray]] = None,
    transition_probs: Optional[List[np.ndarray]] = None,
    emission_probs: Optional[List[np.ndarray]] = None,
    cluster_probs: Optional[np.ndarray] = None,
    cluster_names: Optional[List[str]] = None,
    state_names: Optional[List[List[str]]] = None,
    channel_names: Optional[List[str]] = None,
    random_state: Optional[int] = None
) -> MHMM:
    """
    Build a Mixture Hidden Markov Model object.

    A Mixture HMM consists of multiple HMM submodels (clusters). This function
    creates one HMM per cluster and wraps them in an MHMM; it does not fit the
    model (use fit_mhmm() for that).

    It is similar to seqHMM's build_mhmm() function in R.

    Args:
        observations: SequenceData object containing the sequences to model
        n_clusters: Number of clusters (submodels); must be at least 1
        n_states: Number of hidden states per cluster. Can be:
            - int (Python or NumPy integer): same number for all clusters
            - sequence of ints: one entry per cluster
        initial_probs: Optional list of initial state probabilities, one per cluster.
        transition_probs: Optional list of transition matrices, one per cluster.
        emission_probs: Optional list of emission matrices, one per cluster.
            For each of these three lists, a cluster whose index lies beyond
            the end of the list is given None for that parameter (the HMM
            constructor then decides how to fill it).
        cluster_probs: Optional initial cluster probabilities, forwarded to MHMM.
        cluster_names: Optional names for clusters
        state_names: Optional names for hidden states. Should be a list of lists,
            where state_names[k] contains names for cluster k.
        channel_names: Optional names for channels
        random_state: Seed for initialization. The MHMM receives it unchanged;
            cluster k receives ``random_state + k`` so that clusters do not all
            share one seed.

    Returns:
        MHMM: A Mixture HMM model object (not yet fitted)

    Raises:
        ValueError: If n_clusters is smaller than 1, or if n_states is a
            sequence whose length differs from n_clusters.

    Examples:
        >>> from sequenzo import SequenceData, load_dataset
        >>> from sequenzo.seqhmm import build_mhmm
        >>>
        >>> df = load_dataset('mvad')
        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
        >>>
        >>> # 3 clusters, 4 states each
        >>> mhmm = build_mhmm(seq, n_clusters=3, n_states=4, random_state=42)
        >>>
        >>> # Different number of states per cluster
        >>> mhmm = build_mhmm(seq, n_clusters=3, n_states=[4, 4, 6], random_state=42)
    """
    if n_clusters < 1:
        raise ValueError(f"n_clusters must be a positive integer, got {n_clusters}")

    # Accept NumPy integer scalars as well as Python ints; a NumPy scalar used
    # to fall through to len() below and fail with a TypeError.
    if isinstance(n_states, (int, np.integer)):
        n_states_list = [int(n_states)] * n_clusters
    else:
        # Copy so the model does not alias the caller's sequence.
        n_states_list = list(n_states)

    if len(n_states_list) != n_clusters:
        raise ValueError(
            f"n_states length ({len(n_states_list)}) must equal n_clusters ({n_clusters})"
        )

    def _for_cluster(per_cluster, index):
        # Lists shorter than n_clusters are tolerated: missing entries are None.
        if per_cluster is not None and index < len(per_cluster):
            return per_cluster[index]
        return None

    # Build HMM clusters
    clusters = []
    for k, cluster_n_states in enumerate(n_states_list):
        # Give every cluster its own seed. Assumes HMM draws its random
        # starting values from random_state (TODO confirm); with one shared
        # seed, clusters with equal state counts would then start identical.
        cluster_seed = None if random_state is None else random_state + k

        clusters.append(
            HMM(
                observations=observations,
                n_states=cluster_n_states,
                initial_probs=_for_cluster(initial_probs, k),
                transition_probs=_for_cluster(transition_probs, k),
                emission_probs=_for_cluster(emission_probs, k),
                state_names=_for_cluster(state_names, k),
                channel_names=channel_names,
                random_state=cluster_seed
            )
        )

    # Create and return MHMM object
    return MHMM(
        observations=observations,
        n_clusters=n_clusters,
        n_states=n_states_list,
        clusters=clusters,
        cluster_probs=cluster_probs,
        cluster_names=cluster_names,
        state_names=state_names,
        channel_names=channel_names,
        random_state=random_state
    )