sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,270 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : nhmm.py
4
+ @Time : 2025-11-23 13:39
5
+ @Desc : Non-homogeneous Hidden Markov Model (NHMM) for Sequenzo
6
+
7
+ A Non-homogeneous HMM allows transition and emission probabilities to vary
8
+ over time or with covariates. This is useful when the underlying process
9
+ changes over time or depends on external factors.
10
+
11
+ This is similar to seqHMM's nhmm class in R.
12
+ """
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ from typing import Optional, List, Dict, Union, Tuple
17
+ from scipy.optimize import minimize
18
+ from sequenzo.define_sequence_data import SequenceData
19
+ from .utils import sequence_data_to_hmmlearn_format
20
+ from .nhmm_utils import (
21
+ compute_transition_probs_with_covariates,
22
+ compute_emission_probs_with_covariates,
23
+ compute_initial_probs_with_covariates
24
+ )
25
+
26
+
27
class NHMM:
    """
    Non-homogeneous Hidden Markov Model for sequence analysis.

    In a Non-homogeneous HMM, transition and emission probabilities can vary
    over time or with covariates. This allows the model to capture time-varying
    or covariate-dependent patterns in the data.

    Attributes:
        observations: SequenceData object containing the observed sequences
        n_states: Number of hidden states
        n_symbols: Number of observed symbols (length of ``alphabet``)
        n_sequences: Number of sequences in ``observations``
        alphabet: List of observed state symbols
        state_names: Names for hidden states ("State 1", ... when not given)
        X: Covariate matrix (n_sequences x n_timepoints x n_covariates)
        n_covariates: Number of covariates (last dimension of ``X``)
        sequence_lengths: Array holding the length of every sequence
        length_of_sequences: Length of the longest sequence

        # Model parameters (coefficients)
        eta_pi: Coefficients for initial probabilities (n_covariates x n_states)
        eta_A: Coefficients for transition probabilities (n_covariates x n_states x n_states)
        eta_B: Coefficients for emission probabilities (n_covariates x n_states x n_symbols)

        # Fitting results (all None until fit() has been called)
        log_likelihood: Log-likelihood of the fitted model
        n_iter: Number of optimization iterations
        converged: Whether optimization converged
    """

    def __init__(
        self,
        observations: SequenceData,
        n_states: int,
        X: np.ndarray,
        eta_pi: Optional[np.ndarray] = None,
        eta_A: Optional[np.ndarray] = None,
        eta_B: Optional[np.ndarray] = None,
        state_names: Optional[List[str]] = None,
        random_state: Optional[int] = None
    ):
        """
        Initialize a Non-homogeneous HMM model.

        Args:
            observations: SequenceData object containing the sequences
            n_states: Number of hidden states
            X: Covariate matrix of shape (n_sequences, n_timepoints, n_covariates)
                where X[i, t, c] is the value of covariate c at time t for sequence i
            eta_pi: Optional coefficients for initial probabilities (n_covariates x n_states)
            eta_A: Optional coefficients for transition probabilities (n_covariates x n_states x n_states)
            eta_B: Optional coefficients for emission probabilities (n_covariates x n_states x n_symbols)
            state_names: Optional names for hidden states
            random_state: Random seed for initialization

        Raises:
            ValueError: If ``X`` is not 3-dimensional, if its first dimension
                differs from the number of sequences, or if a supplied
                coefficient array does not have the expected shape.
        """
        self.observations = observations
        self.alphabet = observations.alphabet
        self.n_symbols = len(self.alphabet)
        self.n_states = n_states
        self.n_sequences = len(observations.sequences)

        # Validate and store covariates
        if X.ndim != 3:
            raise ValueError("X must be 3-dimensional: (n_sequences, n_timepoints, n_covariates)")
        self.X = X
        self.n_covariates = X.shape[2]

        # Get sequence lengths
        self.sequence_lengths = np.array([len(seq) for seq in observations.sequences])
        self.length_of_sequences = int(self.sequence_lengths.max())

        # Validate X dimensions match sequences
        if X.shape[0] != self.n_sequences:
            raise ValueError(
                f"X first dimension ({X.shape[0]}) must equal n_sequences ({self.n_sequences})"
            )

        # Set names
        self.state_names = state_names or [f"State {i+1}" for i in range(n_states)]

        # Initialize coefficients if not provided. A single generator is used
        # for all three arrays, drawn in the order eta_pi, eta_A, eta_B, so a
        # given random_state always yields the same starting point.
        rng = np.random.RandomState(random_state)

        if eta_pi is None:
            # Initialize with small random values
            self.eta_pi = rng.randn(self.n_covariates, n_states) * 0.1
        else:
            if eta_pi.shape != (self.n_covariates, n_states):
                raise ValueError(
                    f"eta_pi shape ({eta_pi.shape}) must be ({self.n_covariates}, {n_states})"
                )
            self.eta_pi = eta_pi

        if eta_A is None:
            # Initialize with small random values
            self.eta_A = rng.randn(self.n_covariates, n_states, n_states) * 0.1
        else:
            if eta_A.shape != (self.n_covariates, n_states, n_states):
                raise ValueError(
                    f"eta_A shape ({eta_A.shape}) must be ({self.n_covariates}, {n_states}, {n_states})"
                )
            self.eta_A = eta_A

        if eta_B is None:
            # Initialize with small random values
            self.eta_B = rng.randn(self.n_covariates, n_states, self.n_symbols) * 0.1
        else:
            if eta_B.shape != (self.n_covariates, n_states, self.n_symbols):
                raise ValueError(
                    f"eta_B shape ({eta_B.shape}) must be ({self.n_covariates}, {n_states}, {self.n_symbols})"
                )
            self.eta_B = eta_B

        # Fitting results
        self.log_likelihood = None
        self.n_iter = None
        self.converged = None

    def _compute_probs(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Compute probabilities from coefficients and covariates.

        Delegates to the ``compute_*_probs_with_covariates`` helpers of
        ``nhmm_utils`` using the current ``eta_pi``, ``eta_A`` and ``eta_B``.

        Returns:
            tuple: (initial_probs, transition_probs, emission_probs)
        """
        # Compute initial probabilities
        initial_probs = compute_initial_probs_with_covariates(
            self.eta_pi, self.X, self.n_states
        )

        # Compute transition probabilities
        transition_probs = compute_transition_probs_with_covariates(
            self.eta_A, self.X, self.n_states
        )

        # Compute emission probabilities
        emission_probs = compute_emission_probs_with_covariates(
            self.eta_B, self.X, self.n_states, self.n_symbols
        )

        return initial_probs, transition_probs, emission_probs

    def _unpack_params(self, params: np.ndarray) -> None:
        """
        Load a flattened parameter vector into eta_pi, eta_A and eta_B.

        The vector layout is ``[eta_pi, eta_A, eta_B]``, each flattened in
        C order, i.e. the layout produced in ``fit``.

        Args:
            params: Flattened parameter vector
        """
        n_pi = self.n_covariates * self.n_states
        n_A = self.n_covariates * self.n_states * self.n_states

        # Work on a private copy so the stored coefficients never alias a
        # buffer that the optimizer may keep modifying.
        flat = np.array(params)

        self.eta_pi = flat[:n_pi].reshape(self.n_covariates, self.n_states)
        self.eta_A = flat[n_pi:n_pi + n_A].reshape(self.n_covariates, self.n_states, self.n_states)
        self.eta_B = flat[n_pi + n_A:].reshape(self.n_covariates, self.n_states, self.n_symbols)

    def _log_likelihood(self, params: np.ndarray) -> float:
        """
        Compute negative log-likelihood (for minimization).

        Side effect: the model coefficients (eta_pi, eta_A, eta_B) are
        overwritten with the values contained in ``params`` before the
        likelihood is evaluated via ``log_likelihood_nhmm``.

        Args:
            params: Flattened parameter vector

        Returns:
            float: Negative log-likelihood
        """
        self._unpack_params(params)

        # Compute log-likelihood using forward-backward algorithm
        from .forward_backward_nhmm import log_likelihood_nhmm
        log_lik = log_likelihood_nhmm(self)

        return -log_lik  # Return negative for minimization

    def fit(
        self,
        n_iter: int = 100,
        tol: float = 1e-4,
        verbose: bool = False
    ) -> 'NHMM':
        """
        Fit the NHMM model using numerical optimization.

        The negative log-likelihood is minimized with scipy's L-BFGS-B.
        Analytical gradients from ``gradients_nhmm`` are used when that module
        can be imported; otherwise scipy approximates the gradient numerically.

        After optimization the coefficients are set to the optimizer's
        solution and the log-likelihood is recomputed at that solution.

        Args:
            n_iter: Maximum number of optimization iterations
            tol: Convergence tolerance (passed as ``ftol``)
            verbose: Whether to print progress

        Returns:
            self: Returns self for method chaining
        """
        # Flatten parameters
        params = np.concatenate([
            self.eta_pi.flatten(),
            self.eta_A.flatten(),
            self.eta_B.flatten()
        ])

        options = {'maxiter': n_iter, 'ftol': tol, 'disp': verbose}

        # Optimize using scipy with analytical gradients if available
        try:
            from .gradients_nhmm import compute_gradient_nhmm

            def objective_with_grad(params):
                """Objective function with gradient."""
                neg_log_lik = self._log_likelihood(params)
                grad = -compute_gradient_nhmm(self)  # Negative because we minimize
                return neg_log_lik, grad

            # Use L-BFGS-B with analytical gradients
            result = minimize(
                objective_with_grad,
                params,
                method='L-BFGS-B',
                jac=True,  # Indicate that gradient is provided
                options=options
            )
        except ImportError:
            # Fall back to numerical gradients if analytical not available
            result = minimize(
                self._log_likelihood,
                params,
                method='L-BFGS-B',
                options=options
            )

        # The objective overwrites the coefficients on every evaluation, so
        # at this point they hold the *last evaluated* vector (possibly a
        # line-search trial or a finite-difference perturbation). Load the
        # optimizer's actual solution before reporting anything.
        self._unpack_params(result.x)

        # Store results
        self.n_iter = result.nit
        self.converged = result.success

        # Recompute log-likelihood using forward-backward for accuracy
        from .forward_backward_nhmm import log_likelihood_nhmm
        self.log_likelihood = log_likelihood_nhmm(self)

        if verbose:
            print(f"Optimization {'converged' if result.success else 'did not converge'}")
            print(f"Log-likelihood: {self.log_likelihood:.4f}")
            print(f"Iterations: {self.n_iter}")

        return self

    def __repr__(self) -> str:
        """String representation of the NHMM."""
        status = "fitted" if self.log_likelihood is not None else "unfitted"
        return (f"NHMM(n_states={self.n_states}, n_symbols={self.n_symbols}, "
                f"n_covariates={self.n_covariates}, n_sequences={self.n_sequences}, "
                f"status='{status}')")
@@ -0,0 +1,191 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : nhmm_utils.py
4
+ @Time : 2025-11-23 10:20
5
+ @Desc : Utility functions for Non-homogeneous HMM
6
+
7
+ This module provides utility functions for NHMM: a numerically stable Softmax
8
+ and helpers that map covariates to initial, transition, and emission probabilities.
9
+ """
10
+
11
+ import numpy as np
12
+ from typing import Optional, Tuple
13
+
14
+
15
def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """
    Compute the softmax of ``x`` along ``axis`` in a numerically stable way.

    Softmax maps real-valued scores to a probability distribution:
        softmax(x_i) = exp(x_i) / sum_j exp(x_j)

    The maximum along ``axis`` is subtracted before exponentiating. This
    leaves the result unchanged mathematically but keeps ``exp`` from
    overflowing for large scores:
        softmax(x_i) = exp(x_i - max(x)) / sum_j exp(x_j - max(x))

    Args:
        x: Input array of scores
        axis: Axis along which the probabilities are normalized

    Returns:
        numpy array: Probabilities with the same shape as ``x``, summing to 1
            along ``axis``
    """
    # Largest score per slice; keepdims lets it broadcast back against x
    peak = np.max(x, axis=axis, keepdims=True)
    weights = np.exp(x - peak)
    normalizer = np.sum(weights, axis=axis, keepdims=True)
    return weights / normalizer
36
+
37
+
38
def eta_to_gamma(eta: np.ndarray, n_categories: int) -> np.ndarray:
    """
    Turn linear predictors (eta) into probabilities (gamma) with Softmax.

    In NHMM the covariates act on linear predictors; Softmax then maps each
    group of ``n_categories`` predictors to probabilities that sum to 1.

    Args:
        eta: Linear predictor array of shape (..., n_categories)
        n_categories: Number of categories (e.g., number of states)

    Returns:
        numpy array: Probabilities with the same shape as ``eta``. When the
            last axis of ``eta`` has length ``n_categories``, each slice along
            that axis sums to 1.
    """
    # View eta as a 2-D table with one row per group of n_categories scores
    rows = eta.reshape(-1, n_categories)

    # Row-wise stable softmax: shift by the row maximum before exponentiating
    shifted = rows - np.max(rows, axis=1, keepdims=True)
    weights = np.exp(shifted)
    probs = weights / np.sum(weights, axis=1, keepdims=True)

    # Hand the result back in the caller's original layout
    return probs.reshape(eta.shape)
62
+
63
+
64
def compute_transition_probs_with_covariates(
    eta_A: np.ndarray,
    X: np.ndarray,
    n_states: int
) -> np.ndarray:
    """
    Compute transition probabilities from covariates using Softmax.

    For every sequence s and time point t:
        eta[s, t, i, j] = sum_c X[s, t, c] * eta_A[c, i, j]
        P[s, t, i, :]   = softmax(eta[s, t, i, :])

    The linear predictors for all sequences, time points and state pairs are
    computed in a single tensor contraction instead of nested Python loops.

    Args:
        eta_A: Coefficient matrix of shape (n_covariates, n_states, n_states)
            where eta_A[c, i, j] is the coefficient for covariate c,
            transition from state i to state j. If the state axes are larger
            than ``n_states``, only the leading n_states x n_states block is
            used.
        X: Covariate matrix of shape (n_sequences, n_timepoints, n_covariates)
        n_states: Number of hidden states

    Returns:
        numpy array: Transition probabilities of shape
            (n_sequences, n_timepoints, n_states, n_states); each row
            [s, t, i, :] sums to 1.

    Raises:
        ValueError: If X is not 3-dimensional, or if eta_A does not provide
            coefficients of shape (n_covariates, n_states, n_states).
    """
    X = np.asarray(X, dtype=float)
    eta_A = np.asarray(eta_A, dtype=float)

    if X.ndim != 3:
        raise ValueError(
            "X must have shape (n_sequences, n_timepoints, n_covariates); "
            f"got array with shape {X.shape}"
        )
    n_covariates = X.shape[2]

    # Restrict to the block of coefficients addressed by n_states
    coefficients = eta_A[:, :n_states, :n_states] if eta_A.ndim == 3 else eta_A
    if coefficients.shape != (n_covariates, n_states, n_states):
        raise ValueError(
            "eta_A must have shape (n_covariates, n_states, n_states) = "
            f"({n_covariates}, {n_states}, {n_states}); got {eta_A.shape}"
        )

    # Contract the covariate axis: (S, T, C) x (C, N, N) -> (S, T, N, N)
    eta = np.tensordot(X, coefficients, axes=([2], [0]))
    if eta.size == 0:
        # Nothing to normalize (no sequences, time points, or states)
        return eta

    # Row-wise stable softmax over the destination-state axis
    shifted = eta - eta.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=-1, keepdims=True)
109
+
110
+
111
def compute_emission_probs_with_covariates(
    eta_B: np.ndarray,
    X: np.ndarray,
    n_states: int,
    n_symbols: int
) -> np.ndarray:
    """
    Compute emission probabilities from covariates using Softmax.

    For every sequence s and time point t:
        eta[s, t, i, k] = sum_c X[s, t, c] * eta_B[c, i, k]
        P[s, t, i, :]   = softmax(eta[s, t, i, :])

    The linear predictors for all sequences, time points, states and symbols
    are computed in a single tensor contraction instead of nested Python
    loops.

    Args:
        eta_B: Coefficient matrix of shape (n_covariates, n_states, n_symbols).
            If the state/symbol axes are larger than requested, only the
            leading n_states x n_symbols block is used.
        X: Covariate matrix of shape (n_sequences, n_timepoints, n_covariates)
        n_states: Number of hidden states
        n_symbols: Number of observed symbols

    Returns:
        numpy array: Emission probabilities of shape
            (n_sequences, n_timepoints, n_states, n_symbols); each row
            [s, t, i, :] sums to 1.

    Raises:
        ValueError: If X is not 3-dimensional, or if eta_B does not provide
            coefficients of shape (n_covariates, n_states, n_symbols).
    """
    X = np.asarray(X, dtype=float)
    eta_B = np.asarray(eta_B, dtype=float)

    if X.ndim != 3:
        raise ValueError(
            "X must have shape (n_sequences, n_timepoints, n_covariates); "
            f"got array with shape {X.shape}"
        )
    n_covariates = X.shape[2]

    # Restrict to the block of coefficients addressed by n_states / n_symbols
    coefficients = eta_B[:, :n_states, :n_symbols] if eta_B.ndim == 3 else eta_B
    if coefficients.shape != (n_covariates, n_states, n_symbols):
        raise ValueError(
            "eta_B must have shape (n_covariates, n_states, n_symbols) = "
            f"({n_covariates}, {n_states}, {n_symbols}); got {eta_B.shape}"
        )

    # Contract the covariate axis: (S, T, C) x (C, N, M) -> (S, T, N, M)
    eta = np.tensordot(X, coefficients, axes=([2], [0]))
    if eta.size == 0:
        # Nothing to normalize (no sequences, time points, states, or symbols)
        return eta

    # Row-wise stable softmax over the symbol axis
    shifted = eta - eta.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=-1, keepdims=True)
154
+
155
+
156
def compute_initial_probs_with_covariates(
    eta_pi: np.ndarray,
    X: np.ndarray,
    n_states: int
) -> np.ndarray:
    """
    Compute initial state probabilities from covariates using Softmax.

    For every sequence s:
        eta[s, i] = sum_c X[s, 0, c] * eta_pi[c, i]
        pi[s, :]  = softmax(eta[s, :])

    The linear predictors for all sequences are computed with one matrix
    product instead of per-sequence Python loops.

    Args:
        eta_pi: Coefficient matrix of shape (n_covariates, n_states). If it
            has more than ``n_states`` columns, only the leading ones are used.
        X: Covariate matrix of shape (n_sequences, n_timepoints, n_covariates).
            Only the first time point (index 0) is used, so passing an array
            of shape (n_sequences, 1, n_covariates) is sufficient.
        n_states: Number of hidden states

    Returns:
        numpy array: Initial probabilities of shape (n_sequences, n_states);
            each row sums to 1.

    Raises:
        ValueError: If X is not 3-dimensional, or if eta_pi does not provide
            coefficients of shape (n_covariates, n_states).
        IndexError: If X contains sequences but no time points.
    """
    X = np.asarray(X, dtype=float)
    eta_pi = np.asarray(eta_pi, dtype=float)

    if X.ndim != 3:
        raise ValueError(
            "X must have shape (n_sequences, n_timepoints, n_covariates); "
            f"got array with shape {X.shape}"
        )
    if X.shape[0] == 0:
        # No sequences: return an empty table with the expected columns
        return np.zeros((0, n_states))

    n_covariates = X.shape[2]
    coefficients = eta_pi[:, :n_states] if eta_pi.ndim == 2 else eta_pi
    if coefficients.shape != (n_covariates, n_states):
        raise ValueError(
            "eta_pi must have shape (n_covariates, n_states) = "
            f"({n_covariates}, {n_states}); got {eta_pi.shape}"
        )

    # Covariates at the initial time point: (S, C) @ (C, N) -> (S, N)
    eta = X[:, 0, :] @ coefficients
    if eta.size == 0:
        # n_states == 0: nothing to normalize
        return eta

    # Row-wise stable softmax over the state axis
    shifted = eta - eta.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=-1, keepdims=True)
@@ -0,0 +1,137 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : predict.py
4
+ @Time : 2025-11-13 17:05
5
+ @Desc : Prediction and inference functions for HMM models
6
+
7
+ This module provides functions for predicting hidden states and computing
8
+ posterior probabilities, similar to seqHMM's predict() and posterior_probs()
9
+ functions in R.
10
+ """
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ from typing import Optional, List
15
+ from sequenzo.define_sequence_data import SequenceData
16
+ from .hmm import HMM
17
+ from .utils import sequence_data_to_hmmlearn_format
18
+
19
+
20
def predict(
    model: HMM,
    newdata: Optional[SequenceData] = None
) -> np.ndarray:
    """
    Predict the most likely hidden state sequence using Viterbi algorithm.

    This is a thin functional wrapper: after checking that the model has been
    fitted, it delegates to ``model.predict(newdata)``.

    It is similar to seqHMM's predict() function in R.

    Args:
        model: Fitted HMM model object
        newdata: Optional SequenceData to predict. It is passed to
            ``model.predict`` unchanged; when None, the model is expected to
            use the data it was fitted on.

    Returns:
        numpy array: Predicted hidden states for each time point in each sequence.
            The array is flattened (all sequences concatenated).

    Raises:
        ValueError: If ``model.log_likelihood`` is None, i.e. the model has
            not been fitted yet.

    Examples:
        >>> from sequenzo import SequenceData, load_dataset
        >>> from sequenzo.seqhmm import build_hmm, fit_model, predict
        >>>
        >>> # Load and prepare data
        >>> df = load_dataset('mvad')
        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
        >>>
        >>> # Build and fit model
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm = fit_model(hmm)
        >>>
        >>> # Predict hidden states
        >>> predicted_states = predict(hmm)
        >>> print(f"Predicted {len(predicted_states)} hidden states")
    """
    # A stored log-likelihood is the marker that fitting has happened
    if model.log_likelihood is not None:
        return model.predict(newdata)

    raise ValueError("Model must be fitted before prediction. Use fit_model() first.")
61
+
62
+
63
def posterior_probs(
    model: HMM,
    newdata: Optional[SequenceData] = None
) -> pd.DataFrame:
    """
    Compute posterior probabilities of hidden states.

    This function computes the probability of each hidden state at each time point,
    given the observed sequence. The probabilities come from
    ``model.predict_proba`` and are reshaped here into a long-format table.

    It is similar to seqHMM's posterior_probs() function in R.

    Args:
        model: Fitted HMM model object
        newdata: Optional SequenceData. If None, uses ``model.observations``
            (the data the model was fitted on).

    Returns:
        pandas DataFrame: One row per (sequence, time point, state) with columns:
            - id: Sequence identifier (0-based position in the data)
            - time: Time point within the sequence (1-based, as in R)
            - state: Hidden state index
            - probability: Posterior probability of being in this state at this time
            The columns are present even when there are no observations.

    Raises:
        ValueError: If the model has not been fitted, or if the probability
            matrix returned by the model has fewer rows/columns than the
            sequence lengths and ``model.n_states`` require.

    Examples:
        >>> from sequenzo import SequenceData, load_dataset
        >>> from sequenzo.seqhmm import build_hmm, fit_model, posterior_probs
        >>>
        >>> # Load and prepare data
        >>> df = load_dataset('mvad')
        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
        >>>
        >>> # Build and fit model
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm = fit_model(hmm)
        >>>
        >>> # Get posterior probabilities
        >>> posteriors = posterior_probs(hmm)
        >>> print(posteriors.head())
        >>>
        >>> # Find most probable state at each time point
        >>> # (idxmax gives row labels; look the states up from those rows)
        >>> best_rows = posteriors.groupby(['id', 'time'])['probability'].idxmax()
        >>> most_probable = posteriors.loc[best_rows, ['id', 'time', 'state']]
    """
    if model.log_likelihood is None:
        raise ValueError("Model must be fitted before computing posterior probabilities. Use fit_model() first.")

    # Get sequences to use
    sequences = newdata if newdata is not None else model.observations

    # Posterior matrix: rows are the observations of all sequences stacked in
    # order, columns are hidden states
    proba = np.asarray(model.predict_proba(sequences))

    # Only the per-sequence lengths are needed to split the stacked rows
    _, lengths = sequence_data_to_hmmlearn_format(sequences)
    lengths = np.asarray(lengths, dtype=np.int64).ravel()

    n_states = model.n_states
    n_obs = int(lengths.sum())
    if proba.ndim != 2 or proba.shape[0] < n_obs or proba.shape[1] < n_states:
        raise ValueError(
            f"Posterior matrix with shape {proba.shape} does not cover "
            f"{n_obs} observations and {n_states} states."
        )

    # For every stacked observation: which sequence it belongs to, and its
    # 1-based position inside that sequence
    starts = np.cumsum(lengths) - lengths
    seq_ids = np.repeat(np.arange(lengths.size, dtype=np.int64), lengths)
    times = np.arange(n_obs, dtype=np.int64) - np.repeat(starts, lengths) + 1

    # Expand to long format: each observation contributes n_states rows,
    # ordered by state index (row-major flattening of the posterior matrix)
    return pd.DataFrame({
        'id': np.repeat(seq_ids, n_states),
        'time': np.repeat(times, n_states),
        'state': np.tile(np.arange(n_states, dtype=np.int64), n_obs),
        'probability': proba[:n_obs, :n_states].reshape(-1),
    })