sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,878 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : simulate.py
4
+ @Time : 2025-10-12 10:47
5
+ @Desc : Model simulation functions for HMM models
6
+
7
+ This module provides functions for simulating sequences from HMM models,
8
+ similar to seqHMM's simulate_hmm() and simulate_mhmm() functions in R.
9
+ """
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ from typing import Optional, List, Union, Dict
14
+ from sequenzo.define_sequence_data import SequenceData
15
+ from .hmm import HMM
16
+ from .mhmm import MHMM
17
+ from .formulas import create_model_matrix_time_constant
18
+
19
+
20
def simulate_hmm(
    n_sequences: int,
    initial_probs: np.ndarray,
    transition_probs: np.ndarray,
    emission_probs: np.ndarray,
    sequence_length: int,
    alphabet: Optional[List[str]] = None,
    state_names: Optional[List[str]] = None,
    random_state: Optional[int] = None
) -> dict:
    """
    Simulate sequences from a Hidden Markov Model.

    For every sequence the hidden state at the first time point is drawn from
    ``initial_probs``; each later hidden state is drawn from the row of
    ``transition_probs`` belonging to the previous state. At every time point
    an observed symbol is drawn from the row of ``emission_probs`` belonging
    to the current hidden state.

    Args:
        n_sequences: Number of sequences to simulate.
        initial_probs: Initial state probabilities, shape (n_states,).
            Any array-like accepted by ``np.asarray`` may be passed.
        transition_probs: Transition probability matrix,
            shape (n_states, n_states). Array-likes are accepted.
        emission_probs: Emission probability matrix,
            shape (n_states, n_symbols). Array-likes are accepted.
        sequence_length: Length of each simulated sequence. ``0`` yields
            empty sequences; negative values are rejected.
        alphabet: Optional list of observed state symbols. If None, the
            strings "0", "1", ... are used.
        state_names: Optional list of hidden state names. If None, the
            strings "0", "1", ... are used.
        random_state: Seed passed to ``np.random.RandomState`` for
            reproducibility.

    Returns:
        dict: Dictionary with keys:
            - 'observations': List of observed sequences (as lists)
            - 'states': List of hidden state sequences (as lists)
            - 'observations_df': DataFrame with one row per sequence and
              columns 'time_1' ... 'time_<sequence_length>'
            - 'alphabet': The observed symbols that were used
            - 'state_names': The hidden state names that were used

    Raises:
        ValueError: If ``sequence_length`` is negative, if ``initial_probs``
            is not 1-D, if ``emission_probs`` is not 2-D, or if the shapes of
            ``transition_probs`` / ``emission_probs`` do not agree with the
            number of states.

    Examples:
        >>> from sequenzo.seqhmm import simulate_hmm
        >>> import numpy as np
        >>>
        >>> # Define HMM parameters
        >>> initial_probs = np.array([0.5, 0.5])
        >>> transition_probs = np.array([[0.7, 0.3], [0.3, 0.7]])
        >>> emission_probs = np.array([[0.9, 0.1], [0.1, 0.9]])
        >>>
        >>> # Simulate 10 sequences of length 20
        >>> sim = simulate_hmm(
        ...     n_sequences=10,
        ...     initial_probs=initial_probs,
        ...     transition_probs=transition_probs,
        ...     emission_probs=emission_probs,
        ...     sequence_length=20,
        ...     alphabet=['A', 'B'],
        ...     random_state=42
        ... )
        >>>
        >>> print(f"Simulated {len(sim['observations'])} sequences")
        >>> print(f"First sequence: {sim['observations'][0]}")
    """
    if sequence_length < 0:
        raise ValueError(
            f"sequence_length must be non-negative, got {sequence_length}"
        )

    rng = np.random.RandomState(random_state)

    # Coerce once so that nested lists work as well as ndarrays. The sampler
    # converts probabilities to double anyway, so draws are unaffected.
    initial_probs = np.asarray(initial_probs, dtype=float)
    transition_probs = np.asarray(transition_probs, dtype=float)
    emission_probs = np.asarray(emission_probs, dtype=float)

    if initial_probs.ndim != 1:
        raise ValueError(
            f"initial_probs must be 1-dimensional, got shape {initial_probs.shape}"
        )
    if emission_probs.ndim != 2:
        raise ValueError(
            f"emission_probs must be 2-dimensional, got shape {emission_probs.shape}"
        )

    n_states = initial_probs.shape[0]
    n_symbols = emission_probs.shape[1]

    # Validate dimensions
    if transition_probs.shape != (n_states, n_states):
        raise ValueError(
            f"transition_probs shape ({transition_probs.shape}) must be ({n_states}, {n_states})"
        )
    if emission_probs.shape != (n_states, n_symbols):
        raise ValueError(
            f"emission_probs shape ({emission_probs.shape}) must be ({n_states}, {n_symbols})"
        )

    # Set default names
    if alphabet is None:
        alphabet = [str(i) for i in range(n_symbols)]
    if state_names is None:
        state_names = [str(i) for i in range(n_states)]

    states = []
    observations = []

    for _ in range(n_sequences):
        seq_states = []
        seq_obs = []
        current_state = None

        # Draw order per time point is (state, observation), starting with the
        # initial distribution. This keeps seeded output identical to the
        # previous implementation while letting sequence_length == 0 produce
        # genuinely empty sequences.
        for t in range(sequence_length):
            if t == 0:
                state_dist = initial_probs
            else:
                state_dist = transition_probs[current_state, :]
            current_state = rng.choice(n_states, p=state_dist)
            seq_states.append(state_names[current_state])

            obs_idx = rng.choice(n_symbols, p=emission_probs[current_state, :])
            seq_obs.append(alphabet[obs_idx])

        states.append(seq_states)
        observations.append(seq_obs)

    # Wide format for easy conversion to SequenceData: one row per sequence,
    # one column per time point. The explicit index keeps one row per sequence
    # even when there are no time columns.
    obs_dict = {
        f'time_{t+1}': [obs[t] for obs in observations]
        for t in range(sequence_length)
    }
    observations_df = pd.DataFrame(obs_dict, index=pd.RangeIndex(n_sequences))

    return {
        'observations': observations,
        'states': states,
        'observations_df': observations_df,
        'alphabet': alphabet,
        'state_names': state_names
    }
142
+
143
+
144
def compute_mixture_probs_from_covariates(
    X: np.ndarray,
    coefficients: np.ndarray
) -> np.ndarray:
    """
    Turn covariates and coefficients into per-sequence cluster probabilities.

    The linear predictors ``X @ coefficients`` are passed through a row-wise
    softmax (multinomial logit link):

        P(cluster k | x) = exp(x @ coefficients[:, k]) / sum_j exp(x @ coefficients[:, j])

    By convention the first column of ``coefficients`` holds zeros so that
    the first cluster acts as the reference category. This function does not
    modify or check that column; the caller supplies it.

    Args:
        X: Model matrix of shape (n_sequences, n_covariates), including the
            intercept column.
        coefficients: Coefficient matrix of shape (n_covariates, n_clusters).

    Returns:
        numpy array: Probabilities of shape (n_sequences, n_clusters); every
        row sums to 1.

    Examples:
        >>> import numpy as np
        >>> from sequenzo.seqhmm.simulate import compute_mixture_probs_from_covariates
        >>>
        >>> # Intercept + 1 covariate for 3 sequences
        >>> X = np.array([[1, 0.5], [1, 1.0], [1, 1.5]])
        >>>
        >>> # 2 covariates x 3 clusters, reference cluster in the first column
        >>> coefs = np.array([
        ...     [0, -1.5, 0.5],
        ...     [0, 3.0, -0.7]
        ... ])
        >>>
        >>> probs = compute_mixture_probs_from_covariates(X, coefs)
        >>> print(probs.shape)        # (3, 3)
        >>> print(probs.sum(axis=1))  # [1. 1. 1.]
    """
    # One row of logits per sequence, one column per cluster.
    logits = X @ coefficients

    # Shifting each row by its maximum leaves the softmax unchanged while
    # keeping the exponentials from overflowing.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    unnormalized = np.exp(shifted)

    # Divide by the row totals so every row is a probability vector.
    row_totals = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / row_totals
198
+
199
+
200
def simulate_mhmm(
    n_sequences: int,
    n_clusters: int,
    initial_probs: List[np.ndarray],
    transition_probs: List[np.ndarray],
    emission_probs: List[np.ndarray],
    cluster_probs: Optional[np.ndarray] = None,
    sequence_length: Optional[int] = None,
    alphabet: Optional[List[str]] = None,
    state_names: Optional[List[List[str]]] = None,
    cluster_names: Optional[List[str]] = None,
    formula: Optional[Union[str, None]] = None,
    data: Optional[pd.DataFrame] = None,
    coefficients: Optional[np.ndarray] = None,
    random_state: Optional[int] = None
) -> dict:
    """
    Simulate sequences from a Mixture Hidden Markov Model.

    Each sequence is first assigned to a cluster, then simulated from that
    cluster's HMM. It is similar to seqHMM's simulate_mhmm() function in R.

    Cluster assignments can be done in two ways:
    1. Using fixed cluster probabilities (cluster_probs parameter)
    2. Using formula-based covariates (formula, data, coefficients parameters)

    When using formula-based covariates, mixture probabilities are computed
    using multinomial logit (softmax) from covariates and coefficients.

    Args:
        n_sequences: Number of sequences to simulate (0 yields empty outputs)
        n_clusters: Number of clusters
        initial_probs: List of initial state probabilities, one per cluster
        transition_probs: List of transition matrices, one per cluster
        emission_probs: List of emission matrices, one per cluster. The number
            of observed symbols is taken from the first cluster's matrix.
        cluster_probs: Optional fixed cluster probabilities (n_clusters,).
            Either cluster_probs OR (formula + data + coefficients) must be provided.
        sequence_length: Length of each simulated sequence (required, >= 1)
        alphabet: Optional list of observed state symbols. Defaults to
            "0", "1", ... Must contain at least one entry per emission column.
        state_names: Optional list of state name lists, one per cluster.
            Defaults to "0", "1", ... per cluster.
        cluster_names: Optional names for clusters. Defaults to
            "Cluster 1", "Cluster 2", ...
        formula: Optional formula string (e.g., "~ covariate_1 + covariate_2")
            for time-constant covariates. If provided, data and coefficients must also be provided.
        data: Optional DataFrame containing covariates (one row per sequence).
            Required if formula is provided.
        coefficients: Optional coefficient matrix of shape (n_covariates, n_clusters)
            for formula-based covariates. The first column is forced to zero
            (reference category) on an internal copy; the caller's array is not modified.
            Required if formula is provided.
        random_state: Random seed for reproducibility

    Returns:
        dict: Dictionary with keys:
            - 'observations': List of observed sequences
            - 'states': List of hidden state sequences
            - 'clusters': List of cluster assignments (cluster names)
            - 'observations_df': DataFrame with columns time_1..time_T and 'cluster'
            - 'alphabet': Observed symbols actually used
            - 'state_names': State names actually used, one list per cluster
            - 'cluster_names': Cluster names actually used

    Raises:
        ValueError: If sequence_length is missing or < 1, a count is invalid,
            a per-cluster parameter list has the wrong length, both or neither
            of cluster_probs / formula are given, formula inputs are missing or
            mis-shaped, cluster_probs does not sum to 1, or alphabet /
            state_names / cluster_names are too short for the model.

    Examples:
        >>> from sequenzo.seqhmm import simulate_mhmm
        >>> import numpy as np
        >>> import pandas as pd
        >>>
        >>> # Method 1: Fixed cluster probabilities
        >>> initial_probs = [np.array([0.5, 0.5]), np.array([0.3, 0.7])]
        >>> transition_probs = [
        ...     np.array([[0.7, 0.3], [0.3, 0.7]]),
        ...     np.array([[0.8, 0.2], [0.2, 0.8]])
        ... ]
        >>> emission_probs = [
        ...     np.array([[0.9, 0.1], [0.1, 0.9]]),
        ...     np.array([[0.7, 0.3], [0.3, 0.7]])
        ... ]
        >>> cluster_probs = np.array([0.6, 0.4])
        >>>
        >>> sim = simulate_mhmm(
        ...     n_sequences=10,
        ...     n_clusters=2,
        ...     initial_probs=initial_probs,
        ...     transition_probs=transition_probs,
        ...     emission_probs=emission_probs,
        ...     cluster_probs=cluster_probs,
        ...     sequence_length=20,
        ...     alphabet=['A', 'B'],
        ...     random_state=42
        ... )
        >>>
        >>> # Method 2: Formula-based covariates
        >>> data = pd.DataFrame({
        ...     'covariate_1': np.random.rand(30),
        ...     'covariate_2': np.random.choice(['A', 'B'], size=30)
        ... })
        >>>
        >>> # Coefficients: (intercept + 2 covariates) x (2 clusters)
        >>> # First column is zeros (reference), second column has effects
        >>> coefs = np.array([
        ...     [0, -1.5],  # intercepts
        ...     [0, 3.0],   # covariate_1 effect
        ...     [0, -0.7]   # covariate_2_B effect (dummy for 'B')
        ... ])
        >>>
        >>> sim = simulate_mhmm(
        ...     n_sequences=30,
        ...     n_clusters=2,
        ...     initial_probs=initial_probs,
        ...     transition_probs=transition_probs,
        ...     emission_probs=emission_probs,
        ...     sequence_length=20,
        ...     formula="~ covariate_1 + covariate_2",
        ...     data=data,
        ...     coefficients=coefs,
        ...     alphabet=['A', 'B'],
        ...     random_state=42
        ... )
    """
    rng = np.random.RandomState(random_state)

    # --- Basic argument validation -------------------------------------
    if sequence_length is None:
        raise ValueError("sequence_length must be provided")
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be at least 1, got {sequence_length}")
    if n_sequences < 0:
        raise ValueError(f"n_sequences must be non-negative, got {n_sequences}")
    if n_clusters < 1:
        raise ValueError(f"n_clusters must be at least 1, got {n_clusters}")

    for label, params in (
        ("initial_probs", initial_probs),
        ("transition_probs", transition_probs),
        ("emission_probs", emission_probs),
    ):
        if len(params) != n_clusters:
            raise ValueError(
                f"{label} length ({len(params)}) must equal n_clusters ({n_clusters})"
            )

    # --- Exactly one source of cluster probabilities -------------------
    use_formula = formula is not None
    use_fixed_probs = cluster_probs is not None

    if use_formula and use_fixed_probs:
        raise ValueError(
            "Cannot specify both cluster_probs and formula-based covariates. "
            "Use either cluster_probs OR (formula + data + coefficients)."
        )
    if not use_formula and not use_fixed_probs:
        raise ValueError(
            "Must specify either cluster_probs OR (formula + data + coefficients) "
            "for cluster assignment probabilities."
        )

    if use_formula:
        if data is None:
            raise ValueError("If formula is provided, data must also be provided")
        if coefficients is None:
            raise ValueError("If formula is provided, coefficients must also be provided")

        # Model matrix X of shape (n_sequences, n_covariates).
        X = create_model_matrix_time_constant(formula, data, n_sequences)
        n_covariates = X.shape[1]

        # np.array(..., dtype=float) accepts nested lists / integer arrays and
        # always copies, so the caller's coefficients are never modified.
        coefficients = np.array(coefficients, dtype=float)
        if coefficients.shape != (n_covariates, n_clusters):
            raise ValueError(
                f"coefficients shape ({coefficients.shape}) must be "
                f"(n_covariates, n_clusters) = ({n_covariates}, {n_clusters}). "
                f"Note: n_covariates includes intercept and any dummy variables from categorical covariates."
            )

        # The first cluster is the reference category: its coefficients are zero.
        coefficients[:, 0] = 0.0

        # (n_sequences, n_clusters): one probability row per sequence.
        mixture_probs = compute_mixture_probs_from_covariates(X, coefficients)
    else:
        cluster_probs = np.asarray(cluster_probs, dtype=float)
        if len(cluster_probs) != n_clusters:
            raise ValueError(
                f"cluster_probs length ({len(cluster_probs)}) must equal n_clusters ({n_clusters})"
            )
        if not np.isclose(np.sum(cluster_probs), 1.0):
            raise ValueError(f"cluster_probs must sum to 1.0, but sum is {np.sum(cluster_probs)}")

        # Same probabilities for every sequence, broadcast to (n_sequences, n_clusters).
        mixture_probs = np.tile(cluster_probs, (n_sequences, 1))

    # --- Names: fill defaults, and validate supplied ones up front ------
    # Without these checks a too-short name list only fails with an
    # IndexError when the missing symbol/state/cluster happens to be drawn.
    n_symbols = emission_probs[0].shape[1]
    if alphabet is None:
        alphabet = [str(i) for i in range(n_symbols)]
    elif len(alphabet) < n_symbols:
        raise ValueError(
            f"alphabet has {len(alphabet)} symbols but emission_probs "
            f"defines {n_symbols} observed symbols"
        )

    if cluster_names is None:
        cluster_names = [f"Cluster {i+1}" for i in range(n_clusters)]
    elif len(cluster_names) < n_clusters:
        raise ValueError(
            f"cluster_names length ({len(cluster_names)}) must be at least "
            f"n_clusters ({n_clusters})"
        )

    if state_names is None:
        state_names = [
            [str(i) for i in range(len(initial_probs[k]))]
            for k in range(n_clusters)
        ]
    else:
        if len(state_names) < n_clusters:
            raise ValueError(
                f"state_names length ({len(state_names)}) must be at least "
                f"n_clusters ({n_clusters})"
            )
        for k in range(n_clusters):
            if len(state_names[k]) < len(initial_probs[k]):
                raise ValueError(
                    f"state_names[{k}] has {len(state_names[k])} names but cluster "
                    f"{k} has {len(initial_probs[k])} hidden states"
                )

    # --- Simulation ------------------------------------------------------
    observations = []
    states = []
    clusters = []

    for seq_idx in range(n_sequences):
        # Draw the cluster from this sequence's own probability row.
        cluster_idx = rng.choice(n_clusters, p=mixture_probs[seq_idx, :])
        clusters.append(cluster_names[cluster_idx])

        pi_k = initial_probs[cluster_idx]
        trans_k = transition_probs[cluster_idx]
        emis_k = emission_probs[cluster_idx]
        names_k = state_names[cluster_idx]
        n_states_k = len(pi_k)

        # First time point: initial state, then its emission.
        current_state = rng.choice(n_states_k, p=pi_k)
        seq_states = [names_k[current_state]]
        obs_idx = rng.choice(n_symbols, p=emis_k[current_state, :])
        seq_obs = [alphabet[obs_idx]]

        # Remaining time points: transition, then emission.
        for _ in range(1, sequence_length):
            current_state = rng.choice(n_states_k, p=trans_k[current_state, :])
            seq_states.append(names_k[current_state])
            obs_idx = rng.choice(n_symbols, p=emis_k[current_state, :])
            seq_obs.append(alphabet[obs_idx])

        states.append(seq_states)
        observations.append(seq_obs)

    # --- Wide DataFrame: one column per time point plus the cluster ------
    observations_df = pd.DataFrame({
        f'time_{t+1}': [obs[t] for obs in observations]
        for t in range(sequence_length)
    })
    observations_df['cluster'] = clusters

    return {
        'observations': observations,
        'states': states,
        'clusters': clusters,
        'observations_df': observations_df,
        'alphabet': alphabet,
        'state_names': state_names,
        'cluster_names': cluster_names
    }
466
+
467
+
468
def simulate_nhmm(
    n_states: int,
    emission_formula: Union[str, None],
    data: pd.DataFrame,
    id_var: str,
    time_var: str,
    initial_formula: Union[str, None] = None,
    transition_formula: Union[str, None] = None,
    coefs: Optional[Dict[str, np.ndarray]] = None,
    init_sd: Optional[float] = None,
    random_state: Optional[int] = None
) -> dict:
    """
    Simulate sequences from a Non-homogeneous Hidden Markov Model.

    This function generates sequences of observed and hidden states given the parameters
    of a non-homogeneous hidden Markov model. In an NHMM, transition and emission
    probabilities can vary over time or with covariates.

    It is similar to seqHMM's simulate_nhmm() function in R.

    Args:
        n_states: Number of hidden states (must be > 1)
        emission_formula: Formula string for emission probabilities
            (e.g., "response ~ x1 + x2"). Must be a string containing '~'.
            The left-hand side names the response variable; if several are
            joined with '+', only the first is used. If the left-hand side
            is empty, the first categorical/object column other than id_var
            and time_var is used as the response.
        data: DataFrame containing the variables used in model formulas.
            Must include the response variable to define the observed
            symbols and sequence lengths. The values of the response variable
            are replaced by simulated values in the returned copy.
        id_var: Name of the ID variable in data identifying different sequences
        time_var: Name of the time index variable in data
        initial_formula: Optional formula string for initial state probabilities.
            Default is "~ 1" (intercept only). A string without '~' is also
            treated as intercept only.
        transition_formula: Optional formula string for transition probabilities.
            Default is "~ 1" (intercept only). A string without '~' is also
            treated as intercept only.
        coefs: Optional dictionary with keys 'initial_probs', 'transition_probs', 'emission_probs'
            containing coefficient matrices (etas). Missing entries (or coefs=None)
            are generated randomly as standard normal draws scaled by init_sd.
        init_sd: Standard deviation for random coefficient generation.
            Default is 2.0 when coefs is None, 0.0 otherwise.
        random_state: Random seed for reproducibility

    Returns:
        dict: Dictionary with keys:
            - 'observations': List of observed sequences (as lists)
            - 'states': List of hidden state sequences (as lists)
            - 'data': Copy of data with the response variable replaced by simulated values
            - 'states_df': DataFrame with columns id_var, time_var and 'state'
            - 'model': Dictionary containing model information (coefficients, etc.)

    Raises:
        ValueError: If n_states < 2, emission_formula is missing / not a string /
            lacks '~', id_var or time_var or the response variable is not a
            column of data, no response variable can be determined, the
            response has no observed values, or data contains no sequences.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> from sequenzo.seqhmm import simulate_nhmm
        >>>
        >>> # Create data with covariates and response variable
        >>> n_sequences = 10
        >>> sequence_length = 20
        >>> data = pd.DataFrame({
        ...     'id': np.repeat(range(1, n_sequences + 1), sequence_length),
        ...     'time': np.tile(range(1, sequence_length + 1), n_sequences),
        ...     'age': np.repeat(np.random.randint(20, 60, n_sequences), sequence_length),
        ...     'response': np.repeat(['A', 'B', 'C'], n_sequences * sequence_length // 3 + 1)[:n_sequences * sequence_length]
        ... })
        >>>
        >>> # Simulate NHMM with time-varying probabilities
        >>> sim = simulate_nhmm(
        ...     n_states=3,
        ...     emission_formula="response ~ age",
        ...     data=data,
        ...     id_var='id',
        ...     time_var='time',
        ...     initial_formula="~ age",
        ...     transition_formula="~ age",
        ...     random_state=42
        ... )
        >>>
        >>> print(f"Simulated {len(sim['observations'])} sequences")
        >>> print(f"First sequence: {sim['observations'][0]}")
    """
    rng = np.random.RandomState(random_state)

    def rhs_of(formula):
        # Text after the first '~'; anything without a '~' means intercept only.
        if isinstance(formula, str) and '~' in formula:
            return formula.split('~')[1].strip()
        return "1"

    # Step 1: Validate inputs
    if n_states < 2:
        raise ValueError(f"n_states must be at least 2, got {n_states}")
    if emission_formula is None:
        raise ValueError("emission_formula is required")
    if id_var not in data.columns:
        raise ValueError(f"id_var '{id_var}' not found in data columns: {list(data.columns)}")
    if time_var not in data.columns:
        raise ValueError(f"time_var '{time_var}' not found in data columns: {list(data.columns)}")
    if not isinstance(emission_formula, str):
        raise ValueError("emission_formula must be a string")
    if '~' not in emission_formula:
        raise ValueError("emission_formula must contain '~' separator")

    # Step 2: Determine the response variable from the formula's left-hand side
    response_part = emission_formula.split('~')[0].strip()
    if response_part:
        response_vars = [v.strip() for v in response_part.split('+')]
    else:
        # No response given: fall back to the first categorical/object column.
        # isinstance(..., CategoricalDtype) replaces the deprecated
        # pd.api.types.is_categorical_dtype.
        cat_cols = [
            col for col in data.columns
            if col not in [id_var, time_var] and (
                isinstance(data[col].dtype, pd.CategoricalDtype)
                or pd.api.types.is_object_dtype(data[col])
            )
        ]
        if not cat_cols:
            raise ValueError("Could not determine response variable. Please specify in emission_formula (e.g., 'response ~ x1')")
        response_vars = [cat_cols[0]]

    response_var = response_vars[0]
    if response_var not in data.columns:
        raise ValueError(f"Response variable '{response_var}' not found in data columns")

    # The alphabet is the sorted set of non-missing response values.
    alphabet = sorted(data[response_var].dropna().unique().tolist())
    n_symbols = len(alphabet)
    if n_symbols == 0:
        raise ValueError(
            f"Response variable '{response_var}' has no observed (non-missing) "
            f"values, so the alphabet cannot be determined"
        )

    # Step 3: Sequence ids and lengths (number of non-missing time values per id).
    # groupby sorts the ids, which matches the sorted id order used by
    # _create_model_matrix_from_data.
    length_by_id = data.groupby(id_var)[time_var].count()
    unique_ids = length_by_id.index.values
    sequence_lengths = length_by_id.values
    n_sequences = len(unique_ids)
    if n_sequences == 0:
        raise ValueError("data contains no sequences to simulate")
    max_length = int(sequence_lengths.max())

    # Step 4: Right-hand sides of the three formulas and their model matrices,
    # each of shape (n_sequences, max_length, n_covariates).
    emission_rhs = rhs_of(emission_formula)
    initial_rhs = rhs_of(initial_formula)
    transition_rhs = rhs_of(transition_formula)

    X_pi = _create_model_matrix_from_data(
        initial_rhs, data, id_var, time_var, n_sequences, max_length, rng
    )
    X_A = _create_model_matrix_from_data(
        transition_rhs, data, id_var, time_var, n_sequences, max_length, rng
    )
    X_B = _create_model_matrix_from_data(
        emission_rhs, data, id_var, time_var, n_sequences, max_length, rng
    )

    # Step 5: Generate or use provided coefficients
    n_covariates_pi = X_pi.shape[2]
    n_covariates_A = X_A.shape[2]
    n_covariates_B = X_B.shape[2]

    if init_sd is None:
        init_sd = 2.0 if coefs is None else 0.0

    provided = {} if coefs is None else coefs
    eta_pi = provided.get('initial_probs')
    eta_A = provided.get('transition_probs')
    eta_B = provided.get('emission_probs')

    # Draw any missing coefficient array, always in the order pi, A, B so the
    # random stream is reproducible for a given random_state.
    if eta_pi is None:
        eta_pi = rng.randn(n_covariates_pi, n_states) * init_sd
    if eta_A is None:
        eta_A = rng.randn(n_covariates_A, n_states, n_states) * init_sd
    if eta_B is None:
        eta_B = rng.randn(n_covariates_B, n_states, n_symbols) * init_sd

    # Step 6: Turn coefficients into probabilities
    from .nhmm_utils import (
        compute_initial_probs_with_covariates,
        compute_transition_probs_with_covariates,
        compute_emission_probs_with_covariates
    )

    # Initial probabilities use only the covariates of the first time point.
    X_pi_first = X_pi[:, 0:1, :]
    initial_probs = compute_initial_probs_with_covariates(eta_pi, X_pi_first, n_states)
    transition_probs = compute_transition_probs_with_covariates(eta_A, X_A, n_states)
    emission_probs = compute_emission_probs_with_covariates(eta_B, X_B, n_states, n_symbols)
    # Indexed below as initial_probs[seq, state],
    # transition_probs[seq, t, from_state, to_state] and
    # emission_probs[seq, t, state, symbol].

    # Step 7: Simulate sequences
    state_names = [f"State {i+1}" for i in range(n_states)]
    observations = []
    states = []

    for seq_idx in range(n_sequences):
        seq_length = int(sequence_lengths[seq_idx])
        if seq_length == 0:
            # No usable time points: keep states and observations aligned
            # (both empty) instead of emitting a state without an observation.
            states.append([])
            observations.append([])
            continue

        current_state = rng.choice(n_states, p=initial_probs[seq_idx, :])
        seq_states = [state_names[current_state]]
        obs_idx = rng.choice(n_symbols, p=emission_probs[seq_idx, 0, current_state, :])
        seq_obs = [alphabet[obs_idx]]

        for t in range(1, seq_length):
            # The transition into time t uses the probabilities stored at t-1.
            current_state = rng.choice(
                n_states, p=transition_probs[seq_idx, t - 1, current_state, :]
            )
            seq_states.append(state_names[current_state])
            obs_idx = rng.choice(n_symbols, p=emission_probs[seq_idx, t, current_state, :])
            seq_obs.append(alphabet[obs_idx])

        states.append(seq_states)
        observations.append(seq_obs)

    # Steps 8-9: Map simulated values back onto (id, time) pairs.
    # Simulation step t corresponds to the t-th *smallest* time of a sequence,
    # because the covariate matrices are built from time-sorted rows; the
    # times must therefore be sorted here as well. Keys are built by iterating
    # the Series (not .values) so they compare equal to the row-wise keys below.
    obs_dict = {}
    states_list = []
    for seq_idx, (seq_id, seq_times) in enumerate(data.groupby(id_var)[time_var]):
        ordered_times = seq_times.sort_values()
        for time_val, obs, state in zip(
            ordered_times, observations[seq_idx], states[seq_idx]
        ):
            obs_dict[(seq_id, time_val)] = obs
            states_list.append({
                id_var: seq_id,
                time_var: time_val,
                'state': state
            })

    # Rows without a simulated value (e.g. missing time) keep their original response.
    data_sim = data.copy()
    data_sim[response_var] = [
        obs_dict.get(key, original)
        for key, original in zip(
            zip(data[id_var], data[time_var]), data[response_var]
        )
    ]

    states_df = pd.DataFrame(states_list)

    # Step 10: Return results
    return {
        'observations': observations,
        'states': states,
        'data': data_sim,
        'states_df': states_df,
        'model': {
            'n_states': n_states,
            'n_symbols': n_symbols,
            'alphabet': alphabet,
            'state_names': state_names,
            'eta_pi': eta_pi,
            'eta_A': eta_A,
            'eta_B': eta_B,
            'n_covariates_pi': n_covariates_pi,
            'n_covariates_A': n_covariates_A,
            'n_covariates_B': n_covariates_B
        }
    }
799
+
800
+
801
def _create_model_matrix_from_data(
    formula_rhs: str,
    data: pd.DataFrame,
    id_var: str,
    time_var: str,
    n_sequences: int,
    max_length: int,
    rng: np.random.RandomState
) -> np.ndarray:
    """
    Build a 3D covariate matrix from a formula right-hand side and long-format data.

    The formula is split on '+'; every term other than '1' must be the name of
    a numeric column of data. An intercept column of ones is always placed
    first. Sequences are taken in sorted id order and each sequence's rows are
    ordered by time_var. A sequence shorter than max_length is padded by
    repeating its last covariate value; a longer one is truncated.

    Args:
        formula_rhs: Right-hand side of formula (e.g., "x1 + x2" or "1")
        data: DataFrame containing covariates
        id_var: Column name for sequence IDs
        time_var: Column name for time variable
        n_sequences: Number of sequences (at most this many ids are filled)
        max_length: Maximum sequence length
        rng: Random number generator (unused; kept for interface compatibility)

    Returns:
        numpy array: Model matrix of shape (n_sequences, max_length, n_covariates),
        where n_covariates is the number of terms plus one for the intercept.

    Raises:
        ValueError: If a term is not a column of data, or a covariate column
            cannot be converted to float.
    """
    # Empty or "1": intercept-only model.
    if not formula_rhs or formula_rhs.strip() == "1":
        return np.ones((n_sequences, max_length, 1))

    # Terms other than the intercept, which always occupies column 0.
    terms = [
        term
        for term in (raw.strip() for raw in formula_rhs.split('+'))
        if term and term != '1'
    ]

    for term in terms:
        if term not in data.columns:
            raise ValueError(
                f"Variable '{term}' not found in data columns: {list(data.columns)}"
            )

    X = np.zeros((n_sequences, max_length, len(terms) + 1))
    X[:, :, 0] = 1.0

    # Filter and sort each sequence once, then fill every term from it
    # (rather than re-filtering and re-sorting the data for every term).
    unique_ids = sorted(data[id_var].unique())[:n_sequences]
    for seq_idx, seq_id in enumerate(unique_ids):
        seq_data = data[data[id_var] == seq_id].sort_values(time_var)
        n_obs = min(len(seq_data), max_length)
        if n_obs == 0:
            # Nothing observed: covariate columns stay at 0.0.
            continue

        for col_idx, term in enumerate(terms, start=1):
            covar_values = seq_data[term].values
            try:
                X[seq_idx, :n_obs, col_idx] = covar_values[:n_obs]
            except (TypeError, ValueError) as err:
                raise ValueError(
                    f"Covariate '{term}' must be numeric to be used in a model "
                    f"matrix (sequence id {seq_id!r}): {err}"
                ) from err
            # Pad the tail (if any) with the last observed value.
            X[seq_idx, n_obs:, col_idx] = X[seq_idx, n_obs - 1, col_idx]

    return X