sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,121 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : build_nhmm.py
4
+ @Time : 2025-11-22 19:30
5
+ @Desc : Build Non-homogeneous HMM models
6
+
7
+ This module provides the build_nhmm function, which creates Non-homogeneous HMM
8
+ model objects similar to seqHMM's build_nhmm() function in R.
9
+
10
+ Note: This is a simplified implementation. A full implementation would require
11
+ more sophisticated handling of formulas and data structures.
12
+ """
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ from typing import Optional, List, Union
17
+ from sequenzo.define_sequence_data import SequenceData
18
+ from .nhmm import NHMM
19
+ from .formulas import Formula
20
+
21
+
22
def build_nhmm(
    observations: SequenceData,
    n_states: int,
    X: Optional[np.ndarray] = None,
    emission_formula: Optional[Union[str, Formula]] = None,
    initial_formula: Optional[Union[str, Formula]] = None,
    transition_formula: Optional[Union[str, Formula]] = None,
    data: Optional[pd.DataFrame] = None,
    id_var: Optional[str] = None,
    time_var: Optional[str] = None,
    eta_pi: Optional[np.ndarray] = None,
    eta_A: Optional[np.ndarray] = None,
    eta_B: Optional[np.ndarray] = None,
    state_names: Optional[List[str]] = None,
    random_state: Optional[int] = None
) -> NHMM:
    """
    Build a Non-homogeneous Hidden Markov Model object.

    A Non-homogeneous HMM allows transition and emission probabilities to vary
    over time or with covariates. This function creates the model structure but
    does not fit it (use fit_nhmm() for that).

    It is similar to seqHMM's build_nhmm() function in R. Supports both
    direct covariate matrix input and formula-based specification.

    When ``X`` is omitted, a single covariate matrix is built from the first
    formula that is supplied, checked in the order ``emission_formula``,
    ``initial_formula``, ``transition_formula``. The remaining formulas are
    not used to build ``X``.

    Args:
        observations: SequenceData object containing the sequences to model
        n_states: Number of hidden states
        X: Optional covariate matrix of shape (n_sequences, n_timepoints, n_covariates).
            If None, will be created from formulas.
        emission_formula: Optional formula string for emission probabilities (e.g., "~ x1 + x2")
        initial_formula: Optional formula string for initial probabilities
        transition_formula: Optional formula string for transition probabilities
        data: Optional DataFrame containing covariates (required if using formulas)
        id_var: Optional column name for sequence IDs in data (required if using formulas)
        time_var: Optional column name for time variable in data (required if using formulas)
        eta_pi: Optional coefficients for initial probabilities (n_covariates x n_states)
        eta_A: Optional coefficients for transition probabilities (n_covariates x n_states x n_states)
        eta_B: Optional coefficients for emission probabilities (n_covariates x n_states x n_symbols)
        state_names: Optional names for hidden states
        random_state: Random seed for initialization

    Returns:
        NHMM: A Non-homogeneous HMM model object (not yet fitted)

    Raises:
        ValueError: If X is None and any of data, id_var or time_var is
            missing, or if X is None and no formula is given.

    Examples:
        >>> from sequenzo import SequenceData, load_dataset
        >>> from sequenzo.seqhmm import build_nhmm
        >>> import numpy as np
        >>>
        >>> # Load and prepare data
        >>> df = load_dataset('mvad')
        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
        >>>
        >>> # Method 1: Direct covariate matrix
        >>> n_sequences = len(seq.sequences)
        >>> n_timepoints = max(len(s) for s in seq.sequences)
        >>> X = np.zeros((n_sequences, n_timepoints, 1))
        >>> for i in range(n_sequences):
        ...     for t in range(len(seq.sequences[i])):
        ...         X[i, t, 0] = t  # Time covariate
        >>> nhmm = build_nhmm(seq, n_states=4, X=X, random_state=42)
        >>>
        >>> # Method 2: Formula-based (requires data DataFrame)
        >>> nhmm = build_nhmm(
        ...     seq, n_states=4,
        ...     emission_formula="~ time + age",
        ...     data=covariate_df,
        ...     id_var='id',
        ...     time_var='time',
        ...     random_state=42
        ... )
    """
    # Create covariate matrix from formulas if X is not provided
    if X is None:
        if data is None or id_var is None or time_var is None:
            raise ValueError(
                "If X is not provided, must provide data, id_var, and time_var for formula-based specification."
            )

        # First supplied formula wins: emission, then initial, then transition
        formula = emission_formula or initial_formula or transition_formula
        if formula is None:
            raise ValueError("Must provide either X or at least one formula (emission_formula, initial_formula, or transition_formula).")

        # The module-level imports only bring in Formula, so the helper below
        # was an undefined name and this branch raised NameError. Import it
        # locally so the rest of the module is untouched.
        # NOTE(review): assumes create_model_matrix lives in .formulas next to
        # Formula - confirm against that module.
        from .formulas import create_model_matrix

        # Create model matrix
        n_sequences = len(observations.sequences)
        n_timepoints = max(len(seq) for seq in observations.sequences)
        X = create_model_matrix(formula, data, id_var, time_var, n_sequences, n_timepoints)

    # Create and return NHMM object
    nhmm = NHMM(
        observations=observations,
        n_states=n_states,
        X=X,
        eta_pi=eta_pi,
        eta_A=eta_A,
        eta_B=eta_B,
        state_names=state_names,
        random_state=random_state
    )

    return nhmm
@@ -0,0 +1,62 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : fit_mhmm.py
4
+ @Time : 2025-11-21 13:37
5
+ @Desc : Fit Mixture HMM models using EM algorithm
6
+
7
+ This module provides the fit_mhmm function, which estimates Mixture HMM parameters
8
+ using the EM algorithm, similar to seqHMM's fit_model() function for mhmm objects.
9
+ """
10
+
11
+ from typing import Optional
12
+ from .mhmm import MHMM
13
+
14
+
15
def fit_mhmm(
    model: MHMM,
    n_iter: int = 100,
    tol: float = 1e-2,
    verbose: bool = False
) -> MHMM:
    """
    Estimate the parameters of a Mixture HMM with the EM algorithm.

    This is a thin convenience wrapper: it forwards the settings to
    ``model.fit`` and hands back the very same model object. The EM
    procedure alternates between:
    1. E-step: Compute responsibilities (posterior cluster probabilities)
    2. M-step: Update cluster probabilities and HMM parameters for each cluster

    It mirrors seqHMM's fit_model() function for mhmm objects in R.

    Args:
        model: MHMM model object created by build_mhmm()
        n_iter: Upper bound on the number of EM iterations. Default is 100.
        tol: Convergence tolerance. EM stops once the gain in
            log-likelihood drops below this value. Default is 1e-2.
        verbose: Whether to print progress information. Default is False.

    Returns:
        MHMM: The fitted model (same object, modified in place)

    Examples:
        >>> from sequenzo import SequenceData, load_dataset
        >>> from sequenzo.seqhmm import build_mhmm, fit_mhmm
        >>>
        >>> # Load and prepare data
        >>> df = load_dataset('mvad')
        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
        >>>
        >>> # Build and fit model
        >>> mhmm = build_mhmm(seq, n_clusters=3, n_states=4, random_state=42)
        >>> mhmm = fit_mhmm(mhmm, n_iter=100, tol=1e-2, verbose=True)
        >>>
        >>> # Check results
        >>> print(f"Log-likelihood: {mhmm.log_likelihood:.2f}")
        >>> print(f"Iterations: {mhmm.n_iter}")
        >>> print(f"Converged: {mhmm.converged}")
        >>> print(f"Cluster probabilities: {mhmm.cluster_probs}")
    """
    # Collect the EM settings once, then delegate to the model's own fit()
    em_settings = {"n_iter": n_iter, "tol": tol, "verbose": verbose}
    model.fit(**em_settings)

    # fit() mutates the model; return it so calls can be chained
    return model
@@ -0,0 +1,61 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : fit_model.py
4
+ @Time : 2025-11-22 22:57
5
+ @Desc : Fit HMM models using EM algorithm
6
+
7
+ This module provides the fit_model function, which estimates HMM parameters
8
+ using the EM algorithm, similar to seqHMM's fit_model() function in R.
9
+ """
10
+
11
+ from typing import Optional, Dict, Any
12
+ from .hmm import HMM
13
+
14
+
15
def fit_model(
    model: HMM,
    n_iter: int = 100,
    tol: float = 1e-2,
    verbose: bool = False
) -> HMM:
    """
    Estimate the parameters of an HMM with the EM algorithm.

    The initial, transition and emission probabilities of the model are
    estimated by Expectation-Maximization (EM). All of the work is
    delegated to ``model.fit``; this wrapper only forwards the settings
    and returns the same model object.

    It mirrors seqHMM's fit_model() function in R, but currently only
    supports the EM algorithm step (not global or local optimization).

    Args:
        model: HMM model object created by build_hmm()
        n_iter: Upper bound on the number of EM iterations. Default is 100.
        tol: Convergence tolerance. EM stops once the gain in
            log-likelihood drops below this value. Default is 1e-2.
        verbose: Whether to print progress information. Default is False.

    Returns:
        HMM: The fitted model (same object, modified in place)

    Examples:
        >>> from sequenzo import SequenceData, load_dataset
        >>> from sequenzo.seqhmm import build_hmm, fit_model
        >>>
        >>> # Load and prepare data
        >>> df = load_dataset('mvad')
        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
        >>>
        >>> # Build and fit model
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm = fit_model(hmm, n_iter=100, tol=1e-2, verbose=True)
        >>>
        >>> # Check results
        >>> print(f"Log-likelihood: {hmm.log_likelihood:.2f}")
        >>> print(f"Iterations: {hmm.n_iter}")
        >>> print(f"Converged: {hmm.converged}")
    """
    # Delegate to the model's own EM routine, passing every option by keyword
    run_em = model.fit
    run_em(
        n_iter=n_iter,
        tol=tol,
        verbose=verbose,
    )

    # The model was updated in place; return it for convenient chaining
    return model
@@ -0,0 +1,76 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : fit_nhmm.py
4
+ @Time : 2025-11-23 13:38
5
+ @Desc : Fit Non-homogeneous HMM models
6
+
7
+ This module provides the fit_nhmm function, which estimates NHMM parameters
8
+ using numerical optimization, similar to seqHMM's fit_nhmm() function in R.
9
+
10
+ Note: This is a simplified implementation. A full implementation would use
11
+ the forward-backward algorithm and proper gradient computation.
12
+ """
13
+
14
+ from typing import Optional
15
+ from .nhmm import NHMM
16
+
17
+
18
def fit_nhmm(
    model: NHMM,
    n_iter: int = 100,
    tol: float = 1e-4,
    verbose: bool = False
) -> NHMM:
    """
    Estimate the coefficients of a non-homogeneous HMM and return the model.

    The function is a thin wrapper: all work is delegated to ``model.fit``,
    which receives the three options unchanged, and the same object is
    returned afterwards. The calling convention follows seqHMM's
    fit_nhmm() in R.

    Note: the underlying fit is described by the module as a simplified
    implementation; a complete one would use the forward-backward
    algorithm for the exact log-likelihood, analytical gradients and a
    more sophisticated optimiser.

    Args:
        model: NHMM model object, e.g. one created by build_nhmm().
        n_iter: Upper bound on the number of optimisation iterations.
            Default is 100.
        tol: Convergence tolerance forwarded to ``model.fit``.
            Default is 1e-4.
        verbose: Whether progress information should be printed.
            Default is False.

    Returns:
        NHMM: The object passed in as ``model`` (not a copy).

    Examples:
        >>> from sequenzo.seqhmm import build_nhmm, fit_nhmm
        >>> nhmm = build_nhmm(seq, n_states=4, X=X, random_state=42)
        >>> nhmm = fit_nhmm(nhmm, n_iter=100, tol=1e-4, verbose=True)
    """
    # Bundle the optimiser settings, then delegate to the model itself.
    optimiser_settings = dict(n_iter=n_iter, tol=tol, verbose=verbose)
    model.fit(**optimiser_settings)

    # Return the identical instance so the call can be used fluently.
    return model
@@ -0,0 +1,289 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : formulas.py
4
+ @Time : 2025-10-18 16:23
5
+ @Desc : Formula-based covariate specification for NHMM
6
+
7
+ This module provides a formula interface for specifying covariates in NHMM,
8
+ similar to seqHMM's formula interface in R. Users can specify covariates
9
+ using a string formula like "~ x1 + x2" instead of manually creating
10
+ covariate matrices.
11
+
12
+ Note: This is a simplified implementation. A full implementation would
13
+ support more complex formulas (interactions, transformations, etc.).
14
+ """
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ from typing import Optional, List, Union, Dict
19
+ from sequenzo.define_sequence_data import SequenceData
20
+
21
+
22
class Formula:
    """
    Additive covariate formula such as ``"~ x1 + x2"``.

    The formula string is reduced to a list of column names (``terms``).
    ``create_matrix`` looks those names up in a DataFrame and builds a
    covariate array whose first column is an intercept.

    Only ``+``-separated column names are understood; interactions and
    transformations are not parsed.

    Attributes:
        formula: The formula text with surrounding whitespace and the
            optional leading ``~`` removed.
        terms: Column names extracted from ``formula``, in order.

    Examples:
        >>> formula = Formula("~ age + gender")
        >>> X = formula.create_matrix(data, id_var='id', time_var='time',
        ...                           n_sequences=2, n_timepoints=3)
    """

    def __init__(self, formula: str):
        """
        Initialize a formula object.

        Args:
            formula: Formula string, e.g., "~ x1 + x2" or "x1 + x2"
                (the tilde is optional).
        """
        formula = formula.strip()

        # The tilde carries no information here (there is no left-hand side).
        if formula.startswith('~'):
            formula = formula[1:].strip()

        self.formula = formula
        self.terms = self._parse_formula(formula)

    def _parse_formula(self, formula: str) -> List[str]:
        """
        Split a formula string into its terms.

        Args:
            formula: Formula string without the leading tilde.

        Returns:
            List of variable names; empty for an empty formula. Empty
            fragments (e.g. from a trailing ``+``) are discarded.
        """
        if not formula:
            return []

        stripped = (term.strip() for term in formula.split('+'))
        return [term for term in stripped if term]

    def create_matrix(
        self,
        data: pd.DataFrame,
        id_var: str,
        time_var: str,
        n_sequences: int,
        n_timepoints: int
    ) -> np.ndarray:
        """
        Create covariate matrix from formula and data.

        Two input layouts are handled:

        * If both ``id_var`` and ``time_var`` are columns of ``data``, each
          term is pivoted to an id-by-time table. Rows/columns beyond
          ``n_sequences``/``n_timepoints`` are dropped; cells not covered
          by the pivot keep the initial value 0.
        * Otherwise the rows are assumed to be ordered sequence by
          sequence, and each term is reshaped to
          ``(n_sequences, n_timepoints)``.

        Args:
            data: DataFrame containing covariates
            id_var: Column name for sequence IDs
            time_var: Column name for time variable
            n_sequences: Number of sequences
            n_timepoints: Number of time points

        Returns:
            numpy array of shape (n_sequences, n_timepoints, n_terms + 1);
            index 0 on the last axis is the intercept (all ones). With no
            terms the result is an all-ones array with a single column.

        Raises:
            ValueError: If a term is not a column of ``data``, or (in the
                reshape layout) the column length differs from
                ``n_sequences * n_timepoints``.
        """
        if not self.terms:
            # No covariates: intercept only.
            return np.ones((n_sequences, n_timepoints, 1))

        X = np.zeros((n_sequences, n_timepoints, len(self.terms) + 1))
        X[:, :, 0] = 1.0  # intercept

        # The layout decision does not depend on the term, so make it once.
        long_format = id_var in data.columns and time_var in data.columns

        # Covariates start at column 1 because column 0 is the intercept.
        for col_idx, term in enumerate(self.terms, start=1):
            if term not in data.columns:
                raise ValueError(f"Variable '{term}' not found in data columns: {list(data.columns)}")

            if long_format:
                wide = data.pivot(index=id_var, columns=time_var, values=term).to_numpy()
                # Clip to the requested size; one sliced assignment replaces
                # the per-cell lookup loop and leaves uncovered cells at 0.
                block = wide[:n_sequences, :n_timepoints]
                X[:block.shape[0], :block.shape[1], col_idx] = block
            else:
                # Assume row-major order (sequence by sequence).
                covar_values = data[term].values
                if len(covar_values) != n_sequences * n_timepoints:
                    raise ValueError(
                        f"Data length ({len(covar_values)}) doesn't match "
                        f"n_sequences * n_timepoints ({n_sequences * n_timepoints})"
                    )
                X[:, :, col_idx] = covar_values.reshape(n_sequences, n_timepoints)

        return X
141
+
142
+
143
def create_model_matrix(
    formula: Union[str, Formula],
    data: pd.DataFrame,
    id_var: str,
    time_var: str,
    n_sequences: int,
    n_timepoints: int
) -> np.ndarray:
    """
    Build a covariate matrix from a formula and a DataFrame.

    Convenience entry point in the spirit of seqHMM's model_matrix(): a
    formula given as text is first wrapped in a ``Formula``; the actual
    work is then done by that object's ``create_matrix`` method, which
    receives the remaining arguments unchanged.

    Args:
        formula: Formula string (e.g., "~ x1 + x2") or Formula object
        data: DataFrame containing covariates
        id_var: Column name for sequence IDs
        time_var: Column name for time variable
        n_sequences: Number of sequences
        n_timepoints: Number of time points

    Returns:
        numpy array: Covariate matrix (n_sequences, n_timepoints, n_covariates)

    Examples:
        >>> import pandas as pd
        >>> from sequenzo.seqhmm import create_model_matrix
        >>>
        >>> data = pd.DataFrame({
        ...     'id': [1, 1, 1, 2, 2, 2],
        ...     'time': [1, 2, 3, 1, 2, 3],
        ...     'age': [20, 21, 22, 25, 26, 27],
        ...     'gender': [0, 0, 0, 1, 1, 1]
        ... })
        >>> X = create_model_matrix("~ age + gender", data, 'id', 'time',
        ...                         n_sequences=2, n_timepoints=3)
        >>> print(X.shape)  # (2, 3, 3): intercept + age + gender
    """
    # Accept both spellings: plain text is parsed, objects are used as-is.
    parsed = Formula(formula) if isinstance(formula, str) else formula

    matrix_args = (data, id_var, time_var, n_sequences, n_timepoints)
    return parsed.create_matrix(*matrix_args)
188
+
189
+
190
def create_model_matrix_time_constant(
    formula: Union[str, Formula, None],
    data: Optional[pd.DataFrame],
    n_sequences: int
) -> np.ndarray:
    """
    Create model matrix for time-constant covariates (one value per sequence).

    The covariates are constant across time points for each sequence, so
    the output has one row per sequence: shape (n_sequences, n_covariates),
    where the first column is an intercept. Numeric columns are copied as
    they are; categorical, object and string columns are expanded into
    dummy variables with the first level dropped as the reference.

    This is similar to R's model.matrix() for time-constant covariates.

    Args:
        formula: Formula string (e.g., "~ covariate_1 + covariate_2") or
            Formula object. If None, an intercept-only matrix is returned
            and ``data`` is ignored.
        data: DataFrame containing covariates. Must have n_sequences rows;
            each row corresponds to one sequence.
        n_sequences: Number of sequences

    Returns:
        numpy array: Model matrix of shape (n_sequences, n_covariates).
            First column is always the intercept (ones); subsequent
            columns follow the order of the formula terms.

    Raises:
        ValueError: If a formula is given without data, if ``data`` does
            not have exactly ``n_sequences`` rows, or if a term is not a
            column of ``data``.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> from sequenzo.seqhmm.formulas import create_model_matrix_time_constant
        >>>
        >>> data = pd.DataFrame({
        ...     'covariate_1': np.random.rand(10),
        ...     'covariate_2': np.random.choice(['A', 'B'], size=10)
        ... })
        >>> X = create_model_matrix_time_constant("~ covariate_1 + covariate_2", data, n_sequences=10)
        >>> print(X.shape)  # (10, 3): intercept, covariate_1, covariate_2_B
    """
    # If no formula is provided, return intercept-only matrix
    if formula is None:
        return np.ones((n_sequences, 1))

    if isinstance(formula, str):
        formula = Formula(formula)

    if data is None:
        raise ValueError("If formula is provided, data must also be provided")

    if len(data) != n_sequences:
        raise ValueError(
            f"Number of rows in data ({len(data)}) must equal n_sequences ({n_sequences})"
        )

    # Column 0 is the intercept; covariate columns are appended in term order.
    columns = [np.ones(n_sequences)]

    for term in formula.terms:
        if term not in data.columns:
            raise ValueError(
                f"Variable '{term}' not found in data columns: {list(data.columns)}"
            )

        dtype = data[term].dtype

        # isinstance(..., CategoricalDtype) replaces the deprecated
        # is_categorical_dtype(); the string check additionally covers
        # pandas' dedicated string dtype, which is not object dtype and
        # would otherwise be stacked as raw text.
        is_factor = (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

        if is_factor:
            # First level is dropped and acts as the reference category.
            # dtype=float keeps the indicator columns numeric regardless of
            # the pandas default (bool in recent versions).
            dummies = pd.get_dummies(
                data[[term]], prefix=term, drop_first=True, dtype=float
            )
            columns.extend(dummies[name].to_numpy() for name in dummies.columns)
        else:
            # Numeric variable: add as is
            columns.append(data[term].to_numpy())

    return np.column_stack(columns)