sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,850 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : sequence_analysis_multi_state_model.py
4
+ @Time : 30/09/2025 20:27
5
+ @Desc : Sequence Analysis Multi-state Model (SAMM) for event history analysis
6
+
7
+ This module provides tools for analyzing sequences through a multi-state perspective,
8
+ creating person-period datasets that can be used for event history analysis.
9
+
10
+ Based on the TraMineR package's SAMM functionality.
11
+
12
+ IMPORTANT DIFFERENCES FROM R'S TraMineR IMPLEMENTATION:
13
+
14
+ Plotting Approach Differences:
15
+
16
+ R's plot.SAMM() function:
17
+ - Uses TraMineR's seqplot() function with grouping
18
+ - Original R code: plot.SAMM <- function(x, type="d", ...){
19
+ seqdata <- attr(x, "stslist")[x$transition,]
20
+ group <- x[x$transition, attr(x, "sname")[1]]
21
+ levels(group) <- paste("Transition out of", levels(group))
22
+ seqplot(seqdata, group=group, type=type, ...)
23
+ }
24
+ - Creates grouped sequence plots where sequences are grouped by starting state
25
+ - Relies on TraMineR's built-in plotting system
26
+ - Links:
27
+ Source code: https://rdrr.io/cran/TraMineRextras/src/R/seqsamm.R
28
+ Documentation: https://cran.r-project.org/web/packages/TraMineRextras/refman/TraMineRextras.html#seqsha
29
+
30
+ Our Python plot_samm() function:
31
+ - Uses matplotlib's imshow() with sequence index plot approach
32
+ - Creates separate subplots for each starting state (one subplot per transition state)
33
+ - Each subplot shows all subsequences that start with a specific state as colored horizontal bars
34
+ - Displays actual sequence patterns using a color-coded matrix visualization
35
+ - Automatically handles varying numbers of sequences per state with dynamic subplot heights
36
+
37
+ Why We Made This Choice:
38
+ 1. Better Visual Separation: Each starting state gets its own dedicated subplot,
39
+ making it easier to compare patterns across different states
40
+ 2. Scalability: Works well with large numbers of sequences and states
41
+ 3. Clarity: Direct visualization of subsequence patterns without grouping artifacts
42
+ 4. Python Ecosystem: Leverages matplotlib's powerful visualization capabilities
43
+ 5. Detail Preservation: Shows individual sequence patterns rather than aggregate summaries
44
+
45
+ Both approaches show transition patterns effectively, but our Python implementation
46
+ provides more detailed, subplot-based visualizations that are particularly suitable
47
+ for exploratory data analysis and detailed pattern inspection.
48
+ """
49
+
50
+ import numpy as np
51
+ import pandas as pd
52
+ from typing import Optional, Union, Dict, List, Tuple
53
+ import matplotlib.pyplot as plt
54
+
55
+ # Import the SequenceData class from the parent package
56
+ from sequenzo.define_sequence_data import SequenceData
57
+
58
+
59
class SAMM:
    """
    Sequence Analysis Multi-state Model (SAMM) object.

    Stores a person-period dataset generated from sequence data, where each
    row represents one time point for one person, along with information
    about subsequences, transitions, and spell characteristics.

    Attributes:
        data (pd.DataFrame): The person-period dataset.
        alphabet (list): The state space (unique states in the sequences).
        labels (list): Labels for the states.
        color_map (dict): Color mapping for visualization.
        sname (list): Column names for subsequence variables
            (e.g., ['s.1', 's.2', 's.3']).
        sublength (int): Length of the subsequences being tracked.
    """

    def __init__(self, data: pd.DataFrame, alphabet: list, labels: list,
                 color_map: dict, sname: list, sublength: int):
        """
        Initialize a SAMM object.

        Args:
            data: Person-period dataset.
            alphabet: List of unique states.
            labels: Labels for states.
            color_map: Dictionary mapping states to colors.
            sname: List of subsequence column names.
            sublength: Length of subsequences.
        """
        self.data = data
        self.alphabet = alphabet
        self.labels = labels
        self.color_map = color_map
        self.sname = sname
        self.sublength = sublength

        # Ensure a typology column always exists. The literal string 'None'
        # marks rows whose typology has not yet been assigned (a real
        # typology is expected to be set later via set_typology).
        if 'typology' not in self.data.columns:
            self.data['typology'] = 'None'

    def __repr__(self):
        """Concise string representation for debugging."""
        return f"SAMM(n_rows={len(self.data)}, sublength={self.sublength})"

    def __len__(self):
        """Return the number of rows in the person-period dataset."""
        return len(self.data)
107
+
108
+
109
+ def sequence_analysis_multi_state_model(seqdata: SequenceData, sublength: int, covar: Optional[pd.DataFrame] = None) -> SAMM:
110
+ """
111
+ Generate a person-period dataset from sequence data for multi-state analysis.
112
+
113
+ This function transforms sequence data into a "person-period" format where each row
114
+ represents one time point for one individual. At each time position, it also extracts
115
+ the subsequence for the next 'sublength' time units.
116
+
117
+ **What is person-period data?**
118
+ Instead of having one row per person with all their time points as columns,
119
+ person-period data has one row for each person-time combination. For example,
120
+ if we track 3 people over 5 time periods, we get 15 rows (3 x 5).
121
+
122
+ **What are subsequences?**
123
+ At each time point, we look ahead and record what happens in the next few time periods.
124
+ For example, if sublength=3 and we're at time 2, we record states at time 2, 3, and 4.
125
+
126
+ Args:
127
+ seqdata (SequenceData): A SequenceData object containing your sequence data.
128
+ This should be created using the SequenceData class.
129
+ sublength (int): The length of the subsequence to extract at each time point.
130
+ For example, if sublength=3, we look 3 steps ahead from each position.
131
+ covar (pd.DataFrame, optional): Time-invariant covariates (variables that don't change over time).
132
+ For example: gender, education level, birth year, etc.
133
+ The row index should match the sequence IDs.
134
+
135
+ Returns:
136
+ SAMM: A SAMM object containing the person-period dataset with the following variables:
137
+ - id: Identifier for each sequence/person
138
+ - time: Time elapsed since the beginning of the sequence (starts at 1)
139
+ - begin: Time when the current spell began
140
+ - spell_time: Time elapsed since the beginning of the current spell
141
+ - transition: Boolean indicator (True if there's a state transition at this point)
142
+ - s.1, s.2, ..., s.X: The subsequence values (number depends on sublength)
143
+ - Additional covariate columns (if covar was provided)
144
+
145
+ Example:
146
+ >>> # Suppose we have sequence data tracking employment states
147
+ >>> # States: 'employed', 'unemployed', 'education'
148
+ >>> # We want to analyze what happens in the next 3 time periods
149
+ >>> samm_obj = sequence_analysis_multi_state_model(my_seqdata, sublength=3)
150
+ >>> # Now we can use this for event history analysis
151
+ """
152
+
153
+ # Extract the sequence data as a numpy array (rows=individuals, columns=time points)
154
+ # Each cell contains a numeric code representing a state (1, 2, 3, etc.)
155
+ seqdata_array = seqdata.values
156
+ n_individuals = seqdata_array.shape[0] # Number of sequences/people
157
+ n_timepoints = seqdata_array.shape[1] # Length of each sequence
158
+
159
+ # Create column names for the subsequence variables
160
+ # For example, if sublength=3, this creates ['s.1', 's.2', 's.3']
161
+ sname = [f's.{i+1}' for i in range(sublength)]
162
+
163
+ # Get the IDs for each sequence
164
+ # If the SequenceData has an ID column, use it; otherwise use row numbers
165
+ if seqdata.id_col is not None:
166
+ id_values = seqdata.ids
167
+ else:
168
+ id_values = np.arange(1, n_individuals + 1)
169
+
170
+ # This will store all the person-period rows as we process each time point
171
+ all_subseq_list = []
172
+
173
+ # Track when each individual's current spell began
174
+ # A "spell" is a continuous period in the same state
175
+ # Initialize: everyone's spell begins at time 1
176
+ spell_begin = np.ones(n_individuals, dtype=int)
177
+
178
+ # Loop through each time point (but stop before the end to ensure we have enough future points)
179
+ # For example, if sublength=3 and we have 10 time points, we only go up to time point 7
180
+ # because from time 8, 9, 10 we can't look 3 steps ahead
181
+ for tt in range(n_timepoints - sublength + 1):
182
+
183
+ # Extract the subsequence starting at time 'tt' and going for 'sublength' time units
184
+ # For example, if tt=2 and sublength=3, extract columns 2, 3, 4
185
+ subseq = seqdata_array[:, tt:(tt + sublength)]
186
+
187
+ # Create a DataFrame for this subsequence with proper column names
188
+ subseq_df = pd.DataFrame(subseq, columns=sname)
189
+
190
+ # Detect transitions: A transition occurs when the state changes from this time to the next
191
+ # Compare the first column (current state) with the second column (next state)
192
+ transition = (subseq_df['s.1'].values != subseq_df['s.2'].values)
193
+
194
+ # Update spell begin times
195
+ # If this isn't the first time point, check if there was a state change from previous time
196
+ if tt > 0:
197
+ # Get the state at the previous time point and current time point
198
+ prev_state = seqdata_array[:, tt - 1]
199
+ curr_state = seqdata_array[:, tt]
200
+
201
+ # Find where the state changed (spell reset)
202
+ spell_reset_mask = (prev_state != curr_state)
203
+
204
+ # For those individuals, update their spell begin time to current time (tt + 1, since time starts at 1)
205
+ spell_begin[spell_reset_mask] = tt + 1
206
+
207
+ # Calculate spell duration: how long has the current spell lasted?
208
+ # This is the current time minus when the spell began
209
+ spell_time = (tt + 1) - spell_begin
210
+
211
+ # Create the person-period dataset for this time point
212
+ # Each row represents one individual at this specific time point
213
+ subseq_record = pd.DataFrame({
214
+ 'id': id_values, # Individual identifier
215
+ 'time': tt + 1, # Current time point (1-indexed)
216
+ 'begin': spell_begin, # When current spell began
217
+ 'spell_time': spell_time, # Duration of current spell
218
+ 'transition': transition # Whether transition occurs
219
+ })
220
+
221
+ # Add the subsequence columns (s.1, s.2, ..., s.X)
222
+ subseq_record = pd.concat([subseq_record, subseq_df], axis=1)
223
+
224
+ # Add this time point's data to our collection
225
+ all_subseq_list.append(subseq_record)
226
+
227
+ # Combine all time points into one large person-period dataset
228
+ # Stack all the DataFrames on top of each other
229
+ result = pd.concat(all_subseq_list, ignore_index=True)
230
+
231
+ # If time-invariant covariates were provided, merge them in
232
+ # These are variables that don't change over time (e.g., gender, birth year)
233
+ if covar is not None:
234
+ # Match covariates to IDs
235
+ covar_with_id = covar.copy()
236
+ covar_with_id['id'] = id_values
237
+
238
+ # Merge the covariates into our person-period data based on ID
239
+ result = result.merge(covar_with_id, on='id', how='left')
240
+
241
+ # Sort the data by ID and time for easier reading
242
+ result = result.sort_values(['id', 'time']).reset_index(drop=True)
243
+
244
+ # Map numeric state codes back to their readable labels
245
+ # First map to states, then to labels for better interpretability
246
+ inverse_mapping = seqdata.inverse_state_mapping # Maps numeric codes to states
247
+ state_to_label = seqdata.state_to_label # Maps states to descriptive labels
248
+
249
+ for col in sname:
250
+ # First convert numeric codes to states
251
+ result[col] = result[col].map(inverse_mapping)
252
+ # Then convert states to labels for better readability
253
+ result[col] = result[col].map(state_to_label)
254
+
255
+ # Create and return the SAMM object
256
+ samm_obj = SAMM(
257
+ data=result,
258
+ alphabet=seqdata.alphabet,
259
+ labels=seqdata.labels,
260
+ color_map=seqdata.color_map,
261
+ sname=sname,
262
+ sublength=sublength
263
+ )
264
+
265
+ return samm_obj
266
+
267
+
268
def plot_samm(samm: SAMM, plot_type: str = "d", base_width: int = 15,
              title: Optional[str] = None, save_as: Optional[str] = None,
              dpi: int = 200, fontsize: int = 10) -> None:
    """
    Plot subsequences following transitions in the SAMM data using sequence index plots.

    This function creates sequence index visualizations showing what subsequences occur
    after transitions out of each state. Similar to R's TraMineR seqplot function.

    **What does this show?**
    For each state, this displays the actual subsequence patterns (as colored bars)
    that occur when individuals transition OUT of that state. Each row is one sequence,
    and colors represent different states in the subsequence.

    Args:
        samm (SAMM): A SAMM object created by sequence_analysis_multi_state_model()
        plot_type (str): Type of plot to create (currently supports 'd' for sequence index plot).
            NOTE(review): this parameter is currently accepted but never read in the body;
            only the index-plot style is produced regardless of its value.
        base_width (int): Base width for the figure. Default 15 (wider for better proportions).
        title (str, optional): Custom title for the plot
        save_as (str, optional): File path to save the plot (if None, plot is displayed)
        dpi (int): Resolution for saved images
        fontsize (int): Base font size for labels and titles

    Example:
        >>> samm_obj = sequence_analysis_multi_state_model(my_seqdata, sublength=3)
        >>> plot_samm(samm_obj, title="Transition Patterns")
    """

    # Import visualization utilities
    # NOTE(review): BytesIO is imported but not used directly here; the buffer
    # handling is delegated to save_figure_to_buffer/create_standalone_legend.
    from io import BytesIO
    from sequenzo.visualization.utils import (
        create_standalone_legend,
        combine_plot_with_legend,
        save_figure_to_buffer
    )
    from matplotlib.colors import ListedColormap

    # Filter to only rows where a transition occurs
    transition_rows = samm.data[samm.data['transition'] == True].copy()

    if len(transition_rows) == 0:
        print("No transitions found in the data.")
        return

    # Group by the starting state (s.1) to see transitions out of each state
    starting_states = sorted(transition_rows['s.1'].unique())

    # Create subplots: one for each starting state
    n_states = len(starting_states)
    ncols = min(3, n_states)  # Maximum 3 columns
    nrows = int(np.ceil(n_states / ncols))

    # Calculate dynamic heights for each subplot based on number of sequences
    # We'll use gridspec to allow different heights
    from matplotlib import gridspec

    # First, count sequences for each state to determine heights
    state_seq_counts = {}
    for state in starting_states:
        state_seq_counts[state] = len(transition_rows[transition_rows['s.1'] == state])

    # Calculate height ratios - base height per sequence, min 2.5, max 5 for better aspect ratio
    height_ratios = []
    for i in range(nrows):
        row_states = starting_states[i*ncols : (i+1)*ncols]
        if row_states:
            max_seqs_in_row = max([state_seq_counts[s] for s in row_states])
            # Height: 2.5-5 inches, scaled by number of sequences
            # Use smaller scaling factor (0.01 instead of 0.015) to make plots less stretched
            height = min(5, max(2.5, max_seqs_in_row * 0.01))
            height_ratios.append(height)

    # Calculate total figure height with more spacing
    total_height = sum(height_ratios) + (nrows - 1) * 2.0  # Add more spacing between rows

    # Create figure with GridSpec for flexible heights
    fig = plt.figure(figsize=(base_width, total_height))
    gs = gridspec.GridSpec(nrows, ncols, figure=fig, height_ratios=height_ratios,
                           hspace=0.5, wspace=0.25)  # Adjusted spacing for better layout

    # Create a reverse mapping from labels back to numeric codes for plotting
    # (codes are 1-based so they line up with vmin=1 in imshow below)
    label_to_numeric = {label: i + 1 for i, label in enumerate(samm.labels)}

    # Use the color map from the original sequence data
    # NOTE(review): assumes samm.color_map keys sort into the same 1..N order
    # as samm.labels — confirm against the SAMM constructor.
    cmap = ListedColormap([samm.color_map[i] for i in sorted(samm.color_map.keys())])

    # For each starting state, create a sequence index plot
    for idx, state in enumerate(starting_states):
        row = idx // ncols
        col = idx % ncols
        ax = fig.add_subplot(gs[row, col])

        # Get all subsequences that start with this state and have a transition
        state_data = transition_rows[transition_rows['s.1'] == state].copy()

        # Extract subsequence columns and convert labels to numeric codes
        subseq_cols = samm.sname
        subseq_matrix = state_data[subseq_cols].values

        # Convert label strings to numeric codes for plotting;
        # unknown or missing labels become NaN and are masked out below
        numeric_matrix = np.zeros_like(subseq_matrix, dtype=float)
        for i in range(subseq_matrix.shape[0]):
            for j in range(subseq_matrix.shape[1]):
                label = subseq_matrix[i, j]
                if pd.notna(label) and label in label_to_numeric:
                    numeric_matrix[i, j] = label_to_numeric[label]
                else:
                    numeric_matrix[i, j] = np.nan

        # Plot with masked array for NaN handling
        ax.imshow(np.ma.masked_invalid(numeric_matrix),
                  aspect='auto',
                  cmap=cmap,
                  interpolation='nearest',
                  vmin=1,
                  vmax=len(samm.labels))

        # Disable grid
        ax.grid(False)

        # Set title showing the starting state with count
        num_seqs = numeric_matrix.shape[0]
        title_text = f'Transitions out of: {state} (n={num_seqs})'

        # Break long titles into multiple lines
        if len(title_text) > 35:  # If title is too long
            # Try to break at a natural point
            if 'Transitions out of:' in title_text:
                parts = title_text.split('Transitions out of:')
                if len(parts) == 2:
                    title_text = f'Transitions out of:\n{parts[1].strip()}'

        ax.set_title(title_text, fontsize=fontsize+1, pad=12, color='black')

        # X-axis: time steps in subsequence (t+0 is the spell state itself)
        ax.set_xlabel('Subsequence Position', fontsize=fontsize, labelpad=8, color='black')
        xticks = np.arange(len(subseq_cols))
        ax.set_xticks(xticks)
        ax.set_xticklabels([f't+{i}' for i in range(len(subseq_cols))],
                           fontsize=fontsize-2, color='gray')

        # Y-axis: sequence count
        ax.set_ylabel('Sequences', fontsize=fontsize, labelpad=8, color='black')

        # Smart y-tick display based on sequence count:
        # every row for <=10, fixed step for <=50, quartiles otherwise
        if num_seqs <= 10:
            yticks = np.arange(num_seqs)
            ax.set_yticks(yticks)
            ax.set_yticklabels(range(1, num_seqs + 1), fontsize=fontsize-2, color='gray')
        elif num_seqs <= 50:
            # Show every 5th or 10th
            step = 5 if num_seqs <= 25 else 10
            yticks = np.arange(0, num_seqs, step)
            if yticks[-1] != num_seqs - 1:
                yticks = np.append(yticks, num_seqs - 1)
            ax.set_yticks(yticks)
            ax.set_yticklabels([str(y + 1) for y in yticks], fontsize=fontsize-2, color='gray')
        else:
            # Show quartiles for large numbers
            ytick_positions = [0, num_seqs // 4, num_seqs // 2, 3 * num_seqs // 4, num_seqs - 1]
            ax.set_yticks(ytick_positions)
            ax.set_yticklabels([str(pos + 1) for pos in ytick_positions],
                               fontsize=fontsize-2, color='gray')

        # Style axis spines and ticks like index plot
        for spine in ['top', 'right']:
            ax.spines[spine].set_visible(False)
        for spine in ['bottom', 'left']:
            ax.spines[spine].set_color('gray')
            ax.spines[spine].set_linewidth(0.8)

        # Tick parameters matching index plot style
        ax.tick_params(axis='x', colors='gray', length=4, width=0.7, which='major')
        ax.tick_params(axis='y', colors='gray', length=4, width=0.7, which='major')
        ax.tick_params(axis='both', which='major', direction='out')

    # Adjust layout first
    plt.tight_layout(rect=[0, 0, 1, 0.95])  # Leave less space at top for title

    # Add overall title if provided (after tight_layout to prevent overlap)
    if title:
        fig.suptitle(title, fontsize=fontsize+4, y=0.93, color='black')

    # Save main figure to buffer
    main_buffer = save_figure_to_buffer(fig, dpi=dpi)

    # Create standalone legend using the same style as index plot
    # NOTE(review): assumes color_map is keyed 1..len(labels) — confirm with SAMM.
    colors = {samm.labels[i]: samm.color_map[i+1] for i in range(len(samm.labels))}
    legend_buffer = create_standalone_legend(
        colors=colors,
        labels=samm.labels,
        ncol=min(5, len(samm.labels)),
        figsize=(base_width, 1),
        fontsize=fontsize,
        dpi=dpi
    )

    # Combine plot with legend; default to PNG when no known extension is given
    if save_as and not save_as.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
        save_as = save_as + '.png'

    combined_img = combine_plot_with_legend(
        main_buffer,
        legend_buffer,
        output_path=save_as,
        dpi=dpi,
        padding=20
    )

    # Display combined image
    plt.figure(figsize=(base_width, total_height + 1))
    plt.imshow(combined_img)
    plt.axis('off')
    plt.show()
    plt.close('all')
483
+
484
+
485
def seqsammseq(samm: SAMM, spell: str) -> pd.DataFrame:
    """
    Return the subsequences that follow transitions out of a given state.

    A row qualifies when its first subsequence position ('s.1') equals
    ``spell`` AND a transition occurs at that point. Only the subsequence
    columns (s.1, s.2, ...) are returned.

    **Why is this useful?**
    It answers questions such as "what happens after someone becomes
    unemployed?" or "what patterns follow graduation?" by isolating exactly
    the trajectories observed right after leaving the state of interest.

    Args:
        samm (SAMM): A SAMM object created by sequence_analysis_multi_state_model()
        spell (str): The state to analyze transitions from
                     (e.g., 'employed', 'single', 'education')

    Returns:
        pd.DataFrame: The subsequence columns for every transition out of
        ``spell``, with a fresh 0-based index.

    Example:
        >>> unemployed_subsequences = seqsammseq(samm_obj, spell='unemployed')
        >>> print(unemployed_subsequences.head())
    """
    frame = samm.data

    # Two filters combined: the spell matches AND the row is a transition row.
    starts_in_spell = frame['s.1'] == spell
    is_transition = frame['transition'] == True

    # Project onto the subsequence columns only, on an independent copy.
    selected = frame.loc[starts_in_spell & is_transition, samm.sname].copy()

    # Re-index from 0 for cleaner downstream use.
    return selected.reset_index(drop=True)
528
+
529
+
530
def _expand_typology_for_transitions(
    samm: SAMM,
    spell: str,
    mapping: Union[Dict, pd.Series],
    by: Optional[str] = None,
    cluster_to_name: Optional[Dict] = None
) -> np.ndarray:
    """
    Build a row-aligned typology vector for transition rows given a mapping.

    Parameters
    ----------
    samm : SAMM
        The SAMM object.
    spell : str
        The state to analyze transitions out of.
    mapping : dict or pandas.Series
        Either a mapping of id -> cluster/label, or (id, begin) -> cluster/label.
        Values can be final label strings, or cluster ids to be mapped via cluster_to_name.
    by : {"id", "id_begin"}, optional
        If None, auto-detect by inspecting mapping keys/index. Use "id_begin" when
        mapping is keyed by (id, begin). For a Series, the key type is implied by
        the number of index levels; an explicit `by` that conflicts with the index
        now raises instead of being silently ignored (bug fix).
    cluster_to_name : dict, optional
        Mapping from cluster id to human-readable label. Required if mapping values
        are cluster ids rather than label strings.

    Returns
    -------
    numpy.ndarray
        A vector of labels aligned to samm.data.loc[(s.1==spell) & transition].

    Raises
    ------
    ValueError
        If `by` is invalid or conflicts with a Series index, if any transition
        row has no entry in `mapping`, or if `cluster_to_name` lacks a key.
    """
    # Rows this vector must align with: spell rows where a transition occurs.
    condition = (samm.data['s.1'] == spell) & (samm.data['transition'] == True)
    trans_df = samm.data.loc[condition, ['id', 'begin']].copy()

    # Normalize mapping to a plain dict for fast lookup, and decide the key type.
    if isinstance(mapping, pd.Series):
        if mapping.index.nlevels == 1:
            normalized: Dict = mapping.to_dict()
            inferred_by = 'id'
        elif mapping.index.nlevels == 2:
            normalized = {tuple(idx): val for idx, val in mapping.items()}
            inferred_by = 'id_begin'
        else:
            raise ValueError("Mapping Series index must be 1 or 2 levels: id or (id, begin)")
        # Bug fix: an explicit `by` used to be silently ignored for Series input.
        # A conflict between `by` and the Series index is now reported.
        if by is not None and by != inferred_by:
            raise ValueError(
                f"Parameter 'by'={by!r} conflicts with the mapping Series index, "
                f"which implies {inferred_by!r}"
            )
    else:
        normalized = dict(mapping)
        # Auto-detect key type when by is not provided
        if by is None:
            if len(normalized) == 0:
                inferred_by = 'id'  # default
            else:
                sample_key = next(iter(normalized.keys()))
                inferred_by = 'id_begin' if isinstance(sample_key, tuple) and len(sample_key) == 2 else 'id'
        else:
            inferred_by = by

    def _resolve(val, key_desc: str):
        # Map a raw mapping value through cluster_to_name when one is supplied;
        # NaN values and plain label strings pass through unchanged.
        if cluster_to_name is not None and pd.notna(val):
            try:
                return cluster_to_name[val]
            except KeyError:
                raise ValueError(f"cluster_to_name is missing key {val!r} for {key_desc}")
        return val

    labels: List[str] = []
    missing_keys: List[Union[int, Tuple[int, int]]] = []

    if inferred_by == 'id':
        for pid in trans_df['id'].tolist():
            if pid not in normalized:
                missing_keys.append(pid)
                labels.append(None)
                continue
            labels.append(_resolve(normalized[pid], f"id {pid}"))
    elif inferred_by == 'id_begin':
        ids = trans_df['id'].to_list()
        begins = trans_df['begin'].to_list()
        for pid, b in zip(ids, begins):
            key = (pid, b)
            if key not in normalized:
                missing_keys.append(key)
                labels.append(None)
                continue
            labels.append(_resolve(normalized[key], f"(id, begin) {key}"))
    else:
        raise ValueError("Parameter 'by' must be one of {'id', 'id_begin'}")

    # All transition rows must be covered; report a readable sample otherwise.
    if missing_keys:
        sample = missing_keys[:5]
        raise ValueError(
            f"Missing {len(missing_keys)} keys in mapping for transitions from '{spell}'. "
            f"Examples: {sample}. You can provide (id, begin) or id mappings, "
            f"and use cluster_to_name to map cluster ids to names."
        )

    return np.asarray(labels, dtype=object)
633
+
634
+
635
def set_typology(
    samm: SAMM,
    spell: str,
    typology: Union[pd.Series, np.ndarray, list, None] = None,
    *,
    clusters: Optional[Union[pd.Series, np.ndarray, list]] = None,
    cluster_to_name: Optional[Dict] = None,
    mapping: Optional[Union[Dict, pd.Series]] = None,
    by: Optional[str] = None
) -> SAMM:
    """
    Attach a typology classification to the transitions out of one state.

    A typology groups post-transition patterns into named categories
    (e.g. after unemployment: "quick reemployment", "long-term unemployment",
    "exit labor force"), making them usable as outcomes in later analysis.

    Exactly one of three input forms must be supplied:

    1. ``typology``: final labels, one per transition row (row-aligned).
    2. ``clusters`` (+ optional ``cluster_to_name``): cluster ids, one per
       transition row; ids are translated to names via ``cluster_to_name``,
       otherwise treated as labels already.
    3. ``mapping``: id -> cluster/label or (id, begin) -> cluster/label,
       expanded to the transition rows internally.

    Args:
        samm (SAMM): A SAMM object created by sequence_analysis_multi_state_model()
        spell (str): The state for which typologies are being set
        typology (array-like, optional): Final labels for each transition row
            (length = n_transitions).
        clusters (array-like, optional): Cluster ids per transition row
            (length = n_transitions).
        cluster_to_name (dict, optional): Mapping cluster id -> label name.
        mapping (dict or pandas.Series, optional): id or (id, begin) keyed mapping.
        by (str, optional): 'id' or 'id_begin'; auto-detected when None.

    Returns:
        SAMM: The same SAMM object with its 'typology' column filled in for
        the matching transition rows (the object is modified in place).

    Raises:
        ValueError: On length mismatches, missing cluster names, or when none
            of the three input forms is provided.

    Example:
        >>> samm_obj = set_typology(samm_obj, spell='unemployed', typology=my_typology)
    """
    # The rows that receive a label: in the spell AND a transition happens there.
    condition = (samm.data['s.1'] == spell) & (samm.data['transition'] == True)
    n_transitions = int(condition.sum())

    labels_array: Optional[np.ndarray] = None

    if typology is not None:
        # Form 1: caller supplies final labels directly, row-aligned.
        if isinstance(typology, pd.Series):
            labels_array = typology.values
        else:
            labels_array = np.asarray(typology, dtype=object)
        if len(labels_array) != n_transitions:
            raise ValueError(
                f"Length mismatch: provided length {len(labels_array)} but there are {n_transitions} "
                f"transitions from state '{spell}'. You should provide a typology vector of length n_transitions "
                f"(one label per transition row), not a list of unique type names. Use clusters+cluster_to_name "
                f"or mapping parameters instead."
            )
    elif clusters is not None:
        # Form 2: row-aligned cluster ids, optionally translated to names.
        clusters_array = clusters.values if isinstance(clusters, pd.Series) else np.asarray(clusters)
        if len(clusters_array) != n_transitions:
            raise ValueError(
                f"Length mismatch: clusters length {len(clusters_array)} must match n_transitions={n_transitions}"
            )
        if cluster_to_name is None:
            # No translation table: treat the cluster values as labels themselves.
            labels_array = clusters_array.astype(object)
        else:
            try:
                labels_array = np.asarray([cluster_to_name[c] for c in clusters_array], dtype=object)
            except KeyError as e:
                raise ValueError(f"cluster_to_name is missing key {e.args[0]!r}")
    elif mapping is not None:
        # Form 3: id-keyed (or (id, begin)-keyed) mapping, expanded per row.
        labels_array = _expand_typology_for_transitions(
            samm=samm, spell=spell, mapping=mapping, by=by, cluster_to_name=cluster_to_name
        )
    else:
        raise ValueError(
            "You must provide one of: typology (row-aligned), clusters+cluster_to_name (row-aligned), "
            "or mapping (id or (id, begin) to cluster/label)."
        )

    # Write the labels onto the matching transition rows.
    samm.data.loc[condition, 'typology'] = labels_array

    return samm
734
+
735
+
736
def seqsammeha(
    samm: SAMM,
    spell: str,
    typology: Union[pd.Series, np.ndarray, list, None] = None,
    *,
    clusters: Optional[Union[pd.Series, np.ndarray, list]] = None,
    cluster_to_name: Optional[Dict] = None,
    mapping: Optional[Union[Dict, pd.Series]] = None,
    by: Optional[str] = None,
    persper: bool = True
) -> pd.DataFrame:
    """
    Build an Event History Analysis (EHA) dataset with typology outcomes.

    Event history analysis studies the timing and nature of events, e.g.
    "what predicts returning to work after unemployment?". This function
    assigns typologies to the transitions out of ``spell`` (delegating to
    :func:`set_typology`) and then shapes the spell rows into a dataset
    ready for logistic regression, Cox models, etc.

    **Person-period vs. spell-level:**
    - ``persper=True``: one row per time point within the spell (supports
      time-varying effects and duration dependence).
    - ``persper=False``: one row per spell, keeping only its last observation.

    Args:
        samm (SAMM): A SAMM object created by sequence_analysis_multi_state_model()
        spell (str): The state being analyzed (e.g., 'unemployed', 'single')
        typology (array-like, optional): Final labels per transition row.
        clusters (array-like, optional): Cluster ids per transition row.
        cluster_to_name (dict, optional): Cluster id -> label name.
        mapping (dict or pandas.Series, optional): id or (id, begin) keyed mapping.
        by (str, optional): 'id' or 'id_begin'; auto-detected when None.
        persper (bool): Person-period output if True, spell-level if False.

    Returns:
        pd.DataFrame: All original SAMM variables plus:
            - SAMMtypology: the typology classification (NaN for non-events)
            - lastobs: True on the final observation of each spell
            - SAMM<type>: one binary indicator column per typology category

    Example:
        >>> eha_data = seqsammeha(samm_obj, spell='unemployed', typology=typology, persper=True)
    """
    # Step 1: write the typology onto the SAMM object (modifies samm.data in place).
    samm = set_typology(
        samm,
        spell=spell,
        typology=typology,
        clusters=clusters,
        cluster_to_name=cluster_to_name,
        mapping=mapping,
        by=by
    )

    # Step 2: keep only the rows belonging to the requested spell.
    in_spell = samm.data['s.1'] == spell
    ppdata = samm.data[in_spell].copy()

    # Step 3: flag each spell's final observation. A spell is identified by
    # (id, begin); its last row carries the maximal spell_time in the group.
    max_spell_time = ppdata.groupby(['id', 'begin'])['spell_time'].transform('max')
    ppdata['lastobs'] = max_spell_time == ppdata['spell_time']

    # Step 4: derive one binary indicator per typology category, taking the
    # category set from the transition rows where the typology was just set.
    event_rows = in_spell & (samm.data['transition'] == True)
    observed_types = samm.data.loc[event_rows, 'typology'].dropna().unique()

    for type_label in observed_types:
        ppdata[f'SAMM{type_label}'] = (ppdata['typology'] == type_label).astype(int)

    # Step 5: expose the typology under its conventional EHA column name.
    ppdata = ppdata.rename(columns={'typology': 'SAMMtypology'})

    # Step 6: optionally collapse to spell-level data (last observation only).
    if not persper:
        ppdata = ppdata[ppdata['lastobs']].copy()

    return ppdata.reset_index(drop=True)
834
+
835
+
836
# Define what gets imported with "from module import *".
# Fix: the underscore-prefixed helper `_expand_typology_for_transitions` was
# previously exported here; per PEP 8, leading-underscore names are internal
# API and do not belong in `__all__`. Direct imports of the helper still work.
__all__ = [
    'SAMM',
    'sequence_analysis_multi_state_model',
    'plot_samm',
    'seqsammseq',
    'set_typology',
    'seqsammeha',
    # Keep old names for backward compatibility
    'seqsamm',
]

# Backward compatibility alias for the original (TraMineR-style) function name.
seqsamm = sequence_analysis_multi_state_model