sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,651 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : plot_state_distribution.py
4
+ @Time : 15/02/2025 22:03
5
+ @Desc :
6
+ """
7
+ import numpy as np
8
+ import pandas as pd
9
+ import matplotlib.pyplot as plt
10
+
11
+ from sequenzo import SequenceData
12
+ from sequenzo.visualization.utils import (
13
+ set_up_time_labels_for_x_axis,
14
+ save_figure_to_buffer,
15
+ create_standalone_legend,
16
+ combine_plot_with_legend,
17
+ save_and_show_results,
18
+ determine_layout,
19
+ show_plot_title,
20
+ show_group_title
21
+ )
22
+
23
+
24
def smart_sort_groups(groups):
    """
    Order group names so that labels starting with digits come first, in
    numeric order, followed by every other label in plain string order.

    Each item is converted with ``str()`` before inspection, so non-string
    group values (e.g. integers) are accepted. Items sharing the same numeric
    prefix are ordered by their full string form.

    :param groups: Iterable of group names
    :return: New sorted list of group names (the input is not modified)
    """
    import re

    # Built once per call; the key function below reuses it for every item
    leading_digits = re.compile(r'^(\d+)')

    def _rank(name):
        text = str(name)
        found = leading_digits.match(text)
        if found is None:
            # No numeric prefix: rank after every numeric-prefixed label
            return (float('inf'), text)
        return (int(found.group(1)), text)

    return sorted(groups, key=_rank)
41
+
42
+
43
+ def plot_state_distribution(seqdata: SequenceData,
44
+ # Grouping parameters
45
+ group_by_column=None,
46
+ group_dataframe=None,
47
+ group_column_name=None,
48
+ group_labels=None,
49
+ # Other parameters
50
+ weights="auto",
51
+ figsize=(12, 7),
52
+ plot_style="standard",
53
+ title=None,
54
+ xlabel="Time",
55
+ ylabel="State Distribution (%)",
56
+ save_as=None,
57
+ dpi=200,
58
+ layout='column',
59
+ nrows: int = None,
60
+ ncols: int = None,
61
+ stacked=True,
62
+ show=True,
63
+ include_legend=True,
64
+ group_order=None,
65
+ fontsize=12,
66
+ sort_groups='auto',
67
+ show_group_titles: bool = True,
68
+ show_title: bool = True) -> None:
69
+ """
70
+ Creates state distribution plots for different groups, showing how state
71
+ prevalence changes over time within each group.
72
+
73
+ **Two API modes for grouping:**
74
+
75
+ 1. **Simplified API** (when grouping info is already in the data):
76
+ ```python
77
+ plot_state_distribution(seqdata, group_by_column="Cluster", group_labels=cluster_labels)
78
+ ```
79
+
80
+ 2. **Complete API** (when grouping info is in a separate dataframe):
81
+ ```python
82
+ plot_state_distribution(seqdata, group_dataframe=membership_df,
83
+ group_column_name="Cluster", group_labels=cluster_labels)
84
+ ```
85
+
86
+ :param seqdata: (SequenceData) A SequenceData object containing sequences
87
+
88
+ **Grouping parameters:**
89
+ :param group_by_column: (str, optional) Column name from seqdata.data to group by.
90
+ Use this when grouping information is already in your data.
91
+ Example: "Cluster", "sex", "education"
92
+ :param group_dataframe: (pd.DataFrame, optional) Separate dataframe containing grouping information.
93
+ Use this when grouping info is in a separate table (e.g., clustering results).
94
+ Must contain ID column and grouping column.
95
+ :param group_column_name: (str, optional) Name of the grouping column in group_dataframe.
96
+ Required when using group_dataframe.
97
+ :param group_labels: (dict, optional) Custom labels for group values.
98
+ Example: {1: "Late Family Formation", 2: "Early Partnership"}
99
+ Maps original values to display labels.
100
+ :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
101
+ :param figsize: (tuple) Size of the figure (only used when plot_style="custom")
102
+ :param plot_style: Plot aspect style:
103
+ - 'standard': Standard proportions (12, 7) - balanced view
104
+ - 'compact': Compact/vertical proportions (10, 8) - more vertical like R plots
105
+ - 'wide': Wide proportions (14, 5) - emphasizes time progression
106
+ - 'narrow': Narrow/tall proportions (9, 11) - moderately vertical
107
+ - 'custom': Use the provided figsize parameter
108
+ :param title: (str) Optional title for the plot
109
+ :param xlabel: (str) Label for the x-axis
110
+ :param ylabel: (str) Label for the y-axis
111
+ :param save_as: (str) Optional file path to save the plot
112
+ :param dpi: (int) Resolution of the saved plot
113
+ :param layout: (str) Layout style - 'column' (default, 3xn), 'grid' (nxn)
114
+ :param stacked: (bool) Whether to create stacked area plots (True) or line plots (False)
115
+ :param group_order: List, manually specify group order (overrides sort_groups)
116
+ :param sort_groups: String, sorting method: 'auto'(smart numeric), 'numeric'(numeric prefix), 'alpha'(alphabetical), 'none'(original order)
117
+ :param show_title: (bool, default: True) If False, suppresses the main title display even if title parameter is provided.
118
+ This allows you to control title visibility separately from providing a title string.
119
+
120
+ :return: None
121
+ """
122
+ # Determine figure size based on plot style
123
+ style_sizes = {
124
+ 'standard': (12, 7), # Balanced view
125
+ 'compact': (10, 8), # More square, like R plots
126
+ 'wide': (14, 5), # Wide, emphasizes time
127
+ 'narrow': (9, 11), # Moderately vertical
128
+ 'custom': figsize # User-provided
129
+ }
130
+
131
+ if plot_style not in style_sizes:
132
+ raise ValueError(f"Invalid plot_style '{plot_style}'. "
133
+ f"Supported styles: {list(style_sizes.keys())}")
134
+
135
+ # Special validation for custom plot style
136
+ if plot_style == 'custom' and figsize == (12, 7):
137
+ raise ValueError(
138
+ "When using plot_style='custom', you must explicitly provide a figsize parameter "
139
+ "that differs from the default (12, 7). "
140
+ "Suggested custom sizes:\n"
141
+ " - For wide plots: figsize=(16, 6)\n"
142
+ " - For tall plots: figsize=(8, 12)\n"
143
+ " - For square plots: figsize=(10, 10)\n"
144
+ " - For small plots: figsize=(8, 5)\n"
145
+ "Example: plot_state_distribution(data, plot_style='custom', figsize=(14, 9))"
146
+ )
147
+
148
+ actual_figsize = style_sizes[plot_style]
149
+
150
+ # Handle the simplified API: group_by_column
151
+ if group_by_column is not None:
152
+ # Validate that the column exists in the original data
153
+ if group_by_column not in seqdata.data.columns:
154
+ available_cols = [col for col in seqdata.data.columns if col not in seqdata.time and col != seqdata.id_col]
155
+ raise ValueError(
156
+ f"Column '{group_by_column}' not found in the data. "
157
+ f"Available columns for grouping: {available_cols}"
158
+ )
159
+
160
+ # Automatically create group_dataframe and group_column_name from the simplified API
161
+ group_dataframe = seqdata.data[[seqdata.id_col, group_by_column]].copy()
162
+ group_dataframe.columns = ['Entity ID', 'Category']
163
+ group_column_name = 'Category'
164
+
165
+ # Handle group labels - flexible and user-controllable
166
+ unique_values = seqdata.data[group_by_column].unique()
167
+
168
+ if group_labels is not None:
169
+ # User provided custom labels - use them
170
+ missing_keys = set(unique_values) - set(group_labels.keys())
171
+ if missing_keys:
172
+ raise ValueError(
173
+ f"group_labels missing mappings for values: {missing_keys}. "
174
+ f"Please provide labels for all unique values in '{group_by_column}': {sorted(unique_values)}"
175
+ )
176
+ group_dataframe['Category'] = group_dataframe['Category'].map(group_labels)
177
+ else:
178
+ # No custom labels provided - use smart defaults
179
+ if all(isinstance(v, (int, float, np.integer, np.floating)) and not pd.isna(v) for v in unique_values):
180
+ # Numeric values - keep as is (user can provide group_labels if they want custom names)
181
+ pass
182
+ # For string/categorical values, keep original values
183
+ # This handles cases where users already have meaningful labels like "Male"/"Female"
184
+
185
+ print(f"[>] Creating grouped plots by '{group_by_column}' with {len(unique_values)} categories")
186
+
187
+ # If no grouping information, create a single plot
188
+ if group_dataframe is None or group_column_name is None:
189
+ return _plot_state_distribution_single(
190
+ seqdata=seqdata, weights=weights, figsize=actual_figsize,
191
+ plot_style=plot_style, title=title, xlabel=xlabel, ylabel=ylabel,
192
+ save_as=save_as, dpi=dpi, stacked=stacked,
193
+ show=show, include_legend=include_legend, fontsize=fontsize,
194
+ show_title=show_title
195
+ )
196
+
197
+ # Process weights
198
+ if isinstance(weights, str) and weights == "auto":
199
+ weights = getattr(seqdata, "weights", None)
200
+
201
+ if weights is not None:
202
+ weights = np.asarray(weights, dtype=float).reshape(-1)
203
+ if len(weights) != len(seqdata.values):
204
+ raise ValueError("Length of weights must equal number of sequences.")
205
+
206
+ # Ensure ID columns match (convert if needed)
207
+ id_col_name = "Entity ID" if "Entity ID" in group_dataframe.columns else group_dataframe.columns[0]
208
+
209
+ # Apply group_labels if provided (for group_dataframe API)
210
+ if group_labels is not None and group_column_name in group_dataframe.columns:
211
+ # Validate that all values in the group column have labels
212
+ unique_values = group_dataframe[group_column_name].unique()
213
+ missing_keys = set(unique_values) - set(group_labels.keys())
214
+ if missing_keys:
215
+ raise ValueError(
216
+ f"group_labels missing mappings for values: {missing_keys}. "
217
+ f"Please provide labels for all unique values in '{group_column_name}': {sorted(unique_values)}"
218
+ )
219
+ # Apply the labels mapping
220
+ group_dataframe = group_dataframe.copy() # Avoid modifying original
221
+ group_dataframe[group_column_name] = group_dataframe[group_column_name].map(group_labels)
222
+
223
+ # Get unique groups and sort them based on user preference
224
+ if group_order:
225
+ # Use manually specified order, filter out non-existing groups
226
+ groups = [g for g in group_order if g in group_dataframe[group_column_name].unique()]
227
+ missing_groups = [g for g in group_dataframe[group_column_name].unique() if g not in group_order]
228
+ if missing_groups:
229
+ print(f"[Warning] Groups not in group_order will be excluded: {missing_groups}")
230
+ elif group_labels is not None:
231
+ # If group_labels is provided, use its key order to determine groups order
232
+ # This ensures subplot order matches the order in group_labels dictionary
233
+ # Note: group_labels keys are original values, values are labels (which become groups)
234
+ mapped_labels = []
235
+ available_labels = set(group_dataframe[group_column_name].unique())
236
+
237
+ # Iterate through group_labels in order (Python 3.7+ dicts maintain insertion order)
238
+ for original_key, label_value in group_labels.items():
239
+ # Check if this label exists in the mapped dataframe
240
+ if label_value in available_labels:
241
+ mapped_labels.append(label_value)
242
+
243
+ # Also check for any labels in dataframe that weren't in group_labels
244
+ missing_in_labels = available_labels - set(mapped_labels)
245
+ if missing_in_labels:
246
+ print(f"[Warning] Some groups in data are not in group_labels and will be excluded: {missing_in_labels}")
247
+
248
+ groups = mapped_labels
249
+ elif sort_groups == 'numeric' or sort_groups == 'auto':
250
+ groups = smart_sort_groups(group_dataframe[group_column_name].unique())
251
+ elif sort_groups == 'alpha':
252
+ groups = sorted(group_dataframe[group_column_name].unique())
253
+ elif sort_groups == 'none':
254
+ groups = list(group_dataframe[group_column_name].unique())
255
+ else:
256
+ raise ValueError(f"Invalid sort_groups value: {sort_groups}. Use 'auto', 'numeric', 'alpha', or 'none'.")
257
+
258
+ num_groups = len(groups)
259
+
260
+ # Calculate figure size and layout based on number of groups and specified layout
261
+ nrows, ncols = determine_layout(num_groups, layout=layout, nrows=nrows, ncols=ncols)
262
+
263
+ fig, axes = plt.subplots(
264
+ nrows=nrows,
265
+ ncols=ncols,
266
+ figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows),
267
+ gridspec_kw={'wspace': 0.15, 'hspace': 0.25} # Reduced spacing for tighter layout
268
+ )
269
+ axes = axes.flatten()
270
+
271
+ # Create state mapping from numerical values back to state names
272
+ inv_state_mapping = {v: k for k, v in seqdata.state_mapping.items()}
273
+
274
+ # Process each group
275
+ for i, group in enumerate(groups):
276
+ # Get IDs for this group
277
+ group_ids = group_dataframe[group_dataframe[group_column_name] == group][id_col_name].values
278
+
279
+ # Match IDs with sequence data
280
+ mask = np.isin(seqdata.ids, group_ids)
281
+ if not np.any(mask):
282
+ print(f"Warning: No matching sequences found for group '{group}'")
283
+ continue
284
+
285
+ # Get sequences for this group
286
+ group_seq_df = seqdata.to_dataframe().loc[mask]
287
+
288
+ # Get weights for this group
289
+ if weights is None:
290
+ w = np.ones(len(group_seq_df))
291
+ else:
292
+ w = np.asarray(weights)[mask]
293
+
294
+ # Broadcast weights to each time point
295
+ W = np.repeat(w[:, None], group_seq_df.shape[1], axis=1)
296
+
297
+ # Calculate weighted state distributions at each time point
298
+ distributions = []
299
+ for t, col in enumerate(group_seq_df.columns):
300
+ col_vals = group_seq_df[col].to_numpy()
301
+
302
+ # Calculate weighted counts for each state
303
+ sums = {s: float(W[col_vals == s, t].sum()) for s in range(1, len(seqdata.states)+1)}
304
+ totw = float(W[:, t].sum())
305
+
306
+ # Convert to weighted percentages
307
+ dist = {inv_state_mapping.get(s, 'Missing'): 100.0 * (sums[s] / totw if totw > 0 else 0.0)
308
+ for s in range(1, len(seqdata.states) + 1)}
309
+
310
+ # Add time point and distribution to the list
311
+ distributions.append(dict({"time": col, **{str(k): v for k, v in dist.items()}}))
312
+
313
+ # Ensure percentages sum to exactly 100% to avoid gaps
314
+ for j in range(len(distributions)):
315
+ total_percentage = sum(distributions[j][str(state)] for state in seqdata.states)
316
+ if total_percentage < 100:
317
+ top_state = str(seqdata.states[-1])
318
+ distributions[j][str(top_state)] += (100 - total_percentage)
319
+
320
+ # Convert to DataFrame for plotting
321
+ dist_df = pd.DataFrame(distributions)
322
+
323
+ # Plot on the corresponding axis
324
+ ax = axes[i]
325
+
326
+ # Get colors for each state
327
+ # seqdata.states are integer encodings (e.g., 1, 2, ...)
328
+ # seqdata.state_mapping[state] maps integers to labels (e.g., 'Married', 'Single')
329
+ # seqdata.color_map[...] gets color by label
330
+ base_colors = [seqdata.color_map[seqdata.state_mapping[state]] for state in seqdata.states]
331
+
332
+ # Plot the data
333
+ if stacked:
334
+ # Create a stacked area plot
335
+ ax.stackplot(range(len(dist_df)),
336
+ [dist_df[str(state)] for state in seqdata.states],
337
+ labels=seqdata.labels,
338
+ colors=base_colors,
339
+ alpha=1.0)
340
+
341
+ # Add grid lines behind the stack plot
342
+ ax.grid(axis='y', linestyle='-', alpha=0.2)
343
+ ax.set_axisbelow(True)
344
+ else:
345
+ # Create a line plot
346
+ for state, label, color in zip(seqdata.states, seqdata.labels, base_colors):
347
+ ax.plot(range(len(dist_df)), dist_df[str(state)],
348
+ label=label, color=color,
349
+ linewidth=2.5, marker='o', markersize=5)
350
+
351
+ # Add grid lines
352
+ ax.grid(True, linestyle='-', alpha=0.2)
353
+
354
+ # Set group title with weighted sample size
355
+ # Check if we have effective weights (not all 1.0) and they were provided by user
356
+ original_weights = getattr(seqdata, "weights", None)
357
+ if original_weights is not None and not np.allclose(original_weights, 1.0):
358
+ sum_w = float(w.sum())
359
+ group_title = f"{group} (n = {len(group_seq_df)}, total weight = {sum_w:.1f})"
360
+ else:
361
+ group_title = f"{group} (n = {len(group_seq_df)})"
362
+ if show_group_titles:
363
+ show_group_title(ax, group_title, show=True, fontsize=fontsize)
364
+
365
+ # Set y-axis limits from 0 to 100%
366
+ ax.set_ylim(0, 100)
367
+
368
+ # Clean up axis aesthetics
369
+ ax.spines['top'].set_visible(False)
370
+ ax.spines['right'].set_visible(False)
371
+ ax.spines['left'].set_color('gray')
372
+ ax.spines['bottom'].set_color('gray')
373
+ ax.spines['left'].set_linewidth(0.7)
374
+ ax.spines['bottom'].set_linewidth(0.7)
375
+
376
+ # Move spines slightly away from the plot area for better aesthetics (same as index plot)
377
+ ax.spines['left'].set_position(('outward', 5))
378
+ ax.spines['bottom'].set_position(('outward', 5))
379
+
380
+ ax.tick_params(axis='x', colors='gray', length=4, width=0.7)
381
+ ax.tick_params(axis='y', colors='gray', length=4, width=0.7)
382
+
383
+ # Set x-axis labels
384
+ set_up_time_labels_for_x_axis(seqdata, ax)
385
+
386
+ # Set x-axis range to prevent over-extension like in the reference image
387
+ ax.set_xlim(-0.5, len(seqdata.cleaned_time) - 0.5)
388
+
389
+ # Add axis labels
390
+ if i % ncols == 0:
391
+ ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=10, color='black')
392
+
393
+ # if i >= num_groups - ncols:
394
+ ax.set_xlabel(xlabel, fontsize=fontsize, labelpad=10, color='black')
395
+
396
+ # Hide unused subplots
397
+ for j in range(i + 1, len(axes)):
398
+ axes[j].set_visible(False)
399
+
400
+ # Add a common title if provided and show_title is True
401
+ if title and show_title:
402
+ fig.suptitle(title, fontsize=fontsize+2, y=1.02)
403
+
404
+ # Adjust layout to remove tight_layout warning and eliminate extra right space
405
+ fig.subplots_adjust(wspace=0.15, hspace=0.25, bottom=0.1, top=0.9, right=0.98, left=0.08)
406
+
407
+ # Save main figure to memory
408
+ main_buffer = save_figure_to_buffer(fig, dpi=dpi)
409
+
410
+ if include_legend:
411
+ # Create standalone legend
412
+ colors = seqdata.color_map_by_label
413
+ legend_buffer = create_standalone_legend(
414
+ colors=colors,
415
+ labels=seqdata.labels,
416
+ ncol=min(5, len(seqdata.states)),
417
+ figsize=(actual_figsize[0] * ncols, 1),
418
+ fontsize=fontsize-2,
419
+ dpi=dpi
420
+ )
421
+
422
+ # Combine plot with legend
423
+ if save_as and not save_as.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
424
+ save_as = save_as + '.png'
425
+
426
+ combined_img = combine_plot_with_legend(
427
+ main_buffer,
428
+ legend_buffer,
429
+ output_path=save_as,
430
+ dpi=dpi,
431
+ padding=20
432
+ )
433
+
434
+ # Display combined image
435
+ plt.figure(figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows + 1))
436
+ plt.imshow(combined_img)
437
+ plt.axis('off')
438
+ if show or save_as: # Show if displaying or saving is needed
439
+ plt.show()
440
+ plt.close()
441
+ else:
442
+ # Display plot without legend
443
+ if save_as and not save_as.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
444
+ save_as = save_as + '.png'
445
+
446
+ # Save or show the main plot directly
447
+ plt.figure(figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows))
448
+ plt.imshow(main_buffer)
449
+ plt.axis('off')
450
+
451
+ if save_as:
452
+ plt.savefig(save_as, dpi=dpi, bbox_inches='tight')
453
+ if show:
454
+ plt.show()
455
+ plt.close()
456
+
457
+ # No longer return fig to avoid duplicate rendering by environment
458
+ return None
459
+ # return fig
460
+
461
+
462
def _plot_state_distribution_single(seqdata: SequenceData,
                                    weights="auto",
                                    figsize=(12, 7),
                                    plot_style="standard",
                                    title=None,
                                    xlabel="Time",
                                    ylabel="State Distribution (%)",
                                    stacked=True,
                                    save_as=None,
                                    dpi=200,
                                    show=False,
                                    include_legend=True,
                                    fontsize=12,
                                    show_title=True) -> None:
    """
    Draw a single state distribution plot: the (optionally weighted) percentage of
    sequences in each state at every time point, as a stacked area or a line plot.

    :param seqdata: (SequenceData) A SequenceData object containing sequences
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses
        seqdata.weights if that attribute exists; if no weights are available,
        every sequence counts with weight 1
    :param figsize: (tuple) Size of the figure (only used when plot_style="custom")
    :param plot_style: Plot aspect style ('standard', 'compact', 'wide', 'narrow', 'custom')
    :param title: (str) Optional title for the plot
    :param xlabel: (str) Label for the x-axis
    :param ylabel: (str) Label for the y-axis
    :param stacked: (bool) Whether to create a stacked area plot (True) or line plot (False)
    :param save_as: (str) Optional file path, forwarded to save_and_show_results
    :param dpi: (int) Resolution, forwarded to save_and_show_results
    :param show: (bool) Forwarded to save_and_show_results
    :param include_legend: (bool) Whether to draw the legend to the right of the axes
    :param fontsize: (int) Font size of the axis labels; the title uses fontsize + 2
        and the legend fontsize - 2
    :param show_title: (bool) The title is drawn only if this is True and title is set

    :raises ValueError: if plot_style is unknown, if plot_style='custom' is used with
        the default figsize, or if the number of weights differs from the number of
        sequences

    :return: None
    """
    # Determine figure size based on plot style
    style_sizes = {
        'standard': (12, 7),   # Balanced view
        'compact': (10, 8),    # More square, like R plots
        'wide': (14, 5),       # Wide, emphasizes time
        'narrow': (9, 11),     # Moderately vertical
        'custom': figsize      # User-provided
    }

    if plot_style not in style_sizes:
        raise ValueError(f"Invalid plot_style '{plot_style}'. "
                         f"Supported styles: {list(style_sizes.keys())}")

    # 'custom' with the default figsize is treated as "figsize was not provided"
    if plot_style == 'custom' and figsize == (12, 7):
        raise ValueError(
            "When using plot_style='custom', you must explicitly provide a figsize parameter "
            "that differs from the default (12, 7). "
            "Suggested custom sizes:\n"
            "  - For wide plots: figsize=(16, 6)\n"
            "  - For tall plots: figsize=(8, 12)\n"
            "  - For square plots: figsize=(10, 10)\n"
            "  - For small plots: figsize=(8, 5)\n"
            "Example: plot_state_distribution(data, plot_style='custom', figsize=(14, 9))"
        )

    actual_figsize = style_sizes[plot_style]

    # Resolve and validate weights
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    # Get sequence data as a DataFrame (one column per time point)
    seq_df = seqdata.to_dataframe()

    # One weight per sequence; unweighted data counts every sequence once
    w = np.ones(len(seq_df)) if weights is None else weights

    # The weights do not vary over time, so the per-column denominator is constant
    total_weight = float(w.sum())

    # Map the numeric codes found in the DataFrame back to state names
    inv_state_mapping = {v: k for k, v in seqdata.state_mapping.items()}
    state_codes = range(1, len(seqdata.states) + 1)
    state_keys = [str(state) for state in seqdata.states]

    # Calculate weighted state distributions at each time point
    distributions = []
    for col in seq_df.columns:
        col_vals = seq_df[col].to_numpy()

        row = {"time": col}
        for code in state_codes:
            weighted_count = float(w[col_vals == code].sum())
            share = weighted_count / total_weight if total_weight > 0 else 0.0
            row[str(inv_state_mapping.get(code, 'Missing'))] = 100.0 * share

        # Ensure percentages sum to exactly 100% to avoid gaps:
        # any shortfall is added to the last (top-most) state of the stack
        total_percentage = sum(row[key] for key in state_keys)
        if total_percentage < 100:
            row[state_keys[-1]] += (100 - total_percentage)

        distributions.append(row)

    # Convert to DataFrame for plotting
    dist_df = pd.DataFrame(distributions)

    # Create the plot
    plt.style.use('default')  # Start with default style for clean slate
    _, ax = plt.subplots(figsize=actual_figsize)

    # One color per state, looked up through the state's numeric code
    base_colors = [seqdata.color_map[seqdata.state_mapping[state]] for state in seqdata.states]
    x_positions = range(len(dist_df))

    # Plot the data
    if stacked:
        # Stacked area plot at full opacity
        ax.stackplot(x_positions,
                     [dist_df[key] for key in state_keys],
                     labels=seqdata.labels,
                     colors=base_colors,
                     alpha=1.0)

        # Add grid lines behind the stack plot
        ax.grid(axis='y', linestyle='-', alpha=0.2)
        ax.set_axisbelow(True)
    else:
        # Line plot; legend entries use seqdata.labels so that both plot modes
        # show the same legend text
        for key, label, color in zip(state_keys, seqdata.labels, base_colors):
            ax.plot(x_positions, dist_df[key],
                    label=label, color=color,
                    linewidth=2.5, marker='o', markersize=5)

        # Add grid lines
        ax.grid(True, linestyle='-', alpha=0.2)

    # Set axis labels and title
    ax.set_xlabel(xlabel, fontsize=fontsize, labelpad=10)
    ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=10)

    if title and show_title:
        ax.set_title(title, fontsize=fontsize+2, fontweight='bold', pad=20)

    # Set x-axis labels based on time points
    set_up_time_labels_for_x_axis(seqdata, ax)

    # Set x-axis range to prevent over-extension
    ax.set_xlim(-0.5, len(seqdata.cleaned_time) - 0.5)

    # Enhance aesthetics
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('gray')
    ax.spines['bottom'].set_color('gray')
    ax.spines['left'].set_linewidth(0.7)
    ax.spines['bottom'].set_linewidth(0.7)

    # Move spines slightly away from the plot area for better aesthetics (same as index plot)
    ax.spines['left'].set_position(('outward', 5))
    ax.spines['bottom'].set_position(('outward', 5))

    # Ensure ticks are visible and styled consistently
    ax.tick_params(axis='x', colors='gray', length=4, width=0.7, which='major')
    ax.tick_params(axis='y', colors='gray', length=4, width=0.7, which='major')

    # Set y-axis limits from 0 to 100%
    ax.set_ylim(0, 100)

    # Add legend to the right of the axes
    if include_legend:
        ax.legend(loc='center left', bbox_to_anchor=(1.01, 0.5),
                  frameon=False, fontsize=fontsize-2)

    # Adjust layout to make room for the legend
    plt.tight_layout()

    save_and_show_results(save_as, dpi=dpi, show=show)

    # The figure is deliberately not returned, to avoid duplicate rendering by the environment
    return None
650
+
651
+