sequenzo 0.1.21__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (260) hide show
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,194 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : plot_mean_time.py
4
+ @Time : 14/02/2025 10:12
5
+ @Desc :
6
+ Implementation of Mean Time Plot for social sequence analysis,
7
+ closely following ggseqplot's `ggseqmtplot` function,
8
+ and TraMineR's `plot.stslist.meant.Rd` for mean time calculation.
9
+ """
10
+ import numpy as np
11
+ import pandas as pd
12
+ import matplotlib.pyplot as plt
13
+ from typing import Optional
14
+ from sequenzo.define_sequence_data import SequenceData
15
+ from sequenzo.visualization.utils import (
16
+ save_and_show_results,
17
+ show_plot_title
18
+ )
19
+
20
+
21
def _compute_mean_time(seqdata: SequenceData, weights="auto") -> pd.DataFrame:
    """
    Compute the (weighted) mean total time spent in each state across sequences.

    For every state code, the number of positions each sequence spends in that
    state is counted; the weighted average of those per-sequence counts is the
    mean time. The standard error is reported as
    sqrt(weighted variance of the counts / number of sequences).

    :param seqdata: SequenceData object containing sequence information
    :param weights: (np.ndarray, None or "auto") Weights for sequences. If "auto",
        uses seqdata.weights if available; if None, every sequence has weight 1
    :return: DataFrame with columns 'State', 'MeanTime' and 'StandardError',
        one row per state, sorted by ascending 'MeanTime'
    :raises ValueError: if the number of weights differs from the number of sequences
    """
    # Resolve the weights: "auto" falls back to the weights stored on seqdata (if any)
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    seq_df = seqdata.to_dataframe()
    # state_mapping goes state -> numeric code; invert it to label the result rows
    code_to_state = {code: state for state, code in seqdata.state_mapping.items()}
    # States are addressed by their numeric codes 1..K
    # NOTE(review): assumes to_dataframe() holds these numeric codes - confirm
    state_codes = list(range(1, len(seqdata.states) + 1))
    n_sequences = len(seq_df)

    # Unweighted case: every sequence counts once
    w = np.ones(n_sequences) if weights is None else weights

    mean_times = {}
    standard_errors = {}
    for code in state_codes:
        # Time spent in this state by each sequence (summed across positions)
        state_counts = (seq_df == code).sum(axis=1)

        # Weighted mean across sequences; guard the empty case, where
        # np.average would fail
        mean_time = np.average(state_counts, weights=w) if n_sequences > 0 else 0.0
        mean_times[code] = mean_time

        # A standard error is only meaningful with more than one sequence
        if n_sequences > 1:
            weighted_var = np.average((state_counts - mean_time) ** 2, weights=w)
            # The comparison is also False for NaN, so such values map to 0.0
            standard_errors[code] = np.sqrt(weighted_var / n_sequences) if weighted_var >= 0 else 0.0
        else:
            standard_errors[code] = 0.0

    mean_time_df = pd.DataFrame({
        'State': [code_to_state[code] for code in state_codes],
        'MeanTime': [mean_times[code] for code in state_codes],
        'StandardError': [standard_errors[code] for code in state_codes]
    })

    mean_time_df.sort_values(by='MeanTime', ascending=True, inplace=True)

    return mean_time_df
98
+
99
+
100
def plot_mean_time(seqdata: SequenceData,
                   weights="auto",
                   show_error_bar: bool = True,
                   title=None,
                   x_label="Mean Time",
                   y_label="State",
                   fontsize: int = 12,
                   save_as: Optional[str] = None,
                   dpi: int = 200) -> None:
    """
    Plot Mean Time Plot for sequence data with clean white background.

    Draws one horizontal bar per state whose length is the mean time spent in
    that state, ordered by ascending mean time, optionally with error bars.

    :param seqdata: SequenceData object containing sequence information
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
    :param show_error_bar: Boolean flag to show or hide error bars
    :param title: Optional title for the plot
    :param x_label: Label for the x-axis
    :param y_label: Label for the y-axis
    :param fontsize: Base font size; tick labels and the footnote use fontsize-2,
        the title uses fontsize+2
    :param save_as: Optional file path to save the plot
    :param dpi: Resolution of the saved plot
    """
    # Use default style as base
    plt.style.use('default')

    # Mean time and standard error per state, sorted by ascending mean time
    mean_time_df = _compute_mean_time(seqdata, weights)

    plt.figure(figsize=(12, 7))
    ax = plt.subplot(111)

    # One colour per state, taken from the sequence data's own colormap
    cmap = seqdata.get_colormap()
    palette = [cmap.colors[i] for i in range(len(seqdata.states))]

    # Look each state's colour up through its numeric code (1..K) rather than
    # through alphabetical category codes, so that a state keeps its own colour
    # regardless of how the labels happen to sort.
    # NOTE(review): assumes cmap.colors is ordered by state code - confirm
    bar_colors = [palette[seqdata.state_mapping[state] - 1]
                  for state in mean_time_df['State']]

    # Draw the bars bottom-up in the sorted order of mean_time_df
    positions = range(len(mean_time_df))
    for pos, mean_time, color in zip(positions, mean_time_df['MeanTime'], bar_colors):
        ax.barh(y=pos, width=mean_time, height=0.7,
                color=color, edgecolor='white', linewidth=0.5)

    # Set y-axis ticks and labels
    ax.set_yticks(positions)
    ax.set_yticklabels(mean_time_df['State'], fontsize=fontsize-2)

    # Add error bars if needed
    if show_error_bar:
        ax.errorbar(
            x=mean_time_df["MeanTime"],
            y=positions,
            xerr=mean_time_df["StandardError"],
            fmt='none',
            ecolor='black',
            capsize=3,
            capthick=1,
            elinewidth=1.5
        )

    # Set plot properties
    if title:
        show_plot_title(ax, title, show=True, fontsize=fontsize+2, fontweight='bold', pad=20)
    ax.set_xlabel(x_label, fontsize=fontsize)
    ax.set_ylabel(y_label, fontsize=fontsize, labelpad=15)

    # Clean white background with light grid
    ax.set_facecolor('white')
    ax.grid(axis='x', color='#E0E0E0', linestyle='-', linewidth=0.5)
    ax.set_axisbelow(True)  # Place grid lines behind the bars

    # Customize borders
    for spine in ax.spines.values():
        spine.set_color('#CCCCCC')  # Light gray border
        spine.set_linewidth(0.5)

    # Adjust layout(1/2)
    plt.subplots_adjust(left=0.3)

    # Add a footnote when at least one bar is below 1% of the longest bar
    relative_threshold = 0.01
    max_val = mean_time_df['MeanTime'].max()
    has_tiny_bars = (mean_time_df['MeanTime'] < relative_threshold * max_val).any()
    if has_tiny_bars:
        norm_note = "Note: Some bars may appear as zero, but actually have small non-zero values."
        plt.figtext(0.5, -0.02, norm_note, ha='center', fontsize=fontsize-2, style='italic')

    # Adjust layout(2/2)
    plt.tight_layout()

    # Honour the caller's requested resolution instead of a fixed 200 dpi
    save_and_show_results(save_as, dpi=dpi)
194
+
@@ -0,0 +1,276 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : plot_modal_state.py
4
+ @Time : 01/03/2025 13:45
5
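+ @Desc : Modal state plot: the most frequent state at each time point and its relative frequency, optionally by group.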
6
+ """
7
+ import numpy as np
8
+ import pandas as pd
9
+ import matplotlib.pyplot as plt
10
+ from typing import Optional, Union, List
11
+ from sequenzo import SequenceData
12
+ from sequenzo.visualization.utils import (
13
+ set_up_time_labels_for_x_axis,
14
+ create_standalone_legend,
15
+ save_figure_to_buffer,
16
+ combine_plot_with_legend,
17
+ save_and_show_results,
18
+ show_plot_title
19
+ )
20
+ from PIL import Image
21
+
22
+
23
def _resolve_groups(seqdata, seq_df, group_by, group_labels):
    """Return ``(groups, group_labels)`` where ``groups`` is indexed like ``seq_df``."""
    if group_by is None:
        # No grouping: a single group containing every sequence
        groups = pd.Series(["All Sequences"] * len(seq_df), index=seq_df.index)
        if group_labels is None:
            group_labels = ["All Sequences"]
    elif isinstance(group_by, str):
        # Grouping by a column of the original data
        if group_by not in seqdata.data.columns:
            raise ValueError(f"Column '{group_by}' not found in sequence data")
        groups = seqdata.data[group_by]
        if group_labels is None:
            group_labels = sorted(groups.unique())
    else:
        # Grouping by an external array or Series
        if len(group_by) != len(seq_df):
            raise ValueError("Length of group_by must match number of sequences")
        if (isinstance(group_by, pd.Series)
                and group_by.index.is_unique
                and seq_df.index.is_unique
                and group_by.index.isin(seq_df.index).all()):
            # The Series carries the same labels as the data: align by label
            groups = group_by.reindex(seq_df.index)
        else:
            # Otherwise treat the values positionally. Wrapping them in a Series
            # with a default index would not align with seq_df and would break
            # boolean row selection.
            groups = pd.Series(np.asarray(group_by), index=seq_df.index)
        if group_labels is None:
            group_labels = sorted(set(groups))
    return groups, list(group_labels)


def _weighted_modal_code(codes, w, total_weight, n_states):
    """Return ``(modal_code, modal_freq)`` for one time point, or ``(None, 0.0)`` if there is no weight."""
    if total_weight <= 0:
        return None, 0.0
    # Weighted count for each integer state code 1..n_states
    sums = np.array([w[codes == code].sum() for code in range(1, n_states + 1)], dtype=float)
    # argmax returns the first maximum, so ties resolve to the lowest code
    best = int(np.argmax(sums))
    return best + 1, float(sums[best]) / total_weight


def plot_modal_state(seqdata: SequenceData,
                     group_by: Optional[Union[str, pd.Series, np.ndarray]] = None,
                     group_labels: Optional[List[str]] = None,
                     weights="auto",
                     xlabel: str = "Time",
                     ylabel: str = "Rel. Freq.",
                     fig_width: int = 12,
                     fig_height: Optional[int] = None,
                     show_counts: bool = True,
                     show_group_titles: bool = True,
                     fontsize: int = 12,
                     save_as: Optional[str] = None,
                     dpi: int = 200) -> None:
    """
    Creates a modal state frequency plot showing the most frequent state at each position
    and its relative frequency, with optional grouping by a categorical variable.

    :param seqdata: SequenceData object containing sequence information
    :param group_by: Column name of ``seqdata.data``, or an array/Series with one
        grouping value per sequence. A Series whose index holds the same labels as
        the data is aligned by label; any other input is used positionally.
    :param group_labels: Optional group values to plot (one subplot each, in this order)
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
    :param xlabel: Label for the x-axis
    :param ylabel: Label for the y-axis (drawn on the middle subplot only)
    :param fig_width: Width of the figure
    :param fig_height: Height of the figure (auto-calculated based on groups if None)
    :param show_counts: Whether to show the count of sequences in each group title
    :param show_group_titles: Whether to draw a title above each group's subplot
    :param fontsize: Base font size for titles and axis labels
    :param save_as: Optional file path to save the plot ('.png' is appended when
        the path has no recognised image extension)
    :param dpi: Resolution of the saved plot

    :raises ValueError: if weights or group_by have the wrong length, if the
        group_by column does not exist, or if there is no group to plot
    :return: None
    """
    # Process weights
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    # Get sequence data as a DataFrame, indexed like the original data so the
    # grouping variable lines up with the sequences
    seq_df = seqdata.to_dataframe()
    seq_df.index = seqdata.data.index

    w_all = np.ones(len(seq_df)) if weights is None else weights

    # Mapping from integer state codes back to state names
    inv_state_mapping = {v: k for k, v in seqdata.state_mapping.items()}

    groups, group_labels = _resolve_groups(seqdata, seq_df, group_by, group_labels)

    # Prepare plotting
    n_groups = len(group_labels)
    if n_groups == 0:
        raise ValueError("group_labels must contain at least one group.")
    n_time_points = len(seq_df.columns)
    n_states = len(seqdata.states)

    if fig_height is None:
        # Auto-calculate height based on number of groups
        fig_height = max(4, 3 * n_groups)

    # TODO: Title is not very pretty here so it was removed, but one extra unit of
    # height is kept so the subplots have enough vertical room.
    title_height = 1
    adjusted_fig_height = fig_height + title_height

    main_fig = plt.figure(figsize=(fig_width, adjusted_fig_height))
    plot_gs = main_fig.add_gridspec(nrows=n_groups, height_ratios=[1] * n_groups, hspace=0.3)

    # One axis per group, all sharing x and y scales with the first
    axes = [main_fig.add_subplot(plot_gs[i]) for i in range(n_groups)]
    for ax in axes[1:]:
        ax.sharex(axes[0])
        ax.sharey(axes[0])

    # Colors keyed by label; the numeric map is only a fallback.
    # NOTE(review): assumes seqdata.color_map is keyed by integer state code, as the
    # most-frequent-sequences plot uses it - confirm against SequenceData.
    colors = seqdata.color_map_by_label
    numeric_colors = getattr(seqdata, "color_map", None) or {}

    x = np.arange(n_time_points)
    bar_width = 0.8  # Fixed width for all bars
    weighted_title = weights is not None and not np.allclose(weights, 1.0)

    for i, group in enumerate(group_labels):
        ax = axes[i]

        # Positional mask, used for both the data rows and the weights so the
        # two can never get out of step
        mask = (groups == group).to_numpy()
        group_count = mask.sum()

        # Skip if no sequences in this group
        if group_count == 0:
            continue

        group_data = seq_df[mask]
        w = w_all[mask]
        totw = float(w.sum())  # loop-invariant across time points

        # Modal state code and its weighted relative frequency per time point
        modal = [
            _weighted_modal_code(group_data[col].to_numpy(), w, totw, n_states)
            for col in group_data.columns
        ]

        for pos, (code, freq) in enumerate(modal):
            if code is None:
                continue
            state = inv_state_mapping[code]
            # Prefer the label-keyed color; fall back to the numeric map when the
            # state name is not a label key instead of failing with KeyError
            color = colors[state] if state in colors else numeric_colors.get(code, "gray")
            ax.bar(x[pos], freq, width=bar_width, color=color,
                   edgecolor='white', linewidth=0.5)

        # Set group title with count if requested
        if show_group_titles:
            if show_counts:
                if weighted_title:
                    title_text = f"{group} (n={group_count}, total weight={totw:.1f})"
                else:
                    title_text = f"{group} (n={group_count})"
            else:
                title_text = group
            show_plot_title(ax, title_text, show=True, fontsize=fontsize, pad=15)

        # Set y-axis limits and ticks
        ax.set_ylim(0, 1.0)
        ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])

        # Light horizontal grid drawn behind the bars
        ax.grid(axis='y', color='#E0E0E0', linestyle='-', linewidth=0.5)
        ax.set_axisbelow(True)

        # Clean up borders
        for spine in ax.spines.values():
            spine.set_color('#CCCCCC')
            spine.set_linewidth(0.5)

        # Add y-label only for the middle subplot
        if i == n_groups // 2:
            ax.set_ylabel(ylabel, fontsize=fontsize)

    # Set up X-axis (time) labels on the bottom subplot
    set_up_time_labels_for_x_axis(seqdata, axes[-1])
    axes[-1].set_xlabel(xlabel, fontsize=fontsize, labelpad=10)

    # Render the main figure to memory, then release it so it is neither leaked
    # nor shown a second time by plt.show() below (a no-op if already closed)
    main_buffer = save_figure_to_buffer(main_fig, dpi=dpi)
    plt.close(main_fig)

    # Create standalone legend
    legend_buffer = create_standalone_legend(
        colors=colors,
        labels=seqdata.labels,
        ncol=min(5, len(seqdata.states)),
        figsize=(fig_width, 1),
        fontsize=fontsize-2,
        dpi=dpi
    )

    if save_as and not save_as.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
        save_as = save_as + '.png'

    # Combine main plot with legend
    combined_img = combine_plot_with_legend(
        main_buffer,
        legend_buffer,
        output_path=save_as,
        dpi=dpi,
        padding=20  # Increased padding between plot and legend
    )

    # Display combined image
    plt.figure(figsize=(fig_width, adjusted_fig_height + 1))
    plt.imshow(combined_img)
    plt.axis('off')
    plt.show()
    plt.close()
251
+
252
+
253
+
254
if __name__ == '__main__':
    # Demo: draw the modal state plot for one of the bundled datasets.
    # Import only the names used here instead of a wildcard import.
    from sequenzo import list_datasets, load_dataset

    # List all the available datasets in Sequenzo
    print('Available datasets in Sequenzo: ', list_datasets())

    # Load the data that we would like to explore in this tutorial
    # `df` is the short for `dataframe`, which is a common variable name for a dataset
    df = load_dataset('country_co2_emissions')

    # Define the time-span variable: every column after the first
    time = list(df.columns)[1:]

    states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']

    # Create a SequenceData object from the dataset
    sequence_data = SequenceData(df, time=time, time_type="year", id_col="country", states=states)

    plot_modal_state(sequence_data)
276
+
@@ -0,0 +1,147 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : plot_most_frequent_sequences.py
4
+ @Time : 12/02/2025 10:40
5
+ @Desc :
6
+ Generate sequence frequency plots.
7
+
8
+ This script plots the 10 most frequent sequences,
9
+ similar to `seqfplot` in R's TraMineR package.
10
+ """
11
+
12
+ import pandas as pd
13
+ import matplotlib.pyplot as plt
14
+ import numpy as np
15
+
16
+ from sequenzo.define_sequence_data import SequenceData
17
+ from sequenzo.visualization.utils import (
18
+ set_up_time_labels_for_x_axis,
19
+ save_and_show_results,
20
+ show_plot_title
21
+ )
22
+
23
+
24
def plot_most_frequent_sequences(seqdata: SequenceData, top_n: int = 10, weights="auto", title=None, fontsize=12, save_as=None, dpi=200, show_title: bool = True):
    """
    Generate a sequence frequency plot, similar to R's seqfplot.

    Each of the ``top_n`` most frequent sequences is drawn as a horizontal strip
    whose height is its (weighted) share of all sequences, stacked cumulatively.

    :param seqdata: (SequenceData) A SequenceData object containing sequences.
    :param top_n: (int) Number of most frequent sequences to display.
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
    :param title: (str, optional) Title for the plot. If None, no title will be displayed.
    :param fontsize: (int) Base font size for text elements
    :param save_as: (str, optional) Path to save the plot.
    :param dpi: (int) Resolution of the saved plot.
    :param show_title: (bool) Whether to draw ``title`` when one is given.

    :raises ValueError: if top_n is smaller than 1, if there are no sequences,
        or if the length of weights does not equal the number of sequences.
    """
    sequences = seqdata.values.tolist()

    # Fail early with a clear message instead of an IndexError further down
    if top_n < 1:
        raise ValueError("top_n must be a positive integer.")
    if not sequences:
        raise ValueError("seqdata contains no sequences to plot.")

    # Process weights
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is None:
        weights = np.ones(len(sequences))
        is_weighted = False
    else:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(sequences):
            raise ValueError("Length of weights must equal number of sequences.")
        # Decide from the weights actually used, not from seqdata.weights, so the
        # axis label is right when the caller passes explicit weights or None
        is_weighted = not np.allclose(weights, 1.0)

    # Weighted counting of sequences
    agg = {}
    for seq, w in zip(sequences, weights):
        key = tuple(seq)
        agg[key] = agg.get(key, 0.0) + float(w)

    # Select Top-N by weighted frequency (stable sort keeps first-seen order on ties)
    items = sorted(agg.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
    df = pd.DataFrame(items, columns=['sequence', 'wcount'])
    totw = float(np.sum(weights))
    df['freq'] = df['wcount'] / (totw if totw > 0 else 1.0) * 100.0

    # **Plot settings**
    fig, ax = plt.subplots(figsize=(10, 6))

    # Vertical centre of each strip so the strips stack without gaps
    y_centers = (df['freq'].cumsum() - df['freq'] / 2).to_numpy()

    for y_center, seq, freq in zip(y_centers, df['sequence'], df['freq']):
        if not seq:
            continue
        # Use numeric-coded color map; if unknown, fall back to gray
        cell_colors = [seqdata.color_map.get(int(code), "gray") for code in seq]
        # One call per sequence: unit-wide cells, slightly widened and shifted so
        # neighbouring cells overlap and no hairline gaps appear between them
        ax.barh(y=y_center, width=1.01, left=np.arange(len(seq)) - 0.005,
                height=freq, color=cell_colors, linewidth=0,
                antialiased=False)

    # **Formatting**
    ax.set_xlabel("Time", fontsize=fontsize)
    if is_weighted:
        # Show both count and weighted total if weights are used
        ax.set_ylabel("Cumulative Frequency (%)\nN={:,}, total weight={:.1f}".format(len(sequences), totw), fontsize=fontsize)
    else:
        ax.set_ylabel("Cumulative Frequency (%)\nN={:,}".format(len(sequences)), fontsize=fontsize)
    if show_title and title is not None:
        show_plot_title(ax, title, show=True, fontsize=fontsize+2, pad=20)

    # **Optimize X-axis ticks: align to the center of each bar**
    set_up_time_labels_for_x_axis(seqdata, ax)

    # **Set Y-axis ticks and labels**
    top_cum_freq = df['freq'].sum()  # Cumulative frequency of the displayed sequences
    max_freq = df['freq'].max()  # Frequency of the top 1 sequence

    # Y-axis ticks: 0%, top-1 frequency, cumulative frequency of the displayed sequences
    y_ticks = [0, max_freq, top_cum_freq]
    ax.set_yticks(y_ticks)
    ax.set_yticklabels([f"{ytick:.1f}%" for ytick in y_ticks], fontsize=fontsize-2)

    # Force the Y-axis to end exactly at the cumulative frequency of the displayed sequences
    ax.set_ylim(0, top_cum_freq)

    # **Annotate the frequency percentage on the left side of the highest frequency sequence**
    ax.annotate(f"{max_freq:.1f}%", xy=(-0.5, y_centers[0]),
                xycoords="data", fontsize=fontsize, color="black", ha="left", va="center")

    # **Annotate 0% at the bottom of the Y-axis**
    ax.annotate("0%", xy=(-0.5, 0), xycoords="data", fontsize=fontsize, color="black", ha="left", va="center")

    # **Clean up axis aesthetics like plot_state_distribution**
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(True)  # Keep the left border like state_distribution
    ax.spines['bottom'].set_visible(True)  # Show bottom border to connect with left

    # Style the left and bottom spines to match plot_state_distribution
    for side in ('left', 'bottom'):
        ax.spines[side].set_color('gray')
        ax.spines[side].set_linewidth(0.7)

    # Style the tick parameters
    ax.tick_params(axis='y', colors='gray', length=4, width=0.7)
    ax.tick_params(axis='x', colors='gray', length=4, width=0.7)

    # Limit the left spine to the data range and move it 5 points outward
    ax.spines['left'].set_bounds(0, top_cum_freq)
    ax.spines['left'].set_position(('outward', 5))

    # Move the bottom spine outward as well so it lines up with the left spine
    ax.spines['bottom'].set_position(('outward', 5))

    # Use legend from SequenceData
    ax.legend(*seqdata.get_legend(), bbox_to_anchor=(1.05, 1), loc='upper left')

    # Honour the caller's dpi instead of a hard-coded value
    save_and_show_results(save_as, dpi=dpi)
146
+
147
+