sequenzo 0.1.21__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (260) hide show
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-312-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-312-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-312-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-312-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-312-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-312-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-312-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-312-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,937 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : plot_sequence_index.py
4
+ @Time : 29/12/2024 09:08
5
+ @Desc :
6
+ Generate sequence index plots.
7
+ """
8
+ import numpy as np
9
+ import pandas as pd
10
+ import matplotlib.pyplot as plt
11
+
12
+ # Use relative import to avoid circular import when top-level package imports visualization
13
+ from ..define_sequence_data import SequenceData
14
+ from sequenzo.visualization.utils import (
15
+ set_up_time_labels_for_x_axis,
16
+ save_figure_to_buffer,
17
+ create_standalone_legend,
18
+ combine_plot_with_legend,
19
+ save_and_show_results,
20
+ determine_layout,
21
+ show_plot_title
22
+ )
23
+
24
+
25
def smart_sort_groups(groups):
    """
    Order group names so that names starting with digits come first, by number.

    Each name is converted with ``str()``. A name with a leading run of digits
    is ranked by that integer; any other name is ranked after all numeric
    ones. Ties within a rank are broken by the string form of the name.

    :param groups: Iterable of group names
    :return: New list with the group names in sorted order
    """
    import re

    # Built once per call and shared by every key computation below
    leading_digits = re.compile(r'^(\d+)')

    def _rank(name):
        label = str(name)
        found = leading_digits.match(label)
        if found is None:
            # No numeric prefix: place after every numbered group
            return (float('inf'), label)
        return (int(found.group(1)), label)

    ordered = list(groups)
    ordered.sort(key=_rank)
    return ordered
42
+
43
+
44
+ def _cmdscale(D):
45
+ """
46
+ Classic Multidimensional Scaling (MDS), equivalent to R's cmdscale()
47
+
48
+ :param D: A NxN symmetric distance matrix
49
+ :return: Y, a Nxd coordinate matrix, where d is the largest positive eigenvalues' count
50
+ """
51
+ n = len(D)
52
+
53
+ # Step 1: Compute the centering matrix
54
+ H = np.eye(n) - np.ones((n, n)) / n
55
+
56
+ # Step 2: Compute the double centered distance matrix
57
+ B = -0.5 * H @ (D ** 2) @ H
58
+
59
+ # Step 3: Compute eigenvalues and eigenvectors
60
+ eigvals, eigvecs = np.linalg.eigh(B)
61
+
62
+ # Step 4: Sort eigenvalues and eigenvectors in descending order
63
+ idx = np.argsort(eigvals)[::-1]
64
+ eigvals = eigvals[idx]
65
+ eigvecs = eigvecs[:, idx]
66
+
67
+ # Step 5: Select only positive eigenvalues
68
+ w, = np.where(eigvals > 0)
69
+ if len(w) > 0:
70
+ L = np.diag(np.sqrt(eigvals[w]))
71
+ V = eigvecs[:, w]
72
+ return V @ L # Return the MDS coordinates
73
+ else:
74
+ # Fallback if no positive eigenvalues
75
+ return np.zeros((n, 1))
76
+
77
+
78
+ def _find_most_frequent_sequence(sequences):
79
+ """
80
+ Find the most frequent sequence in the dataset.
81
+
82
+ :param sequences: numpy array of sequences
83
+ :return: index of the most frequent sequence
84
+ """
85
+ from collections import Counter
86
+
87
+ # Convert sequences to tuples for hashing
88
+ seq_tuples = [tuple(seq) for seq in sequences]
89
+
90
+ # Count frequencies
91
+ counter = Counter(seq_tuples)
92
+
93
+ # Find the most frequent sequence
94
+ most_frequent = counter.most_common(1)[0][0]
95
+
96
+ # Find the index of this sequence in the original array
97
+ for i, seq in enumerate(seq_tuples):
98
+ if seq == most_frequent:
99
+ return i
100
+
101
+ return 0 # Fallback
102
+
103
+
104
+ def _select_sequences_subset(seqdata, sequence_selection, n_sequences, sort_by, sort_by_weight, weights, mask=None):
105
+ """
106
+ Select a subset of sequences based on the selection method.
107
+
108
+ :param seqdata: SequenceData object
109
+ :param sequence_selection: Selection method ("all", "first_n", "last_n", or list of IDs)
110
+ :param n_sequences: Number of sequences for "first_n" or "last_n"
111
+ :param sort_by: Sorting method to use before selection
112
+ :param sort_by_weight: Whether to sort by weight
113
+ :param weights: Sequence weights
114
+ :param mask: Optional mask for pre-filtering sequences
115
+ :return: Boolean mask for selected sequences
116
+ """
117
+ # Start with all sequences or pre-filtered mask
118
+ if mask is None:
119
+ mask = np.ones(len(seqdata.values), dtype=bool)
120
+
121
+ # If "all", return the current mask
122
+ if sequence_selection == "all":
123
+ return mask
124
+
125
+ # Get indices of sequences that pass the mask
126
+ valid_indices = np.where(mask)[0]
127
+
128
+ # Handle ID list selection
129
+ if isinstance(sequence_selection, list):
130
+ # Convert list to set for faster lookup
131
+ selected_ids = set(sequence_selection)
132
+
133
+ # Find indices of sequences with matching IDs
134
+ selected_mask = np.zeros(len(seqdata.values), dtype=bool)
135
+ if hasattr(seqdata, 'ids') and seqdata.ids is not None:
136
+ for i in valid_indices:
137
+ if seqdata.ids[i] in selected_ids:
138
+ selected_mask[i] = True
139
+ else:
140
+ print("Warning: sequence_selection provided as ID list but seqdata has no IDs. Using all sequences.")
141
+ return mask
142
+
143
+ return selected_mask
144
+
145
+ # For "first_n" or "last_n", we need to sort first
146
+ if sequence_selection in ["first_n", "last_n"]:
147
+ # Get the subset of data based on current mask
148
+ subset_seqdata = seqdata
149
+ subset_weights = weights
150
+
151
+ if not np.all(mask):
152
+ # Create subset if mask is not all True
153
+ subset_values = seqdata.values[mask]
154
+ subset_ids = seqdata.ids[mask] if hasattr(seqdata, 'ids') and seqdata.ids is not None else None
155
+
156
+ # Use original seqdata for structure, just work with filtered values
157
+ subset_seqdata = seqdata # Keep original structure
158
+
159
+ if weights is not None:
160
+ subset_weights = weights[mask]
161
+
162
+ # Apply sorting to get the order
163
+ distance_matrix = None
164
+ if sort_by in ["mds", "distance_to_most_frequent"]:
165
+ try:
166
+ from sequenzo.dissimilarity_measures.get_distance_matrix import get_distance_matrix
167
+ distance_matrix = get_distance_matrix(
168
+ seqdata=subset_seqdata,
169
+ method="OM",
170
+ sm="CONSTANT",
171
+ indel="auto"
172
+ )
173
+ if hasattr(distance_matrix, 'values'):
174
+ distance_matrix = distance_matrix.values
175
+ except ImportError:
176
+ print(f"Warning: Cannot compute distance matrix for '{sort_by}' sorting. Using unsorted order.")
177
+ sort_by = "unsorted"
178
+
179
+ # Apply sorting to the masked subset
180
+ if sort_by_weight and subset_weights is not None:
181
+ # Sort by weight on the subset
182
+ sorted_indices = np.argsort(-subset_weights)
183
+ else:
184
+ # Sort on the subset values
185
+ if sort_by == "unsorted" or sort_by == "none":
186
+ sorted_indices = np.arange(len(valid_indices))
187
+ elif sort_by == "lexicographic":
188
+ subset_values = seqdata.values[mask]
189
+ vals = subset_values.astype(float, copy=True)
190
+ vals = np.nan_to_num(vals, nan=np.inf)
191
+ sorted_indices = np.lexsort(vals.T[::-1])
192
+ elif sort_by in ["mds", "distance_to_most_frequent"]:
193
+ # For complex sorting that requires distance matrix,
194
+ # we'll fall back to simple lexicographic for now
195
+ subset_values = seqdata.values[mask]
196
+ vals = subset_values.astype(float, copy=True)
197
+ vals = np.nan_to_num(vals, nan=np.inf)
198
+ sorted_indices = np.lexsort(vals.T[::-1])
199
+ print(f"Warning: {sort_by} sorting simplified to lexicographic for sequence selection")
200
+ else:
201
+ sorted_indices = np.arange(len(valid_indices))
202
+
203
+ # Select first_n or last_n
204
+ n_available = len(sorted_indices)
205
+ n_to_select = min(n_sequences, n_available)
206
+
207
+ if sequence_selection == "first_n":
208
+ selected_subset_indices = sorted_indices[:n_to_select]
209
+ elif sequence_selection == "last_n":
210
+ selected_subset_indices = sorted_indices[-n_to_select:]
211
+
212
+ # Map back to original indices
213
+ original_indices = valid_indices[selected_subset_indices]
214
+
215
+ # Create final mask
216
+ final_mask = np.zeros(len(seqdata.values), dtype=bool)
217
+ final_mask[original_indices] = True
218
+
219
+ return final_mask
220
+
221
+ else:
222
+ raise ValueError(f"Unsupported sequence_selection: {sequence_selection}. "
223
+ f"Supported options: 'all', 'first_n', 'last_n', or list of IDs")
224
+
225
+
226
def sort_sequences_by_method(seqdata, method="unsorted", mask=None, distance_matrix=None, weights=None):
    """
    Sort sequences in SequenceData based on specified method.

    :param seqdata: SequenceData object (only ``values`` is read)
    :param method: str, sorting method - "unsorted" (alias "none"), "lexicographic",
                   "mds", "distance_to_most_frequent"
    :param mask: np.array(bool), if provided, sort only this subset
    :param distance_matrix: square array-like, required for "mds" and
                            "distance_to_most_frequent". Its size must match
                            either the sorted subset or, when ``mask`` is given,
                            the full sample (it is then sliced with the mask).
    :param weights: np.array, optional weights for sequences (currently not used)
    :return: np.array of sorting indices. Without ``mask`` these index the
             original rows; with ``mask`` they are positions within the
             masked subset.
    :raises ValueError: if the method is unsupported, the distance matrix is
                        missing for a method that needs it, or its shape does
                        not fit the data
    """
    values = seqdata.values
    n_total = len(values)

    if mask is None:
        n_sequences = n_total
    else:
        n_sequences = int(np.sum(mask))
        values = values[mask]

    if method == "unsorted" or method == "none":
        # Keep original order (R default)
        return np.arange(n_sequences)

    if method == "lexicographic":
        # Lexicographic sorting (NaN-safe): NaNs become +inf so they sort last
        vals = np.nan_to_num(values.astype(float, copy=True), nan=np.inf)
        # lexsort treats the LAST key as primary, hence the reversed columns
        return np.lexsort(vals.T[::-1])

    if method not in ("mds", "distance_to_most_frequent"):
        raise ValueError(f"Unsupported sorting method: {method}. "
                         f"Supported methods are: 'unsorted', 'lexicographic', 'mds', 'distance_to_most_frequent'")

    if distance_matrix is None:
        if method == "mds":
            raise ValueError("Distance matrix is required for MDS sorting")
        raise ValueError("Distance matrix is required for distance_to_most_frequent sorting")

    # Accept DataFrames and nested lists as well as ndarrays
    distances = np.asarray(distance_matrix)
    if distances.ndim != 2 or distances.shape[0] != distances.shape[1]:
        raise ValueError(f"Distance matrix must be square, got shape {distances.shape}")

    if distances.shape[0] != n_sequences:
        if mask is not None and distances.shape[0] == n_total:
            # Matrix covers the full sample: cut it down to the masked rows/columns
            masked_indices = np.where(mask)[0]
            distances = distances[np.ix_(masked_indices, masked_indices)]
        else:
            raise ValueError(f"Distance matrix has {distances.shape[0]} rows but "
                             f"{n_sequences} sequences are being sorted")

    # A stable sort keeps equal keys (e.g. duplicate sequences) in their
    # original relative order, so the result is deterministic.
    if method == "mds":
        # TODO: Support weighted MDS (TraMineR's wcmdscale analogue) when weights are provided.
        mds_coords = _cmdscale(distances)

        # Sort by first MDS dimension
        return np.argsort(mds_coords[:, 0], kind="stable")

    # method == "distance_to_most_frequent"
    most_freq_idx = _find_most_frequent_sequence(values)

    # Sort by distance to the most frequent sequence (ascending)
    return np.argsort(distances[most_freq_idx, :], kind="stable")
289
+
290
+
291
def plot_sequence_index(seqdata: SequenceData,
                        # Grouping parameters
                        group_by_column=None,
                        group_dataframe=None,
                        group_column_name=None,
                        group_labels=None,
                        # Other parameters
                        sort_by="lexicographic",
                        sort_by_weight=False,
                        weights="auto",
                        figsize=(10, 6),
                        plot_style="standard",
                        title=None,
                        xlabel="Time",
                        ylabel="Sequences",
                        save_as=None,
                        dpi=200,
                        layout='column',
                        nrows: int = None,
                        ncols: int = None,
                        group_order=None,
                        sort_groups='auto',
                        fontsize=12,
                        show_group_titles: bool = True,
                        include_legend: bool = True,
                        sequence_selection="all",
                        n_sequences=10,
                        show_sequence_ids=False
                        ):
    """Creates sequence index plots, optionally grouped by categories.

    Each sequence is drawn as one horizontal row of an image. Without grouping
    information the call is delegated to ``_sequence_index_plot_single``; with
    grouping information one subplot is drawn per group.

    **Two API modes for grouping:**

    1. **Simplified API** (when grouping info is already in the data):
       ```python
       plot_sequence_index(seqdata, group_by_column="Cluster", group_labels=cluster_labels)
       ```

    2. **Complete API** (when grouping info is in a separate dataframe):
       ```python
       plot_sequence_index(seqdata, group_dataframe=membership_df,
                           group_column_name="Cluster", group_labels=cluster_labels)
       ```

    :param seqdata: SequenceData object containing sequence information

    **Grouping parameters:**
    :param group_by_column: (str, optional) Column name from seqdata.data to group by.
                            Use this when grouping information is already in your data.
                            Example: "Cluster", "sex", "education"
    :param group_dataframe: (pd.DataFrame, optional) Separate dataframe containing grouping information.
                            The ID column is "Entity ID" when present, otherwise the first column.
    :param group_column_name: (str, optional) Name of the grouping column in group_dataframe.
                              Required when using group_dataframe.
    :param group_labels: (dict, optional) Custom labels for group values.
                         Example: {1: "Late Family Formation", 2: "Early Partnership"}
                         NOTE(review): this mapping is only applied in the ``group_by_column``
                         path; the ``group_dataframe`` path does not consult it.

    :param sort_by: Sorting method for sequences within groups:
        - 'unsorted' or 'none': Keep original order
        - 'lexicographic': Sort sequences lexicographically (NaN sorted last)
        - 'mds' / 'distance_to_most_frequent': in grouped plots these fall back to
          lexicographic sorting and print a warning
    :param sort_by_weight: If True, sort sequences by weight (descending), overrides sort_by
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
    :param figsize: Size of each subplot figure (only used when plot_style="custom")
    :param plot_style: Plot aspect style:
        - 'standard': (10, 6)
        - 'compact': (8, 8)
        - 'wide': (12, 4)
        - 'narrow': (8, 10)
        - 'custom': Use the provided figsize parameter
    :param title: Common title for the figure (omitted when None/empty)
    :param xlabel: Label for the x-axis
    :param ylabel: Label for the y-axis (only drawn on the first column of subplots)
    :param save_as: File path to save the plot; '.png' is appended when the path has no
                    recognised image extension
    :param dpi: DPI for saved image
    :param layout: Layout style - 'column' (default, 3xn), 'grid' (nxn)
    :param nrows: Optional number of subplot rows, forwarded to determine_layout
    :param ncols: Optional number of subplot columns, forwarded to determine_layout
    :param group_order: List, manually specify group order (overrides sort_groups).
                        Groups absent from this list are excluded with a warning.
    :param sort_groups: String, sorting method: 'auto'(smart numeric), 'numeric'(numeric prefix),
                        'alpha'(alphabetical), 'none'(original order)
    :param fontsize: Base font size for text elements (titles use fontsize+2, ticks use fontsize-2)
    :param show_group_titles: Whether to show group titles
    :param include_legend: Whether to include legend in the plot (True by default)
    :param sequence_selection: Method for selecting sequences to visualize
                               ("all", "first_n", "last_n", or a list of sequence IDs)
    :param n_sequences: Number of sequences to show when using "first_n" or "last_n" (default: 10)
    :param show_sequence_ids: If True, show actual sequence IDs on y-axis instead of sequence numbers.

    :raises ValueError: for an unknown plot_style, plot_style='custom' with the default figsize,
                        an unknown group_by_column, incomplete group_labels, a weights array whose
                        length differs from the number of sequences, an unknown sort_groups value,
                        or when no group remains to be plotted.
    :return: None
    """
    # Determine figure size based on plot style
    style_sizes = {
        'standard': (10, 6),   # Balanced view
        'compact': (8, 8),     # More square, like R plots
        'wide': (12, 4),       # Wide, emphasizes time
        'narrow': (8, 10),     # Moderately vertical
        'custom': figsize      # User-provided
    }

    if plot_style not in style_sizes:
        raise ValueError(f"Invalid plot_style '{plot_style}'. "
                         f"Supported styles: {list(style_sizes.keys())}")

    # Special validation for custom plot style
    if plot_style == 'custom' and figsize == (10, 6):
        raise ValueError(
            "When using plot_style='custom', you must explicitly provide a figsize parameter "
            "that differs from the default (10, 6). "
            "Suggested custom sizes:\n"
            "  - For wide plots: figsize=(15, 5)\n"
            "  - For tall plots: figsize=(7, 12)\n"
            "  - For square plots: figsize=(9, 9)\n"
            "  - For small plots: figsize=(6, 4)\n"
            "Example: plot_sequence_index(data, plot_style='custom', figsize=(12, 8))"
        )

    actual_figsize = style_sizes[plot_style]

    # Handle the simplified API: group_by_column
    if group_by_column is not None:
        # Validate that the column exists in the original data
        if group_by_column not in seqdata.data.columns:
            available_cols = [col for col in seqdata.data.columns if col not in seqdata.time and col != seqdata.id_col]
            raise ValueError(
                f"Column '{group_by_column}' not found in the data. "
                f"Available columns for grouping: {available_cols}"
            )

        # Build the (ID, category) table the grouped code path below expects
        group_dataframe = seqdata.data[[seqdata.id_col, group_by_column]].copy()
        group_dataframe.columns = ['Entity ID', 'Category']
        group_column_name = 'Category'

        unique_values = seqdata.data[group_by_column].unique()

        if group_labels is not None:
            # Every raw value needs a label, otherwise .map() would silently produce NaN groups
            missing_keys = set(unique_values) - set(group_labels.keys())
            if missing_keys:
                raise ValueError(
                    f"group_labels missing mappings for values: {missing_keys}. "
                    f"Please provide labels for all unique values in '{group_by_column}': {sorted(unique_values)}"
                )
            group_dataframe['Category'] = group_dataframe['Category'].map(group_labels)
        # Without custom labels the raw column values are used as group names.

        print(f"[>] Creating grouped plots by '{group_by_column}' with {len(unique_values)} categories")

    # If no grouping information, create a single plot
    if group_dataframe is None or group_column_name is None:
        return _sequence_index_plot_single(
            seqdata,
            sort_by=sort_by,
            sort_by_weight=sort_by_weight,
            weights=weights,
            figsize=actual_figsize,
            plot_style=plot_style,
            title=title,
            xlabel=xlabel,
            ylabel=ylabel,
            save_as=save_as,
            dpi=dpi,
            fontsize=fontsize,
            include_legend=include_legend,
            sequence_selection=sequence_selection,
            n_sequences=n_sequences,
            show_sequence_ids=show_sequence_ids,
        )

    # Process weights
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    # Pick the ID column of the grouping table
    id_col_name = "Entity ID" if "Entity ID" in group_dataframe.columns else group_dataframe.columns[0]

    # Get unique groups and sort them based on user preference.
    # The unique values are computed once instead of once per candidate group.
    present_groups = group_dataframe[group_column_name].unique()
    if group_order is not None and len(group_order) > 0:
        # Use manually specified order, filter out non-existing groups
        groups = [g for g in group_order if g in present_groups]
        missing_groups = [g for g in present_groups if g not in group_order]
        if missing_groups:
            print(f"[Warning] Groups not in group_order will be excluded: {missing_groups}")
    elif sort_groups == 'numeric' or sort_groups == 'auto':
        groups = smart_sort_groups(present_groups)
    elif sort_groups == 'alpha':
        groups = sorted(present_groups)
    elif sort_groups == 'none':
        groups = list(present_groups)
    else:
        raise ValueError(f"Invalid sort_groups value: {sort_groups}. Use 'auto', 'numeric', 'alpha', or 'none'.")

    num_groups = len(groups)
    if num_groups == 0:
        # Fail with a clear message instead of an undefined loop variable further down
        raise ValueError(
            "No groups to plot. Check that the grouping column is not empty and that "
            "group_order contains values present in it."
        )

    # Calculate figure size and layout based on number of groups and specified layout
    nrows, ncols = determine_layout(num_groups, layout=layout, nrows=nrows, ncols=ncols)

    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows),
        gridspec_kw={'wspace': 0.15, 'hspace': 0.25},  # Reduced spacing for tighter layout
        squeeze=False  # always a 2-D array, so a 1x1 layout can be flattened too
    )
    axes = axes.flatten()

    # The weight total is only reported in titles when seqdata carries non-trivial weights
    original_weights = getattr(seqdata, "weights", None)
    has_effective_weights = original_weights is not None and not np.allclose(original_weights, 1.0)

    # Indices of axes that actually received a plot; all others are hidden afterwards
    plotted_axes = set()

    # Create a plot for each group
    for i, group in enumerate(groups):
        # Get IDs for this group
        group_ids = group_dataframe[group_dataframe[group_column_name] == group][id_col_name].values

        # Match IDs with sequence data
        mask = np.isin(seqdata.ids, group_ids)
        if not np.any(mask):
            print(f"Warning: No matching sequences found for group '{group}'")
            continue

        # Apply sequence selection to this group
        mask = _select_sequences_subset(seqdata, sequence_selection, n_sequences, sort_by, sort_by_weight, weights, mask)

        # Extract sequences for this group
        group_sequences = seqdata.values[mask]

        # Track group IDs for y-axis labels
        group_ids_for_labels = None
        if hasattr(seqdata, 'ids') and seqdata.ids is not None and show_sequence_ids:
            group_ids_for_labels = seqdata.ids[mask]

        # Get weights for this group
        group_weights = weights[mask] if weights is not None else None

        if sort_by_weight and group_weights is not None:
            # Sort by weight (descending)
            sorted_indices = np.argsort(-group_weights)
        elif sort_by in ("lexicographic", "mds", "distance_to_most_frequent"):
            if sort_by != "lexicographic":
                # Distance-based orderings are not computed per group; fall back to lexicographic
                print(f"Warning: {sort_by} sorting simplified to lexicographic for grouped plots with sequence selection")
            # NaN -> +inf so that missing values sort last; first time point is the primary key
            vals = np.nan_to_num(group_sequences.astype(float, copy=True), nan=np.inf)
            sorted_indices = np.lexsort(vals.T[::-1])
        else:
            # unsorted or other methods
            sorted_indices = np.arange(len(group_sequences))

        sorted_data = group_sequences[sorted_indices]

        # Track sorted IDs for y-axis labels if needed
        sorted_group_ids = None
        if group_ids_for_labels is not None and show_sequence_ids:
            sorted_group_ids = group_ids_for_labels[sorted_indices]

        # Plot on the corresponding axis
        ax = axes[i]
        # Values below 1 are treated as missing and masked out of the image
        data = sorted_data.astype(float)
        data[data < 1] = np.nan

        # Check for all-missing or all-invalid data
        if np.all(~np.isfinite(data)):
            print(f"Warning: all values missing/invalid for group '{group}'")
            ax.axis('off')
            continue

        ax.imshow(np.ma.masked_invalid(data), aspect='auto', cmap=seqdata.get_colormap(),
                  interpolation='nearest', vmin=1, vmax=len(seqdata.states))
        plotted_axes.add(i)

        # Remove grid lines
        ax.grid(False)

        # Set up time labels
        set_up_time_labels_for_x_axis(seqdata, ax)

        # Enhance y-axis aesthetics - evenly spaced ticks including the last sequence
        num_sequences = sorted_data.shape[0]
        use_id_labels = show_sequence_ids and sorted_group_ids is not None

        if use_id_labels and num_sequences <= 10:
            # Few enough rows to label every sequence with its ID
            ytick_positions = np.arange(num_sequences)
            ytick_labels = [str(sid) for sid in sorted_group_ids]
        else:
            # Fewer ticks for narrow plots to avoid overcrowding
            num_ticks = min(8 if plot_style == "narrow" else 11, num_sequences)
            ytick_positions = np.linspace(0, num_sequences - 1, num=num_ticks, dtype=int)
            ytick_positions = np.unique(ytick_positions)
            if use_id_labels:
                ytick_labels = [str(sorted_group_ids[pos]) for pos in ytick_positions]
            else:
                # Default behavior: 1-based sequence numbers
                ytick_labels = (ytick_positions + 1).astype(int)

        ax.set_yticks(ytick_positions)
        ax.set_yticklabels(ytick_labels, fontsize=fontsize-2, color='black')

        # Customize axis style
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_color('gray')
        ax.spines['bottom'].set_color('gray')
        ax.spines['left'].set_linewidth(0.7)
        ax.spines['bottom'].set_linewidth(0.7)

        # Move spines slightly away from the plot area for better aesthetics
        ax.spines['left'].set_position(('outward', 5))
        ax.spines['bottom'].set_position(('outward', 5))

        # Ensure ticks are always visible regardless of plot style
        ax.tick_params(axis='x', colors='gray', length=4, width=0.7, which='major')
        ax.tick_params(axis='y', colors='gray', length=4, width=0.7, which='major')

        # Force tick visibility for narrow plot styles
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')
        ax.tick_params(axis='both', which='major', direction='out')

        # Add group title with weight information
        if has_effective_weights and group_weights is not None:
            sum_w = float(group_weights.sum())
            group_title = f"{group} (n = {num_sequences}, total weight = {sum_w:.1f})"
        else:
            group_title = f"{group} (n = {num_sequences})"
        if show_group_titles:
            show_plot_title(ax, group_title, show=True, fontsize=fontsize, loc='right')

        # Add axis labels: y label on the first column only, x label on every subplot
        if i % ncols == 0:
            ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=10, color='black')

        ax.set_xlabel(xlabel, fontsize=fontsize, labelpad=10, color='black')

    # Hide every subplot that did not receive a plot (surplus grid cells and skipped groups)
    for j, spare_ax in enumerate(axes):
        if j not in plotted_axes:
            spare_ax.set_visible(False)

    # Add a common title if provided
    if title:
        fig.suptitle(title, fontsize=fontsize+2, y=1.02)

    # Adjust layout to remove tight_layout warning and eliminate extra right space
    fig.subplots_adjust(wspace=0.15, hspace=0.25, bottom=0.1, top=0.9, right=0.98, left=0.08)

    # Save main figure to memory
    main_buffer = save_figure_to_buffer(fig, dpi=dpi)

    # Default to PNG when the target path has no recognised image extension
    if save_as and not save_as.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
        save_as = save_as + '.png'

    if include_legend:
        # Create standalone legend
        colors = seqdata.color_map_by_label
        legend_buffer = create_standalone_legend(
            colors=colors,
            labels=seqdata.labels,
            ncol=min(5, len(seqdata.states)),
            figsize=(actual_figsize[0] * ncols, 1),
            fontsize=fontsize-2,
            dpi=dpi
        )

        # Combine plot with legend (the helper receives the output path)
        combined_img = combine_plot_with_legend(
            main_buffer,
            legend_buffer,
            output_path=save_as,
            dpi=dpi,
            padding=20
        )

        # Display combined image
        plt.figure(figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows + 1))
        plt.imshow(combined_img)
        plt.axis('off')
        plt.show()
        plt.close()
    else:
        # Display plot without legend
        plt.figure(figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows))
        # imshow needs pixel data. If the helper returned a file-like object
        # (presumably an encoded PNG - TODO confirm against save_figure_to_buffer),
        # decode it first; anything else is passed through unchanged.
        image = plt.imread(main_buffer) if hasattr(main_buffer, "read") else main_buffer
        plt.imshow(image)
        plt.axis('off')

        if save_as:
            plt.savefig(save_as, dpi=dpi, bbox_inches='tight')
        plt.show()
        plt.close()
710
+
711
+
712
def _sequence_index_plot_single(seqdata: SequenceData,
                                sort_by="unsorted",
                                sort_by_weight=False,
                                weights="auto",
                                figsize=(10, 6),
                                plot_style="standard",
                                title=None,
                                xlabel="Time",
                                ylabel="Sequences",
                                save_as=None,
                                dpi=200,
                                fontsize=12,
                                include_legend=True,
                                sequence_selection="all",
                                n_sequences=10,
                                show_sequence_ids=False):
    """Creates a single (ungrouped) sequence index plot using `imshow`.

    :param seqdata: SequenceData object containing sequence information
    :param sort_by: Sorting method. 'lexicographic' sorts rows lexicographically (NaN last);
                    'mds' and 'distance_to_most_frequent' fall back to lexicographic with a
                    printed warning; any other value keeps the selected order.
    :param sort_by_weight: If True, sort sequences by weight (descending), overrides sort_by
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
    :param figsize: (tuple): Size of the figure (only used when plot_style="custom").
    :param plot_style: Plot aspect style ('standard', 'compact', 'wide', 'narrow', 'custom')
    :param title: (str): Title for the plot; the sequence count (and total weight, when
                  seqdata has non-trivial weights) is appended. No title is drawn when None.
    :param xlabel: (str): Label for the x-axis.
    :param ylabel: (str): Label for the y-axis.
    :param save_as: File path to save the plot
    :param dpi: DPI for saved image
    :param fontsize: Base font size (title uses fontsize+2, tick labels fontsize-2)
    :param include_legend: Whether to include legend in the plot (True by default)
    :param sequence_selection: Method for selecting sequences ("all", "first_n", "last_n", or list of IDs)
    :param n_sequences: Number of sequences for "first_n" or "last_n"
    :param show_sequence_ids: If True, show actual sequence IDs on y-axis instead of sequence numbers

    :raises ValueError: for an unknown plot_style, plot_style='custom' with the default
                        figsize, or a weights array whose length differs from the number
                        of sequences.
    :return None.
    """
    # Determine figure size based on plot style
    style_sizes = {
        'standard': (10, 6),   # Balanced view
        'compact': (8, 8),     # More square, like R plots
        'wide': (12, 4),       # Wide, emphasizes time
        'narrow': (8, 10),     # Moderately vertical
        'custom': figsize      # User-provided
    }

    if plot_style not in style_sizes:
        raise ValueError(f"Invalid plot_style '{plot_style}'. "
                         f"Supported styles: {list(style_sizes.keys())}")

    # Special validation for custom plot style
    if plot_style == 'custom' and figsize == (10, 6):
        raise ValueError(
            "When using plot_style='custom', you must explicitly provide a figsize parameter "
            "that differs from the default (10, 6). "
            "Suggested custom sizes:\n"
            "  - For wide plots: figsize=(15, 5)\n"
            "  - For tall plots: figsize=(7, 12)\n"
            "  - For square plots: figsize=(9, 9)\n"
            "  - For small plots: figsize=(6, 4)\n"
            "Example: plot_sequence_index(data, plot_style='custom', figsize=(12, 8))"
        )

    actual_figsize = style_sizes[plot_style]

    # Process weights
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    # Apply sequence selection and get the filtered data directly
    selection_mask = _select_sequences_subset(seqdata, sequence_selection, n_sequences, sort_by, sort_by_weight, weights)

    has_ids = hasattr(seqdata, 'ids') and seqdata.ids is not None
    selected_ids = None  # Track selected IDs for y-axis labels
    if not np.all(selection_mask):
        # Only a subset was selected: restrict values, IDs and weights consistently
        sequence_values = seqdata.values[selection_mask].copy()
        if has_ids:
            selected_ids = seqdata.ids[selection_mask]
        if weights is not None:
            weights = weights[selection_mask]
    else:
        sequence_values = seqdata.values.copy()
        if has_ids:
            selected_ids = seqdata.ids

    # Sort sequences based on specified method
    if sort_by_weight and weights is not None:
        # Sort by weight (descending)
        sorted_indices = np.argsort(-weights)
    elif sort_by in ("lexicographic", "mds", "distance_to_most_frequent"):
        if sort_by != "lexicographic":
            # Distance-based orderings are not computed here; fall back to lexicographic
            print(f"Warning: {sort_by} sorting simplified to lexicographic for sequence selection")
        # NaN -> +inf so that missing values sort last; first time point is the primary key
        vals = np.nan_to_num(sequence_values.astype(float, copy=True), nan=np.inf)
        sorted_indices = np.lexsort(vals.T[::-1])
    else:
        # unsorted or other methods
        sorted_indices = np.arange(len(sequence_values))

    sorted_data = sequence_values[sorted_indices]

    # Track sorted IDs for y-axis labels if needed
    sorted_ids = None
    if selected_ids is not None and show_sequence_ids:
        sorted_ids = selected_ids[sorted_indices]

    # Create the plot using imshow with proper NaN handling
    fig, ax = plt.subplots(figsize=actual_figsize)
    # Values below 1 are treated as missing and masked out of the image
    data = sorted_data.astype(float)
    data[data < 1] = np.nan

    # Check for all-missing or all-invalid data
    if np.all(~np.isfinite(data)):
        print("Warning: all values missing/invalid in sequence data")
        ax.axis('off')
        # Nothing will be shown or saved, so release the figure instead of leaking it
        plt.close(fig)
        return

    ax.imshow(np.ma.masked_invalid(data), aspect='auto', cmap=seqdata.get_colormap(),
              interpolation='nearest', vmin=1, vmax=len(seqdata.states))

    # Disable background grid and all axis guide lines
    ax.grid(False)

    # x label
    set_up_time_labels_for_x_axis(seqdata, ax)

    # Enhance y-axis aesthetics - evenly spaced ticks including the last sequence
    num_sequences = sorted_data.shape[0]
    use_id_labels = show_sequence_ids and sorted_ids is not None

    if use_id_labels and num_sequences <= 10:
        # Few enough rows to label every sequence with its ID
        ytick_positions = np.arange(num_sequences)
        ytick_labels = [str(sid) for sid in sorted_ids]
    else:
        # Fewer ticks for narrow plots to avoid overcrowding
        num_ticks = min(8 if plot_style == "narrow" else 11, num_sequences)
        ytick_positions = np.linspace(0, num_sequences - 1, num=num_ticks, dtype=int)
        ytick_positions = np.unique(ytick_positions)
        if use_id_labels:
            ytick_labels = [str(sorted_ids[pos]) for pos in ytick_positions]
        else:
            # Default behavior: 1-based sequence numbers
            ytick_labels = (ytick_positions + 1).astype(int)

    ax.set_yticks(ytick_positions)
    ax.set_yticklabels(ytick_labels, fontsize=fontsize-2, color='black')

    # Customize axis line styles and ticks
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('gray')
    ax.spines['bottom'].set_color('gray')
    ax.spines['left'].set_linewidth(0.7)
    ax.spines['bottom'].set_linewidth(0.7)

    # Move spines slightly away from the plot area for better aesthetics
    ax.spines['left'].set_position(('outward', 5))
    ax.spines['bottom'].set_position(('outward', 5))

    # Ensure ticks are always visible regardless of plot style
    ax.tick_params(axis='x', colors='gray', length=4, width=0.7, which='major')
    ax.tick_params(axis='y', colors='gray', length=4, width=0.7, which='major')

    # Force tick visibility for narrow plot styles
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.tick_params(axis='both', which='major', direction='out')

    # Add labels and title
    ax.set_xlabel(xlabel, fontsize=fontsize, labelpad=10, color='black')
    ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=10, color='black')

    # Set title with weight information if available
    if title is not None:
        display_title = title

        # The weight total is only reported when seqdata carries non-trivial weights
        original_weights = getattr(seqdata, "weights", None)
        if original_weights is not None and not np.allclose(original_weights, 1.0) and weights is not None:
            sum_w = float(weights.sum())
            display_title += f" (n = {num_sequences}, total weight = {sum_w:.1f})"
        else:
            display_title += f" (n = {num_sequences})"

        ax.set_title(display_title, fontsize=fontsize+2, color='black')

    # Use legend from SequenceData if requested
    if include_legend:
        ax.legend(*seqdata.get_legend(), bbox_to_anchor=(1.05, 1), loc='upper left')

    save_and_show_results(save_as, dpi=dpi)