sequenzo 0.1.24__cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (264) hide show
  1. _sequenzo_fastcluster.cpython-311-darwin.so +0 -0
  2. sequenzo/__init__.py +240 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +474 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +20 -0
  30. sequenzo/data_preprocessing/helpers.py +256 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/mvad.csv +713 -0
  44. sequenzo/datasets/pairfam_family.csv +1867 -0
  45. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  46. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  47. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  48. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  49. sequenzo/define_sequence_data.py +609 -0
  50. sequenzo/dissimilarity_measures/__init__.py +31 -0
  51. sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
  52. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  53. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  54. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  55. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  56. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  57. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  58. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  59. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  60. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  61. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  62. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  63. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  214. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  215. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  216. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
  217. sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
  218. sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
  219. sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
  220. sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
  221. sequenzo/multidomain/__init__.py +23 -0
  222. sequenzo/multidomain/association_between_domains.py +311 -0
  223. sequenzo/multidomain/cat.py +431 -0
  224. sequenzo/multidomain/combt.py +519 -0
  225. sequenzo/multidomain/dat.py +89 -0
  226. sequenzo/multidomain/idcd.py +139 -0
  227. sequenzo/multidomain/linked_polyad.py +292 -0
  228. sequenzo/openmp_setup.py +233 -0
  229. sequenzo/prefix_tree/__init__.py +43 -0
  230. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  231. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  232. sequenzo/prefix_tree/utils.py +54 -0
  233. sequenzo/sequence_characteristics/__init__.py +40 -0
  234. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  235. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  236. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  237. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  238. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  239. sequenzo/sequence_characteristics/turbulence.py +155 -0
  240. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  241. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  242. sequenzo/suffix_tree/__init__.py +48 -0
  243. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  244. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  245. sequenzo/suffix_tree/utils.py +56 -0
  246. sequenzo/visualization/__init__.py +29 -0
  247. sequenzo/visualization/plot_mean_time.py +194 -0
  248. sequenzo/visualization/plot_modal_state.py +276 -0
  249. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  250. sequenzo/visualization/plot_relative_frequency.py +404 -0
  251. sequenzo/visualization/plot_sequence_index.py +951 -0
  252. sequenzo/visualization/plot_single_medoid.py +153 -0
  253. sequenzo/visualization/plot_state_distribution.py +627 -0
  254. sequenzo/visualization/plot_transition_matrix.py +190 -0
  255. sequenzo/visualization/utils/__init__.py +23 -0
  256. sequenzo/visualization/utils/utils.py +310 -0
  257. sequenzo/with_event_history_analysis/__init__.py +35 -0
  258. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  259. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  260. sequenzo-0.1.24.dist-info/METADATA +255 -0
  261. sequenzo-0.1.24.dist-info/RECORD +264 -0
  262. sequenzo-0.1.24.dist-info/WHEEL +5 -0
  263. sequenzo-0.1.24.dist-info/licenses/LICENSE +28 -0
  264. sequenzo-0.1.24.dist-info/top_level.txt +2 -0
@@ -0,0 +1,951 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : plot_sequence_index.py
4
+ @Time : 29/12/2024 09:08
5
+ @Desc :
6
+ Generate sequence index plots.
7
+ """
8
+ import numpy as np
9
+ import pandas as pd
10
+ import matplotlib.pyplot as plt
11
+
12
+ # Use relative import to avoid circular import when top-level package imports visualization
13
+ from ..define_sequence_data import SequenceData
14
+ from sequenzo.visualization.utils import (
15
+ set_up_time_labels_for_x_axis,
16
+ save_figure_to_buffer,
17
+ create_standalone_legend,
18
+ combine_plot_with_legend,
19
+ save_and_show_results,
20
+ determine_layout,
21
+ show_plot_title
22
+ )
23
+
24
+
25
def smart_sort_groups(groups):
    """
    Order group names so that names starting with digits sort numerically.

    Names with a leading run of digits come first, ordered by that integer
    value; all remaining names follow. Ties (and names without a numeric
    prefix) are ordered by their string representation.

    :param groups: Iterable of group names (any objects; compared via ``str``)
    :return: New list with the group names in sorted order
    """
    import re

    leading_digits = re.compile(r'^(\d+)')

    def _rank(name):
        # Key is (numeric prefix, text); names without a prefix get +inf so
        # they are placed after every numbered name.
        label = str(name)
        found = leading_digits.match(label)
        if found is None:
            return (float('inf'), label)
        return (int(found.group(1)), label)

    ordered = list(groups)
    ordered.sort(key=_rank)
    return ordered
42
+
43
+
44
+ def _cmdscale(D):
45
+ """
46
+ Classic Multidimensional Scaling (MDS), equivalent to R's cmdscale()
47
+
48
+ :param D: A NxN symmetric distance matrix
49
+ :return: Y, a Nxd coordinate matrix, where d is the largest positive eigenvalues' count
50
+ """
51
+ n = len(D)
52
+
53
+ # Step 1: Compute the centering matrix
54
+ H = np.eye(n) - np.ones((n, n)) / n
55
+
56
+ # Step 2: Compute the double centered distance matrix
57
+ B = -0.5 * H @ (D ** 2) @ H
58
+
59
+ # Step 3: Compute eigenvalues and eigenvectors
60
+ eigvals, eigvecs = np.linalg.eigh(B)
61
+
62
+ # Step 4: Sort eigenvalues and eigenvectors in descending order
63
+ idx = np.argsort(eigvals)[::-1]
64
+ eigvals = eigvals[idx]
65
+ eigvecs = eigvecs[:, idx]
66
+
67
+ # Step 5: Select only positive eigenvalues
68
+ w, = np.where(eigvals > 0)
69
+ if len(w) > 0:
70
+ L = np.diag(np.sqrt(eigvals[w]))
71
+ V = eigvecs[:, w]
72
+ return V @ L # Return the MDS coordinates
73
+ else:
74
+ # Fallback if no positive eigenvalues
75
+ return np.zeros((n, 1))
76
+
77
+
78
+ def _find_most_frequent_sequence(sequences):
79
+ """
80
+ Find the most frequent sequence in the dataset.
81
+
82
+ :param sequences: numpy array of sequences
83
+ :return: index of the most frequent sequence
84
+ """
85
+ from collections import Counter
86
+
87
+ # Convert sequences to tuples for hashing
88
+ seq_tuples = [tuple(seq) for seq in sequences]
89
+
90
+ # Count frequencies
91
+ counter = Counter(seq_tuples)
92
+
93
+ # Find the most frequent sequence
94
+ most_frequent = counter.most_common(1)[0][0]
95
+
96
+ # Find the index of this sequence in the original array
97
+ for i, seq in enumerate(seq_tuples):
98
+ if seq == most_frequent:
99
+ return i
100
+
101
+ return 0 # Fallback
102
+
103
+
104
+ def _select_sequences_subset(seqdata, sequence_selection, n_sequences, sort_by, sort_by_weight, weights, mask=None):
105
+ """
106
+ Select a subset of sequences based on the selection method.
107
+
108
+ :param seqdata: SequenceData object
109
+ :param sequence_selection: Selection method ("all", "first_n", "last_n", or list of IDs)
110
+ :param n_sequences: Number of sequences for "first_n" or "last_n"
111
+ :param sort_by: Sorting method to use before selection
112
+ :param sort_by_weight: Whether to sort by weight
113
+ :param weights: Sequence weights
114
+ :param mask: Optional mask for pre-filtering sequences
115
+ :return: Boolean mask for selected sequences
116
+ """
117
+ # Start with all sequences or pre-filtered mask
118
+ if mask is None:
119
+ mask = np.ones(len(seqdata.values), dtype=bool)
120
+
121
+ # If "all", return the current mask
122
+ if sequence_selection == "all":
123
+ return mask
124
+
125
+ # Get indices of sequences that pass the mask
126
+ valid_indices = np.where(mask)[0]
127
+
128
+ # Handle ID list selection
129
+ if isinstance(sequence_selection, list):
130
+ # Convert list to set for faster lookup
131
+ selected_ids = set(sequence_selection)
132
+
133
+ # Find indices of sequences with matching IDs
134
+ selected_mask = np.zeros(len(seqdata.values), dtype=bool)
135
+ if hasattr(seqdata, 'ids') and seqdata.ids is not None:
136
+ for i in valid_indices:
137
+ if seqdata.ids[i] in selected_ids:
138
+ selected_mask[i] = True
139
+ else:
140
+ print("Warning: sequence_selection provided as ID list but seqdata has no IDs. Using all sequences.")
141
+ return mask
142
+
143
+ return selected_mask
144
+
145
+ # For "first_n" or "last_n", we need to sort first
146
+ if sequence_selection in ["first_n", "last_n"]:
147
+ # Get the subset of data based on current mask
148
+ subset_seqdata = seqdata
149
+ subset_weights = weights
150
+
151
+ if not np.all(mask):
152
+ # Create subset if mask is not all True
153
+ subset_values = seqdata.values[mask]
154
+ subset_ids = seqdata.ids[mask] if hasattr(seqdata, 'ids') and seqdata.ids is not None else None
155
+
156
+ # Use original seqdata for structure, just work with filtered values
157
+ subset_seqdata = seqdata # Keep original structure
158
+
159
+ if weights is not None:
160
+ subset_weights = weights[mask]
161
+
162
+ # Apply sorting to get the order
163
+ distance_matrix = None
164
+ if sort_by in ["mds", "distance_to_most_frequent"]:
165
+ try:
166
+ from sequenzo.dissimilarity_measures.get_distance_matrix import get_distance_matrix
167
+ distance_matrix = get_distance_matrix(
168
+ seqdata=subset_seqdata,
169
+ method="OM",
170
+ sm="CONSTANT",
171
+ indel="auto"
172
+ )
173
+ if hasattr(distance_matrix, 'values'):
174
+ distance_matrix = distance_matrix.values
175
+ except ImportError:
176
+ print(f"Warning: Cannot compute distance matrix for '{sort_by}' sorting. Using unsorted order.")
177
+ sort_by = "unsorted"
178
+
179
+ # Apply sorting to the masked subset
180
+ if sort_by_weight and subset_weights is not None:
181
+ # Sort by weight on the subset
182
+ sorted_indices = np.argsort(-subset_weights)
183
+ else:
184
+ # Sort on the subset values
185
+ if sort_by == "unsorted" or sort_by == "none":
186
+ sorted_indices = np.arange(len(valid_indices))
187
+ elif sort_by == "lexicographic":
188
+ subset_values = seqdata.values[mask]
189
+ vals = subset_values.astype(float, copy=True)
190
+ vals = np.nan_to_num(vals, nan=np.inf)
191
+ sorted_indices = np.lexsort(vals.T[::-1])
192
+ elif sort_by in ["mds", "distance_to_most_frequent"]:
193
+ # For complex sorting that requires distance matrix,
194
+ # we'll fall back to simple lexicographic for now
195
+ subset_values = seqdata.values[mask]
196
+ vals = subset_values.astype(float, copy=True)
197
+ vals = np.nan_to_num(vals, nan=np.inf)
198
+ sorted_indices = np.lexsort(vals.T[::-1])
199
+ print(f"Warning: {sort_by} sorting simplified to lexicographic for sequence selection")
200
+ else:
201
+ sorted_indices = np.arange(len(valid_indices))
202
+
203
+ # Select first_n or last_n
204
+ n_available = len(sorted_indices)
205
+ n_to_select = min(n_sequences, n_available)
206
+
207
+ if sequence_selection == "first_n":
208
+ selected_subset_indices = sorted_indices[:n_to_select]
209
+ elif sequence_selection == "last_n":
210
+ selected_subset_indices = sorted_indices[-n_to_select:]
211
+
212
+ # Map back to original indices
213
+ original_indices = valid_indices[selected_subset_indices]
214
+
215
+ # Create final mask
216
+ final_mask = np.zeros(len(seqdata.values), dtype=bool)
217
+ final_mask[original_indices] = True
218
+
219
+ return final_mask
220
+
221
+ else:
222
+ raise ValueError(f"Unsupported sequence_selection: {sequence_selection}. "
223
+ f"Supported options: 'all', 'first_n', 'last_n', or list of IDs")
224
+
225
+
226
+ def sort_sequences_by_method(seqdata, method="unsorted", mask=None, distance_matrix=None, weights=None):
227
+ """
228
+ Sort sequences in SequenceData based on specified method.
229
+
230
+ :param seqdata: SequenceData object
231
+ :param method: str, sorting method - "unsorted", "lexicographic", "mds", "distance_to_most_frequent"
232
+ :param mask: np.array(bool), if provided, sort only this subset
233
+ :param distance_matrix: np.array, required for "mds" and "distance_to_most_frequent" methods
234
+ :param weights: np.array, optional weights for sequences
235
+ :return: np.array sorting indices (relative to original order)
236
+ """
237
+ values = seqdata.values.copy()
238
+
239
+ n_sequences = len(values) if mask is None else int(np.sum(mask))
240
+
241
+ if mask is not None:
242
+ values = values[mask]
243
+ if distance_matrix is not None:
244
+ # Only slice if distance_matrix is for the full sample
245
+ if distance_matrix.shape[0] != n_sequences:
246
+ masked_indices = np.where(mask)[0]
247
+ distance_matrix = distance_matrix[np.ix_(masked_indices, masked_indices)]
248
+
249
+ if method == "unsorted" or method == "none":
250
+ # Keep original order (R default)
251
+ return np.arange(n_sequences)
252
+
253
+ elif method == "lexicographic":
254
+ # Lexicographic sorting (NaN-safe)
255
+ vals = values.astype(float, copy=True)
256
+ # Push NaNs to the end for sorting
257
+ vals = np.nan_to_num(vals, nan=np.inf)
258
+ return np.lexsort(vals.T[::-1])
259
+
260
+ elif method == "mds":
261
+ # MDS first dimension sorting
262
+ if distance_matrix is None:
263
+ raise ValueError("Distance matrix is required for MDS sorting")
264
+
265
+ # TODO: Support weighted MDS (TraMineR's wcmdscale analogue) when weights are provided.
266
+ # Compute MDS coordinates
267
+ mds_coords = _cmdscale(distance_matrix)
268
+
269
+ # Sort by first MDS dimension
270
+ return np.argsort(mds_coords[:, 0])
271
+
272
+ elif method == "distance_to_most_frequent":
273
+ # Sort by distance to most frequent sequence
274
+ if distance_matrix is None:
275
+ raise ValueError("Distance matrix is required for distance_to_most_frequent sorting")
276
+
277
+ # Find most frequent sequence
278
+ most_freq_idx = _find_most_frequent_sequence(values)
279
+
280
+ # Get distances to most frequent sequence
281
+ distances = distance_matrix[most_freq_idx, :]
282
+
283
+ # Sort by distance (ascending)
284
+ return np.argsort(distances)
285
+
286
+ else:
287
+ raise ValueError(f"Unsupported sorting method: {method}. "
288
+ f"Supported methods are: 'unsorted', 'lexicographic', 'mds', 'distance_to_most_frequent'")
289
+
290
+
291
def plot_sequence_index(seqdata: SequenceData,
                        # Grouping parameters
                        group_by_column=None,
                        group_dataframe=None,
                        group_column_name=None,
                        group_labels=None,
                        # Other parameters
                        sort_by="lexicographic",
                        sort_by_weight=False,
                        weights="auto",
                        figsize=(10, 6),
                        plot_style="standard",
                        title=None,
                        xlabel="Time",
                        ylabel="Sequences",
                        save_as=None,
                        dpi=200,
                        layout='column',
                        nrows: int = None,
                        ncols: int = None,
                        group_order=None,
                        sort_groups='auto',
                        fontsize=12,
                        show_group_titles: bool = True,
                        include_legend: bool = True,
                        sequence_selection="all",
                        n_sequences=10,
                        show_sequence_ids=False
                        ):
    """Creates sequence index plots, optionally grouped by categories.

    Each sequence is drawn as one horizontal row of an image, coloured by state.
    Without grouping information a single plot is produced (delegated to
    ``_sequence_index_plot_single``); otherwise one subplot per group is drawn.

    **Two API modes for grouping:**

    1. **Simplified API** (when grouping info is already in the data):
       ```python
       plot_sequence_index(seqdata, group_by_column="Cluster", group_labels=cluster_labels)
       ```

    2. **Complete API** (when grouping info is in a separate dataframe):
       ```python
       plot_sequence_index(seqdata, group_dataframe=membership_df,
                           group_column_name="Cluster", group_labels=cluster_labels)
       ```

    :param seqdata: SequenceData object containing sequence information

    :param group_by_column: (str, optional) Column name from seqdata.data to group by.
        Use this when grouping information is already in your data.
        Example: "Cluster", "sex", "education"
    :param group_dataframe: (pd.DataFrame, optional) Separate dataframe containing grouping information.
        The ID column is "Entity ID" if present, otherwise the first column.
    :param group_column_name: (str, optional) Name of the grouping column in group_dataframe.
        Required when using group_dataframe.
    :param group_labels: (dict, optional) Maps original group values to display labels.
        Example: {1: "Late Family Formation", 2: "Early Partnership"}
        Every value present in the grouping column must have an entry.

    :param sort_by: Sorting method for sequences within groups:
        - 'unsorted' or 'none': Keep original order
        - 'lexicographic': Sort sequences lexicographically (NaN sorts last)
        - 'mds', 'distance_to_most_frequent': currently fall back to lexicographic
          sorting in this function and print a warning
    :param sort_by_weight: If True and weights are available, sort sequences by weight
        (descending); overrides sort_by
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
    :param figsize: Size of each subplot (only used when plot_style="custom")
    :param plot_style: Plot aspect style:
        - 'standard': (10, 6)
        - 'compact': (8, 8)
        - 'wide': (12, 4)
        - 'narrow': (8, 10)
        - 'custom': Use the provided figsize parameter
    :param title: Common title for the figure (omitted when None)
    :param xlabel: Label for the x-axis
    :param ylabel: Label for the y-axis (drawn on the first subplot of each row)
    :param save_as: Output path. For grouped plots '.png' is appended when the path
        does not end in .png/.jpg/.jpeg/.pdf
    :param dpi: DPI for saved image
    :param layout: Layout style forwarded to determine_layout ('column' or 'grid')
    :param nrows: Optional number of subplot rows, forwarded to determine_layout
    :param ncols: Optional number of subplot columns, forwarded to determine_layout
    :param group_order: List, manually specify group order (overrides sort_groups);
        groups not listed are excluded with a warning
    :param sort_groups: String, sorting method: 'auto'/'numeric' (smart_sort_groups),
        'alpha' (alphabetical), 'none' (original order)
    :param fontsize: Base font size for text elements (titles use fontsize+2, ticks use fontsize-2)
    :param show_group_titles: Whether to show group titles
    :param include_legend: Whether to include legend in the plot (True by default)
    :param sequence_selection: Method for selecting sequences to visualize:
        - "all": Show all sequences (default)
        - "first_n": Show first n sequences from each group
        - "last_n": Show last n sequences from each group
        - list: List of specific sequence IDs to show
    :param n_sequences: Number of sequences to show when using "first_n" or "last_n" (default: 10)
    :param show_sequence_ids: If True, show actual sequence IDs on y-axis instead of sequence numbers.

    :raises ValueError: for an unknown plot_style, a 'custom' style left at the default
        figsize, an unknown group_by_column, incomplete group_labels, a weights vector
        of the wrong length, an unknown sort_groups value, or when no group remains to plot.
    """
    # Resolve the per-subplot size from the requested style
    style_sizes = {
        'standard': (10, 6),  # Balanced view
        'compact': (8, 8),    # More square, like R plots
        'wide': (12, 4),      # Wide, emphasizes time
        'narrow': (8, 10),    # Moderately vertical
        'custom': figsize     # User-provided
    }

    if plot_style not in style_sizes:
        raise ValueError(f"Invalid plot_style '{plot_style}'. "
                         f"Supported styles: {list(style_sizes.keys())}")

    # 'custom' only makes sense with an explicitly chosen size
    if plot_style == 'custom' and figsize == (10, 6):
        raise ValueError(
            "When using plot_style='custom', you must explicitly provide a figsize parameter "
            "that differs from the default (10, 6). "
            "Suggested custom sizes:\n"
            " - For wide plots: figsize=(15, 5)\n"
            " - For tall plots: figsize=(7, 12)\n"
            " - For square plots: figsize=(9, 9)\n"
            " - For small plots: figsize=(6, 4)\n"
            "Example: plot_sequence_index(data, plot_style='custom', figsize=(12, 8))"
        )

    actual_figsize = style_sizes[plot_style]

    # Tracks whether group_labels has already been mapped onto the grouping column.
    # Mapping twice would validate the *display labels* against the dict keys and
    # wrongly raise "missing mappings".
    labels_applied = False

    # Simplified API: derive the grouping dataframe from a column of seqdata.data
    if group_by_column is not None:
        if group_by_column not in seqdata.data.columns:
            available_cols = [col for col in seqdata.data.columns
                              if col not in seqdata.time and col != seqdata.id_col]
            raise ValueError(
                f"Column '{group_by_column}' not found in the data. "
                f"Available columns for grouping: {available_cols}"
            )

        group_dataframe = seqdata.data[[seqdata.id_col, group_by_column]].copy()
        group_dataframe.columns = ['Entity ID', 'Category']
        group_column_name = 'Category'

        unique_values = seqdata.data[group_by_column].unique()

        if group_labels is not None:
            missing_keys = set(unique_values) - set(group_labels.keys())
            if missing_keys:
                raise ValueError(
                    f"group_labels missing mappings for values: {missing_keys}. "
                    f"Please provide labels for all unique values in '{group_by_column}': {sorted(unique_values)}"
                )
            group_dataframe['Category'] = group_dataframe['Category'].map(group_labels)
            labels_applied = True
        # Without group_labels the original values (numeric or string) are used as titles.

        print(f"[>] Creating grouped plots by '{group_by_column}' with {len(unique_values)} categories")

    # No grouping information: delegate to the single-plot implementation
    if group_dataframe is None or group_column_name is None:
        return _sequence_index_plot_single(
            seqdata,
            sort_by=sort_by,
            sort_by_weight=sort_by_weight,
            weights=weights,
            figsize=actual_figsize,
            plot_style=plot_style,
            title=title,
            xlabel=xlabel,
            ylabel=ylabel,
            save_as=save_as,
            dpi=dpi,
            fontsize=fontsize,
            include_legend=include_legend,
            sequence_selection=sequence_selection,
            n_sequences=n_sequences,
            show_sequence_ids=show_sequence_ids,
        )

    # Process weights
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    # ID column: prefer "Entity ID", otherwise fall back to the first column
    id_col_name = "Entity ID" if "Entity ID" in group_dataframe.columns else group_dataframe.columns[0]

    # Apply group_labels for the group_dataframe API (skipped if already applied above)
    if group_labels is not None and not labels_applied and group_column_name in group_dataframe.columns:
        unique_values = group_dataframe[group_column_name].unique()
        missing_keys = set(unique_values) - set(group_labels.keys())
        if missing_keys:
            raise ValueError(
                f"group_labels missing mappings for values: {missing_keys}. "
                f"Please provide labels for all unique values in '{group_column_name}': {sorted(unique_values)}"
            )
        group_dataframe = group_dataframe.copy()  # Avoid modifying the caller's dataframe
        group_dataframe[group_column_name] = group_dataframe[group_column_name].map(group_labels)

    # Determine which groups to draw, and in which order
    present_groups = group_dataframe[group_column_name].unique()
    if group_order:
        # Manual order; silently drop names that do not exist, warn about omitted groups
        groups = [g for g in group_order if g in present_groups]
        missing_groups = [g for g in present_groups if g not in group_order]
        if missing_groups:
            print(f"[Warning] Groups not in group_order will be excluded: {missing_groups}")
    elif sort_groups == 'numeric' or sort_groups == 'auto':
        groups = smart_sort_groups(present_groups)
    elif sort_groups == 'alpha':
        groups = sorted(present_groups)
    elif sort_groups == 'none':
        groups = list(present_groups)
    else:
        raise ValueError(f"Invalid sort_groups value: {sort_groups}. Use 'auto', 'numeric', 'alpha', or 'none'.")

    num_groups = len(groups)
    if num_groups == 0:
        # Nothing to lay out; fail early with a clear message instead of a
        # downstream subplot/NameError failure.
        raise ValueError("No groups to plot: the grouping column is empty or "
                         "group_order excludes every existing group.")

    # Calculate layout based on number of groups and specified layout
    nrows, ncols = determine_layout(num_groups, layout=layout, nrows=nrows, ncols=ncols)

    # squeeze=False guarantees a 2-D array of Axes even for a 1x1 layout,
    # so .flatten() is always valid.
    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows),
        gridspec_kw={'wspace': 0.15, 'hspace': 0.25},  # Reduced spacing for tighter layout
        squeeze=False
    )
    axes = axes.flatten()

    # Weight totals are only shown when seqdata carries non-trivial weights
    original_weights = getattr(seqdata, "weights", None)
    has_effective_weights = original_weights is not None and not np.allclose(original_weights, 1.0)

    has_ids = hasattr(seqdata, 'ids') and seqdata.ids is not None
    max_ticks = 8 if plot_style == "narrow" else 11  # Fewer ticks for narrow plots

    # Create a plot for each group
    for i, group in enumerate(groups):
        ax = axes[i]

        # Get IDs for this group and match them with the sequence data
        group_ids = group_dataframe[group_dataframe[group_column_name] == group][id_col_name].values
        mask = np.isin(seqdata.ids, group_ids)
        if not np.any(mask):
            print(f"Warning: No matching sequences found for group '{group}'")
            ax.axis('off')  # Do not leave an empty framed axes behind
            continue

        # Apply sequence selection to this group
        mask = _select_sequences_subset(seqdata, sequence_selection, n_sequences,
                                        sort_by, sort_by_weight, weights, mask)

        group_sequences = seqdata.values[mask]
        group_ids_for_labels = seqdata.ids[mask] if (has_ids and show_sequence_ids) else None
        group_weights = weights[mask] if weights is not None else None

        # Choose the row order within the group
        if sort_by_weight and group_weights is not None:
            # Sort by weight (descending)
            sorted_indices = np.argsort(-group_weights)
        elif sort_by in ("lexicographic", "mds", "distance_to_most_frequent"):
            if sort_by != "lexicographic":
                # Distance-based orderings are not computed here; fall back
                print(f"Warning: {sort_by} sorting simplified to lexicographic for grouped plots with sequence selection")
            # NaN-safe lexicographic sort: push NaNs to the end
            vals = np.nan_to_num(group_sequences.astype(float, copy=True), nan=np.inf)
            sorted_indices = np.lexsort(vals.T[::-1])
        else:
            # unsorted or other methods
            sorted_indices = np.arange(len(group_sequences))

        sorted_data = group_sequences[sorted_indices]
        sorted_group_ids = (group_ids_for_labels[sorted_indices]
                            if group_ids_for_labels is not None else None)

        # Values below 1 are treated as missing and masked out of the image
        data = sorted_data.astype(float)
        data[data < 1] = np.nan

        # Check for all-missing or all-invalid data
        if np.all(~np.isfinite(data)):
            print(f"Warning: all values missing/invalid for group '{group}'")
            ax.axis('off')
            continue

        ax.imshow(np.ma.masked_invalid(data), aspect='auto', cmap=seqdata.get_colormap(),
                  interpolation='nearest', vmin=1, vmax=len(seqdata.states))

        # Remove grid lines
        ax.grid(False)

        # Set up time labels
        set_up_time_labels_for_x_axis(seqdata, ax)

        # Y-axis ticks: evenly spaced, including the last sequence
        num_sequences = sorted_data.shape[0]
        use_id_labels = show_sequence_ids and sorted_group_ids is not None

        if use_id_labels and num_sequences <= 10:
            # Few sequences: label every row with its ID
            ytick_positions = np.arange(num_sequences)
            ytick_labels = [str(sid) for sid in sorted_group_ids]
        else:
            num_ticks = min(max_ticks, num_sequences)
            ytick_positions = np.unique(
                np.linspace(0, num_sequences - 1, num=num_ticks, dtype=int)
            )
            if use_id_labels:
                ytick_labels = [str(sorted_group_ids[pos]) for pos in ytick_positions]
            else:
                # Default behavior: 1-based sequence numbers
                ytick_labels = (ytick_positions + 1).astype(int)

        ax.set_yticks(ytick_positions)
        ax.set_yticklabels(ytick_labels, fontsize=fontsize-2, color='black')

        # Customize axis style
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        for side in ('left', 'bottom'):
            ax.spines[side].set_color('gray')
            ax.spines[side].set_linewidth(0.7)
            # Move spines slightly away from the plot area for better aesthetics
            ax.spines[side].set_position(('outward', 5))

        # Ensure ticks are always visible regardless of plot style
        ax.tick_params(axis='x', colors='gray', length=4, width=0.7, which='major')
        ax.tick_params(axis='y', colors='gray', length=4, width=0.7, which='major')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')
        ax.tick_params(axis='both', which='major', direction='out')

        # Group title, with the weight total when effective weights exist
        if has_effective_weights and group_weights is not None:
            sum_w = float(group_weights.sum())
            group_title = f"{group} (n = {num_sequences}, total weight = {sum_w:.1f})"
        else:
            group_title = f"{group} (n = {num_sequences})"
        if show_group_titles:
            show_plot_title(ax, group_title, show=True, fontsize=fontsize, loc='right')

        # Y label only on the first subplot of each row; x label on every subplot
        if i % ncols == 0:
            ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=10, color='black')
        ax.set_xlabel(xlabel, fontsize=fontsize, labelpad=10, color='black')

    # Hide unused subplots (indexed from num_groups so it never depends on the loop variable)
    for j in range(num_groups, len(axes)):
        axes[j].set_visible(False)

    # Add a common title if provided
    if title:
        fig.suptitle(title, fontsize=fontsize+2, y=1.02)

    # Adjust layout to remove tight_layout warning and eliminate extra right space
    fig.subplots_adjust(wspace=0.15, hspace=0.25, bottom=0.1, top=0.9, right=0.98, left=0.08)

    # Save main figure to memory
    main_buffer = save_figure_to_buffer(fig, dpi=dpi)

    # Default to PNG when the output path has no recognised image extension
    if save_as and not save_as.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
        save_as = save_as + '.png'

    if include_legend:
        # Create standalone legend
        colors = seqdata.color_map_by_label
        legend_buffer = create_standalone_legend(
            colors=colors,
            labels=seqdata.labels,
            ncol=min(5, len(seqdata.states)),
            figsize=(actual_figsize[0] * ncols, 1),
            fontsize=fontsize-2,
            dpi=dpi
        )

        # Combine plot with legend (also writes to save_as when given)
        combined_img = combine_plot_with_legend(
            main_buffer,
            legend_buffer,
            output_path=save_as,
            dpi=dpi,
            padding=20
        )

        # Display combined image
        plt.figure(figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows + 1))
        plt.imshow(combined_img)
        plt.axis('off')
        plt.show()
        plt.close()
    else:
        # Save or show the main plot directly, without legend
        plt.figure(figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows))
        plt.imshow(main_buffer)
        plt.axis('off')

        if save_as:
            plt.savefig(save_as, dpi=dpi, bbox_inches='tight')
        plt.show()
        plt.close()
724
+
725
+
726
def _sequence_index_plot_single(seqdata: SequenceData,
                                sort_by="unsorted",
                                sort_by_weight=False,
                                weights="auto",
                                figsize=(10, 6),
                                plot_style="standard",
                                title=None,
                                xlabel="Time",
                                ylabel="Sequences",
                                save_as=None,
                                dpi=200,
                                fontsize=12,
                                include_legend=True,
                                sequence_selection="all",
                                n_sequences=10,
                                show_sequence_ids=False):
    """Efficiently creates a single sequence index plot using `imshow`.

    :param seqdata: SequenceData object containing sequence information
    :param sort_by: Sorting method. 'lexicographic' sorts rows lexicographically (NaN last);
        'mds' and 'distance_to_most_frequent' currently fall back to lexicographic
        sorting with a printed warning; any other value keeps the original order.
    :param sort_by_weight: If True and weights are available, sort sequences by weight
        (descending); overrides sort_by
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
    :param figsize: (tuple): Size of the figure (only used when plot_style="custom").
    :param plot_style: Plot aspect style ('standard', 'compact', 'wide', 'narrow', 'custom')
    :param title: (str): Title for the plot; the sequence count (and the weight total,
        when effective weights exist) is appended. No title is drawn when None.
    :param xlabel: (str): Label for the x-axis.
    :param ylabel: (str): Label for the y-axis.
    :param save_as: File path to save the plot (forwarded to save_and_show_results)
    :param dpi: DPI for saved image
    :param fontsize: Base font size (title uses fontsize+2, tick labels fontsize-2)
    :param include_legend: Whether to include legend in the plot (True by default)
    :param sequence_selection: Method for selecting sequences ("all", "first_n", "last_n", or list of IDs)
    :param n_sequences: Number of sequences for "first_n" or "last_n"
    :param show_sequence_ids: If True, show actual sequence IDs on y-axis instead of sequence numbers

    :raises ValueError: for an unknown plot_style, a 'custom' style left at the default
        figsize, or a weights vector whose length differs from the number of sequences.
    :return None.
    """
    # Resolve the figure size from the requested style
    style_sizes = {
        'standard': (10, 6),  # Balanced view
        'compact': (8, 8),    # More square, like R plots
        'wide': (12, 4),      # Wide, emphasizes time
        'narrow': (8, 10),    # Moderately vertical
        'custom': figsize     # User-provided
    }

    if plot_style not in style_sizes:
        raise ValueError(f"Invalid plot_style '{plot_style}'. "
                         f"Supported styles: {list(style_sizes.keys())}")

    # 'custom' only makes sense with an explicitly chosen size
    if plot_style == 'custom' and figsize == (10, 6):
        raise ValueError(
            "When using plot_style='custom', you must explicitly provide a figsize parameter "
            "that differs from the default (10, 6). "
            "Suggested custom sizes:\n"
            " - For wide plots: figsize=(15, 5)\n"
            " - For tall plots: figsize=(7, 12)\n"
            " - For square plots: figsize=(9, 9)\n"
            " - For small plots: figsize=(6, 4)\n"
            "Example: plot_sequence_index(data, plot_style='custom', figsize=(12, 8))"
        )

    actual_figsize = style_sizes[plot_style]

    # Process weights
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    # Boolean mask of the sequences to draw
    selection_mask = _select_sequences_subset(seqdata, sequence_selection, n_sequences,
                                              sort_by, sort_by_weight, weights)

    has_ids = hasattr(seqdata, 'ids') and seqdata.ids is not None
    selected_ids = None  # IDs aligned with sequence_values, for y-axis labels

    if not np.all(selection_mask):
        # Only a subset is shown: filter values, IDs and weights consistently
        sequence_values = seqdata.values[selection_mask].copy()
        if has_ids:
            selected_ids = seqdata.ids[selection_mask]
        if weights is not None:
            weights = weights[selection_mask]
    else:
        sequence_values = seqdata.values.copy()
        if has_ids:
            selected_ids = seqdata.ids

    # Choose the row order
    if sort_by_weight and weights is not None:
        # Sort by weight (descending)
        sorted_indices = np.argsort(-weights)
    elif sort_by in ("lexicographic", "mds", "distance_to_most_frequent"):
        if sort_by != "lexicographic":
            # Distance-based orderings are not computed here; fall back
            print(f"Warning: {sort_by} sorting simplified to lexicographic for sequence selection")
        # NaN-safe lexicographic sort: push NaNs to the end
        vals = np.nan_to_num(sequence_values.astype(float, copy=True), nan=np.inf)
        sorted_indices = np.lexsort(vals.T[::-1])
    else:
        # unsorted or other methods
        sorted_indices = np.arange(len(sequence_values))

    sorted_data = sequence_values[sorted_indices]

    # Track sorted IDs for y-axis labels if needed
    sorted_ids = None
    if selected_ids is not None and show_sequence_ids:
        sorted_ids = selected_ids[sorted_indices]

    fig, ax = plt.subplots(figsize=actual_figsize)

    # Values below 1 are treated as missing and masked out of the image
    data = sorted_data.astype(float)
    data[data < 1] = np.nan

    # Check for all-missing or all-invalid data
    if np.all(~np.isfinite(data)):
        print("Warning: all values missing/invalid in sequence data")
        ax.axis('off')
        # Release the figure: it is neither shown nor returned, so leaving it
        # registered with pyplot would leak it.
        plt.close(fig)
        return

    ax.imshow(np.ma.masked_invalid(data), aspect='auto', cmap=seqdata.get_colormap(),
              interpolation='nearest', vmin=1, vmax=len(seqdata.states))

    # Disable background grid and all axis guide lines
    ax.grid(False)

    # x label
    set_up_time_labels_for_x_axis(seqdata, ax)

    # Y-axis ticks: evenly spaced, including the last sequence
    num_sequences = sorted_data.shape[0]
    use_id_labels = show_sequence_ids and sorted_ids is not None
    max_ticks = 8 if plot_style == "narrow" else 11  # Fewer ticks for narrow plots

    if use_id_labels and num_sequences <= 10:
        # Few sequences: label every row with its ID
        ytick_positions = np.arange(num_sequences)
        ytick_labels = [str(sid) for sid in sorted_ids]
    else:
        num_ticks = min(max_ticks, num_sequences)
        ytick_positions = np.unique(
            np.linspace(0, num_sequences - 1, num=num_ticks, dtype=int)
        )
        if use_id_labels:
            ytick_labels = [str(sorted_ids[pos]) for pos in ytick_positions]
        else:
            # Default behavior: 1-based sequence numbers
            ytick_labels = (ytick_positions + 1).astype(int)

    ax.set_yticks(ytick_positions)
    ax.set_yticklabels(ytick_labels, fontsize=fontsize-2, color='black')

    # Customize axis line styles and ticks
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    for side in ('left', 'bottom'):
        ax.spines[side].set_color('gray')
        ax.spines[side].set_linewidth(0.7)
        # Move spines slightly away from the plot area for better aesthetics
        ax.spines[side].set_position(('outward', 5))

    # Ensure ticks are always visible regardless of plot style
    ax.tick_params(axis='x', colors='gray', length=4, width=0.7, which='major')
    ax.tick_params(axis='y', colors='gray', length=4, width=0.7, which='major')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.tick_params(axis='both', which='major', direction='out')

    # Add labels and title
    ax.set_xlabel(xlabel, fontsize=fontsize, labelpad=10, color='black')
    ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=10, color='black')

    # Title with the sequence count, plus the weight total when effective weights exist
    if title is not None:
        display_title = title

        original_weights = getattr(seqdata, "weights", None)
        if original_weights is not None and not np.allclose(original_weights, 1.0) and weights is not None:
            sum_w = float(weights.sum())
            display_title += f" (n = {num_sequences}, total weight = {sum_w:.1f})"
        else:
            display_title += f" (n = {num_sequences})"

        ax.set_title(display_title, fontsize=fontsize+2, color='black')

    # Use legend from SequenceData if requested
    if include_legend:
        ax.legend(*seqdata.get_legend(), bbox_to_anchor=(1.05, 1), loc='upper left')

    save_and_show_results(save_as, dpi=dpi)