sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
sequenzo/define_sequence_data.py
@@ -0,0 +1,1400 @@
+ """
+ @Author : 梁彧祺 Yuqi Liang, 李欣怡 Xinyi Li
+ @File : define_sequence_data.py
+ @Time : 05/02/2025 12:47
+ @Desc :
+
+ Optimized SequenceData class with integrated color scheme & legend handling.
+
+ Note on `states` and `alphabet`:
+
+ In traditional sequence analysis tools (e.g., TraMineR), the `alphabet` refers to the full set of distinct states
+ found in the data and is often inferred automatically from the observed sequences.
+
+ However, in this implementation, we require the user to explicitly provide the set of `states`. This explicit control
+ is essential for ensuring consistent ordering of states, reproducibility of visualizations, and compatibility across
+ sequence datasets - especially when certain states may not appear in a given subset of the data.
+
+ As a result, `alphabet` is automatically set to `states` upon initialization, and kept as a semantic alias for clarity
+ and potential compatibility. Users should treat `states` as the definitive state space and are not required to provide
+ `alphabet` separately.
+
+ # ----------------------------------------------------------------------
+ # [Hint] Handling the ID column for sequence analysis
+ # ----------------------------------------------------------------------
+
+ # STEP 1: Check if your DataFrame already has a column representing unique entity IDs
+ # For example, check if "Entity ID" or "country" or any other identifier exists:
+ print(df.columns)
+
+ # If your data already has an ID column (e.g., 'Entity ID'), you can directly use it:
+ seq = SequenceData(df, id_col='Entity ID', time=..., states=...)
+
+ # ----------------------------------------------------------------------
+ # STEP 2: If your data has NO ID column, use the helper function below
+ # ----------------------------------------------------------------------
+ from sequenzo.utils import assign_unique_ids
+
+ # This will insert a new ID column named 'Entity ID' as the first column
+ df = assign_unique_ids(df, id_col_name='Entity ID')
+
+ # Optional: Save it for future use to avoid repeating this step
+ df.to_csv('your_dataset_with_ids.csv', index=False)
+
+ # Then you can use it like this:
+ seq = SequenceData(df, id_col='Entity ID', time=..., states=...)
+
+ """
+ # Only applicable to Python 3.7+, add this line to defer type annotation evaluation
+ from __future__ import annotations
+ # Define the public API at the top of the file
+ __all__ = ['SequenceData']
+
+ # Global variables and other imports that do not depend on pandas are placed here
+ import numpy as np
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import pandas as pd
+ from matplotlib.colors import ListedColormap
+ import re
+ from typing import Union
+
+
+ class SequenceData:
+     """
+     A class for defining and processing a sequence dataset for social sequence analysis.
+
+     This class provides:
+     - Sequence extraction & missing value handling.
+     - Automatic alphabet (state space) management.
+     - Efficient sequence-to-numeric conversion.
+     - Color mapping & legend storage for visualization.
+     """
+
+     def __init__(
+             self,
+             data: pd.DataFrame,
+             time: list,
+             states: list,
+             labels: list = None,
+             id_col: str = None,
+             weights: np.ndarray = None,
+             start: int = 1,
+             custom_colors: list = None,
+             additional_colors: dict = None,
+             missing_values: Union[None, int, float, str, list] = None
+     ):
+         """
+         Initialize the SequenceData object.
+
+         :param data: DataFrame containing sequence data.
+         :param time: List of columns containing time labels.
+         :param states: List of unique states (categories).
+         :param alphabet: Optional predefined state space.
+         :param labels: Labels for states (optional, for visualization).
+         :param id_col: Column name for row identifiers, which is very important for hierarchical clustering.
+         :param weights: Sequence weights (optional).
+         :param start: Starting time index (default: 1).
+         :param missing_handling: Dict specifying handling for missing values (left, right, gaps).
+         :param void: Symbol for void elements (default: "%").
+         :param nr: Symbol for missing values (default: "*").
+         :param custom_colors: Custom color palette for visualization.
+             If provided, should be a list of colors matching the number of states.
+             Colors can be hex strings (e.g., "#FF5733") or RGB tuples.
+         :param additional_colors: Dictionary to specify additional custom colors for specific states
+             while keeping the default palette for others. This is useful when you want to keep default colors
+             but assign custom colors to specific states (e.g., {"Other": "#BDBDBD"} to make "Other" gray).
+             Format: {state_name: color}, where color can be hex string (e.g., "#BDBDBD") or RGB tuple.
+             Example: additional_colors={"Other": "#BDBDBD", "Missing": "#E0E0E0"}
+         :param missing_values: Custom missing value indicators. Can be:
+             - None (default): Auto-detect missing values (NaN, string "Missing")
+             - Single value: e.g., 99, 9, 1000, "Missing"
+             - List: e.g., [99, 9, 1000] or ["Missing", "N/A"]
+             The system will also check for pandas NaN and string "Missing" (case-insensitive)
+             and warn if other missing values are detected.
+         """
+         # Import pandas here instead of the top of the file
+         import pandas as pd
+
+         self.data = data.copy()
+         self.time = time
+
+         # Remove all non-numeric characters from the year labels, e.g., "Year2020" -> "2020", or "C1" -> "1"
+         # self.cleaned_time = [re.sub(r'\D', '', str(year)) for year in time]
+         # No longer support this feature as we encourage users to clean the time variables.
+         # TODO: might implement a helper function for users to clean up their time variables.
+         self.cleaned_time = time
+         self.states = states.copy()
+         self.alphabet = states.copy() or sorted(set(data[time].stack().unique()))
+         self.labels = labels or [str(s) for s in states]
+         self.id_col = id_col
+         self.ids = np.array(self.data[self.id_col].values) if self.id_col else data.index
+         self.weights = weights
+         self._weights_provided = weights is not None  # Track if weights were originally provided
+         self.start = start
+         self.custom_colors = custom_colors
+         self.additional_colors = additional_colors or {}
+
+         # Process missing_values parameter: convert to list format
+         if missing_values is None:
+             self.missing_values = []
+         elif isinstance(missing_values, (list, tuple)):
+             self.missing_values = list(missing_values)
+         else:
+             self.missing_values = [missing_values]
+
+         # Track original number of states before processing missing values
+         # This helps us determine if custom_colors needs adjustment
+         self._original_num_states = len(self.states)
+         self._missing_auto_added = False  # Track if Missing was automatically added
+
+         # Validate parameters
+         self._validate_parameters()
+
+         # Validate additional_colors if provided
+         if self.additional_colors:
+             if self.custom_colors:
+                 raise ValueError(
+                     "[!] You cannot use both 'custom_colors' and 'additional_colors' at the same time.\n"
+                     " -> Use 'custom_colors' to specify all colors, or\n"
+                     " -> Use 'additional_colors' to assign custom colors to specific states while keeping default colors."
+                 )
+             # Check that all states in additional_colors exist in self.states
+             invalid_states = [state for state in self.additional_colors.keys() if state not in self.states]
+             if invalid_states:
+                 raise ValueError(
+                     f"[!] The following states in 'additional_colors' are not found in 'states': {invalid_states}\n"
+                     f" Available states: {self.states}"
+                 )
+
+         # Extract & process sequences
+         self.seqdata = self._extract_sequences()
+         self._process_missing_values()
+
+         # The following two lines of code are for visualization
+         self.state_to_label = dict(zip(self.states, self.labels))
+         self.label_to_state = dict(zip(self.labels, self.states))
+
+         self._convert_states()
+
+         # Assign colors & save legend
+         self._assign_colors()
+
+         # Automatically print dataset overview
+         print("\n[>] SequenceData initialized successfully! Here's a summary:")
+         self.describe()
+
+     @property
+     def values(self):
+         """Returns sequence data as a NumPy array, similar to xinyi_original_seqdef()."""
+         return self.seqdata.to_numpy(dtype=np.int32)
+
+     def __repr__(self):
+         return f"SequenceData({len(self.seqdata)} sequences, States: {self.states})"
+
+     def _validate_parameters(self):
+         """Ensures correct input parameters and checks consistency with data."""
+         # Check states, alphabet, labels
+         if not self.states:
+             raise ValueError("'states' must be provided.")
+
+         # Get all unique values from the data (including NaN)
+         # stack() drops NaN by default, so we need to check separately
+         # Convert to Python native types for consistent comparison across Python versions
+         # Python 3.12 may return numpy scalar types which need to be converted
+         stacked_values = self.data[self.time].stack().unique()
+         # Normalize values to Python native types for consistent comparison
+         data_values_list = []
+         for val in stacked_values:
+             # Skip NaN values (they're handled separately)
+             if pd.isna(val):
+                 continue
+             # Convert numpy scalar types to Python native types for consistent comparison
+             # This is important for Python 3.12 compatibility
+             if hasattr(val, 'item'):  # numpy scalar
+                 val = val.item()
+             data_values_list.append(val)
+
+         data_values_no_nan = set(data_values_list)
+         # Check if there are any NaN values in the data
+         has_nan_in_data = self.data[self.time].isna().any().any()
+
+         # Combine all data values (including NaN indicator if present)
+         all_data_values = data_values_no_nan.copy()
+         if has_nan_in_data:
+             all_data_values.add(np.nan)
+
+         # Validate that states are present in the actual data values
+         states_clean = [s for s in self.states if not pd.isna(s)]  # stack() removes nan values, so if states contains np.nan, it will cause an error
+         # Normalize states to Python native types for consistent comparison
+         states_clean_normalized = []
+         for s in states_clean:
+             if hasattr(s, 'item'):  # numpy scalar
+                 s = s.item()
+             states_clean_normalized.append(s)
+
+         unmatched_states = [s for s in data_values_no_nan if s not in states_clean_normalized]
+
+         if unmatched_states:
+             raise ValueError(
+                 f"[!] The following provided 'states' are not found in the data: {unmatched_states}\n"
+                 f" Hint: Check spelling or formatting. Data contains these unique values: {sorted([v for v in data_values_no_nan if not pd.isna(v)])}"
+             )
+
+         # Validate that all data values are present in the provided states (complete state space check)
+         # Exclude missing values from this check (NaN and user-specified missing_values)
+         # Normalize states to Python native types for consistent comparison
+         states_normalized = []
+         for s in self.states:
+             if pd.isna(s):
+                 states_normalized.append(s)
+             else:
+                 if hasattr(s, 'item'):  # numpy scalar
+                     s = s.item()
+                 states_normalized.append(s)
+         states_list = list(states_normalized)
+         states_set = set(states_normalized)
+         # Check for NaN in states
+         has_nan_in_states = any(pd.isna(s) for s in self.states)
+
+         # Get missing value indicators to exclude from the check
+         missing_indicators = set()
+         if has_nan_in_states:
+             missing_indicators.add(np.nan)
+         # Add user-specified missing_values
+         for mv in self.missing_values:
+             if pd.isna(mv):
+                 missing_indicators.add(np.nan)
+             else:
+                 missing_indicators.add(mv)
+         # Also check for string "Missing" (case-insensitive) in states
+         for s in self.states:
+             if isinstance(s, str) and s.lower() == 'missing':
+                 missing_indicators.add(s)
+         # Also check for string "NaN" (case-insensitive) in states
+         for s in self.states:
+             if isinstance(s, str) and s.lower() == 'nan':
+                 missing_indicators.add(s)
+
+         # Auto-detect string "NaN" (case-insensitive) in data as missing value
+         # Similar to how we handle string "Missing" in _process_missing_values
+         # Also check for string "Missing" (case-insensitive) in data
+         for dv in data_values_no_nan:
+             if isinstance(dv, str):
+                 dv_lower = dv.lower()
+                 if dv_lower == 'nan' or dv_lower == 'missing':
+                     missing_indicators.add(dv)
+
+         # Find data values that are not in states and not missing values
+         # Use more robust comparison that handles type mismatches
+         missing_from_states = []
+         for dv in all_data_values:
+             # Skip if it's a missing value indicator
+             if pd.isna(dv):
+                 # True pandas NaN should always be automatically handled, skip it
+                 continue
+             elif dv in missing_indicators:
+                 continue  # This is a known missing value, skip
+             elif isinstance(dv, str) and (dv.lower() == 'nan' or dv.lower() == 'missing'):
+                 # Double-check: if it's a string "NaN" or "Missing" (case-insensitive), skip it
+                 continue
+
+             # Check if dv is in states_set (both are now normalized to Python native types)
+             if dv not in states_set:
+                 missing_from_states.append(dv)
+
+         if missing_from_states:
+             # Format the error message nicely
+             data_values_display = sorted([v for v in data_values_no_nan if not pd.isna(v)])
+             if has_nan_in_data:
+                 data_values_display.append("NaN")
+
+             raise ValueError(
+                 f"[!] The following values found in the data are not included in your 'states' list: {missing_from_states}\n"
+                 f" Your provided states: {self.states}\n"
+                 f" All unique values in data: {data_values_display}\n"
+                 f" Hint: You must include ALL unique values from the data in your 'states' parameter.\n"
+                 f" Missing values (NaN or user-specified) are automatically handled, but all other data values must be in 'states'."
+             )
+
+         # ----------------
+         # Check if ID column is provided and valid
+         if self.id_col is not None and self.id_col not in self.data.columns:
+             raise ValueError(
+                 f"[!] You must specify a valid `id_col` parameter that exists in your dataset.\n"
+                 f" ID is required to uniquely identify each sequence (e.g., individuals).\n"
+                 f" -> Hint: If your data does not have an ID column yet, you can use the helper function:\n\n"
+                 f" from sequenzo.utils import assign_unique_ids\n"
+                 f" df = assign_unique_ids(df, id_col_name='Entity ID')\n"
+                 f" df.to_csv('your_dataset_with_ids.csv', index=False)\n\n"
+                 f" This will permanently assign unique IDs to your dataset for future use."
+             )
+
+         # Because it is already implemented at initialization time
+         # self.ids = np.array(self.data[self.id_col].values)
+
+         # Validate ID uniqueness and length
+         if len(self.ids) != len(self.data):
+             raise ValueError(f"[!] Length of ID column ('{self.id_col}') must match number of rows in the dataset.")
+         if len(np.unique(self.ids)) != len(self.ids):
+             raise ValueError(f"[!] IDs in column '{self.id_col}' must be unique.")
+
+         # ----------------
+         if self.alphabet and set(self.alphabet) != set(self.states):
+             raise ValueError("'alphabet' must match 'states'.")
+
+         if self.labels:
+             if len(self.labels) != len(self.states):
+                 # Provide detailed error message showing what's missing or extra
+                 states_len = len(self.states)
+                 labels_len = len(self.labels)
+
+                 if labels_len < states_len:
+                     missing_count = states_len - labels_len
+                     error_msg = (
+                         f"[!] 'labels' length ({labels_len}) is shorter than 'states' length ({states_len}).\n"
+                         f" Missing {missing_count} label(s).\n"
+                         f" Your states: {self.states}\n"
+                         f" Your labels: {self.labels}\n"
+                         f" Hint: You need to provide {states_len} labels, one for each state.\n"
+                         f" Example: labels = {[str(s) for s in self.states]}"
+                     )
+                 else:
+                     extra_count = labels_len - states_len
+                     error_msg = (
+                         f"[!] 'labels' length ({labels_len}) is longer than 'states' length ({states_len}).\n"
+                         f" You have {extra_count} extra label(s).\n"
+                         f" Your states: {self.states}\n"
+                         f" Your labels: {self.labels}\n"
+                         f" Hint: You should provide exactly {states_len} labels, one for each state.\n"
+                         f" The extra labels are: {self.labels[states_len:]}"
+                     )
+
+                 raise ValueError(error_msg)
+
+             # Ensure labels are all strings
+             non_string_labels = [label for label in self.labels if not isinstance(label, str)]
+             if non_string_labels:
+                 raise TypeError(
+                     f"[!] All elements in 'labels' must be strings for proper visualization (e.g., for legends or annotations).\n"
+                     f" Detected non-string labels: {non_string_labels}\n"
+                     f" Example fix: instead of using `labels = [1, 2, 3]`, use `labels = ['Single', 'Married', 'Divorced']`."
+                 )
+
+         # Check weights
+         if self.weights is not None:
+             if len(self.weights) != len(self.data):
+                 raise ValueError("'weights' must match the length of 'data'.")
+         else:
+             self.weights = np.ones(self.data.shape[0])
+
+     def _extract_sequences(self) -> pd.DataFrame:
+         """Extracts only relevant sequence columns."""
+         return self.data[self.time].copy()
+
+     def _process_missing_values(self):
+         """Handles missing values based on the specified rules and user-defined missing_values."""
+         # left, right, gaps = self.missing_handling.values()
+         #
+         # # Fill left-side missing values
+         # if not pd.isna(left) and left != "DEL":
+         #     self.seqdata.fillna(left, inplace=True)
+         #
+         # # Process right-side missing values
+         # if right == "DEL":
+         #     self.seqdata = self.seqdata.apply(lambda row: row.dropna().reset_index(drop=True), axis=1)
+         #
+         # # Process gaps (internal missing values)
+         # if not pd.isna(gaps) and gaps != "DEL":
+         #     self.seqdata.replace(self.nr, gaps, inplace=True)
+
+         # Collect all detected missing value indicators
+         detected_missing = []
+
+         # Check for pandas NaN values
+         has_pandas_nan = self.seqdata.isna().any().any()
+         if has_pandas_nan:
+             detected_missing.append("NaN (pandas)")
+
+         # Check for user-specified missing_values in the data
+         user_missing_found = []
+         for mv in self.missing_values:
+             if pd.isna(mv):
+                 # Handle NaN in missing_values list
+                 if has_pandas_nan and "NaN (pandas)" not in user_missing_found:
+                     user_missing_found.append("NaN (pandas)")
+             else:
+                 # Check if this missing value exists in the data
+                 if (self.seqdata == mv).any().any():
+                     user_missing_found.append(mv)
+
+         # Check for string "Missing" (case-insensitive) as missing indicator
+         # This handles cases where missing values are represented as the string "Missing" instead of NaN
+         # Only check if not already in user-specified missing_values
+         has_string_missing = False
+         string_missing_variants = []
+
+         # Check if "Missing" (case-insensitive) is already in user-specified missing_values
+         has_missing_string_in_user_spec = any(
+             isinstance(mv, str) and mv.lower() == 'missing' for mv in self.missing_values
+         )
+
+         if not has_missing_string_in_user_spec:
+             try:
+                 # Check case-insensitive "missing" strings
+                 missing_mask = self.seqdata.astype(str).str.lower() == 'missing'
+                 if missing_mask.any().any():
+                     has_string_missing = True
+                     # Find actual string values (preserving case)
+                     actual_values = self.seqdata[missing_mask].dropna().unique()
+                     string_missing_variants = [str(v) for v in actual_values if str(v).lower() == 'missing']
+             except (AttributeError, TypeError):
+                 # If conversion fails, check column by column
+                 try:
+                     for col in self.seqdata.columns:
+                         col_mask = self.seqdata[col].astype(str).str.lower() == 'missing'
+                         if col_mask.any():
+                             has_string_missing = True
+                             actual_values = self.seqdata.loc[col_mask, col].unique()
+                             for v in actual_values:
+                                 variant = str(v)
+                                 if variant.lower() == 'missing' and variant not in string_missing_variants:
+                                     string_missing_variants.append(variant)
+                 except:
+                     pass
+
+         if has_string_missing:
+             # Add unique string variants to detected missing (only if not already specified by user)
+             for variant in string_missing_variants:
+                 if variant not in detected_missing and variant not in user_missing_found:
+                     detected_missing.append(variant)
+
+         # Check for string "NaN" (case-insensitive) as missing indicator
+         # Similar to how we handle string "Missing"
+         # Only check if not already in user-specified missing_values
+         has_string_nan = False
+         string_nan_variants = []
+
+         # Check if "NaN" (case-insensitive) is already in user-specified missing_values
+         has_nan_string_in_user_spec = any(
+             isinstance(mv, str) and mv.lower() == 'nan' for mv in self.missing_values
+         )
+
+         if not has_nan_string_in_user_spec:
+             try:
+                 # Check case-insensitive "nan" strings
+                 nan_mask = self.seqdata.astype(str).str.lower() == 'nan'
+                 if nan_mask.any().any():
+                     has_string_nan = True
+                     # Find actual string values (preserving case)
+                     actual_values = self.seqdata[nan_mask].dropna().unique()
+                     string_nan_variants = [str(v) for v in actual_values if str(v).lower() == 'nan']
+             except (AttributeError, TypeError):
+                 # If conversion fails, check column by column
+                 try:
+                     for col in self.seqdata.columns:
+                         col_mask = self.seqdata[col].astype(str).str.lower() == 'nan'
+                         if col_mask.any():
+                             has_string_nan = True
+                             actual_values = self.seqdata.loc[col_mask, col].unique()
+                             for v in actual_values:
+                                 variant = str(v)
+                                 if variant.lower() == 'nan' and variant not in string_nan_variants:
+                                     string_nan_variants.append(variant)
+                 except:
+                     pass
+
+         if has_string_nan:
+             # Add unique string variants to detected missing (only if not already specified by user)
+             for variant in string_nan_variants:
+                 if variant not in detected_missing and variant not in user_missing_found:
+                     detected_missing.append(variant)
+
+         # Combine user-specified and auto-detected missing values
+         all_missing_values = list(set(self.missing_values + detected_missing))
+         # Remove NaN placeholders and add actual NaN check
+         if has_pandas_nan:
+             all_missing_values = [mv for mv in all_missing_values if mv != "NaN (pandas)"] + [np.nan]
+         else:
+             all_missing_values = [mv for mv in all_missing_values if mv != "NaN (pandas)"]
+
+         # Check if there are any missing values at all
+         has_any_missing = False
+         if has_pandas_nan:
+             has_any_missing = True
+         elif user_missing_found:
+             has_any_missing = True
+         elif has_string_missing:
+             has_any_missing = True
+         elif has_string_nan:
+             has_any_missing = True
+         else:
+             # Check if any user-specified missing_values exist in data
+             for mv in self.missing_values:
+                 if not pd.isna(mv):
+                     if (self.seqdata == mv).any().any():
+                         has_any_missing = True
+                         break
+
+         self.ismissing = has_any_missing
+
+         # Warn user if other missing values were detected beyond what they specified
+         if self.missing_values and detected_missing:
+             other_missing = [mv for mv in detected_missing if mv not in [str(m) for m in self.missing_values] and mv != "NaN (pandas)"]
+             if other_missing or (has_pandas_nan and not any(pd.isna(mv) for mv in self.missing_values)):
+                 print(
+                     f"[!] Warning: Detected additional missing value indicators in your data beyond those you specified.\n"
+                     f" You specified: {self.missing_values}\n"
+                     f" Additional missing values found: {other_missing + (['NaN'] if has_pandas_nan and not any(pd.isna(mv) for mv in self.missing_values) else [])}\n"
+                     f" Recommendation: Include these in the `missing_values` parameter for complete handling.\n"
+                     f" Example: missing_values={self.missing_values + other_missing + (['NaN'] if has_pandas_nan and not any(pd.isna(mv) for mv in self.missing_values) else [])}"
+                 )
+
+         # Determine the canonical missing representation for states/labels
+         # This will be used when adding missing to states if needed
+         canonical_missing_value = None
+         if has_pandas_nan:
+             canonical_missing_value = np.nan
+         elif string_missing_variants:
+             # Use the first variant (usually "Missing")
+             canonical_missing_value = string_missing_variants[0]
+         elif string_nan_variants:
+             # Use the first variant (usually "NaN")
+             canonical_missing_value = string_nan_variants[0]
+         elif user_missing_found:
+             # Use the first user-specified missing value that was found
+             canonical_missing_value = user_missing_found[0]
+         elif self.missing_values:
+             # Use the first user-specified missing value
+             canonical_missing_value = self.missing_values[0]
+
+         if self.ismissing:
+             # Check if states already contains any form of "Missing" or np.nan
+             # Check if states contains any representation of missing values
+             has_missing_state = False
+             for state in self.states:
+                 if pd.isna(state):
+                     has_missing_state = True
+                     break
+                 elif isinstance(state, str):
+                     # Check if state matches any missing value (case-insensitive for strings)
+                     state_lower = state.lower()
+                     if state_lower == "missing" or state_lower == "nan" or state in self.missing_values or state in user_missing_found:
+                         has_missing_state = True
+                         break
+                 elif state in self.missing_values or state in user_missing_found:
+                     has_missing_state = True
+                     break
+
+             # Also check labels
+             has_missing_label = any(
+                 (label.lower() == "missing" or label.lower() == "nan") or label in self.missing_values or label in user_missing_found
+                 for label in self.labels if isinstance(label, str)
+             ) or any(pd.isna(label) for label in self.labels)
+
+             if not has_missing_state and canonical_missing_value is not None:
+                 # Automatically determine if states are string type or numeric type
+                 if pd.isna(canonical_missing_value):
+                     example_missing = "np.nan"
+                     quote = ""
+                     missing_state_value = np.nan
+                 else:
+                     example_missing = f"'{canonical_missing_value}'" if isinstance(canonical_missing_value, str) else str(canonical_missing_value)
+                     quote = "'" if isinstance(canonical_missing_value, str) else ""
+                     missing_state_value = canonical_missing_value
+
+                 # Build description of missing types found
+                 missing_types = []
+                 if has_pandas_nan:
+                     missing_types.append("NaN (pandas)")
+                 if string_missing_variants:
+                     missing_types.extend([f"'{v}'" for v in string_missing_variants])
+                 if string_nan_variants:
+                     missing_types.extend([f"'{v}'" for v in string_nan_variants])
+                 if user_missing_found:
+                     missing_types.extend([str(v) for v in user_missing_found if v not in string_missing_variants and v not in string_nan_variants and not pd.isna(v)])
+                 missing_type_desc = ", ".join(missing_types) if missing_types else "missing values"
+
+                 missing_values_desc = ""
+                 if self.missing_values:
+                     missing_values_desc = f"\n You specified missing_values={self.missing_values}."
+
+                 print(
+                     f"[!] Detected missing values ({missing_type_desc}) in the sequence data.{missing_values_desc}\n"
+                     f" -> Automatically added {example_missing} to `states` and `labels` for compatibility.\n"
+                     " However, it's strongly recommended to manually include it when defining `states` and `labels`.\n"
+                     " For example:\n\n"
+                     f" states = [{quote}At Home{quote}, {quote}Left Home{quote}, {example_missing}]\n"
+                     f" labels = [{quote}At Home{quote}, {quote}Left Home{quote}, {quote}Missing{quote}]\n\n"
+                     " This ensures consistent color mapping and avoids unexpected visualization errors."
+                 )
+
+                 # Add missing to states
+                 self.states.append(missing_state_value)
+
+                 # Always ensure labels has the same length as states after appending missing state
+                 # Strategy:
+                 # 1. If labels already has "Missing", we need to ensure it's removed and re-added at the end
+                 # 2. We need to preserve labels for the original states (before adding missing)
+                 # 3. If labels length matches original states length, just replace any "Missing" and append
+                 # 4. If labels has extra elements, take only the first N (where N = original states count)
+
+                 # Remove any existing "Missing" labels (case-insensitive)
+                 labels_without_missing = [label for label in self.labels
+                                           if not (isinstance(label, str) and label.lower() == "missing")]
+
+                 # Ensure we have the correct number of labels for non-missing states
+                 # If labels_without_missing has fewer elements than original states, we're missing some labels
+                 # If it has more, we take only the first N that match original states
+                 if len(labels_without_missing) < self._original_num_states:
+                     # Not enough labels - this is unusual but we'll pad with generic labels
+                     while len(labels_without_missing) < self._original_num_states:
+                         labels_without_missing.append(f"State {len(labels_without_missing) + 1}")
+                 elif len(labels_without_missing) > self._original_num_states:
+                     # Too many labels - take only the first N
+                     labels_without_missing = labels_without_missing[:self._original_num_states]
+
+                 # Append "Missing" label at the end to match the appended missing state
+                 self.labels = labels_without_missing + ["Missing"]
+
+                 # Verify lengths match (safety check)
+                 if len(self.states) != len(self.labels):
+                     raise ValueError(
+                         f"Internal error: Length mismatch after adding missing state. "
+                         f"States length: {len(self.states)}, Labels length: {len(self.labels)}. "
+                         f"States: {self.states}, Labels: {self.labels}. "
+                         f"Original num states: {self._original_num_states}"
+                     )
+
+                 # Mark that Missing was automatically added
+                 self._missing_auto_added = True
+
+
+     def _convert_states(self):
+         """
+         Converts categorical states into numerical values for processing.
+         Note that the order has to be the same as when the user defines the states of the class,
+         as it is very important for visualization.
+         Otherwise, the colors will be assigned incorrectly.
+
+         For instance, self.states = ['Very Low', 'Low', 'Middle', 'High', 'Very High'], as the user defines when defining the class
+         but the older version here is {'High': 1, 'Low': 2, 'Middle': 3, 'Very High': 4, 'Very Low': 5}
+         """
+         correct_order = self.states
+
+         # Create the state mapping with correct order
+         self.state_mapping = {original_state: i + 1 for i, original_state in enumerate(self.states)}
+         # Keep the inverse mapping so that legends and plots can use numeric encoding
+         self.inverse_state_mapping = {v: k for k, v in self.state_mapping.items()}
+
+         # Apply the mapping
+         # Handle missing values: replace with the last index (which should be the missing state)
+         # Also handle user-specified missing_values that might not be in state_mapping
+         def map_value(x):
+             # First check if it's in the state mapping
+             if x in self.state_mapping:
+                 return self.state_mapping[x]
+             # Check if it's a pandas NaN
+             if pd.isna(x):
+                 return len(self.states)  # Last state should be missing
+             # Check if it's in user-specified missing_values
+             if x in self.missing_values or str(x).lower() == 'missing':
+                 # If missing value is in states, use its mapping; otherwise use last index
+                 if x in self.states:
+                     return self.state_mapping.get(x, len(self.states))
+                 else:
+                     return len(self.states)
+             # If not found, use last index as fallback (treat as missing)
+             return len(self.states)
+
+         try:
+             self.seqdata = self.seqdata.map(map_value)
+         except AttributeError:
+             self.seqdata = self.seqdata.applymap(map_value)
+
+         if self.ids is not None:
+             self.seqdata.index = self.ids
+
+     def _assign_colors(self, reverse_colors=True):
+         """Assigns a color palette using user-defined or default Spectral palette.
+
+         If missing values are present, automatically assigns a fixed gray color (#cfcccc)
+         to missing values and uses the existing color scheme for non-missing states.
+         """
+         num_states = len(self.states)
+
+         # Check if missing values are present
+         has_missing = self.ismissing
+         missing_gray_color = (0.811765, 0.8, 0.8)  # Fixed gray color for missing values (#cfcccc)
+
+         if has_missing:
+             # Count non-missing states for color palette generation
+             non_missing_states = num_states - 1
+
+             if self.custom_colors:
+                 # If user provided custom colors, check if they account for missing values
+                 if len(self.custom_colors) == num_states:
+                     # User provided colors for all states including missing - use as is
+                     color_list = self.custom_colors
+                 elif len(self.custom_colors) == non_missing_states:
+                     # User provided colors only for non-missing states - add gray for missing
+                     color_list = self.custom_colors + [missing_gray_color]
+                     if self._missing_auto_added:
+                         print(
+                             f"[!] Automatically added gray color (#cfcccc) for missing values.\n"
+                             f" -> You provided {len(self.custom_colors)} colors for {self._original_num_states} states, "
+                             f"but Missing was automatically added.\n"
+                             f" -> Added gray (#cfcccc) as the color for Missing state."
+                         )
+                 elif self._missing_auto_added and len(self.custom_colors) == self._original_num_states:
+                     # Missing was automatically added, and user provided colors for original states
+                     # Automatically add gray for the missing state
+                     color_list = self.custom_colors + [missing_gray_color]
+                     print(
+                         f"[!] Automatically added gray color (#cfcccc) for missing values.\n"
+                         f" -> You provided {len(self.custom_colors)} colors for {self._original_num_states} states, "
+                         f"but Missing was automatically added.\n"
+                         f" -> Added gray (#cfcccc) as the color for Missing state."
+                     )
+                 else:
+                     raise ValueError(
+                         f"Length of custom_colors ({len(self.custom_colors)}) must match "
+                         f"either total states ({num_states}) or non-missing states ({non_missing_states}).\n"
+                         f"Hint: If Missing was automatically added, you can either:\n"
+                         f" 1. Include 'Missing' in your states and labels when creating SequenceData, or\n"
+                         f" 2. Provide {non_missing_states} colors (without Missing) and we'll add gray automatically."
+                     )
+             else:
+                 # Generate colors for non-missing states and add gray for missing
+                 if non_missing_states <= 20:
+                     non_missing_color_list = sns.color_palette("Spectral", non_missing_states)
+                 else:
+                     # Use a more elegant color palette for many states - combination of viridis and pastel colors
+                     if non_missing_states <= 40:
+                         # Use viridis for up to 40 states (more colorful than cubehelix)
+                         non_missing_color_list = sns.color_palette("viridis", non_missing_states)
+                     else:
+                         # For very large state counts, use a custom palette combining multiple schemes
+                         viridis_colors = sns.color_palette("viridis", min(non_missing_states // 2, 20))
+                         pastel_colors = sns.color_palette("Set3", min(non_missing_states // 2, 12))
+                         tab20_colors = sns.color_palette("tab20", min(non_missing_states // 3, 20))
+
+                         # Combine and extend the palette
+                         combined_colors = viridis_colors + pastel_colors + tab20_colors
+                         # If we need more colors, cycle through the combined palette
+                         while len(combined_colors) < non_missing_states:
+                             combined_colors.extend(combined_colors[:min(len(combined_colors), non_missing_states - len(combined_colors))])
+
+                         non_missing_color_list = combined_colors[:non_missing_states]
+
+                 if reverse_colors:
+                     non_missing_color_list = list(reversed(non_missing_color_list))
+
+                 # Add fixed gray color for missing values at the end
+                 color_list = list(non_missing_color_list) + [missing_gray_color]
+         else:
+             # No missing values - use original logic
+             if self.custom_colors:
+                 if len(self.custom_colors) != num_states:
+                     raise ValueError("Length of custom_colors must match number of states.")
+                 color_list = self.custom_colors
+             else:
+                 if num_states <= 20:
+                     color_list = sns.color_palette("Spectral", num_states)
+                 else:
+                     # Use a more elegant color palette for many states - combination of viridis and pastel colors
807
+ if num_states <= 40:
808
+ # Use viridis for up to 40 states (more colorful than cubehelix)
809
+ color_list = sns.color_palette("viridis", num_states)
810
+ else:
811
+ # For very large state counts, use a custom palette combining multiple schemes
812
+ viridis_colors = sns.color_palette("viridis", min(num_states // 2, 20))
813
+ pastel_colors = sns.color_palette("Set3", min(num_states // 2, 12))
814
+ tab20_colors = sns.color_palette("tab20", min(num_states // 3, 20))
815
+
816
+ # Combine and extend the palette
817
+ combined_colors = viridis_colors + pastel_colors + tab20_colors
818
+ # If we need more colors, cycle through the combined palette
819
+ while len(combined_colors) < num_states:
820
+ combined_colors.extend(combined_colors[:min(len(combined_colors), num_states - len(combined_colors))])
821
+
822
+ color_list = combined_colors[:num_states]
823
+
824
+ if reverse_colors:
825
+ color_list = list(reversed(color_list))
826
+
827
+ # Apply additional_colors if specified (assign custom colors to specific states while keeping default colors)
828
+ if self.additional_colors:
829
+ color_list = list(color_list) # Make a copy to avoid modifying original
830
+ for state, custom_color in self.additional_colors.items():
831
+ if state in self.states:
832
+ state_index = self.states.index(state)
833
+ # Convert hex string to RGB tuple if needed
834
+ if isinstance(custom_color, str) and custom_color.startswith('#'):
835
+ # Convert hex to RGB tuple (values 0-1)
836
+ hex_color = custom_color.lstrip('#')
837
+ rgb = tuple(int(hex_color[i:i+2], 16) / 255.0 for i in (0, 2, 4))
838
+ color_list[state_index] = rgb
839
+ elif isinstance(custom_color, (tuple, list)) and len(custom_color) == 3:
840
+ # Values greater than 1 indicate a 0-255 RGB tuple; rescale to 0-1
841
+ if any(v > 1 for v in custom_color):
842
+ color_list[state_index] = tuple(v / 255.0 for v in custom_color)
843
+ else:
844
+ # Assume the values are already in the 0-1 range
845
+ color_list[state_index] = tuple(custom_color)
846
+ else:
847
+ color_list[state_index] = custom_color
848
+
849
+ # Previously keyed by state name: {state: color_list[i] for i, state in enumerate(self.states)}
850
+ # Keying by integer code (1, 2, 3, ...) aligns with imshow(vmin=1, vmax=N)
851
+ self.color_map = {i + 1: color_list[i] for i in range(num_states)}
852
+
853
+ # Construct color_map with label as key (for legend)
854
+ self.color_map_by_label = {
855
+ self.state_to_label[state]: self.color_map[self.state_mapping[state]]
856
+ for state in self.states
857
+ }
858
+
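To see how the pieces above fit together, here is a minimal standalone sketch of the palette generation and hex-to-RGB override, using made-up states and assuming seaborn is installed (this is an illustration of the same technique, not the class's own code path):

import seaborn as sns

# Hypothetical states
states = ["Very Low", "Low", "Middle", "High", "Very High"]
color_list = list(reversed(sns.color_palette("Spectral", len(states))))   # reverse_colors=True

# Override one state with a hex colour, mirroring the additional_colors handling
hex_color = "#BDBDBD".lstrip("#")
rgb = tuple(int(hex_color[i:i + 2], 16) / 255.0 for i in (0, 2, 4))
color_list[states.index("Middle")] = rgb

# Keys are 1-based integer codes, matching the state encoding used by the class
color_map = {i + 1: c for i, c in enumerate(color_list)}
print(color_map[3])   # ~(0.74, 0.74, 0.74), the overridden gray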
859
+ def get_colormap(self):
860
+ """Returns a ListedColormap for visualization."""
861
+ # return ListedColormap([self.color_map[state] for state in self.states])
862
+ return ListedColormap([self.color_map[i + 1] for i in range(len(self.states))])
863
+
864
+ def describe(self):
865
+ """
866
+ Prints an overview of the sequence dataset.
867
+
868
+ Note:
869
+ Printing 'missing_index' directly can cause issues in Jupyter Notebook/Lab if the list is long.
870
+ With thousands of sequences containing missing values, the full list can easily exceed the
871
+ IOPub data rate limit (1 MB/sec by default), which interrupts output to the client.
872
+ To avoid this, it is safer to display only a subset (e.g., the first 10) or add a 'verbose' flag to control output.
873
+ """
874
+ print(f"[>] Number of sequences: {len(self.seqdata)}")
875
+ print(f"[>] Number of time points: {self.n_steps}")
876
+
877
+ if self.ismissing:
878
+ lengths = self.seqdata.apply(lambda row: (row != len(self.states)).sum(), axis=1)
879
+ print(f"[>] Min/Max sequence length: {lengths.min()} / {lengths.max()}")
880
+
881
+ # Identify missing values and related IDs
882
+ stacked = self.seqdata.stack()
+ missing_locs = stacked[stacked == len(self.states)].index.get_level_values(0)
883
+ missing_count = len(missing_locs)
884
+ unique_missing_ids = missing_locs.unique().tolist()
885
+ print(f"[>] There are {missing_count} missing values across {len(unique_missing_ids)} sequences.")
886
+ print(f" First few missing sequence IDs: {unique_missing_ids[:10]} ...")
887
+
888
+ # Find and display sequences with the most missing points
889
+ missing_counts = self.seqdata.isin([len(self.states)]).sum(axis=1)
890
+ most_missing = missing_counts[missing_counts > 0].sort_values(ascending=False).head(5)
891
+ print("[>] Top sequences with the most missing time points:")
892
+ print(" (Each row shows a sequence ID and its number of missing values)\n")
893
+ print(most_missing.rename("Missing Count").to_frame().rename_axis("Sequence ID"))
894
+
895
+ else:
896
+ print(
897
+ f"[>] Min/Max sequence length: {self.seqdata.notna().sum(axis=1).min()} / {self.seqdata.notna().sum(axis=1).max()}")
898
+
899
+ print(f"[>] States: {self.states}")
900
+ print(f"[>] Labels: {self.labels}")
901
+
902
+ # Display weights information if weights were originally provided
903
+ if self._weights_provided:
904
+ weight_mean = np.mean(self.weights)
905
+ weight_std = np.std(self.weights)
906
+ print(f"[>] Weights: Provided (total weight={sum(self.weights):.3f}, mean={weight_mean:.3f}, std={weight_std:.3f})")
907
+ else:
908
+ print(f"[>] Weights: Not provided")
909
+
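The per-sequence missing counts reported above boil down to an isin/sum over the integer-coded frame. A toy sketch with hypothetical codes (3 states, where code 3 marks Missing; not package data):

import pandas as pd

coded = pd.DataFrame([[1, 2, 3, 3],
                      [1, 1, 1, 1],
                      [2, 3, 1, 2]], index=["A", "B", "C"])
missing_code = 3
per_row = coded.isin([missing_code]).sum(axis=1)
print(per_row[per_row > 0].sort_values(ascending=False))   # A: 2 missing, C: 1 missing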
910
+ def get_legend(self):
911
+ """Returns the legend handles and labels for visualization."""
912
+ # self.legend_handles = [plt.Rectangle((0, 0), 1, 1,
913
+ # color=self.color_map[state],
914
+ # label=label)
915
+ # for state, label in zip(self.states, self.labels)]
916
+ # return [handle for handle in self.legend_handles], self.labels
917
+
918
+ self.legend_handles = [
919
+ plt.Rectangle((0, 0), 1, 1,
920
+ color=self.color_map[i + 1],
921
+ label=self.labels[i])
922
+ for i in range(len(self.states))
923
+ ]
924
+ return self.legend_handles, self.labels
925
+
926
+ def to_dataframe(self) -> pd.DataFrame:
927
+ """Returns the processed sequence dataset as a DataFrame."""
928
+ return self.seqdata
929
+
930
+ def plot_legend(self, save_as=None, dpi=200):
931
+ """Displays the saved legend for sequence state colors."""
932
+ # Ensure legend handles exist even if get_legend() wasn't called
933
+ legend_handles = getattr(self, "legend_handles", None)
934
+ if not legend_handles:
935
+ legend_handles = [
936
+ plt.Rectangle((0, 0), 1, 1, color=self.color_map[i + 1], label=self.labels[i])
937
+ for i in range(len(self.states))
938
+ ]
939
+ self.legend_handles = legend_handles
940
+
941
+ fig, ax = plt.subplots(figsize=(2, 2))
942
+ ax.legend(handles=legend_handles, loc='center', title="States", fontsize=10)
943
+ ax.axis('off')
944
+
945
+ if save_as:
946
+ plt.savefig(save_as, dpi=dpi)
947
+ plt.show()
948
+ else:
949
+ plt.tight_layout()
950
+ plt.show()
951
+
952
+ # ------------------------------
953
+ # The following are for multidomain sequence analysis, especially for seqdomassoc()
954
+
955
+ @property
956
+ def n_sequences(self):
957
+ """Returns number of sequences (rows)."""
958
+ return self.seqdata.shape[0]
959
+
960
+ @property
961
+ def n_steps(self):
962
+ """Returns sequence length (columns)."""
963
+ return self.seqdata.shape[1]
964
+
965
+ @property
966
+ def alphabet(self):
967
+ """Returns state alphabet."""
968
+ return self._alphabet
969
+
970
+ @alphabet.setter
971
+ def alphabet(self, val):
972
+ self._alphabet = val
973
+
974
+ @property
975
+ def sequences(self):
976
+ """Returns sequences as a list of lists (one list per sequence)."""
977
+ return [list(row) for row in self.seqdata.values]
978
+
979
+ @property
980
+ def weights(self):
981
+ return self._weights
982
+
983
+ @weights.setter
984
+ def weights(self, val):
985
+ self._weights = val
986
+
987
+ def flatten(self) -> np.ndarray:
988
+ """Flatten all sequences into a 1D array (row-wise)."""
989
+ return self.seqdata.values.flatten()
990
+
991
+ def flatten_weights(self) -> np.ndarray:
992
+ """
993
+ Repeat weights across sequence length for 1D alignment with flatten().
994
+ E.g., 5 sequences x 10 steps -> repeat each weight 10 times.
995
+ """
996
+ return np.repeat(self.weights, self.n_steps)
997
+
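The alignment between flatten() and flatten_weights() relies on row-wise flattening plus np.repeat. A tiny numeric check with made-up weights:

import numpy as np

weights = np.array([2.0, 0.5])        # one weight per sequence
n_steps = 3
print(np.repeat(weights, n_steps))    # [2.  2.  2.  0.5 0.5 0.5] -> one weight per flattened cell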
998
+ def to_numeric(self) -> np.ndarray:
999
+ """Returns integer-coded sequence data as NumPy array."""
1000
+ return self.seqdata.to_numpy(dtype=np.int32)
1001
+
1002
+ def get_xtabs(self, other: SequenceData, weighted=True) -> np.ndarray:
1003
+ """
1004
+ NumPy-only version of get_xtabs.
1005
+ Returns a raw NumPy matrix: shape (len(alphabet1), len(alphabet2))
1006
+ """
1007
+ if self.n_sequences != other.n_sequences or self.n_steps != other.n_steps:
1008
+ raise ValueError("Both SequenceData objects must have same shape.")
1009
+
1010
+ v1 = self.flatten()
1011
+ v2 = other.flatten()
1012
+
1013
+ # Use len(self.states) rather than self.alphabet,
1014
+ # because the alphabet does not include the appended missing state
1015
+ n1 = len(self.states)
1016
+ n2 = len(other.states)
1017
+
1018
+ table = np.zeros((n1, n2), dtype=np.float64)
1019
+
1020
+ if weighted:
1021
+ w = self.flatten_weights()
1022
+ # Accumulate (weighted) counts via scatter-add with integer indices;
1023
+ # state codes start at 1, so subtract 1 to get 0-based NumPy indices
1024
+ np.add.at(table, (v1 - 1, v2 - 1), w)
1025
+ else:
1026
+ np.add.at(table, (v1 - 1, v2 - 1), 1)
1027
+
1028
+ return table
1029
+
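The core of get_xtabs is the np.add.at scatter-add over 0-based indices. A standalone sketch with toy 1-based codes and weights (not package data):

import numpy as np

v1 = np.array([1, 2, 2, 1, 3])   # 1-based state codes from one domain
v2 = np.array([2, 2, 1, 1, 3])   # 1-based state codes from the other domain
w = np.array([1.0, 0.5, 0.5, 2.0, 1.0])

table = np.zeros((3, 3))
np.add.at(table, (v1 - 1, v2 - 1), w)   # subtract 1: NumPy indexing is 0-based
print(table)                            # weighted cross-tabulation of the two codings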
1030
+ def check_uniqueness_rate(self, weighted: bool = False):
1031
+ """
1032
+ Compute uniqueness statistics of the sequences.
1033
+
1034
+ Returns:
1035
+ dict with keys:
1036
+ - n_sequences: total number of sequences (unweighted count)
1037
+ - n_unique: number of unique sequence patterns
1038
+ - uniqueness_rate: n_unique / n_sequences
1039
+ - weighted_total: total weighted count (only if weighted=True)
1040
+ - weighted_uniqueness_rate: n_unique / weighted_total (only if weighted=True)
1041
+
1042
+ Parameters:
1043
+ weighted: if True, use sequence weights to calculate weighted frequencies and uniqueness rates;
1044
+ if False, use simple counts (default behavior for backward compatibility).
1045
+ """
1046
+ import numpy as np
1047
+ import pandas as pd
1048
+
1049
+ A = self.to_numeric() # shape (n, m), int32
1050
+ n, m = A.shape
1051
+
1052
+ # Use a byte-level view to let np.unique work row-wise efficiently
1053
+ A_contig = np.ascontiguousarray(A)
1054
+ row_view = A_contig.view(np.dtype((np.void, A_contig.dtype.itemsize * m))).ravel()
1055
+
1056
+ # Get unique patterns
1057
+ uniq, inverse = np.unique(row_view, return_inverse=True)
1058
+
1059
+ n_unique = uniq.size
1060
+ uniqueness_rate = float(n_unique) / float(n) if n > 0 else np.nan
1061
+
1062
+ # Build simplified result dictionary with only essential statistics
1063
+ result = {
1064
+ "n_sequences": int(n),
1065
+ "n_unique": int(n_unique),
1066
+ "uniqueness_rate": uniqueness_rate
1067
+ }
1068
+
1069
+ # Add weighted statistics if requested
1070
+ if weighted:
1071
+ weighted_total = float(np.sum(self.weights))
1072
+ weighted_uniqueness_rate = float(n_unique) / weighted_total if weighted_total > 0 else np.nan
1073
+ result["weighted_total"] = weighted_total
1074
+ result["weighted_uniqueness_rate"] = weighted_uniqueness_rate
1075
+
1076
+ return result
1077
+
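The void-dtype view is what lets np.unique treat each row as a single hashable record. A small self-contained check with toy integer-coded sequences:

import numpy as np

A = np.ascontiguousarray(np.array([[1, 2, 2],
                                   [1, 2, 2],
                                   [3, 1, 1]], dtype=np.int32))   # rows 0 and 1 are identical
row_view = A.view(np.dtype((np.void, A.dtype.itemsize * A.shape[1]))).ravel()
n_unique = np.unique(row_view).size
print(n_unique, n_unique / A.shape[0])   # 2 unique patterns, uniqueness rate ~0.67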
1078
+ def show_color_palette(self, palette_name: str = 'default', save_as: str = None, dpi: int = 200):
1079
+ """
1080
+ Instance method to show the default color palette for the current number of states.
1081
+ This is a convenience method that calls show_default_color_palette() with the number of states
1082
+ from this SequenceData instance.
1083
+
1084
+ Parameters:
1085
+ -----------
1086
+ palette_name : str, default='default'
1087
+ Name of the color palette to use. See show_default_color_palette() for available options.
1088
+ save_as : str, optional
1089
+ If provided, save the color preview figure to this file path.
1090
+ dpi : int, default=200
1091
+ Resolution for saving the figure (if save_as is provided).
1092
+
1093
+ Returns:
1094
+ --------
1095
+ dict : Dictionary with keys:
1096
+ - 'colors': List of RGB tuples (0-1 range)
1097
+ - 'hex_colors': List of hex color codes (e.g., "#FF5733")
1098
+ - 'rgb_255': List of RGB tuples (0-255 range)
1099
+
1100
+ Example:
1101
+ --------
1102
+ # Show color palette for this SequenceData instance
1103
+ seq_data = SequenceData(...)
1104
+ color_info = seq_data.show_color_palette()
1105
+
1106
+ # Show with a specific palette template
1107
+ color_info = seq_data.show_color_palette(palette_name='viridis')
1108
+ """
1109
+ return SequenceData.show_default_color_palette(
1110
+ n_states=len(self.states),
1111
+ reverse_colors=True,
1112
+ palette_name=palette_name,
1113
+ save_as=save_as,
1114
+ dpi=dpi
1115
+ )
1116
+
1117
+ @staticmethod
1118
+ def _get_available_palette_names():
1119
+ """
1120
+ Get list of available color palette names that can be used with show_default_color_palette.
1121
+
1122
+ Returns:
1123
+ --------
1124
+ list : List of available palette names (strings)
1125
+ """
1126
+ # Common seaborn color palettes
1127
+ available_palettes = [
1128
+ 'default', # Uses automatic selection based on n_states (Spectral/viridis/combined)
1129
+ 'Spectral',
1130
+ 'viridis',
1131
+ 'Set3',
1132
+ 'tab20',
1133
+ 'deep',
1134
+ 'muted',
1135
+ 'pastel',
1136
+ 'bright',
1137
+ 'dark',
1138
+ 'colorblind',
1139
+ 'husl',
1140
+ 'hls',
1141
+ 'coolwarm',
1142
+ 'RdYlGn',
1143
+ 'RdYlBu',
1144
+ 'RdBu',
1145
+ 'PiYG',
1146
+ 'PRGn',
1147
+ 'BrBG',
1148
+ 'Set1',
1149
+ 'Set2',
1150
+ 'Paired',
1151
+ 'Accent',
1152
+ 'Dark2',
1153
+ ]
1154
+ return available_palettes
1155
+
1156
+ @staticmethod
1157
+ def _generate_color_list(n_states: int, palette_name: str = 'default', reverse_colors: bool = True):
1158
+ """
1159
+ Generate color list based on palette name and number of states.
1160
+
1161
+ Parameters:
1162
+ -----------
1163
+ n_states : int
1164
+ Number of states (colors) to generate.
1165
+ palette_name : str, default='default'
1166
+ Name of the color palette to use. Use 'default' for automatic selection.
1167
+ Available palettes: see _get_available_palette_names()
1168
+ reverse_colors : bool, default=True
1169
+ Whether to reverse the color order.
1170
+
1171
+ Returns:
1172
+ --------
1173
+ list : List of RGB tuples (0-1 range)
1174
+ """
1175
+ if palette_name == 'default':
1176
+ # Use the original logic for default palette selection
1177
+ if n_states <= 20:
1178
+ color_list = sns.color_palette("Spectral", n_states)
1179
+ else:
1180
+ if n_states <= 40:
1181
+ color_list = sns.color_palette("viridis", n_states)
1182
+ else:
1183
+ viridis_colors = sns.color_palette("viridis", min(n_states // 2, 20))
1184
+ pastel_colors = sns.color_palette("Set3", min(n_states // 2, 12))
1185
+ tab20_colors = sns.color_palette("tab20", min(n_states // 3, 20))
1186
+ combined_colors = viridis_colors + pastel_colors + tab20_colors
1187
+ while len(combined_colors) < n_states:
1188
+ combined_colors.extend(combined_colors[:min(len(combined_colors), n_states - len(combined_colors))])
1189
+ color_list = combined_colors[:n_states]
1190
+ else:
1191
+ # Use specified palette name
1192
+ try:
1193
+ color_list = sns.color_palette(palette_name, n_states)
1194
+ except ValueError:
1195
+ # If palette doesn't support n_states directly, try to generate more colors
1196
+ try:
1197
+ # Try to get a base palette and extend it
1198
+ base_palette = sns.color_palette(palette_name)
1199
+ color_list = []
1200
+ while len(color_list) < n_states:
1201
+ color_list.extend(base_palette)
1202
+ color_list = color_list[:n_states]
1203
+ except Exception as e:
1204
+ raise ValueError(f"Invalid palette name '{palette_name}'. Available palettes: {', '.join(SequenceData._get_available_palette_names())}") from e
1205
+
1206
+ if reverse_colors:
1207
+ color_list = list(reversed(color_list))
1208
+
1209
+ return color_list
1210
+
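The fallback branch above extends a base palette by cycling until n_states colors are available. A minimal sketch of that extension logic, assuming seaborn is installed ('Dark2' is just an example base palette; seaborn may also satisfy such requests directly):

import seaborn as sns

n_states = 10
base = sns.color_palette("Dark2")      # 8-colour qualitative palette
colors = []
while len(colors) < n_states:
    colors.extend(base)                # cycle through the base palette
colors = colors[:n_states]
print(len(colors))                     # 10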
1211
+ @staticmethod
1212
+ def _convert_rgb_to_hex_and_255(color_list):
1213
+ """
1214
+ Convert list of RGB tuples (0-1 range) to hex codes and RGB (0-255 range).
1215
+
1216
+ Parameters:
1217
+ -----------
1218
+ color_list : list
1219
+ List of RGB tuples in 0-1 range.
1220
+
1221
+ Returns:
1222
+ --------
1223
+ tuple : (hex_colors, rgb_255_list) where:
1224
+ - hex_colors: List of hex color codes (e.g., "#FF5733")
1225
+ - rgb_255_list: List of RGB tuples (0-255 range)
1226
+ """
1227
+ hex_colors = []
1228
+ rgb_255_list = []
1229
+ for rgb in color_list:
1230
+ # Convert from 0-1 to 0-255
1231
+ rgb_255 = tuple(int(c * 255) for c in rgb)
1232
+ rgb_255_list.append(rgb_255)
1233
+ # Convert to hex
1234
+ hex_color = f"#{rgb_255[0]:02X}{rgb_255[1]:02X}{rgb_255[2]:02X}"
1235
+ hex_colors.append(hex_color)
1236
+ return hex_colors, rgb_255_list
1237
+
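The 0-1 to hex/0-255 conversion above is a pure formatting step; a one-colour check with a made-up RGB value:

# 0-1 RGB -> 0-255 RGB -> hex
rgb = (0.2, 0.4, 0.6)
rgb_255 = tuple(int(c * 255) for c in rgb)
hex_color = f"#{rgb_255[0]:02X}{rgb_255[1]:02X}{rgb_255[2]:02X}"
print(rgb_255, hex_color)   # (51, 102, 153) #336699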
1238
+ @staticmethod
1239
+ def show_default_color_palette(n_states: int, reverse_colors: bool = True, palette_name: str = 'default', save_as: str = None, dpi: int = 200):
1240
+ """
1241
+ Display the default color palette that would be used for a given number of states.
1242
+ This is useful for viewing default colors and copying hex codes to create custom_colors.
1243
+
1244
+ Parameters:
1245
+ -----------
1246
+ n_states : int
1247
+ Number of states (colors) to generate.
1248
+ reverse_colors : bool, default=True
1249
+ Whether to reverse the color order (same as default behavior in SequenceData).
1250
+ palette_name : str, default='default'
1251
+ Name of the color palette to use. Use 'default' to use the automatic palette selection
1252
+ (Spectral for ≤20 states, viridis for 21-40 states, combined for >40 states).
1253
+ Available palettes: 'default', 'Spectral', 'viridis', 'Set3', 'tab20', 'deep', 'muted',
1254
+ 'pastel', 'bright', 'dark', 'colorblind', 'husl', 'hls', 'coolwarm', 'RdYlGn', 'RdYlBu',
1255
+ 'RdBu', 'PiYG', 'PRGn', 'BrBG', 'Set1', 'Set2', 'Paired', 'Accent', 'Dark2', etc.
1256
+ Call SequenceData._get_available_palette_names() to see all available options.
1257
+ save_as : str, optional
1258
+ If provided, save the color preview figure to this file path.
1259
+ dpi : int, default=200
1260
+ Resolution for saving the figure (if save_as is provided).
1261
+
1262
+ Returns:
1263
+ --------
1264
+ dict : Dictionary with keys:
1265
+ - 'colors': List of RGB tuples (0-1 range)
1266
+ - 'hex_colors': List of hex color codes (e.g., "#FF5733")
1267
+ - 'rgb_255': List of RGB tuples (0-255 range)
1268
+
1269
+ Example:
1270
+ --------
1271
+ # View default colors for 13 states (call via class)
1272
+ color_info = SequenceData.show_default_color_palette(13)
1273
+
1274
+ # View a specific palette template
1275
+ color_info = SequenceData.show_default_color_palette(13, palette_name='viridis')
1276
+
1277
+ # Or call via instance (which will use the instance's number of states)
1278
+ seq_data = SequenceData(...)
1279
+ color_info = seq_data.show_color_palette()
1280
+
1281
+ # Then you can copy the hex_colors to use as custom_colors
1282
+ custom_colors = color_info['hex_colors']
1283
+ """
1284
+ # Generate colors using the specified palette
1285
+ color_list = SequenceData._generate_color_list(n_states, palette_name, reverse_colors)
1286
+
1287
+ # Convert RGB (0-1) to hex and RGB (0-255)
1288
+ hex_colors, rgb_255_list = SequenceData._convert_rgb_to_hex_and_255(color_list)
1289
+
1290
+ # Print header with palette information
1291
+ print(f"\n{'='*80}")
1292
+ palette_display_name = "Default (automatic selection)" if palette_name == 'default' else palette_name
1293
+ print(f"Color Palette: {palette_display_name} for {n_states} States")
1294
+ print(f"{'='*80}\n")
1295
+
1296
+ # Show available palette names if using default
1297
+ if palette_name == 'default':
1298
+ available_palettes = SequenceData._get_available_palette_names()
1299
+ print("Available color palette templates:")
1300
+ print(" " + ", ".join(available_palettes))
1301
+ print("\n You can specify a palette template by using the 'palette_name' parameter.")
1302
+ print(" Example: show_default_color_palette(13, palette_name='viridis')\n")
1303
+
1304
+ # Create visualization
1305
+ fig, ax = plt.subplots(figsize=(12, max(6, n_states * 0.5)))
1306
+
1307
+ for i, (hex_color, rgb, rgb_255) in enumerate(zip(hex_colors, color_list, rgb_255_list)):
1308
+ # Draw color swatch
1309
+ y_pos = n_states - i - 1
1310
+ rect = plt.Rectangle((0, y_pos), 1, 0.8, facecolor=rgb, edgecolor='black', linewidth=0.5)
1311
+ ax.add_patch(rect)
1312
+ ax.text(1.1, y_pos + 0.4, f"{i+1:2d}. {hex_color} | RGB{rgb_255}",
1313
+ va='center', fontsize=10, fontfamily='monospace')
1314
+
1315
+ print(f"{'='*80}")
1316
+ print("\nTo use these colors as custom_colors, copy the hex codes:")
1317
+ print(" custom_colors = " + str(hex_colors))
1318
+ print("\nOr use additional_colors to assign custom colors to specific states:")
1319
+ print(" additional_colors = {'Other': '#BDBDBD'} # Assign gray color to 'Other' state")
1320
+ print(f"{'='*80}\n")
1321
+
1322
+ # Configure plot
1323
+ ax.set_xlim(0, 8)
1324
+ ax.set_ylim(-0.5, n_states)
1325
+ ax.set_yticks([])
1326
+ ax.set_xticks([])
1327
+ ax.spines['top'].set_visible(False)
1328
+ ax.spines['right'].set_visible(False)
1329
+ ax.spines['bottom'].set_visible(False)
1330
+ ax.spines['left'].set_visible(False)
1331
+ title_text = f"Color Palette: {palette_display_name} ({n_states} States)"
1332
+ ax.set_title(title_text, fontsize=14, pad=20)
1333
+
1334
+ plt.tight_layout()
1335
+
1336
+ if save_as:
1337
+ plt.savefig(save_as, dpi=dpi, bbox_inches='tight')
1338
+ print(f"[>] Color palette saved to: {save_as}")
1339
+
1340
+ plt.show()
1341
+
1342
+ return {
1343
+ 'colors': color_list, # RGB tuples (0-1 range)
1344
+ 'hex_colors': hex_colors, # Hex codes
1345
+ 'rgb_255': rgb_255_list # RGB tuples (0-255 range)
1346
+ }
1347
+
1348
+ @staticmethod
1349
+ def get_default_color_palette(n_states: int, reverse_colors: bool = True, palette_name: str = 'default', return_format: str = 'hex'):
1350
+ """
1351
+ Get the default color palette for a given number of states.
1352
+ This returns the colors without displaying them (useful for programmatic use).
1353
+
1354
+ Parameters:
1355
+ -----------
1356
+ n_states : int
1357
+ Number of states (colors) to generate.
1358
+ reverse_colors : bool, default=True
1359
+ Whether to reverse the color order (same as default behavior in SequenceData).
1360
+ palette_name : str, default='default'
1361
+ Name of the color palette to use. See show_default_color_palette() for available options.
1362
+ return_format : str, default='hex'
1363
+ Format to return colors in. Options:
1364
+ - 'hex': List of hex color codes (e.g., "#FF5733")
1365
+ - 'rgb': List of RGB tuples (0-1 range, for matplotlib)
1366
+ - 'rgb255': List of RGB tuples (0-255 range)
1367
+
1368
+ Returns:
1369
+ --------
1370
+ list : List of colors in the requested format.
1371
+
1372
+ Example:
1373
+ --------
1374
+ # Get hex codes for 13 states using default palette
1375
+ hex_colors = SequenceData.get_default_color_palette(13, return_format='hex')
1376
+
1377
+ # Get hex codes using a specific palette template
1378
+ hex_colors = SequenceData.get_default_color_palette(13, palette_name='viridis', return_format='hex')
1379
+
1380
+ # Use them as custom_colors
1381
+ seq = SequenceData(df, time=..., states=..., custom_colors=hex_colors)
1382
+ """
1383
+ # Generate colors using the specified palette
1384
+ color_list = SequenceData._generate_color_list(n_states, palette_name, reverse_colors)
1385
+
1386
+ if return_format == 'rgb':
1387
+ return color_list
1388
+ elif return_format == 'hex':
1389
+ hex_colors, _ = SequenceData._convert_rgb_to_hex_and_255(color_list)
1390
+ return hex_colors
1391
+ elif return_format == 'rgb255':
1392
+ _, rgb_255_list = SequenceData._convert_rgb_to_hex_and_255(color_list)
1393
+ return rgb_255_list
1394
+ else:
1395
+ raise ValueError(f"return_format must be 'hex', 'rgb', or 'rgb255', got '{return_format}'")
1396
+
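As a usage sketch (assuming the installed sequenzo package exposes SequenceData at the top level, which is not shown in this file), the palette helpers can be called without constructing any sequence data:

from sequenzo import SequenceData   # assumed top-level export

hex_colors = SequenceData.get_default_color_palette(5, palette_name="Spectral", return_format="hex")
print(hex_colors)   # five hex codes (reversed Spectral), reusable as custom_colors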
1397
+
1398
+
1399
+
1400
+