sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
- sequenzo/__init__.py +349 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +476 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +178 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1256 -0
- sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
- sequenzo/clustering/src/KMedoid.cpp +263 -0
- sequenzo/clustering/src/PAM.cpp +237 -0
- sequenzo/clustering/src/PAMonce.cpp +265 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +22 -0
- sequenzo/data_preprocessing/helpers.py +303 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/dyadic_children.csv +61 -0
- sequenzo/datasets/dyadic_parents.csv +61 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
- sequenzo/datasets/political_science_aid_shock.csv +166 -0
- sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
- sequenzo/define_sequence_data.py +1400 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +40 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +597 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +81 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +62 -0
- sequenzo/prefix_tree/hub.py +114 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
- sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
- sequenzo/prefix_tree/spell_level_indicators.py +297 -0
- sequenzo/prefix_tree/system_level_indicators.py +544 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/seqhmm/__init__.py +95 -0
- sequenzo/seqhmm/advanced_optimization.py +305 -0
- sequenzo/seqhmm/bootstrap.py +411 -0
- sequenzo/seqhmm/build_hmm.py +142 -0
- sequenzo/seqhmm/build_mhmm.py +136 -0
- sequenzo/seqhmm/build_nhmm.py +121 -0
- sequenzo/seqhmm/fit_mhmm.py +62 -0
- sequenzo/seqhmm/fit_model.py +61 -0
- sequenzo/seqhmm/fit_nhmm.py +76 -0
- sequenzo/seqhmm/formulas.py +289 -0
- sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
- sequenzo/seqhmm/gradients_nhmm.py +306 -0
- sequenzo/seqhmm/hmm.py +291 -0
- sequenzo/seqhmm/mhmm.py +314 -0
- sequenzo/seqhmm/model_comparison.py +238 -0
- sequenzo/seqhmm/multichannel_em.py +282 -0
- sequenzo/seqhmm/multichannel_utils.py +138 -0
- sequenzo/seqhmm/nhmm.py +270 -0
- sequenzo/seqhmm/nhmm_utils.py +191 -0
- sequenzo/seqhmm/predict.py +137 -0
- sequenzo/seqhmm/predict_mhmm.py +142 -0
- sequenzo/seqhmm/simulate.py +878 -0
- sequenzo/seqhmm/utils.py +218 -0
- sequenzo/seqhmm/visualization.py +910 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +66 -0
- sequenzo/suffix_tree/hub.py +114 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
- sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
- sequenzo/suffix_tree/spell_level_indicators.py +248 -0
- sequenzo/suffix_tree/system_level_indicators.py +535 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/version_check.py +283 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +222 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +405 -0
- sequenzo/visualization/plot_sequence_index.py +1175 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +651 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.31.dist-info/METADATA +286 -0
- sequenzo-0.1.31.dist-info/RECORD +299 -0
- sequenzo-0.1.31.dist-info/WHEEL +5 -0
- sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.31.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1400 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : 梁彧祺 Yuqi Liang, 李欣怡 Xinyi Li
|
|
3
|
+
@File : define_sequence_data.py
|
|
4
|
+
@Time : 05/02/2025 12:47
|
|
5
|
+
@Desc :
|
|
6
|
+
|
|
7
|
+
Optimized SequenceData class with integrated color scheme & legend handling.
|
|
8
|
+
|
|
9
|
+
Note on `states` and `alphabet`:
|
|
10
|
+
|
|
11
|
+
In traditional sequence analysis tools (e.g., TraMineR), the `alphabet` refers to the full set of distinct states
|
|
12
|
+
found in the data and is often inferred automatically from the observed sequences.
|
|
13
|
+
|
|
14
|
+
However, in this implementation, we require the user to explicitly provide the set of `states`. This explicit control
|
|
15
|
+
is essential for ensuring consistent ordering of states, reproducibility of visualizations, and compatibility across
|
|
16
|
+
sequence datasets - especially when certain states may not appear in a given subset of the data.
|
|
17
|
+
|
|
18
|
+
As a result, `alphabet` is automatically set to `states` upon initialization, and kept as a semantic alias for clarity
|
|
19
|
+
and potential compatibility. Users should treat `states` as the definitive state space and are not required to provide
|
|
20
|
+
`alphabet` separately.
|
|
21
|
+
|
|
22
|
+
# ----------------------------------------------------------------------
|
|
23
|
+
# [Hint] Handling the ID column for sequence analysis
|
|
24
|
+
# ----------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
# STEP 1: Check if your DataFrame already has a column representing unique entity IDs
|
|
27
|
+
# For example, check if "Entity ID" or "country" or any other identifier exists:
|
|
28
|
+
print(df.columns)
|
|
29
|
+
|
|
30
|
+
# If your data already has an ID column (e.g., 'Entity ID'), you can directly use it:
|
|
31
|
+
seq = SequenceData(df, id_col='Entity ID', time=..., states=...)
|
|
32
|
+
|
|
33
|
+
# ----------------------------------------------------------------------
|
|
34
|
+
# STEP 2: If your data has NO ID column, use the helper function below
|
|
35
|
+
# ----------------------------------------------------------------------
|
|
36
|
+
from sequenzo.utils import assign_unique_ids
|
|
37
|
+
|
|
38
|
+
# This will insert a new ID column named 'Entity ID' as the first column
|
|
39
|
+
df = assign_unique_ids(df, id_col_name='Entity ID')
|
|
40
|
+
|
|
41
|
+
# Optional: Save it for future use to avoid repeating this step
|
|
42
|
+
df.to_csv('your_dataset_with_ids.csv', index=False)
|
|
43
|
+
|
|
44
|
+
# Then you can use it like this:
|
|
45
|
+
seq = SequenceData(df, id_col='Entity ID', time=..., states=...)
|
|
46
|
+
|
|
47
|
+
"""
|
|
48
|
+
# Only applicable to Python 3.7+, add this line to defer type annotation evaluation
|
|
49
|
+
from __future__ import annotations
|
|
50
|
+
# Define the public API at the top of the file
|
|
51
|
+
__all__ = ['SequenceData']
|
|
52
|
+
|
|
53
|
+
# Global variables and other imports that do not depend on pandas are placed here
|
|
54
|
+
import numpy as np
|
|
55
|
+
import seaborn as sns
|
|
56
|
+
import matplotlib.pyplot as plt
|
|
57
|
+
import pandas as pd
|
|
58
|
+
from docutils.parsers.rst import states
|
|
59
|
+
from matplotlib.colors import ListedColormap
|
|
60
|
+
import re
|
|
61
|
+
from typing import Union
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class SequenceData:
|
|
65
|
+
"""
|
|
66
|
+
A class for defining and processing a sequence dataset for social sequence analysis.
|
|
67
|
+
|
|
68
|
+
This class provides:
|
|
69
|
+
- Sequence extraction & missing value handling.
|
|
70
|
+
- Automatic alphabet (state space) management.
|
|
71
|
+
- Efficient sequence-to-numeric conversion.
|
|
72
|
+
- Color mapping & legend storage for visualization.
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
def __init__(
|
|
76
|
+
self,
|
|
77
|
+
data: pd.DataFrame,
|
|
78
|
+
time: list,
|
|
79
|
+
states: list,
|
|
80
|
+
labels: list = None,
|
|
81
|
+
id_col: str = None,
|
|
82
|
+
weights: np.ndarray = None,
|
|
83
|
+
start: int = 1,
|
|
84
|
+
custom_colors: list = None,
|
|
85
|
+
additional_colors: dict = None,
|
|
86
|
+
missing_values: Union[None, int, float, str, list] = None
|
|
87
|
+
):
|
|
88
|
+
"""
|
|
89
|
+
Initialize the SequenceData object.
|
|
90
|
+
|
|
91
|
+
:param data: DataFrame containing sequence data.
|
|
92
|
+
:param time: List of columns containing time labels.
|
|
93
|
+
:param states: List of unique states (categories).
|
|
94
|
+
:param alphabet: Optional predefined state space.
|
|
95
|
+
:param labels: Labels for states (optional, for visualization).
|
|
96
|
+
:param id_col: Column name for row identifiers, which is very important for hierarchical clustering.
|
|
97
|
+
:param weights: Sequence weights (optional).
|
|
98
|
+
:param start: Starting time index (default: 1).
|
|
99
|
+
:param missing_handling: Dict specifying handling for missing values (left, right, gaps).
|
|
100
|
+
:param void: Symbol for void elements (default: "%").
|
|
101
|
+
:param nr: Symbol for missing values (default: "*").
|
|
102
|
+
:param custom_colors: Custom color palette for visualization.
|
|
103
|
+
If provided, should be a list of colors matching the number of states.
|
|
104
|
+
Colors can be hex strings (e.g., "#FF5733") or RGB tuples.
|
|
105
|
+
:param additional_colors: Dictionary to specify additional custom colors for specific states
|
|
106
|
+
while keeping the default palette for others. This is useful when you want to keep default colors
|
|
107
|
+
but assign custom colors to specific states (e.g., {"Other": "#BDBDBD"} to make "Other" gray).
|
|
108
|
+
Format: {state_name: color}, where color can be hex string (e.g., "#BDBDBD") or RGB tuple.
|
|
109
|
+
Example: additional_colors={"Other": "#BDBDBD", "Missing": "#E0E0E0"}
|
|
110
|
+
:param missing_values: Custom missing value indicators. Can be:
|
|
111
|
+
- None (default): Auto-detect missing values (NaN, string "Missing")
|
|
112
|
+
- Single value: e.g., 99, 9, 1000, "Missing"
|
|
113
|
+
- List: e.g., [99, 9, 1000] or ["Missing", "N/A"]
|
|
114
|
+
The system will also check for pandas NaN and string "Missing" (case-insensitive)
|
|
115
|
+
and warn if other missing values are detected.
|
|
116
|
+
"""
|
|
117
|
+
# Import pandas here instead of the top of the file
|
|
118
|
+
import pandas as pd
|
|
119
|
+
|
|
120
|
+
self.data = data.copy()
|
|
121
|
+
self.time = time
|
|
122
|
+
|
|
123
|
+
# Remove all non-numeric characters from the year labels, e.g., "Year2020" -> "2020", or "C1" -> "1"
|
|
124
|
+
# self.cleaned_time = [re.sub(r'\D', '', str(year)) for year in time]
|
|
125
|
+
# No longer support this feature as we encourage users to clean the time variables.
|
|
126
|
+
# TODO: might implement a helper function for users to clean up their time variables.
|
|
127
|
+
self.cleaned_time = time
|
|
128
|
+
self.states = states.copy()
|
|
129
|
+
self.alphabet = states.copy() or sorted(set(data[time].stack().unique()))
|
|
130
|
+
self.labels = labels or [str(s) for s in states]
|
|
131
|
+
self.id_col = id_col
|
|
132
|
+
self.ids = np.array(self.data[self.id_col].values) if self.id_col else data.index
|
|
133
|
+
self.weights = weights
|
|
134
|
+
self._weights_provided = weights is not None # Track if weights were originally provided
|
|
135
|
+
self.start = start
|
|
136
|
+
self.custom_colors = custom_colors
|
|
137
|
+
self.additional_colors = additional_colors or {}
|
|
138
|
+
|
|
139
|
+
# Process missing_values parameter: convert to list format
|
|
140
|
+
if missing_values is None:
|
|
141
|
+
self.missing_values = []
|
|
142
|
+
elif isinstance(missing_values, (list, tuple)):
|
|
143
|
+
self.missing_values = list(missing_values)
|
|
144
|
+
else:
|
|
145
|
+
self.missing_values = [missing_values]
|
|
146
|
+
|
|
147
|
+
# Track original number of states before processing missing values
|
|
148
|
+
# This helps us determine if custom_colors needs adjustment
|
|
149
|
+
self._original_num_states = len(self.states)
|
|
150
|
+
self._missing_auto_added = False # Track if Missing was automatically added
|
|
151
|
+
|
|
152
|
+
# Validate parameters
|
|
153
|
+
self._validate_parameters()
|
|
154
|
+
|
|
155
|
+
# Validate additional_colors if provided
|
|
156
|
+
if self.additional_colors:
|
|
157
|
+
if self.custom_colors:
|
|
158
|
+
raise ValueError(
|
|
159
|
+
"[!] You cannot use both 'custom_colors' and 'additional_colors' at the same time.\n"
|
|
160
|
+
" -> Use 'custom_colors' to specify all colors, or\n"
|
|
161
|
+
" -> Use 'additional_colors' to assign custom colors to specific states while keeping default colors."
|
|
162
|
+
)
|
|
163
|
+
# Check that all states in additional_colors exist in self.states
|
|
164
|
+
invalid_states = [state for state in self.additional_colors.keys() if state not in self.states]
|
|
165
|
+
if invalid_states:
|
|
166
|
+
raise ValueError(
|
|
167
|
+
f"[!] The following states in 'additional_colors' are not found in 'states': {invalid_states}\n"
|
|
168
|
+
f" Available states: {self.states}"
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
# Extract & process sequences
|
|
172
|
+
self.seqdata = self._extract_sequences()
|
|
173
|
+
self._process_missing_values()
|
|
174
|
+
|
|
175
|
+
# The following two lines of code are for visualization
|
|
176
|
+
self.state_to_label = dict(zip(self.states, self.labels))
|
|
177
|
+
self.label_to_state = dict(zip(self.labels, self.states))
|
|
178
|
+
|
|
179
|
+
self._convert_states()
|
|
180
|
+
|
|
181
|
+
# Assign colors & save legend
|
|
182
|
+
self._assign_colors()
|
|
183
|
+
|
|
184
|
+
# Automatically print dataset overview
|
|
185
|
+
print("\n[>] SequenceData initialized successfully! Here's a summary:")
|
|
186
|
+
self.describe()
|
|
187
|
+
|
|
188
|
+
@property
|
|
189
|
+
def values(self):
|
|
190
|
+
"""Returns sequence data as a NumPy array, similar to xinyi_original_seqdef()."""
|
|
191
|
+
return self.seqdata.to_numpy(dtype=np.int32)
|
|
192
|
+
|
|
193
|
+
def __repr__(self):
|
|
194
|
+
return f"SequenceData({len(self.seqdata)} sequences, States: {self.states})"
|
|
195
|
+
|
|
196
|
+
def _validate_parameters(self):
    """Validate constructor inputs against the loaded data.

    Checks, in order:
      1. `states` is non-empty.
      2. Every provided state actually occurs in the data (typo guard);
         NaN states and "Missing"/"NaN" marker strings are exempt because
         they are handled by the missing-value machinery.
      3. Every data value is covered by `states` (complete state space),
         ignoring NaN and recognised missing-value indicators.
      4. `id_col` exists in the data; IDs are unique, one per row.
      5. `alphabet`, when given, matches `states` as a set.
      6. `labels`, when given, has exactly one string label per state.
      7. `weights`, when given, matches the row count; otherwise a vector
         of ones is installed.

    Raises:
        ValueError: on any inconsistency between parameters and data.
        TypeError: when `labels` contains non-string entries.
    """
    if not self.states:
        raise ValueError("'states' must be provided.")

    # Collect the unique values present in the sequence columns.
    # stack() silently drops NaN, so NaN presence is probed separately.
    # numpy scalars are unboxed to Python natives so that membership
    # tests compare consistently across Python/numpy versions.
    stacked_values = self.data[self.time].stack().unique()
    data_values_list = []
    for val in stacked_values:
        if pd.isna(val):
            continue  # handled via has_nan_in_data below
        if hasattr(val, 'item'):  # numpy scalar -> Python native
            val = val.item()
        data_values_list.append(val)

    data_values_no_nan = set(data_values_list)
    has_nan_in_data = self.data[self.time].isna().any().any()

    # All observed values, plus a NaN sentinel when the data has NaN.
    all_data_values = data_values_no_nan.copy()
    if has_nan_in_data:
        all_data_values.add(np.nan)

    # --- Check 2: provided states must occur in the data --------------
    states_clean = [s for s in self.states if not pd.isna(s)]
    states_clean_normalized = []
    for s in states_clean:
        if hasattr(s, 'item'):  # numpy scalar -> Python native
            s = s.item()
        states_clean_normalized.append(s)

    # BUGFIX: this comparison used to run in the wrong direction (it
    # listed data values absent from `states`), which contradicted the
    # error message below and raised before the missing-value exclusions
    # of the complete-state-space check could apply — breaking the
    # automatic handling of string "Missing"/"NaN" markers. We now list
    # the provided states that never appear in the data, and exempt
    # explicit missing-marker states so that e.g. states=['A', 'Missing']
    # stays valid when the data encodes missing as NaN.
    unmatched_states = [
        s for s in states_clean_normalized
        if s not in data_values_no_nan
        and not (isinstance(s, str) and s.lower() in ('missing', 'nan'))
    ]

    if unmatched_states:
        raise ValueError(
            f"[!] The following provided 'states' are not found in the data: {unmatched_states}\n"
            f"    Hint: Check spelling or formatting. Data contains these unique values: {sorted([v for v in data_values_no_nan if not pd.isna(v)])}"
        )

    # --- Check 3: data values must all be covered by `states` ---------
    # Normalise states (keeping NaN) for set-membership tests.
    states_normalized = []
    for s in self.states:
        if pd.isna(s):
            states_normalized.append(s)
        else:
            if hasattr(s, 'item'):  # numpy scalar -> Python native
                s = s.item()
            states_normalized.append(s)
    states_set = set(states_normalized)
    has_nan_in_states = any(pd.isna(s) for s in self.states)

    # Build the set of values treated as "missing" and therefore exempt
    # from the complete-state-space requirement.
    missing_indicators = set()
    if has_nan_in_states:
        missing_indicators.add(np.nan)
    for mv in self.missing_values:
        if pd.isna(mv):
            missing_indicators.add(np.nan)
        else:
            missing_indicators.add(mv)
    # Marker strings inside `states` count as missing indicators too.
    for s in self.states:
        if isinstance(s, str) and s.lower() in ('missing', 'nan'):
            missing_indicators.add(s)
    # Auto-detect "Missing"/"NaN" strings that appear in the data,
    # mirroring the detection in _process_missing_values.
    for dv in data_values_no_nan:
        if isinstance(dv, str) and dv.lower() in ('nan', 'missing'):
            missing_indicators.add(dv)

    # Data values that are neither a declared state nor a missing marker.
    missing_from_states = []
    for dv in all_data_values:
        if pd.isna(dv):
            continue  # true NaN is always auto-handled
        if dv in missing_indicators:
            continue  # known missing marker
        if isinstance(dv, str) and dv.lower() in ('nan', 'missing'):
            continue  # belt and braces for marker strings
        if dv not in states_set:
            missing_from_states.append(dv)

    if missing_from_states:
        data_values_display = sorted([v for v in data_values_no_nan if not pd.isna(v)])
        if has_nan_in_data:
            data_values_display.append("NaN")
        raise ValueError(
            f"[!] The following values found in the data are not included in your 'states' list: {missing_from_states}\n"
            f"    Your provided states: {self.states}\n"
            f"    All unique values in data: {data_values_display}\n"
            f"    Hint: You must include ALL unique values from the data in your 'states' parameter.\n"
            f"    Missing values (NaN or user-specified) are automatically handled, but all other data values must be in 'states'."
        )

    # --- Check 4: ID column -------------------------------------------
    if self.id_col is not None and self.id_col not in self.data.columns:
        raise ValueError(
            f"[!] You must specify a valid `id_col` parameter that exists in your dataset.\n"
            f"    ID is required to uniquely identify each sequence (e.g., individuals).\n"
            f"    -> Hint: If your data does not have an ID column yet, you can use the helper function:\n\n"
            f"       from sequenzo.utils import assign_unique_ids\n"
            f"       df = assign_unique_ids(df, id_col_name='Entity ID')\n"
            f"       df.to_csv('your_dataset_with_ids.csv', index=False)\n\n"
            f"       This will permanently assign unique IDs to your dataset for future use."
        )

    # `self.ids` is populated at initialization time; verify it here.
    if len(self.ids) != len(self.data):
        raise ValueError(f"[!] Length of ID column ('{self.id_col}') must match number of rows in the dataset.")
    if len(np.unique(self.ids)) != len(self.ids):
        raise ValueError(f"[!] IDs in column '{self.id_col}' must be unique.")

    # --- Check 5: alphabet --------------------------------------------
    if self.alphabet and set(self.alphabet) != set(self.states):
        raise ValueError("'alphabet' must match 'states'.")

    # --- Check 6: labels ----------------------------------------------
    if self.labels:
        if len(self.labels) != len(self.states):
            # Detailed message: say whether labels are missing or extra.
            states_len = len(self.states)
            labels_len = len(self.labels)

            if labels_len < states_len:
                missing_count = states_len - labels_len
                error_msg = (
                    f"[!] 'labels' length ({labels_len}) is shorter than 'states' length ({states_len}).\n"
                    f"    Missing {missing_count} label(s).\n"
                    f"    Your states: {self.states}\n"
                    f"    Your labels: {self.labels}\n"
                    f"    Hint: You need to provide {states_len} labels, one for each state.\n"
                    f"    Example: labels = {[str(s) for s in self.states]}"
                )
            else:
                extra_count = labels_len - states_len
                error_msg = (
                    f"[!] 'labels' length ({labels_len}) is longer than 'states' length ({states_len}).\n"
                    f"    You have {extra_count} extra label(s).\n"
                    f"    Your states: {self.states}\n"
                    f"    Your labels: {self.labels}\n"
                    f"    Hint: You should provide exactly {states_len} labels, one for each state.\n"
                    f"    The extra labels are: {self.labels[states_len:]}"
                )

            raise ValueError(error_msg)

        # Labels must be strings so legends/annotations render properly.
        non_string_labels = [label for label in self.labels if not isinstance(label, str)]
        if non_string_labels:
            raise TypeError(
                f"[!] All elements in 'labels' must be strings for proper visualization (e.g., for legends or annotations).\n"
                f"    Detected non-string labels: {non_string_labels}\n"
                f"    Example fix: instead of using `labels = [1, 2, 3]`, use `labels = ['Single', 'Married', 'Divorced']`."
            )

    # --- Check 7: weights ---------------------------------------------
    if self.weights is not None:
        if len(self.weights) != len(self.data):
            raise ValueError("'weights' must match the length of 'data'.")
    else:
        self.weights = np.ones(self.data.shape[0])
|
|
391
|
+
|
|
392
|
+
def _extract_sequences(self) -> pd.DataFrame:
    """Return a copy of the time-point columns that form the sequences."""
    # A copy keeps later in-place recoding from mutating the user's frame.
    return self.data.loc[:, self.time].copy()
|
|
395
|
+
|
|
396
|
+
def _process_missing_values(self):
    """Detect and normalise missing-value indicators in the sequence data.

    Three kinds of indicators are recognised:
      * genuine pandas NaN values,
      * values the user listed in `missing_values`,
      * the literal strings "Missing" / "NaN" in any casing (auto-detected,
        unless the user already declared them).

    Side effects:
      * sets `self.ismissing` when any indicator is present;
      * when missing data exist but `states`/`labels` do not yet account for
        them, appends a missing state (and a trailing "Missing" label) and
        sets `self._missing_auto_added`;
      * prints advisory messages so the user can make the handling explicit.
    """
    # Legacy TraMineR-style left/right/gap handling, kept for reference:
    # left, right, gaps = self.missing_handling.values()
    #
    # # Fill left-side missing values
    # if not pd.isna(left) and left != "DEL":
    #     self.seqdata.fillna(left, inplace=True)
    #
    # # Process right-side missing values
    # if right == "DEL":
    #     self.seqdata = self.seqdata.apply(lambda row: row.dropna().reset_index(drop=True), axis=1)
    #
    # # Process gaps (internal missing values)
    # if not pd.isna(gaps) and gaps != "DEL":
    #     self.seqdata.replace(self.nr, gaps, inplace=True)

    # Indicators found in the data beyond what the user declared.
    detected_missing = []

    # 1) Genuine pandas NaN anywhere in the sequence columns.
    has_pandas_nan = self.seqdata.isna().any().any()
    if has_pandas_nan:
        detected_missing.append("NaN (pandas)")

    # 2) User-declared missing values that actually occur in the data.
    user_missing_found = []
    for mv in self.missing_values:
        if pd.isna(mv):
            # A NaN entry in missing_values maps onto the pandas-NaN check.
            if has_pandas_nan and "NaN (pandas)" not in user_missing_found:
                user_missing_found.append("NaN (pandas)")
        else:
            # Literal marker value: scan the data for it.
            if (self.seqdata == mv).any().any():
                user_missing_found.append(mv)

    # 3) Auto-detect the string "Missing" (case-insensitive) as a marker,
    #    but only when the user did not already declare it themselves.
    has_string_missing = False
    string_missing_variants = []

    has_missing_string_in_user_spec = any(
        isinstance(mv, str) and mv.lower() == 'missing' for mv in self.missing_values
    )

    if not has_missing_string_in_user_spec:
        try:
            # Whole-frame pass; on a DataFrame `.str` raises AttributeError,
            # in which case the column-by-column fallback below runs.
            missing_mask = self.seqdata.astype(str).str.lower() == 'missing'
            if missing_mask.any().any():
                has_string_missing = True
                # Keep the original casing of each detected variant.
                actual_values = self.seqdata[missing_mask].dropna().unique()
                string_missing_variants = [str(v) for v in actual_values if str(v).lower() == 'missing']
        except (AttributeError, TypeError):
            # Fallback: inspect each column's values individually.
            try:
                for col in self.seqdata.columns:
                    col_mask = self.seqdata[col].astype(str).str.lower() == 'missing'
                    if col_mask.any():
                        has_string_missing = True
                        actual_values = self.seqdata.loc[col_mask, col].unique()
                        for v in actual_values:
                            variant = str(v)
                            if variant.lower() == 'missing' and variant not in string_missing_variants:
                                string_missing_variants.append(variant)
            except:
                # Best-effort detection only; never fail construction here.
                pass

    if has_string_missing:
        # Record variants not already covered by the user's declarations.
        for variant in string_missing_variants:
            if variant not in detected_missing and variant not in user_missing_found:
                detected_missing.append(variant)

    # 4) Auto-detect the string "NaN" (case-insensitive) the same way.
    has_string_nan = False
    string_nan_variants = []

    has_nan_string_in_user_spec = any(
        isinstance(mv, str) and mv.lower() == 'nan' for mv in self.missing_values
    )

    if not has_nan_string_in_user_spec:
        try:
            # Whole-frame pass; falls back column-by-column on failure.
            nan_mask = self.seqdata.astype(str).str.lower() == 'nan'
            if nan_mask.any().any():
                has_string_nan = True
                actual_values = self.seqdata[nan_mask].dropna().unique()
                string_nan_variants = [str(v) for v in actual_values if str(v).lower() == 'nan']
        except (AttributeError, TypeError):
            try:
                for col in self.seqdata.columns:
                    col_mask = self.seqdata[col].astype(str).str.lower() == 'nan'
                    if col_mask.any():
                        has_string_nan = True
                        actual_values = self.seqdata.loc[col_mask, col].unique()
                        for v in actual_values:
                            variant = str(v)
                            if variant.lower() == 'nan' and variant not in string_nan_variants:
                                string_nan_variants.append(variant)
            except:
                # Best-effort detection only; never fail construction here.
                pass

    if has_string_nan:
        for variant in string_nan_variants:
            if variant not in detected_missing and variant not in user_missing_found:
                detected_missing.append(variant)

    # Merge user-declared and auto-detected indicators; swap the
    # "NaN (pandas)" placeholder string for a real np.nan entry.
    all_missing_values = list(set(self.missing_values + detected_missing))
    if has_pandas_nan:
        all_missing_values = [mv for mv in all_missing_values if mv != "NaN (pandas)"] + [np.nan]
    else:
        all_missing_values = [mv for mv in all_missing_values if mv != "NaN (pandas)"]

    # Decide whether the dataset contains missing data at all.
    has_any_missing = False
    if has_pandas_nan:
        has_any_missing = True
    elif user_missing_found:
        has_any_missing = True
    elif has_string_missing:
        has_any_missing = True
    elif has_string_nan:
        has_any_missing = True
    else:
        # Last resort: re-scan for any declared marker in the data.
        for mv in self.missing_values:
            if not pd.isna(mv):
                if (self.seqdata == mv).any().any():
                    has_any_missing = True
                    break

    self.ismissing = has_any_missing

    # Warn when the data holds indicators beyond what the user declared.
    if self.missing_values and detected_missing:
        other_missing = [mv for mv in detected_missing if mv not in [str(m) for m in self.missing_values] and mv != "NaN (pandas)"]
        if other_missing or (has_pandas_nan and not any(pd.isna(mv) for mv in self.missing_values)):
            print(
                f"[!] Warning: Detected additional missing value indicators in your data beyond those you specified.\n"
                f"    You specified: {self.missing_values}\n"
                f"    Additional missing values found: {other_missing + (['NaN'] if has_pandas_nan and not any(pd.isna(mv) for mv in self.missing_values) else [])}\n"
                f"    Recommendation: Include these in the `missing_values` parameter for complete handling.\n"
                f"    Example: missing_values={self.missing_values + other_missing + (['NaN'] if has_pandas_nan and not any(pd.isna(mv) for mv in self.missing_values) else [])}"
            )

    # Pick the canonical representation to append to `states` if needed,
    # preferring true NaN, then detected marker strings, then user markers.
    canonical_missing_value = None
    if has_pandas_nan:
        canonical_missing_value = np.nan
    elif string_missing_variants:
        # Use the first variant (usually "Missing")
        canonical_missing_value = string_missing_variants[0]
    elif string_nan_variants:
        # Use the first variant (usually "NaN")
        canonical_missing_value = string_nan_variants[0]
    elif user_missing_found:
        # Use the first user-specified missing value that was found
        canonical_missing_value = user_missing_found[0]
    elif self.missing_values:
        # Use the first user-specified missing value
        canonical_missing_value = self.missing_values[0]

    if self.ismissing:
        # Does `states` already contain some representation of missing?
        has_missing_state = False
        for state in self.states:
            if pd.isna(state):
                has_missing_state = True
                break
            elif isinstance(state, str):
                # Case-insensitive match on marker strings, plus user markers.
                state_lower = state.lower()
                if state_lower == "missing" or state_lower == "nan" or state in self.missing_values or state in user_missing_found:
                    has_missing_state = True
                    break
            elif state in self.missing_values or state in user_missing_found:
                has_missing_state = True
                break

        # NOTE(review): `has_missing_label` is computed but never read
        # below — presumably intended to guard the label handling; verify.
        has_missing_label = any(
            (label.lower() == "missing" or label.lower() == "nan") or label in self.missing_values or label in user_missing_found
            for label in self.labels if isinstance(label, str)
        ) or any(pd.isna(label) for label in self.labels)

        if not has_missing_state and canonical_missing_value is not None:
            # Build user-facing example text matching the missing type.
            if pd.isna(canonical_missing_value):
                example_missing = "np.nan"
                quote = ""
                missing_state_value = np.nan
            else:
                example_missing = f"'{canonical_missing_value}'" if isinstance(canonical_missing_value, str) else str(canonical_missing_value)
                quote = "'" if isinstance(canonical_missing_value, str) else ""
                missing_state_value = canonical_missing_value

            # Describe every kind of missing indicator that was found.
            missing_types = []
            if has_pandas_nan:
                missing_types.append("NaN (pandas)")
            if string_missing_variants:
                missing_types.extend([f"'{v}'" for v in string_missing_variants])
            if string_nan_variants:
                missing_types.extend([f"'{v}'" for v in string_nan_variants])
            if user_missing_found:
                missing_types.extend([str(v) for v in user_missing_found if v not in string_missing_variants and v not in string_nan_variants and not pd.isna(v)])
            missing_type_desc = ", ".join(missing_types) if missing_types else "missing values"

            missing_values_desc = ""
            if self.missing_values:
                missing_values_desc = f"\n    You specified missing_values={self.missing_values}."

            print(
                f"[!] Detected missing values ({missing_type_desc}) in the sequence data.{missing_values_desc}\n"
                f"    -> Automatically added {example_missing} to `states` and `labels` for compatibility.\n"
                "       However, it's strongly recommended to manually include it when defining `states` and `labels`.\n"
                "       For example:\n\n"
                f"       states = [{quote}At Home{quote}, {quote}Left Home{quote}, {example_missing}]\n"
                f"       labels = [{quote}At Home{quote}, {quote}Left Home{quote}, {quote}Missing{quote}]\n\n"
                "       This ensures consistent color mapping and avoids unexpected visualization errors."
            )

            # Append the missing state (always last, so it maps to code N).
            self.states.append(missing_state_value)

            # Keep labels aligned with states:
            # 1. strip any existing "Missing" label (case-insensitive),
            # 2. pad/trim to the original state count,
            # 3. append "Missing" last to mirror the appended state.
            labels_without_missing = [label for label in self.labels
                                      if not (isinstance(label, str) and label.lower() == "missing")]

            if len(labels_without_missing) < self._original_num_states:
                # Unusual: too few labels — pad with generic placeholders.
                while len(labels_without_missing) < self._original_num_states:
                    labels_without_missing.append(f"State {len(labels_without_missing) + 1}")
            elif len(labels_without_missing) > self._original_num_states:
                # Too many labels — keep only one per original state.
                labels_without_missing = labels_without_missing[:self._original_num_states]

            self.labels = labels_without_missing + ["Missing"]

            # Safety check: states and labels must stay in lockstep.
            if len(self.states) != len(self.labels):
                raise ValueError(
                    f"Internal error: Length mismatch after adding missing state. "
                    f"States length: {len(self.states)}, Labels length: {len(self.labels)}. "
                    f"States: {self.states}, Labels: {self.labels}. "
                    f"Original num states: {self._original_num_states}"
                )

            # Flag for downstream color handling (_assign_colors).
            self._missing_auto_added = True
|
|
672
|
+
|
|
673
|
+
|
|
674
|
+
def _convert_states(self):
    """
    Encode the categorical states as integers 1..N in place.

    The numeric codes follow the exact order in which the user listed
    `states` when constructing the object; that ordering drives color
    assignment, so changing it would scramble plot legends. (An earlier
    version sorted alphabetically, e.g. {'High': 1, 'Low': 2, ...}.)

    Anything that is not a known state — pandas NaN, user-declared missing
    markers, or the literal string "missing" in any casing — is encoded as
    the last code, which corresponds to the missing state.
    """
    # Codes are assigned in the user-supplied order: first state -> 1.
    self.state_mapping = {}
    for position, original_state in enumerate(self.states, start=1):
        self.state_mapping[original_state] = position
    # Reverse lookup (code -> state) for legends and plotting helpers.
    self.inverse_state_mapping = {code: state for state, code in self.state_mapping.items()}

    def _encode(cell):
        # Known state: its 1-based code.
        if cell in self.state_mapping:
            return self.state_mapping[cell]
        # Pandas NaN: last code is reserved for the missing state.
        if pd.isna(cell):
            return len(self.states)
        # User-declared markers, or the literal string "missing".
        if cell in self.missing_values or str(cell).lower() == 'missing':
            if cell in self.states:
                return self.state_mapping.get(cell, len(self.states))
            return len(self.states)
        # Unknown value: fall back to the missing code.
        return len(self.states)

    # DataFrame.map is the modern API; applymap covers older pandas.
    try:
        self.seqdata = self.seqdata.map(_encode)
    except AttributeError:
        self.seqdata = self.seqdata.applymap(_encode)

    # Re-attach sequence IDs as the row index when available.
    if self.ids is not None:
        self.seqdata.index = self.ids
|
|
718
|
+
|
|
719
|
+
def _assign_colors(self, reverse_colors=True):
    """Assigns a color palette using user-defined or default Spectral palette.

    If missing values are present, automatically assigns a fixed gray color (#cfcccc)
    to missing values and uses the existing color scheme for non-missing states.

    Populates:
      * self.color_map          — {numeric state code 1..N: color}
      * self.color_map_by_label — {label: color}, used by plot legends

    Args:
        reverse_colors: reverse generated palettes (Spectral/viridis/...)
            so the first state gets the "warm" end; user-provided
            custom_colors are never reversed.
    """
    num_states = len(self.states)

    # Missing state (if any) always sits last and gets a fixed gray.
    has_missing = self.ismissing
    missing_gray_color = (0.811765, 0.8, 0.8)  # Fixed gray color for missing values (#cfcccc)

    if has_missing:
        # Palette generation only covers the real states; gray is appended.
        non_missing_states = num_states - 1

        if self.custom_colors:
            # User-specified palette: accept either N or N-1 colors.
            if len(self.custom_colors) == num_states:
                # Colors for every state including missing — use as is.
                color_list = self.custom_colors
            elif len(self.custom_colors) == non_missing_states:
                # Colors for real states only — append gray for missing.
                color_list = self.custom_colors + [missing_gray_color]
                if self._missing_auto_added:
                    print(
                        f"[!] Automatically added gray color (#cfcccc) for missing values.\n"
                        f"    -> You provided {len(self.custom_colors)} colors for {self._original_num_states} states, "
                        f"but Missing was automatically added.\n"
                        f"    -> Added gray (#cfcccc) as the color for Missing state."
                    )
            elif self._missing_auto_added and len(self.custom_colors) == self._original_num_states:
                # Missing was auto-appended after the user sized their
                # palette to the original states — add gray for it.
                color_list = self.custom_colors + [missing_gray_color]
                print(
                    f"[!] Automatically added gray color (#cfcccc) for missing values.\n"
                    f"    -> You provided {len(self.custom_colors)} colors for {self._original_num_states} states, "
                    f"but Missing was automatically added.\n"
                    f"    -> Added gray (#cfcccc) as the color for Missing state."
                )
            else:
                raise ValueError(
                    f"Length of custom_colors ({len(self.custom_colors)}) must match "
                    f"either total states ({num_states}) or non-missing states ({non_missing_states}).\n"
                    f"Hint: If Missing was automatically added, you can either:\n"
                    f"  1. Include 'Missing' in your states and labels when creating SequenceData, or\n"
                    f"  2. Provide {non_missing_states} colors (without Missing) and we'll add gray automatically."
                )
        else:
            # Generated palette for the real states; gray appended after.
            if non_missing_states <= 20:
                non_missing_color_list = sns.color_palette("Spectral", non_missing_states)
            else:
                # Use a more elegant color palette for many states - combination of viridis and pastel colors
                if non_missing_states <= 40:
                    # Use viridis for up to 40 states (more colorful than cubehelix)
                    non_missing_color_list = sns.color_palette("viridis", non_missing_states)
                else:
                    # For very large state counts, use a custom palette combining multiple schemes
                    viridis_colors = sns.color_palette("viridis", min(non_missing_states // 2, 20))
                    pastel_colors = sns.color_palette("Set3", min(non_missing_states // 2, 12))
                    tab20_colors = sns.color_palette("tab20", min(non_missing_states // 3, 20))

                    # Combine and extend the palette
                    combined_colors = viridis_colors + pastel_colors + tab20_colors
                    # If we need more colors, cycle through the combined palette
                    while len(combined_colors) < non_missing_states:
                        combined_colors.extend(combined_colors[:min(len(combined_colors), non_missing_states - len(combined_colors))])

                    non_missing_color_list = combined_colors[:non_missing_states]

            if reverse_colors:
                non_missing_color_list = list(reversed(non_missing_color_list))

            # Add fixed gray color for missing values at the end
            color_list = list(non_missing_color_list) + [missing_gray_color]
    else:
        # No missing values — palette covers all states directly.
        if self.custom_colors:
            if len(self.custom_colors) != num_states:
                raise ValueError("Length of custom_colors must match number of states.")
            color_list = self.custom_colors
        else:
            if num_states <= 20:
                color_list = sns.color_palette("Spectral", num_states)
            else:
                # Use a more elegant color palette for many states - combination of viridis and pastel colors
                if num_states <= 40:
                    # Use viridis for up to 40 states (more colorful than cubehelix)
                    color_list = sns.color_palette("viridis", num_states)
                else:
                    # For very large state counts, use a custom palette combining multiple schemes
                    viridis_colors = sns.color_palette("viridis", min(num_states // 2, 20))
                    pastel_colors = sns.color_palette("Set3", min(num_states // 2, 12))
                    tab20_colors = sns.color_palette("tab20", min(num_states // 3, 20))

                    # Combine and extend the palette
                    combined_colors = viridis_colors + pastel_colors + tab20_colors
                    # If we need more colors, cycle through the combined palette
                    while len(combined_colors) < num_states:
                        combined_colors.extend(combined_colors[:min(len(combined_colors), num_states - len(combined_colors))])

                    color_list = combined_colors[:num_states]

            if reverse_colors:
                color_list = list(reversed(color_list))

    # Apply additional_colors if specified (assign custom colors to specific states while keeping default colors)
    if self.additional_colors:
        color_list = list(color_list)  # Make a copy to avoid modifying original
        for state, custom_color in self.additional_colors.items():
            if state in self.states:
                state_index = self.states.index(state)
                # Accept '#RRGGBB' hex, 0-255 RGB, or 0-1 RGB inputs.
                if isinstance(custom_color, str) and custom_color.startswith('#'):
                    # Convert hex to RGB tuple (values 0-1)
                    hex_color = custom_color.lstrip('#')
                    rgb = tuple(int(hex_color[i:i+2], 16) / 255.0 for i in (0, 2, 4))
                    color_list[state_index] = rgb
                elif isinstance(custom_color, (tuple, list)) and len(custom_color) == 3:
                    # If RGB values are 0-255, convert to 0-1
                    if all(0 <= v <= 255 for v in custom_color):
                        color_list[state_index] = tuple(v / 255.0 for v in custom_color)
                    else:
                        # Assume already 0-1 range
                        color_list[state_index] = tuple(custom_color)
                else:
                    # Pass anything else through unchanged (e.g. named color).
                    color_list[state_index] = custom_color

    # self.color_map = {state: color_list[i] for i, state in enumerate(self.states)}
    # Keyed by numeric code 1..N, which aligns with imshow(vmin=1, vmax=N).
    self.color_map = {i + 1: color_list[i] for i in range(num_states)}

    # Construct color_map with label as key (for legend)
    self.color_map_by_label = {
        self.state_to_label[state]: self.color_map[self.state_mapping[state]]
        for state in self.states
    }
|
|
858
|
+
|
|
859
|
+
def get_colormap(self):
    """Build a ListedColormap whose entries follow state codes 1..N."""
    # self.color_map is keyed by 1-based integer state codes, so assemble
    # the palette in code order to stay aligned with imshow(vmin=1, vmax=N).
    palette = [self.color_map[code] for code in range(1, len(self.states) + 1)]
    return ListedColormap(palette)
|
|
863
|
+
|
|
864
|
+
def describe(self):
    """
    Print an overview of the sequence dataset.

    Reports the number of sequences and time points, min/max sequence
    lengths, missing-value statistics (when missing values are present),
    the state alphabet and labels, and weight statistics when weights
    were provided at construction.

    NOTE:
        Only the first few missing sequence IDs are printed. Dumping the
        full list can exceed Jupyter's IOPub data rate limit (1MB/sec by
        default) when thousands of sequences contain missing values,
        which interrupts output to the client.
    """
    print(f"[>] Number of sequences: {len(self.seqdata)}")
    print(f"[>] Number of time points: {self.n_steps}")

    if self.ismissing:
        # Missing values are encoded as len(self.states); a sequence's
        # effective length is the count of entries not equal to that code.
        missing_code = len(self.states)
        lengths = self.seqdata.apply(lambda row: (row != missing_code).sum(), axis=1)
        print(f"[>] Min/Max sequence length: {lengths.min()} / {lengths.max()}")

        # Identify missing values and related IDs.
        # Hoisted: the original called self.seqdata.stack() twice, doing the
        # expensive reshape work redundantly on large datasets.
        stacked = self.seqdata.stack()
        missing_locs = stacked[stacked == missing_code].index.get_level_values(0)
        missing_count = len(missing_locs)
        unique_missing_ids = missing_locs.unique().tolist()
        print(f"[>] There are {missing_count} missing values across {len(unique_missing_ids)} sequences.")
        print(f" First few missing sequence IDs: {unique_missing_ids[:10]} ...")

        # Find and display sequences with the most missing points
        missing_counts = self.seqdata.isin([missing_code]).sum(axis=1)
        most_missing = missing_counts[missing_counts > 0].sort_values(ascending=False).head(5)
        print("[>] Top sequences with the most missing time points:")
        print(" (Each row shows a sequence ID and its number of missing values)\n")
        print(most_missing.rename("Missing Count").to_frame().rename_axis("Sequence ID"))

    else:
        # Compute the per-row valid lengths once (the original evaluated
        # notna().sum(axis=1) twice inside one f-string).
        lengths = self.seqdata.notna().sum(axis=1)
        print(f"[>] Min/Max sequence length: {lengths.min()} / {lengths.max()}")

    print(f"[>] States: {self.states}")
    print(f"[>] Labels: {self.labels}")

    # Display weights information only if weights were originally provided.
    if self._weights_provided:
        weight_mean = np.mean(self.weights)
        weight_std = np.std(self.weights)
        print(f"[>] Weights: Provided (total weight={sum(self.weights):.3f}, mean={weight_mean:.3f}, std={weight_std:.3f})")
    else:
        print(f"[>] Weights: Not provided")
|
|
909
|
+
|
|
910
|
+
def get_legend(self):
    """Return (handles, labels) for drawing a state-color legend.

    Also caches the proxy rectangles on self.legend_handles so that
    plot_legend() can reuse them.
    """
    # One invisible proxy rectangle per state; color_map is keyed by the
    # 1-based integer state code.
    handles = []
    for i in range(len(self.states)):
        proxy = plt.Rectangle((0, 0), 1, 1,
                              color=self.color_map[i + 1],
                              label=self.labels[i])
        handles.append(proxy)
    self.legend_handles = handles
    return self.legend_handles, self.labels
|
|
925
|
+
|
|
926
|
+
def to_dataframe(self) -> pd.DataFrame:
|
|
927
|
+
"""Returns the processed sequence dataset as a DataFrame."""
|
|
928
|
+
return self.seqdata
|
|
929
|
+
|
|
930
|
+
def plot_legend(self, save_as=None, dpi=200):
    """Render the state-color legend as a small standalone figure.

    Parameters
    ----------
    save_as : str, optional
        File path to save the figure to; if omitted, only display it.
    dpi : int, default=200
        Resolution used when saving the figure.
    """
    # Build handles lazily so this works even if get_legend() was never called.
    handles = getattr(self, "legend_handles", None)
    if not handles:
        handles = [
            plt.Rectangle((0, 0), 1, 1,
                          color=self.color_map[i + 1],
                          label=self.labels[i])
            for i in range(len(self.states))
        ]
        self.legend_handles = handles

    fig, ax = plt.subplots(figsize=(2, 2))
    ax.legend(handles=handles, loc='center', title="States", fontsize=10)
    ax.axis('off')

    # tight_layout is applied only on the display-only path, matching the
    # original behavior (savefig uses the figure as-is).
    if save_as:
        plt.savefig(save_as, dpi=dpi)
    else:
        plt.tight_layout()
    plt.show()
|
|
951
|
+
|
|
952
|
+
# ------------------------------
# The following properties and helpers support multidomain sequence analysis,
# in particular seqdomassoc().
|
|
954
|
+
|
|
955
|
+
@property
|
|
956
|
+
def n_sequences(self):
|
|
957
|
+
"""Returns number of sequences (rows)."""
|
|
958
|
+
return self.seqdata.shape[0]
|
|
959
|
+
|
|
960
|
+
@property
|
|
961
|
+
def n_steps(self):
|
|
962
|
+
"""Returns sequence length (columns)."""
|
|
963
|
+
return self.seqdata.shape[1]
|
|
964
|
+
|
|
965
|
+
@property
|
|
966
|
+
def alphabet(self):
|
|
967
|
+
"""Returns state alphabet."""
|
|
968
|
+
return self._alphabet
|
|
969
|
+
|
|
970
|
+
@alphabet.setter
|
|
971
|
+
def alphabet(self, val):
|
|
972
|
+
self._alphabet = val
|
|
973
|
+
|
|
974
|
+
@property
|
|
975
|
+
def sequences(self):
|
|
976
|
+
"""Returns sequences as a list of lists (one list per sequence)."""
|
|
977
|
+
return [list(row) for row in self.seqdata.values]
|
|
978
|
+
|
|
979
|
+
@property
|
|
980
|
+
def weights(self):
|
|
981
|
+
return self._weights
|
|
982
|
+
|
|
983
|
+
@weights.setter
|
|
984
|
+
def weights(self, val):
|
|
985
|
+
self._weights = val
|
|
986
|
+
|
|
987
|
+
def flatten(self) -> np.ndarray:
|
|
988
|
+
"""Flatten all sequences into a 1D array (row-wise)."""
|
|
989
|
+
return self.seqdata.values.flatten()
|
|
990
|
+
|
|
991
|
+
def flatten_weights(self) -> np.ndarray:
|
|
992
|
+
"""
|
|
993
|
+
Repeat weights across sequence length for 1D alignment with flatten().
|
|
994
|
+
E.g., 5 sequences x 10 steps -> repeat each weight 10 times.
|
|
995
|
+
"""
|
|
996
|
+
return np.repeat(self.weights, self.n_steps)
|
|
997
|
+
|
|
998
|
+
def to_numeric(self) -> np.ndarray:
|
|
999
|
+
"""Returns integer-coded sequence data as NumPy array."""
|
|
1000
|
+
return self.seqdata.to_numpy(dtype=np.int32)
|
|
1001
|
+
|
|
1002
|
+
def get_xtabs(self, other: SequenceData, weighted=True) -> np.ndarray:
    """
    Cross-tabulate states between two position-aligned datasets (NumPy only).

    Cell (i, j) of the result accumulates the (optionally weighted) number
    of positions at which self is in state i+1 and other is in state j+1.

    Parameters
    ----------
    other : SequenceData
        Dataset to cross-tabulate against; must have the same shape.
    weighted : bool, default=True
        Weight each position by this dataset's sequence weight.

    Returns
    -------
    np.ndarray of shape (len(self.states), len(other.states)).

    Raises
    ------
    ValueError
        If the two SequenceData objects differ in shape.
    """
    if (self.n_sequences, self.n_steps) != (other.n_sequences, other.n_steps):
        raise ValueError("Both SequenceData objects must have same shape.")

    left = self.flatten()
    right = other.flatten()

    # len(states) rather than the alphabet: states include the
    # missing-value code, which the alphabet does not.
    table = np.zeros((len(self.states), len(other.states)), dtype=np.float64)

    # State codes are 1-based while NumPy indexing is 0-based, hence -1.
    # np.add.at accumulates safely even with repeated index pairs.
    increments = self.flatten_weights() if weighted else 1
    np.add.at(table, (left - 1, right - 1), increments)

    return table
|
|
1029
|
+
|
|
1030
|
+
def check_uniqueness_rate(self, weighted: bool = False):
|
|
1031
|
+
"""
|
|
1032
|
+
Compute uniqueness statistics of the sequences.
|
|
1033
|
+
|
|
1034
|
+
Returns:
|
|
1035
|
+
dict with keys:
|
|
1036
|
+
- n_sequences: total number of sequences (unweighted count)
|
|
1037
|
+
- n_unique: number of unique sequence patterns
|
|
1038
|
+
- uniqueness_rate: n_unique / n_sequences
|
|
1039
|
+
- weighted_total: total weighted count (only if weighted=True)
|
|
1040
|
+
- weighted_uniqueness_rate: n_unique / weighted_total (only if weighted=True)
|
|
1041
|
+
|
|
1042
|
+
Parameters:
|
|
1043
|
+
weighted: if True, use sequence weights to calculate weighted frequencies and uniqueness rates;
|
|
1044
|
+
if False, use simple counts (default behavior for backward compatibility).
|
|
1045
|
+
"""
|
|
1046
|
+
import numpy as np
|
|
1047
|
+
import pandas as pd
|
|
1048
|
+
|
|
1049
|
+
A = self.to_numeric() # shape (n, m), int32
|
|
1050
|
+
n, m = A.shape
|
|
1051
|
+
|
|
1052
|
+
# Use a byte-level view to let np.unique work row-wise efficiently
|
|
1053
|
+
A_contig = np.ascontiguousarray(A)
|
|
1054
|
+
row_view = A_contig.view(np.dtype((np.void, A_contig.dtype.itemsize * m))).ravel()
|
|
1055
|
+
|
|
1056
|
+
# Get unique patterns
|
|
1057
|
+
uniq, inverse = np.unique(row_view, return_inverse=True)
|
|
1058
|
+
|
|
1059
|
+
n_unique = uniq.size
|
|
1060
|
+
uniqueness_rate = float(n_unique) / float(n) if n > 0 else np.nan
|
|
1061
|
+
|
|
1062
|
+
# Build simplified result dictionary with only essential statistics
|
|
1063
|
+
result = {
|
|
1064
|
+
"n_sequences": int(n),
|
|
1065
|
+
"n_unique": int(n_unique),
|
|
1066
|
+
"uniqueness_rate": uniqueness_rate
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
# Add weighted statistics if requested
|
|
1070
|
+
if weighted:
|
|
1071
|
+
weighted_total = float(np.sum(self.weights))
|
|
1072
|
+
weighted_uniqueness_rate = float(n_unique) / weighted_total if weighted_total > 0 else np.nan
|
|
1073
|
+
result["weighted_total"] = weighted_total
|
|
1074
|
+
result["weighted_uniqueness_rate"] = weighted_uniqueness_rate
|
|
1075
|
+
|
|
1076
|
+
return result
|
|
1077
|
+
|
|
1078
|
+
def show_color_palette(self, palette_name: str = 'default', save_as: str = None, dpi: int = 200):
    """
    Preview the color palette for this instance's number of states.

    Convenience wrapper around SequenceData.show_default_color_palette(),
    using len(self.states) as the palette size. Useful for copying hex
    codes into custom_colors.

    Parameters
    ----------
    palette_name : str, default='default'
        Palette template name; see show_default_color_palette() for options.
    save_as : str, optional
        If provided, save the color preview figure to this file path.
    dpi : int, default=200
        Resolution used when saving the figure.

    Returns
    -------
    dict with keys 'colors' (RGB 0-1 tuples), 'hex_colors' (hex codes),
    and 'rgb_255' (RGB 0-255 tuples).

    Example
    -------
    seq_data = SequenceData(...)
    color_info = seq_data.show_color_palette()
    color_info = seq_data.show_color_palette(palette_name='viridis')
    """
    return SequenceData.show_default_color_palette(
        n_states=len(self.states),
        reverse_colors=True,
        palette_name=palette_name,
        save_as=save_as,
        dpi=dpi,
    )
|
|
1116
|
+
|
|
1117
|
+
@staticmethod
|
|
1118
|
+
def _get_available_palette_names():
|
|
1119
|
+
"""
|
|
1120
|
+
Get list of available color palette names that can be used with show_default_color_palette.
|
|
1121
|
+
|
|
1122
|
+
Returns:
|
|
1123
|
+
--------
|
|
1124
|
+
list : List of available palette names (strings)
|
|
1125
|
+
"""
|
|
1126
|
+
# Common seaborn color palettes
|
|
1127
|
+
available_palettes = [
|
|
1128
|
+
'default', # Uses automatic selection based on n_states (Spectral/viridis/combined)
|
|
1129
|
+
'Spectral',
|
|
1130
|
+
'viridis',
|
|
1131
|
+
'Set3',
|
|
1132
|
+
'tab20',
|
|
1133
|
+
'deep',
|
|
1134
|
+
'muted',
|
|
1135
|
+
'pastel',
|
|
1136
|
+
'bright',
|
|
1137
|
+
'dark',
|
|
1138
|
+
'colorblind',
|
|
1139
|
+
'husl',
|
|
1140
|
+
'hls',
|
|
1141
|
+
'coolwarm',
|
|
1142
|
+
'RdYlGn',
|
|
1143
|
+
'RdYlBu',
|
|
1144
|
+
'RdBu',
|
|
1145
|
+
'PiYG',
|
|
1146
|
+
'PRGn',
|
|
1147
|
+
'BrBG',
|
|
1148
|
+
'Set1',
|
|
1149
|
+
'Set2',
|
|
1150
|
+
'Paired',
|
|
1151
|
+
'Accent',
|
|
1152
|
+
'Dark2',
|
|
1153
|
+
]
|
|
1154
|
+
return available_palettes
|
|
1155
|
+
|
|
1156
|
+
@staticmethod
def _generate_color_list(n_states: int, palette_name: str = 'default', reverse_colors: bool = True):
    """
    Build a list of n_states RGB tuples (0-1 range) from a palette.

    Parameters
    ----------
    n_states : int
        Number of colors to generate.
    palette_name : str, default='default'
        'default' selects a palette automatically by alphabet size;
        otherwise any name from _get_available_palette_names().
    reverse_colors : bool, default=True
        Reverse the final color order.

    Returns
    -------
    list of (r, g, b) tuples in the 0-1 range.

    Raises
    ------
    ValueError
        If palette_name is not a recognized palette.
    """
    if palette_name == 'default':
        # Automatic selection: Spectral for small alphabets, viridis for
        # medium ones, and a combined pool for very large ones.
        if n_states <= 20:
            color_list = sns.color_palette("Spectral", n_states)
        elif n_states <= 40:
            color_list = sns.color_palette("viridis", n_states)
        else:
            pool = (sns.color_palette("viridis", min(n_states // 2, 20))
                    + sns.color_palette("Set3", min(n_states // 2, 12))
                    + sns.color_palette("tab20", min(n_states // 3, 20)))
            # Cycle through the pool until enough colors are available.
            while len(pool) < n_states:
                pool.extend(pool[:min(len(pool), n_states - len(pool))])
            color_list = pool[:n_states]
    else:
        try:
            color_list = sns.color_palette(palette_name, n_states)
        except ValueError:
            # Some palettes reject an explicit size; repeat the base
            # palette until n_states colors are collected.
            try:
                base = sns.color_palette(palette_name)
                color_list = []
                while len(color_list) < n_states:
                    color_list.extend(base)
                color_list = color_list[:n_states]
            except Exception as e:
                raise ValueError(f"Invalid palette name '{palette_name}'. Available palettes: {', '.join(SequenceData._get_available_palette_names())}") from e

    if reverse_colors:
        color_list = list(reversed(color_list))

    return color_list
|
|
1210
|
+
|
|
1211
|
+
@staticmethod
|
|
1212
|
+
def _convert_rgb_to_hex_and_255(color_list):
|
|
1213
|
+
"""
|
|
1214
|
+
Convert list of RGB tuples (0-1 range) to hex codes and RGB (0-255 range).
|
|
1215
|
+
|
|
1216
|
+
Parameters:
|
|
1217
|
+
-----------
|
|
1218
|
+
color_list : list
|
|
1219
|
+
List of RGB tuples in 0-1 range.
|
|
1220
|
+
|
|
1221
|
+
Returns:
|
|
1222
|
+
--------
|
|
1223
|
+
tuple : (hex_colors, rgb_255_list) where:
|
|
1224
|
+
- hex_colors: List of hex color codes (e.g., "#FF5733")
|
|
1225
|
+
- rgb_255_list: List of RGB tuples (0-255 range)
|
|
1226
|
+
"""
|
|
1227
|
+
hex_colors = []
|
|
1228
|
+
rgb_255_list = []
|
|
1229
|
+
for rgb in color_list:
|
|
1230
|
+
# Convert from 0-1 to 0-255
|
|
1231
|
+
rgb_255 = tuple(int(c * 255) for c in rgb)
|
|
1232
|
+
rgb_255_list.append(rgb_255)
|
|
1233
|
+
# Convert to hex
|
|
1234
|
+
hex_color = f"#{rgb_255[0]:02X}{rgb_255[1]:02X}{rgb_255[2]:02X}"
|
|
1235
|
+
hex_colors.append(hex_color)
|
|
1236
|
+
return hex_colors, rgb_255_list
|
|
1237
|
+
|
|
1238
|
+
@staticmethod
def show_default_color_palette(n_states: int, reverse_colors: bool = True, palette_name: str = 'default', save_as: str = None, dpi: int = 200):
    """
    Display the default color palette that would be used for a given number of states.

    Prints the palette information (including hex codes to copy into
    custom_colors) and renders a swatch figure with one row per state.

    Parameters
    ----------
    n_states : int
        Number of states (colors) to generate.
    reverse_colors : bool, default=True
        Whether to reverse the color order (same as default behavior in SequenceData).
    palette_name : str, default='default'
        Name of the color palette to use. 'default' uses the automatic selection
        (Spectral for <=20 states, viridis for 21-40 states, combined for >40).
        Call SequenceData._get_available_palette_names() for all options.
    save_as : str, optional
        If provided, save the color preview figure to this file path.
    dpi : int, default=200
        Resolution for saving the figure (if save_as is provided).

    Returns
    -------
    dict : Dictionary with keys:
        - 'colors': List of RGB tuples (0-1 range)
        - 'hex_colors': List of hex color codes (e.g., "#FF5733")
        - 'rgb_255': List of RGB tuples (0-255 range)

    Example
    -------
    color_info = SequenceData.show_default_color_palette(13)
    color_info = SequenceData.show_default_color_palette(13, palette_name='viridis')
    custom_colors = color_info['hex_colors']
    """
    # Generate colors using the specified palette
    color_list = SequenceData._generate_color_list(n_states, palette_name, reverse_colors)

    # Convert RGB (0-1) to hex and RGB (0-255)
    hex_colors, rgb_255_list = SequenceData._convert_rgb_to_hex_and_255(color_list)

    # Print header with palette information
    print(f"\n{'='*80}")
    palette_display_name = "Default (automatic selection)" if palette_name == 'default' else palette_name
    print(f"Color Palette: {palette_display_name} for {n_states} States")
    print(f"{'='*80}\n")

    # Show the available palette names only when the automatic default is used
    if palette_name == 'default':
        available_palettes = SequenceData._get_available_palette_names()
        print("Available color palette templates:")
        print(" " + ", ".join(available_palettes))
        print("\n You can specify a palette template by using the 'palette_name' parameter.")
        print(" Example: show_default_color_palette(13, palette_name='viridis')\n")

    # Create visualization; height grows with the number of states so rows stay legible
    fig, ax = plt.subplots(figsize=(12, max(6, n_states * 0.5)))

    for i, (hex_color, rgb, rgb_255) in enumerate(zip(hex_colors, color_list, rgb_255_list)):
        # Draw color swatch; rows are laid out top-to-bottom in palette order
        y_pos = n_states - i - 1
        rect = plt.Rectangle((0, y_pos), 1, 0.8, facecolor=rgb, edgecolor='black', linewidth=0.5)
        ax.add_patch(rect)
        ax.text(1.1, y_pos + 0.4, f"{i+1:2d}. {hex_color} | RGB{rgb_255}",
                va='center', fontsize=10, fontfamily='monospace')

    # Print copy-paste-ready snippets for custom_colors / additional_colors
    print(f"{'='*80}")
    print("\nTo use these colors as custom_colors, copy the hex codes:")
    print(" custom_colors = " + str(hex_colors))
    print("\nOr use additional_colors to assign custom colors to specific states:")
    print(" additional_colors = {'Other': '#BDBDBD'} # Assign gray color to 'Other' state")
    print(f"{'='*80}\n")

    # Configure plot: hide axes and frame so only the swatches and labels show
    ax.set_xlim(0, 8)
    ax.set_ylim(-0.5, n_states)
    ax.set_yticks([])
    ax.set_xticks([])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(False)
    title_text = f"Color Palette: {palette_display_name} ({n_states} States)"
    ax.set_title(title_text, fontsize=14, pad=20)

    plt.tight_layout()

    if save_as:
        plt.savefig(save_as, dpi=dpi, bbox_inches='tight')
        print(f"[>] Color palette saved to: {save_as}")

    plt.show()

    return {
        'colors': color_list,  # RGB tuples (0-1 range)
        'hex_colors': hex_colors,  # Hex codes
        'rgb_255': rgb_255_list  # RGB tuples (0-255 range)
    }
|
|
1347
|
+
|
|
1348
|
+
@staticmethod
def get_default_color_palette(n_states: int, reverse_colors: bool = True, palette_name: str = 'default', return_format: str = 'hex'):
    """
    Return the color palette for n_states without displaying it.

    Programmatic counterpart to show_default_color_palette().

    Parameters
    ----------
    n_states : int
        Number of states (colors) to generate.
    reverse_colors : bool, default=True
        Whether to reverse the color order (matches SequenceData's default).
    palette_name : str, default='default'
        Palette template; see show_default_color_palette() for options.
    return_format : str, default='hex'
        'hex' for hex codes (e.g., "#FF5733"), 'rgb' for 0-1 RGB tuples
        (matplotlib-ready), or 'rgb255' for 0-255 RGB tuples.

    Returns
    -------
    list : Colors in the requested format.

    Raises
    ------
    ValueError
        If return_format is not 'hex', 'rgb', or 'rgb255'.

    Example
    -------
    hex_colors = SequenceData.get_default_color_palette(13, return_format='hex')
    seq = SequenceData(df, time=..., states=..., custom_colors=hex_colors)
    """
    color_list = SequenceData._generate_color_list(n_states, palette_name, reverse_colors)

    # 'rgb' needs no conversion; return early.
    if return_format == 'rgb':
        return color_list

    hex_colors, rgb_255_list = SequenceData._convert_rgb_to_hex_and_255(color_list)
    if return_format == 'hex':
        return hex_colors
    if return_format == 'rgb255':
        return rgb_255_list

    raise ValueError(f"return_format must be 'hex', 'rgb', or 'rgb255', got '{return_format}'")
|
|
1396
|
+
|
|
1397
|
+
|
|
1398
|
+
|
|
1399
|
+
|
|
1400
|
+
|