sequenzo 0.1.21__cp39-cp39-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sequenzo might be problematic. Click here for more details.
- sequenzo/__init__.py +240 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +467 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +196 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-39-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1380 -0
- sequenzo/clustering/src/KMedoid.cpp +262 -0
- sequenzo/clustering/src/PAM.cpp +236 -0
- sequenzo/clustering/src/PAMonce.cpp +234 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +20 -0
- sequenzo/data_preprocessing/helpers.py +256 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_family.csv +1867 -0
- sequenzo/datasets/polyadic_samplec1.csv +61 -0
- sequenzo/datasets/polyadic_samplep1.csv +61 -0
- sequenzo/datasets/polyadic_seqc1.csv +61 -0
- sequenzo/datasets/polyadic_seqp1.csv +61 -0
- sequenzo/define_sequence_data.py +609 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +34 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +431 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +89 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +43 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
- sequenzo/prefix_tree/system_level_indicators.py +465 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +48 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
- sequenzo/suffix_tree/system_level_indicators.py +456 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +194 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +404 -0
- sequenzo/visualization/plot_sequence_index.py +937 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +613 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.21.dist-info/METADATA +308 -0
- sequenzo-0.1.21.dist-info/RECORD +254 -0
- sequenzo-0.1.21.dist-info/WHEEL +5 -0
- sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.21.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,937 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : Yuqi Liang 梁彧祺
|
|
3
|
+
@File : plot_sequence_index.py
|
|
4
|
+
@Time : 29/12/2024 09:08
|
|
5
|
+
@Desc :
|
|
6
|
+
Generate sequence index plots.
|
|
7
|
+
"""
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import matplotlib.pyplot as plt
|
|
11
|
+
|
|
12
|
+
# Use relative import to avoid circular import when top-level package imports visualization
|
|
13
|
+
from ..define_sequence_data import SequenceData
|
|
14
|
+
from sequenzo.visualization.utils import (
|
|
15
|
+
set_up_time_labels_for_x_axis,
|
|
16
|
+
save_figure_to_buffer,
|
|
17
|
+
create_standalone_legend,
|
|
18
|
+
combine_plot_with_legend,
|
|
19
|
+
save_and_show_results,
|
|
20
|
+
determine_layout,
|
|
21
|
+
show_plot_title
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def smart_sort_groups(groups):
    """
    Order group names so that names with a leading number come first.

    Each item is converted with ``str()``. Items whose text starts with a run
    of digits are ranked by that integer value (ties broken by the full
    text); every other item is ranked after them, in plain string order.

    :param groups: Iterable of group names
    :return: New list with the original items in sorted order
    """
    import re

    # Compiled once here so the key function does not re-parse the pattern
    leading_digits = re.compile(r'^(\d+)')

    def _rank(name):
        text = str(name)
        found = leading_digits.match(text)
        if found is None:
            # No numeric prefix: infinity places it after all numbered names
            return (float('inf'), text)
        return (int(found.group(1)), text)

    return sorted(groups, key=_rank)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _cmdscale(D):
|
|
45
|
+
"""
|
|
46
|
+
Classic Multidimensional Scaling (MDS), equivalent to R's cmdscale()
|
|
47
|
+
|
|
48
|
+
:param D: A NxN symmetric distance matrix
|
|
49
|
+
:return: Y, a Nxd coordinate matrix, where d is the largest positive eigenvalues' count
|
|
50
|
+
"""
|
|
51
|
+
n = len(D)
|
|
52
|
+
|
|
53
|
+
# Step 1: Compute the centering matrix
|
|
54
|
+
H = np.eye(n) - np.ones((n, n)) / n
|
|
55
|
+
|
|
56
|
+
# Step 2: Compute the double centered distance matrix
|
|
57
|
+
B = -0.5 * H @ (D ** 2) @ H
|
|
58
|
+
|
|
59
|
+
# Step 3: Compute eigenvalues and eigenvectors
|
|
60
|
+
eigvals, eigvecs = np.linalg.eigh(B)
|
|
61
|
+
|
|
62
|
+
# Step 4: Sort eigenvalues and eigenvectors in descending order
|
|
63
|
+
idx = np.argsort(eigvals)[::-1]
|
|
64
|
+
eigvals = eigvals[idx]
|
|
65
|
+
eigvecs = eigvecs[:, idx]
|
|
66
|
+
|
|
67
|
+
# Step 5: Select only positive eigenvalues
|
|
68
|
+
w, = np.where(eigvals > 0)
|
|
69
|
+
if len(w) > 0:
|
|
70
|
+
L = np.diag(np.sqrt(eigvals[w]))
|
|
71
|
+
V = eigvecs[:, w]
|
|
72
|
+
return V @ L # Return the MDS coordinates
|
|
73
|
+
else:
|
|
74
|
+
# Fallback if no positive eigenvalues
|
|
75
|
+
return np.zeros((n, 1))
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _find_most_frequent_sequence(sequences):
|
|
79
|
+
"""
|
|
80
|
+
Find the most frequent sequence in the dataset.
|
|
81
|
+
|
|
82
|
+
:param sequences: numpy array of sequences
|
|
83
|
+
:return: index of the most frequent sequence
|
|
84
|
+
"""
|
|
85
|
+
from collections import Counter
|
|
86
|
+
|
|
87
|
+
# Convert sequences to tuples for hashing
|
|
88
|
+
seq_tuples = [tuple(seq) for seq in sequences]
|
|
89
|
+
|
|
90
|
+
# Count frequencies
|
|
91
|
+
counter = Counter(seq_tuples)
|
|
92
|
+
|
|
93
|
+
# Find the most frequent sequence
|
|
94
|
+
most_frequent = counter.most_common(1)[0][0]
|
|
95
|
+
|
|
96
|
+
# Find the index of this sequence in the original array
|
|
97
|
+
for i, seq in enumerate(seq_tuples):
|
|
98
|
+
if seq == most_frequent:
|
|
99
|
+
return i
|
|
100
|
+
|
|
101
|
+
return 0 # Fallback
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _select_sequences_subset(seqdata, sequence_selection, n_sequences, sort_by, sort_by_weight, weights, mask=None):
|
|
105
|
+
"""
|
|
106
|
+
Select a subset of sequences based on the selection method.
|
|
107
|
+
|
|
108
|
+
:param seqdata: SequenceData object
|
|
109
|
+
:param sequence_selection: Selection method ("all", "first_n", "last_n", or list of IDs)
|
|
110
|
+
:param n_sequences: Number of sequences for "first_n" or "last_n"
|
|
111
|
+
:param sort_by: Sorting method to use before selection
|
|
112
|
+
:param sort_by_weight: Whether to sort by weight
|
|
113
|
+
:param weights: Sequence weights
|
|
114
|
+
:param mask: Optional mask for pre-filtering sequences
|
|
115
|
+
:return: Boolean mask for selected sequences
|
|
116
|
+
"""
|
|
117
|
+
# Start with all sequences or pre-filtered mask
|
|
118
|
+
if mask is None:
|
|
119
|
+
mask = np.ones(len(seqdata.values), dtype=bool)
|
|
120
|
+
|
|
121
|
+
# If "all", return the current mask
|
|
122
|
+
if sequence_selection == "all":
|
|
123
|
+
return mask
|
|
124
|
+
|
|
125
|
+
# Get indices of sequences that pass the mask
|
|
126
|
+
valid_indices = np.where(mask)[0]
|
|
127
|
+
|
|
128
|
+
# Handle ID list selection
|
|
129
|
+
if isinstance(sequence_selection, list):
|
|
130
|
+
# Convert list to set for faster lookup
|
|
131
|
+
selected_ids = set(sequence_selection)
|
|
132
|
+
|
|
133
|
+
# Find indices of sequences with matching IDs
|
|
134
|
+
selected_mask = np.zeros(len(seqdata.values), dtype=bool)
|
|
135
|
+
if hasattr(seqdata, 'ids') and seqdata.ids is not None:
|
|
136
|
+
for i in valid_indices:
|
|
137
|
+
if seqdata.ids[i] in selected_ids:
|
|
138
|
+
selected_mask[i] = True
|
|
139
|
+
else:
|
|
140
|
+
print("Warning: sequence_selection provided as ID list but seqdata has no IDs. Using all sequences.")
|
|
141
|
+
return mask
|
|
142
|
+
|
|
143
|
+
return selected_mask
|
|
144
|
+
|
|
145
|
+
# For "first_n" or "last_n", we need to sort first
|
|
146
|
+
if sequence_selection in ["first_n", "last_n"]:
|
|
147
|
+
# Get the subset of data based on current mask
|
|
148
|
+
subset_seqdata = seqdata
|
|
149
|
+
subset_weights = weights
|
|
150
|
+
|
|
151
|
+
if not np.all(mask):
|
|
152
|
+
# Create subset if mask is not all True
|
|
153
|
+
subset_values = seqdata.values[mask]
|
|
154
|
+
subset_ids = seqdata.ids[mask] if hasattr(seqdata, 'ids') and seqdata.ids is not None else None
|
|
155
|
+
|
|
156
|
+
# Use original seqdata for structure, just work with filtered values
|
|
157
|
+
subset_seqdata = seqdata # Keep original structure
|
|
158
|
+
|
|
159
|
+
if weights is not None:
|
|
160
|
+
subset_weights = weights[mask]
|
|
161
|
+
|
|
162
|
+
# Apply sorting to get the order
|
|
163
|
+
distance_matrix = None
|
|
164
|
+
if sort_by in ["mds", "distance_to_most_frequent"]:
|
|
165
|
+
try:
|
|
166
|
+
from sequenzo.dissimilarity_measures.get_distance_matrix import get_distance_matrix
|
|
167
|
+
distance_matrix = get_distance_matrix(
|
|
168
|
+
seqdata=subset_seqdata,
|
|
169
|
+
method="OM",
|
|
170
|
+
sm="CONSTANT",
|
|
171
|
+
indel="auto"
|
|
172
|
+
)
|
|
173
|
+
if hasattr(distance_matrix, 'values'):
|
|
174
|
+
distance_matrix = distance_matrix.values
|
|
175
|
+
except ImportError:
|
|
176
|
+
print(f"Warning: Cannot compute distance matrix for '{sort_by}' sorting. Using unsorted order.")
|
|
177
|
+
sort_by = "unsorted"
|
|
178
|
+
|
|
179
|
+
# Apply sorting to the masked subset
|
|
180
|
+
if sort_by_weight and subset_weights is not None:
|
|
181
|
+
# Sort by weight on the subset
|
|
182
|
+
sorted_indices = np.argsort(-subset_weights)
|
|
183
|
+
else:
|
|
184
|
+
# Sort on the subset values
|
|
185
|
+
if sort_by == "unsorted" or sort_by == "none":
|
|
186
|
+
sorted_indices = np.arange(len(valid_indices))
|
|
187
|
+
elif sort_by == "lexicographic":
|
|
188
|
+
subset_values = seqdata.values[mask]
|
|
189
|
+
vals = subset_values.astype(float, copy=True)
|
|
190
|
+
vals = np.nan_to_num(vals, nan=np.inf)
|
|
191
|
+
sorted_indices = np.lexsort(vals.T[::-1])
|
|
192
|
+
elif sort_by in ["mds", "distance_to_most_frequent"]:
|
|
193
|
+
# For complex sorting that requires distance matrix,
|
|
194
|
+
# we'll fall back to simple lexicographic for now
|
|
195
|
+
subset_values = seqdata.values[mask]
|
|
196
|
+
vals = subset_values.astype(float, copy=True)
|
|
197
|
+
vals = np.nan_to_num(vals, nan=np.inf)
|
|
198
|
+
sorted_indices = np.lexsort(vals.T[::-1])
|
|
199
|
+
print(f"Warning: {sort_by} sorting simplified to lexicographic for sequence selection")
|
|
200
|
+
else:
|
|
201
|
+
sorted_indices = np.arange(len(valid_indices))
|
|
202
|
+
|
|
203
|
+
# Select first_n or last_n
|
|
204
|
+
n_available = len(sorted_indices)
|
|
205
|
+
n_to_select = min(n_sequences, n_available)
|
|
206
|
+
|
|
207
|
+
if sequence_selection == "first_n":
|
|
208
|
+
selected_subset_indices = sorted_indices[:n_to_select]
|
|
209
|
+
elif sequence_selection == "last_n":
|
|
210
|
+
selected_subset_indices = sorted_indices[-n_to_select:]
|
|
211
|
+
|
|
212
|
+
# Map back to original indices
|
|
213
|
+
original_indices = valid_indices[selected_subset_indices]
|
|
214
|
+
|
|
215
|
+
# Create final mask
|
|
216
|
+
final_mask = np.zeros(len(seqdata.values), dtype=bool)
|
|
217
|
+
final_mask[original_indices] = True
|
|
218
|
+
|
|
219
|
+
return final_mask
|
|
220
|
+
|
|
221
|
+
else:
|
|
222
|
+
raise ValueError(f"Unsupported sequence_selection: {sequence_selection}. "
|
|
223
|
+
f"Supported options: 'all', 'first_n', 'last_n', or list of IDs")
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def sort_sequences_by_method(seqdata, method="unsorted", mask=None, distance_matrix=None, weights=None):
    """
    Compute an ordering of the sequences in a SequenceData object.

    :param seqdata: SequenceData object (its ``values`` are read)
    :param method: str, sorting method - "unsorted" (or "none"), "lexicographic",
                   "mds", "distance_to_most_frequent"
    :param mask: np.array(bool), if provided, only the selected rows are sorted
    :param distance_matrix: np.array, required for "mds" and
                            "distance_to_most_frequent"; when a mask is given and
                            the matrix size differs from the number of selected
                            rows, it is sliced down to those rows/columns
    :param weights: np.array, optional weights for sequences (accepted but not
                    used by any sorting method in this function)
    :return: np.array of sorting indices; with a mask these are positions
             within the masked subset
    :raises ValueError: If the method is unsupported, or a distance-based
                        method is requested without a distance matrix
    """
    rows = seqdata.values.copy()
    count = len(rows) if mask is None else int(np.sum(mask))

    if mask is not None:
        rows = rows[mask]
        # A matrix already matching the subset size is used as-is; otherwise
        # it is taken to cover the full sample and is cut down to the subset
        if distance_matrix is not None and distance_matrix.shape[0] != count:
            kept = np.where(mask)[0]
            distance_matrix = distance_matrix[np.ix_(kept, kept)]

    if method in ("unsorted", "none"):
        # Keep original order (R default)
        return np.arange(count)

    if method == "lexicographic":
        # NaN-safe: NaNs become +inf so they sort to the end
        as_float = np.nan_to_num(rows.astype(float, copy=True), nan=np.inf)
        # lexsort's last key is the primary one, hence the reversed columns
        return np.lexsort(as_float.T[::-1])

    if method == "mds":
        if distance_matrix is None:
            raise ValueError("Distance matrix is required for MDS sorting")

        # TODO: Support weighted MDS (TraMineR's wcmdscale analogue) when weights are provided.
        coordinates = _cmdscale(distance_matrix)
        # Order along the first MDS dimension
        return np.argsort(coordinates[:, 0])

    if method == "distance_to_most_frequent":
        if distance_matrix is None:
            raise ValueError("Distance matrix is required for distance_to_most_frequent sorting")

        reference = _find_most_frequent_sequence(rows)
        # Ascending distance from the most frequent sequence
        return np.argsort(distance_matrix[reference, :])

    raise ValueError(f"Unsupported sorting method: {method}. "
                     f"Supported methods are: 'unsorted', 'lexicographic', 'mds', 'distance_to_most_frequent'")
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def plot_sequence_index(seqdata: SequenceData,
                        # Grouping parameters
                        group_by_column=None,
                        group_dataframe=None,
                        group_column_name=None,
                        group_labels=None,
                        # Other parameters
                        sort_by="lexicographic",
                        sort_by_weight=False,
                        weights="auto",
                        figsize=(10, 6),
                        plot_style="standard",
                        title=None,
                        xlabel="Time",
                        ylabel="Sequences",
                        save_as=None,
                        dpi=200,
                        layout='column',
                        nrows: int = None,
                        ncols: int = None,
                        group_order=None,
                        sort_groups='auto',
                        fontsize=12,
                        show_group_titles: bool = True,
                        include_legend: bool = True,
                        sequence_selection="all",
                        n_sequences=10,
                        show_sequence_ids=False
                        ):
    """Create sequence index plots, optionally split into one panel per group.

    Each sequence is drawn as one horizontal row of an image (one column per
    time point, one colour per state). Without grouping information the call
    is delegated to ``_sequence_index_plot_single``; otherwise one subplot is
    drawn per group and the panels are assembled into a single image.

    **Two API modes for grouping:**

    1. **Simplified API** (when grouping info is already in the data):
       ```python
       plot_sequence_index(seqdata, group_by_column="Cluster", group_labels=cluster_labels)
       ```

    2. **Complete API** (when grouping info is in a separate dataframe):
       ```python
       plot_sequence_index(seqdata, group_dataframe=membership_df,
                           group_column_name="Cluster")
       ```

    :param seqdata: SequenceData object containing sequence information
    :param group_by_column: (str, optional) Column name from seqdata.data to group by.
        Use this when grouping information is already in your data.
        Example: "Cluster", "sex", "education"
    :param group_dataframe: (pd.DataFrame, optional) Separate dataframe containing grouping
        information. The ID column is "Entity ID" if present, otherwise the first column.
        Overwritten when group_by_column is given.
    :param group_column_name: (str, optional) Name of the grouping column in group_dataframe.
        Required when using group_dataframe.
    :param group_labels: (dict, optional) Maps original group values to display labels,
        e.g. {1: "Late Family Formation", 2: "Early Partnership"}.
        Only applied when group_by_column is used; it must then cover every unique value.
    :param sort_by: Sorting method for sequences within groups:
        - 'unsorted' or 'none': Keep original order (R TraMineR default)
        - 'lexicographic': Sort sequences lexicographically (NaN sorted last)
        - 'mds' / 'distance_to_most_frequent': in grouped plots the row order
          falls back to lexicographic and a warning is printed
        Any other value leaves the rows in their original order.
    :param sort_by_weight: If True and weights are available, sort sequences by weight
        (descending); overrides sort_by
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses
        seqdata.weights if available
    :param figsize: Size of each subplot (only used when plot_style="custom")
    :param plot_style: Plot aspect style:
        - 'standard': (10, 6) - balanced view
        - 'compact': (8, 8) - more square, like R plots
        - 'wide': (12, 4) - emphasizes time progression
        - 'narrow': (8, 10) - moderately vertical
        - 'custom': Use the provided figsize parameter
    :param title: Common title for the figure (no title if None)
    :param xlabel: Label for the x-axis
    :param ylabel: Label for the y-axis (drawn on the first column of panels only)
    :param save_as: File path to save the plot; '.png' is appended when the path has
        no .png/.jpg/.jpeg/.pdf extension
    :param dpi: DPI for saved image
    :param layout: Layout style passed to determine_layout ('column' or 'grid')
    :param nrows: Optional explicit number of subplot rows (passed to determine_layout)
    :param ncols: Optional explicit number of subplot columns (passed to determine_layout)
    :param group_order: List, manually specify group order (overrides sort_groups);
        groups not listed are excluded with a warning
    :param sort_groups: 'auto' or 'numeric' (smart_sort_groups), 'alpha' (sorted),
        'none' (order of first appearance)
    :param fontsize: Base font size for text elements (titles use fontsize+2, ticks use fontsize-2)
    :param show_group_titles: Whether to show group titles
    :param include_legend: Whether to include legend in the plot (True by default)
    :param sequence_selection: Method for selecting sequences to visualize
        ("all", "first_n", "last_n", or a list of sequence IDs); forwarded to
        _select_sequences_subset for each group
    :param n_sequences: Number of sequences to show when using "first_n" or "last_n" (default: 10)
    :param show_sequence_ids: If True, show actual sequence IDs on y-axis instead of
        sequence numbers. Most useful when sequence_selection is a list of IDs (default: False)

    :raises ValueError: for an unknown plot_style, plot_style='custom' with the default
        figsize, an unknown group_by_column, incomplete group_labels, an invalid
        sort_groups value, or a weights vector of the wrong length.
    """
    # Per-panel figure size for every supported style; 'custom' uses the caller's figsize.
    style_sizes = {
        'standard': (10, 6),   # Balanced view
        'compact': (8, 8),     # More square, like R plots
        'wide': (12, 4),       # Wide, emphasizes time
        'narrow': (8, 10),     # Moderately vertical
        'custom': figsize      # User-provided
    }

    if plot_style not in style_sizes:
        raise ValueError(f"Invalid plot_style '{plot_style}'. "
                         f"Supported styles: {list(style_sizes.keys())}")

    # 'custom' with the untouched default figsize is treated as a usage error.
    if plot_style == 'custom' and figsize == (10, 6):
        raise ValueError(
            "When using plot_style='custom', you must explicitly provide a figsize parameter "
            "that differs from the default (10, 6). "
            "Suggested custom sizes:\n"
            " - For wide plots: figsize=(15, 5)\n"
            " - For tall plots: figsize=(7, 12)\n"
            " - For square plots: figsize=(9, 9)\n"
            " - For small plots: figsize=(6, 4)\n"
            "Example: plot_sequence_index(data, plot_style='custom', figsize=(12, 8))"
        )

    actual_figsize = style_sizes[plot_style]

    # Simplified API: derive group_dataframe / group_column_name from seqdata.data.
    if group_by_column is not None:
        if group_by_column not in seqdata.data.columns:
            available_cols = [col for col in seqdata.data.columns if col not in seqdata.time and col != seqdata.id_col]
            raise ValueError(
                f"Column '{group_by_column}' not found in the data. "
                f"Available columns for grouping: {available_cols}"
            )

        group_dataframe = seqdata.data[[seqdata.id_col, group_by_column]].copy()
        group_dataframe.columns = ['Entity ID', 'Category']
        group_column_name = 'Category'

        unique_values = seqdata.data[group_by_column].unique()

        if group_labels is not None:
            # Custom labels must cover every value, otherwise map() would produce NaN groups.
            missing_keys = set(unique_values) - set(group_labels.keys())
            if missing_keys:
                raise ValueError(
                    f"group_labels missing mappings for values: {missing_keys}. "
                    f"Please provide labels for all unique values in '{group_by_column}': {sorted(unique_values)}"
                )
            group_dataframe['Category'] = group_dataframe['Category'].map(group_labels)
        # Without group_labels the raw values (numeric or string) are used as group names.

        print(f"[>] Creating grouped plots by '{group_by_column}' with {len(unique_values)} categories")

    # No grouping information: draw a single plot.
    if group_dataframe is None or group_column_name is None:
        return _sequence_index_plot_single(seqdata, sort_by, sort_by_weight, weights, actual_figsize, plot_style, title, xlabel, ylabel, save_as, dpi, fontsize, include_legend, sequence_selection, n_sequences, show_sequence_ids)

    # Resolve and validate weights.
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    # ID column of the grouping table: prefer 'Entity ID', otherwise the first column.
    id_col_name = "Entity ID" if "Entity ID" in group_dataframe.columns else group_dataframe.columns[0]

    # Determine which groups to draw and in which order.
    present_groups = group_dataframe[group_column_name].unique()
    if group_order:
        # Manual order; entries that do not exist in the data are dropped.
        groups = [g for g in group_order if g in present_groups]
        missing_groups = [g for g in present_groups if g not in group_order]
        if missing_groups:
            print(f"[Warning] Groups not in group_order will be excluded: {missing_groups}")
    elif sort_groups == 'numeric' or sort_groups == 'auto':
        groups = smart_sort_groups(present_groups)
    elif sort_groups == 'alpha':
        groups = sorted(present_groups)
    elif sort_groups == 'none':
        groups = list(present_groups)
    else:
        raise ValueError(f"Invalid sort_groups value: {sort_groups}. Use 'auto', 'numeric', 'alpha', or 'none'.")

    num_groups = len(groups)

    # Grid shape for the panels.
    nrows, ncols = determine_layout(num_groups, layout=layout, nrows=nrows, ncols=ncols)

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid; with the default
    # squeeze a single Axes object is returned and .flatten() would fail.
    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows),
        gridspec_kw={'wspace': 0.15, 'hspace': 0.25},  # Reduced spacing for tighter layout
        squeeze=False
    )
    axes = axes.flatten()

    # Upper bound on y ticks; narrow panels get fewer ticks.
    max_ticks = 8 if plot_style == "narrow" else 11

    # seqdata weights decide whether the total weight is shown in panel titles.
    original_weights = getattr(seqdata, "weights", None)

    # One panel per group.
    for i, group in enumerate(groups):
        ax = axes[i]

        # IDs belonging to this group, matched against the sequence IDs.
        group_ids = group_dataframe[group_dataframe[group_column_name] == group][id_col_name].values
        mask = np.isin(seqdata.ids, group_ids)
        if not np.any(mask):
            print(f"Warning: No matching sequences found for group '{group}'")
            # Blank the panel instead of leaving an empty framed axis behind.
            ax.axis('off')
            continue

        # Restrict the group to the requested subset of sequences.
        mask = _select_sequences_subset(seqdata, sequence_selection, n_sequences, sort_by, sort_by_weight, weights, mask)

        group_sequences = seqdata.values[mask]

        # IDs are only needed when they are used as y-axis labels.
        group_ids_for_labels = None
        if show_sequence_ids and getattr(seqdata, 'ids', None) is not None:
            group_ids_for_labels = seqdata.ids[mask]

        group_weights = weights[mask] if weights is not None else None

        # Row order inside the panel.
        if sort_by_weight and group_weights is not None:
            # Sort by weight (descending)
            sorted_indices = np.argsort(-group_weights)
        elif sort_by in ("lexicographic", "mds", "distance_to_most_frequent"):
            if sort_by != "lexicographic":
                # Distance-based orders are not computed per group; fall back to lexicographic.
                print(f"Warning: {sort_by} sorting simplified to lexicographic for grouped plots with sequence selection")
            # NaN -> +inf so missing values sort last; lexsort treats its LAST key as
            # primary, hence the reversed column order (first time point = primary key).
            vals = np.nan_to_num(group_sequences.astype(float, copy=True), nan=np.inf)
            sorted_indices = np.lexsort(vals.T[::-1])
        else:
            # unsorted or other methods
            sorted_indices = np.arange(len(group_sequences))

        sorted_data = group_sequences[sorted_indices]

        sorted_group_ids = None
        if group_ids_for_labels is not None:
            sorted_group_ids = group_ids_for_labels[sorted_indices]

        # Values below 1 are not valid state codes here: turn them into NaN so they
        # are masked (left blank) in the image.
        data = sorted_data.astype(float)
        data[data < 1] = np.nan

        # Nothing drawable (also covers an empty selection).
        if np.all(~np.isfinite(data)):
            print(f"Warning: all values missing/invalid for group '{group}'")
            ax.axis('off')
            continue

        ax.imshow(np.ma.masked_invalid(data), aspect='auto', cmap=seqdata.get_colormap(),
                  interpolation='nearest', vmin=1, vmax=len(seqdata.states))

        # Remove grid lines
        ax.grid(False)

        # Set up time labels
        set_up_time_labels_for_x_axis(seqdata, ax)

        # Y ticks: evenly spaced, always including the first and last sequence.
        num_sequences = sorted_data.shape[0]

        if sorted_group_ids is not None and num_sequences <= 10:
            # Few sequences with ID labels: label every row.
            ytick_positions = np.arange(num_sequences)
        else:
            ytick_positions = np.linspace(0, num_sequences - 1, num=min(max_ticks, num_sequences), dtype=int)
            ytick_positions = np.unique(ytick_positions)

        if sorted_group_ids is not None:
            ytick_labels = [str(sorted_group_ids[pos]) for pos in ytick_positions]
        else:
            # Default behavior: 1-based sequence numbers.
            ytick_labels = (ytick_positions + 1).astype(int)

        ax.set_yticks(ytick_positions)
        ax.set_yticklabels(ytick_labels, fontsize=fontsize-2, color='black')

        # Customize axis style
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_color('gray')
        ax.spines['bottom'].set_color('gray')
        ax.spines['left'].set_linewidth(0.7)
        ax.spines['bottom'].set_linewidth(0.7)

        # Move spines slightly away from the plot area for better aesthetics
        ax.spines['left'].set_position(('outward', 5))
        ax.spines['bottom'].set_position(('outward', 5))

        # Ensure ticks are always visible regardless of plot style
        ax.tick_params(axis='x', colors='gray', length=4, width=0.7, which='major')
        ax.tick_params(axis='y', colors='gray', length=4, width=0.7, which='major')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')
        ax.tick_params(axis='both', which='major', direction='out')

        # Panel title; total weight is shown only when seqdata carries non-unit weights.
        if original_weights is not None and not np.allclose(original_weights, 1.0) and group_weights is not None:
            sum_w = float(group_weights.sum())
            group_title = f"{group} (n = {num_sequences}, total weight = {sum_w:.1f})"
        else:
            group_title = f"{group} (n = {num_sequences})"
        if show_group_titles:
            show_plot_title(ax, group_title, show=True, fontsize=fontsize, loc='right')

        # y label only on the first column of panels; x label on every panel.
        if i % ncols == 0:
            ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=10, color='black')

        ax.set_xlabel(xlabel, fontsize=fontsize, labelpad=10, color='black')

    # Hide grid cells that have no group assigned (does not rely on the loop variable).
    for unused_ax in axes[num_groups:]:
        unused_ax.set_visible(False)

    # Add a common title if provided
    if title:
        fig.suptitle(title, fontsize=fontsize+2, y=1.02)

    # Explicit margins instead of tight_layout.
    fig.subplots_adjust(wspace=0.15, hspace=0.25, bottom=0.1, top=0.9, right=0.98, left=0.08)

    # Render the panel figure to memory.
    main_buffer = save_figure_to_buffer(fig, dpi=dpi)

    # Default to PNG when the target path has no recognised image extension.
    if save_as and not save_as.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
        save_as = save_as + '.png'

    if include_legend:
        # Create standalone legend
        colors = seqdata.color_map_by_label
        legend_buffer = create_standalone_legend(
            colors=colors,
            labels=seqdata.labels,
            ncol=min(5, len(seqdata.states)),
            figsize=(actual_figsize[0] * ncols, 1),
            fontsize=fontsize-2,
            dpi=dpi
        )

        # Combine plot with legend; the helper receives the output path.
        combined_img = combine_plot_with_legend(
            main_buffer,
            legend_buffer,
            output_path=save_as,
            dpi=dpi,
            padding=20
        )

        # Display combined image
        plt.figure(figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows + 1))
        plt.imshow(combined_img)
        plt.axis('off')
        plt.show()
        plt.close()
    else:
        # Display plot without legend
        plt.figure(figsize=(actual_figsize[0] * ncols, actual_figsize[1] * nrows))
        plt.imshow(main_buffer)
        plt.axis('off')

        if save_as:
            plt.savefig(save_as, dpi=dpi, bbox_inches='tight')
        plt.show()
        plt.close()
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
def _sequence_index_plot_single(seqdata: SequenceData,
                                sort_by="unsorted",
                                sort_by_weight=False,
                                weights="auto",
                                figsize=(10, 6),
                                plot_style="standard",
                                title=None,
                                xlabel="Time",
                                ylabel="Sequences",
                                save_as=None,
                                dpi=200,
                                fontsize=12,
                                include_legend=True,
                                sequence_selection="all",
                                n_sequences=10,
                                show_sequence_ids=False):
    """Draw one (ungrouped) sequence index plot using `imshow` for fast rendering.

    Every selected sequence becomes one row of the image; values below 1 and NaN
    are masked and left blank.

    :param seqdata: SequenceData object containing sequence information
    :param sort_by: Sorting method. 'lexicographic' sorts rows lexicographically
        (NaN last); 'mds' and 'distance_to_most_frequent' fall back to lexicographic
        with a printed warning; any other value keeps the original order
    :param sort_by_weight: If True and weights are available, sort sequences by weight
        (descending); overrides sort_by
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses
        seqdata.weights if available
    :param figsize: (tuple) Size of the figure (only used when plot_style="custom")
    :param plot_style: Plot aspect style ('standard', 'compact', 'wide', 'narrow', 'custom')
    :param title: (str) Title for the plot. When given, "(n = ...)" and, for non-unit
        seqdata weights, the total weight are appended. No title is set when None
    :param xlabel: (str) Label for the x-axis
    :param ylabel: (str) Label for the y-axis
    :param save_as: File path to save the plot (forwarded to save_and_show_results)
    :param dpi: DPI for saved image
    :param fontsize: Base font size (title uses fontsize+2, ticks use fontsize-2)
    :param include_legend: Whether to include legend in the plot (True by default)
    :param sequence_selection: Method for selecting sequences ("all", "first_n", "last_n", or list of IDs)
    :param n_sequences: Number of sequences for "first_n" or "last_n"
    :param show_sequence_ids: If True, show actual sequence IDs on y-axis instead of sequence numbers

    :raises ValueError: for an unknown plot_style, plot_style='custom' with the default
        figsize, or a weights vector of the wrong length.
    :return: None
    """
    # Figure size for every supported style; 'custom' uses the caller's figsize.
    style_sizes = {
        'standard': (10, 6),   # Balanced view
        'compact': (8, 8),     # More square, like R plots
        'wide': (12, 4),       # Wide, emphasizes time
        'narrow': (8, 10),     # Moderately vertical
        'custom': figsize      # User-provided
    }

    if plot_style not in style_sizes:
        raise ValueError(f"Invalid plot_style '{plot_style}'. "
                         f"Supported styles: {list(style_sizes.keys())}")

    # 'custom' with the untouched default figsize is treated as a usage error.
    if plot_style == 'custom' and figsize == (10, 6):
        raise ValueError(
            "When using plot_style='custom', you must explicitly provide a figsize parameter "
            "that differs from the default (10, 6). "
            "Suggested custom sizes:\n"
            " - For wide plots: figsize=(15, 5)\n"
            " - For tall plots: figsize=(7, 12)\n"
            " - For square plots: figsize=(9, 9)\n"
            " - For small plots: figsize=(6, 4)\n"
            "Example: plot_sequence_index(data, plot_style='custom', figsize=(12, 8))"
        )

    actual_figsize = style_sizes[plot_style]

    # Resolve and validate weights.
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    # Boolean mask of the sequences to draw.
    selection_mask = _select_sequences_subset(seqdata, sequence_selection, n_sequences, sort_by, sort_by_weight, weights)

    all_ids = getattr(seqdata, 'ids', None)
    selected_ids = None  # IDs aligned with sequence_values, for optional y-axis labels
    if not np.all(selection_mask):
        sequence_values = seqdata.values[selection_mask].copy()
        if all_ids is not None:
            selected_ids = all_ids[selection_mask]
        # Keep weights aligned with the selected rows.
        if weights is not None:
            weights = weights[selection_mask]
    else:
        sequence_values = seqdata.values.copy()
        if all_ids is not None:
            selected_ids = all_ids

    # Row order.
    if sort_by_weight and weights is not None:
        # Sort by weight (descending)
        sorted_indices = np.argsort(-weights)
    elif sort_by in ("lexicographic", "mds", "distance_to_most_frequent"):
        if sort_by != "lexicographic":
            # Distance-based orders are not computed here; fall back to lexicographic.
            print(f"Warning: {sort_by} sorting simplified to lexicographic for sequence selection")
        # NaN -> +inf so missing values sort last; lexsort treats its LAST key as
        # primary, hence the reversed column order (first time point = primary key).
        vals = np.nan_to_num(sequence_values.astype(float, copy=True), nan=np.inf)
        sorted_indices = np.lexsort(vals.T[::-1])
    else:
        # unsorted or other methods
        sorted_indices = np.arange(len(sequence_values))

    sorted_data = sequence_values[sorted_indices]

    sorted_ids = None
    if selected_ids is not None and show_sequence_ids:
        sorted_ids = selected_ids[sorted_indices]

    fig, ax = plt.subplots(figsize=actual_figsize)

    # Values below 1 are not valid state codes here: turn them into NaN so they
    # are masked (left blank) in the image.
    data = sorted_data.astype(float)
    data[data < 1] = np.nan

    # Nothing drawable: release the figure that was just created before bailing out,
    # otherwise it stays registered with pyplot.
    if np.all(~np.isfinite(data)):
        print("Warning: all values missing/invalid in sequence data")
        ax.axis('off')
        plt.close(fig)
        return

    ax.imshow(np.ma.masked_invalid(data), aspect='auto', cmap=seqdata.get_colormap(),
              interpolation='nearest', vmin=1, vmax=len(seqdata.states))

    # Disable background grid and all axis guide lines
    ax.grid(False)

    # x label
    set_up_time_labels_for_x_axis(seqdata, ax)

    # Y ticks: evenly spaced, always including the first and last sequence.
    num_sequences = sorted_data.shape[0]
    max_ticks = 8 if plot_style == "narrow" else 11  # fewer ticks for narrow plots

    if sorted_ids is not None and num_sequences <= 10:
        # Few sequences with ID labels: label every row.
        ytick_positions = np.arange(num_sequences)
    else:
        ytick_positions = np.linspace(0, num_sequences - 1, num=min(max_ticks, num_sequences), dtype=int)
        ytick_positions = np.unique(ytick_positions)

    if sorted_ids is not None:
        ytick_labels = [str(sorted_ids[pos]) for pos in ytick_positions]
    else:
        # Default behavior: 1-based sequence numbers.
        ytick_labels = (ytick_positions + 1).astype(int)

    ax.set_yticks(ytick_positions)
    ax.set_yticklabels(ytick_labels, fontsize=fontsize-2, color='black')

    # Customize axis line styles and ticks
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('gray')
    ax.spines['bottom'].set_color('gray')
    ax.spines['left'].set_linewidth(0.7)
    ax.spines['bottom'].set_linewidth(0.7)

    # Move spines slightly away from the plot area for better aesthetics
    ax.spines['left'].set_position(('outward', 5))
    ax.spines['bottom'].set_position(('outward', 5))

    # Ensure ticks are always visible regardless of plot style
    ax.tick_params(axis='x', colors='gray', length=4, width=0.7, which='major')
    ax.tick_params(axis='y', colors='gray', length=4, width=0.7, which='major')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.tick_params(axis='both', which='major', direction='out')

    # Add labels and title
    ax.set_xlabel(xlabel, fontsize=fontsize, labelpad=10, color='black')
    ax.set_ylabel(ylabel, fontsize=fontsize, labelpad=10, color='black')

    # A title is only drawn when the caller supplied one.
    if title is not None:
        display_title = title

        # Total weight is shown only when seqdata carries non-unit weights.
        original_weights = getattr(seqdata, "weights", None)
        if original_weights is not None and not np.allclose(original_weights, 1.0) and weights is not None:
            sum_w = float(weights.sum())
            display_title += f" (n = {num_sequences}, total weight = {sum_w:.1f})"
        else:
            display_title += f" (n = {num_sequences})"

        ax.set_title(display_title, fontsize=fontsize+2, color='black')

    # Use legend from SequenceData if requested
    if include_legend:
        ax.legend(*seqdata.get_legend(), bbox_to_anchor=(1.05, 1), loc='upper left')

    save_and_show_results(save_as, dpi=dpi)
|