sequenzo 0.1.21__cp312-cp312-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sequenzo might be problematic. Click here for more details.
- sequenzo/__init__.py +240 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +467 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-312-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +196 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-312-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1380 -0
- sequenzo/clustering/src/KMedoid.cpp +262 -0
- sequenzo/clustering/src/PAM.cpp +236 -0
- sequenzo/clustering/src/PAMonce.cpp +234 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +20 -0
- sequenzo/data_preprocessing/helpers.py +256 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_family.csv +1867 -0
- sequenzo/datasets/polyadic_samplec1.csv +61 -0
- sequenzo/datasets/polyadic_samplep1.csv +61 -0
- sequenzo/datasets/polyadic_seqc1.csv +61 -0
- sequenzo/datasets/polyadic_seqp1.csv +61 -0
- sequenzo/define_sequence_data.py +609 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-312-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +34 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-312-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-312-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-312-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-312-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-312-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +431 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +89 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +43 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
- sequenzo/prefix_tree/system_level_indicators.py +465 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +48 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
- sequenzo/suffix_tree/system_level_indicators.py +456 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +194 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +404 -0
- sequenzo/visualization/plot_sequence_index.py +937 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +613 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.21.dist-info/METADATA +308 -0
- sequenzo-0.1.21.dist-info/RECORD +254 -0
- sequenzo-0.1.21.dist-info/WHEEL +5 -0
- sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.21.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,609 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : 梁彧祺 Yuqi Liang
|
|
3
|
+
@File : define_sequence_data.py
|
|
4
|
+
@Time : 05/02/2025 12:47
|
|
5
|
+
@Desc :
|
|
6
|
+
|
|
7
|
+
Optimized SequenceData class with integrated color scheme & legend handling.
|
|
8
|
+
|
|
9
|
+
Note on `states` and `alphabet`:
|
|
10
|
+
|
|
11
|
+
In traditional sequence analysis tools (e.g., TraMineR), the `alphabet` refers to the full set of distinct states
|
|
12
|
+
found in the data and is often inferred automatically from the observed sequences.
|
|
13
|
+
|
|
14
|
+
However, in this implementation, we require the user to explicitly provide the set of `states`. This explicit control
|
|
15
|
+
is essential for ensuring consistent ordering of states, reproducibility of visualizations, and compatibility across
|
|
16
|
+
sequence datasets - especially when certain states may not appear in a given subset of the data.
|
|
17
|
+
|
|
18
|
+
As a result, `alphabet` is automatically set to `states` upon initialization, and kept as a semantic alias for clarity
|
|
19
|
+
and potential compatibility. Users should treat `states` as the definitive state space and are not required to provide
|
|
20
|
+
`alphabet` separately.
|
|
21
|
+
|
|
22
|
+
# ----------------------------------------------------------------------
|
|
23
|
+
# [Hint] Handling the ID column for sequence analysis
|
|
24
|
+
# ----------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
# STEP 1: Check if your DataFrame already has a column representing unique entity IDs
|
|
27
|
+
# For example, check if "Entity ID" or "country" or any other identifier exists:
|
|
28
|
+
print(df.columns)
|
|
29
|
+
|
|
30
|
+
# If your data already has an ID column (e.g., 'Entity ID'), you can directly use it:
|
|
31
|
+
seq = SequenceData(df, id_col='Entity ID', time=..., states=...)
|
|
32
|
+
|
|
33
|
+
# ----------------------------------------------------------------------
|
|
34
|
+
# STEP 2: If your data has NO ID column, use the helper function below
|
|
35
|
+
# ----------------------------------------------------------------------
|
|
36
|
+
from sequenzo.utils import assign_unique_ids
|
|
37
|
+
|
|
38
|
+
# This will insert a new ID column named 'Entity ID' as the first column
|
|
39
|
+
df = assign_unique_ids(df, id_col_name='Entity ID')
|
|
40
|
+
|
|
41
|
+
# Optional: Save it for future use to avoid repeating this step
|
|
42
|
+
df.to_csv('your_dataset_with_ids.csv', index=False)
|
|
43
|
+
|
|
44
|
+
# Then you can use it like this:
|
|
45
|
+
seq = SequenceData(df, id_col='Entity ID', time=..., states=...)
|
|
46
|
+
|
|
47
|
+
"""
|
|
48
|
+
# Requires Python 3.7+: defers evaluation of type annotations (PEP 563)
|
|
49
|
+
from __future__ import annotations
|
|
50
|
+
# Define the public API at the top of the file
|
|
51
|
+
__all__ = ['SequenceData']
|
|
52
|
+
|
|
53
|
+
# Module-level imports (third-party and standard library, including pandas)
|
|
54
|
+
import numpy as np
|
|
55
|
+
import seaborn as sns
|
|
56
|
+
import matplotlib.pyplot as plt
|
|
57
|
+
import pandas as pd
|
|
58
|
+
from docutils.parsers.rst import states
|
|
59
|
+
from matplotlib.colors import ListedColormap
|
|
60
|
+
import re
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class SequenceData:
|
|
64
|
+
"""
|
|
65
|
+
A class for defining and processing a sequence dataset for social sequence analysis.
|
|
66
|
+
|
|
67
|
+
This class provides:
|
|
68
|
+
- Sequence extraction & missing value handling.
|
|
69
|
+
- Automatic alphabet (state space) management.
|
|
70
|
+
- Efficient sequence-to-numeric conversion.
|
|
71
|
+
- Color mapping & legend storage for visualization.
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
def __init__(
        self,
        data: pd.DataFrame,
        time: list,
        states: list,
        labels: list = None,
        id_col: str = None,
        weights: np.ndarray = None,
        start: int = 1,
        custom_colors: list = None
):
    """
    Initialize the SequenceData object.

    The constructor stores copies of the inputs, validates them, extracts the
    sequence columns, processes missing values, converts states, assigns
    colors, and finally prints a summary via ``self.describe()``.

    :param data: DataFrame containing sequence data (copied, never modified in place).
    :param time: List of columns containing time labels.
    :param states: List of unique states (categories). Also used as the ``alphabet``.
    :param labels: Labels for states (optional, for visualization). Defaults to ``str(state)``
        for every state.
    :param id_col: Column name for row identifiers, which is very important for hierarchical clustering.
        When omitted, the DataFrame index is used as the IDs.
    :param weights: Sequence weights (optional).
    :param start: Starting time index (default: 1).
    :param custom_colors: Custom color palette for visualization.
    :raises ValueError: If ``states`` is missing, or if validation of the other arguments fails.
    """
    # Fail early with the same message the validator uses instead of an
    # AttributeError from `None.copy()`.
    if states is None:
        raise ValueError("'states' must be provided.")

    self.data = data.copy()
    self.time = time

    # Time labels are used as provided; users are encouraged to clean them beforehand.
    # TODO: might implement a helper function for users to clean up their time variables.
    self.cleaned_time = time

    # Independent copies so later in-place changes never touch the caller's lists.
    self.states = list(states)
    # `alphabet` is a semantic alias of `states` (see module docstring).
    self.alphabet = list(states)
    # Copy `labels` as well: `states` was already copied for this reason, and the
    # missing-value handling announces that it may extend both lists.
    self.labels = list(labels) if labels else [str(s) for s in states]

    self.id_col = id_col
    if id_col and id_col in self.data.columns:
        self.ids = np.array(self.data[id_col].values)
    else:
        # Either no ID column was requested, or the requested one does not exist.
        # In the latter case _validate_parameters() raises the descriptive
        # ValueError (instead of a bare KeyError being raised here).
        self.ids = data.index

    self.weights = weights
    self._weights_provided = weights is not None  # Track if weights were originally provided
    self.start = start
    self.custom_colors = custom_colors

    # Validate parameters
    self._validate_parameters()

    # Extract & process sequences
    self.seqdata = self._extract_sequences()
    self._process_missing_values()

    # The following two lines of code are for visualization
    self.state_to_label = dict(zip(self.states, self.labels))
    self.label_to_state = dict(zip(self.labels, self.states))

    self._convert_states()

    # Assign colors & save legend
    self._assign_colors()

    # Automatically print dataset overview
    print("\n[>] SequenceData initialized successfully! Here's a summary:")
    self.describe()
|
|
141
|
+
|
|
142
|
+
@property
|
|
143
|
+
def values(self):
|
|
144
|
+
"""Returns sequence data as a NumPy array, similar to xinyi_original_seqdef()."""
|
|
145
|
+
return self.seqdata.to_numpy(dtype=np.int32)
|
|
146
|
+
|
|
147
|
+
def __repr__(self):
|
|
148
|
+
return f"SequenceData({len(self.seqdata)} sequences, States: {self.states})"
|
|
149
|
+
|
|
150
|
+
def _validate_parameters(self):
|
|
151
|
+
"""Ensures correct input parameters and checks consistency with data."""
|
|
152
|
+
# Check states, alphabet, labels
|
|
153
|
+
if not self.states:
|
|
154
|
+
raise ValueError("'states' must be provided.")
|
|
155
|
+
|
|
156
|
+
# Validate that states are present in the actual data values
|
|
157
|
+
data_values = set(self.data[self.time].stack().unique())
|
|
158
|
+
states_clean = [s for s in self.states if not pd.isna(s)] # stack() 会去掉 nan 值,因此如果传进来的 states 有 np.nan,则会报错
|
|
159
|
+
unmatched_states = [s for s in states_clean if s not in data_values]
|
|
160
|
+
|
|
161
|
+
if unmatched_states:
|
|
162
|
+
raise ValueError(
|
|
163
|
+
f"[!] The following provided 'states' are not found in the data: {unmatched_states}\n"
|
|
164
|
+
f" Hint: Check spelling or formatting. Data contains these unique values: {sorted(data_values)}"
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
# ----------------
|
|
168
|
+
# Check if ID column is provided and valid
|
|
169
|
+
if self.id_col is not None and self.id_col not in self.data.columns:
|
|
170
|
+
raise ValueError(
|
|
171
|
+
f"[!] You must specify a valid `id_col` parameter that exists in your dataset.\n"
|
|
172
|
+
f" ID is required to uniquely identify each sequence (e.g., individuals).\n"
|
|
173
|
+
f" -> Hint: If your data does not have an ID column yet, you can use the helper function:\n\n"
|
|
174
|
+
f" from sequenzo.utils import assign_unique_ids\n"
|
|
175
|
+
f" df = assign_unique_ids(df, id_col_name='Entity ID')\n"
|
|
176
|
+
f" df.to_csv('your_dataset_with_ids.csv', index=False)\n\n"
|
|
177
|
+
f" This will permanently assign unique IDs to your dataset for future use."
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
# Because it is already implemented at initialization time
|
|
181
|
+
# self.ids = np.array(self.data[self.id_col].values)
|
|
182
|
+
|
|
183
|
+
# Validate ID uniqueness and length
|
|
184
|
+
if len(self.ids) != len(self.data):
|
|
185
|
+
raise ValueError(f"[!] Length of ID column ('{self.id_col}') must match number of rows in the dataset.")
|
|
186
|
+
if len(np.unique(self.ids)) != len(self.ids):
|
|
187
|
+
raise ValueError(f"[!] IDs in column '{self.id_col}' must be unique.")
|
|
188
|
+
|
|
189
|
+
# ----------------
|
|
190
|
+
if self.alphabet and set(self.alphabet) != set(self.states):
|
|
191
|
+
raise ValueError("'alphabet' must match 'states'.")
|
|
192
|
+
|
|
193
|
+
if self.labels:
|
|
194
|
+
if len(self.labels) != len(self.states):
|
|
195
|
+
raise ValueError("'labels' must match the length of 'states'.")
|
|
196
|
+
|
|
197
|
+
# Ensure labels are all strings
|
|
198
|
+
non_string_labels = [label for label in self.labels if not isinstance(label, str)]
|
|
199
|
+
if non_string_labels:
|
|
200
|
+
raise TypeError(
|
|
201
|
+
f"[!] All elements in 'labels' must be strings for proper visualization (e.g., for legends or annotations).\n"
|
|
202
|
+
f" Detected non-string labels: {non_string_labels}\n"
|
|
203
|
+
f" Example fix: instead of using `labels = [1, 2, 3]`, use `labels = ['Single', 'Married', 'Divorced']`."
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
# Check weights
|
|
207
|
+
if self.weights is not None:
|
|
208
|
+
if len(self.weights) != len(self.data):
|
|
209
|
+
raise ValueError("'weights' must match the length of 'data'.")
|
|
210
|
+
else:
|
|
211
|
+
self.weights = np.ones(self.data.shape[0])
|
|
212
|
+
|
|
213
|
+
def _extract_sequences(self) -> pd.DataFrame:
|
|
214
|
+
"""Extracts only relevant sequence columns."""
|
|
215
|
+
return self.data[self.time].copy()
|
|
216
|
+
|
|
217
|
+
def _process_missing_values(self):
|
|
218
|
+
"""Handles missing values based on the specified rules."""
|
|
219
|
+
# left, right, gaps = self.missing_handling.values()
|
|
220
|
+
#
|
|
221
|
+
# # Fill left-side missing values
|
|
222
|
+
# if not pd.isna(left) and left != "DEL":
|
|
223
|
+
# self.seqdata.fillna(left, inplace=True)
|
|
224
|
+
#
|
|
225
|
+
# # Process right-side missing values
|
|
226
|
+
# if right == "DEL":
|
|
227
|
+
# self.seqdata = self.seqdata.apply(lambda row: row.dropna().reset_index(drop=True), axis=1)
|
|
228
|
+
#
|
|
229
|
+
# # Process gaps (internal missing values)
|
|
230
|
+
# if not pd.isna(gaps) and gaps != "DEL":
|
|
231
|
+
# self.seqdata.replace(self.nr, gaps, inplace=True)
|
|
232
|
+
|
|
233
|
+
self.ismissing = self.seqdata.isna().any().any()
|
|
234
|
+
|
|
235
|
+
if self.ismissing:
|
|
236
|
+
# 判断 states 中是否已经含有 Missing(无论是字符串还是 np.nan)
|
|
237
|
+
# 兼容用户传进来的各种形式的"missing"
|
|
238
|
+
has_missing_state = any(pd.isna(s) for s in self.states) or any(s.lower() == "missing" for s in self.states if isinstance(s, str))
|
|
239
|
+
has_missing_label = any(label.lower() == "missing" for label in self.labels if isinstance(label, str))
|
|
240
|
+
|
|
241
|
+
if not has_missing_state:
|
|
242
|
+
# 自动判断 states 是字符串型还是数字型
|
|
243
|
+
example_missing = "'Missing'" if all(isinstance(s, str) for s in self.states) else "np.nan"
|
|
244
|
+
quote = "" if example_missing == "np.nan" else "'"
|
|
245
|
+
|
|
246
|
+
print(
|
|
247
|
+
"[!] Detected missing values (empty cells) in the sequence data.\n"
|
|
248
|
+
f" -> Automatically added {example_missing} to `states` and `labels` for compatibility.\n"
|
|
249
|
+
" However, it's strongly recommended to manually include it when defining `states` and `labels`.\n"
|
|
250
|
+
" For example:\n\n"
|
|
251
|
+
f" states = [{quote}At Home{quote}, {quote}Left Home{quote}, {example_missing}]\n"
|
|
252
|
+
f" labels = [{quote}At Home{quote}, {quote}Left Home{quote}, {quote}Missing{quote}]\n\n"
|
|
253
|
+
" This ensures consistent color mapping and avoids unexpected visualization errors."
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
# 添加 missing 到 states 和 labels
|
|
257
|
+
if example_missing == "'Missing'":
|
|
258
|
+
self.states.append("Missing")
|
|
259
|
+
else:
|
|
260
|
+
self.states.append(np.nan)
|
|
261
|
+
|
|
262
|
+
# 只有当labels中没有Missing时才添加
|
|
263
|
+
if not has_missing_label:
|
|
264
|
+
self.labels = [label for label in self.labels # 去除所有大小写混杂的 "missing"
|
|
265
|
+
if not (isinstance(label, str) and label.lower() == "missing")]
|
|
266
|
+
self.labels.append("Missing")
|
|
267
|
+
|
|
268
|
+
def _convert_states(self):
|
|
269
|
+
"""
|
|
270
|
+
Converts categorical states into numerical values for processing.
|
|
271
|
+
Note that the order has to be the same as when the user defines the states of the class,
|
|
272
|
+
as it is very important for visualization.
|
|
273
|
+
Otherwise, the colors will be assigned incorrectly.
|
|
274
|
+
|
|
275
|
+
For instance, self.states = ['Very Low', 'Low', 'Middle', 'High', 'Very High'], as the user defines when defining the class
|
|
276
|
+
but the older version here is {'High': 1, 'Low': 2, 'Middle': 3, 'Very High': 4, 'Very Low': 5}
|
|
277
|
+
"""
|
|
278
|
+
correct_order = self.states
|
|
279
|
+
|
|
280
|
+
# Create the state mapping with correct order
|
|
281
|
+
self.state_mapping = {original_state: i + 1 for i, original_state in enumerate(self.states)}
|
|
282
|
+
# 保留下面的映射关系,这样后面 legend 和绘图都能用 numeric 编码了
|
|
283
|
+
self.inverse_state_mapping = {v: k for k, v in self.state_mapping.items()}
|
|
284
|
+
|
|
285
|
+
# Apply the mapping
|
|
286
|
+
# If there are missing values, replace them with the last index + 1
|
|
287
|
+
# And update the additional missing value as a new state in self.state and self.alphabet
|
|
288
|
+
try:
|
|
289
|
+
self.seqdata = self.seqdata.map(lambda x: self.state_mapping.get(x, len(self.states)))
|
|
290
|
+
except AttributeError:
|
|
291
|
+
self.seqdata = self.seqdata.applymap(lambda x: self.state_mapping.get(x, len(self.states)))
|
|
292
|
+
|
|
293
|
+
if self.ids is not None:
|
|
294
|
+
self.seqdata.index = self.ids
|
|
295
|
+
|
|
296
|
+
def _assign_colors(self, reverse_colors=True):
|
|
297
|
+
"""Assigns a color palette using user-defined or default Spectral palette.
|
|
298
|
+
|
|
299
|
+
If missing values are present, automatically assigns a fixed gray color (#cfcccc)
|
|
300
|
+
to missing values and uses the existing color scheme for non-missing states.
|
|
301
|
+
"""
|
|
302
|
+
num_states = len(self.states)
|
|
303
|
+
|
|
304
|
+
# Check if missing values are present
|
|
305
|
+
has_missing = self.ismissing
|
|
306
|
+
missing_gray_color = (0.811765, 0.8, 0.8) # Fixed gray color for missing values (#cfcccc)
|
|
307
|
+
|
|
308
|
+
if has_missing:
|
|
309
|
+
# Count non-missing states for color palette generation
|
|
310
|
+
non_missing_states = num_states - 1
|
|
311
|
+
|
|
312
|
+
if self.custom_colors:
|
|
313
|
+
# If user provided custom colors, check if they account for missing values
|
|
314
|
+
if len(self.custom_colors) == num_states:
|
|
315
|
+
# User provided colors for all states including missing - use as is
|
|
316
|
+
color_list = self.custom_colors
|
|
317
|
+
elif len(self.custom_colors) == non_missing_states:
|
|
318
|
+
# User provided colors only for non-missing states - add gray for missing
|
|
319
|
+
color_list = self.custom_colors + [missing_gray_color]
|
|
320
|
+
else:
|
|
321
|
+
raise ValueError(f"Length of custom_colors ({len(self.custom_colors)}) must match "
|
|
322
|
+
f"either total states ({num_states}) or non-missing states ({non_missing_states}).")
|
|
323
|
+
else:
|
|
324
|
+
# Generate colors for non-missing states and add gray for missing
|
|
325
|
+
if non_missing_states <= 20:
|
|
326
|
+
non_missing_color_list = sns.color_palette("Spectral", non_missing_states)
|
|
327
|
+
else:
|
|
328
|
+
# Use a more elegant color palette for many states - combination of viridis and pastel colors
|
|
329
|
+
if non_missing_states <= 40:
|
|
330
|
+
# Use viridis for up to 40 states (more colorful than cubehelix)
|
|
331
|
+
non_missing_color_list = sns.color_palette("viridis", non_missing_states)
|
|
332
|
+
else:
|
|
333
|
+
# For very large state counts, use a custom palette combining multiple schemes
|
|
334
|
+
viridis_colors = sns.color_palette("viridis", min(non_missing_states // 2, 20))
|
|
335
|
+
pastel_colors = sns.color_palette("Set3", min(non_missing_states // 2, 12))
|
|
336
|
+
tab20_colors = sns.color_palette("tab20", min(non_missing_states // 3, 20))
|
|
337
|
+
|
|
338
|
+
# Combine and extend the palette
|
|
339
|
+
combined_colors = viridis_colors + pastel_colors + tab20_colors
|
|
340
|
+
# If we need more colors, cycle through the combined palette
|
|
341
|
+
while len(combined_colors) < non_missing_states:
|
|
342
|
+
combined_colors.extend(combined_colors[:min(len(combined_colors), non_missing_states - len(combined_colors))])
|
|
343
|
+
|
|
344
|
+
non_missing_color_list = combined_colors[:non_missing_states]
|
|
345
|
+
|
|
346
|
+
if reverse_colors:
|
|
347
|
+
non_missing_color_list = list(reversed(non_missing_color_list))
|
|
348
|
+
|
|
349
|
+
# Add fixed gray color for missing values at the end
|
|
350
|
+
color_list = list(non_missing_color_list) + [missing_gray_color]
|
|
351
|
+
else:
|
|
352
|
+
# No missing values - use original logic
|
|
353
|
+
if self.custom_colors:
|
|
354
|
+
if len(self.custom_colors) != num_states:
|
|
355
|
+
raise ValueError("Length of custom_colors must match number of states.")
|
|
356
|
+
color_list = self.custom_colors
|
|
357
|
+
else:
|
|
358
|
+
if num_states <= 20:
|
|
359
|
+
color_list = sns.color_palette("Spectral", num_states)
|
|
360
|
+
else:
|
|
361
|
+
# Use a more elegant color palette for many states - combination of viridis and pastel colors
|
|
362
|
+
if num_states <= 40:
|
|
363
|
+
# Use viridis for up to 40 states (more colorful than cubehelix)
|
|
364
|
+
color_list = sns.color_palette("viridis", num_states)
|
|
365
|
+
else:
|
|
366
|
+
# For very large state counts, use a custom palette combining multiple schemes
|
|
367
|
+
viridis_colors = sns.color_palette("viridis", min(num_states // 2, 20))
|
|
368
|
+
pastel_colors = sns.color_palette("Set3", min(num_states // 2, 12))
|
|
369
|
+
tab20_colors = sns.color_palette("tab20", min(num_states // 3, 20))
|
|
370
|
+
|
|
371
|
+
# Combine and extend the palette
|
|
372
|
+
combined_colors = viridis_colors + pastel_colors + tab20_colors
|
|
373
|
+
# If we need more colors, cycle through the combined palette
|
|
374
|
+
while len(combined_colors) < num_states:
|
|
375
|
+
combined_colors.extend(combined_colors[:min(len(combined_colors), num_states - len(combined_colors))])
|
|
376
|
+
|
|
377
|
+
color_list = combined_colors[:num_states]
|
|
378
|
+
|
|
379
|
+
if reverse_colors:
|
|
380
|
+
color_list = list(reversed(color_list))
|
|
381
|
+
|
|
382
|
+
# self.color_map = {state: color_list[i] for i, state in enumerate(self.states)}
|
|
383
|
+
# 这样所有 color map key 是 1, 2, 3...,就可以和 imshow(vmin=1, vmax=N) 对齐
|
|
384
|
+
self.color_map = {i + 1: color_list[i] for i in range(num_states)}
|
|
385
|
+
|
|
386
|
+
# 构造以 label 为 key 的 color_map(用于 legend)
|
|
387
|
+
self.color_map_by_label = {
|
|
388
|
+
self.state_to_label[state]: self.color_map[self.state_mapping[state]]
|
|
389
|
+
for state in self.states
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
def get_colormap(self):
    """Build a ``ListedColormap`` whose entries follow the numeric state codes.

    Entry ``k`` of the colormap is the color stored under key ``k + 1`` in
    ``self.color_map``; there is one entry per element of ``self.states``.
    """
    n_codes = len(self.states)
    ordered_colors = [self.color_map[code] for code in range(1, n_codes + 1)]
    return ListedColormap(ordered_colors)
|
|
396
|
+
|
|
397
|
+
def describe(self):
    """Print an overview of the sequence dataset.

    Reports the number of sequences and time points, the min/max sequence
    length, the states and labels, and a summary of the weights. When
    ``self.ismissing`` is true, cells equal to ``len(self.states)`` are
    counted as missing and a missing-value summary is printed as well.

    Only the first 10 IDs with missing values are printed: dumping
    thousands of IDs can exceed the Jupyter IOPub data rate limit and
    interrupt the output.
    """
    print(f"[>] Number of sequences: {len(self.seqdata)}")
    print(f"[>] Number of time points: {self.n_steps}")

    if self.ismissing:
        missing_code = len(self.states)

        # Vectorised comparison instead of a Python-level row-wise apply.
        lengths = (self.seqdata != missing_code).sum(axis=1)
        print(f"[>] Min/Max sequence length: {lengths.min()} / {lengths.max()}")

        # Identify missing values and related IDs (stack only once).
        stacked = self.seqdata.stack()
        missing_locs = stacked[stacked == missing_code].index.get_level_values(0)
        missing_count = len(missing_locs)
        unique_missing_ids = missing_locs.unique().tolist()
        print(f"[>] There are {missing_count} missing values across {len(unique_missing_ids)} sequences.")
        print(f" First few missing sequence IDs: {unique_missing_ids[:10]} ...")

        # Find and display sequences with the most missing points.
        missing_counts = self.seqdata.isin([missing_code]).sum(axis=1)
        most_missing = missing_counts[missing_counts > 0].sort_values(ascending=False).head(5)
        print("[>] Top sequences with the most missing time points:")
        print(" (Each row shows a sequence ID and its number of missing values)\n")
        print(most_missing.rename("Missing Count").to_frame().rename_axis("Sequence ID"))

    else:
        observed = self.seqdata.notna().sum(axis=1)
        print(
            f"[>] Min/Max sequence length: {observed.min()} / {observed.max()}")

    print(f"[>] States: {self.states}")
    print(f"[>] Labels: {self.labels}")

    # Weight statistics are shown only if weights were originally provided.
    if self._weights_provided:
        weight_mean = np.mean(self.weights)
        weight_std = np.std(self.weights)
        print(f"[>] Weights: Provided (total weight={sum(self.weights):.3f}, mean={weight_mean:.3f}, std={weight_std:.3f})")
    else:
        print("[>] Weights: Not provided")
|
|
442
|
+
|
|
443
|
+
def get_legend(self):
    """Create one legend patch per state and return them with the labels.

    The patch for the state at position ``p`` uses the color stored under
    key ``p + 1`` in ``self.color_map`` and the text ``self.labels[p]``.
    The patches are also kept on ``self.legend_handles``.

    Returns:
        tuple: ``(legend_handles, labels)``.
    """
    handles = []
    for position in range(len(self.states)):
        patch = plt.Rectangle(
            (0, 0), 1, 1,
            color=self.color_map[position + 1],
            label=self.labels[position],
        )
        handles.append(patch)

    self.legend_handles = handles
    return self.legend_handles, self.labels
|
|
458
|
+
|
|
459
|
+
def to_dataframe(self) -> pd.DataFrame:
    """Return the processed sequence dataset as a DataFrame.

    The object returned is ``self.seqdata`` itself, not a copy.
    """
    return self.seqdata
|
|
462
|
+
|
|
463
|
+
def plot_legend(self, save_as=None, dpi=200):
    """Draw the state-color legend on its own small figure and show it.

    Legend patches cached on ``self.legend_handles`` are reused; if none
    exist yet they are built from ``self.color_map`` and ``self.labels``
    and cached.

    Parameters:
        save_as: if truthy, the figure is written there via ``plt.savefig``
            before being shown.
        dpi: resolution passed to ``plt.savefig``.
    """
    handles = getattr(self, "legend_handles", None)
    if not handles:
        # get_legend() has not run yet: build the patches here.
        handles = []
        for position in range(len(self.states)):
            handles.append(
                plt.Rectangle((0, 0), 1, 1, color=self.color_map[position + 1], label=self.labels[position])
            )
        self.legend_handles = handles

    _fig, axis = plt.subplots(figsize=(2, 2))
    axis.legend(handles=handles, loc='center', title="States", fontsize=10)
    axis.axis('off')

    # Saving skips tight_layout; the plain display path applies it.
    if save_as:
        plt.savefig(save_as, dpi=dpi)
    else:
        plt.tight_layout()
    plt.show()
|
|
484
|
+
|
|
485
|
+
# ------------------------------
|
|
486
|
+
# The following are for multidomain sequence analysis, especially for seqdomassoc()
|
|
487
|
+
|
|
488
|
+
@property
def n_sequences(self):
    """Number of sequences, i.e. the row count of ``self.seqdata``."""
    return self.seqdata.shape[0]
|
|
492
|
+
|
|
493
|
+
@property
def n_steps(self):
    """Sequence length, i.e. the column count of ``self.seqdata``."""
    return self.seqdata.shape[1]
|
|
497
|
+
|
|
498
|
+
@property
def alphabet(self):
    """State alphabet, as stored in the private ``_alphabet`` attribute."""
    return self._alphabet

@alphabet.setter
def alphabet(self, val):
    # Stored as given: no validation or copying is done here.
    self._alphabet = val
|
|
506
|
+
|
|
507
|
+
@property
def weights(self):
    """Sequence weights, as stored in the private ``_weights`` attribute."""
    return self._weights

@weights.setter
def weights(self, val):
    # Stored as given: no validation or copying is done here.
    self._weights = val
|
|
514
|
+
|
|
515
|
+
def flatten(self) -> np.ndarray:
    """Return all cells of ``self.seqdata`` as one 1D array, row after row.

    The result is a new array (``ndarray.flatten`` always copies).
    """
    grid = self.seqdata.values
    return grid.flatten()
|
|
518
|
+
|
|
519
|
+
def flatten_weights(self) -> np.ndarray:
    """Expand the per-sequence weights to one weight per cell.

    Each weight is repeated ``self.n_steps`` times in a row so that the
    result aligns with the row-wise output of ``flatten()``; for example
    5 sequences x 10 steps gives 50 values.
    """
    per_sequence = self.weights
    steps = self.n_steps
    return np.repeat(per_sequence, steps)
|
|
525
|
+
|
|
526
|
+
def to_numeric(self) -> np.ndarray:
    """Return ``self.seqdata`` as a 2D NumPy array with dtype ``int32``."""
    return self.seqdata.to_numpy(dtype=np.int32)
|
|
529
|
+
|
|
530
|
+
def get_xtabs(self, other: SequenceData, weighted=True) -> np.ndarray:
    """Cross-tabulate the states of two equally shaped sequence objects.

    Cell ``[i, j]`` accumulates every position where this object holds code
    ``i + 1`` and ``other`` holds code ``j + 1``. With ``weighted=True`` each
    position contributes the weight of its sequence in ``self``; otherwise
    it contributes 1.

    Returns:
        float64 array of shape ``(len(self.states), len(other.states))``.
        ``states`` is used rather than ``alphabet`` because the alphabet
        does not account for missing values.

    Raises:
        ValueError: if the two objects differ in number of sequences or
            number of steps.
    """
    same_shape = self.n_sequences == other.n_sequences and self.n_steps == other.n_steps
    if not same_shape:
        raise ValueError("Both SequenceData objects must have same shape.")

    # Codes start at 1 while array indices start at 0.
    row_idx = self.flatten() - 1
    col_idx = other.flatten() - 1

    table = np.zeros((len(self.states), len(other.states)), dtype=np.float64)

    # np.add.at accumulates correctly when an index pair occurs repeatedly.
    increments = self.flatten_weights() if weighted else 1
    np.add.at(table, (row_idx, col_idx), increments)
    return table
|
|
557
|
+
|
|
558
|
+
def uniqueness_stats(self, weighted: bool = False):
    """Compute uniqueness statistics of the sequences.

    Parameters:
        weighted: if True, also report statistics based on the total of
            ``self.weights``; if False (default, kept for backward
            compatibility), only count-based statistics are returned.

    Returns:
        dict with keys:
            - n_sequences: total number of sequences (unweighted count)
            - n_unique: number of distinct sequence patterns (rows)
            - uniqueness_rate: n_unique / n_sequences (NaN when there are
              no sequences)
            - weighted_total: sum of the weights (only if weighted=True)
            - weighted_uniqueness_rate: n_unique / weighted_total, NaN when
              the total is not positive (only if weighted=True)
    """
    A = self.to_numeric()  # shape (n, m)
    n, m = A.shape

    if n == 0:
        n_unique = 0
    elif m == 0:
        # Zero-length sequences are all the same (empty) pattern.
        n_unique = 1
    else:
        # Row-wise de-duplication.
        n_unique = int(np.unique(A, axis=0).shape[0])

    uniqueness_rate = float(n_unique) / float(n) if n > 0 else np.nan

    result = {
        "n_sequences": int(n),
        "n_unique": int(n_unique),
        "uniqueness_rate": uniqueness_rate
    }

    if weighted:
        weighted_total = float(np.sum(self.weights))
        weighted_uniqueness_rate = float(n_unique) / weighted_total if weighted_total > 0 else np.nan
        result["weighted_total"] = weighted_total
        result["weighted_uniqueness_rate"] = weighted_uniqueness_rate

    return result
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : 李欣怡
|
|
3
|
+
@File : __init__.py
|
|
4
|
+
@Time : 2025/2/26 23:19
|
|
5
|
+
@Desc :
|
|
6
|
+
"""
|
|
7
|
+
from .utils import get_sm_trate_substitution_cost_matrix, seqconc, seqdss, seqdur, seqlength
|
|
8
|
+
from .utils import get_LCP_length_for_2_seq
|
|
9
|
+
from .get_distance_matrix import get_distance_matrix
|
|
10
|
+
from .get_substitution_cost_matrix import get_substitution_cost_matrix
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _import_c_code():
|
|
14
|
+
"""Lazily import the c_code module to avoid circular dependencies during installation"""
|
|
15
|
+
try:
|
|
16
|
+
from sequenzo.dissimilarity_measures import c_code
|
|
17
|
+
return c_code
|
|
18
|
+
except ImportError:
|
|
19
|
+
# If the C extension cannot be imported, return None
|
|
20
|
+
print(
|
|
21
|
+
"Warning: The C++ extension (c_code) could not be imported. Please ensure the extension module is compiled correctly.")
|
|
22
|
+
return None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Names exported by a star-import of this package. Only these three are
# listed; the other helpers imported by this module remain importable by
# name but are not part of ``__all__``.
__all__ = [
    "get_distance_matrix",
    "get_substitution_cost_matrix",
    "get_LCP_length_for_2_seq"
    # Add other functions as needed
]
|
|
31
|
+
|
|
Binary file
|