sequenzo 0.1.24__cp311-cp311-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sequenzo might be problematic. Click here for more details.
- _sequenzo_fastcluster.cpython-311-darwin.so +0 -0
- sequenzo/__init__.py +240 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +474 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +178 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1256 -0
- sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
- sequenzo/clustering/src/KMedoid.cpp +263 -0
- sequenzo/clustering/src/PAM.cpp +237 -0
- sequenzo/clustering/src/PAMonce.cpp +265 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +20 -0
- sequenzo/data_preprocessing/helpers.py +256 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_family.csv +1867 -0
- sequenzo/datasets/polyadic_samplec1.csv +61 -0
- sequenzo/datasets/polyadic_samplep1.csv +61 -0
- sequenzo/datasets/polyadic_seqc1.csv +61 -0
- sequenzo/datasets/polyadic_seqp1.csv +61 -0
- sequenzo/define_sequence_data.py +609 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +34 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +431 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +89 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +43 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
- sequenzo/prefix_tree/system_level_indicators.py +465 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +48 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
- sequenzo/suffix_tree/system_level_indicators.py +456 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +194 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +404 -0
- sequenzo/visualization/plot_sequence_index.py +951 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +627 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.24.dist-info/METADATA +255 -0
- sequenzo-0.1.24.dist-info/RECORD +264 -0
- sequenzo-0.1.24.dist-info/WHEEL +5 -0
- sequenzo-0.1.24.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.24.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1256 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : Yuqi Liang 梁彧祺
|
|
3
|
+
@File : hierarchical_clustering.py
|
|
4
|
+
@Time : 18/12/2024 17:59
|
|
5
|
+
@Desc :
|
|
6
|
+
This module provides a flexible and user-friendly implementation of hierarchical clustering,
|
|
7
|
+
along with tools to evaluate cluster quality and analyze clustering results.
|
|
8
|
+
|
|
9
|
+
It supports common hierarchical clustering methods and evaluation metrics,
|
|
10
|
+
designed for social sequence analysis and other research applications.
|
|
11
|
+
|
|
12
|
+
This module leverages fastcluster, a tool specifically designed to enhance the efficiency of large-scale hierarchical clustering.
|
|
13
|
+
Unlike native Python tools such as SciPy, fastcluster optimizes linkage matrix computations,
|
|
14
|
+
enabling it to handle datasets with millions of entries more efficiently.
|
|
15
|
+
|
|
16
|
+
It has three main components:
|
|
17
|
+
1. Cluster Class: Performs hierarchical clustering on a precomputed distance matrix.
|
|
18
|
+
2. ClusterQuality Class: Evaluates the quality of clustering for different numbers of clusters using various metrics.
|
|
19
|
+
3. ClusterResults Class: Analyzes and visualizes the clustering results (e.g., membership tables and cluster distributions).
|
|
20
|
+
|
|
21
|
+
WEIGHTED CLUSTERING SUPPORT:
|
|
22
|
+
All classes now support weighted data analysis:
|
|
23
|
+
- Cluster: Hierarchical linkage is computed on the given distance matrix (unweighted). Optional weights are applied to evaluation and summaries
|
|
24
|
+
- ClusterQuality: Computes weighted versions of quality metrics (ASWw, HG, R2, HC)
|
|
25
|
+
- ClusterResults: Provides weighted cluster distribution statistics and visualizations
|
|
26
|
+
|
|
27
|
+
Weighted metrics account for sequence importance when calculating clustering quality,
|
|
28
|
+
making the analysis more representative when sequences have different sampling weights
|
|
29
|
+
or population sizes.
|
|
30
|
+
|
|
31
|
+
WARD METHOD VARIANTS:
|
|
32
|
+
The module supports two Ward linkage variants:
|
|
33
|
+
- 'ward_d' (Ward D): Classic Ward method using squared Euclidean distances ÷ 2
|
|
34
|
+
- 'ward_d2' (Ward D2): Ward method using squared Euclidean distances
|
|
35
|
+
For backward compatibility, 'ward' maps to 'ward_d'.
|
|
36
|
+
|
|
37
|
+
The difference affects clustering results and dendrogram heights:
|
|
38
|
+
- Ward D produces smaller distances in the linkage matrix
|
|
39
|
+
- Ward D2 produces distances equal to the increase in cluster variance
|
|
40
|
+
- Both methods produce identical cluster assignments, only distances differ
|
|
41
|
+
|
|
42
|
+
ROBUSTNESS AND VALIDATION FEATURES:
|
|
43
|
+
- Ward Method Validation: Automatic detection of non-Euclidean distance matrices
|
|
44
|
+
- One-time Warning System: Alerts users when Ward methods are used with potentially incompatible distances
|
|
45
|
+
- Robust Matrix Cleanup: Handles NaN/Inf values using 95th percentile replacement
|
|
46
|
+
- Distance Matrix Validation: Ensures zero diagonal and non-negativity
|
|
47
|
+
- Symmetry Handling: Automatically symmetrizes matrices when required by clustering algorithms
|
|
48
|
+
- Method Recommendations: Suggests alternative methods for sequence distances
|
|
49
|
+
|
|
50
|
+
For sequence distances (OM, LCS, etc.), Ward linkage methods may produce suboptimal results.
|
|
51
|
+
Consider using alternative methods like 'average' (UPGMA) for better theoretical validity.
|
|
52
|
+
|
|
53
|
+
Original code references:
|
|
54
|
+
Cluster(): Derived from `hclust`, a key function from fastcluster
|
|
55
|
+
R code: https://github.com/cran/fastcluster/blob/master/R/fastcluster.R
|
|
56
|
+
Python code: https://github.com/fastcluster/fastcluster/blob/master/src/fastcluster.cpp
|
|
57
|
+
The Python version of fastcluster does not support the Ward D method but only Ward D2, whereas R supports both.
|
|
58
|
+
Thus, we provide Ward D by ourselves here.
|
|
59
|
+
|
|
60
|
+
ClusterQuality(): Derived from ``, a key function from weightedcluster
|
|
61
|
+
CQI equivalence of R is here (two files):
|
|
62
|
+
https://github.com/cran/WeightedCluster/blob/master/src/clusterquality.cpp
|
|
63
|
+
https://github.com/cran/WeightedCluster/blob/master/src/clusterqualitybody.cpp
|
|
64
|
+
plot_cqi_scores(): `wcCmpCluster()` produces `clustrangefamily` object + `plot.clustrangefamily()` for plotting
|
|
65
|
+
"""
|
|
66
|
+
import matplotlib.pyplot as plt
|
|
67
|
+
import seaborn as sns
|
|
68
|
+
import warnings
|
|
69
|
+
from matplotlib.ticker import MaxNLocator
|
|
70
|
+
|
|
71
|
+
import pandas as pd
|
|
72
|
+
import numpy as np
|
|
73
|
+
from scipy.cluster.hierarchy import fcluster, dendrogram
|
|
74
|
+
from scipy.spatial.distance import squareform
|
|
75
|
+
# sklearn metrics no longer needed - using C++ implementation
|
|
76
|
+
# Import from sequenzo_fastcluster (our custom fastcluster with ward_d and ward_d2 support)
|
|
77
|
+
try:
|
|
78
|
+
from sequenzo.clustering.sequenzo_fastcluster.fastcluster import linkage
|
|
79
|
+
except ImportError:
|
|
80
|
+
# Fallback: try absolute import
|
|
81
|
+
try:
|
|
82
|
+
from sequenzo_fastcluster.fastcluster import linkage
|
|
83
|
+
except ImportError:
|
|
84
|
+
# Last resort: try relative import
|
|
85
|
+
from .sequenzo_fastcluster.fastcluster import linkage
|
|
86
|
+
|
|
87
|
+
# Import C++ cluster quality functions
|
|
88
|
+
try:
|
|
89
|
+
from . import clustering_c_code
|
|
90
|
+
_CPP_AVAILABLE = True
|
|
91
|
+
except ImportError:
|
|
92
|
+
_CPP_AVAILABLE = False
|
|
93
|
+
print("[!] Warning: C++ cluster quality functions not available. Using Python fallback.")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Corrected imports: Use relative imports *within* the package.
|
|
97
|
+
from sequenzo.visualization.utils import save_and_show_results
|
|
98
|
+
|
|
99
|
+
# Global flag to ensure Ward warning is only shown once per session
|
|
100
|
+
_WARD_WARNING_SHOWN = False
|
|
101
|
+
|
|
102
|
+
def _check_euclidean_compatibility(matrix, method):
|
|
103
|
+
"""
|
|
104
|
+
Check if a distance matrix is likely compatible with Euclidean-based methods like Ward.
|
|
105
|
+
|
|
106
|
+
This performs heuristic checks rather than exact validation since perfect validation
|
|
107
|
+
would be computationally expensive for large matrices.
|
|
108
|
+
|
|
109
|
+
Parameters:
|
|
110
|
+
-----------
|
|
111
|
+
matrix : np.ndarray
|
|
112
|
+
Distance matrix to check
|
|
113
|
+
method : str
|
|
114
|
+
Clustering method name
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
--------
|
|
118
|
+
bool
|
|
119
|
+
True if matrix appears Euclidean-compatible, False otherwise
|
|
120
|
+
"""
|
|
121
|
+
# Check for Ward methods (both Ward D and Ward D2 require Euclidean distances)
|
|
122
|
+
if method.lower() not in ["ward", "ward_d", "ward_d2"]:
|
|
123
|
+
return True # Other methods don't require Euclidean distances
|
|
124
|
+
|
|
125
|
+
# Basic checks for Euclidean properties
|
|
126
|
+
n = matrix.shape[0]
|
|
127
|
+
|
|
128
|
+
# Check 1: Triangle inequality violations (sample a subset for large matrices)
|
|
129
|
+
sample_size = min(50, n) # Sample up to 50 points for efficiency
|
|
130
|
+
if n > sample_size:
|
|
131
|
+
indices = np.random.choice(n, sample_size, replace=False)
|
|
132
|
+
sample_matrix = matrix[np.ix_(indices, indices)]
|
|
133
|
+
else:
|
|
134
|
+
sample_matrix = matrix
|
|
135
|
+
indices = np.arange(n)
|
|
136
|
+
|
|
137
|
+
sample_n = sample_matrix.shape[0]
|
|
138
|
+
violations = 0
|
|
139
|
+
total_checks = 0
|
|
140
|
+
|
|
141
|
+
# Check triangle inequality: d(i,k) <= d(i,j) + d(j,k)
|
|
142
|
+
for i in range(sample_n):
|
|
143
|
+
for j in range(i + 1, sample_n):
|
|
144
|
+
for k in range(j + 1, sample_n):
|
|
145
|
+
dij = sample_matrix[i, j]
|
|
146
|
+
dik = sample_matrix[i, k]
|
|
147
|
+
djk = sample_matrix[j, k]
|
|
148
|
+
|
|
149
|
+
# Check all three triangle inequalities
|
|
150
|
+
if (dik > dij + djk + 1e-10 or
|
|
151
|
+
dij > dik + djk + 1e-10 or
|
|
152
|
+
djk > dij + dik + 1e-10):
|
|
153
|
+
violations += 1
|
|
154
|
+
total_checks += 1
|
|
155
|
+
|
|
156
|
+
if total_checks > 0:
|
|
157
|
+
violation_rate = violations / total_checks
|
|
158
|
+
if violation_rate > 0.1: # More than 10% violations suggests non-Euclidean
|
|
159
|
+
return False
|
|
160
|
+
|
|
161
|
+
# Check 2: Negative eigenvalues in distance matrix (indicates non-Euclidean)
|
|
162
|
+
# Use double centering to convert distances to inner products
|
|
163
|
+
try:
|
|
164
|
+
# For efficiency, only check this for smaller matrices
|
|
165
|
+
if sample_n <= 100:
|
|
166
|
+
H = np.eye(sample_n) - np.ones((sample_n, sample_n)) / sample_n
|
|
167
|
+
B = -0.5 * H @ (sample_matrix ** 2) @ H
|
|
168
|
+
eigenvals = np.linalg.eigvals(B)
|
|
169
|
+
|
|
170
|
+
# Check if there are significant negative eigenvalues
|
|
171
|
+
negative_eigenvals = eigenvals[eigenvals < -1e-10]
|
|
172
|
+
if len(negative_eigenvals) > 0:
|
|
173
|
+
neg_energy = -np.sum(negative_eigenvals)
|
|
174
|
+
total_energy = np.sum(np.abs(eigenvals))
|
|
175
|
+
if neg_energy / total_energy > 0.1: # > 10% negative energy
|
|
176
|
+
return False
|
|
177
|
+
except np.linalg.LinAlgError:
|
|
178
|
+
# If eigenvalue computation fails, assume potentially problematic
|
|
179
|
+
pass
|
|
180
|
+
|
|
181
|
+
return True
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _warn_ward_usage_once(matrix, method):
    """
    Issue a one-time warning about using Ward with potentially non-Euclidean distances.
    """
    global _WARD_WARNING_SHOWN

    # Guard clauses: do nothing once the warning has fired, for non-Ward
    # methods, or when the matrix passes the Euclidean-compatibility heuristics.
    if _WARD_WARNING_SHOWN:
        return
    if method.lower() not in ("ward", "ward_d", "ward_d2"):
        return
    if _check_euclidean_compatibility(matrix, method):
        return

    warnings.warn(
        "\n[!] Ward linkage method detected with potentially non-Euclidean distance matrix!\n"
        " Ward clustering (both Ward D and Ward D2) assumes Euclidean distances for theoretical validity.\n"
        " \n"
        " Ward method variants:\n"
        " - 'ward_d' (classic): Uses squared Euclidean distances ÷ 2\n"
        " - 'ward_d2': Uses squared Euclidean distances\n"
        " \n"
        " For sequence distances (OM, LCS, etc.), consider using:\n"
        " - method='average' (UPGMA)\n"
        " - method='complete' (complete linkage)\n"
        " - method='single' (single linkage)\n"
        " \n"
        " Note: 'centroid' and 'median' methods may also produce inversions\n"
        " (non-monotonic dendrograms) with non-Euclidean distances.\n"
        " \n"
        " This warning is shown only once per session.",
        UserWarning,
        stacklevel=3
    )
    # Latch the module-level flag so the warning fires at most once per session.
    _WARD_WARNING_SHOWN = True
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _clean_distance_matrix(matrix):
|
|
217
|
+
"""
|
|
218
|
+
Clean and validate a distance matrix for hierarchical clustering.
|
|
219
|
+
|
|
220
|
+
This function:
|
|
221
|
+
1. Handles NaN/Inf values using robust percentile-based replacement
|
|
222
|
+
2. Sets diagonal to zero
|
|
223
|
+
3. Ensures non-negativity
|
|
224
|
+
|
|
225
|
+
Note: Symmetry is NOT enforced at this stage since distance matrices may legitimately
|
|
226
|
+
be asymmetric (e.g., directed sequence distances, time-dependent measures, etc.).
|
|
227
|
+
However, symmetrization will be performed later in linkage computation when required
|
|
228
|
+
by clustering algorithms.
|
|
229
|
+
|
|
230
|
+
Parameters:
|
|
231
|
+
-----------
|
|
232
|
+
matrix : np.ndarray
|
|
233
|
+
Input distance matrix
|
|
234
|
+
|
|
235
|
+
Returns:
|
|
236
|
+
--------
|
|
237
|
+
np.ndarray
|
|
238
|
+
Cleaned distance matrix
|
|
239
|
+
"""
|
|
240
|
+
matrix = matrix.copy() # Don't modify the original
|
|
241
|
+
|
|
242
|
+
# Step 1: Handle NaN/Inf values with percentile-based replacement
|
|
243
|
+
if np.any(np.isnan(matrix)) or np.any(np.isinf(matrix)):
|
|
244
|
+
print("[!] Warning: Distance matrix contains NaN or Inf values.")
|
|
245
|
+
|
|
246
|
+
# Get finite values for percentile calculation
|
|
247
|
+
finite_vals = matrix[np.isfinite(matrix)]
|
|
248
|
+
|
|
249
|
+
if len(finite_vals) > 0:
|
|
250
|
+
# Use 95th percentile as replacement value (more conservative than max)
|
|
251
|
+
replacement_val = np.percentile(finite_vals, 95)
|
|
252
|
+
print(f" Replacing with 95th percentile value: {replacement_val:.6f}")
|
|
253
|
+
else:
|
|
254
|
+
# If no finite values, use 1.0 as default
|
|
255
|
+
replacement_val = 1.0
|
|
256
|
+
print(f" No finite values found, using default: {replacement_val}")
|
|
257
|
+
|
|
258
|
+
matrix[~np.isfinite(matrix)] = replacement_val
|
|
259
|
+
|
|
260
|
+
# Step 2: Force diagonal to be exactly zero (self-distance should be zero)
|
|
261
|
+
np.fill_diagonal(matrix, 0.0)
|
|
262
|
+
|
|
263
|
+
# Step 3: Ensure non-negativity (distance matrices should be non-negative)
|
|
264
|
+
if np.any(matrix < 0):
|
|
265
|
+
print("[!] Warning: Distance matrix contains negative values. Clipping to zero...")
|
|
266
|
+
matrix = np.maximum(matrix, 0.0)
|
|
267
|
+
|
|
268
|
+
return matrix
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _hclust_to_linkage_matrix(linkage_matrix):
|
|
272
|
+
"""
|
|
273
|
+
Convert an R `hclust` object to a SciPy-compatible linkage matrix.
|
|
274
|
+
|
|
275
|
+
This function takes an `hclust` object returned by R (e.g., from
|
|
276
|
+
`fastcluster::hclust`) and converts it into the standard linkage matrix
|
|
277
|
+
format used by SciPy (`scipy.cluster.hierarchy.linkage`), which can be
|
|
278
|
+
used for dendrogram plotting or further clustering analysis in Python.
|
|
279
|
+
|
|
280
|
+
Parameters
|
|
281
|
+
----------
|
|
282
|
+
linkage_matrix : rpy2.robjects.ListVector
|
|
283
|
+
An R `hclust` object. Expected to contain at least the following fields:
|
|
284
|
+
- 'merge': ndarray of shape (n-1, 2), indicating which clusters are merged
|
|
285
|
+
at each step (negative indices for original observations,
|
|
286
|
+
positive indices for previously merged clusters).
|
|
287
|
+
- 'height': ndarray of shape (n-1,), distances at which merges occur.
|
|
288
|
+
- 'order': ordering of the leaves.
|
|
289
|
+
|
|
290
|
+
Returns
|
|
291
|
+
-------
|
|
292
|
+
Z : numpy.ndarray, shape (n-1, 4), dtype=float
|
|
293
|
+
A SciPy-compatible linkage matrix where each row represents a merge:
|
|
294
|
+
- Z[i, 0] : index of the first cluster (0-based)
|
|
295
|
+
- Z[i, 1] : index of the second cluster (0-based)
|
|
296
|
+
- Z[i, 2] : distance between the merged clusters
|
|
297
|
+
- Z[i, 3] : total number of original samples in the newly formed cluster
|
|
298
|
+
|
|
299
|
+
Notes
|
|
300
|
+
-----
|
|
301
|
+
- The conversion handles the difference in indexing:
|
|
302
|
+
- In R's `hclust`, negative numbers in 'merge' indicate original samples
|
|
303
|
+
and positive numbers indicate previously merged clusters (1-based).
|
|
304
|
+
- In the returned SciPy linkage matrix, all indices are converted to 0-based.
|
|
305
|
+
- The function iteratively tracks cluster sizes to populate the fourth column
|
|
306
|
+
(sample counts) required by SciPy.
|
|
307
|
+
"""
|
|
308
|
+
|
|
309
|
+
n = len(linkage_matrix.rx2("order")) # 样本数
|
|
310
|
+
merge = np.array(linkage_matrix.rx2("merge"), dtype=int) # (n-1, 2)
|
|
311
|
+
height = np.array(linkage_matrix.rx2("height"), dtype=float)
|
|
312
|
+
|
|
313
|
+
cluster_sizes = np.ones(n, dtype=int) # 单个样本初始大小 = 1
|
|
314
|
+
Z = np.zeros((n - 1, 4), dtype=float)
|
|
315
|
+
|
|
316
|
+
for i in range(n - 1):
|
|
317
|
+
a, b = merge[i]
|
|
318
|
+
|
|
319
|
+
# R hclust 编号负数表示原始样本
|
|
320
|
+
if a < 0:
|
|
321
|
+
idx1 = -a - 1 # 转成 0-based
|
|
322
|
+
size1 = 1
|
|
323
|
+
else:
|
|
324
|
+
idx1 = n + a - 1 # 已合并簇,0-based
|
|
325
|
+
size1 = cluster_sizes[idx1]
|
|
326
|
+
|
|
327
|
+
if b < 0:
|
|
328
|
+
idx2 = -b - 1
|
|
329
|
+
size2 = 1
|
|
330
|
+
else:
|
|
331
|
+
idx2 = n + b - 1
|
|
332
|
+
size2 = cluster_sizes[idx2]
|
|
333
|
+
|
|
334
|
+
Z[i, 0] = idx1
|
|
335
|
+
Z[i, 1] = idx2
|
|
336
|
+
Z[i, 2] = height[i]
|
|
337
|
+
Z[i, 3] = size1 + size2
|
|
338
|
+
|
|
339
|
+
# 更新 cluster_sizes,用于后续簇
|
|
340
|
+
cluster_sizes = np.append(cluster_sizes, size1 + size2)
|
|
341
|
+
|
|
342
|
+
return Z
|
|
343
|
+
|
|
344
|
+
class Cluster:
    """
    Hierarchical clustering of entities from a precomputed distance matrix.

    Cleans and validates the matrix, computes a SciPy-style linkage matrix via
    the bundled fastcluster backend, and exposes dendrogram plotting and flat
    cluster-label extraction.
    """

    def __init__(self,
                 matrix,
                 entity_ids,
                 clustering_method="ward",
                 weights=None):
        """
        A class to handle hierarchical clustering operations using fastcluster for improved performance.

        :param matrix: Precomputed distance matrix (full square form).
        :param entity_ids: List of IDs corresponding to the entities in the matrix.
        :param clustering_method: Clustering algorithm to use. Options include:
                                  - "ward" or "ward_d": Classic Ward method (squared Euclidean distances ÷ 2) [default]
                                  - "ward_d2": Ward method with squared Euclidean distances
                                  - "single": Single linkage (minimum method)
                                  - "complete": Complete linkage (maximum method)
                                  - "average": Average linkage (UPGMA)
                                  - "centroid": Centroid linkage
                                  - "median": Median linkage
        :param weights: Optional array of weights for each entity (default: None for equal weights).
        :raises ValueError: On mismatched lengths, duplicate IDs, invalid
                            weights, non-square matrix, or unknown method.
        """
        # Ensure entity_ids is a numpy array for consistent processing
        self.entity_ids = np.array(entity_ids)

        # One ID per matrix row
        if len(self.entity_ids) != len(matrix):
            raise ValueError("Length of entity_ids must match the size of the matrix.")

        # IDs must be unique so cluster labels can be mapped back unambiguously
        if len(np.unique(self.entity_ids)) != len(self.entity_ids):
            raise ValueError("entity_ids must contain unique values.")

        # Initialize and validate weights
        if weights is not None:
            self.weights = np.array(weights, dtype=np.float64)
            if len(self.weights) != len(matrix):
                raise ValueError("Length of weights must match the size of the matrix.")
            if np.any(self.weights < 0):
                raise ValueError("All weights must be non-negative.")
            if np.sum(self.weights) == 0:
                raise ValueError("Sum of weights must be greater than zero.")
        else:
            # Default to equal weights (all ones)
            self.weights = np.ones(len(matrix), dtype=np.float64)

        # Convert matrix to numpy array if it's a DataFrame
        if isinstance(matrix, pd.DataFrame):
            print("[>] Converting DataFrame to NumPy array...")
            self.full_matrix = matrix.values
        else:
            self.full_matrix = matrix

        # Verify matrix is in square form
        if len(self.full_matrix.shape) != 2 or self.full_matrix.shape[0] != self.full_matrix.shape[1]:
            raise ValueError("Input must be a full square-form distance matrix.")

        self.clustering_method = clustering_method.lower()

        # Supported clustering methods
        supported_methods = ["ward", "ward_d", "ward_d2", "single", "complete", "average", "centroid", "median"]
        if self.clustering_method not in supported_methods:
            raise ValueError(
                f"Unsupported clustering method '{clustering_method}'. Supported methods: {supported_methods}")

        # Handle backward compatibility: 'ward' maps to 'ward_d' (classic Ward method)
        if self.clustering_method == "ward":
            self.clustering_method = "ward_d"
            print("[>] Note: 'ward' method maps to 'ward_d' (classic Ward method).")
            print(" Use 'ward_d2' for Ward method with squared Euclidean distances.")

        # Compute linkage matrix using fastcluster
        self.linkage_matrix = self._compute_linkage()

    def _compute_linkage(self):
        """
        Compute the linkage matrix using fastcluster for improved performance.
        Supports both Ward D (classic) and Ward D2 methods.
        """
        # Clean and validate the distance matrix using robust methods
        self.full_matrix = _clean_distance_matrix(self.full_matrix)

        # Check Ward compatibility and issue one-time warning if needed
        _warn_ward_usage_once(self.full_matrix, self.clustering_method)

        # Check symmetry before converting to condensed form;
        # squareform() requires symmetric matrices
        if not np.allclose(self.full_matrix, self.full_matrix.T, rtol=1e-5, atol=1e-8):
            print("[!] Warning: Distance matrix is not symmetric.")
            print(" Hierarchical clustering algorithms require symmetric distance matrices.")
            print(" Automatically symmetrizing using (matrix + matrix.T) / 2")
            print(" If this is not appropriate for your data, please provide a symmetric matrix.")
            self.full_matrix = (self.full_matrix + self.full_matrix.T) / 2

        # Convert square matrix to condensed form
        self.condensed_matrix = squareform(self.full_matrix)

        # Map our method names to fastcluster's expected method names
        fastcluster_method = self._map_method_name(self.clustering_method)

        # NOTE(review): unlike ClusterQuality._compute_linkage_for_direct_input,
        # no _apply_ward_d_correction is applied here for 'ward_d' — presumably
        # the bundled fastcluster backend now handles that itself (see the
        # Nov.15 note in _map_method_name); confirm against the backend.
        linkage_matrix = linkage(self.condensed_matrix, method=fastcluster_method)

        return linkage_matrix

    def _map_method_name(self, method):
        """
        Map our internal method names to fastcluster's expected method names.
        Unknown names pass through unchanged.
        """
        method_mapping = {
            "ward_d": "ward",  # Classic Ward (will be corrected later) (updated: it was solved on Nov.15, 2025 by Xinyi)
            "ward_d2": "ward_d2",  # Ward D2 (no correction needed)
            "single": "single",
            "complete": "complete",
            "average": "average",
            "centroid": "centroid",
            "median": "median"
        }
        return method_mapping.get(method, method)

    def _apply_ward_d_correction(self, linkage_matrix):
        """
        Apply Ward D correction by dividing distances by 2.
        This converts Ward D2 results to classic Ward D results.

        :param linkage_matrix: SciPy-style linkage matrix; not modified in place.
        :return: Corrected copy of the linkage matrix.
        """
        linkage_corrected = linkage_matrix.copy()
        linkage_corrected[:, 2] = linkage_corrected[:, 2] / 2.0
        print("[>] Applied Ward D correction: distances divided by 2 for classic Ward method.")
        return linkage_corrected

    def plot_dendrogram(self,
                        save_as=None,
                        style="whitegrid",
                        title="Dendrogram",
                        xlabel="Entities",
                        ylabel="Distance",
                        grid=False,
                        dpi=200,
                        figsize=(12, 8)):
        """
        Plot a dendrogram of the hierarchical clustering with optional high-resolution output.

        :param save_as: File path to save the plot. If None, the plot will be shown.
        :param style: Seaborn style for the plot.
        :param title: Title of the plot.
        :param xlabel: X-axis label.
        :param ylabel: Y-axis label.
        :param grid: Whether to display grid lines.
        :param dpi: Dots per inch for the saved image (default: 200).
        :param figsize: Tuple specifying the figure size in inches (default: (12, 8)).
        :raises ValueError: If the linkage matrix has not been computed.
        """
        if self.linkage_matrix is None:
            raise ValueError("Linkage matrix is not computed.")

        sns.set(style=style)
        plt.figure(figsize=figsize)
        dendrogram(self.linkage_matrix, labels=None)  # Do not plot labels for large datasets
        plt.xticks([])
        plt.title(title, fontsize=14, fontweight="bold")
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        if not grid:
            plt.grid(False)

        # Bug fix: honor the caller-supplied dpi instead of a hardcoded 200.
        save_and_show_results(save_as, dpi=dpi)

    def get_cluster_labels(self, num_clusters):
        """
        Get cluster labels for a specified number of clusters.

        There is a common point of confusion because k is typically used to
        represent the number of clusters in clustering algorithms (e.g., k-means);
        SciPy's hierarchical clustering API specifically uses t as the parameter name.

        :param num_clusters: The number of clusters to create.
        :return: Array of cluster labels corresponding to entity_ids.
        :raises ValueError: If the linkage matrix has not been computed.
        """
        if self.linkage_matrix is None:
            raise ValueError("Linkage matrix is not computed.")

        cluster_labels = fcluster(self.linkage_matrix, t=num_clusters, criterion="maxclust")

        return cluster_labels
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
class ClusterQuality:
|
|
529
|
+
def __init__(self, matrix_or_cluster, max_clusters=20, clustering_method=None, weights=None):
|
|
530
|
+
"""
|
|
531
|
+
Initialize the ClusterQuality class for precomputed distance matrices or a Cluster instance.
|
|
532
|
+
|
|
533
|
+
Allow the ClusterQuality class to directly accept a Cluster instance
|
|
534
|
+
and internally extract the relevant matrix (cluster.full_matrix)
|
|
535
|
+
and clustering method (cluster.clustering_method).
|
|
536
|
+
|
|
537
|
+
This keeps the user interface clean and simple while handling the logic under the hood.
|
|
538
|
+
|
|
539
|
+
:param matrix_or_cluster: The precomputed distance matrix (full square form or condensed form)
|
|
540
|
+
or an instance of the Cluster class.
|
|
541
|
+
:param max_clusters: Maximum number of clusters to evaluate (default: 20).
|
|
542
|
+
:param clustering_method: Clustering algorithm to use. If None, inherit from Cluster instance.
|
|
543
|
+
:param weights: Optional array of weights for each entity. If None and using Cluster instance,
|
|
544
|
+
weights will be extracted from the Cluster object.
|
|
545
|
+
"""
|
|
546
|
+
if isinstance(matrix_or_cluster, Cluster):
|
|
547
|
+
# Extract matrix, clustering method, and weights from the Cluster instance
|
|
548
|
+
self.matrix = matrix_or_cluster.full_matrix
|
|
549
|
+
self.clustering_method = matrix_or_cluster.clustering_method
|
|
550
|
+
self.linkage_matrix = matrix_or_cluster.linkage_matrix
|
|
551
|
+
self.weights = matrix_or_cluster.weights
|
|
552
|
+
|
|
553
|
+
elif isinstance(matrix_or_cluster, (np.ndarray, pd.DataFrame)):
|
|
554
|
+
# Handle direct matrix input
|
|
555
|
+
if isinstance(matrix_or_cluster, pd.DataFrame):
|
|
556
|
+
print("[>] Detected Pandas DataFrame. Converting to NumPy array...")
|
|
557
|
+
matrix_or_cluster = matrix_or_cluster.values
|
|
558
|
+
self.matrix = matrix_or_cluster
|
|
559
|
+
self.clustering_method = clustering_method or "ward_d" # Default to classic Ward
|
|
560
|
+
|
|
561
|
+
# Initialize weights for direct matrix input
|
|
562
|
+
if weights is not None:
|
|
563
|
+
self.weights = np.array(weights, dtype=np.float64)
|
|
564
|
+
if len(self.weights) != len(self.matrix):
|
|
565
|
+
raise ValueError("Length of weights must match the size of the matrix.")
|
|
566
|
+
else:
|
|
567
|
+
self.weights = np.ones(len(self.matrix), dtype=np.float64)
|
|
568
|
+
|
|
569
|
+
# Compute linkage matrix for direct input (needed for clustering operations)
|
|
570
|
+
self.linkage_matrix = self._compute_linkage_for_direct_input()
|
|
571
|
+
|
|
572
|
+
else:
|
|
573
|
+
raise ValueError(
|
|
574
|
+
"Input must be a Cluster instance, a NumPy array, or a Pandas DataFrame."
|
|
575
|
+
)
|
|
576
|
+
|
|
577
|
+
if self.matrix.shape[0] != self.matrix.shape[1]:
|
|
578
|
+
raise ValueError("Matrix must be a full square-form distance matrix.")
|
|
579
|
+
|
|
580
|
+
self.max_clusters = max_clusters
|
|
581
|
+
self.metric_order = [
|
|
582
|
+
"PBC",
|
|
583
|
+
"HG",
|
|
584
|
+
"HGSD",
|
|
585
|
+
"ASW",
|
|
586
|
+
"ASWw",
|
|
587
|
+
"CH",
|
|
588
|
+
"R2",
|
|
589
|
+
"CHsq",
|
|
590
|
+
"R2sq",
|
|
591
|
+
"HC",
|
|
592
|
+
]
|
|
593
|
+
self.scores = {metric: [] for metric in self.metric_order}
|
|
594
|
+
|
|
595
|
+
# Store original scores separately to preserve raw values
|
|
596
|
+
self.original_scores = None
|
|
597
|
+
|
|
598
|
+
def _compute_linkage_for_direct_input(self):
|
|
599
|
+
"""
|
|
600
|
+
Compute linkage matrix for direct matrix input (similar to Cluster class logic).
|
|
601
|
+
Supports both Ward D and Ward D2 methods.
|
|
602
|
+
"""
|
|
603
|
+
# Handle backward compatibility: 'ward' maps to 'ward_d'
|
|
604
|
+
if self.clustering_method == "ward":
|
|
605
|
+
self.clustering_method = "ward_d"
|
|
606
|
+
print("[>] Note: 'ward' method maps to 'ward_d' (classic Ward method).")
|
|
607
|
+
print(" Use 'ward_d2' for Ward method with squared Euclidean distances.")
|
|
608
|
+
|
|
609
|
+
# Clean and validate the distance matrix using robust methods
|
|
610
|
+
self.matrix = _clean_distance_matrix(self.matrix)
|
|
611
|
+
|
|
612
|
+
# Check Ward compatibility and issue one-time warning if needed
|
|
613
|
+
_warn_ward_usage_once(self.matrix, self.clustering_method)
|
|
614
|
+
|
|
615
|
+
# Check symmetry before converting to condensed form
|
|
616
|
+
# squareform() requires symmetric matrices
|
|
617
|
+
if not np.allclose(self.matrix, self.matrix.T, rtol=1e-5, atol=1e-8):
|
|
618
|
+
print("[!] Warning: Distance matrix is not symmetric.")
|
|
619
|
+
print(" Hierarchical clustering algorithms require symmetric distance matrices.")
|
|
620
|
+
print(" Automatically symmetrizing using (matrix + matrix.T) / 2")
|
|
621
|
+
print(" If this is not appropriate for your data, please provide a symmetric matrix.")
|
|
622
|
+
self.matrix = (self.matrix + self.matrix.T) / 2
|
|
623
|
+
|
|
624
|
+
# Convert square matrix to condensed form for linkage computation
|
|
625
|
+
condensed_matrix = squareform(self.matrix)
|
|
626
|
+
|
|
627
|
+
try:
|
|
628
|
+
# Map our method names to fastcluster's expected method names
|
|
629
|
+
fastcluster_method = self._map_method_name(self.clustering_method)
|
|
630
|
+
linkage_matrix = linkage(condensed_matrix, method=fastcluster_method)
|
|
631
|
+
|
|
632
|
+
# Apply Ward D correction if needed
|
|
633
|
+
if self.clustering_method == "ward_d":
|
|
634
|
+
linkage_matrix = self._apply_ward_d_correction(linkage_matrix)
|
|
635
|
+
|
|
636
|
+
except Exception as e:
|
|
637
|
+
raise RuntimeError(
|
|
638
|
+
f"Failed to compute linkage with method '{self.clustering_method}'. "
|
|
639
|
+
"Check that the distance matrix is square, symmetric, finite, non-negative, and has a zero diagonal. "
|
|
640
|
+
"For sequence distances, consider using 'average', 'complete', or 'single' instead of Ward methods. "
|
|
641
|
+
f"Original error: {e}"
|
|
642
|
+
)
|
|
643
|
+
return linkage_matrix
|
|
644
|
+
|
|
645
|
+
def _map_method_name(self, method):
|
|
646
|
+
"""
|
|
647
|
+
Map our internal method names to fastcluster's expected method names.
|
|
648
|
+
"""
|
|
649
|
+
method_mapping = {
|
|
650
|
+
"ward_d": "ward", # Classic Ward (will be corrected later)
|
|
651
|
+
"ward_d2": "ward", # Ward D2 (no correction needed)
|
|
652
|
+
"single": "single",
|
|
653
|
+
"complete": "complete",
|
|
654
|
+
"average": "average",
|
|
655
|
+
"centroid": "centroid",
|
|
656
|
+
"median": "median"
|
|
657
|
+
}
|
|
658
|
+
return method_mapping.get(method, method)
|
|
659
|
+
|
|
660
|
+
def _apply_ward_d_correction(self, linkage_matrix):
|
|
661
|
+
"""
|
|
662
|
+
Apply Ward D correction by dividing distances by 2.
|
|
663
|
+
This converts Ward D2 results to classic Ward D results.
|
|
664
|
+
"""
|
|
665
|
+
linkage_corrected = linkage_matrix.copy()
|
|
666
|
+
linkage_corrected[:, 2] = linkage_corrected[:, 2] / 2.0
|
|
667
|
+
print("[>] Applied Ward D correction: distances divided by 2 for classic Ward method.")
|
|
668
|
+
return linkage_corrected
|
|
669
|
+
|
|
670
|
+
def compute_cluster_quality_scores(self):
|
|
671
|
+
"""
|
|
672
|
+
Compute clustering quality scores for different numbers of clusters.
|
|
673
|
+
|
|
674
|
+
Uses C++ implementation for accuracy and performance.
|
|
675
|
+
This implementation aligns with R WeightedCluster package results.
|
|
676
|
+
"""
|
|
677
|
+
if not _CPP_AVAILABLE:
|
|
678
|
+
raise RuntimeError(
|
|
679
|
+
"C++ cluster quality implementation is not available. "
|
|
680
|
+
"Please ensure the C++ extensions are properly compiled."
|
|
681
|
+
)
|
|
682
|
+
self._compute_cluster_quality_scores_cpp()
|
|
683
|
+
|
|
684
|
+
# Save original scores immediately after computation
|
|
685
|
+
self.original_scores = {}
|
|
686
|
+
for metric, values in self.scores.items():
|
|
687
|
+
self.original_scores[metric] = np.array(values).copy()
|
|
688
|
+
|
|
689
|
+
    def _compute_cluster_quality_scores_cpp(self):
        """
        Compute clustering quality scores using C++ implementation (matches R WeightedCluster).

        For each k in [2, max_clusters], cuts the precomputed linkage into k
        flat clusters and appends one value per metric in self.metric_order to
        self.scores. Any C++ failure aborts the whole run with RuntimeError.
        """
        # Convert matrix to format expected by C++:
        # ensure we have a full square matrix first.
        if self.matrix.shape[0] != self.matrix.shape[1]:
            raise ValueError("Matrix must be square for C++ implementation")

        # Convert to condensed once to reduce per-call overhead in C++.
        condensed = squareform(self.matrix)

        for k in range(2, self.max_clusters + 1):
            # Get cluster labels (fcluster returns 1-based labels, which C++ expects).
            labels = fcluster(self.linkage_matrix, k, criterion="maxclust")

            try:
                # Call C++ function (condensed form) - expects 1-based labels.
                # NOTE(review): result appears to be a dict-like keyed by metric
                # name (it supports .get below) — confirm against the module.cpp
                # binding.
                result = clustering_c_code.cluster_quality_condensed(
                    condensed.astype(np.float64, copy=False),
                    labels.astype(np.int32, copy=False),
                    self.weights.astype(np.float64, copy=False),
                    self.matrix.shape[0],
                    k
                )

                # Extract results from C++ (mapping to match R WeightedCluster exactly);
                # missing metrics become NaN rather than raising.
                for metric in self.metric_order:
                    self.scores[metric].append(result.get(metric, np.nan))

            except Exception as e:
                print(f"[!] Error: C++ computation failed for k={k}: {e}")
                print(" Python fallback has been removed due to accuracy issues.")
                # Insert NaN values for failed computation.
                # NOTE(review): these NaNs are appended immediately before the
                # raise below, so self.scores is left partially filled for this
                # k while smaller k values keep their results — confirm whether
                # callers rely on that partial state.
                for metric in self.metric_order:
                    self.scores[metric].append(np.nan)
                raise RuntimeError(f"C++ cluster quality computation failed for k={k}. "
                                   "Python fallback is not available.")
|
|
727
|
+
|
|
728
|
+
def _compute_cluster_quality_scores_python(self):
|
|
729
|
+
"""
|
|
730
|
+
Python fallback implementation has been removed.
|
|
731
|
+
Only C++ implementation is available for accuracy and performance.
|
|
732
|
+
"""
|
|
733
|
+
raise NotImplementedError(
|
|
734
|
+
"Python cluster quality implementation has been removed due to accuracy issues. "
|
|
735
|
+
"Please use C++ implementation by setting use_cpp=True (default)."
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
def _normalize_scores(self, method="zscore") -> None:
|
|
739
|
+
"""
|
|
740
|
+
Normalize each metric independently.
|
|
741
|
+
|
|
742
|
+
:param method: Normalization method. Options are "zscore" or "range".
|
|
743
|
+
"""
|
|
744
|
+
for metric in self.scores:
|
|
745
|
+
values = np.array(self.scores[metric])
|
|
746
|
+
if method == "zscore":
|
|
747
|
+
mean_val = np.nanmean(values)
|
|
748
|
+
std_val = np.nanstd(values)
|
|
749
|
+
if std_val > 0:
|
|
750
|
+
self.scores[metric] = (values - mean_val) / std_val
|
|
751
|
+
elif method == "range":
|
|
752
|
+
min_val = np.nanmin(values)
|
|
753
|
+
max_val = np.nanmax(values)
|
|
754
|
+
if max_val > min_val:
|
|
755
|
+
self.scores[metric] = (values - min_val) / (max_val - min_val)
|
|
756
|
+
|
|
757
|
+
def get_cluster_range_table(self) -> pd.DataFrame:
|
|
758
|
+
"""
|
|
759
|
+
Return a metrics-by-cluster table mirroring R's `as.clustrange()` output.
|
|
760
|
+
|
|
761
|
+
:return: DataFrame indexed by cluster count ("cluster2", ...)
|
|
762
|
+
with raw metric values for each quality indicator.
|
|
763
|
+
"""
|
|
764
|
+
# Prefer preserved raw scores to avoid normalization side-effects
|
|
765
|
+
if self.original_scores is not None:
|
|
766
|
+
scores_to_use = self.original_scores
|
|
767
|
+
else:
|
|
768
|
+
scores_to_use = self.scores
|
|
769
|
+
|
|
770
|
+
# Ensure metrics are available
|
|
771
|
+
if not scores_to_use or not any(len(scores_to_use[m]) for m in self.metric_order):
|
|
772
|
+
raise ValueError("Cluster quality scores are empty. Run `compute_cluster_quality_scores()` first.")
|
|
773
|
+
|
|
774
|
+
# Determine number of evaluated cluster counts
|
|
775
|
+
lengths = [len(scores_to_use[metric]) for metric in self.metric_order if metric in scores_to_use]
|
|
776
|
+
if not lengths:
|
|
777
|
+
raise ValueError("No recognized metrics found in scores.")
|
|
778
|
+
|
|
779
|
+
if len(set(lengths)) != 1:
|
|
780
|
+
raise ValueError("Inconsistent metric lengths detected. Please recompute cluster quality scores.")
|
|
781
|
+
|
|
782
|
+
n_rows = lengths[0]
|
|
783
|
+
if n_rows == 0:
|
|
784
|
+
raise ValueError("Cluster quality scores contain no entries.")
|
|
785
|
+
|
|
786
|
+
# Build DataFrame matching R output ordering
|
|
787
|
+
data = {}
|
|
788
|
+
for metric in self.metric_order:
|
|
789
|
+
values = scores_to_use.get(metric)
|
|
790
|
+
if values is None:
|
|
791
|
+
continue
|
|
792
|
+
data[metric] = np.array(values, dtype=np.float64)
|
|
793
|
+
|
|
794
|
+
index_labels = [f"cluster{k}" for k in range(2, 2 + n_rows)]
|
|
795
|
+
table = pd.DataFrame(data, index=index_labels)
|
|
796
|
+
table.index.name = "Cluster"
|
|
797
|
+
|
|
798
|
+
return table
|
|
799
|
+
|
|
800
|
+
def get_cqi_table(self):
|
|
801
|
+
"""
|
|
802
|
+
Generate a summary table of clustering quality indicators with concise column names.
|
|
803
|
+
|
|
804
|
+
:return: Pandas DataFrame summarizing the optimal number of clusters (N groups),
|
|
805
|
+
the corresponding raw metric values, and z-score normalized values.
|
|
806
|
+
"""
|
|
807
|
+
# Use original scores if available, otherwise fall back to current scores
|
|
808
|
+
if self.original_scores is not None:
|
|
809
|
+
scores_to_use = self.original_scores
|
|
810
|
+
else:
|
|
811
|
+
scores_to_use = self.scores
|
|
812
|
+
|
|
813
|
+
# Deep copy to avoid overwriting during normalization
|
|
814
|
+
original_scores = {}
|
|
815
|
+
for metric, values in scores_to_use.items():
|
|
816
|
+
original_scores[metric] = np.array(values).copy()
|
|
817
|
+
|
|
818
|
+
# Create temporary copy for z-score normalization
|
|
819
|
+
temp_scores = {}
|
|
820
|
+
for metric, values in original_scores.items():
|
|
821
|
+
temp_scores[metric] = values.copy()
|
|
822
|
+
|
|
823
|
+
# Apply z-score normalization to temp copy
|
|
824
|
+
zscore_normalized = {}
|
|
825
|
+
for metric in temp_scores:
|
|
826
|
+
values = temp_scores[metric]
|
|
827
|
+
mean_val = np.nanmean(values)
|
|
828
|
+
std_val = np.nanstd(values)
|
|
829
|
+
if std_val > 0:
|
|
830
|
+
zscore_normalized[metric] = (values - mean_val) / std_val
|
|
831
|
+
else:
|
|
832
|
+
zscore_normalized[metric] = values.copy()
|
|
833
|
+
|
|
834
|
+
# Generate summary table (removed redundant Min-Max Norm column)
|
|
835
|
+
summary = {
|
|
836
|
+
"Metric": [],
|
|
837
|
+
"Opt. Clusters": [], # Abbreviated from "Optimal Clusters"
|
|
838
|
+
"Raw Value": [], # Raw optimal value (not normalized)
|
|
839
|
+
"Z-Score Norm.": [], # Z-Score normalized optimal value
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
# Get maximum value and its position from original scores
|
|
843
|
+
for metric in self.metric_order:
|
|
844
|
+
values = original_scores.get(metric)
|
|
845
|
+
if values is None:
|
|
846
|
+
continue
|
|
847
|
+
|
|
848
|
+
if np.all(np.isnan(values)):
|
|
849
|
+
optimal_k, raw_value, z_val = np.nan, np.nan, np.nan
|
|
850
|
+
else:
|
|
851
|
+
pos = np.nanargmax(values)
|
|
852
|
+
optimal_k = pos + 2
|
|
853
|
+
raw_value = values[pos] # Use raw original value
|
|
854
|
+
z_val = zscore_normalized[metric][pos]
|
|
855
|
+
|
|
856
|
+
# Add data to the summary table
|
|
857
|
+
summary["Metric"].append(metric)
|
|
858
|
+
summary["Opt. Clusters"].append(optimal_k)
|
|
859
|
+
summary["Raw Value"].append(raw_value) # Raw value, not normalized
|
|
860
|
+
summary["Z-Score Norm."].append(z_val)
|
|
861
|
+
|
|
862
|
+
return pd.DataFrame(summary)
|
|
863
|
+
|
|
864
|
+
def plot_cqi_scores(self,
                    metrics_list=None,
                    norm="zscore",
                    palette="husl",
                    line_width=2,
                    style="whitegrid",
                    title=None,
                    xlabel="Number of Clusters",
                    ylabel="Normalized Score",
                    grid=True,
                    save_as=None,
                    dpi=200,
                    figsize=(12, 8),
                    show=True
                    ):
    """
    Plot combined scores for clustering quality indicators with customizable parameters.

    This function displays normalized metric values for easier comparison while preserving
    the original statistical properties in the legend.

    It first calculates raw means and standard deviations from the original data before applying any normalization,
    then uses these raw statistics in the legend labels to provide context about the actual scale and
    distribution of each metric.

    :param metrics_list: List of metrics to plot (default: all available metrics).
                         Unknown metric names are silently dropped.
    :param norm: Normalization method for plotting ("zscore", "range", or "none")
    :param palette: Color palette for the plot
    :param line_width: Width of plotted lines
    :param style: Seaborn style for the plot
    :param title: Plot title
    :param xlabel: X-axis label
    :param ylabel: Y-axis label
    :param grid: Whether to show grid lines
    :param save_as: File path to save the plot
    :param dpi: DPI for saved image
    :param figsize: Figure size in inches
    :param show: Whether to display the figure (default: True)

    :return: The figure object
    """
    # Resolve and validate the metric list BEFORE using it anywhere.
    # (Previously the raw, unvalidated metrics_list was used to build
    # original_stats and to size the palette, which raised KeyError for
    # unknown metric names and could size the palette from self.scores
    # rather than from the metrics actually plotted.)
    if metrics_list is None:
        metrics_list = list(self.metric_order)
    else:
        metrics_list = [metric for metric in metrics_list if metric in self.metric_order]

    # Snapshot of the scores prior to normalization. NOTE(review): this is a
    # shallow copy — the per-metric containers may still be shared with
    # self.scores; stats below are computed via np.array() so they are safe.
    original_scores = self.scores.copy()

    # Raw mean/std per metric, shown in the legend so readers keep a sense of
    # each metric's actual scale even though the plotted lines are normalized.
    original_stats = {}
    for metric in metrics_list:
        values = np.array(original_scores[metric])
        original_stats[metric] = {
            'mean': np.nanmean(values),
            'std': np.nanstd(values)
        }

    # Apply normalization if requested.
    # NOTE(review): _normalize_scores mutates self.scores in place; this side
    # effect is preserved for backward compatibility with existing callers.
    if norm != "none":
        self._normalize_scores(method=norm)

    # Set up plot
    sns.set(style=style)
    palette_colors = sns.color_palette(palette, len(metrics_list))
    plt.figure(figsize=figsize)

    # Plot each metric; x-axis spans cluster counts 2..max_clusters.
    for idx, metric in enumerate(metrics_list):
        values = np.array(self.scores[metric])

        # Use original (pre-normalization) statistics for the legend label.
        mean_val = original_stats[metric]['mean']
        std_val = original_stats[metric]['std']
        legend_label = f"{metric} ({mean_val:.2f} / {std_val:.2f})"

        plt.plot(
            range(2, self.max_clusters + 1),
            values,
            label=legend_label,
            color=palette_colors[idx],
            linewidth=line_width,
        )

    # Set title and labels
    if title is None:
        title = "Cluster Quality Metrics"

    plt.title(title, fontsize=14, fontweight="bold")
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel(ylabel, fontsize=12)

    # Configure ticks and legend (integer x-ticks: cluster counts)
    plt.xticks(ticks=range(2, self.max_clusters + 1), fontsize=10)
    plt.yticks(fontsize=10)
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.legend(title="Metrics (Raw Mean / Std Dev)", fontsize=10, title_fontsize=12)

    # Add a note about normalization
    norm_note = f"Note: Lines show {norm} normalized values; legend shows raw statistics"
    plt.figtext(0.5, 0.01, norm_note, ha='center', fontsize=10, style='italic')

    # Configure grid
    if grid:
        plt.grid(True, linestyle="--", alpha=0.7)
    else:
        plt.grid(False)

    # Adjust layout to make room for the note
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.1)

    # Save and show the plot
    return save_and_show_results(save_as, dpi, show=show)
|
|
978
|
+
|
|
979
|
+
|
|
980
|
+
class ClusterResults:
    """
    Accessor for the results of a hierarchical clustering run.

    Wraps the linkage matrix produced by a Cluster instance and exposes
    helpers to cut the dendrogram into a fixed number of clusters,
    summarize cluster sizes, and plot the resulting distribution.
    """

    def __init__(self, cluster):
        """
        Initialize the ClusterResults class.

        :param cluster: An instance of the Cluster class.
        :raises ValueError: If `cluster` is not a Cluster instance.
        """
        if not isinstance(cluster, Cluster):
            raise ValueError("Input must be an instance of the Cluster class.")

        # SciPy-format linkage matrix computed by the Cluster instance.
        self.linkage_matrix = cluster.linkage_matrix
        # Entity IDs, one per clustered row; used as the "Entity ID" column.
        self.entity_ids = cluster.entity_ids  # Retrieve entity IDs from Cluster class
        # Per-entity weights; assumed indexable by an integer positional array
        # (e.g. a NumPy array) — TODO confirm against Cluster.weights.
        self.weights = cluster.weights  # Retrieve weights from Cluster class

    def get_cluster_memberships(self, num_clusters) -> pd.DataFrame:
        """
        Generate a table mapping entity IDs to their corresponding cluster IDs.
        Based on this table, users later can link this to the original dataframe for further regression models.

        There is a common point of confusion because
        k is typically used to represent the number of clusters in clustering algorithms (e.g., k-means).
        However, SciPy's hierarchical clustering API specifically uses t as the parameter name.

        :param num_clusters: The number of clusters to create.
        :return: Pandas DataFrame with entity IDs and cluster memberships.
        :raises ValueError: If no linkage matrix is available.
        """
        if self.linkage_matrix is None:
            raise ValueError("Linkage matrix is not computed.")

        # Cut the dendrogram so that at most `num_clusters` flat clusters remain
        # (criterion="maxclust"); labels are 1-based cluster IDs.
        cluster_labels = fcluster(self.linkage_matrix, t=num_clusters, criterion="maxclust")
        return pd.DataFrame({"Entity ID": self.entity_ids, "Cluster": cluster_labels})

    def get_cluster_distribution(self, num_clusters, weighted=False) -> pd.DataFrame:
        """
        Generate a distribution summary of clusters showing counts, percentages, and optionally weighted statistics.

        This function calculates how many entities belong to each cluster and what
        percentage of the total they represent. When weighted=True, it also provides
        weight-based statistics.

        :param num_clusters: The number of clusters to create.
        :param weighted: If True, include weighted statistics in the distribution.
        :return: DataFrame with cluster distribution information
                 (columns: Cluster, Count, Percentage, and when weighted=True
                 also Weight_Sum and Weight_Percentage).
        """
        # Get cluster memberships (DataFrame with a default RangeIndex, which
        # the weighted branch below relies on for positional indexing).
        memberships_df = self.get_cluster_memberships(num_clusters)

        # Count entities in each cluster (index = cluster ID, sorted ascending)
        cluster_counts = memberships_df['Cluster'].value_counts().sort_index()

        # Calculate percentages of the total entity count, rounded to 2 dp
        total_entities = len(memberships_df)
        cluster_percentages = (cluster_counts / total_entities * 100).round(2)

        # Create basic distribution dataframe
        distribution = pd.DataFrame({
            'Cluster': cluster_counts.index,
            'Count': cluster_counts.values,
            'Percentage': cluster_percentages.values
        }).sort_values('Cluster')

        # Add weighted statistics if requested
        if weighted:
            cluster_weights = []
            weighted_percentages = []
            total_weight = np.sum(self.weights)

            for cluster_id in distribution['Cluster']:
                # Boolean mask of entities belonging to this cluster
                cluster_mask = memberships_df['Cluster'] == cluster_id
                # Positional row indices of those entities; valid as indices
                # into self.weights because memberships_df has a RangeIndex.
                cluster_entity_indices = memberships_df.index[cluster_mask]

                # Sum weights for entities in this cluster
                cluster_weight = np.sum(self.weights[cluster_entity_indices])
                cluster_weights.append(cluster_weight)

                # Weighted percentage; guard against a zero total weight
                weighted_pct = (cluster_weight / total_weight * 100) if total_weight > 0 else 0.0
                weighted_percentages.append(round(weighted_pct, 2))

            distribution['Weight_Sum'] = cluster_weights
            distribution['Weight_Percentage'] = weighted_percentages

        return distribution

    def plot_cluster_distribution(self, num_clusters, save_as=None, title=None,
                                  style="whitegrid", dpi=200, figsize=(10, 6), weighted=False):
        """
        Plot the distribution of entities across clusters as a bar chart.

        This visualization shows how many entities belong to each cluster, providing
        insight into the balance and size distribution of the clustering result.
        When weighted=True, displays weight-based percentages.

        :param num_clusters: The number of clusters to create.
        :param save_as: File path to save the plot. If None, the plot will be shown.
        :param title: Title for the plot. If None, a default title will be used.
        :param style: Seaborn style for the plot.
        :param dpi: DPI for saved image.
        :param figsize: Figure size in inches.
        :param weighted: If True, display weighted percentages instead of entity count percentages.
        """
        # Get cluster distribution data (include weights if needed)
        distribution = self.get_cluster_distribution(num_clusters, weighted=weighted)

        # Set up plot
        sns.set(style=style)
        plt.figure(figsize=figsize)

        # Choose what to plot based on the weighted parameter; fall back to raw
        # counts if the weighted columns are absent for any reason.
        if weighted and 'Weight_Sum' in distribution.columns:
            y_column = 'Weight_Sum'
            percentage_column = 'Weight_Percentage'
            ylabel = "Total Weight"
            note_text = "Y-axis shows weight sums; percentages above bars indicate weight-based relative frequency."
        else:
            y_column = 'Count'
            percentage_column = 'Percentage'
            ylabel = "Number of Entities"
            note_text = "Y-axis shows entity counts; percentages above bars indicate their relative frequency."

        # Create bar plot with a more poetic, fresh color palette
        # 'muted', 'pastel', and 'husl' are good options for fresher colors
        ax = sns.barplot(x='Cluster', y=y_column, data=distribution, palette='pastel')

        # Set the Y-axis range 20% above the tallest bar so the percentage
        # labels drawn above the bars do not overflow the axes.
        ax.set_ylim(0, distribution[y_column].max() * 1.2)

        # Integer y-ticks only make sense for raw entity counts
        if not weighted:
            plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))

        # Add percentage labels on top of bars. Relies on barplot emitting one
        # patch per distribution row, in row order — TODO confirm for the
        # seaborn version in use.
        for p, (_, row) in zip(ax.patches, distribution.iterrows()):
            height = p.get_height()
            percentage = row[percentage_column]
            # Offset the label slightly above the bar (at least 0.5 units)
            ax.text(p.get_x() + p.get_width() / 2., height + max(height * 0.02, 0.5),
                    f'{percentage:.1f}%', ha="center", fontsize=9)

        # Default title: sample size (and total weight in weighted mode)
        if title is None:
            if weighted:
                title = f"N = {len(self.entity_ids)}, Total Weight = {np.sum(self.weights):.1f}"
            else:
                title = f"N = {len(self.entity_ids)}"

        # Use a lighter, non-bold title style, right-aligned
        plt.title(title, fontsize=12, fontweight="normal", loc='right')

        plt.xlabel("Cluster ID", fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        plt.xticks(fontsize=10)
        plt.yticks(fontsize=10)

        # Ensure integer ticks for cluster IDs
        plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

        # Add grid for better readability but make it lighter
        plt.grid(axis='y', linestyle='--', alpha=0.4)

        # Adjust layout
        plt.tight_layout()

        # Adjust layout to make room for the note
        plt.subplots_adjust(bottom=0.13)

        # Add a note about what is being displayed
        plt.figtext(0.5, 0.01, note_text, ha='center', fontsize=10, style='italic')

        # Save and show the plot
        save_and_show_results(save_as, dpi)
|
|
1152
|
+
|
|
1153
|
+
|
|
1154
|
+
# NOTE (developer debugging aid): the traceback below came from PyCharm's pydevd debugger itself while debugging this file (an IDE-side issue), not from sequenzo:
|
|
1155
|
+
# Traceback (most recent call last):
|
|
1156
|
+
# File "/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 736, in make_thread_stack_str
|
|
1157
|
+
# append('file="%s" line="%s">' % (make_valid_xml_value(my_file), lineno))
|
|
1158
|
+
# File "/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/pydev/_pydevd_bundle/pydevd_xml.py", line 36, in make_valid_xml_value
|
|
1159
|
+
# return s.replace("&", "&").replace('<', '<').replace('>', '>').replace('"', '"')
|
|
1160
|
+
# AttributeError: 'tuple' object has no attribute 'replace'
|
|
1161
|
+
|
|
1162
|
+
if __name__ == '__main__':
    # Tutorial / smoke-test script: runs the full pipeline from raw data to
    # cluster-quality diagnostics. Executed only when this module is run
    # directly, never on import.

    from sequenzo import *  # Bring the public sequenzo API into scope
    import pandas as pd  # Data manipulation
    import numpy as np

    # List all the available datasets bundled with Sequenzo
    print('Available datasets in Sequenzo: ', list_datasets())

    # Load the MVAD dataset shipped with the package.
    # NOTE: the original script read a hard-coded local path
    # ("/Users/xinyi/.../mvad.csv") that exists only on one developer's
    # machine; using the bundled dataset keeps the demo runnable anywhere.
    df = load_dataset('mvad')

    # Monthly time-span columns, Jul.93 through Jun.99 (defined once; the
    # original script declared this identical list twice).
    time_list = ['Jul.93', 'Aug.93', 'Sep.93', 'Oct.93', 'Nov.93', 'Dec.93',
                 'Jan.94', 'Feb.94', 'Mar.94', 'Apr.94', 'May.94', 'Jun.94', 'Jul.94',
                 'Aug.94', 'Sep.94', 'Oct.94', 'Nov.94', 'Dec.94', 'Jan.95', 'Feb.95',
                 'Mar.95', 'Apr.95', 'May.95', 'Jun.95', 'Jul.95', 'Aug.95', 'Sep.95',
                 'Oct.95', 'Nov.95', 'Dec.95', 'Jan.96', 'Feb.96', 'Mar.96', 'Apr.96',
                 'May.96', 'Jun.96', 'Jul.96', 'Aug.96', 'Sep.96', 'Oct.96', 'Nov.96',
                 'Dec.96', 'Jan.97', 'Feb.97', 'Mar.97', 'Apr.97', 'May.97', 'Jun.97',
                 'Jul.97', 'Aug.97', 'Sep.97', 'Oct.97', 'Nov.97', 'Dec.97', 'Jan.98',
                 'Feb.98', 'Mar.98', 'Apr.98', 'May.98', 'Jun.98', 'Jul.98', 'Aug.98',
                 'Sep.98', 'Oct.98', 'Nov.98', 'Dec.98', 'Jan.99', 'Feb.99', 'Mar.99',
                 'Apr.99', 'May.99', 'Jun.99']

    # Discover the full alphabet of states observed in the time columns
    # (NaNs excluded), just to show what the data contains.
    all_unique_states = set()
    for col in time_list:
        unique_vals = df[col].dropna().unique()  # Remove NaN values
        all_unique_states.update(unique_vals)

    # Sorted list of the discovered states
    states = sorted(all_unique_states)
    print("All unique states:")
    for i, state in enumerate(states, 1):
        print(f"{i:2d}. {state}")

    print("\nstates list:")
    print(f"states = {states}")

    # Create a SequenceData object using the canonical state codes and their
    # human-readable labels (order must match between the two lists).
    states = ['FE', 'HE', 'employment', 'joblessness', 'school', 'training']
    labels = ['further education', 'higher education', 'employment', 'joblessness', 'school', 'training']

    # TODO: validate constructor arguments and raise a clear error when a
    # required parameter is missing or misspelled.
    sequence_data = SequenceData(df,
                                 time=time_list,
                                 id_col="id",
                                 states=states,
                                 labels=labels,
                                 )

    # Pairwise dissimilarities via Optimal Matching with a constant
    # substitution cost and indel cost of 1
    om = get_distance_matrix(sequence_data,
                             method="OM",
                             sm="CONSTANT",
                             indel=1)

    # Hierarchical clustering (Ward's method) on the OM distances
    cluster = Cluster(om, sequence_data.ids, clustering_method='ward_d')
    cluster.plot_dendrogram(xlabel="Individuals", ylabel="Distance")

    # Create a ClusterQuality object to evaluate clustering quality
    cluster_quality = ClusterQuality(cluster)
    cluster_quality.compute_cluster_quality_scores()
    cluster_quality.plot_cqi_scores(norm='zscore')
    summary_table = cluster_quality.get_cqi_table()
    print(summary_table)

    table = cluster_quality.get_cluster_range_table()
    # table.to_csv("cluster_quality_table.csv")
    print(table)
|