sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
- sequenzo/__init__.py +349 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +476 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +178 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1256 -0
- sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
- sequenzo/clustering/src/KMedoid.cpp +263 -0
- sequenzo/clustering/src/PAM.cpp +237 -0
- sequenzo/clustering/src/PAMonce.cpp +265 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +22 -0
- sequenzo/data_preprocessing/helpers.py +303 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/dyadic_children.csv +61 -0
- sequenzo/datasets/dyadic_parents.csv +61 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
- sequenzo/datasets/political_science_aid_shock.csv +166 -0
- sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
- sequenzo/define_sequence_data.py +1400 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +40 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +597 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +81 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +62 -0
- sequenzo/prefix_tree/hub.py +114 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
- sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
- sequenzo/prefix_tree/spell_level_indicators.py +297 -0
- sequenzo/prefix_tree/system_level_indicators.py +544 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/seqhmm/__init__.py +95 -0
- sequenzo/seqhmm/advanced_optimization.py +305 -0
- sequenzo/seqhmm/bootstrap.py +411 -0
- sequenzo/seqhmm/build_hmm.py +142 -0
- sequenzo/seqhmm/build_mhmm.py +136 -0
- sequenzo/seqhmm/build_nhmm.py +121 -0
- sequenzo/seqhmm/fit_mhmm.py +62 -0
- sequenzo/seqhmm/fit_model.py +61 -0
- sequenzo/seqhmm/fit_nhmm.py +76 -0
- sequenzo/seqhmm/formulas.py +289 -0
- sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
- sequenzo/seqhmm/gradients_nhmm.py +306 -0
- sequenzo/seqhmm/hmm.py +291 -0
- sequenzo/seqhmm/mhmm.py +314 -0
- sequenzo/seqhmm/model_comparison.py +238 -0
- sequenzo/seqhmm/multichannel_em.py +282 -0
- sequenzo/seqhmm/multichannel_utils.py +138 -0
- sequenzo/seqhmm/nhmm.py +270 -0
- sequenzo/seqhmm/nhmm_utils.py +191 -0
- sequenzo/seqhmm/predict.py +137 -0
- sequenzo/seqhmm/predict_mhmm.py +142 -0
- sequenzo/seqhmm/simulate.py +878 -0
- sequenzo/seqhmm/utils.py +218 -0
- sequenzo/seqhmm/visualization.py +910 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +66 -0
- sequenzo/suffix_tree/hub.py +114 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
- sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
- sequenzo/suffix_tree/spell_level_indicators.py +248 -0
- sequenzo/suffix_tree/system_level_indicators.py +535 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/version_check.py +283 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +222 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +405 -0
- sequenzo/visualization/plot_sequence_index.py +1175 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +651 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.31.dist-info/METADATA +286 -0
- sequenzo-0.1.31.dist-info/RECORD +299 -0
- sequenzo-0.1.31.dist-info/WHEEL +5 -0
- sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.31.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,570 @@
|
|
|
1
|
+
#include "cluster_quality.h"
|
|
2
|
+
#include <iostream>
|
|
3
|
+
#include <limits>
|
|
4
|
+
#include <cstring>
|
|
5
|
+
|
|
6
|
+
#ifdef _OPENMP
|
|
7
|
+
#include <omp.h>
|
|
8
|
+
#endif
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Implementation of cluster quality indicators matching R WeightedCluster package
|
|
12
|
+
*
|
|
13
|
+
* This implementation closely follows the logic in R's clusterquality.cpp
|
|
14
|
+
* to ensure numerical consistency with the WeightedCluster package.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
/**
 * Reset the two distance accumulators of every entry in the tree.
 *
 * For each element of `kendall`, the object referenced by the element's
 * `second` member gets `clustDist0` and `clustDist1` set back to zero.
 * The entries themselves are kept; nothing is added or removed.
 */
void resetKendallTree(KendallTree& kendall) {
    for (auto it = kendall.begin(); it != kendall.end(); ++it) {
        auto& node = it->second;
        node->clustDist0 = 0.0L;
        node->clustDist1 = 0.0L;
    }
}
|
|
23
|
+
|
|
24
|
+
/**
 * Release everything held by the tree and leave it empty.
 *
 * Each element's `second` member is passed to `delete`, after which the
 * container is cleared so no dangling pointers remain reachable through it.
 */
void finalizeKendall(KendallTree& kendall) {
    for (auto it = kendall.begin(); it != kendall.end(); ++it) {
        delete it->second;
    }
    kendall.clear();
}
|
|
30
|
+
|
|
31
|
+
/**
 * Compute individual ASW (silhouette) scores from a full distance matrix.
 *
 * For every observation i that belongs to a valid cluster with more than one
 * member, the score is (b_i - a_i) / max(a_i, b_i), where
 *   - a_i is the weight-averaged distance from i to the other members of its
 *     own cluster, and
 *   - b_i is the smallest weight-averaged distance from i to the members of
 *     any other non-empty cluster.
 * If max(a_i, b_i) is not positive the score is 0.
 *
 * @param diss      Row-major n*n distance matrix; (i, j) is read at diss[i*n + j].
 * @param cluster   Cluster label per observation. Labels outside
 *                  [1, nclusters] are treated as invalid and ignored.
 * @param weights   Weight per observation, applied to the distances being averaged.
 * @param n         Number of observations.
 * @param nclusters Largest valid cluster label.
 * @param asw_i     Output array of length n. Entries that are not computed
 *                  (invalid label, singleton cluster, no other cluster with
 *                  positive weight) are left as NaN.
 * @param asw_w     Output array of length n; receives the same values as asw_i.
 *
 * Does nothing when n <= 0. When nclusters < 1 every output entry is NaN.
 */
void indiv_asw(const double* diss, const int* cluster, const double* weights,
               int n, int nclusters, double* asw_i, double* asw_w) {

    // Without observations there is no valid output range to touch.
    if (n <= 0) {
        return;
    }

    // Initialize output arrays
    std::fill(asw_i, asw_i + n, std::numeric_limits<double>::quiet_NaN());
    std::fill(asw_w, asw_w + n, std::numeric_limits<double>::quiet_NaN());

    // No label can be valid; returning here also keeps the vector size below
    // from being derived from a negative value.
    if (nclusters < 1) {
        return;
    }

    // Count cluster sizes and validate
    std::vector<int> cluster_sizes(static_cast<std::size_t>(nclusters) + 1, 0);
    for (int i = 0; i < n; i++) {
        if (cluster[i] >= 1 && cluster[i] <= nclusters) {
            cluster_sizes[cluster[i]]++;
        }
    }

    // Row stride as size_t: the offset i * n does not fit in int once
    // n exceeds 46340, and signed overflow is undefined behaviour.
    const std::size_t stride = static_cast<std::size_t>(n);

    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        int ci = cluster[i];
        if (ci < 1 || ci > nclusters || cluster_sizes[ci] <= 1) {
            continue;  // Skip singletons or invalid clusters
        }

        // Start of row i in the distance matrix.
        const double* row = diss + static_cast<std::size_t>(i) * stride;

        double a_i = 0.0;  // Within-cluster average distance
        double b_i = std::numeric_limits<double>::max();  // Min between-cluster average

        // Calculate within-cluster average (a_i)
        double sum_within = 0.0;
        double weight_within = 0.0;

        for (int j = 0; j < n; j++) {
            if (i != j && cluster[j] == ci) {
                double dist = row[j];
                sum_within += dist * weights[j];
                weight_within += weights[j];
            }
        }

        if (weight_within > 0) {
            a_i = sum_within / weight_within;
        }

        // Calculate minimum between-cluster average (b_i)
        for (int k = 1; k <= nclusters; k++) {
            if (k == ci || cluster_sizes[k] == 0) continue;

            double sum_between = 0.0;
            double weight_between = 0.0;

            for (int j = 0; j < n; j++) {
                if (cluster[j] == k) {
                    double dist = row[j];
                    sum_between += dist * weights[j];
                    weight_between += weights[j];
                }
            }

            if (weight_between > 0) {
                double avg_between = sum_between / weight_between;
                b_i = std::min(b_i, avg_between);
            }
        }

        // Calculate silhouette scores; b_i still at its sentinel means no
        // other cluster contributed, so the NaN default is kept.
        if (b_i != std::numeric_limits<double>::max()) {
            double max_ab = std::max(a_i, b_i);
            if (max_ab > 0) {
                asw_i[i] = (b_i - a_i) / max_ab;
                asw_w[i] = asw_i[i];  // For individual scores, weighted = unweighted
            } else {
                asw_i[i] = 0.0;
                asw_w[i] = 0.0;
            }
        }
    }
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* Compute individual ASW scores for condensed distance array
|
|
112
|
+
*/
|
|
113
|
+
void indiv_asw_dist(const double* diss, const int* cluster, const double* weights,
|
|
114
|
+
int n, int nclusters, double* asw_i, double* asw_w) {
|
|
115
|
+
|
|
116
|
+
// Initialize output arrays
|
|
117
|
+
std::fill(asw_i, asw_i + n, std::numeric_limits<double>::quiet_NaN());
|
|
118
|
+
std::fill(asw_w, asw_w + n, std::numeric_limits<double>::quiet_NaN());
|
|
119
|
+
|
|
120
|
+
// Count cluster sizes and validate
|
|
121
|
+
std::vector<int> cluster_sizes(nclusters + 1, 0);
|
|
122
|
+
for (int i = 0; i < n; i++) {
|
|
123
|
+
if (cluster[i] >= 1 && cluster[i] <= nclusters) {
|
|
124
|
+
cluster_sizes[cluster[i]]++;
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
#pragma omp parallel for
|
|
129
|
+
for (int i = 0; i < n; i++) {
|
|
130
|
+
int ci = cluster[i];
|
|
131
|
+
if (ci < 1 || ci > nclusters || cluster_sizes[ci] <= 1) {
|
|
132
|
+
continue; // Skip singletons or invalid clusters
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
double a_i = 0.0; // Within-cluster average distance
|
|
136
|
+
double b_i = std::numeric_limits<double>::max(); // Min between-cluster average
|
|
137
|
+
|
|
138
|
+
// Calculate within-cluster average (a_i)
|
|
139
|
+
double sum_within = 0.0;
|
|
140
|
+
double weight_within = 0.0;
|
|
141
|
+
|
|
142
|
+
for (int j = 0; j < n; j++) {
|
|
143
|
+
if (i != j && cluster[j] == ci) {
|
|
144
|
+
double dist = getDistanceFromCondensed(diss, i, j, n);
|
|
145
|
+
sum_within += dist * weights[j];
|
|
146
|
+
weight_within += weights[j];
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
if (weight_within > 0) {
|
|
151
|
+
a_i = sum_within / weight_within;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
// Calculate minimum between-cluster average (b_i)
|
|
155
|
+
for (int k = 1; k <= nclusters; k++) {
|
|
156
|
+
if (k == ci || cluster_sizes[k] == 0) continue;
|
|
157
|
+
|
|
158
|
+
double sum_between = 0.0;
|
|
159
|
+
double weight_between = 0.0;
|
|
160
|
+
|
|
161
|
+
for (int j = 0; j < n; j++) {
|
|
162
|
+
if (cluster[j] == k) {
|
|
163
|
+
double dist = getDistanceFromCondensed(diss, i, j, n);
|
|
164
|
+
sum_between += dist * weights[j];
|
|
165
|
+
weight_between += weights[j];
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
if (weight_between > 0) {
|
|
170
|
+
double avg_between = sum_between / weight_between;
|
|
171
|
+
b_i = std::min(b_i, avg_between);
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
// Calculate silhouette scores
|
|
176
|
+
if (b_i != std::numeric_limits<double>::max()) {
|
|
177
|
+
double max_ab = std::max(a_i, b_i);
|
|
178
|
+
if (max_ab > 0) {
|
|
179
|
+
asw_i[i] = (b_i - a_i) / max_ab;
|
|
180
|
+
asw_w[i] = asw_i[i]; // For individual scores, weighted = unweighted
|
|
181
|
+
} else {
|
|
182
|
+
asw_i[i] = 0.0;
|
|
183
|
+
asw_w[i] = 0.0;
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/**
|
|
190
|
+
* Core function to compute all cluster quality indicators
|
|
191
|
+
* This follows the R implementation logic exactly
|
|
192
|
+
*/
|
|
193
|
+
template<bool UseCondensed>
|
|
194
|
+
void compute_cluster_quality_core(const double* diss, const int* cluster, const double* weights,
|
|
195
|
+
int n, double* stats, int nclusters, double* asw,
|
|
196
|
+
KendallTree& kendall) {
|
|
197
|
+
|
|
198
|
+
// Initialize all statistics to NaN
|
|
199
|
+
std::fill(stats, stats + ClusterQualNumStat, std::numeric_limits<double>::quiet_NaN());
|
|
200
|
+
std::fill(asw, asw + 2 * nclusters, std::numeric_limits<double>::quiet_NaN());
|
|
201
|
+
|
|
202
|
+
// Validate input - return all NaN for invalid cases
|
|
203
|
+
if (n < 2 || nclusters < 1 || nclusters >= n) {
|
|
204
|
+
return;
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// Count cluster sizes and compute total weight
|
|
208
|
+
std::vector<int> cluster_sizes(nclusters + 1, 0);
|
|
209
|
+
std::vector<double> cluster_weights(nclusters + 1, 0.0);
|
|
210
|
+
double total_weight = 0.0;
|
|
211
|
+
|
|
212
|
+
for (int i = 0; i < n; i++) {
|
|
213
|
+
if (cluster[i] >= 1 && cluster[i] <= nclusters) {
|
|
214
|
+
cluster_sizes[cluster[i]]++;
|
|
215
|
+
cluster_weights[cluster[i]] += weights[i];
|
|
216
|
+
}
|
|
217
|
+
total_weight += weights[i];
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// Check for valid clustering - need at least 2 non-empty clusters
|
|
221
|
+
int valid_clusters = 0;
|
|
222
|
+
for (int c = 1; c <= nclusters; c++) {
|
|
223
|
+
if (cluster_sizes[c] > 0) valid_clusters++;
|
|
224
|
+
}
|
|
225
|
+
if (valid_clusters < 2) {
|
|
226
|
+
// All stats remain NaN for invalid clustering
|
|
227
|
+
return;
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
// ===== Compute ASW (both individual and weighted) =====
|
|
231
|
+
std::vector<double> asw_individual(n);
|
|
232
|
+
std::vector<double> asw_weighted(n);
|
|
233
|
+
|
|
234
|
+
if constexpr (UseCondensed) {
|
|
235
|
+
indiv_asw_dist(diss, cluster, weights, n, nclusters, asw_individual.data(), asw_weighted.data());
|
|
236
|
+
} else {
|
|
237
|
+
indiv_asw(diss, cluster, weights, n, nclusters, asw_individual.data(), asw_weighted.data());
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
// Aggregate ASW by cluster
|
|
241
|
+
std::vector<double> cluster_asw_sum(nclusters + 1, 0.0);
|
|
242
|
+
std::vector<double> cluster_asw_weight(nclusters + 1, 0.0);
|
|
243
|
+
std::vector<double> cluster_asw_weighted_sum(nclusters + 1, 0.0);
|
|
244
|
+
|
|
245
|
+
for (int i = 0; i < n; i++) {
|
|
246
|
+
int ci = cluster[i];
|
|
247
|
+
if (ci >= 1 && ci <= nclusters && !std::isnan(asw_individual[i])) {
|
|
248
|
+
cluster_asw_sum[ci] += asw_individual[i];
|
|
249
|
+
cluster_asw_weighted_sum[ci] += asw_weighted[i] * weights[i];
|
|
250
|
+
cluster_asw_weight[ci] += weights[i];
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
// Store cluster-level ASW
|
|
255
|
+
double global_asw = 0.0, global_asw_weighted = 0.0;
|
|
256
|
+
double global_weight = 0.0;
|
|
257
|
+
int global_count = 0;
|
|
258
|
+
|
|
259
|
+
for (int c = 1; c <= nclusters; c++) {
|
|
260
|
+
if (cluster_sizes[c] > 1) { // Only include clusters with more than 1 member for ASW calculation
|
|
261
|
+
// Count valid individuals in this cluster (those with non-NaN ASW)
|
|
262
|
+
int valid_individuals = 0;
|
|
263
|
+
for (int i = 0; i < n; i++) {
|
|
264
|
+
if (cluster[i] == c && !std::isnan(asw_individual[i])) {
|
|
265
|
+
valid_individuals++;
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
if (valid_individuals > 0) {
|
|
270
|
+
asw[2 * (c - 1)] = cluster_asw_sum[c] / valid_individuals; // Unweighted ASW
|
|
271
|
+
if (cluster_asw_weight[c] > 0) {
|
|
272
|
+
asw[2 * (c - 1) + 1] = cluster_asw_weighted_sum[c] / cluster_asw_weight[c]; // Weighted ASW
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
global_asw += cluster_asw_sum[c];
|
|
276
|
+
global_asw_weighted += cluster_asw_weighted_sum[c];
|
|
277
|
+
global_weight += cluster_asw_weight[c];
|
|
278
|
+
global_count += valid_individuals;
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
stats[ClusterQualASWi] = (global_count > 0) ? global_asw / global_count : 0.0;
|
|
284
|
+
stats[ClusterQualASWw] = (global_weight > 0) ? global_asw_weighted / global_weight : 0.0;
|
|
285
|
+
|
|
286
|
+
// ===== Compute R² (weighted) =====
|
|
287
|
+
long double D_bar = 0.0L; // Global weighted mean of distances
|
|
288
|
+
long double total_pair_weight = 0.0L;
|
|
289
|
+
|
|
290
|
+
// Calculate global weighted mean (using upper triangle only)
|
|
291
|
+
for (int i = 0; i < n - 1; i++) {
|
|
292
|
+
for (int j = i + 1; j < n; j++) {
|
|
293
|
+
double dist;
|
|
294
|
+
if constexpr (UseCondensed) {
|
|
295
|
+
dist = diss[getCondensedIndex(i, j, n)];
|
|
296
|
+
} else {
|
|
297
|
+
dist = diss[i * n + j];
|
|
298
|
+
}
|
|
299
|
+
long double pair_weight = static_cast<long double>(weights[i]) * weights[j];
|
|
300
|
+
D_bar += dist * pair_weight;
|
|
301
|
+
total_pair_weight += pair_weight;
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
if (total_pair_weight > 0) {
|
|
305
|
+
D_bar /= total_pair_weight;
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// Calculate total sum of squares
|
|
309
|
+
long double total_ss = 0.0L;
|
|
310
|
+
for (int i = 0; i < n - 1; i++) {
|
|
311
|
+
for (int j = i + 1; j < n; j++) {
|
|
312
|
+
double dist;
|
|
313
|
+
if constexpr (UseCondensed) {
|
|
314
|
+
dist = diss[getCondensedIndex(i, j, n)];
|
|
315
|
+
} else {
|
|
316
|
+
dist = diss[i * n + j];
|
|
317
|
+
}
|
|
318
|
+
long double pair_weight = static_cast<long double>(weights[i]) * weights[j];
|
|
319
|
+
long double diff = dist - D_bar;
|
|
320
|
+
total_ss += pair_weight * diff * diff;
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
// Calculate within-cluster sum of squares
|
|
325
|
+
long double within_ss = 0.0L;
|
|
326
|
+
for (int c = 1; c <= nclusters; c++) {
|
|
327
|
+
if (cluster_sizes[c] < 2) continue;
|
|
328
|
+
|
|
329
|
+
// Get cluster members
|
|
330
|
+
std::vector<int> cluster_members;
|
|
331
|
+
for (int i = 0; i < n; i++) {
|
|
332
|
+
if (cluster[i] == c) {
|
|
333
|
+
cluster_members.push_back(i);
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
// Calculate cluster weighted mean
|
|
338
|
+
long double cluster_sum = 0.0L;
|
|
339
|
+
long double cluster_weight = 0.0L;
|
|
340
|
+
for (size_t ii = 0; ii < cluster_members.size() - 1; ii++) {
|
|
341
|
+
for (size_t jj = ii + 1; jj < cluster_members.size(); jj++) {
|
|
342
|
+
int i = cluster_members[ii];
|
|
343
|
+
int j = cluster_members[jj];
|
|
344
|
+
double dist;
|
|
345
|
+
if constexpr (UseCondensed) {
|
|
346
|
+
dist = diss[getCondensedIndex(i, j, n)];
|
|
347
|
+
} else {
|
|
348
|
+
dist = diss[i * n + j];
|
|
349
|
+
}
|
|
350
|
+
long double pair_weight = static_cast<long double>(weights[i]) * weights[j];
|
|
351
|
+
cluster_sum += dist * pair_weight;
|
|
352
|
+
cluster_weight += pair_weight;
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
if (cluster_weight > 0) {
|
|
357
|
+
long double cluster_mean = cluster_sum / cluster_weight;
|
|
358
|
+
|
|
359
|
+
// Add to within-cluster sum of squares
|
|
360
|
+
for (size_t ii = 0; ii < cluster_members.size() - 1; ii++) {
|
|
361
|
+
for (size_t jj = ii + 1; jj < cluster_members.size(); jj++) {
|
|
362
|
+
int i = cluster_members[ii];
|
|
363
|
+
int j = cluster_members[jj];
|
|
364
|
+
double dist;
|
|
365
|
+
if constexpr (UseCondensed) {
|
|
366
|
+
dist = diss[getCondensedIndex(i, j, n)];
|
|
367
|
+
} else {
|
|
368
|
+
dist = diss[i * n + j];
|
|
369
|
+
}
|
|
370
|
+
long double pair_weight = static_cast<long double>(weights[i]) * weights[j];
|
|
371
|
+
long double diff = dist - cluster_mean;
|
|
372
|
+
within_ss += pair_weight * diff * diff;
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
stats[ClusterQualR] = (total_ss > 0) ? static_cast<double>(1.0L - within_ss / total_ss) : 0.0;
|
|
379
|
+
stats[ClusterQualR2] = stats[ClusterQualR] * stats[ClusterQualR];
|
|
380
|
+
|
|
381
|
+
// ===== Compute Calinski-Harabasz =====
|
|
382
|
+
long double between_ss = total_ss - within_ss;
|
|
383
|
+
if (within_ss > 0 && nclusters > 1 && n > nclusters) {
|
|
384
|
+
long double f_stat = (between_ss / (nclusters - 1)) / (within_ss / (n - nclusters));
|
|
385
|
+
stats[ClusterQualF] = static_cast<double>(f_stat);
|
|
386
|
+
stats[ClusterQualF2] = stats[ClusterQualF] * stats[ClusterQualF];
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
// ===== Compute HPG (weighted point-biserial correlation) =====
|
|
390
|
+
{
|
|
391
|
+
long double sum_w = 0.0L; // Σ wij
|
|
392
|
+
long double sum_xw = 0.0L; // Σ wij * d_ij
|
|
393
|
+
long double sum_yw = 0.0L; // Σ wij * y_ij
|
|
394
|
+
long double sum_x2w= 0.0L; // Σ wij * d_ij^2
|
|
395
|
+
long double sum_y2w= 0.0L; // Σ wij * y_ij^2 (y^2==y 因为 y∈{0,1})
|
|
396
|
+
long double sum_xyw= 0.0L; // Σ wij * d_ij * y_ij
|
|
397
|
+
|
|
398
|
+
for (int i = 0; i < n - 1; ++i) {
|
|
399
|
+
for (int j = i + 1; j < n; ++j) {
|
|
400
|
+
const double wij = weights[i] * weights[j];
|
|
401
|
+
if (wij <= 0) continue;
|
|
402
|
+
const double dij = (UseCondensed ? diss[getCondensedIndex(i,j,n)]
|
|
403
|
+
: diss[i*n + j]);
|
|
404
|
+
const double yij = (cluster[i] == cluster[j]) ? 1.0 : 0.0;
|
|
405
|
+
|
|
406
|
+
sum_w += wij;
|
|
407
|
+
sum_xw += wij * dij;
|
|
408
|
+
sum_yw += wij * yij;
|
|
409
|
+
sum_x2w += wij * dij * dij;
|
|
410
|
+
sum_y2w += wij * yij; // yij^2 == yij
|
|
411
|
+
sum_xyw += wij * dij * yij;
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
if (sum_w > 0) {
|
|
416
|
+
const long double mx = sum_xw / sum_w;
|
|
417
|
+
const long double my = sum_yw / sum_w;
|
|
418
|
+
const long double cov_xy = (sum_xyw / sum_w) - mx * my;
|
|
419
|
+
const long double var_x = (sum_x2w / sum_w) - mx * mx;
|
|
420
|
+
const long double var_y = (sum_y2w / sum_w) - my * my;
|
|
421
|
+
|
|
422
|
+
if (var_x > 0 && var_y > 0) {
|
|
423
|
+
stats[ClusterQualHPG] = static_cast<double>(cov_xy / std::sqrt(var_x * var_y));
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
// ===== Compute HG and HGSD (Hubert's Gamma) =====
|
|
429
|
+
// Based on R WeightedCluster implementation - correct Kendall tau calculation
|
|
430
|
+
|
|
431
|
+
// Reset Kendall tree
|
|
432
|
+
for (auto& pair : kendall) {
|
|
433
|
+
pair.second->clustDist0 = 0.0L;
|
|
434
|
+
pair.second->clustDist1 = 0.0L;
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
// Build distance groups with cluster memberships
|
|
438
|
+
for (int i = 0; i < n - 1; i++) {
|
|
439
|
+
for (int j = i + 1; j < n; j++) {
|
|
440
|
+
double dist_ij;
|
|
441
|
+
if constexpr (UseCondensed) {
|
|
442
|
+
dist_ij = diss[getCondensedIndex(i, j, n)];
|
|
443
|
+
} else {
|
|
444
|
+
dist_ij = diss[i * n + j];
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
// Get or create entry in Kendall tree
|
|
448
|
+
auto it = kendall.find(dist_ij);
|
|
449
|
+
CmpCluster* cmp;
|
|
450
|
+
if (it == kendall.end()) {
|
|
451
|
+
cmp = new CmpCluster();
|
|
452
|
+
kendall[dist_ij] = cmp;
|
|
453
|
+
} else {
|
|
454
|
+
cmp = it->second;
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
// Count pairs: clustDist1 = same cluster, clustDist0 = different clusters
|
|
458
|
+
long double weight_pair = static_cast<long double>(weights[i]) * weights[j];
|
|
459
|
+
if (cluster[i] == cluster[j]) {
|
|
460
|
+
cmp->clustDist1 += weight_pair;
|
|
461
|
+
} else {
|
|
462
|
+
cmp->clustDist0 += weight_pair;
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
// Calculate Kendall's tau (Gamma) from the tree
|
|
468
|
+
long double gamma_concordant = 0.0L;
|
|
469
|
+
long double gamma_discordant = 0.0L;
|
|
470
|
+
|
|
471
|
+
for (auto it1 = kendall.begin(); it1 != kendall.end(); ++it1) {
|
|
472
|
+
for (auto it2 = std::next(it1); it2 != kendall.end(); ++it2) {
|
|
473
|
+
double d1 = it1->first;
|
|
474
|
+
double d2 = it2->first;
|
|
475
|
+
CmpCluster* cmp1 = it1->second;
|
|
476
|
+
CmpCluster* cmp2 = it2->second;
|
|
477
|
+
|
|
478
|
+
if (d1 < d2) {
|
|
479
|
+
// For distances d1 < d2, we expect same-cluster pairs to be more common at d1
|
|
480
|
+
// Concordant: more same-cluster pairs at smaller distance
|
|
481
|
+
gamma_concordant += cmp1->clustDist1 * cmp2->clustDist0;
|
|
482
|
+
// Discordant: more different-cluster pairs at smaller distance
|
|
483
|
+
gamma_discordant += cmp1->clustDist0 * cmp2->clustDist1;
|
|
484
|
+
}
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
long double gamma_total = gamma_concordant + gamma_discordant;
|
|
489
|
+
if (gamma_total > 0) {
|
|
490
|
+
stats[ClusterQualHG] = static_cast<double>((gamma_concordant - gamma_discordant) / gamma_total);
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
// HGSD: Temporarily set to NaN until exact R formula is ported
|
|
494
|
+
stats[ClusterQualHGSD] = std::numeric_limits<double>::quiet_NaN();
|
|
495
|
+
|
|
496
|
+
// ===== Compute HC (Hierarchical Criterion) =====
|
|
497
|
+
// This is a simplified version - the full implementation would need the dendrogram
|
|
498
|
+
std::vector<double> cluster_means(nclusters + 1, 0.0);
|
|
499
|
+
for (int c = 1; c <= nclusters; c++) {
|
|
500
|
+
if (cluster_sizes[c] > 0) {
|
|
501
|
+
// Calculate mean within-cluster distance
|
|
502
|
+
double sum_dist = 0.0;
|
|
503
|
+
int count = 0;
|
|
504
|
+
for (int i = 0; i < n; i++) {
|
|
505
|
+
if (cluster[i] == c) {
|
|
506
|
+
for (int j = i + 1; j < n; j++) {
|
|
507
|
+
if (cluster[j] == c) {
|
|
508
|
+
double dist;
|
|
509
|
+
if constexpr (UseCondensed) {
|
|
510
|
+
dist = diss[getCondensedIndex(i, j, n)];
|
|
511
|
+
} else {
|
|
512
|
+
dist = diss[i * n + j];
|
|
513
|
+
}
|
|
514
|
+
sum_dist += dist;
|
|
515
|
+
count++;
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
cluster_means[c] = (count > 0) ? sum_dist / count : 0.0;
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
double mean_of_means = 0.0;
|
|
525
|
+
int valid_mean_count = 0;
|
|
526
|
+
for (int c = 1; c <= nclusters; c++) {
|
|
527
|
+
if (cluster_sizes[c] > 0) {
|
|
528
|
+
mean_of_means += cluster_means[c];
|
|
529
|
+
valid_mean_count++;
|
|
530
|
+
}
|
|
531
|
+
}
|
|
532
|
+
mean_of_means /= valid_mean_count;
|
|
533
|
+
|
|
534
|
+
double variance = 0.0;
|
|
535
|
+
for (int c = 1; c <= nclusters; c++) {
|
|
536
|
+
if (cluster_sizes[c] > 0) {
|
|
537
|
+
variance += (cluster_means[c] - mean_of_means) * (cluster_means[c] - mean_of_means);
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
// HC: Temporarily set to NaN until exact R formula is ported
|
|
541
|
+
stats[ClusterQualHC] = std::numeric_limits<double>::quiet_NaN();
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
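// Public entry points: non-template wrappers that select the dissimilarity layout (full matrix vs. condensed) for the templated core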
|
|
545
|
+
void clusterquality(const double* diss, const int* cluster, const double* weights,
|
|
546
|
+
int n, double* stats, int nclusters, double* asw,
|
|
547
|
+
KendallTree& kendall) {
|
|
548
|
+
compute_cluster_quality_core<false>(diss, cluster, weights, n, stats, nclusters, asw, kendall);
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
void clusterquality_dist(const double* diss, const int* cluster, const double* weights,
|
|
552
|
+
int n, double* stats, int nclusters, double* asw,
|
|
553
|
+
KendallTree& kendall) {
|
|
554
|
+
compute_cluster_quality_core<true>(diss, cluster, weights, n, stats, nclusters, asw, kendall);
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
// Simplified versions: same computation as above, but the Kendall tree is a temporary created and finalized inside the call
|
|
558
|
+
/**
 * Convenience wrapper around clusterquality() for a full n x n dissimilarity
 * matrix that manages its own temporary Kendall tree.
 *
 * The tree is created locally, handed to clusterquality(), and finalized
 * before returning, so callers do not need to supply or release one.
 *
 * @param diss      Dissimilarities stored as a flat n * n array.
 * @param cluster   Cluster label for each of the n observations.
 * @param weights   Weight for each of the n observations.
 * @param n         Number of observations.
 * @param stats     Output array indexed by the ClusterQual* constants.
 * @param nclusters Number of clusters.
 * @param asw       Output array with two entries per cluster (unweighted,
 *                  then weighted ASW).
 *
 * Any exception raised by clusterquality() is rethrown unchanged after the
 * temporary tree has been finalized.
 */
void clusterqualitySimple(const double* diss, const int* cluster, const double* weights,
                          int n, double* stats, int nclusters, double* asw) {
    KendallTree kendall;  // Scratch tree that lives only for this call

    // The computation stores raw CmpCluster pointers in the tree, and the
    // tree going out of scope does not by itself run finalizeKendall().
    // Finalize on the failure path as well, so an exception (e.g. an
    // allocation failure) does not skip the cleanup step.
    // NOTE(review): assumes finalizeKendall() releases the tree's entries and
    // is safe on a partially populated tree — confirm against its definition.
    try {
        clusterquality(diss, cluster, weights, n, stats, nclusters, asw, kendall);
    } catch (...) {
        finalizeKendall(kendall);
        throw;
    }
    finalizeKendall(kendall);
}
|
|
564
|
+
|
|
565
|
+
/**
 * Convenience wrapper around clusterquality_dist() for a condensed
 * dissimilarity vector that manages its own temporary Kendall tree.
 *
 * The tree is created locally, handed to clusterquality_dist(), and finalized
 * before returning, so callers do not need to supply or release one.
 *
 * @param diss      Dissimilarities in condensed form.
 * @param cluster   Cluster label for each of the n observations.
 * @param weights   Weight for each of the n observations.
 * @param n         Number of observations.
 * @param stats     Output array indexed by the ClusterQual* constants.
 * @param nclusters Number of clusters.
 * @param asw       Output array with two entries per cluster (unweighted,
 *                  then weighted ASW).
 *
 * Any exception raised by clusterquality_dist() is rethrown unchanged after
 * the temporary tree has been finalized.
 */
void clusterqualitySimple_dist(const double* diss, const int* cluster, const double* weights,
                               int n, double* stats, int nclusters, double* asw) {
    KendallTree kendall;  // Scratch tree that lives only for this call

    // The computation stores raw CmpCluster pointers in the tree, and the
    // tree going out of scope does not by itself run finalizeKendall().
    // Finalize on the failure path as well, so an exception (e.g. an
    // allocation failure) does not skip the cleanup step.
    // NOTE(review): assumes finalizeKendall() releases the tree's entries and
    // is safe on a partially populated tree — confirm against its definition.
    try {
        clusterquality_dist(diss, cluster, weights, n, stats, nclusters, asw, kendall);
    } catch (...) {
        finalizeKendall(kendall);
        throw;
    }
    finalizeKendall(kendall);
}
|