sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
- sequenzo/__init__.py +349 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +476 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +178 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1256 -0
- sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
- sequenzo/clustering/src/KMedoid.cpp +263 -0
- sequenzo/clustering/src/PAM.cpp +237 -0
- sequenzo/clustering/src/PAMonce.cpp +265 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +22 -0
- sequenzo/data_preprocessing/helpers.py +303 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/dyadic_children.csv +61 -0
- sequenzo/datasets/dyadic_parents.csv +61 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
- sequenzo/datasets/political_science_aid_shock.csv +166 -0
- sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
- sequenzo/define_sequence_data.py +1400 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +40 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +597 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +81 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +62 -0
- sequenzo/prefix_tree/hub.py +114 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
- sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
- sequenzo/prefix_tree/spell_level_indicators.py +297 -0
- sequenzo/prefix_tree/system_level_indicators.py +544 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/seqhmm/__init__.py +95 -0
- sequenzo/seqhmm/advanced_optimization.py +305 -0
- sequenzo/seqhmm/bootstrap.py +411 -0
- sequenzo/seqhmm/build_hmm.py +142 -0
- sequenzo/seqhmm/build_mhmm.py +136 -0
- sequenzo/seqhmm/build_nhmm.py +121 -0
- sequenzo/seqhmm/fit_mhmm.py +62 -0
- sequenzo/seqhmm/fit_model.py +61 -0
- sequenzo/seqhmm/fit_nhmm.py +76 -0
- sequenzo/seqhmm/formulas.py +289 -0
- sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
- sequenzo/seqhmm/gradients_nhmm.py +306 -0
- sequenzo/seqhmm/hmm.py +291 -0
- sequenzo/seqhmm/mhmm.py +314 -0
- sequenzo/seqhmm/model_comparison.py +238 -0
- sequenzo/seqhmm/multichannel_em.py +282 -0
- sequenzo/seqhmm/multichannel_utils.py +138 -0
- sequenzo/seqhmm/nhmm.py +270 -0
- sequenzo/seqhmm/nhmm_utils.py +191 -0
- sequenzo/seqhmm/predict.py +137 -0
- sequenzo/seqhmm/predict_mhmm.py +142 -0
- sequenzo/seqhmm/simulate.py +878 -0
- sequenzo/seqhmm/utils.py +218 -0
- sequenzo/seqhmm/visualization.py +910 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +66 -0
- sequenzo/suffix_tree/hub.py +114 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
- sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
- sequenzo/suffix_tree/spell_level_indicators.py +248 -0
- sequenzo/suffix_tree/system_level_indicators.py +535 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/version_check.py +283 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +222 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +405 -0
- sequenzo/visualization/plot_sequence_index.py +1175 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +651 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.31.dist-info/METADATA +286 -0
- sequenzo-0.1.31.dist-info/RECORD +299 -0
- sequenzo-0.1.31.dist-info/WHEEL +5 -0
- sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.31.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
#include "cluster_quality.h"
|
|
2
|
+
#include <iostream>
|
|
3
|
+
#include <limits>
|
|
4
|
+
#include <cstring>
|
|
5
|
+
|
|
6
|
+
#ifdef _OPENMP
|
|
7
|
+
#include <omp.h>
|
|
8
|
+
#endif
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Implementation matching R WeightedCluster exactly
|
|
12
|
+
* Based on clusterqualitybody.cpp from R package
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/**
 * Zero the accumulated values of every node currently held in the tree.
 *
 * The entries themselves stay in the container; only the clustDist0 and
 * clustDist1 fields of each pointed-to node are set back to 0.0.
 */
void resetKendallTree(KendallTree& kendall) {
    for (auto it = kendall.begin(); it != kendall.end(); ++it) {
        CmpCluster* node = it->second;
        node->clustDist0 = 0.0;
        node->clustDist1 = 0.0;
    }
}
|
|
21
|
+
|
|
22
|
+
/**
 * Release every node referenced by the tree and leave the tree empty.
 *
 * Each entry's pointed-to node is destroyed with delete, then all entries
 * are removed, so node pointers obtained before this call must not be
 * used afterwards.
 */
void finalizeKendall(KendallTree& kendall) {
    for (auto it = kendall.begin(); it != kendall.end(); ++it) {
        CmpCluster* node = it->second;
        delete node;
    }
    kendall.clear();
}
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Core function exactly matching R WeightedCluster implementation
|
|
31
|
+
*/
|
|
32
|
+
template<bool UseCondensed>
|
|
33
|
+
void compute_cluster_quality_core(const double* diss, const int* cluster, const double* weights,
|
|
34
|
+
int n, double* stats, int nclusters, double* asw,
|
|
35
|
+
KendallTree& kendall) {
|
|
36
|
+
|
|
37
|
+
// Initialize all statistics to NaN
|
|
38
|
+
std::fill(stats, stats + ClusterQualNumStat, std::numeric_limits<double>::quiet_NaN());
|
|
39
|
+
std::fill(asw, asw + 2 * nclusters, std::numeric_limits<double>::quiet_NaN());
|
|
40
|
+
|
|
41
|
+
// Variables following R implementation exactly - use double like R
|
|
42
|
+
double totweights = 0.0, wxy = 0.0, wx = 0.0, wy = 0.0, wx2 = 0.0;
|
|
43
|
+
double ww, xx, covxy, covx, covy, pearson, xb, yb, xw, xxw;
|
|
44
|
+
int ij = 0;
|
|
45
|
+
|
|
46
|
+
// Allocate arrays like R version (0-based indexing)
|
|
47
|
+
std::vector<double> errors(nclusters, 0.0);
|
|
48
|
+
std::vector<double> errors2(nclusters, 0.0);
|
|
49
|
+
std::vector<double> sizes(nclusters, 0.0);
|
|
50
|
+
|
|
51
|
+
// Initialize ASW arrays (output)
|
|
52
|
+
for (int i = 0; i < nclusters; i++) {
|
|
53
|
+
asw[i] = 0.0;
|
|
54
|
+
asw[i + nclusters] = 0.0;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// Initialize Kendall tree with zero distance node (like R)
|
|
58
|
+
CmpCluster* ZeroDist;
|
|
59
|
+
auto it_zero = kendall.find(0.0);
|
|
60
|
+
if (it_zero != kendall.end()) {
|
|
61
|
+
ZeroDist = it_zero->second;
|
|
62
|
+
} else {
|
|
63
|
+
ZeroDist = new CmpCluster();
|
|
64
|
+
kendall[0.0] = ZeroDist;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// Main computation loop following R version exactly
|
|
68
|
+
if constexpr (UseCondensed) {
|
|
69
|
+
ij = -n; // Condensed version initialization
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
for (int i = 0; i < n; i++) {
|
|
73
|
+
int iclustIndex = cluster[i] - 1; // Convert to 0-based for array access
|
|
74
|
+
if (iclustIndex >= 0 && iclustIndex < nclusters) {
|
|
75
|
+
sizes[iclustIndex] += weights[i];
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if constexpr (!UseCondensed) {
|
|
79
|
+
ij = i * n; // Full matrix version
|
|
80
|
+
} else {
|
|
81
|
+
ij += n - i - 1; // Condensed version offset
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
if (weights[i] > 0) {
|
|
85
|
+
// Diagonal term (distance to self = 0)
|
|
86
|
+
ww = weights[i] * weights[i];
|
|
87
|
+
wy += ww;
|
|
88
|
+
ZeroDist->clustDist0 += ww;
|
|
89
|
+
totweights += ww;
|
|
90
|
+
|
|
91
|
+
for (int j = i + 1; j < n; j++) {
|
|
92
|
+
if (weights[j] > 0) {
|
|
93
|
+
ww = 2.0 * weights[i] * weights[j]; // Factor of 2 like R
|
|
94
|
+
|
|
95
|
+
if constexpr (UseCondensed) {
|
|
96
|
+
// Use explicit condensed indexing to avoid stride/layout issues
|
|
97
|
+
xx = diss[getCondensedIndex(i, j, n)];
|
|
98
|
+
} else {
|
|
99
|
+
// Full square matrix (row-major) indexing
|
|
100
|
+
xx = diss[ij + j];
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// Find or create Kendall tree node
|
|
104
|
+
auto it = kendall.find(xx);
|
|
105
|
+
CmpCluster* cmpclust;
|
|
106
|
+
if (it != kendall.end()) {
|
|
107
|
+
cmpclust = it->second;
|
|
108
|
+
} else {
|
|
109
|
+
cmpclust = new CmpCluster();
|
|
110
|
+
kendall[xx] = cmpclust;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
xw = ww * xx;
|
|
114
|
+
xxw = xw * xx;
|
|
115
|
+
wx += xw;
|
|
116
|
+
wx2 += xxw;
|
|
117
|
+
|
|
118
|
+
if (cluster[i] == cluster[j]) {
|
|
119
|
+
// Same cluster
|
|
120
|
+
if (iclustIndex >= 0 && iclustIndex < nclusters) {
|
|
121
|
+
errors[iclustIndex] += xw;
|
|
122
|
+
errors2[iclustIndex] += xxw; // Add errors2 calculation like R
|
|
123
|
+
}
|
|
124
|
+
wxy += xw;
|
|
125
|
+
wy += ww;
|
|
126
|
+
cmpclust->clustDist0 += ww;
|
|
127
|
+
} else {
|
|
128
|
+
// Different clusters
|
|
129
|
+
cmpclust->clustDist1 += ww;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
totweights += ww;
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// Calculate Pearson correlation (HPG) exactly like R
|
|
139
|
+
if (totweights > 0) {
|
|
140
|
+
xb = wx / totweights;
|
|
141
|
+
yb = wy / totweights;
|
|
142
|
+
covx = wx2 / totweights - xb * xb;
|
|
143
|
+
covy = wy / totweights - yb * yb;
|
|
144
|
+
covxy = wxy / totweights - yb * xb;
|
|
145
|
+
|
|
146
|
+
// Debug: Print intermediate values
|
|
147
|
+
#ifdef DEBUG_PBC
|
|
148
|
+
std::cout << "DEBUG PBC: totweights=" << totweights << ", wx=" << wx << ", wy=" << wy << ", wxy=" << wxy << ", wx2=" << wx2 << std::endl;
|
|
149
|
+
std::cout << "DEBUG PBC: xb=" << xb << ", yb=" << yb << std::endl;
|
|
150
|
+
std::cout << "DEBUG PBC: covx=" << covx << ", covy=" << covy << ", covxy=" << covxy << std::endl;
|
|
151
|
+
#endif
|
|
152
|
+
|
|
153
|
+
if (covx > 0 && covy > 0) {
|
|
154
|
+
pearson = covxy / std::sqrt(covx * covy);
|
|
155
|
+
double pbc_value = -1.0 * static_cast<double>(pearson); // Apply negative to get positive PBC
|
|
156
|
+
stats[ClusterQualHPG] = pbc_value;
|
|
157
|
+
|
|
158
|
+
// Debug: Print final calculation
|
|
159
|
+
#ifdef DEBUG_PBC
|
|
160
|
+
std::cout << "DEBUG PBC: pearson=" << pearson << ", pbc_value=" << pbc_value << std::endl;
|
|
161
|
+
#endif
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
// Compute Kendall statistics (HG, HGSD, HC) exactly like R
|
|
166
|
+
double nc = 0.0, nd = 0.0, currentclustdist0 = 0.0, currentclustdist1 = 0.0;
|
|
167
|
+
double totdist0 = wy, totdist1 = totweights - wy, ntiesdist = 0.0;
|
|
168
|
+
double Smin = 0.0, wSmin = wy, Smax = 0.0, wSmax = totdist1, currentww = 0.0;
|
|
169
|
+
|
|
170
|
+
for (auto it = kendall.begin(); it != kendall.end(); ++it) {
|
|
171
|
+
CmpCluster* cmpclust = it->second;
|
|
172
|
+
ww = cmpclust->clustDist1 + cmpclust->clustDist0;
|
|
173
|
+
|
|
174
|
+
if (ww > 0) {
|
|
175
|
+
// Smin calculation
|
|
176
|
+
if (currentww <= wSmin) {
|
|
177
|
+
if (currentww + ww > wSmin) {
|
|
178
|
+
Smin += (wSmin - currentww) * it->first;
|
|
179
|
+
} else {
|
|
180
|
+
Smin += ww * it->first;
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
currentww += ww;
|
|
184
|
+
|
|
185
|
+
// Smax calculation
|
|
186
|
+
if (currentww > wSmax) {
|
|
187
|
+
if (currentww - ww < wSmax) {
|
|
188
|
+
Smax += (currentww - wSmax) * it->first;
|
|
189
|
+
} else {
|
|
190
|
+
Smax += ww * it->first;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// Count ties
|
|
195
|
+
ntiesdist += cmpclust->clustDist1 * cmpclust->clustDist0;
|
|
196
|
+
|
|
197
|
+
// Concordant and discordant pairs - exactly like R
|
|
198
|
+
nc += cmpclust->clustDist1 * currentclustdist0; // Bottom of table
|
|
199
|
+
nd += cmpclust->clustDist0 * currentclustdist1;
|
|
200
|
+
|
|
201
|
+
// Update running totals
|
|
202
|
+
currentclustdist0 += cmpclust->clustDist0;
|
|
203
|
+
currentclustdist1 += cmpclust->clustDist1;
|
|
204
|
+
|
|
205
|
+
// Top of table
|
|
206
|
+
nc += cmpclust->clustDist0 * (totdist1 - currentclustdist1);
|
|
207
|
+
nd += cmpclust->clustDist1 * (totdist0 - currentclustdist0);
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
// Compute final Kendall statistics (guard divisions to avoid NaN while matching R behavior)
|
|
212
|
+
double denom_hg = (nc + nd);
|
|
213
|
+
if (denom_hg > 0) {
|
|
214
|
+
stats[ClusterQualHG] = static_cast<double>((nc - nd) / denom_hg); // Gamma
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// HGSD (Somers' D)
|
|
218
|
+
double denom_hgsd = (nc + nd + ntiesdist);
|
|
219
|
+
if (denom_hgsd > 0) {
|
|
220
|
+
stats[ClusterQualHGSD] = (nc - nd) / denom_hgsd;
|
|
221
|
+
} else {
|
|
222
|
+
stats[ClusterQualHGSD] = 0.0; // avoid NaN in degenerate cases
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// HC (Hierarchical Criterion)
|
|
226
|
+
double denom_hc = (Smax - Smin);
|
|
227
|
+
if (denom_hc > 0) {
|
|
228
|
+
stats[ClusterQualHC] = (wxy - Smin) / denom_hc;
|
|
229
|
+
} else {
|
|
230
|
+
stats[ClusterQualHC] = 0.0; // avoid NaN when Smax == Smin
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
// Compute F and R statistics exactly like R
|
|
235
|
+
double SSres = 0.0;
|
|
236
|
+
double SS2res = 0.0;
|
|
237
|
+
double total_cluster_weights = 0.0;
|
|
238
|
+
|
|
239
|
+
for (int i = 0; i < nclusters; i++) {
|
|
240
|
+
if (sizes[i] > 0) {
|
|
241
|
+
SSres += errors[i] / sizes[i];
|
|
242
|
+
SS2res += errors2[i] / sizes[i];
|
|
243
|
+
total_cluster_weights += sizes[i];
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
if (total_cluster_weights > 0) {
|
|
248
|
+
double SSexpl = wx / total_cluster_weights - SSres;
|
|
249
|
+
double SS2expl = wx2 / total_cluster_weights - SS2res;
|
|
250
|
+
double dncluster = static_cast<double>(nclusters);
|
|
251
|
+
|
|
252
|
+
if (total_cluster_weights > dncluster && SSres > 0) {
|
|
253
|
+
stats[ClusterQualF] = (SSexpl / (dncluster - 1.0)) / (SSres / (total_cluster_weights - dncluster));
|
|
254
|
+
stats[ClusterQualR] = SSexpl / (SSres + SSexpl);
|
|
255
|
+
// F2 and R2 should be based on SS2, not squares of F and R
|
|
256
|
+
stats[ClusterQualF2] = (SS2expl / (dncluster - 1.0)) / (SS2res / (total_cluster_weights - dncluster));
|
|
257
|
+
stats[ClusterQualR2] = SS2expl / (SS2res + SS2expl);
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
// Compute ASW exactly like R version
|
|
262
|
+
double asw_i = 0.0;
|
|
263
|
+
double asw_w = 0.0;
|
|
264
|
+
|
|
265
|
+
// Reset ASW arrays
|
|
266
|
+
for (int j = 0; j < nclusters; j++) {
|
|
267
|
+
asw[j] = 0.0;
|
|
268
|
+
asw[j + nclusters] = 0.0;
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
for (int i = 0; i < n; i++) {
|
|
272
|
+
if (weights[i] > 0) {
|
|
273
|
+
int iclustIndex = cluster[i] - 1; // Convert to 0-based
|
|
274
|
+
if (iclustIndex < 0 || iclustIndex >= nclusters) continue;
|
|
275
|
+
|
|
276
|
+
double aik = 0.0;
|
|
277
|
+
std::vector<double> othergroups(nclusters, 0.0);
|
|
278
|
+
|
|
279
|
+
// Calculate distances to all other points
|
|
280
|
+
if constexpr (!UseCondensed) {
|
|
281
|
+
ij = i * n;
|
|
282
|
+
for (int j = 0; j < n; j++) {
|
|
283
|
+
if (i == j) continue;
|
|
284
|
+
int jclustIndex = cluster[j] - 1;
|
|
285
|
+
if (jclustIndex < 0 || jclustIndex >= nclusters) continue;
|
|
286
|
+
|
|
287
|
+
if (iclustIndex == jclustIndex) {
|
|
288
|
+
aik += weights[j] * diss[ij + j];
|
|
289
|
+
} else {
|
|
290
|
+
othergroups[jclustIndex] += weights[j] * diss[ij + j];
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
} else {
|
|
294
|
+
// Condensed version
|
|
295
|
+
for (int j = 0; j < n; j++) {
|
|
296
|
+
if (i == j) continue;
|
|
297
|
+
int jclustIndex = cluster[j] - 1;
|
|
298
|
+
if (jclustIndex < 0 || jclustIndex >= nclusters) continue;
|
|
299
|
+
|
|
300
|
+
double dist_val = (i < j) ? diss[getCondensedIndex(i, j, n)] : diss[getCondensedIndex(j, i, n)];
|
|
301
|
+
|
|
302
|
+
if (iclustIndex == jclustIndex) {
|
|
303
|
+
aik += weights[j] * dist_val;
|
|
304
|
+
} else {
|
|
305
|
+
othergroups[jclustIndex] += weights[j] * dist_val;
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
// Find minimum average distance to other clusters
|
|
311
|
+
double bik = std::numeric_limits<double>::max();
|
|
312
|
+
for (int j = 0; j < nclusters; j++) {
|
|
313
|
+
if (j != iclustIndex && sizes[j] > 0) {
|
|
314
|
+
double avg_dist = othergroups[j] / sizes[j];
|
|
315
|
+
if (bik >= avg_dist) {
|
|
316
|
+
bik = avg_dist;
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
// Calculate ASW values like R
|
|
322
|
+
double aik_w = aik / sizes[iclustIndex]; // Weighted version
|
|
323
|
+
if (sizes[iclustIndex] <= 1.0) {
|
|
324
|
+
aik = 0.0; // Avoid division by zero for singletons
|
|
325
|
+
} else {
|
|
326
|
+
aik /= (sizes[iclustIndex] - 1.0); // Unweighted version
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
if (bik != std::numeric_limits<double>::max()) {
|
|
330
|
+
double sik_i = weights[i] * ((bik - aik) / std::max(aik, bik));
|
|
331
|
+
double sik_w = weights[i] * ((bik - aik_w) / std::max(aik_w, bik));
|
|
332
|
+
|
|
333
|
+
asw[iclustIndex] += sik_i;
|
|
334
|
+
asw[iclustIndex + nclusters] += sik_w;
|
|
335
|
+
asw_i += sik_i;
|
|
336
|
+
asw_w += sik_w;
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
// Normalize cluster ASW by cluster sizes
|
|
342
|
+
for (int j = 0; j < nclusters; j++) {
|
|
343
|
+
if (sizes[j] > 0) {
|
|
344
|
+
asw[j] /= sizes[j];
|
|
345
|
+
asw[j + nclusters] /= sizes[j];
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
if (total_cluster_weights > 0) {
|
|
350
|
+
stats[ClusterQualASWi] = asw_i / total_cluster_weights;
|
|
351
|
+
stats[ClusterQualASWw] = asw_w / total_cluster_weights;
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
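// Public entry points: non-template wrappers that pick the full-matrix (false) or condensed (true) instantiation of compute_cluster_quality_core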
|
|
356
|
+
void clusterquality(const double* diss, const int* cluster, const double* weights,
|
|
357
|
+
int n, double* stats, int nclusters, double* asw,
|
|
358
|
+
KendallTree& kendall) {
|
|
359
|
+
compute_cluster_quality_core<false>(diss, cluster, weights, n, stats, nclusters, asw, kendall);
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
void clusterquality_dist(const double* diss, const int* cluster, const double* weights,
|
|
363
|
+
int n, double* stats, int nclusters, double* asw,
|
|
364
|
+
KendallTree& kendall) {
|
|
365
|
+
compute_cluster_quality_core<true>(diss, cluster, weights, n, stats, nclusters, asw, kendall);
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
// Individual (per-observation) ASW functions: self-contained silhouette computation that does not go through compute_cluster_quality_core
|
|
369
|
+
/**
 * Per-observation silhouette values for a full square dissimilarity matrix.
 *
 * @param diss      Row-major n x n dissimilarity matrix, read as diss[i * n + j].
 * @param cluster   1-based cluster label of each observation; labels outside
 *                  [1, nclusters] are ignored everywhere.
 * @param weights   Observation weights (length n).
 * @param n         Number of observations.
 * @param nclusters Number of clusters.
 * @param asw_i     Output [n]: silhouette whose within-cluster mean divides by
 *                  (cluster weight - 1).
 * @param asw_w     Output [n]: silhouette whose within-cluster mean divides by
 *                  the full cluster weight.
 *
 * Both outputs are pre-filled with NaN. An entry stays NaN when the
 * observation has an out-of-range label, when its cluster's total weight is
 * <= 1, or when no other cluster has positive weight. If the within- and
 * between-cluster means are both zero the ratio is 0/0 and is also NaN.
 */
void indiv_asw(const double* diss, const int* cluster, const double* weights,
               int n, int nclusters, double* asw_i, double* asw_w) {
    const double kNoValue = std::numeric_limits<double>::quiet_NaN();
    const double kNoNeighbour = std::numeric_limits<double>::max();

    std::fill(asw_i, asw_i + n, kNoValue);
    std::fill(asw_w, asw_w + n, kNoValue);

    // Total weight of each cluster.
    std::vector<double> sizes(nclusters, 0.0);
    for (int i = 0; i < n; i++) {
        const int clustIndex = cluster[i] - 1;
        if (clustIndex >= 0 && clustIndex < nclusters) {
            sizes[clustIndex] += weights[i];
        }
    }

    // Scratch buffer reused for every observation; refilling it avoids one
    // heap allocation per observation.
    std::vector<double> othergroups(nclusters, 0.0);

    for (int i = 0; i < n; i++) {
        const int iclustIndex = cluster[i] - 1;
        if (iclustIndex < 0 || iclustIndex >= nclusters || sizes[iclustIndex] <= 1.0) {
            continue;
        }

        std::fill(othergroups.begin(), othergroups.end(), 0.0);
        double aik = 0.0;

        // 64-bit row offset: i * n evaluated in int overflows once n > 46340.
        const double* row = diss + static_cast<long long>(i) * n;

        for (int j = 0; j < n; j++) {
            if (i == j) continue;
            const int jclustIndex = cluster[j] - 1;
            if (jclustIndex < 0 || jclustIndex >= nclusters) continue;

            const double dist = row[j];
            if (iclustIndex == jclustIndex) {
                aik += weights[j] * dist;
            } else {
                othergroups[jclustIndex] += weights[j] * dist;
            }
        }

        // Smallest weighted mean distance to any other non-empty cluster.
        double bik = kNoNeighbour;
        for (int k = 0; k < nclusters; k++) {
            if (k != iclustIndex && sizes[k] > 0) {
                const double avg_dist = othergroups[k] / sizes[k];
                if (bik >= avg_dist) {
                    bik = avg_dist;
                }
            }
        }

        const double aik_w = aik / sizes[iclustIndex];
        aik /= (sizes[iclustIndex] - 1.0);

        if (bik != kNoNeighbour) {
            asw_i[i] = (bik - aik) / std::max(aik, bik);
            asw_w[i] = (bik - aik_w) / std::max(aik_w, bik);
        }
    }
}
|
|
425
|
+
|
|
426
|
+
void indiv_asw_dist(const double* diss, const int* cluster, const double* weights,
|
|
427
|
+
int n, int nclusters, double* asw_i, double* asw_w) {
|
|
428
|
+
|
|
429
|
+
std::fill(asw_i, asw_i + n, std::numeric_limits<double>::quiet_NaN());
|
|
430
|
+
std::fill(asw_w, asw_w + n, std::numeric_limits<double>::quiet_NaN());
|
|
431
|
+
|
|
432
|
+
// For condensed version
|
|
433
|
+
std::vector<double> sizes(nclusters, 0.0);
|
|
434
|
+
for (int i = 0; i < n; i++) {
|
|
435
|
+
int clustIndex = cluster[i] - 1;
|
|
436
|
+
if (clustIndex >= 0 && clustIndex < nclusters) {
|
|
437
|
+
sizes[clustIndex] += weights[i];
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
for (int i = 0; i < n; i++) {
|
|
442
|
+
int iclustIndex = cluster[i] - 1;
|
|
443
|
+
if (iclustIndex < 0 || iclustIndex >= nclusters || sizes[iclustIndex] <= 1.0) {
|
|
444
|
+
continue;
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
double aik = 0.0, aik_w = 0.0;
|
|
448
|
+
std::vector<double> othergroups(nclusters, 0.0);
|
|
449
|
+
|
|
450
|
+
for (int j = 0; j < n; j++) {
|
|
451
|
+
if (i == j) continue;
|
|
452
|
+
int jclustIndex = cluster[j] - 1;
|
|
453
|
+
if (jclustIndex < 0 || jclustIndex >= nclusters) continue;
|
|
454
|
+
|
|
455
|
+
double dist = getDistanceFromCondensed(diss, i, j, n);
|
|
456
|
+
if (iclustIndex == jclustIndex) {
|
|
457
|
+
aik += weights[j] * dist;
|
|
458
|
+
} else {
|
|
459
|
+
othergroups[jclustIndex] += weights[j] * dist;
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
double bik = std::numeric_limits<double>::max();
|
|
464
|
+
for (int j = 0; j < nclusters; j++) {
|
|
465
|
+
if (j != iclustIndex && sizes[j] > 0) {
|
|
466
|
+
double avg_dist = othergroups[j] / sizes[j];
|
|
467
|
+
if (bik >= avg_dist) {
|
|
468
|
+
bik = avg_dist;
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
aik_w = aik / sizes[iclustIndex];
|
|
474
|
+
aik /= (sizes[iclustIndex] - 1.0);
|
|
475
|
+
|
|
476
|
+
if (bik != std::numeric_limits<double>::max()) {
|
|
477
|
+
asw_i[i] = (bik - aik) / std::max(aik, bik);
|
|
478
|
+
asw_w[i] = (bik - aik_w) / std::max(aik_w, bik);
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
// Simplified versions
|
|
484
|
+
void clusterqualitySimple(const double* diss, const int* cluster, const double* weights,
|
|
485
|
+
int n, double* stats, int nclusters, double* asw) {
|
|
486
|
+
KendallTree kendall;
|
|
487
|
+
clusterquality(diss, cluster, weights, n, stats, nclusters, asw, kendall);
|
|
488
|
+
finalizeKendall(kendall);
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
void clusterqualitySimple_dist(const double* diss, const int* cluster, const double* weights,
|
|
492
|
+
int n, double* stats, int nclusters, double* asw) {
|
|
493
|
+
KendallTree kendall;
|
|
494
|
+
clusterquality_dist(diss, cluster, weights, n, stats, nclusters, asw, kendall);
|
|
495
|
+
finalizeKendall(kendall);
|
|
496
|
+
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <pybind11/pybind11.h>
|
|
4
|
+
#include <pybind11/numpy.h>
|
|
5
|
+
#include <pybind11/stl.h>
|
|
6
|
+
#include <vector>
|
|
7
|
+
#include <map>
|
|
8
|
+
#include <cmath>
|
|
9
|
+
#include <algorithm>
|
|
10
|
+
#include <numeric>
|
|
11
|
+
|
|
12
|
+
namespace py = pybind11;
|
|
13
|
+
|
|
14
|
+
// Cluster Quality Index constants (matching R WeightedCluster package)
|
|
15
|
+
#define ClusterQualHPG 0 // Hubert's Gamma Prime (not implemented in this version)
|
|
16
|
+
#define ClusterQualHG 1 // Hubert's Gamma
|
|
17
|
+
#define ClusterQualHGSD 2 // Hubert's Gamma, Somers' D variant (WeightedCluster "HGSD") - NOTE(review): presumably not a standard deviation; confirm against the R package
|
|
18
|
+
#define ClusterQualASWi 3 // Average Silhouette Width (individual)
|
|
19
|
+
#define ClusterQualASWw 4 // Average Silhouette Width (weighted)
|
|
20
|
+
#define ClusterQualF 5 // Calinski-Harabasz (F statistic)
|
|
21
|
+
#define ClusterQualR 6 // R-squared
|
|
22
|
+
#define ClusterQualF2 7 // Pseudo-F (Calinski-Harabasz) computed from the SS2 sums (SS2expl, SS2res); not the square of F
|
|
23
|
+
#define ClusterQualR2 8 // Pseudo R-squared computed from the SS2 sums (SS2expl, SS2res); not the square of R
|
|
24
|
+
#define ClusterQualHC 9 // Hubert's C index: (wxy - Smin) / (Smax - Smin), 0 when Smax == Smin
|
|
25
|
+
#define ClusterQualNumStat 10
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* Class for caching pairwise distance comparisons used in Kendall's tau calculations
|
|
29
|
+
* This corresponds to the CmpCluster class in R's implementation
|
|
30
|
+
*/
|
|
31
|
+
class CmpCluster {
public:
    // Both values start at zero for every newly constructed entry.
    double clustDist0 = 0.0;
    double clustDist1 = 0.0;

    CmpCluster() = default;
    ~CmpCluster() = default;
};
|
|
39
|
+
|
|
40
|
+
// Ordered map from a double key to a raw CmpCluster pointer.
using KendallTree = std::map<double, CmpCluster*>;
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Core cluster quality computation functions
|
|
44
|
+
* These match the R WeightedCluster package implementation
|
|
45
|
+
*/
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Compute all cluster quality indicators for a distance matrix
|
|
49
|
+
*
|
|
50
|
+
* @param diss Distance matrix (full square form, n x n)
|
|
51
|
+
* @param cluster Cluster labels (1-based, as in R)
|
|
52
|
+
* @param weights Sample weights
|
|
53
|
+
* @param n Number of samples
|
|
54
|
+
* @param stats Output array for statistics [ClusterQualNumStat]
|
|
55
|
+
* @param nclusters Number of clusters
|
|
56
|
+
* @param asw Output array for cluster-level ASW [2 * nclusters]
|
|
57
|
+
* @param kendall Reference to Kendall tree for caching
|
|
58
|
+
*/
|
|
59
|
+
void clusterquality(const double* diss, const int* cluster, const double* weights,
|
|
60
|
+
int n, double* stats, int nclusters, double* asw,
|
|
61
|
+
KendallTree& kendall);
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Compute all cluster quality indicators for a condensed distance array
|
|
65
|
+
*
|
|
66
|
+
* @param diss Condensed distance array (upper triangle, length n*(n-1)/2)
|
|
67
|
+
* @param cluster Cluster labels (1-based, as in R)
|
|
68
|
+
* @param weights Sample weights
|
|
69
|
+
* @param n Number of samples
|
|
70
|
+
* @param stats Output array for statistics [ClusterQualNumStat]
|
|
71
|
+
* @param nclusters Number of clusters
|
|
72
|
+
* @param asw Output array for cluster-level ASW [2 * nclusters]
|
|
73
|
+
* @param kendall Reference to Kendall tree for caching
|
|
74
|
+
*/
|
|
75
|
+
void clusterquality_dist(const double* diss, const int* cluster, const double* weights,
|
|
76
|
+
int n, double* stats, int nclusters, double* asw,
|
|
77
|
+
KendallTree& kendall);
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* Compute individual ASW scores for all samples
|
|
81
|
+
*
|
|
82
|
+
* @param diss Distance matrix (full square form, n x n)
|
|
83
|
+
* @param cluster Cluster labels (1-based, as in R)
|
|
84
|
+
* @param weights Sample weights
|
|
85
|
+
* @param n Number of samples
|
|
86
|
+
* @param nclusters Number of clusters
|
|
87
|
+
* @param asw_i Output array for individual ASW [n]
|
|
88
|
+
* @param asw_w Output array for weighted individual ASW [n]
|
|
89
|
+
*/
|
|
90
|
+
void indiv_asw(const double* diss, const int* cluster, const double* weights,
|
|
91
|
+
int n, int nclusters, double* asw_i, double* asw_w);
|
|
92
|
+
|
|
93
|
+
/**
|
|
94
|
+
* Compute individual ASW scores for condensed distance array
|
|
95
|
+
*/
|
|
96
|
+
void indiv_asw_dist(const double* diss, const int* cluster, const double* weights,
|
|
97
|
+
int n, int nclusters, double* asw_i, double* asw_w);
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
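* Convenience wrappers: run the full clusterquality() / clusterquality_dist() computation with a locally created KendallTree that is passed to finalizeKendall() before returning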
|
|
101
|
+
*/
|
|
102
|
+
void clusterqualitySimple(const double* diss, const int* cluster, const double* weights,
|
|
103
|
+
int n, double* stats, int nclusters, double* asw);
|
|
104
|
+
|
|
105
|
+
void clusterqualitySimple_dist(const double* diss, const int* cluster, const double* weights,
|
|
106
|
+
int n, double* stats, int nclusters, double* asw);
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
* Helper functions for Kendall tree management
|
|
110
|
+
*/
|
|
111
|
+
void resetKendallTree(KendallTree& kendall);
|
|
112
|
+
void finalizeKendall(KendallTree& kendall);
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Utility functions
|
|
116
|
+
*/
|
|
117
|
+
/**
 * Map an unordered pair (i, j), i != j, to its position in a condensed
 * upper-triangle array for n observations.
 *
 * The pair is ordered internally, so (i, j) and (j, i) give the same index.
 * Passing i == j is not meaningful: no diagonal entry is stored.
 *
 * The arithmetic is done in 64 bits because the intermediate product i * n
 * overflows a signed int (undefined behaviour) once n exceeds 46340, even
 * when the final index still fits. The return type stays int, so results are
 * only valid while n * (n - 1) / 2 fits in an int (n <= 65536).
 */
inline int getCondensedIndex(int i, int j, int n) {
    const long long lo = std::min(i, j);
    const long long hi = std::max(i, j);
    const long long offset = lo * n - lo * (lo + 1) / 2 + (hi - lo - 1);
    return static_cast<int>(offset);
}
|
|
123
|
+
|
|
124
|
+
/**
 * Read the dissimilarity between observations i and j from a condensed
 * upper-triangle array for n observations.
 *
 * Returns 0.0 for i == j (the diagonal is not stored). The pair is ordered
 * internally, so the lookup is symmetric in i and j.
 *
 * The offset uses the same formula as getCondensedIndex but is computed and
 * used as a 64-bit value here: routing it through an int would truncate it
 * once the condensed array holds more than INT_MAX entries (n > 65536).
 */
inline double getDistanceFromCondensed(const double* diss, int i, int j, int n) {
    if (i == j) return 0.0;
    const long long lo = std::min(i, j);
    const long long hi = std::max(i, j);
    const long long offset = lo * n - lo * (lo + 1) / 2 + (hi - lo - 1);
    return diss[offset];
}
|