sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
- sequenzo/__init__.py +349 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +476 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +178 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1256 -0
- sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
- sequenzo/clustering/src/KMedoid.cpp +263 -0
- sequenzo/clustering/src/PAM.cpp +237 -0
- sequenzo/clustering/src/PAMonce.cpp +265 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +22 -0
- sequenzo/data_preprocessing/helpers.py +303 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/dyadic_children.csv +61 -0
- sequenzo/datasets/dyadic_parents.csv +61 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
- sequenzo/datasets/political_science_aid_shock.csv +166 -0
- sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
- sequenzo/define_sequence_data.py +1400 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +40 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +597 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +81 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +62 -0
- sequenzo/prefix_tree/hub.py +114 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
- sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
- sequenzo/prefix_tree/spell_level_indicators.py +297 -0
- sequenzo/prefix_tree/system_level_indicators.py +544 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/seqhmm/__init__.py +95 -0
- sequenzo/seqhmm/advanced_optimization.py +305 -0
- sequenzo/seqhmm/bootstrap.py +411 -0
- sequenzo/seqhmm/build_hmm.py +142 -0
- sequenzo/seqhmm/build_mhmm.py +136 -0
- sequenzo/seqhmm/build_nhmm.py +121 -0
- sequenzo/seqhmm/fit_mhmm.py +62 -0
- sequenzo/seqhmm/fit_model.py +61 -0
- sequenzo/seqhmm/fit_nhmm.py +76 -0
- sequenzo/seqhmm/formulas.py +289 -0
- sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
- sequenzo/seqhmm/gradients_nhmm.py +306 -0
- sequenzo/seqhmm/hmm.py +291 -0
- sequenzo/seqhmm/mhmm.py +314 -0
- sequenzo/seqhmm/model_comparison.py +238 -0
- sequenzo/seqhmm/multichannel_em.py +282 -0
- sequenzo/seqhmm/multichannel_utils.py +138 -0
- sequenzo/seqhmm/nhmm.py +270 -0
- sequenzo/seqhmm/nhmm_utils.py +191 -0
- sequenzo/seqhmm/predict.py +137 -0
- sequenzo/seqhmm/predict_mhmm.py +142 -0
- sequenzo/seqhmm/simulate.py +878 -0
- sequenzo/seqhmm/utils.py +218 -0
- sequenzo/seqhmm/visualization.py +910 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +66 -0
- sequenzo/suffix_tree/hub.py +114 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
- sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
- sequenzo/suffix_tree/spell_level_indicators.py +248 -0
- sequenzo/suffix_tree/system_level_indicators.py +535 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/version_check.py +283 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +222 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +405 -0
- sequenzo/visualization/plot_sequence_index.py +1175 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +651 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.31.dist-info/METADATA +286 -0
- sequenzo-0.1.31.dist-info/RECORD +299 -0
- sequenzo-0.1.31.dist-info/WHEEL +5 -0
- sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.31.dist-info/top_level.txt +2 -0
sequenzo/suffix_tree/individual_level_indicators.py
@@ -0,0 +1,1679 @@
"""
Individual-level indicators for position-based suffix tree (level = time index from end).

This module provides per-sequence (per-individual) convergence and rarity measures when
the unit of analysis is TIME INDEX FROM END: level t = suffix (states from time t to end).
Lower suffix rarity means a more typical (common) ending pattern; convergence = low rarity.

Usage (position-based: list of sequences, same length)
-----------------------------------------------------
from sequenzo import IndividualConvergence, extract_sequences

# sequences: list of lists, all same length T (e.g. from build_suffix_tree(..., mode="position").sequences)
sequences = extract_sequences(df, time_cols, id_col, states)
ind = IndividualConvergence(sequences)

# Per-year suffix rarity: (N x T) matrix or DataFrame. rarity_{i,t} = -log(freq(suffix_{i,t})/N)
rarity_df = ind.compute_suffix_rarity_per_year(as_dataframe=True, zscore=False)

# One score per individual: sum over t of rarity, or standardized (min over windows of max z)
scores = ind.compute_suffix_rarity_score()
std_scores = ind.compute_standardized_rarity_score(min_t=1, window=1)

# Binary convergence (0/1) and first convergence year (1-indexed, or None)
converged = ind.compute_converged(method="zscore", z_threshold=1.5, min_t=1, window=1)
first_year = ind.compute_first_convergence_year(method="zscore", z_threshold=1.5, min_t=1)

# Methods: "zscore" (window of low z, i.e. z < -z_threshold), "top_proportion" (bottom p% most typical), "quantile" (below quantile)
# With group_labels, top_proportion/quantile are applied within each group.

# Path uniqueness: count of time steps (from end) at which suffix is unique (freq==1)
uniqueness = ind.compute_path_uniqueness()

Spell-based (level = spell index from end) is in spell_individual_level_indicators.SpellIndividualConvergence;
use build_spell_suffix_tree(seqdata) then SpellIndividualConvergence(tree).

@Author : Yuqi Liang 梁彧祺
@File : individual_level_indicators.py
@Time : 08/08/2025 15:30
"""
from collections import defaultdict
from typing import Optional, List
import numpy as np
import pandas as pd


class IndividualConvergence:
    """
    Individual-level convergence and suffix rarity for position-based suffix trees.

    Input: sequences — a list of sequences (list of lists), all of the same length T.
    Each sequence is the list of states at time 1, 2, ..., T. Level t corresponds
    to the suffix (states from time t to end). Rarity at (i, t) is
    -log(freq(suffix_{i,t})/N); lower rarity = more typical ending.

    Main methods:
    - compute_suffix_rarity_per_year: (N x T) rarity matrix or DataFrame.
    - compute_suffix_rarity_score: one aggregated rarity score per individual (sum over t).
    - compute_standardized_rarity_score: z-based score for classification (lower = more typical).
    - compute_converged: binary 0/1 per individual (method: zscore, top_proportion, quantile).
    - compute_first_convergence_year: first year (1-indexed) at which converged, or None.
    - compute_path_uniqueness: count of time steps (from end) with unique suffix per individual.
    - diagnose_convergence_calculation: diagnostic dict (variance by year, count converged, etc.).

    Plotting: plot_suffix_rarity_distribution, plot_individual_indicators_correlation (in this module).
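
    Example
    -------
    A minimal illustrative sketch (toy data, three hypothetical sequences of
    length three; not from the package's datasets):

        >>> ind = IndividualConvergence([[1, 2, 3], [1, 2, 3], [2, 2, 3]])
        >>> ind.T
        3
        >>> ind.compute_suffix_rarity_per_year(as_dataframe=False).shape
        (3, 3)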
"""
|
|
66
|
+
|
|
67
|
+
def __init__(self, sequences):
|
|
68
|
+
# Handle case where sequences might already be an IndividualConvergence object
|
|
69
|
+
if isinstance(sequences, IndividualConvergence):
|
|
70
|
+
# Extract sequences from existing object
|
|
71
|
+
self.sequences = sequences.sequences
|
|
72
|
+
elif hasattr(sequences, 'sequences'):
|
|
73
|
+
# Handle case where input might be another object with sequences attribute
|
|
74
|
+
self.sequences = sequences.sequences
|
|
75
|
+
else:
|
|
76
|
+
# Normal case: sequences is a list of sequences
|
|
77
|
+
self.sequences = sequences
|
|
78
|
+
|
|
79
|
+
# Validate input
|
|
80
|
+
if not self.sequences or len(self.sequences) == 0:
|
|
81
|
+
raise ValueError("sequences cannot be empty")
|
|
82
|
+
if not hasattr(self.sequences[0], '__len__') and not hasattr(self.sequences[0], '__iter__'):
|
|
83
|
+
raise ValueError("sequences must be a list of sequences (e.g., [[1,2,3], [2,3,1], ...])")
|
|
84
|
+
|
|
85
|
+
# 验证所有序列长度相同,防止不规整序列的静默错误
|
|
86
|
+
L0 = len(self.sequences[0])
|
|
87
|
+
if any(len(s) != L0 for s in self.sequences):
|
|
88
|
+
raise ValueError("All sequences must have the same length")
|
|
89
|
+
self.T = L0
|
|
90
|
+
|
|
91
|
+
self.suffix_freq_by_year = self._build_suffix_frequencies()
|
|
92
|
+
|
|
93
|
+
def _build_suffix_frequencies(self):
|
|
94
|
+
"""
|
|
95
|
+
Build suffix frequencies for each year t.
|
|
96
|
+
suffix[t] contains frequency of suffixes from year t to end for all individuals.
|
|
97
|
+
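
        Example
        -------
        Illustrative sketch (toy data): the suffix (2,) shared by all three
        sequences at the last position is counted once per individual:

            >>> ind = IndividualConvergence([[1, 2], [1, 2], [3, 2]])
            >>> ind.suffix_freq_by_year[0][(1, 2)]
            2
            >>> ind.suffix_freq_by_year[1][(2,)]
            3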
"""
|
|
98
|
+
freq_by_year = [defaultdict(int) for _ in range(self.T)]
|
|
99
|
+
for seq in self.sequences:
|
|
100
|
+
for t in range(self.T):
|
|
101
|
+
suffix = tuple(seq[t:]) # suffix from year t to end
|
|
102
|
+
freq_by_year[t][suffix] += 1
|
|
103
|
+
return freq_by_year
|
|
104
|
+
|
|
105
|
+
# Divergence-related computations are intentionally omitted in this convergence-focused module.
|
|
106
|
+
|
|
107
|
+
def compute_converged(
|
|
108
|
+
self,
|
|
109
|
+
z_threshold=1.5,
|
|
110
|
+
min_t=1,
|
|
111
|
+
max_t=None,
|
|
112
|
+
window=1,
|
|
113
|
+
inclusive=False,
|
|
114
|
+
group_labels=None,
|
|
115
|
+
*,
|
|
116
|
+
method: str = "zscore",
|
|
117
|
+
proportion: Optional[float] = None,
|
|
118
|
+
quantile_p: Optional[float] = None,
|
|
119
|
+
min_count: int = 1,
|
|
120
|
+
):
|
|
121
|
+
"""
|
|
122
|
+
Compute binary convergence flags with multiple selection methods.
|
|
123
|
+
|
|
124
|
+
Definition (common intuition): lower suffix-rarity implies more typical behavior.
|
|
125
|
+
We compute per-year rarity via suffix frequencies and then detect convergence using
|
|
126
|
+
one of the following methods:
|
|
127
|
+
|
|
128
|
+
Methods
|
|
129
|
+
-------
|
|
130
|
+
- "zscore" (window-based, default):
|
|
131
|
+
Uses per-year z-scores of rarity. A person is converged if there exists a window
|
|
132
|
+
of length `window` starting between years `[min_t, max_t]` where all z-scores are
|
|
133
|
+
below `-z_threshold` (use `inclusive=True` for `<=`). Zero-variance years remain
|
|
134
|
+
NaN and any window containing NaN is skipped.
|
|
135
|
+
|
|
136
|
+
- "top_proportion" (aka "topk"/"proportion"/"rank"):
|
|
137
|
+
Uses the aggregated standardized score from `compute_standardized_rarity_score`
|
|
138
|
+
(lower = more typical). Select the most typical `proportion` within each group if
|
|
139
|
+
`group_labels` is provided, otherwise globally. `min_count` ensures at least the
|
|
140
|
+
specified number per group.
|
|
141
|
+
|
|
142
|
+
- "quantile":
|
|
143
|
+
Uses a quantile threshold (`quantile_p`) on the aggregated standardized score,
|
|
144
|
+
within each group (or globally if no `group_labels`). Individuals at or below the
|
|
145
|
+
threshold are marked converged.
|
|
146
|
+
|
|
147
|
+
Parameters
|
|
148
|
+
----------
|
|
149
|
+
z_threshold : float, default 1.5
|
|
150
|
+
zscore method only. Converged when z < -z_threshold (or <= if inclusive=True).
|
|
151
|
+
min_t, max_t : int
|
|
152
|
+
Search interval for the starting year (1-indexed). If max_t is None, uses T - window + 1.
|
|
153
|
+
window : int, default 1
|
|
154
|
+
Number of consecutive years required in zscore method and used in standardized aggregation.
|
|
155
|
+
inclusive : bool, default False
|
|
156
|
+
zscore method only. If True, use <= comparisons.
|
|
157
|
+
group_labels : array-like or None
|
|
158
|
+
If provided, proportion/quantile selections are computed within each group.
|
|
159
|
+
method : str, default "zscore"
|
|
160
|
+
One of {"zscore", "top_proportion" (aliases: "topk","proportion","rank"), "quantile"}.
|
|
161
|
+
proportion : float or None
|
|
162
|
+
For top_proportion. Fraction (0,1) to select as converged. Defaults to 0.10 if None.
|
|
163
|
+
quantile_p : float or None
|
|
164
|
+
For quantile. Quantile in (0,1) used as threshold. Defaults to 0.10 if None.
|
|
165
|
+
min_count : int, default 1
|
|
166
|
+
For top_proportion. Lower bound for number selected per group.
|
|
167
|
+
|
|
168
|
+
Returns
|
|
169
|
+
-------
|
|
170
|
+
List[int]
|
|
171
|
+
0/1 indicator for each individual.
|
|
172
|
+
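
        Example
        -------
        A minimal illustrative sketch (toy data; with so few sequences nobody
        clears the default z-threshold, so all flags are 0):

            >>> ind = IndividualConvergence([[1, 2, 3], [1, 2, 3], [2, 2, 3]])
            >>> ind.compute_converged(method="zscore", z_threshold=1.5)
            [0, 0, 0]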
"""
|
|
173
|
+
if max_t is None:
|
|
174
|
+
max_t = self.T - window + 1
|
|
175
|
+
|
|
176
|
+
N = len(self.sequences)
|
|
177
|
+
method_norm = (method or "zscore").lower()
|
|
178
|
+
|
|
179
|
+
# Branch: rank/quantile style selections using aggregated standardized scores
|
|
180
|
+
if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
|
|
181
|
+
p = proportion if proportion is not None else 0.10
|
|
182
|
+
scores = np.asarray(
|
|
183
|
+
self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float
|
|
184
|
+
)
|
|
185
|
+
if group_labels is None:
|
|
186
|
+
vals = scores
|
|
187
|
+
finite_mask = np.isfinite(vals)
|
|
188
|
+
n_valid = int(np.sum(finite_mask))
|
|
189
|
+
if n_valid == 0:
|
|
190
|
+
return [0] * N
|
|
191
|
+
k = int(np.floor(p * n_valid))
|
|
192
|
+
if k < int(min_count):
|
|
193
|
+
k = int(min_count)
|
|
194
|
+
if k > n_valid:
|
|
195
|
+
k = n_valid
|
|
196
|
+
order = np.argsort(np.where(np.isfinite(vals), vals, np.inf), kind="mergesort")
|
|
197
|
+
flags = np.zeros(N, dtype=int)
|
|
198
|
+
if k >= 1:
|
|
199
|
+
selected = order[:k]
|
|
200
|
+
flags[selected] = 1
|
|
201
|
+
return flags.tolist()
|
|
202
|
+
else:
|
|
203
|
+
flags, _ = self.compute_converged_by_top_proportion(
|
|
204
|
+
group_labels=group_labels,
|
|
205
|
+
proportion=float(p),
|
|
206
|
+
min_t=min_t,
|
|
207
|
+
max_t=max_t,
|
|
208
|
+
window=window,
|
|
209
|
+
min_count=min_count,
|
|
210
|
+
)
|
|
211
|
+
return flags
|
|
212
|
+
|
|
213
|
+
if method_norm == "quantile":
|
|
214
|
+
q = quantile_p if quantile_p is not None else 0.10
|
|
215
|
+
scores = np.asarray(
|
|
216
|
+
self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float
|
|
217
|
+
)
|
|
218
|
+
flags = np.zeros(N, dtype=int)
|
|
219
|
+
if group_labels is None:
|
|
220
|
+
# Global quantile
|
|
221
|
+
valid = scores[np.isfinite(scores)]
|
|
222
|
+
if valid.size == 0:
|
|
223
|
+
return flags.tolist()
|
|
224
|
+
try:
|
|
225
|
+
xq = float(np.nanquantile(scores, q))
|
|
226
|
+
except Exception:
|
|
227
|
+
xq = float(np.quantile(valid, q))
|
|
228
|
+
flags[np.where(scores <= xq)[0]] = 1
|
|
229
|
+
return flags.tolist()
|
|
230
|
+
else:
|
|
231
|
+
labels = np.asarray(group_labels)
|
|
232
|
+
for g in pd.unique(labels):
|
|
233
|
+
idx = np.where(labels == g)[0]
|
|
234
|
+
vals = scores[idx]
|
|
235
|
+
valid = vals[np.isfinite(vals)]
|
|
236
|
+
if valid.size == 0:
|
|
237
|
+
continue
|
|
238
|
+
try:
|
|
239
|
+
xq = float(np.nanquantile(vals, q))
|
|
240
|
+
except Exception:
|
|
241
|
+
xq = float(np.quantile(valid, q))
|
|
242
|
+
local = np.where(vals <= xq)[0]
|
|
243
|
+
flags[idx[local]] = 1
|
|
244
|
+
return flags.tolist()
|
|
245
|
+
|
|
246
|
+
# Default branch: z-score window logic (supports group or global frequencies)
|
|
247
|
+
if group_labels is not None:
|
|
248
|
+
# 组内收敛:使用组内频率和样本大小
|
|
249
|
+
return self._compute_converged_by_group(z_threshold, min_t, max_t, window, inclusive, group_labels)
|
|
250
|
+
|
|
251
|
+
# 使用全局频率计算稀有度
|
|
252
|
+
rarity_matrix = []
|
|
253
|
+
for seq in self.sequences:
|
|
254
|
+
score = []
|
|
255
|
+
for t in range(self.T):
|
|
256
|
+
suffix = tuple(seq[t:])
|
|
257
|
+
freq = self.suffix_freq_by_year[t][suffix] / N
|
|
258
|
+
score.append(-np.log(freq + 1e-10))
|
|
259
|
+
rarity_matrix.append(score)
|
|
260
|
+
|
|
261
|
+
rarity_df = pd.DataFrame(rarity_matrix)
|
|
262
|
+
# 按列 z 标准化;保留 NaN(零方差年份),后续窗口检测时跳过含 NaN 的窗口
|
|
263
|
+
rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
|
|
264
|
+
rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
|
|
265
|
+
|
|
266
|
+
flags = []
|
|
267
|
+
for i in range(N):
|
|
268
|
+
z = rarity_z.iloc[i]
|
|
269
|
+
converged = 0
|
|
270
|
+
for t in range(min_t - 1, max_t): # min_t-1 for 0-indexed, max_t already accounts for window
|
|
271
|
+
# 跳过包含 NaN 的窗口(如零方差年份)
|
|
272
|
+
vals = [z.iloc[t + k] for k in range(window)]
|
|
273
|
+
if not np.all(np.isfinite(vals)):
|
|
274
|
+
continue
|
|
275
|
+
# 收敛 = 低稀有(更典型)
|
|
276
|
+
if inclusive:
|
|
277
|
+
condition = all(v <= -z_threshold for v in vals)
|
|
278
|
+
else:
|
|
279
|
+
condition = all(v < -z_threshold for v in vals)
|
|
280
|
+
if condition:
|
|
281
|
+
converged = 1
|
|
282
|
+
break
|
|
283
|
+
flags.append(converged)
|
|
284
|
+
return flags
|

    def _compute_converged_by_group(self, z_threshold, min_t, max_t, window, inclusive, group_labels):
        """
        Compute within-group convergence: rarity is based on group-level frequencies and sample sizes.
        """
        from collections import defaultdict

        # Build per-group suffix frequency tables
        group_suffix_freq = {}
        group_sizes = {}

        # First collect sequences by group
        group_sequences = defaultdict(list)
        for i, (seq, group) in enumerate(zip(self.sequences, group_labels)):
            group_sequences[group].append((i, seq))

        # Build a suffix frequency table for each group
        for group, seq_list in group_sequences.items():
            group_sizes[group] = len(seq_list)
            freq_by_year = [defaultdict(int) for _ in range(self.T)]

            for _, seq in seq_list:
                for t in range(self.T):
                    suffix = tuple(seq[t:])
                    freq_by_year[t][suffix] += 1

            group_suffix_freq[group] = freq_by_year

        # Compute within-group rarity for each individual
        all_flags = [0] * len(self.sequences)

        for group, seq_list in group_sequences.items():
            group_n = group_sizes[group]
            group_freq = group_suffix_freq[group]

            # Compute the rarity matrix for this group
            rarity_matrix = []
            group_indices = []

            for orig_idx, seq in seq_list:
                group_indices.append(orig_idx)
                score = []
                for t in range(self.T):
                    suffix = tuple(seq[t:])
                    freq = group_freq[t][suffix] / group_n
                    score.append(-np.log(freq + 1e-10))
                rarity_matrix.append(score)

            # Compute z-scores
            rarity_df = pd.DataFrame(rarity_matrix)
            rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
            rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)

            # Determine convergence
            for i, orig_idx in enumerate(group_indices):
                z = rarity_z.iloc[i]
                converged = 0
                for t in range(min_t - 1, max_t):
                    vals = [z.iloc[t + k] for k in range(window)]
                    if not np.all(np.isfinite(vals)):
                        continue
                    if inclusive:
                        condition = all(v <= -z_threshold for v in vals)
                    else:
                        condition = all(v < -z_threshold for v in vals)
                    if condition:
                        converged = 1
                        break

                all_flags[orig_idx] = converged

        return all_flags

    # First-divergence timing is intentionally omitted in this convergence-focused module.

    def compute_first_convergence_year(
        self,
        z_threshold=1.5,
        min_t=1,
        max_t=None,
        window=1,
        inclusive=False,
        group_labels=None,
        *,
        method: str = "zscore",
        proportion: Optional[float] = None,
        quantile_p: Optional[float] = None,
        min_count: int = 1,
    ):
        """
        Compute the first convergence year per individual with multiple selection methods.

        Methods
        -------
        - "zscore" (default):
          Find the earliest starting year t in [min_t, max_t] such that all z-scores in the
          length-`window` block are below `-z_threshold` (or <= if inclusive=True). Zero-variance
          years are NaN; windows containing NaN are skipped.

        - "top_proportion" (aka "topk"/"proportion"/"rank"):
          Use aggregated standardized scores to pick the most typical `proportion` within each group
          (or globally). For the selected individuals, return the earliest t where the per-window
          max z-score is <= the selection threshold; others return None. `min_count` is respected.

        - "quantile":
          Use a per-group (or global) quantile threshold `quantile_p` on aggregated standardized scores;
          individuals at or below the threshold return the earliest qualifying year; others return None.

        Parameters
        ----------
        z_threshold, min_t, max_t, window, inclusive, group_labels
            Same definitions as in `compute_converged` for the zscore method.
        method : str, default "zscore"
            One of {"zscore", "top_proportion" (aliases: "topk", "proportion", "rank"), "quantile"}.
        proportion : float or None
            For top_proportion. Fraction in (0, 1) to select as converged. Defaults to 0.10 if None.
        quantile_p : float or None
            For quantile. Quantile in (0, 1) used as threshold. Defaults to 0.10 if None.
        min_count : int, default 1
            For top_proportion. Lower bound for the number selected per group.

        Returns
        -------
        List[Optional[int]]
            First convergence years (1-indexed). None indicates no convergence.
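
        Example
        -------
        A minimal illustrative sketch (toy data; nobody crosses the default
        threshold here, so every entry is None):

            >>> ind = IndividualConvergence([[1, 2, 3], [1, 2, 3], [2, 2, 3]])
            >>> ind.compute_first_convergence_year(method="zscore")
            [None, None, None]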
"""
|
|
411
|
+
if max_t is None:
|
|
412
|
+
max_t = self.T - window + 1
|
|
413
|
+
|
|
414
|
+
N = len(self.sequences)
|
|
415
|
+
method_norm = (method or "zscore").lower()
|
|
416
|
+
|
|
417
|
+
# Helper: standardized z matrix and per-t window maxima per individual
|
|
418
|
+
def _compute_window_max_list():
|
|
419
|
+
# Build rarity matrix and columnwise z (global standardization)
|
|
420
|
+
rarity_matrix = []
|
|
421
|
+
for seq in self.sequences:
|
|
422
|
+
score = []
|
|
423
|
+
for t in range(self.T):
|
|
424
|
+
suffix = tuple(seq[t:])
|
|
425
|
+
freq = self.suffix_freq_by_year[t][suffix] / N
|
|
426
|
+
score.append(-np.log(freq + 1e-10))
|
|
427
|
+
rarity_matrix.append(score)
|
|
428
|
+
rarity_arr = np.asarray(rarity_matrix, dtype=float)
|
|
429
|
+
col_means = np.nanmean(rarity_arr, axis=0)
|
|
430
|
+
col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
|
|
431
|
+
with np.errstate(invalid='ignore', divide='ignore'):
|
|
432
|
+
rarity_z = (rarity_arr - col_means) / col_stds
|
|
433
|
+
rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
|
|
434
|
+
# Compute per-individual window maxima sequence over t
|
|
435
|
+
window_maxes = [] # list of list per i
|
|
436
|
+
for i in range(N):
|
|
437
|
+
z_scores = rarity_z[i, :]
|
|
438
|
+
vals_per_t = []
|
|
439
|
+
for t0 in range(min_t - 1, max_t):
|
|
440
|
+
vals = [z_scores[t0 + k] for k in range(window)]
|
|
441
|
+
if not np.all(np.isfinite(vals)):
|
|
442
|
+
vals_per_t.append(np.nan)
|
|
443
|
+
else:
|
|
444
|
+
vals_per_t.append(float(np.max(vals)))
|
|
445
|
+
window_maxes.append(vals_per_t)
|
|
446
|
+
return np.asarray(window_maxes, dtype=float)
|
|
447
|
+
|
|
448
|
+
# Branches for rank/quantile-style thresholds
|
|
449
|
+
if method_norm in {"top_proportion", "topk", "proportion", "rank", "quantile"}:
|
|
450
|
+
# Compute aggregated scores for thresholding
|
|
451
|
+
agg_scores = np.asarray(
|
|
452
|
+
self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float
|
|
453
|
+
)
|
|
454
|
+
per_t_window_max = _compute_window_max_list()
|
|
455
|
+
|
|
456
|
+
if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
|
|
457
|
+
p = proportion if proportion is not None else 0.10
|
|
458
|
+
if group_labels is None:
|
|
459
|
+
vals = agg_scores
|
|
460
|
+
finite_mask = np.isfinite(vals)
|
|
461
|
+
n_valid = int(np.sum(finite_mask))
|
|
462
|
+
if n_valid == 0:
|
|
463
|
+
return [None] * N
|
|
464
|
+
k = int(np.floor(p * n_valid))
|
|
465
|
+
if k < int(min_count):
|
|
466
|
+
k = int(min_count)
|
|
467
|
+
if k > n_valid:
|
|
468
|
+
k = n_valid
|
|
469
|
+
order = np.argsort(np.where(np.isfinite(vals), vals, np.inf), kind="mergesort")
|
|
470
|
+
selected_idx = set(order[:k].tolist()) if k >= 1 else set()
|
|
471
|
+
years = []
|
|
472
|
+
for i in range(N):
|
|
473
|
+
if i not in selected_idx:
|
|
474
|
+
years.append(None)
|
|
475
|
+
continue
|
|
476
|
+
wm = per_t_window_max[i]
|
|
477
|
+
# threshold value is kth value
|
|
478
|
+
thresh_val = vals[order[k - 1]] if k >= 1 else np.nan
|
|
479
|
+
if not np.isfinite(thresh_val):
|
|
480
|
+
years.append(None)
|
|
481
|
+
continue
|
|
482
|
+
# earliest t where window_max <= threshold
|
|
483
|
+
yr = None
|
|
484
|
+
for t_idx, wv in enumerate(wm):
|
|
485
|
+
if np.isfinite(wv) and wv <= float(thresh_val):
|
|
486
|
+
yr = t_idx + 1 # 1-indexed
|
|
487
|
+
break
|
|
488
|
+
years.append(yr)
|
|
489
|
+
return years
|
|
490
|
+
else:
|
|
491
|
+
labels = np.asarray(group_labels)
|
|
492
|
+
years = [None] * N
|
|
493
|
+
for g in pd.unique(labels):
|
|
494
|
+
idx = np.where(labels == g)[0]
|
|
495
|
+
vals = agg_scores[idx]
|
|
496
|
+
finite_mask = np.isfinite(vals)
|
|
497
|
+
n_valid = int(np.sum(finite_mask))
|
|
498
|
+
if n_valid == 0:
|
|
499
|
+
continue
|
|
500
|
+
k = int(np.floor(p * n_valid))
|
|
501
|
+
if k < int(min_count):
|
|
502
|
+
k = int(min_count)
|
|
503
|
+
if k > n_valid:
|
|
504
|
+
k = n_valid
|
|
505
|
+
order_local = np.argsort(np.where(np.isfinite(vals), vals, np.inf), kind="mergesort")
|
|
506
|
+
selected_local = set(order_local[:k].tolist()) if k >= 1 else set()
|
|
507
|
+
thresh_val = vals[order_local[k - 1]] if k >= 1 else np.nan
|
|
508
|
+
for j_local, i_global in enumerate(idx):
|
|
509
|
+
if j_local not in selected_local or not np.isfinite(thresh_val):
|
|
510
|
+
continue
|
|
511
|
+
wm = per_t_window_max[i_global]
|
|
512
|
+
for t_idx, wv in enumerate(wm):
|
|
513
|
+
if np.isfinite(wv) and wv <= float(thresh_val):
|
|
514
|
+
years[i_global] = int(t_idx + 1)
|
|
515
|
+
break
|
|
516
|
+
return years
|
|
517
|
+
|
|
518
|
+
# quantile branch
|
|
519
|
+
q = quantile_p if quantile_p is not None else 0.10
|
|
520
|
+
years = [None] * N
|
|
521
|
+
if group_labels is None:
|
|
522
|
+
valid = agg_scores[np.isfinite(agg_scores)]
|
|
523
|
+
if valid.size == 0:
|
|
524
|
+
return years
|
|
525
|
+
try:
|
|
526
|
+
xq = float(np.nanquantile(agg_scores, q))
|
|
527
|
+
except Exception:
|
|
528
|
+
xq = float(np.quantile(valid, q))
|
|
529
|
+
for i in range(N):
|
|
530
|
+
if not np.isfinite(agg_scores[i]) or agg_scores[i] > xq:
|
|
531
|
+
continue
|
|
532
|
+
wm = per_t_window_max[i]
|
|
533
|
+
for t_idx, wv in enumerate(wm):
|
|
534
|
+
if np.isfinite(wv) and wv <= xq:
|
|
535
|
+
years[i] = int(t_idx + 1)
|
|
536
|
+
break
|
|
537
|
+
return years
|
|
538
|
+
else:
|
|
539
|
+
labels = np.asarray(group_labels)
|
|
540
|
+
for g in pd.unique(labels):
|
|
541
|
+
idx = np.where(labels == g)[0]
|
|
542
|
+
vals = agg_scores[idx]
|
|
543
|
+
valid = vals[np.isfinite(vals)]
|
|
544
|
+
if valid.size == 0:
|
|
545
|
+
continue
|
|
546
|
+
try:
|
|
547
|
+
xq = float(np.nanquantile(vals, q))
|
|
548
|
+
except Exception:
|
|
549
|
+
xq = float(np.quantile(valid, q))
|
|
550
|
+
for j_local, i_global in enumerate(idx):
|
|
551
|
+
if not np.isfinite(vals[j_local]) or vals[j_local] > xq:
|
|
552
|
+
continue
|
|
553
|
+
wm = per_t_window_max[i_global]
|
|
554
|
+
for t_idx, wv in enumerate(wm):
|
|
555
|
+
if np.isfinite(wv) and wv <= xq:
|
|
556
|
+
years[i_global] = t_idx + 1
|
|
557
|
+
break
|
|
558
|
+
return years
|
|
559
|
+
|
|
560
|
+
if group_labels is not None and method_norm == "zscore":
|
|
561
|
+
# 组内收敛:使用组内频率和样本大小
|
|
562
|
+
return self._compute_first_convergence_year_by_group(z_threshold, min_t, max_t, window, inclusive, group_labels)
|
|
563
|
+
|
|
564
|
+
# 使用全局频率计算稀有度
|
|
565
|
+
rarity_matrix = []
|
|
566
|
+
for seq in self.sequences:
|
|
567
|
+
score = []
|
|
568
|
+
for t in range(self.T):
|
|
569
|
+
suffix = tuple(seq[t:])
|
|
570
|
+
freq = self.suffix_freq_by_year[t][suffix] / N
|
|
571
|
+
score.append(-np.log(freq + 1e-10))
|
|
572
|
+
rarity_matrix.append(score)
|
|
573
|
+
|
|
574
|
+
rarity_df = pd.DataFrame(rarity_matrix)
|
|
575
|
+
rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
|
|
576
|
+
# 保留 NaN 以便跳过含零方差年份的窗口
|
|
577
|
+
rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
|
|
578
|
+
|
|
579
|
+
years = []
|
|
580
|
+
for i in range(N):
|
|
581
|
+
z = rarity_z.iloc[i]
|
|
582
|
+
year = None
|
|
583
|
+
for t in range(min_t - 1, max_t): # min_t-1 for 0-indexed, max_t already accounts for window
|
|
584
|
+
vals = [z.iloc[t + k] for k in range(window)]
|
|
585
|
+
if not np.all(np.isfinite(vals)):
|
|
586
|
+
continue
|
|
587
|
+
# 收敛 = 低稀有(更典型)
|
|
588
|
+
if inclusive:
|
|
589
|
+
condition = all(v <= -z_threshold for v in vals)
|
|
590
|
+
else:
|
|
591
|
+
condition = all(v < -z_threshold for v in vals)
|
|
592
|
+
if condition:
|
|
593
|
+
year = int(t + 1) # Convert to 1-indexed integer
|
|
594
|
+
break
|
|
595
|
+
years.append(year)
|
|
596
|
+
return years
|
|
597
|
+
|
|
598
|
+
def _compute_first_convergence_year_by_group(self, z_threshold, min_t, max_t, window, inclusive, group_labels):
|
|
599
|
+
"""
|
|
600
|
+
计算组内第一次收敛年份:使用组内频率和样本大小计算稀有度
|
|
601
|
+
"""
|
|
602
|
+
from collections import defaultdict
|
|
603
|
+
|
|
604
|
+
# 按组构建 suffix 频率表(重用 _compute_converged_by_group 的逻辑)
|
|
605
|
+
group_sequences = defaultdict(list)
|
|
606
|
+
for i, (seq, group) in enumerate(zip(self.sequences, group_labels)):
|
|
607
|
+
group_sequences[group].append((i, seq))
|
|
608
|
+
|
|
609
|
+
# 为每个组构建 suffix 频率表
|
|
610
|
+
group_suffix_freq = {}
|
|
611
|
+
group_sizes = {}
|
|
612
|
+
for group, seq_list in group_sequences.items():
|
|
613
|
+
group_sizes[group] = len(seq_list)
|
|
614
|
+
freq_by_year = [defaultdict(int) for _ in range(self.T)]
|
|
615
|
+
|
|
616
|
+
for _, seq in seq_list:
|
|
617
|
+
for t in range(self.T):
|
|
618
|
+
suffix = tuple(seq[t:])
|
|
619
|
+
freq_by_year[t][suffix] += 1
|
|
620
|
+
|
|
621
|
+
group_suffix_freq[group] = freq_by_year
|
|
622
|
+
|
|
623
|
+
# 为每个个体计算组内收敛年份
|
|
624
|
+
all_years = [None] * len(self.sequences)
|
|
625
|
+
|
|
626
|
+
for group, seq_list in group_sequences.items():
|
|
627
|
+
group_n = group_sizes[group]
|
|
628
|
+
group_freq = group_suffix_freq[group]
|
|
629
|
+
|
|
630
|
+
# 计算该组的稀有度矩阵
|
|
631
|
+
rarity_matrix = []
|
|
632
|
+
group_indices = []
|
|
633
|
+
|
|
634
|
+
for orig_idx, seq in seq_list:
|
|
635
|
+
group_indices.append(orig_idx)
|
|
636
|
+
score = []
|
|
637
|
+
for t in range(self.T):
|
|
638
|
+
suffix = tuple(seq[t:])
|
|
639
|
+
freq = group_freq[t][suffix] / group_n
|
|
640
|
+
score.append(-np.log(freq + 1e-10))
|
|
641
|
+
rarity_matrix.append(score)
|
|
642
|
+
|
|
643
|
+
# 计算 z 分数
|
|
644
|
+
rarity_df = pd.DataFrame(rarity_matrix)
|
|
645
|
+
rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
|
|
646
|
+
rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
|
|
647
|
+
|
|
648
|
+
# 寻找第一次收敛年份
|
|
649
|
+
for i, orig_idx in enumerate(group_indices):
|
|
650
|
+
z = rarity_z.iloc[i]
|
|
651
|
+
year = None
|
|
652
|
+
for t in range(min_t - 1, max_t):
|
|
653
|
+
vals = [z.iloc[t + k] for k in range(window)]
|
|
654
|
+
if not np.all(np.isfinite(vals)):
|
|
655
|
+
continue
|
|
656
|
+
if inclusive:
|
|
657
|
+
condition = all(v <= -z_threshold for v in vals)
|
|
658
|
+
else:
|
|
659
|
+
condition = all(v < -z_threshold for v in vals)
|
|
660
|
+
if condition:
|
|
661
|
+
year = int(t + 1)
|
|
662
|
+
break
|
|
663
|
+
|
|
664
|
+
all_years[orig_idx] = year
|
|
665
|
+
|
|
666
|
+
return all_years
|
|
667
|
+
|
|
668
|
+
def compute_suffix_rarity_per_year(self, as_dataframe: bool = True, column_prefix: str = "t", zscore: bool = False):
|
|
669
|
+
"""
|
|
670
|
+
Compute per-year suffix rarity scores for each individual.
|
|
671
|
+
|
|
672
|
+
Definition (mirror of prefix rarity):
|
|
673
|
+
rarity_{i,t} = -log( freq(suffix_{i,t}) / N ) >= 0
|
|
674
|
+
|
|
675
|
+
Where suffix_{i,t} is the observed suffix from year t to end for person i,
|
|
676
|
+
and N is total number of individuals. Higher means rarer (less typical).
|
|
677
|
+
|
|
678
|
+
Parameters
|
|
679
|
+
----------
|
|
680
|
+
as_dataframe : bool, default True
|
|
681
|
+
If True, returns a pandas DataFrame with columns f"{column_prefix}1"..f"{column_prefix}T".
|
|
682
|
+
If False, returns a NumPy array of shape (N, T).
|
|
683
|
+
column_prefix : str, default "t"
|
|
684
|
+
Column name prefix when returning a DataFrame.
|
|
685
|
+
zscore : bool, default False
|
|
686
|
+
If True, z-standardize the rarity scores column-wise (by year).
|
|
687
|
+
|
|
688
|
+
Returns
|
|
689
|
+
-------
|
|
690
|
+
pandas.DataFrame or np.ndarray
|
|
691
|
+
Per-year rarity scores (optionally z-scored).
|
|
692
|
+
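
        Example
        -------
        A minimal illustrative sketch (toy data): shared suffixes get rarity
        near 0, while the minority suffix (3, 2) appears once out of three,
        giving -log(1/3) ≈ 1.1 in column t1:

            >>> ind = IndividualConvergence([[1, 2], [1, 2], [3, 2]])
            >>> df = ind.compute_suffix_rarity_per_year()
            >>> list(df.columns)
            ['t1', 't2']
            >>> df.shape
            (3, 2)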
"""
|
|
693
|
+
N = len(self.sequences)
|
|
694
|
+
rarity_matrix = []
|
|
695
|
+
|
|
696
|
+
for seq in self.sequences:
|
|
697
|
+
score_list = []
|
|
698
|
+
for t in range(self.T):
|
|
699
|
+
suffix = tuple(seq[t:])
|
|
700
|
+
freq = self.suffix_freq_by_year[t][suffix] / N
|
|
701
|
+
score_list.append(-np.log(freq + 1e-10))
|
|
702
|
+
rarity_matrix.append(score_list)
|
|
703
|
+
|
|
704
|
+
rarity_arr = np.array(rarity_matrix, dtype=float)
|
|
705
|
+
|
|
706
|
+
if zscore:
|
|
707
|
+
col_means = np.nanmean(rarity_arr, axis=0)
|
|
708
|
+
col_stds = np.nanstd(rarity_arr, axis=0, ddof=1) # 与 pandas DataFrame.std() 保持一致
|
|
709
|
+
with np.errstate(invalid='ignore', divide='ignore'):
|
|
710
|
+
rarity_arr = (rarity_arr - col_means) / col_stds
|
|
711
|
+
|
|
712
|
+
if not as_dataframe:
|
|
713
|
+
return rarity_arr
|
|
714
|
+
|
|
715
|
+
columns = [f"{column_prefix}{t+1}" for t in range(self.T)]
|
|
716
|
+
return pd.DataFrame(rarity_arr, columns=columns)
|
|
717
|
+
|
|
718
|
+
def compute_suffix_rarity_score(self):
|
|
719
|
+
"""
|
|
720
|
+
Compute cumulative suffix rarity score for each individual:
|
|
721
|
+
rarity_score_i = sum_{t=1}^T -log( freq(suffix_{i,t}) / N )
|
|
722
|
+
|
|
723
|
+
Higher scores indicate rarer, less typical future paths from each year onward.
|
|
724
|
+
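
        Example
        -------
        A minimal illustrative sketch (toy data): the sequence with the rarer
        ending accumulates the larger score:

            >>> ind = IndividualConvergence([[1, 2], [1, 2], [3, 2]])
            >>> scores = ind.compute_suffix_rarity_score()
            >>> bool(scores[2] > scores[0])
            True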
"""
|
|
725
|
+
rarity_scores = []
|
|
726
|
+
N = len(self.sequences)
|
|
727
|
+
|
|
728
|
+
for seq in self.sequences:
|
|
729
|
+
score = 0.0
|
|
730
|
+
for t in range(self.T):
|
|
731
|
+
suffix = tuple(seq[t:])
|
|
732
|
+
freq = self.suffix_freq_by_year[t][suffix] / N
|
|
733
|
+
score += -np.log(freq + 1e-10)
|
|
734
|
+
rarity_scores.append(score)
|
|
735
|
+
return rarity_scores
|
|
736
|
+
|
|
737
|
+
def compute_standardized_rarity_score(self, min_t=1, max_t=None, window=1):
|
|
738
|
+
"""
|
|
739
|
+
Compute standardized rarity scores for convergence classification and visualization
|
|
740
|
+
using true statistical z-scores.
|
|
741
|
+
|
|
742
|
+
This method computes standardized rarity scores used for individual-level
|
|
743
|
+
convergence classification:
|
|
744
|
+
standardized_score_i = min_t max_{k=0..window-1} z_{i,t+k}
|
|
745
|
+
|
|
746
|
+
Where z_{i,t} are the year-wise true z-scores of suffix rarity computed column-wise
|
|
747
|
+
across individuals with sample standard deviation (ddof=1):
|
|
748
|
+
z_{i,t} = (x_{i,t} - mean_t) / std_t
|
|
749
|
+
|
|
750
|
+
The standardized scores can be used with a threshold (e.g., z <= -1.5) to classify
|
|
751
|
+
individuals as converged/not converged, and are particularly useful for visualization.
|
|
752
|
+
|
|
753
|
+
Note: For convergence (suffix tree), we look for LOW rarity (more typical patterns),
|
|
754
|
+
so lower z-scores indicate convergence. This is opposite to prefix tree divergence.
|
|
755
|
+
|
|
756
|
+
Parameters:
|
|
757
|
+
-----------
|
|
758
|
+
min_t : int, default=1
|
|
759
|
+
Minimum year (1-indexed) after which convergence is considered valid.
|
|
760
|
+
max_t : int, optional
|
|
761
|
+
Maximum year (1-indexed) before which convergence is considered valid.
|
|
762
|
+
If None, uses T-window+1.
|
|
763
|
+
window : int, default=1
|
|
764
|
+
Number of consecutive low-z years required
|
|
765
|
+
|
|
766
|
+
Returns:
|
|
767
|
+
--------
|
|
768
|
+
List[float]
|
|
769
|
+
Standardized rarity scores for each individual. Values <= -z_threshold indicate convergence.
|
|
770
|
+
|
|
771
|
+
Notes:
|
|
772
|
+
------
|
|
773
|
+
The standardization uses sample standard deviation (ddof=1) for each year column,
|
|
774
|
+
which is consistent with pandas' default behavior for DataFrame.std().
|
|
775
|
+
This is essentially the z-score normalized version of suffix rarity scores.
|
|
776
|
+
For convergence detection, we look for the MINIMUM z-score (most typical behavior).
|
|
777
|
+
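
        Example
        -------
        A minimal illustrative sketch (toy data): lower (more negative)
        scores mark more typical endings, so the common-suffix individual
        scores below the outlier:

            >>> ind = IndividualConvergence([[1, 2, 3], [1, 2, 3], [2, 2, 3]])
            >>> s = ind.compute_standardized_rarity_score()
            >>> s[0] < s[2]
            True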
"""
|
|
778
|
+
if max_t is None:
|
|
779
|
+
max_t = self.T - window + 1
|
|
780
|
+
|
|
781
|
+
N = len(self.sequences)
|
|
782
|
+
|
|
783
|
+
# Step 1: Calculate rarity matrix
|
|
784
|
+
rarity_matrix = []
|
|
785
|
+
for seq in self.sequences:
|
|
786
|
+
score = []
|
|
787
|
+
for t in range(self.T):
|
|
788
|
+
suffix = tuple(seq[t:])
|
|
789
|
+
freq = self.suffix_freq_by_year[t][suffix] / N
|
|
790
|
+
score.append(-np.log(freq + 1e-10))
|
|
791
|
+
rarity_matrix.append(score)
|
|
792
|
+
|
|
793
|
+
# Step 2: Column-wise true z-score standardization (by year, ddof=1)
|
|
794
|
+
rarity_arr = np.asarray(rarity_matrix, dtype=float)
|
|
795
|
+
col_means = np.nanmean(rarity_arr, axis=0)
|
|
796
|
+
col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
|
|
797
|
+
with np.errstate(invalid='ignore', divide='ignore'):
|
|
798
|
+
rarity_z = (rarity_arr - col_means) / col_stds
|
|
799
|
+
# Keep NaN for zero-variance years to allow window skipping downstream
|
|
800
|
+
rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
|
|
801
|
+
|
|
802
|
+
# Step 3: Compute standardized rarity score for each individual
|
|
803
|
+
standardized_scores = []
|
|
804
|
+
for i in range(N):
|
|
805
|
+
z_scores = rarity_z[i, :]
|
|
806
|
+
candidate_values = []
|
|
807
|
+
|
|
808
|
+
# For each possible starting time t
|
|
809
|
+
for t in range(min_t - 1, max_t): # min_t-1 for 0-indexed
|
|
810
|
+
vals = [z_scores[t + k] for k in range(window)]
|
|
811
|
+
# Skip windows containing NaN (e.g., zero-variance years)
|
|
812
|
+
if not np.all(np.isfinite(vals)):
|
|
813
|
+
continue
|
|
814
|
+
# For convergence, take maximum within window (ensure all finite)
|
|
815
|
+
window_max = float(np.max(vals))
|
|
816
|
+
candidate_values.append(window_max)
|
|
817
|
+
|
|
818
|
+
# Take the minimum across all starting times (most convergent period)
|
|
819
|
+
if candidate_values:
|
|
820
|
+
standardized_score = float(np.min(candidate_values))
|
|
821
|
+
else:
|
|
822
|
+
standardized_score = np.nan
|
|
823
|
+
|
|
824
|
+
standardized_scores.append(standardized_score)
|
|
825
|
+
|
|
826
|
+
return standardized_scores
|
|
827
|
+
|
|
828
|
+
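    # Usage sketch (hypothetical object name `tree`; assumes an instance of this
    # class with `sequences` and `suffix_freq_by_year` populated):
    #
    #   scores = tree.compute_standardized_rarity_score(min_t=1, window=2)
    #   converged = [np.isfinite(s) and s <= -1.5 for s in scores]
    #
    # With window=2, an individual needs two consecutive years whose z-scores are
    # both low, because the window maximum is what gets minimized over t.
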
    def compute_converged_by_top_proportion(
        self,
        group_labels,
        proportion: float = 0.10,
        min_t: int = 1,
        max_t: Optional[int] = None,
        window: int = 1,
        min_count: int = 1,
    ):
        """
        Classify convergence by selecting the top proportion of most typical (smallest)
        standardized scores WITHIN EACH GROUP (e.g., country). This ensures identical
        proportion thresholds across groups, independent of distribution shape or discreteness.

        Steps:
        1) Compute the true-z standardized rarity score per individual using
           `compute_standardized_rarity_score(min_t, max_t, window)`.
        2) For each group g, sort scores ascending and select the first
           k = min(n_g, max(min_count, floor(p * n_g))) indices as convergers.

        Parameters
        ----------
        group_labels : Sequence
            Group label per individual (e.g., country). Length must equal the number of sequences.
        proportion : float, default 0.10
            Top proportion p to mark as converged within each group (0 < p < 1).
        min_t : int, default 1
            Minimum year considered in the aggregated score.
        max_t : Optional[int], default None
            Maximum starting year considered; if None, uses T - window + 1.
        window : int, default 1
            Number of consecutive years in the aggregated statistic.
        min_count : int, default 1
            Minimum number selected per group (useful for very small groups).

        Returns
        -------
        tuple[List[int], dict]
            (flags, info) where flags is a 0/1 list for convergence, and info is per-group metadata:
            {group: {"k": int, "n": int, "threshold_value": float}}
        """
        if not (0 < float(proportion) < 1):
            raise ValueError(f"proportion must be in (0,1), got {proportion}")

        N = len(self.sequences)
        if len(group_labels) != N:
            raise ValueError("Length of group_labels must match number of sequences")

        # 1) Compute aggregated standardized score (lower = more typical)
        scores = np.asarray(self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float)

        labels = np.asarray(group_labels)
        flags = np.zeros(N, dtype=int)
        info = {}

        # Iterate groups deterministically by sorted group name for reproducibility
        for g in sorted(pd.unique(labels)):
            idx = np.where(labels == g)[0]
            vals = scores[idx]

            n_g = len(idx)
            if n_g == 0:
                info[g] = {"k": 0, "n": 0, "threshold_value": np.nan}
                continue

            # Determine k with lower bound min_count and upper bound n_g
            k = int(np.floor(proportion * n_g))
            if k < min_count:
                k = min_count
            if k > n_g:
                k = n_g

            # Treat NaN as worst (push to the end); still allow exact k selection
            order_vals = np.where(np.isfinite(vals), vals, np.inf)
            order = np.argsort(order_vals, kind="mergesort")  # stable for tie-breaking

            if k >= 1:
                selected_local = order[:k]
                selected_global = idx[selected_local]
                flags[selected_global] = 1
                kth_val = vals[order[k - 1]]
                kth_val = float(kth_val) if np.isfinite(kth_val) else np.nan
            else:
                selected_local = np.array([], dtype=int)
                kth_val = np.nan

            info[g] = {"k": int(k), "n": int(n_g), "threshold_value": kth_val}

        return flags.tolist(), info

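    # Usage sketch (hypothetical `tree` and `countries`; mirrors the docstring):
    #
    #   flags, info = tree.compute_converged_by_top_proportion(
    #       group_labels=countries, proportion=0.10, window=2)
    #
    # For a group g with n_g = 250, k = floor(0.10 * 250) = 25 individuals are
    # flagged, and info[g]["threshold_value"] is the 25th-smallest score.
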
    def diagnose_convergence_calculation(self, z_threshold=1.5, max_t=None, window=1, inclusive=False, group_labels=None):
        """
        Diagnostic function to analyze the convergence-year calculation and identify
        years with insufficient variance (std ≈ 0) that cannot trigger convergence.

        This is methodologically appropriate: when all individuals follow similar
        trajectories in a given year, no convergence should be detected.

        Returns:
        --------
        dict: Diagnostic information including:
            - years_with_zero_variance: List of years where std ≈ 0
            - rarity_std_by_year: Standard deviation of rarity scores per year
            - n_individuals_with_convergence: Count of individuals with any convergence
            - convergence_year_distribution: Value counts of convergence years
        """
        if max_t is None:
            max_t = self.T - window + 1

        N = len(self.sequences)
        rarity_matrix = []

        # Calculate rarity scores (same as in compute_first_convergence_year)
        for seq in self.sequences:
            score = []
            for t in range(self.T):
                suffix = tuple(seq[t:])
                freq = self.suffix_freq_by_year[t][suffix] / N
                score.append(-np.log(freq + 1e-10))
            rarity_matrix.append(score)

        rarity_df = pd.DataFrame(rarity_matrix)

        # Calculate standard deviations by year
        rarity_std_by_year = rarity_df.std(axis=0)
        years_with_zero_variance = []

        # Identify years with near-zero variance (threshold can be adjusted)
        for t, std_val in enumerate(rarity_std_by_year):
            if pd.isna(std_val) or std_val < 1e-10:
                years_with_zero_variance.append(t + 1)  # 1-indexed

        # Calculate z-scores
        rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)

        # Count individuals with convergence
        convergence_years = self.compute_first_convergence_year(
            z_threshold=z_threshold, min_t=1, max_t=max_t, window=window,
            inclusive=inclusive, group_labels=group_labels
        )
        n_individuals_with_convergence = sum(1 for year in convergence_years if year is not None)

        # Distribution of convergence years
        convergence_year_counts = pd.Series(convergence_years).value_counts(dropna=False).sort_index()

        return {
            'years_with_zero_variance': years_with_zero_variance,
            'rarity_std_by_year': rarity_std_by_year.tolist(),
            'n_individuals_with_convergence': n_individuals_with_convergence,
            'convergence_year_distribution': convergence_year_counts.to_dict(),
            'total_individuals': N,
            'parameters_used': {
                'z_threshold': z_threshold,
                'max_t': max_t,
                'window': window,
                'inclusive': inclusive,
                'group_labels': group_labels is not None
            }
        }

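    # Usage sketch (hypothetical `tree`); the returned dict is plain data:
    #
    #   diag = tree.diagnose_convergence_calculation(z_threshold=1.5, window=2)
    #   diag["years_with_zero_variance"]         # years that can never converge
    #   diag["n_individuals_with_convergence"]   # out of diag["total_individuals"]
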
    def compute_path_uniqueness(self):
        """
        Count, for each individual, how many years t their suffix (from t to the end)
        is unique in the population (frequency == 1). Uses suffix-based logic.
        """
        uniqueness_scores = []
        for seq in self.sequences:
            count = 0
            for t in range(self.T):
                suffix = tuple(seq[t:])
                if self.suffix_freq_by_year[t][suffix] == 1:
                    count += 1
            uniqueness_scores.append(count)
        return uniqueness_scores

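    # Toy example for compute_path_uniqueness (a sketch): with sequences
    # [1, 1, 2] and [1, 2, 2], the year-0 suffixes (1, 1, 2) and (1, 2, 2) are
    # unique, as are the year-1 suffixes (1, 2) and (2, 2), while the year-2
    # suffix (2,) is shared by both, so each individual scores 2.

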
def plot_suffix_rarity_distribution(
    data,
    # === Core Parameters ===
    group_names=None,
    colors=None,
    # === Threshold Settings ===
    show_threshold=True,
    threshold_method="top_proportion",  # Changed default to top_proportion
    proportion_p=0.07,  # Simplified parameter name, default 7%
    # === Plotting Options ===
    figsize=(10, 6),
    kde_bw=None,
    # === Export Options ===
    save_as=None,
    dpi=300,
    show=True,
    # === Parameters for Different Methods ===
    z_threshold=1.5,
    is_standardized_score=False,
    quantile_p=0.10
):
    """
    Plot suffix rarity score distribution(s) with clean threshold lines.

    Parameters
    ----------
    data : dict or array-like
        Data to plot. If dict: {"group1": scores1, "group2": scores2}.
        If array-like: single-group data.
    group_names : list, optional
        Custom group names. Auto-detected from dict keys if not provided.
    colors : dict or list, optional
        Colors for groups. If None, uses the default palette.

    show_threshold : bool, default True
        Whether to show threshold vertical lines.
    threshold_method : str, default "top_proportion"
        Threshold method:
        - "top_proportion": Select the top proportion_p share (the smallest values)
        - "quantile": Use the quantile_p percentile
        - "zscore": Use a z-score threshold (for standardized data)
    proportion_p : float, default 0.07
        Proportion for the top_proportion method (e.g., 0.07 = top 7%).

    figsize : tuple, default (10, 6)
        Figure size (width, height).
    kde_bw : float, optional
        KDE bandwidth adjustment. If None, uses the seaborn default.

    save_as : str, optional
        Save path (without extension).
    dpi : int, default 300
        Resolution for the saved figure.
    show : bool, default True
        Whether to display the plot.

    Returns
    -------
    dict
        Statistics including threshold values per group.

    Examples
    --------
    # Basic usage - top 7% threshold (default)
    >>> plot_suffix_rarity_distribution({"India": india_scores, "US": us_scores})

    # Custom threshold proportion
    >>> plot_suffix_rarity_distribution(
    ...     data={"India": india_scores, "US": us_scores},
    ...     proportion_p=0.03,  # top 3%
    ...     save_as="rarity_comparison"
    ... )

    # Quantile-based threshold
    >>> plot_suffix_rarity_distribution(
    ...     data={"India": india_scores, "US": us_scores},
    ...     threshold_method="quantile",
    ...     quantile_p=0.10,  # 10th percentile
    ... )

    # Clean plot without thresholds
    >>> plot_suffix_rarity_distribution(
    ...     data,
    ...     show_threshold=False,
    ...     colors={"India": "#E8B88A", "US": "#A3BFD9"}
    ... )
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np

    # Process input data
    if isinstance(data, dict):
        # Multi-group case
        groups = data
        if group_names is None:
            group_names = list(groups.keys())
    else:
        # Single-group case
        if group_names is None:
            group_names = ["Group"]
        groups = {group_names[0]: data}

    # Set up colors (simplified)
    if colors is None:
        default_colors = ["#A3BFD9", "#E8B88A", "#C6A5CF", "#A6C1A9", "#F4A460", "#87CEEB"]
        color_map = dict(zip(group_names, default_colors[:len(group_names)]))
    elif isinstance(colors, dict):
        color_map = colors
    else:
        color_map = dict(zip(group_names, colors))

    # Normalize method and prepare stats
    threshold_method = (threshold_method or "top_proportion").lower()

    # Handle legacy parameter mapping
    if threshold_method in {"top_proportion", "topk", "proportion", "rank"}:
        # Use the simplified proportion_p parameter
        top_proportion_p = proportion_p
        topk_min_count = 1
    elif threshold_method == "quantile":
        # Use quantile_p for the quantile method
        pass
    elif threshold_method in {"zscore", "z"} and is_standardized_score:
        # Auto-handle standardized scores
        pass

    stats = {"per_group": {}, "threshold_method": threshold_method}

    # Validate quantiles if needed
    def _check_q(q: float):
        if not (0 < float(q) < 1):
            raise ValueError(f"quantile must be in (0,1), got {q}")

    quantiles_to_draw = None
    if threshold_method == "quantile":
        _check_q(quantile_p)
        quantiles_to_draw = [quantile_p]  # Simplified - no additional_quantiles
        # Per-group quantile(s)
        for g in group_names:
            if g in groups:
                arr = np.asarray(groups[g], dtype=float)
                # Compute requested quantiles with NaN handling
                valid = arr[~np.isnan(arr)]
                thresholds_g = {}
                if valid.size > 0:
                    for q in quantiles_to_draw:
                        try:
                            xq = float(np.nanquantile(arr, q))
                        except Exception:
                            xq = float(np.quantile(valid, q))
                        thresholds_g[f"p{int(round(q*100)):02d}"] = xq
                else:
                    for q in quantiles_to_draw:
                        thresholds_g[f"p{int(round(q*100)):02d}"] = np.nan
                # Primary threshold (for backward compatibility)
                primary_label = f"p{int(round(quantile_p*100)):02d}"
                primary_value = thresholds_g.get(primary_label, np.nan)
                # Proportion below primary
                vals = valid
                prop_below = float(np.nanmean(vals <= primary_value)) if vals.size > 0 and not np.isnan(primary_value) else np.nan
                stats["per_group"][g] = {
                    "threshold_values": thresholds_g,
                    "is_group_relative": True,
                    "threshold_value": primary_value,
                    "primary_quantile": primary_label,
                    "prop_below": prop_below
                }
    elif threshold_method in {"zscore", "z"}:
        # z-score method (backward compatibility)
        for g in group_names:
            if g in groups:
                arr = np.asarray(groups[g], dtype=float)
                mean_g = np.nanmean(arr)
                std_g = np.nanstd(arr, ddof=1)  # sample std to match pandas
                if is_standardized_score:
                    x_thresh_g = -float(z_threshold)
                else:
                    x_thresh_g = float(mean_g - z_threshold * std_g)
                vals = arr[~np.isnan(arr)]
                prop_below = float(np.nanmean(vals <= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else np.nan
                stats["per_group"][g] = {
                    "mean": float(mean_g),
                    "std": float(std_g),
                    "threshold_value": float(x_thresh_g),
                    "z_threshold": float(z_threshold),
                    "is_group_relative": True,
                    "prop_below": prop_below,
                    "num_below": int(np.sum(vals <= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else 0,
                    "n": int(vals.size)
                }
    elif threshold_method in {"topk", "top_proportion", "proportion", "rank"}:
        # Rank-based proportion selection within each group: pick the top p% (smallest values)
        if not (0 < float(proportion_p) < 1):
            raise ValueError(f"proportion_p must be in (0,1), got {proportion_p}")
        top_proportion_p = proportion_p  # Map to internal variable
        for g in group_names:
            if g in groups:
                arr = np.asarray(groups[g], dtype=float)
                finite_mask = np.isfinite(arr)
                vals = arr[finite_mask]
                n_valid = int(vals.size)
                if n_valid == 0:
                    stats["per_group"][g] = {
                        "threshold_value": np.nan,
                        "k": 0,
                        "n": 0,
                        "prop_selected": np.nan,
                        "num_leq_threshold": 0
                    }
                    continue
                k = int(np.floor(top_proportion_p * n_valid))
                if k < int(topk_min_count):
                    k = int(topk_min_count)
                if k > n_valid:
                    k = n_valid
                # Sort ascending (most typical first)
                order = np.argsort(vals, kind="mergesort")
                thresh_val = vals[order[k - 1]] if k >= 1 else np.nan
                num_leq = int(np.sum(vals <= thresh_val)) if k >= 1 and np.isfinite(thresh_val) else 0
                stats["per_group"][g] = {
                    "threshold_value": float(thresh_val) if np.isfinite(thresh_val) else np.nan,
                    "k": int(k),
                    "n": int(n_valid),
                    "prop_selected": (k / n_valid) if n_valid > 0 else np.nan,
                    "num_leq_threshold": num_leq
                }
        stats["threshold_method"] = "topk"
    else:
        raise ValueError(f"Unknown threshold_method: {threshold_method}")

    # Create plot
    plt.figure(figsize=figsize)

    # Plot distributions
    for idx, group_name in enumerate(group_names):
        if group_name in groups:
            scores = groups[group_name]
            color = color_map.get(group_name, "#1f77b4")
            arr = np.asarray(scores, dtype=float)
            vmin = np.nanmin(arr) if np.isfinite(arr).any() else None
            vmax = np.nanmax(arr) if np.isfinite(arr).any() else None
            kde_kwargs = {"label": group_name, "fill": True, "color": color, "linewidth": 2}
            if kde_bw is not None:
                kde_kwargs["bw_adjust"] = kde_bw
            if vmin is not None and vmax is not None and vmin < vmax:
                kde_kwargs["clip"] = (vmin, vmax)
            sns.kdeplot(arr, **kde_kwargs)

    # Add per-group threshold lines if requested (color-matched)
    if show_threshold:
        for i, g in enumerate(group_names):
            if g in stats["per_group"]:
                color = color_map.get(g, "#1f77b4")
                ax = plt.gca()
                y_max = ax.get_ylim()[1]
                x_min, x_max = ax.get_xlim()
                text_y = y_max * 0.9
                x_offset = (x_max - x_min) * 0.005 * (i + 1)
                if threshold_method == "quantile":
                    thresholds_g = stats["per_group"][g]["threshold_values"]
                    # Draw multiple lines if multiple quantiles
                    for k_idx, (q_lbl, xg) in enumerate(sorted(thresholds_g.items())):
                        if np.isnan(xg):
                            continue
                        # Clean threshold line without text label
                        plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
                elif threshold_method in {"zscore", "z"}:
                    xg = stats["per_group"][g]["threshold_value"]
                    # Clean threshold line without text label
                    plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
                else:  # top_proportion
                    xg = stats["per_group"][g]["threshold_value"]
                    if np.isfinite(xg):
                        # Clean threshold line without text label
                        plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)

    # Formatting
    if is_standardized_score:
        plt.xlabel("Standardized Suffix Rarity Score", fontsize=13)
    else:
        plt.xlabel("Suffix Rarity Score", fontsize=13)
    plt.ylabel("Density", fontsize=13)
    if len(group_names) > 1:
        plt.legend(title="Group")
    sns.despine()
    plt.tight_layout()

    # Save and show
    if save_as:
        plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')

    if show:
        plt.show()

    return stats

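# Usage sketch for the returned stats (hypothetical score arrays; see the
# docstring examples above):
#
#   stats = plot_suffix_rarity_distribution(
#       {"India": india_scores, "US": us_scores}, show=False)
#   for g, s in stats["per_group"].items():
#       print(g, s["threshold_value"], s.get("k"), s.get("n"))
#
# With the default top_proportion method, s["threshold_value"] is the k-th
# smallest finite score in group g, where k = min(n, max(1, floor(0.07 * n))).

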
def plot_individual_indicators_correlation(
    df,
    indicator_columns=None,
    correlation_method='pearson',
    group_column=None,
    figsize=(10, 8),
    cmap='RdBu_r',
    center=0,
    annot=True,
    fmt='.2f',
    save_as=None,
    dpi=300,
    show=True
):
    """
    Plot a correlation heatmap of individual-level indicators with beautiful styling.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing individual-level indicators
    indicator_columns : list, optional
        List of column names to include in the correlation analysis.
        If None, automatically detects indicator columns (converged, first_convergence_year,
        suffix_typicality_score, path_uniqueness, etc.)
    correlation_method : str, default='pearson'
        Correlation method: 'pearson', 'spearman', 'kendall'
    group_column : str, optional
        Column name for grouping (e.g., 'country'). If provided, shows separate
        heatmaps for each group
    figsize : tuple, default=(10, 8)
        Figure size (width, height)
    cmap : str, default='RdBu_r'
        Colormap for the heatmap. Options: 'RdBu_r', 'coolwarm', 'viridis', 'plasma'
    center : float, default=0
        Value to center the colormap at
    annot : bool, default=True
        Whether to annotate cells with correlation values
    fmt : str, default='.2f'
        Format for annotations
    save_as : str, optional
        Path to save the figure (without extension)
    dpi : int, default=300
        DPI for saving
    show : bool, default=True
        Whether to display the plot

    Returns:
    --------
    dict: Correlation matrix/matrices and statistics

    Example:
    --------
    # Basic usage
    >>> plot_individual_indicators_correlation(df)

    # Custom indicators with grouping
    >>> plot_individual_indicators_correlation(
    ...     df,
    ...     indicator_columns=['converged', 'suffix_rarity_score', 'path_uniqueness'],
    ...     group_column='country',
    ...     correlation_method='spearman'
    ... )

    # Custom styling
    >>> plot_individual_indicators_correlation(
    ...     df,
    ...     cmap='plasma',
    ...     figsize=(12, 10),
    ...     save_as="indicators_correlation_heatmap"
    ... )
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    import numpy as np

    # Auto-detect indicator columns if not provided
    if indicator_columns is None:
        # Common individual-level indicator patterns (convergence-focused)
        potential_indicators = [
            'converged', 'first_convergence_year', 'convergence_year',
            'suffix_rarity_score', 'path_uniqueness', 'rarity_score', 'uniqueness_score'
        ]
        indicator_columns = [col for col in df.columns if col in potential_indicators]

        # Also include numeric columns that might be indicators
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if col not in indicator_columns and any(
                keyword in col.lower() for keyword in
                ['score', 'index', 'count', 'factor', 'rate', 'ratio']
            ):
                indicator_columns.append(col)

    # Filter and clean data
    df_indicators = df[indicator_columns].copy()

    # Handle missing values and convert data types
    for col in df_indicators.columns:
        if df_indicators[col].dtype == 'object':
            # Try to convert to numeric
            df_indicators[col] = pd.to_numeric(df_indicators[col], errors='coerce')

    # Remove columns with too many missing values (>50%)
    valid_cols = []
    for col in df_indicators.columns:
        if df_indicators[col].notna().sum() / len(df_indicators) > 0.5:
            valid_cols.append(col)

    df_indicators = df_indicators[valid_cols]

    # Drop rows with any missing values for the correlation calculation
    df_clean = df_indicators.dropna()

    if len(df_clean) == 0:
        raise ValueError("No valid data remaining after cleaning. Check for missing values.")

    # Calculate correlations
    results = {}

    if group_column is None or group_column not in df.columns:
        # Single correlation matrix
        corr_matrix = df_clean.corr(method=correlation_method)
        results['overall'] = corr_matrix

        # Create plot
        plt.figure(figsize=figsize)

        # Create mask for the upper triangle (optional - makes it cleaner)
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

        # Generate heatmap
        sns.heatmap(
            corr_matrix,
            mask=mask,
            annot=annot,
            fmt=fmt,
            cmap=cmap,
            center=center,
            square=True,
            cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"},
            linewidths=0.5
        )

        plt.title(f"Individual-Level Indicators Correlation Heatmap\n({correlation_method.title()} Correlation)",
                  fontsize=14, pad=20)
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)

    else:
        # Group-based correlation matrices
        groups = df[group_column].unique()
        n_groups = len(groups)

        # Calculate subplot layout
        if n_groups <= 2:
            nrows, ncols = 1, n_groups
            figsize = (figsize[0] * n_groups, figsize[1])
        else:
            ncols = min(3, n_groups)
            nrows = (n_groups + ncols - 1) // ncols
            figsize = (figsize[0] * ncols, figsize[1] * nrows)

        fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
        if n_groups == 1:
            axes = [axes]
        elif nrows > 1:
            axes = axes.flatten()

        for i, group in enumerate(groups):
            group_data = df[df[group_column] == group][indicator_columns].dropna()

            if len(group_data) < 2:
                print(f"Warning: Group '{group}' has insufficient data for correlation")
                continue

            corr_matrix = group_data.corr(method=correlation_method)
            results[group] = corr_matrix

            # Create mask for the upper triangle
            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

            # Plot heatmap
            sns.heatmap(
                corr_matrix,
                mask=mask,
                annot=annot,
                fmt=fmt,
                cmap=cmap,
                center=center,
                square=True,
                cbar=i == 0,  # Only show a colorbar for the first subplot
                cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"} if i == 0 else {},
                linewidths=0.5,
                ax=axes[i]
            )

            axes[i].set_title(f"{group}\n({len(group_data)} individuals)", fontsize=12)
            axes[i].set_xticks(axes[i].get_xticks())
            axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45, ha='right')
            axes[i].set_yticks(axes[i].get_yticks())
            axes[i].set_yticklabels(axes[i].get_yticklabels(), rotation=0)

        # Hide unused subplots
        for j in range(i + 1, len(axes)):
            axes[j].set_visible(False)

        plt.suptitle(f"Individual-Level Indicators Correlation by {group_column.title()}\n({correlation_method.title()} Correlation)",
                     fontsize=16, y=0.98)

    plt.tight_layout()

    # Save and show
    if save_as:
        plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')

    if show:
        plt.show()

    # Add summary statistics
    if group_column is None:
        sample_size = len(df_clean)
    else:
        sizes = {}
        for g in df[group_column].unique():
            g_clean = df[df[group_column] == g][indicator_columns].apply(pd.to_numeric, errors='coerce').dropna()
            sizes[g] = len(g_clean)
        sample_size = sizes

    results['summary'] = {
        'method': correlation_method,
        'n_indicators': len(valid_cols),
        'indicators_included': valid_cols,
        'sample_size': sample_size
    }

    return results

def compute_quantile_thresholds_by_group(scores, group_labels, quantiles=None):
    """
    Compute per-group quantile thresholds for a 1D array of scores.

    Parameters
    ----------
    scores : array-like of shape (N,)
        Scores (e.g., standardized rarity) aligned with labels.
    group_labels : array-like of shape (N,)
        Group label per observation.
    quantiles : Optional[List[float]]
        Quantiles to compute (e.g., [0.10]). Defaults to [0.10].

    Returns
    -------
    dict
        {group: {"p10": value, ...}}
    """
    if quantiles is None:
        quantiles = [0.10]
    arr = np.asarray(scores, dtype=float)
    labels = np.asarray(group_labels)
    result = {}
    for g in pd.unique(labels):
        mask = labels == g
        vals = arr[mask]
        vals = vals[~np.isnan(vals)]
        thresholds_g = {}
        if vals.size > 0:
            for q in quantiles:
                thresholds_g[f"p{int(round(q*100)):02d}"] = float(np.nanquantile(vals, q))
        else:
            for q in quantiles:
                thresholds_g[f"p{int(round(q*100)):02d}"] = np.nan
        result[g] = thresholds_g
    return result

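# Usage sketch (hypothetical arrays):
#
#   compute_quantile_thresholds_by_group(
#       scores=[0.1, 0.5, -1.2, 0.3],
#       group_labels=["IN", "IN", "US", "US"],
#       quantiles=[0.10, 0.50])
#   # -> {"IN": {"p10": ..., "p50": ...}, "US": {"p10": ..., "p50": ...}}

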
def compute_quantile_thresholds_by_group_year(scores, group_labels, year_labels, quantiles=None, min_group_year_size=30):
    """
    Compute quantile thresholds by group x year for time-drifting distributions.

    Parameters
    ----------
    scores : array-like of shape (N,)
        Scores aligned with labels.
    group_labels : array-like of shape (N,)
        Group label per observation.
    year_labels : array-like of shape (N,)
        Year label per observation (int/str).
    quantiles : Optional[List[float]]
        Quantiles to compute (e.g., [0.10]). Defaults to [0.10].
    min_group_year_size : int, default 30
        Minimum sample size to compute thresholds for a group-year cell. If fewer, returns NaN.

    Returns
    -------
    dict
        {group: {year: {"p10": value, ...}}}
    """
    if quantiles is None:
        quantiles = [0.10]
    arr = np.asarray(scores, dtype=float)
    g_arr = np.asarray(group_labels)
    y_arr = np.asarray(year_labels)
    result = {}
    df = pd.DataFrame({"score": arr, "group": g_arr, "year": y_arr})
    for g, gdf in df.groupby("group"):
        result[g] = {}
        for y, ydf in gdf.groupby("year"):
            vals = ydf["score"].astype(float).to_numpy()
            vals = vals[~np.isnan(vals)]
            thresholds_gy = {}
            if vals.size >= min_group_year_size:
                for q in quantiles:
                    thresholds_gy[f"p{int(round(q*100)):02d}"] = float(np.nanquantile(vals, q))
            else:
                for q in quantiles:
                    thresholds_gy[f"p{int(round(q*100)):02d}"] = np.nan
            result[g][y] = thresholds_gy
    return result

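# Usage sketch (hypothetical arrays): sparse group-year cells come back as NaN
# rather than as a noisy estimate:
#
#   thr = compute_quantile_thresholds_by_group_year(
#       scores, countries, years, quantiles=[0.10], min_group_year_size=30)
#   thr["IN"][2005]["p10"]   # NaN if the IN-2005 cell has fewer than 30 rows

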
def compute_path_uniqueness_by_group_suffix(sequences, group_labels):
    """
    Compute path uniqueness within each subgroup defined by group_labels using the suffix-based approach.
    This is consistent with the convergence module's suffix-based logic.

    :param sequences: List of sequences.
    :param group_labels: List of group keys (same length as sequences), e.g., country, gender.
    :return: List of path uniqueness scores (same order as input).
    """
    from collections import defaultdict

    T = len(sequences[0])
    df = pd.DataFrame({
        "sequence": sequences,
        "group": group_labels
    })

    # Step 1: Precompute suffix frequency tables per group (changed from prefix to suffix)
    group_suffix_freq = {}
    for group, group_df in df.groupby("group"):
        suffix_freq = [defaultdict(int) for _ in range(T)]
        for seq in group_df["sequence"]:
            for t in range(T):
                suffix = tuple(seq[t:])  # suffix from year t to the end
                suffix_freq[t][suffix] += 1
        group_suffix_freq[group] = suffix_freq

    # Step 2: Compute path uniqueness per individual using suffix logic
    uniqueness_scores = []
    for seq, group in zip(sequences, group_labels):
        suffix_freq = group_suffix_freq[group]
        count = 0
        for t in range(T):
            suffix = tuple(seq[t:])  # suffix from year t to the end
            if suffix_freq[t][suffix] == 1:
                count += 1
        uniqueness_scores.append(count)

    return uniqueness_scores


# Provide a default version for backward compatibility
def compute_path_uniqueness_by_group(sequences, group_labels):
    """
    Compute path uniqueness within each subgroup defined by group_labels.

    This is the default version using the suffix-based approach (convergence logic).
    For explicit control, use compute_path_uniqueness_by_group_suffix() or
    compute_path_uniqueness_by_group_prefix() from the prefix_tree module.

    :param sequences: List of sequences.
    :param group_labels: List of group keys (same length as sequences), e.g., country, gender.
    :return: List of path uniqueness scores (same order as input).
    """
    return compute_path_uniqueness_by_group_suffix(sequences, group_labels)
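
# Usage sketch for the default wrapper (toy data): uniqueness is counted within
# each group's own suffix table, so the same sequence can score differently in
# different groups.
#
#   seqs = [[1, 1, 2], [1, 2, 2], [1, 1, 2]]
#   groups = ["A", "A", "B"]
#   compute_path_uniqueness_by_group(seqs, groups)  # -> [2, 2, 3]
#
# Within group "A" the year-2 suffix (2,) is shared, so each member scores 2;
# the lone member of group "B" has all three of its suffixes unique, scoring 3.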
|