sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
- sequenzo/__init__.py +349 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +476 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +178 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1256 -0
- sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
- sequenzo/clustering/src/KMedoid.cpp +263 -0
- sequenzo/clustering/src/PAM.cpp +237 -0
- sequenzo/clustering/src/PAMonce.cpp +265 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +22 -0
- sequenzo/data_preprocessing/helpers.py +303 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/dyadic_children.csv +61 -0
- sequenzo/datasets/dyadic_parents.csv +61 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
- sequenzo/datasets/political_science_aid_shock.csv +166 -0
- sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
- sequenzo/define_sequence_data.py +1400 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +40 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +597 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +81 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +62 -0
- sequenzo/prefix_tree/hub.py +114 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
- sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
- sequenzo/prefix_tree/spell_level_indicators.py +297 -0
- sequenzo/prefix_tree/system_level_indicators.py +544 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/seqhmm/__init__.py +95 -0
- sequenzo/seqhmm/advanced_optimization.py +305 -0
- sequenzo/seqhmm/bootstrap.py +411 -0
- sequenzo/seqhmm/build_hmm.py +142 -0
- sequenzo/seqhmm/build_mhmm.py +136 -0
- sequenzo/seqhmm/build_nhmm.py +121 -0
- sequenzo/seqhmm/fit_mhmm.py +62 -0
- sequenzo/seqhmm/fit_model.py +61 -0
- sequenzo/seqhmm/fit_nhmm.py +76 -0
- sequenzo/seqhmm/formulas.py +289 -0
- sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
- sequenzo/seqhmm/gradients_nhmm.py +306 -0
- sequenzo/seqhmm/hmm.py +291 -0
- sequenzo/seqhmm/mhmm.py +314 -0
- sequenzo/seqhmm/model_comparison.py +238 -0
- sequenzo/seqhmm/multichannel_em.py +282 -0
- sequenzo/seqhmm/multichannel_utils.py +138 -0
- sequenzo/seqhmm/nhmm.py +270 -0
- sequenzo/seqhmm/nhmm_utils.py +191 -0
- sequenzo/seqhmm/predict.py +137 -0
- sequenzo/seqhmm/predict_mhmm.py +142 -0
- sequenzo/seqhmm/simulate.py +878 -0
- sequenzo/seqhmm/utils.py +218 -0
- sequenzo/seqhmm/visualization.py +910 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +66 -0
- sequenzo/suffix_tree/hub.py +114 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
- sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
- sequenzo/suffix_tree/spell_level_indicators.py +248 -0
- sequenzo/suffix_tree/system_level_indicators.py +535 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/version_check.py +283 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +222 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +405 -0
- sequenzo/visualization/plot_sequence_index.py +1175 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +651 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.31.dist-info/METADATA +286 -0
- sequenzo-0.1.31.dist-info/RECORD +299 -0
- sequenzo-0.1.31.dist-info/WHEEL +5 -0
- sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.31.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : Yuqi Liang 梁彧祺
|
|
3
|
+
@File : bootstrap.py
|
|
4
|
+
@Time : 2025-10-05 08:15
|
|
5
|
+
@Desc : Bootstrap confidence intervals for HMM model coefficients
|
|
6
|
+
|
|
7
|
+
This module provides functions for computing bootstrap confidence intervals
|
|
8
|
+
for model parameters, similar to seqHMM's bootstrap_coefs() function in R.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from typing import Optional, List, Dict, Callable, Union
|
|
13
|
+
from .hmm import HMM
|
|
14
|
+
from .mhmm import MHMM
|
|
15
|
+
from .nhmm import NHMM
|
|
16
|
+
from sequenzo.define_sequence_data import SequenceData
|
|
17
|
+
|
|
18
|
+
# Try to import tqdm for progress bar, but make it optional
|
|
19
|
+
try:
|
|
20
|
+
from tqdm import tqdm
|
|
21
|
+
HAS_TQDM = True
|
|
22
|
+
except ImportError:
|
|
23
|
+
HAS_TQDM = False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def bootstrap_model(
    model: Union[HMM, MHMM, NHMM],
    n_sim: int = 100,
    method: str = 'nonparametric',
    random_state: Optional[int] = None,
    verbose: bool = True,
    n_jobs: int = 1
) -> dict:
    """
    Bootstrap sampling for HMM model coefficients.

    For each bootstrap replicate the sequences of the fitted model are
    resampled with replacement, a model with the same structure is rebuilt
    from the fitted parameters and refitted, and the refitted parameters are
    stored. This is similar to seqHMM's bootstrap_coefs() function in R.

    Replicates whose model construction, fitting or parameter extraction
    raises are skipped (best effort); the number that succeeded is reported
    under 'n_successful'.

    Args:
        model: Fitted HMM, MHMM, or NHMM model object
        n_sim: Number of bootstrap samples (must be >= 1). Default is 100.
        method: Bootstrap method. Options:
            - 'nonparametric': Resample sequences with replacement (default)
            - 'parametric': Not yet implemented (rejected with ValueError)
        random_state: Random seed for reproducibility
        verbose: Whether to show a progress bar (tqdm when installed,
            otherwise a single printed line) and per-replicate failure warnings
        n_jobs: Number of parallel jobs (not yet implemented; the value is
            currently ignored and replicates run sequentially)

    Returns:
        dict: Dictionary containing:
            - 'bootstrap_samples': List of bootstrap parameter estimates
            - 'original_model': Original model object
            - 'n_sim': Number of bootstrap samples requested
            - 'n_successful': Number of replicates that were fitted successfully
            - 'method': Bootstrap method used
            - 'summary': Summary statistics computed by
              _compute_bootstrap_summary()

    Raises:
        ValueError: If the model has not been fitted, if `method` is not
            supported, if `n_sim` is smaller than 1, or if every bootstrap
            replicate failed.

    Examples:
        >>> from sequenzo.seqhmm import build_hmm, fit_model, bootstrap_model
        >>>
        >>> # Fit model
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm = fit_model(hmm)
        >>>
        >>> # Bootstrap
        >>> boot_results = bootstrap_model(hmm, n_sim=100, verbose=True)
        >>>
        >>> # Get confidence intervals
        >>> ci = boot_results['summary']['ci_95']
        >>> print(f"95% CI for initial_probs: {ci['initial_probs']}")
    """
    if model.log_likelihood is None:
        raise ValueError("Model must be fitted before bootstrapping. Use fit_model() first.")

    # Validate the configuration once, before any resampling work. Previously
    # the method was only checked inside the loop, so it was never checked at
    # all when the loop did not run.
    if method != 'nonparametric':
        raise ValueError(f"Unknown bootstrap method: {method}")

    # An empty loop used to surface as "All bootstrap samples failed", which
    # wrongly pointed at model fitting instead of the argument.
    if n_sim < 1:
        raise ValueError(f"n_sim must be a positive integer, got {n_sim}")

    rng = np.random.RandomState(random_state)
    n_sequences = model.n_sequences

    # Store bootstrap samples
    bootstrap_samples = []

    # Progress bar (tqdm is optional; fall back to a one-line notice)
    if verbose and HAS_TQDM:
        iterator = tqdm(range(n_sim), desc="Bootstrap sampling")
    else:
        iterator = range(n_sim)
        if verbose:
            print(f"Running {n_sim} bootstrap samples...")

    # Bootstrap loop
    for b in iterator:
        # Resample sequences with replacement
        bootstrap_indices = rng.choice(n_sequences, size=n_sequences, replace=True)

        # Create bootstrap dataset (errors here are not swallowed: they
        # indicate a problem with the data container, not a failed fit)
        bootstrap_obs = _resample_sequences(model.observations, bootstrap_indices)

        # Create and fit bootstrap model; a failing replicate is skipped so
        # that one degenerate resample does not abort the whole run.
        try:
            bootstrap_model_obj = _create_bootstrap_model(model, bootstrap_obs)
            bootstrap_model_obj = _fit_bootstrap_model(bootstrap_model_obj)

            # Extract parameters
            params = _extract_parameters(bootstrap_model_obj)
            bootstrap_samples.append(params)

        except Exception as e:
            if verbose:
                print(f"Warning: Bootstrap sample {b+1} failed: {e}")
            continue

    if not bootstrap_samples:
        raise ValueError("All bootstrap samples failed. Check model fitting.")

    # Compute summary statistics
    summary = _compute_bootstrap_summary(bootstrap_samples, model)

    return {
        'bootstrap_samples': bootstrap_samples,
        'original_model': model,
        'n_sim': n_sim,
        'n_successful': len(bootstrap_samples),
        'method': method,
        'summary': summary
    }
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _resample_sequences(observations: SequenceData, indices: np.ndarray) -> SequenceData:
    """
    Resample sequences based on bootstrap indices.

    The rows of ``observations.to_dataframe()`` are selected positionally with
    ``indices`` (repeated indices produce repeated rows), re-indexed from 0,
    and wrapped in a new SequenceData that reuses the original states and
    labels.

    Args:
        observations: Original SequenceData object
        indices: Bootstrap indices (which sequences to include, with replacement)

    Returns:
        SequenceData: Resampled SequenceData object
    """
    # Resample rows based on indices, then reset the index so the resampled
    # rows get fresh, unique positions even when a row was drawn repeatedly.
    resampled_df = observations.to_dataframe().iloc[indices].copy()
    resampled_df = resampled_df.reset_index(drop=True)

    # Time columns are taken from the `values` attribute when it exists;
    # otherwise they are inferred as 1..max sequence length.
    # NOTE(review): this assumes `observations.values` exposes `.columns`
    # (i.e. is DataFrame-like) - confirm against SequenceData.
    if hasattr(observations, 'values'):
        time_cols = observations.values.columns.tolist()
    else:
        max_length = max(len(seq) for seq in observations.sequences)
        time_cols = list(range(1, max_length + 1))

    # Create new SequenceData object
    return SequenceData(
        resampled_df,
        time=time_cols,
        states=observations.states,
        labels=observations.labels,
        id_col=None
    )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _create_bootstrap_model(
    original_model: Union[HMM, MHMM, NHMM],
    bootstrap_obs: SequenceData
) -> Union[HMM, MHMM, NHMM]:
    """
    Build an unfitted model of the same kind as the original for one replicate.

    The new model is constructed on the resampled observations and is
    initialised with copies of the original model's fitted parameters, so the
    original model is never mutated by the bootstrap fit.

    Args:
        original_model: Original fitted model
        bootstrap_obs: Bootstrap resampled observations

    Returns:
        New model object with same structure as original

    Raises:
        ValueError: If the model is not an HMM, MHMM or NHMM instance.
    """
    # The type checks run in the same order as before (HMM, MHMM, NHMM).
    if isinstance(original_model, HMM):
        from .build_hmm import build_hmm

        hmm_settings = dict(
            n_states=original_model.n_states,
            initial_probs=original_model.initial_probs.copy(),
            transition_probs=original_model.transition_probs.copy(),
            emission_probs=original_model.emission_probs.copy(),
            state_names=original_model.state_names,
            random_state=None,
        )
        return build_hmm(bootstrap_obs, **hmm_settings)

    if isinstance(original_model, MHMM):
        from .build_mhmm import build_mhmm

        # Collect the per-cluster parameters in one pass over the clusters.
        start_probs, trans_probs, emit_probs = [], [], []
        names_per_cluster, states_per_cluster = [], []
        for cluster in original_model.clusters:
            start_probs.append(cluster.initial_probs.copy())
            trans_probs.append(cluster.transition_probs.copy())
            emit_probs.append(cluster.emission_probs.copy())
            names_per_cluster.append(cluster.state_names)
            states_per_cluster.append(cluster.n_states)

        return build_mhmm(
            bootstrap_obs,
            n_clusters=original_model.n_clusters,
            n_states=states_per_cluster,
            initial_probs=start_probs,
            transition_probs=trans_probs,
            emission_probs=emit_probs,
            cluster_probs=original_model.cluster_probs.copy(),
            cluster_names=original_model.cluster_names,
            state_names=names_per_cluster,
            random_state=None
        )

    if isinstance(original_model, NHMM):
        from .build_nhmm import build_nhmm

        # NOTE(review): the covariates X are passed through unchanged while
        # the observations have been resampled; if X is indexed per sequence,
        # its rows may no longer line up with bootstrap_obs - confirm.
        nhmm_settings = dict(
            n_states=original_model.n_states,
            X=original_model.X,
            eta_pi=original_model.eta_pi.copy(),
            eta_A=original_model.eta_A.copy(),
            eta_B=original_model.eta_B.copy(),
            state_names=original_model.state_names,
            random_state=None,
        )
        return build_nhmm(bootstrap_obs, **nhmm_settings)

    raise ValueError(f"Unknown model type: {type(original_model)}")
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _fit_bootstrap_model(model: Union[HMM, MHMM, NHMM]) -> Union[HMM, MHMM, NHMM]:
    """
    Refit a bootstrap replicate with the fitter matching its model type.

    Every fitter is run quietly (verbose=False) with at most 50 iterations;
    the tolerance is 1e-2 for HMM and MHMM and 1e-3 for NHMM.

    Args:
        model: Bootstrap model object

    Returns:
        Fitted model

    Raises:
        ValueError: If the model is not an HMM, MHMM or NHMM instance.
    """
    # Fitters are imported lazily, only for the branch that is taken.
    if isinstance(model, HMM):
        from .fit_model import fit_model

        fitted = fit_model(model, n_iter=50, tol=1e-2, verbose=False)
        return fitted

    if isinstance(model, MHMM):
        from .fit_mhmm import fit_mhmm

        fitted = fit_mhmm(model, n_iter=50, tol=1e-2, verbose=False)
        return fitted

    if isinstance(model, NHMM):
        from .fit_nhmm import fit_nhmm

        fitted = fit_nhmm(model, n_iter=50, tol=1e-3, verbose=False)
        return fitted

    raise ValueError(f"Unknown model type: {type(model)}")
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _extract_parameters(model: Union[HMM, MHMM, NHMM]) -> dict:
    """
    Collect copies of the parameters stored on a fitted model.

    Args:
        model: Fitted model object (HMM, MHMM or NHMM)

    Returns:
        dict: Copied parameters, keyed by name:
            - HMM: 'initial_probs', 'transition_probs', 'emission_probs'
            - MHMM: 'cluster_probs', 'clusters' (one HMM-style dict per
              entry of ``model.clusters``) and 'coefficients' (None when
              the model has no coefficients)
            - NHMM: 'eta_pi', 'eta_A', 'eta_B'

    Raises:
        ValueError: If ``model`` is none of the supported model types
    """
    def _copy_core(source) -> dict:
        # The three probability arrays shared by an HMM and by each MHMM cluster.
        return {
            'initial_probs': source.initial_probs.copy(),
            'transition_probs': source.transition_probs.copy(),
            'emission_probs': source.emission_probs.copy()
        }

    if isinstance(model, HMM):
        return _copy_core(model)

    if isinstance(model, MHMM):
        mixing = model.cluster_probs.copy()
        per_cluster = [_copy_core(submodel) for submodel in model.clusters]
        coefs = model.coefficients
        if coefs is not None:
            coefs = coefs.copy()
        return {
            'cluster_probs': mixing,
            'clusters': per_cluster,
            'coefficients': coefs
        }

    if isinstance(model, NHMM):
        return {
            'eta_pi': model.eta_pi.copy(),
            'eta_A': model.eta_A.copy(),
            'eta_B': model.eta_B.copy()
        }

    raise ValueError(f"Unknown model type: {type(model)}")
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _compute_bootstrap_summary(
    bootstrap_samples: List[dict],
    original_model: Union[HMM, MHMM, NHMM]
) -> dict:
    """
    Summarise bootstrap replicates parameter by parameter.

    For every parameter the replicates are stacked along a new leading axis
    and reduced over that axis to a mean, a standard deviation (``np.std``
    with its default settings) and the 2.5th / 97.5th percentiles.

    Args:
        bootstrap_samples: List of parameter dictionaries, one per bootstrap
            replicate
        original_model: Original fitted model; only its type (and, for MHMM,
            its ``n_clusters``) is used to decide which parameters to read

    Returns:
        dict: For each parameter a dict with keys 'mean', 'std' and 'ci_95'.
            HMM: 'initial_probs', 'transition_probs', 'emission_probs'.
            MHMM: 'cluster_probs' plus 'clusters', a list holding one
            HMM-style summary per cluster.
            NHMM: 'eta_pi', 'eta_A', 'eta_B'.
            An empty dict is returned for any other model type.
    """
    def _describe(values) -> dict:
        # Stack replicates on axis 0 and reduce over that axis.
        stacked = np.array(values)
        return {
            'mean': np.mean(stacked, axis=0),
            'std': np.std(stacked, axis=0),
            'ci_95': np.percentile(stacked, [2.5, 97.5], axis=0)
        }

    def _describe_keys(records, keys) -> dict:
        # Summarise several named parameters drawn from the same records.
        return {key: _describe([rec[key] for rec in records]) for key in keys}

    hmm_keys = ('initial_probs', 'transition_probs', 'emission_probs')
    summary = {}

    if isinstance(original_model, HMM):
        summary.update(_describe_keys(bootstrap_samples, hmm_keys))

    elif isinstance(original_model, MHMM):
        # Mixing proportions first, then one summary per cluster.
        summary['cluster_probs'] = _describe(
            [rec['cluster_probs'] for rec in bootstrap_samples]
        )
        summary['clusters'] = [
            _describe_keys(
                [rec['clusters'][idx] for rec in bootstrap_samples],
                hmm_keys
            )
            for idx in range(original_model.n_clusters)
        ]

    elif isinstance(original_model, NHMM):
        summary.update(
            _describe_keys(bootstrap_samples, ('eta_pi', 'eta_A', 'eta_B'))
        )

    return summary
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : Yuqi Liang 梁彧祺
|
|
3
|
+
@File : build_hmm.py
|
|
4
|
+
@Time : 2025-11-10 09:05
|
|
5
|
+
@Desc : Build HMM models from SequenceData
|
|
6
|
+
|
|
7
|
+
This module provides the build_hmm function, which creates HMM model objects
|
|
8
|
+
similar to seqHMM's build_hmm() function in R.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from typing import Optional, List, Union
|
|
13
|
+
from sequenzo.define_sequence_data import SequenceData
|
|
14
|
+
from .multichannel_utils import prepare_multichannel_data
|
|
15
|
+
from .hmm import HMM
|
|
16
|
+
from .utils import (
|
|
17
|
+
create_initial_probs,
|
|
18
|
+
create_transition_probs,
|
|
19
|
+
create_emission_probs
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def build_hmm(
    observations: Union[SequenceData, List[SequenceData]],
    n_states: Optional[int] = None,
    initial_probs: Optional[np.ndarray] = None,
    transition_probs: Optional[np.ndarray] = None,
    emission_probs: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
    state_names: Optional[List[str]] = None,
    channel_names: Optional[List[str]] = None,
    random_state: Optional[int] = None
) -> HMM:
    """
    Build a Hidden Markov Model object.

    This function creates an HMM model object that can be fitted to sequence data.
    It is similar to seqHMM's build_hmm() function in R.

    NOTE(review): the signature admits a list of SequenceData (multichannel),
    but the alphabet size is read from ``observations.alphabet``, which a plain
    list does not provide -- confirm how multichannel input is meant to reach
    this function.

    Args:
        observations: SequenceData object containing the sequences to model
        n_states: Number of hidden states. If None, it is inferred from the
            first of initial_probs, transition_probs, emission_probs that is
            provided.
        initial_probs: Optional initial state probabilities (n_states,).
            If None, a uniform distribution is created.
        transition_probs: Optional transition probability matrix (n_states x n_states).
            If None, will be randomly initialized.
        emission_probs: Optional emission probability matrix (n_states x n_symbols).
            If None, will be randomly initialized.
        state_names: Optional names for hidden states, passed through to HMM.
        channel_names: Optional names for channels, passed through to HMM.
        random_state: Random seed for initialization of random parameters.

        Supplied probability parameters may be any array-like (for example
        nested lists); they are converted with ``np.asarray`` before use.

    Returns:
        HMM: An HMM model object (not yet fitted)

    Raises:
        ValueError: If n_states is None and no probability parameter is given,
            or if a parameter's dimensions do not match n_states / the
            alphabet size.

    Examples:
        >>> from sequenzo import SequenceData, load_dataset
        >>> from sequenzo.seqhmm import build_hmm
        >>>
        >>> # Load example data
        >>> df = load_dataset('mvad')
        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
        >>>
        >>> # Build HMM with 4 states, random initialization
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>>
        >>> # Build HMM with custom initial parameters
        >>> init_probs = np.array([0.3, 0.3, 0.2, 0.2])
        >>> trans_probs = np.array([[0.8, 0.1, 0.05, 0.05],
        ...                         [0.05, 0.8, 0.1, 0.05],
        ...                         [0.05, 0.05, 0.8, 0.1],
        ...                         [0.05, 0.05, 0.1, 0.8]])
        >>> emission_probs = np.random.rand(4, 6)  # 4 states, 6 symbols
        >>> emission_probs = emission_probs / emission_probs.sum(axis=1, keepdims=True)
        >>> hmm = build_hmm(seq, initial_probs=init_probs,
        ...                 transition_probs=trans_probs,
        ...                 emission_probs=emission_probs)
    """
    # Normalise user-supplied parameters to arrays so that the ``.shape``
    # lookups below also work for nested lists. ndarrays pass through as-is.
    if initial_probs is not None:
        initial_probs = np.asarray(initial_probs)
    if transition_probs is not None:
        transition_probs = np.asarray(transition_probs)
    if emission_probs is not None:
        emission_probs = np.asarray(emission_probs)

    # Determine number of states
    if n_states is None:
        if initial_probs is not None:
            n_states = len(initial_probs)
        elif transition_probs is not None:
            n_states = transition_probs.shape[0]
        elif emission_probs is not None:
            n_states = emission_probs.shape[0]
        else:
            raise ValueError(
                "n_states must be provided if initial_probs, transition_probs, "
                "and emission_probs are all None"
            )

    # Get alphabet size
    n_symbols = len(observations.alphabet)

    # Create initial probabilities if not provided (uniform, so no seed needed)
    if initial_probs is None:
        initial_probs = create_initial_probs(n_states, method='uniform')

    # Create transition probabilities if not provided
    if transition_probs is None:
        transition_probs = create_transition_probs(
            n_states, method='random', random_state=random_state
        )

    # Create emission probabilities if not provided
    if emission_probs is None:
        emission_probs = create_emission_probs(
            n_states, n_symbols, method='random', random_state=random_state
        )

    # Validate dimensions
    if len(initial_probs) != n_states:
        raise ValueError(
            f"initial_probs length ({len(initial_probs)}) must equal n_states ({n_states})"
        )

    if transition_probs.shape != (n_states, n_states):
        raise ValueError(
            f"transition_probs shape ({transition_probs.shape}) must be ({n_states}, {n_states})"
        )

    if emission_probs.shape != (n_states, n_symbols):
        raise ValueError(
            f"emission_probs shape ({emission_probs.shape}) must be ({n_states}, {n_symbols})"
        )

    # Create and return HMM object
    return HMM(
        observations=observations,
        n_states=n_states,
        initial_probs=initial_probs,
        transition_probs=transition_probs,
        emission_probs=emission_probs,
        state_names=state_names,
        channel_names=channel_names,
        random_state=random_state
    )
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : Yuqi Liang 梁彧祺
|
|
3
|
+
@File : build_mhmm.py
|
|
4
|
+
@Time : 2025-11-21 10:55
|
|
5
|
+
@Desc : Build Mixture HMM models from SequenceData
|
|
6
|
+
|
|
7
|
+
This module provides the build_mhmm function, which creates Mixture HMM model objects
|
|
8
|
+
similar to seqHMM's build_mhmm() function in R.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from typing import Optional, List, Union
|
|
13
|
+
from sequenzo.define_sequence_data import SequenceData
|
|
14
|
+
from .mhmm import MHMM
|
|
15
|
+
from .hmm import HMM
|
|
16
|
+
from .utils import (
|
|
17
|
+
create_initial_probs,
|
|
18
|
+
create_transition_probs,
|
|
19
|
+
create_emission_probs
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def build_mhmm(
    observations: SequenceData,
    n_clusters: int,
    n_states: Union[int, List[int]],
    initial_probs: Optional[List[np.ndarray]] = None,
    transition_probs: Optional[List[np.ndarray]] = None,
    emission_probs: Optional[List[np.ndarray]] = None,
    cluster_probs: Optional[np.ndarray] = None,
    cluster_names: Optional[List[str]] = None,
    state_names: Optional[List[List[str]]] = None,
    channel_names: Optional[List[str]] = None,
    random_state: Optional[int] = None
) -> MHMM:
    """
    Build a Mixture Hidden Markov Model object.

    A Mixture HMM consists of multiple HMM submodels (clusters). Each sequence
    belongs to one of these clusters with certain probabilities. This function
    creates the model structure but does not fit it (use fit_mhmm() for that).

    It is similar to seqHMM's build_mhmm() function in R.

    Args:
        observations: SequenceData object containing the sequences to model
        n_clusters: Number of clusters (submodels)
        n_states: Number of hidden states per cluster. Can be:
            - int (Python or NumPy integer): Same number of states for all clusters
            - sequence of int: Different number of states for each cluster
        initial_probs: Optional list of initial state probabilities, one per cluster.
            Each element should be (n_states[k],) array.
        transition_probs: Optional list of transition matrices, one per cluster.
            Each element should be (n_states[k], n_states[k]) array.
        emission_probs: Optional list of emission matrices, one per cluster.
            Each element should be (n_states[k], n_symbols) array.
        cluster_probs: Optional initial cluster probabilities (n_clusters,),
            passed through to MHMM.
        cluster_names: Optional names for clusters
        state_names: Optional names for hidden states. Should be a list of lists,
            where state_names[k] contains names for cluster k.
        channel_names: Optional names for channels
        random_state: Random seed for initialization. When it is an integer,
            cluster k is built with seed ``random_state + k`` so that the
            clusters do not all share one seed; the MHMM itself receives
            ``random_state`` unchanged.

        For the per-cluster lists (initial_probs, transition_probs,
        emission_probs, state_names), a list shorter than n_clusters is
        accepted: clusters without an entry receive None for that argument.

    Returns:
        MHMM: A Mixture HMM model object (not yet fitted)

    Raises:
        ValueError: If the number of entries in n_states differs from n_clusters

    Examples:
        >>> from sequenzo import SequenceData, load_dataset
        >>> from sequenzo.seqhmm import build_mhmm
        >>>
        >>> # Load example data
        >>> df = load_dataset('mvad')
        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
        >>>
        >>> # Build MHMM with 3 clusters, 4 states each
        >>> mhmm = build_mhmm(seq, n_clusters=3, n_states=4, random_state=42)
        >>>
        >>> # Build MHMM with different number of states per cluster
        >>> mhmm = build_mhmm(seq, n_clusters=3, n_states=[4, 4, 6], random_state=42)
    """
    def _entry(per_cluster, index):
        # k-th element of an optional per-cluster list, or None when absent.
        if per_cluster is not None and index < len(per_cluster):
            return per_cluster[index]
        return None

    # Handle n_states: a scalar (including NumPy integer scalars, which are
    # not instances of ``int``) is repeated for every cluster.
    if isinstance(n_states, (int, np.integer)):
        n_states_list = [int(n_states)] * n_clusters
    else:
        n_states_list = list(n_states)

    # Validate n_states length
    if len(n_states_list) != n_clusters:
        raise ValueError(
            f"n_states length ({len(n_states_list)}) must equal n_clusters ({n_clusters})"
        )

    # Only integer seeds can be offset; anything else is forwarded untouched.
    seed_is_int = isinstance(random_state, (int, np.integer))

    # Build HMM clusters
    clusters = []
    for k in range(n_clusters):
        # NOTE(review): assumes HMM seeds the initialisation of any parameter
        # left as None from ``random_state`` -- confirm against HMM. Under that
        # assumption a shared seed would give equally sized clusters identical
        # starting values, so each cluster gets its own deterministic seed.
        cluster_seed = random_state + k if seed_is_int else random_state

        clusters.append(
            HMM(
                observations=observations,
                n_states=n_states_list[k],
                initial_probs=_entry(initial_probs, k),
                transition_probs=_entry(transition_probs, k),
                emission_probs=_entry(emission_probs, k),
                state_names=_entry(state_names, k),
                channel_names=channel_names,
                random_state=cluster_seed
            )
        )

    # Create and return MHMM object
    return MHMM(
        observations=observations,
        n_clusters=n_clusters,
        n_states=n_states_list,
        clusters=clusters,
        cluster_probs=cluster_probs,
        cluster_names=cluster_names,
        state_names=state_names,
        channel_names=channel_names,
        random_state=random_state
    )
|