sequenzo 0.1.24__cp311-cp311-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sequenzo might be problematic. Click here for more details.
- _sequenzo_fastcluster.cpython-311-darwin.so +0 -0
- sequenzo/__init__.py +240 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +474 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +178 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1256 -0
- sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
- sequenzo/clustering/src/KMedoid.cpp +263 -0
- sequenzo/clustering/src/PAM.cpp +237 -0
- sequenzo/clustering/src/PAMonce.cpp +265 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +20 -0
- sequenzo/data_preprocessing/helpers.py +256 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_family.csv +1867 -0
- sequenzo/datasets/polyadic_samplec1.csv +61 -0
- sequenzo/datasets/polyadic_samplep1.csv +61 -0
- sequenzo/datasets/polyadic_seqc1.csv +61 -0
- sequenzo/datasets/polyadic_seqp1.csv +61 -0
- sequenzo/define_sequence_data.py +609 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +34 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +431 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +89 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +43 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
- sequenzo/prefix_tree/system_level_indicators.py +465 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +48 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
- sequenzo/suffix_tree/system_level_indicators.py +456 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +194 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +404 -0
- sequenzo/visualization/plot_sequence_index.py +951 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +627 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.24.dist-info/METADATA +255 -0
- sequenzo-0.1.24.dist-info/RECORD +264 -0
- sequenzo-0.1.24.dist-info/WHEEL +5 -0
- sequenzo-0.1.24.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.24.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1264 @@
|
|
|
1
|
+
/*
|
|
2
|
+
fastcluster: Fast hierarchical clustering routines for R and Python
|
|
3
|
+
|
|
4
|
+
Copyright:
|
|
5
|
+
* Until package version 1.1.23: © 2011 Daniel Müllner <https://danifold.net>
|
|
6
|
+
* All changes from version 1.1.24 on: © Google Inc. <https://www.google.com>
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
// for INT32_MAX in fastcluster.cpp
|
|
10
|
+
// This must be defined here since Python.h loads the header file pyport.h,
|
|
11
|
+
// and from this stdint.h. INT32_MAX is defined in stdint.h, but only if
|
|
12
|
+
// __STDC_LIMIT_MACROS is defined.
|
|
13
|
+
#define __STDC_LIMIT_MACROS
|
|
14
|
+
|
|
15
|
+
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
|
|
16
|
+
|
|
17
|
+
#if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
|
|
18
|
+
#define HAVE_DIAGNOSTIC 1
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
#if HAVE_DIAGNOSTIC
|
|
22
|
+
#pragma GCC diagnostic push
|
|
23
|
+
#pragma GCC diagnostic ignored "-Wswitch-default"
|
|
24
|
+
#pragma GCC diagnostic ignored "-Wpadded"
|
|
25
|
+
#pragma GCC diagnostic ignored "-Wlong-long"
|
|
26
|
+
#pragma GCC diagnostic ignored "-Wformat"
|
|
27
|
+
#endif
|
|
28
|
+
#include <Python.h>
|
|
29
|
+
#if HAVE_DIAGNOSTIC
|
|
30
|
+
#pragma GCC diagnostic pop
|
|
31
|
+
#endif
|
|
32
|
+
#if HAVE_DIAGNOSTIC
|
|
33
|
+
#pragma GCC diagnostic push
|
|
34
|
+
#pragma GCC diagnostic ignored "-Wlong-long"
|
|
35
|
+
#pragma GCC diagnostic ignored "-Wpedantic"
|
|
36
|
+
#pragma GCC diagnostic ignored "-Wpadded"
|
|
37
|
+
#pragma GCC diagnostic ignored "-Wcast-qual"
|
|
38
|
+
#endif
|
|
39
|
+
#include <numpy/arrayobject.h>
|
|
40
|
+
#if HAVE_DIAGNOSTIC
|
|
41
|
+
#pragma GCC diagnostic pop
|
|
42
|
+
#endif
|
|
43
|
+
|
|
44
|
+
/* It's complicated, but if I do not include the C++ math headers, GCC
|
|
45
|
+
will complain about conversions from 'double' to 'float', whenever 'isnan'
|
|
46
|
+
is called in a templated function (but not outside templates).
|
|
47
|
+
|
|
48
|
+
The '#include <cmath>' seems to cure the problem.
|
|
49
|
+
*/
|
|
50
|
+
//#include <cmath>
|
|
51
|
+
#define fc_isnan(X) ((X)!=(X))
|
|
52
|
+
|
|
53
|
+
// There is Py_IS_NAN but it is so much slower on my x86_64 system with GCC!
|
|
54
|
+
|
|
55
|
+
#include <cmath> // for std::abs, std::pow, std::sqrt
|
|
56
|
+
#include <cstddef> // for std::ptrdiff_t
|
|
57
|
+
#include <limits> // for std::numeric_limits<...>::infinity()
|
|
58
|
+
#include <algorithm> // for std::stable_sort
|
|
59
|
+
#include <new> // for std::bad_alloc
|
|
60
|
+
#include <exception> // for std::exception
|
|
61
|
+
|
|
62
|
+
#include "fastcluster.cpp"
|
|
63
|
+
|
|
64
|
+
// backwards compatibility
|
|
65
|
+
#ifndef NPY_ARRAY_CARRAY_RO
|
|
66
|
+
#define NPY_ARRAY_CARRAY_RO NPY_CARRAY_RO
|
|
67
|
+
#endif
|
|
68
|
+
|
|
69
|
+
/* Since the public interface is given by the Python respectively R interface,
|
|
70
|
+
* we do not want other symbols than the interface initialization routines to be
|
|
71
|
+
* visible in the shared object file. The "visibility" switch is a GCC concept.
|
|
72
|
+
* Hiding symbols keeps the relocation table small and decreases startup time.
|
|
73
|
+
* See http://gcc.gnu.org/wiki/Visibility
|
|
74
|
+
*/
|
|
75
|
+
#if HAVE_VISIBILITY
|
|
76
|
+
#pragma GCC visibility push(hidden)
|
|
77
|
+
#endif
|
|
78
|
+
|
|
79
|
+
/*
|
|
80
|
+
Convenience class for the output array: automatic counter.
|
|
81
|
+
*/
|
|
82
|
+
class linkage_output {
|
|
83
|
+
private:
|
|
84
|
+
t_float * Z;
|
|
85
|
+
|
|
86
|
+
public:
|
|
87
|
+
linkage_output(t_float * const Z_)
|
|
88
|
+
: Z(Z_)
|
|
89
|
+
{}
|
|
90
|
+
|
|
91
|
+
void append(const t_index node1, const t_index node2, const t_float dist,
|
|
92
|
+
const t_float size) {
|
|
93
|
+
if (node1<node2) {
|
|
94
|
+
*(Z++) = static_cast<t_float>(node1);
|
|
95
|
+
*(Z++) = static_cast<t_float>(node2);
|
|
96
|
+
}
|
|
97
|
+
else {
|
|
98
|
+
*(Z++) = static_cast<t_float>(node2);
|
|
99
|
+
*(Z++) = static_cast<t_float>(node1);
|
|
100
|
+
}
|
|
101
|
+
*(Z++) = dist;
|
|
102
|
+
*(Z++) = size;
|
|
103
|
+
}
|
|
104
|
+
};
|
|
105
|
+
|
|
106
|
+
/*
|
|
107
|
+
Generate the SciPy-specific output format for a dendrogram from the
|
|
108
|
+
clustering output.
|
|
109
|
+
|
|
110
|
+
The list of merging steps can be sorted or unsorted.
|
|
111
|
+
*/
|
|
112
|
+
// The size of a node is either 1 (a single point) or is looked up from
|
|
113
|
+
// one of the clusters.
|
|
114
|
+
// size_(r_): labels below N yield 1; labels >= N yield the value stored in
// column 3 of row (r_ - N), read through Z_. Expects N and the array behind
// Z_ to be in scope at the point of use. The argument is parenthesized so
// that compound expressions expand with the intended precedence.
#define size_(r_) ( (((r_)<N) ? 1 : Z_((r_)-N,3)) )
|
|
115
|
+
|
|
116
|
+
template <const bool sorted>
|
|
117
|
+
static void generate_SciPy_dendrogram(t_float * const Z, cluster_result & Z2, const t_index N) {
|
|
118
|
+
// The array "nodes" is a union-find data structure for the cluster
|
|
119
|
+
// identities (only needed for unsorted cluster_result input).
|
|
120
|
+
union_find nodes(sorted ? 0 : N);
|
|
121
|
+
if (!sorted) {
|
|
122
|
+
std::stable_sort(Z2[0], Z2[N-1]);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
linkage_output output(Z);
|
|
126
|
+
t_index node1, node2;
|
|
127
|
+
|
|
128
|
+
for (node const * NN=Z2[0]; NN!=Z2[N-1]; ++NN) {
|
|
129
|
+
// Get two data points whose clusters are merged in step i.
|
|
130
|
+
if (sorted) {
|
|
131
|
+
node1 = NN->node1;
|
|
132
|
+
node2 = NN->node2;
|
|
133
|
+
}
|
|
134
|
+
else {
|
|
135
|
+
// Find the cluster identifiers for these points.
|
|
136
|
+
node1 = nodes.Find(NN->node1);
|
|
137
|
+
node2 = nodes.Find(NN->node2);
|
|
138
|
+
// Merge the nodes in the union-find data structure by making them
|
|
139
|
+
// children of a new node.
|
|
140
|
+
nodes.Union(node1, node2);
|
|
141
|
+
}
|
|
142
|
+
output.append(node1, node2, NN->dist, size_(node1)+size_(node2));
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/*
|
|
147
|
+
Python interface code
|
|
148
|
+
*/
|
|
149
|
+
static PyObject * linkage_wrap(PyObject * const self, PyObject * const args);
|
|
150
|
+
static PyObject * linkage_vector_wrap(PyObject * const self, PyObject * const args);
|
|
151
|
+
|
|
152
|
+
// List the C++ methods that this extension provides.
|
|
153
|
+
static PyMethodDef _fastclusterWrapMethods[] = {
|
|
154
|
+
{"linkage_wrap", linkage_wrap, METH_VARARGS, NULL},
|
|
155
|
+
{"linkage_vector_wrap", linkage_vector_wrap, METH_VARARGS, NULL},
|
|
156
|
+
{NULL, NULL, 0, NULL} /* Sentinel - marks the end of this structure */
|
|
157
|
+
};
|
|
158
|
+
|
|
159
|
+
/* Tell Python about these methods.
|
|
160
|
+
|
|
161
|
+
Python 2.x and 3.x differ in their C APIs for this part.
|
|
162
|
+
*/
|
|
163
|
+
#if PY_VERSION_HEX >= 0x03000000
|
|
164
|
+
|
|
165
|
+
// Module definition record passed to PyModule_Create by the Python 3
// initialization routine.
static struct PyModuleDef fastclustermodule = {
  PyModuleDef_HEAD_INIT,
  "_sequenzo_fastcluster",   // module name
  NULL,                      // no module documentation
  -1,                        // size of per-interpreter state of the module,
                             // or -1 if the module keeps state in global
                             // variables
  _fastclusterWrapMethods,   // method table
  NULL, NULL, NULL, NULL     // remaining slots are not used
};
|
|
174
|
+
|
|
175
|
+
/* Make the interface initialization routines visible in the shared object
|
|
176
|
+
* file.
|
|
177
|
+
*/
|
|
178
|
+
#if HAVE_VISIBILITY
|
|
179
|
+
#pragma GCC visibility push(default)
|
|
180
|
+
#endif
|
|
181
|
+
|
|
182
|
+
// Python 3 entry point of the extension module.
//
// NumPy's C API is initialized before the module object is created.
// import_array() is a macro that returns NULL from this function when the
// NumPy import fails; running it first means such a failure cannot leave an
// already-created module object behind without an owner.
//
// Returns the new module object, or NULL with a Python exception set.
PyMODINIT_FUNC PyInit__sequenzo_fastcluster(void) {
  import_array(); // Must be present for NumPy.
  return PyModule_Create(&fastclustermodule);
}
|
|
191
|
+
|
|
192
|
+
#if HAVE_VISIBILITY
|
|
193
|
+
#pragma GCC visibility pop
|
|
194
|
+
#endif
|
|
195
|
+
|
|
196
|
+
# else // Python 2.x
|
|
197
|
+
|
|
198
|
+
#if HAVE_VISIBILITY
|
|
199
|
+
#pragma GCC visibility push(default)
|
|
200
|
+
#endif
|
|
201
|
+
|
|
202
|
+
// Python 2 entry point: register the method table under the module name,
// then initialize NumPy's C API. The module object returned by
// Py_InitModule is not needed here and is discarded.
PyMODINIT_FUNC init_sequenzo_fastcluster(void) {
  Py_InitModule("_sequenzo_fastcluster", _fastclusterWrapMethods);
  import_array(); // Must be present for NumPy.
}
|
|
206
|
+
|
|
207
|
+
#if HAVE_VISIBILITY
|
|
208
|
+
#pragma GCC visibility pop
|
|
209
|
+
#endif
|
|
210
|
+
|
|
211
|
+
#endif // PY_VERSION
|
|
212
|
+
|
|
213
|
+
class GIL_release
|
|
214
|
+
{
|
|
215
|
+
private:
|
|
216
|
+
// noncopyable
|
|
217
|
+
GIL_release(GIL_release const &);
|
|
218
|
+
GIL_release & operator=(GIL_release const &);
|
|
219
|
+
public:
|
|
220
|
+
inline
|
|
221
|
+
GIL_release(bool really = true)
|
|
222
|
+
: _save(really ? PyEval_SaveThread() : NULL)
|
|
223
|
+
{
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
inline
|
|
227
|
+
~GIL_release()
|
|
228
|
+
{
|
|
229
|
+
if (_save)
|
|
230
|
+
PyEval_RestoreThread(_save);
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
private:
|
|
234
|
+
PyThreadState * _save;
|
|
235
|
+
};
|
|
236
|
+
|
|
237
|
+
/*
|
|
238
|
+
Interface to Python, part 1:
|
|
239
|
+
The input is a dissimilarity matrix.
|
|
240
|
+
*/
|
|
241
|
+
|
|
242
|
+
static PyObject *linkage_wrap(PyObject * const, PyObject * const args) {
|
|
243
|
+
PyArrayObject * D, * Z;
|
|
244
|
+
long int N_ = 0;
|
|
245
|
+
unsigned char method;
|
|
246
|
+
|
|
247
|
+
try{
|
|
248
|
+
#if HAVE_DIAGNOSTIC
|
|
249
|
+
#pragma GCC diagnostic push
|
|
250
|
+
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
|
251
|
+
#endif
|
|
252
|
+
// Parse the input arguments
|
|
253
|
+
if (!PyArg_ParseTuple(args, "lO!O!b",
|
|
254
|
+
&N_, // signed long integer
|
|
255
|
+
&PyArray_Type, &D, // NumPy array
|
|
256
|
+
&PyArray_Type, &Z, // NumPy array
|
|
257
|
+
&method)) { // unsigned char
|
|
258
|
+
return NULL; // Error if the arguments have the wrong type.
|
|
259
|
+
}
|
|
260
|
+
#if HAVE_DIAGNOSTIC
|
|
261
|
+
#pragma GCC diagnostic pop
|
|
262
|
+
#endif
|
|
263
|
+
if (N_ < 1 ) {
|
|
264
|
+
// N must be at least 1.
|
|
265
|
+
PyErr_SetString(PyExc_ValueError,
|
|
266
|
+
"At least one element is needed for clustering.");
|
|
267
|
+
return NULL;
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
/*
|
|
271
|
+
(1)
|
|
272
|
+
The biggest index used below is 4*(N-2)+3, as an index to Z. This must
|
|
273
|
+
fit into the data type used for indices.
|
|
274
|
+
(2)
|
|
275
|
+
The largest representable integer, without loss of precision, by a
|
|
276
|
+
floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we
|
|
277
|
+
make sure that all cluster labels from 0 to 2N-2 in the output can be
|
|
278
|
+
accurately represented by a floating point number.
|
|
279
|
+
|
|
280
|
+
Conversion of N to 64 bits below is not really necessary but it prevents
|
|
281
|
+
a warning ("shift count >= width of type") on systems where "long int"
|
|
282
|
+
is 32 bits wide.
|
|
283
|
+
*/
|
|
284
|
+
if (N_ > MAX_INDEX/4 ||
|
|
285
|
+
static_cast<int64_t>(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) {
|
|
286
|
+
PyErr_SetString(PyExc_ValueError,
|
|
287
|
+
"Data is too big, index overflow.");
|
|
288
|
+
return NULL;
|
|
289
|
+
}
|
|
290
|
+
t_index N = static_cast<t_index>(N_);
|
|
291
|
+
|
|
292
|
+
// Allow threads!
|
|
293
|
+
GIL_release G;
|
|
294
|
+
|
|
295
|
+
t_float * const D_ = reinterpret_cast<t_float *>(PyArray_DATA(D));
|
|
296
|
+
cluster_result Z2(N-1);
|
|
297
|
+
auto_array_ptr<t_index> members;
|
|
298
|
+
// For these methods, the distance update formula needs the number of
|
|
299
|
+
// data points in a cluster.
|
|
300
|
+
if (method==METHOD_METR_AVERAGE ||
|
|
301
|
+
method==METHOD_METR_WARD ||
|
|
302
|
+
method==METHOD_METR_WARD_D2 ||
|
|
303
|
+
method==METHOD_METR_CENTROID) {
|
|
304
|
+
members.init(N, 1);
|
|
305
|
+
}
|
|
306
|
+
// Operate on squared distances for these methods.
|
|
307
|
+
if (method==METHOD_METR_WARD ||
|
|
308
|
+
method==METHOD_METR_WARD_D2 ||
|
|
309
|
+
method==METHOD_METR_CENTROID ||
|
|
310
|
+
method==METHOD_METR_MEDIAN) {
|
|
311
|
+
for (t_float * DD = D_; DD!=D_+static_cast<std::ptrdiff_t>(N)*(N-1)/2;
|
|
312
|
+
++DD)
|
|
313
|
+
*DD *= *DD;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
switch (method) {
|
|
317
|
+
case METHOD_METR_SINGLE:
|
|
318
|
+
MST_linkage_core(N, D_, Z2);
|
|
319
|
+
break;
|
|
320
|
+
case METHOD_METR_COMPLETE:
|
|
321
|
+
NN_chain_core<METHOD_METR_COMPLETE, t_index>(N, D_, NULL, Z2);
|
|
322
|
+
break;
|
|
323
|
+
case METHOD_METR_AVERAGE:
|
|
324
|
+
NN_chain_core<METHOD_METR_AVERAGE, t_index>(N, D_, members, Z2);
|
|
325
|
+
break;
|
|
326
|
+
case METHOD_METR_WEIGHTED:
|
|
327
|
+
NN_chain_core<METHOD_METR_WEIGHTED, t_index>(N, D_, NULL, Z2);
|
|
328
|
+
break;
|
|
329
|
+
case METHOD_METR_WARD:
|
|
330
|
+
NN_chain_core<METHOD_METR_WARD, t_index>(N, D_, members, Z2);
|
|
331
|
+
break;
|
|
332
|
+
case METHOD_METR_WARD_D2:
|
|
333
|
+
NN_chain_core<METHOD_METR_WARD_D2, t_index>(N, D_, members, Z2);
|
|
334
|
+
break;
|
|
335
|
+
case METHOD_METR_CENTROID:
|
|
336
|
+
generic_linkage<METHOD_METR_CENTROID, t_index>(N, D_, members, Z2);
|
|
337
|
+
break;
|
|
338
|
+
case METHOD_METR_MEDIAN:
|
|
339
|
+
generic_linkage<METHOD_METR_MEDIAN, t_index>(N, D_, NULL, Z2);
|
|
340
|
+
break;
|
|
341
|
+
default:
|
|
342
|
+
throw std::runtime_error(std::string("Invalid method index."));
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
if (method==METHOD_METR_WARD_D2 ||
|
|
346
|
+
method==METHOD_METR_CENTROID ||
|
|
347
|
+
method==METHOD_METR_MEDIAN) {
|
|
348
|
+
Z2.sqrt();
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
t_float * const Z_ = reinterpret_cast<t_float *>(PyArray_DATA(Z));
|
|
352
|
+
if (method==METHOD_METR_CENTROID ||
|
|
353
|
+
method==METHOD_METR_MEDIAN) {
|
|
354
|
+
generate_SciPy_dendrogram<true>(Z_, Z2, N);
|
|
355
|
+
}
|
|
356
|
+
else {
|
|
357
|
+
generate_SciPy_dendrogram<false>(Z_, Z2, N);
|
|
358
|
+
}
|
|
359
|
+
} // try
|
|
360
|
+
catch (const std::bad_alloc&) {
|
|
361
|
+
return PyErr_NoMemory();
|
|
362
|
+
}
|
|
363
|
+
catch(const std::exception& e){
|
|
364
|
+
PyErr_SetString(PyExc_EnvironmentError, e.what());
|
|
365
|
+
return NULL;
|
|
366
|
+
}
|
|
367
|
+
catch(const nan_error&){
|
|
368
|
+
PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value.");
|
|
369
|
+
return NULL;
|
|
370
|
+
}
|
|
371
|
+
#ifdef FE_INVALID
|
|
372
|
+
catch(const fenv_error&){
|
|
373
|
+
PyErr_SetString(PyExc_FloatingPointError,
|
|
374
|
+
"NaN dissimilarity value in intermediate results.");
|
|
375
|
+
return NULL;
|
|
376
|
+
}
|
|
377
|
+
#endif
|
|
378
|
+
catch(...){
|
|
379
|
+
PyErr_SetString(PyExc_EnvironmentError,
|
|
380
|
+
"C++ exception (unknown reason). Please send a bug report.");
|
|
381
|
+
return NULL;
|
|
382
|
+
}
|
|
383
|
+
#if HAVE_DIAGNOSTIC
|
|
384
|
+
#pragma GCC diagnostic push
|
|
385
|
+
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
|
386
|
+
#endif
|
|
387
|
+
Py_RETURN_NONE;
|
|
388
|
+
#if HAVE_DIAGNOSTIC
|
|
389
|
+
#pragma GCC diagnostic pop
|
|
390
|
+
#endif
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
/*
|
|
394
|
+
Part 2: Clustering on vector data
|
|
395
|
+
*/
|
|
396
|
+
|
|
397
|
+
/*
  Numeric identifiers for the dissimilarity metrics.

  The values are part of the interface to the Python side: they must stay in
  sync with the dictionary "mtridx" in fastcluster.py.
*/
enum metric_codes {
  // metrics selectable from Python
  METRIC_EUCLIDEAN      =  0,
  METRIC_MINKOWSKI      =  1,
  METRIC_CITYBLOCK      =  2,
  METRIC_SEUCLIDEAN     =  3,
  METRIC_SQEUCLIDEAN    =  4,
  METRIC_COSINE         =  5,
  METRIC_HAMMING        =  6,
  METRIC_JACCARD        =  7,
  METRIC_CHEBYCHEV      =  8,
  METRIC_CANBERRA       =  9,
  METRIC_BRAYCURTIS     = 10,
  METRIC_MAHALANOBIS    = 11,
  METRIC_YULE           = 12,
  METRIC_MATCHING       = 13,
  METRIC_DICE           = 14,
  METRIC_ROGERSTANIMOTO = 15,
  METRIC_RUSSELLRAO     = 16,
  METRIC_SOKALSNEATH    = 17,
  METRIC_KULSINSKI      = 18,
  METRIC_USER           = 19,
  // sentinel
  METRIC_INVALID        = 20,
  // separate function for the Jaccard metric on Boolean input data
  METRIC_JACCARD_BOOL   = 21
};
|
|
426
|
+
|
|
427
|
+
/*
|
|
428
|
+
Helper class: Throw this if calling the Python interpreter from within
|
|
429
|
+
C returned an error.
|
|
430
|
+
*/
|
|
431
|
+
// Empty tag type without any payload. The throw sites in this file either
// set a Python exception right before throwing it or throw it after a failed
// Python/C API call.
class pythonerror {};
|
|
432
|
+
|
|
433
|
+
/*
|
|
434
|
+
This class handles all the information about the dissimilarity
|
|
435
|
+
computation.
|
|
436
|
+
*/
|
|
437
|
+
|
|
438
|
+
class python_dissimilarity {
|
|
439
|
+
private:
|
|
440
|
+
t_float * Xa;
|
|
441
|
+
std::ptrdiff_t dim; // size_t saves many statis_cast<> in products
|
|
442
|
+
t_index N;
|
|
443
|
+
auto_array_ptr<t_float> Xnew;
|
|
444
|
+
t_index * members;
|
|
445
|
+
void (cluster_result::*postprocessfn) (const t_float) const;
|
|
446
|
+
t_float postprocessarg;
|
|
447
|
+
|
|
448
|
+
t_float (python_dissimilarity::*distfn) (const t_index, const t_index) const;
|
|
449
|
+
|
|
450
|
+
// for user-defined metrics
|
|
451
|
+
PyObject * X_Python;
|
|
452
|
+
PyObject * userfn;
|
|
453
|
+
|
|
454
|
+
auto_array_ptr<t_float> precomputed;
|
|
455
|
+
t_float * precomputed2;
|
|
456
|
+
|
|
457
|
+
PyArrayObject * V;
|
|
458
|
+
const t_float * V_data;
|
|
459
|
+
|
|
460
|
+
// noncopyable
|
|
461
|
+
python_dissimilarity();
|
|
462
|
+
python_dissimilarity(python_dissimilarity const &);
|
|
463
|
+
python_dissimilarity & operator=(python_dissimilarity const &);
|
|
464
|
+
|
|
465
|
+
public:
|
|
466
|
+
// Ignore warning about uninitialized member variables. I know what I am
|
|
467
|
+
// doing here, and some member variables are only used for certain metrics.
|
|
468
|
+
#if HAVE_DIAGNOSTIC
|
|
469
|
+
#pragma GCC diagnostic push
|
|
470
|
+
#pragma GCC diagnostic ignored "-Weffc++"
|
|
471
|
+
#endif
|
|
472
|
+
python_dissimilarity (PyArrayObject * const Xarg,
|
|
473
|
+
t_index * const members_,
|
|
474
|
+
const method_codes method,
|
|
475
|
+
const metric_codes metric,
|
|
476
|
+
PyObject * const extraarg,
|
|
477
|
+
bool temp_point_array)
|
|
478
|
+
: Xa(reinterpret_cast<t_float *>(PyArray_DATA(Xarg))),
|
|
479
|
+
dim(PyArray_DIM(Xarg, 1)),
|
|
480
|
+
N(static_cast<t_index>(PyArray_DIM(Xarg, 0))),
|
|
481
|
+
Xnew(temp_point_array ? (N-1)*dim : 0),
|
|
482
|
+
members(members_),
|
|
483
|
+
postprocessfn(NULL),
|
|
484
|
+
V(NULL)
|
|
485
|
+
{
|
|
486
|
+
switch (method) {
|
|
487
|
+
case METHOD_METR_SINGLE:
|
|
488
|
+
postprocessfn = NULL; // default
|
|
489
|
+
switch (metric) {
|
|
490
|
+
case METRIC_EUCLIDEAN:
|
|
491
|
+
set_euclidean();
|
|
492
|
+
break;
|
|
493
|
+
case METRIC_SEUCLIDEAN:
|
|
494
|
+
if (extraarg==NULL) {
|
|
495
|
+
PyErr_SetString(PyExc_TypeError,
|
|
496
|
+
"The 'seuclidean' metric needs a variance parameter.");
|
|
497
|
+
throw pythonerror();
|
|
498
|
+
}
|
|
499
|
+
#if HAVE_DIAGNOSTIC
|
|
500
|
+
#pragma GCC diagnostic push
|
|
501
|
+
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
|
502
|
+
#endif
|
|
503
|
+
V = reinterpret_cast<PyArrayObject *>(PyArray_FromAny(extraarg,
|
|
504
|
+
PyArray_DescrFromType(NPY_DOUBLE),
|
|
505
|
+
1, 1,
|
|
506
|
+
NPY_ARRAY_CARRAY_RO,
|
|
507
|
+
NULL));
|
|
508
|
+
#if HAVE_DIAGNOSTIC
|
|
509
|
+
#pragma GCC diagnostic pop
|
|
510
|
+
#endif
|
|
511
|
+
if (PyErr_Occurred()) {
|
|
512
|
+
throw pythonerror();
|
|
513
|
+
}
|
|
514
|
+
if (PyArray_DIM(V, 0)!=dim) {
|
|
515
|
+
PyErr_SetString(PyExc_ValueError,
|
|
516
|
+
"The variance vector must have the same dimensionality as the data.");
|
|
517
|
+
throw pythonerror();
|
|
518
|
+
}
|
|
519
|
+
V_data = reinterpret_cast<t_float *>(PyArray_DATA(V));
|
|
520
|
+
distfn = &python_dissimilarity::seuclidean;
|
|
521
|
+
postprocessfn = &cluster_result::sqrt;
|
|
522
|
+
break;
|
|
523
|
+
case METRIC_SQEUCLIDEAN:
|
|
524
|
+
distfn = &python_dissimilarity::sqeuclidean<false>;
|
|
525
|
+
break;
|
|
526
|
+
case METRIC_CITYBLOCK:
|
|
527
|
+
set_cityblock();
|
|
528
|
+
break;
|
|
529
|
+
case METRIC_CHEBYCHEV:
|
|
530
|
+
set_chebychev();
|
|
531
|
+
break;
|
|
532
|
+
case METRIC_MINKOWSKI:
|
|
533
|
+
set_minkowski(extraarg);
|
|
534
|
+
break;
|
|
535
|
+
case METRIC_COSINE:
|
|
536
|
+
distfn = &python_dissimilarity::cosine;
|
|
537
|
+
postprocessfn = &cluster_result::plusone;
|
|
538
|
+
// precompute norms
|
|
539
|
+
precomputed.init(N);
|
|
540
|
+
for (t_index i=0; i<N; ++i) {
|
|
541
|
+
t_float sum=0;
|
|
542
|
+
for (t_index k=0; k<dim; ++k) {
|
|
543
|
+
sum += X(i,k)*X(i,k);
|
|
544
|
+
}
|
|
545
|
+
precomputed[i] = 1/std::sqrt(sum);
|
|
546
|
+
}
|
|
547
|
+
break;
|
|
548
|
+
case METRIC_HAMMING:
|
|
549
|
+
distfn = &python_dissimilarity::hamming;
|
|
550
|
+
postprocessfn = &cluster_result::divide;
|
|
551
|
+
postprocessarg = static_cast<t_float>(dim);
|
|
552
|
+
break;
|
|
553
|
+
case METRIC_JACCARD:
|
|
554
|
+
distfn = &python_dissimilarity::jaccard;
|
|
555
|
+
break;
|
|
556
|
+
case METRIC_CANBERRA:
|
|
557
|
+
distfn = &python_dissimilarity::canberra;
|
|
558
|
+
break;
|
|
559
|
+
case METRIC_BRAYCURTIS:
|
|
560
|
+
distfn = &python_dissimilarity::braycurtis;
|
|
561
|
+
break;
|
|
562
|
+
case METRIC_MAHALANOBIS:
|
|
563
|
+
if (extraarg==NULL) {
|
|
564
|
+
PyErr_SetString(PyExc_TypeError,
|
|
565
|
+
"The 'mahalanobis' metric needs a parameter for the inverse covariance.");
|
|
566
|
+
throw pythonerror();
|
|
567
|
+
}
|
|
568
|
+
#if HAVE_DIAGNOSTIC
|
|
569
|
+
#pragma GCC diagnostic push
|
|
570
|
+
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
|
571
|
+
#endif
|
|
572
|
+
V = reinterpret_cast<PyArrayObject *>(PyArray_FromAny(extraarg,
|
|
573
|
+
PyArray_DescrFromType(NPY_DOUBLE),
|
|
574
|
+
2, 2,
|
|
575
|
+
NPY_ARRAY_CARRAY_RO,
|
|
576
|
+
NULL));
|
|
577
|
+
#if HAVE_DIAGNOSTIC
|
|
578
|
+
#pragma GCC diagnostic pop
|
|
579
|
+
#endif
|
|
580
|
+
if (PyErr_Occurred()) {
|
|
581
|
+
throw pythonerror();
|
|
582
|
+
}
|
|
583
|
+
if (PyArray_DIM(V, 0)!=N || PyArray_DIM(V, 1)!=dim) {
|
|
584
|
+
PyErr_SetString(PyExc_ValueError,
|
|
585
|
+
"The inverse covariance matrix has the wrong size.");
|
|
586
|
+
throw pythonerror();
|
|
587
|
+
}
|
|
588
|
+
V_data = reinterpret_cast<t_float *>(PyArray_DATA(V));
|
|
589
|
+
distfn = &python_dissimilarity::mahalanobis;
|
|
590
|
+
postprocessfn = &cluster_result::sqrt;
|
|
591
|
+
break;
|
|
592
|
+
case METRIC_YULE:
|
|
593
|
+
distfn = &python_dissimilarity::yule;
|
|
594
|
+
break;
|
|
595
|
+
case METRIC_MATCHING:
|
|
596
|
+
distfn = &python_dissimilarity::matching;
|
|
597
|
+
postprocessfn = &cluster_result::divide;
|
|
598
|
+
postprocessarg = static_cast<t_float>(dim);
|
|
599
|
+
break;
|
|
600
|
+
case METRIC_DICE:
|
|
601
|
+
distfn = &python_dissimilarity::dice;
|
|
602
|
+
break;
|
|
603
|
+
case METRIC_ROGERSTANIMOTO:
|
|
604
|
+
distfn = &python_dissimilarity::rogerstanimoto;
|
|
605
|
+
break;
|
|
606
|
+
case METRIC_RUSSELLRAO:
|
|
607
|
+
distfn = &python_dissimilarity::russellrao;
|
|
608
|
+
postprocessfn = &cluster_result::divide;
|
|
609
|
+
postprocessarg = static_cast<t_float>(dim);
|
|
610
|
+
break;
|
|
611
|
+
case METRIC_SOKALSNEATH:
|
|
612
|
+
distfn = &python_dissimilarity::sokalsneath;
|
|
613
|
+
break;
|
|
614
|
+
case METRIC_KULSINSKI:
|
|
615
|
+
distfn = &python_dissimilarity::kulsinski;
|
|
616
|
+
postprocessfn = &cluster_result::plusone;
|
|
617
|
+
precomputed.init(N);
|
|
618
|
+
for (t_index i=0; i<N; ++i) {
|
|
619
|
+
t_index sum=0;
|
|
620
|
+
for (t_index k=0; k<dim; ++k) {
|
|
621
|
+
sum += Xb(i,k);
|
|
622
|
+
}
|
|
623
|
+
precomputed[i] = -.5/static_cast<t_float>(sum);
|
|
624
|
+
}
|
|
625
|
+
break;
|
|
626
|
+
case METRIC_USER:
|
|
627
|
+
X_Python = reinterpret_cast<PyObject *>(Xarg);
|
|
628
|
+
this->userfn = extraarg;
|
|
629
|
+
distfn = &python_dissimilarity::user;
|
|
630
|
+
break;
|
|
631
|
+
default: // case METRIC_JACCARD_BOOL:
|
|
632
|
+
distfn = &python_dissimilarity::jaccard_bool;
|
|
633
|
+
}
|
|
634
|
+
break;
|
|
635
|
+
|
|
636
|
+
case METHOD_METR_WARD:
|
|
637
|
+
postprocessfn = &cluster_result::sqrtward;
|
|
638
|
+
break;
|
|
639
|
+
|
|
640
|
+
case METHOD_METR_WARD_D2:
|
|
641
|
+
postprocessfn = &cluster_result::sqrtdouble;
|
|
642
|
+
break;
|
|
643
|
+
|
|
644
|
+
default:
|
|
645
|
+
postprocessfn = &cluster_result::sqrt;
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
#if HAVE_DIAGNOSTIC
|
|
649
|
+
#pragma GCC diagnostic pop
|
|
650
|
+
#endif
|
|
651
|
+
|
|
652
|
+
~python_dissimilarity() {
|
|
653
|
+
#if HAVE_DIAGNOSTIC
|
|
654
|
+
#pragma GCC diagnostic push
|
|
655
|
+
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
|
656
|
+
#endif
|
|
657
|
+
Py_XDECREF(V);
|
|
658
|
+
#if HAVE_DIAGNOSTIC
|
|
659
|
+
#pragma GCC diagnostic pop
|
|
660
|
+
#endif
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
inline t_float operator () (const t_index i, const t_index j) const {
|
|
664
|
+
return (this->*distfn)(i,j);
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
inline t_float X (const t_index i, const t_index j) const {
|
|
668
|
+
return Xa[i*dim+j];
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
inline bool Xb (const t_index i, const t_index j) const {
|
|
672
|
+
return reinterpret_cast<bool *>(Xa)[i*dim+j];
|
|
673
|
+
}
|
|
674
|
+
|
|
675
|
+
inline t_float * Xptr(const t_index i, const t_index j) const {
|
|
676
|
+
return Xa+i*dim+j;
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
void merge(const t_index i, const t_index j, const t_index newnode) const {
|
|
680
|
+
t_float const * const Pi = i<N ? Xa+i*dim : Xnew+(i-N)*dim;
|
|
681
|
+
t_float const * const Pj = j<N ? Xa+j*dim : Xnew+(j-N)*dim;
|
|
682
|
+
for(t_index k=0; k<dim; ++k) {
|
|
683
|
+
Xnew[(newnode-N)*dim+k] = (Pi[k]*static_cast<t_float>(members[i]) +
|
|
684
|
+
Pj[k]*static_cast<t_float>(members[j])) /
|
|
685
|
+
static_cast<t_float>(members[i]+members[j]);
|
|
686
|
+
}
|
|
687
|
+
members[newnode] = members[i]+members[j];
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
void merge_weighted(const t_index i, const t_index j, const t_index newnode)
|
|
691
|
+
const {
|
|
692
|
+
t_float const * const Pi = i<N ? Xa+i*dim : Xnew+(i-N)*dim;
|
|
693
|
+
t_float const * const Pj = j<N ? Xa+j*dim : Xnew+(j-N)*dim;
|
|
694
|
+
for(t_index k=0; k<dim; ++k) {
|
|
695
|
+
Xnew[(newnode-N)*dim+k] = (Pi[k]+Pj[k])*.5;
|
|
696
|
+
}
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
void merge_inplace(const t_index i, const t_index j) const {
|
|
700
|
+
t_float const * const Pi = Xa+i*dim;
|
|
701
|
+
t_float * const Pj = Xa+j*dim;
|
|
702
|
+
for(t_index k=0; k<dim; ++k) {
|
|
703
|
+
Pj[k] = (Pi[k]*static_cast<t_float>(members[i]) +
|
|
704
|
+
Pj[k]*static_cast<t_float>(members[j])) /
|
|
705
|
+
static_cast<t_float>(members[i]+members[j]);
|
|
706
|
+
}
|
|
707
|
+
members[j] += members[i];
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
void merge_inplace_weighted(const t_index i, const t_index j) const {
|
|
711
|
+
t_float const * const Pi = Xa+i*dim;
|
|
712
|
+
t_float * const Pj = Xa+j*dim;
|
|
713
|
+
for(t_index k=0; k<dim; ++k) {
|
|
714
|
+
Pj[k] = (Pi[k]+Pj[k])*.5;
|
|
715
|
+
}
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
void postprocess(cluster_result & Z2) const {
|
|
719
|
+
if (postprocessfn!=NULL) {
|
|
720
|
+
(Z2.*postprocessfn)(postprocessarg);
|
|
721
|
+
}
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
inline t_float ward(const t_index i, const t_index j) const {
|
|
725
|
+
t_float mi = static_cast<t_float>(members[i]);
|
|
726
|
+
t_float mj = static_cast<t_float>(members[j]);
|
|
727
|
+
return sqeuclidean<true>(i,j)*mi*mj/(mi+mj);
|
|
728
|
+
}
|
|
729
|
+
|
|
730
|
+
inline t_float ward_initial(const t_index i, const t_index j) const {
|
|
731
|
+
// alias for sqeuclidean
|
|
732
|
+
// Factor 2!!!
|
|
733
|
+
return sqeuclidean<true>(i,j);
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
// This method must not produce NaN if the input is non-NaN.
|
|
737
|
+
inline static t_float ward_initial_conversion(const t_float min) {
|
|
738
|
+
return min*.5;
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
inline t_float ward_extended(const t_index i, const t_index j) const {
|
|
742
|
+
t_float mi = static_cast<t_float>(members[i]);
|
|
743
|
+
t_float mj = static_cast<t_float>(members[j]);
|
|
744
|
+
return sqeuclidean_extended(i,j)*mi*mj/(mi+mj);
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
/* We need two variants of the Euclidean metric: one that does not check
|
|
748
|
+
for a NaN result, which is used for the initial distances, and one which
|
|
749
|
+
does, for the updated distances during the clustering procedure.
|
|
750
|
+
*/
|
|
751
|
+
template <const bool check_NaN>
|
|
752
|
+
t_float sqeuclidean(const t_index i, const t_index j) const {
|
|
753
|
+
t_float sum = 0;
|
|
754
|
+
/*
|
|
755
|
+
for (t_index k=0; k<dim; ++k) {
|
|
756
|
+
t_float diff = X(i,k) - X(j,k);
|
|
757
|
+
sum += diff*diff;
|
|
758
|
+
}
|
|
759
|
+
*/
|
|
760
|
+
// faster
|
|
761
|
+
t_float const * Pi = Xa+i*dim;
|
|
762
|
+
t_float const * Pj = Xa+j*dim;
|
|
763
|
+
for (t_index k=0; k<dim; ++k) {
|
|
764
|
+
t_float diff = Pi[k] - Pj[k];
|
|
765
|
+
sum += diff*diff;
|
|
766
|
+
}
|
|
767
|
+
if (check_NaN) {
|
|
768
|
+
#if HAVE_DIAGNOSTIC
|
|
769
|
+
#pragma GCC diagnostic push
|
|
770
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
771
|
+
#endif
|
|
772
|
+
if (fc_isnan(sum))
|
|
773
|
+
#if HAVE_DIAGNOSTIC
|
|
774
|
+
#pragma GCC diagnostic pop
|
|
775
|
+
#endif
|
|
776
|
+
throw(nan_error());
|
|
777
|
+
}
|
|
778
|
+
return sum;
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
t_float sqeuclidean_extended(const t_index i, const t_index j) const {
|
|
782
|
+
t_float sum = 0;
|
|
783
|
+
t_float const * Pi = i<N ? Xa+i*dim : Xnew+(i-N)*dim; // TBD
|
|
784
|
+
t_float const * Pj = j<N ? Xa+j*dim : Xnew+(j-N)*dim;
|
|
785
|
+
for (t_index k=0; k<dim; ++k) {
|
|
786
|
+
t_float diff = Pi[k] - Pj[k];
|
|
787
|
+
sum += diff*diff;
|
|
788
|
+
}
|
|
789
|
+
#if HAVE_DIAGNOSTIC
|
|
790
|
+
#pragma GCC diagnostic push
|
|
791
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
792
|
+
#endif
|
|
793
|
+
if (fc_isnan(sum))
|
|
794
|
+
throw(nan_error());
|
|
795
|
+
#if HAVE_DIAGNOSTIC
|
|
796
|
+
#pragma GCC diagnostic pop
|
|
797
|
+
#endif
|
|
798
|
+
return sum;
|
|
799
|
+
}
|
|
800
|
+
|
|
801
|
+
private:
|
|
802
|
+
void set_minkowski(PyObject * extraarg) {
|
|
803
|
+
if (extraarg==NULL) {
|
|
804
|
+
PyErr_SetString(PyExc_TypeError,
|
|
805
|
+
"The Minkowski metric needs a parameter.");
|
|
806
|
+
throw pythonerror();
|
|
807
|
+
}
|
|
808
|
+
postprocessarg = PyFloat_AsDouble(extraarg);
|
|
809
|
+
if (PyErr_Occurred()) {
|
|
810
|
+
throw pythonerror();
|
|
811
|
+
}
|
|
812
|
+
|
|
813
|
+
#if HAVE_DIAGNOSTIC
|
|
814
|
+
#pragma GCC diagnostic push
|
|
815
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
816
|
+
#endif
|
|
817
|
+
if (postprocessarg==std::numeric_limits<t_float>::infinity()) {
|
|
818
|
+
set_chebychev();
|
|
819
|
+
}
|
|
820
|
+
else if (postprocessarg==1.0){
|
|
821
|
+
set_cityblock();
|
|
822
|
+
}
|
|
823
|
+
else if (postprocessarg==2.0){
|
|
824
|
+
set_euclidean();
|
|
825
|
+
}
|
|
826
|
+
else {
|
|
827
|
+
distfn = &python_dissimilarity::minkowski;
|
|
828
|
+
postprocessfn = &cluster_result::power;
|
|
829
|
+
}
|
|
830
|
+
#if HAVE_DIAGNOSTIC
|
|
831
|
+
#pragma GCC diagnostic pop
|
|
832
|
+
#endif
|
|
833
|
+
}
|
|
834
|
+
|
|
835
|
+
void set_euclidean() {
|
|
836
|
+
distfn = &python_dissimilarity::sqeuclidean<false>;
|
|
837
|
+
postprocessfn = &cluster_result::sqrt;
|
|
838
|
+
}
|
|
839
|
+
|
|
840
|
+
void set_cityblock() {
|
|
841
|
+
distfn = &python_dissimilarity::cityblock;
|
|
842
|
+
}
|
|
843
|
+
|
|
844
|
+
void set_chebychev() {
|
|
845
|
+
distfn = &python_dissimilarity::chebychev;
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
t_float seuclidean(const t_index i, const t_index j) const {
|
|
849
|
+
t_float sum = 0;
|
|
850
|
+
for (t_index k=0; k<dim; ++k) {
|
|
851
|
+
t_float diff = X(i,k)-X(j,k);
|
|
852
|
+
sum += diff*diff/V_data[k];
|
|
853
|
+
}
|
|
854
|
+
return sum;
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
t_float cityblock(const t_index i, const t_index j) const {
|
|
858
|
+
t_float sum = 0;
|
|
859
|
+
for (t_index k=0; k<dim; ++k) {
|
|
860
|
+
sum += std::abs(X(i,k)-X(j,k));
|
|
861
|
+
}
|
|
862
|
+
return sum;
|
|
863
|
+
}
|
|
864
|
+
|
|
865
|
+
t_float minkowski(const t_index i, const t_index j) const {
|
|
866
|
+
t_float sum = 0;
|
|
867
|
+
for (t_index k=0; k<dim; ++k) {
|
|
868
|
+
sum += std::pow(std::abs(X(i,k)-X(j,k)),postprocessarg);
|
|
869
|
+
}
|
|
870
|
+
return sum;
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
t_float chebychev(const t_index i, const t_index j) const {
|
|
874
|
+
t_float max = 0;
|
|
875
|
+
for (t_index k=0; k<dim; ++k) {
|
|
876
|
+
t_float diff = std::abs(X(i,k)-X(j,k));
|
|
877
|
+
if (diff>max) {
|
|
878
|
+
max = diff;
|
|
879
|
+
}
|
|
880
|
+
}
|
|
881
|
+
return max;
|
|
882
|
+
}
|
|
883
|
+
|
|
884
|
+
t_float cosine(const t_index i, const t_index j) const {
|
|
885
|
+
t_float sum = 0;
|
|
886
|
+
for (t_index k=0; k<dim; ++k) {
|
|
887
|
+
sum -= X(i,k)*X(j,k);
|
|
888
|
+
}
|
|
889
|
+
return sum*precomputed[i]*precomputed[j];
|
|
890
|
+
}
|
|
891
|
+
|
|
892
|
+
t_float hamming(const t_index i, const t_index j) const {
|
|
893
|
+
t_float sum = 0;
|
|
894
|
+
for (t_index k=0; k<dim; ++k) {
|
|
895
|
+
#if HAVE_DIAGNOSTIC
|
|
896
|
+
#pragma GCC diagnostic push
|
|
897
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
898
|
+
#endif
|
|
899
|
+
sum += (X(i,k)!=X(j,k));
|
|
900
|
+
#if HAVE_DIAGNOSTIC
|
|
901
|
+
#pragma GCC diagnostic pop
|
|
902
|
+
#endif
|
|
903
|
+
}
|
|
904
|
+
return sum;
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
// Differs from scipy.spatial.distance: equal vectors correctly
|
|
908
|
+
// return distance 0.
|
|
909
|
+
t_float jaccard(const t_index i, const t_index j) const {
|
|
910
|
+
t_index sum1 = 0;
|
|
911
|
+
t_index sum2 = 0;
|
|
912
|
+
for (t_index k=0; k<dim; ++k) {
|
|
913
|
+
#if HAVE_DIAGNOSTIC
|
|
914
|
+
#pragma GCC diagnostic push
|
|
915
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
916
|
+
#endif
|
|
917
|
+
sum1 += (X(i,k)!=X(j,k));
|
|
918
|
+
sum2 += ((X(i,k)!=0) || (X(j,k)!=0));
|
|
919
|
+
#if HAVE_DIAGNOSTIC
|
|
920
|
+
#pragma GCC diagnostic pop
|
|
921
|
+
#endif
|
|
922
|
+
}
|
|
923
|
+
return sum1==0 ? 0 : static_cast<t_float>(sum1) / static_cast<t_float>(sum2);
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
t_float canberra(const t_index i, const t_index j) const {
|
|
927
|
+
t_float sum = 0;
|
|
928
|
+
for (t_index k=0; k<dim; ++k) {
|
|
929
|
+
t_float numerator = std::abs(X(i,k)-X(j,k));
|
|
930
|
+
#if HAVE_DIAGNOSTIC
|
|
931
|
+
#pragma GCC diagnostic push
|
|
932
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
933
|
+
#endif
|
|
934
|
+
sum += numerator==0 ? 0 : numerator / (std::abs(X(i,k)) + std::abs(X(j,k)));
|
|
935
|
+
#if HAVE_DIAGNOSTIC
|
|
936
|
+
#pragma GCC diagnostic pop
|
|
937
|
+
#endif
|
|
938
|
+
}
|
|
939
|
+
return sum;
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
t_float user(const t_index i, const t_index j) const {
|
|
943
|
+
#if HAVE_DIAGNOSTIC
|
|
944
|
+
#pragma GCC diagnostic push
|
|
945
|
+
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
|
946
|
+
#endif
|
|
947
|
+
PyObject * u = PySequence_ITEM(X_Python, i);
|
|
948
|
+
PyObject * v = PySequence_ITEM(X_Python, j);
|
|
949
|
+
PyObject * result = PyObject_CallFunctionObjArgs(userfn, u, v, NULL);
|
|
950
|
+
Py_DECREF(u);
|
|
951
|
+
Py_DECREF(v);
|
|
952
|
+
#if HAVE_DIAGNOSTIC
|
|
953
|
+
#pragma GCC diagnostic pop
|
|
954
|
+
#endif
|
|
955
|
+
if (result==NULL) {
|
|
956
|
+
throw pythonerror();
|
|
957
|
+
}
|
|
958
|
+
#if HAVE_DIAGNOSTIC
|
|
959
|
+
#pragma GCC diagnostic push
|
|
960
|
+
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
|
961
|
+
#endif
|
|
962
|
+
const t_float C_result = PyFloat_AsDouble(result);
|
|
963
|
+
Py_DECREF(result);
|
|
964
|
+
#if HAVE_DIAGNOSTIC
|
|
965
|
+
#pragma GCC diagnostic pop
|
|
966
|
+
#endif
|
|
967
|
+
if (PyErr_Occurred()) {
|
|
968
|
+
throw pythonerror();
|
|
969
|
+
}
|
|
970
|
+
return C_result;
|
|
971
|
+
}
|
|
972
|
+
|
|
973
|
+
t_float braycurtis(const t_index i, const t_index j) const {
|
|
974
|
+
t_float sum1 = 0;
|
|
975
|
+
t_float sum2 = 0;
|
|
976
|
+
for (t_index k=0; k<dim; ++k) {
|
|
977
|
+
sum1 += std::abs(X(i,k)-X(j,k));
|
|
978
|
+
sum2 += std::abs(X(i,k)+X(j,k));
|
|
979
|
+
}
|
|
980
|
+
return sum1/sum2;
|
|
981
|
+
}
|
|
982
|
+
|
|
983
|
+
t_float mahalanobis(const t_index i, const t_index j) const {
|
|
984
|
+
// V_data contains the product X*VI
|
|
985
|
+
t_float sum = 0;
|
|
986
|
+
for (t_index k=0; k<dim; ++k) {
|
|
987
|
+
sum += (V_data[i*dim+k]-V_data[j*dim+k])*(X(i,k)-X(j,k));
|
|
988
|
+
}
|
|
989
|
+
return sum;
|
|
990
|
+
}
|
|
991
|
+
|
|
992
|
+
t_index mutable NTT; // 'local' variables
|
|
993
|
+
t_index mutable NXO;
|
|
994
|
+
t_index mutable NTF;
|
|
995
|
+
#define NTFFT NTF
|
|
996
|
+
#define NFFTT NTT
|
|
997
|
+
|
|
998
|
+
void nbool_correspond(const t_index i, const t_index j) const {
|
|
999
|
+
NTT = 0;
|
|
1000
|
+
NXO = 0;
|
|
1001
|
+
for (t_index k=0; k<dim; ++k) {
|
|
1002
|
+
NTT += (Xb(i,k) & Xb(j,k)) ;
|
|
1003
|
+
NXO += (Xb(i,k) ^ Xb(j,k)) ;
|
|
1004
|
+
}
|
|
1005
|
+
}
|
|
1006
|
+
|
|
1007
|
+
void nbool_correspond_tfft(const t_index i, const t_index j) const {
|
|
1008
|
+
NTT = 0;
|
|
1009
|
+
NXO = 0;
|
|
1010
|
+
NTF = 0;
|
|
1011
|
+
for (t_index k=0; k<dim; ++k) {
|
|
1012
|
+
NTT += (Xb(i,k) & Xb(j,k)) ;
|
|
1013
|
+
NXO += (Xb(i,k) ^ Xb(j,k)) ;
|
|
1014
|
+
NTF += (Xb(i,k) & !Xb(j,k)) ;
|
|
1015
|
+
}
|
|
1016
|
+
NTF *= (NXO-NTF); // NTFFT
|
|
1017
|
+
NTT *= (static_cast<t_index>(dim)-NTT-NXO); // NFFTT
|
|
1018
|
+
}
|
|
1019
|
+
|
|
1020
|
+
void nbool_correspond_xo(const t_index i, const t_index j) const {
|
|
1021
|
+
NXO = 0;
|
|
1022
|
+
for (t_index k=0; k<dim; ++k) {
|
|
1023
|
+
NXO += (Xb(i,k) ^ Xb(j,k)) ;
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
1026
|
+
|
|
1027
|
+
void nbool_correspond_tt(const t_index i, const t_index j) const {
|
|
1028
|
+
NTT = 0;
|
|
1029
|
+
for (t_index k=0; k<dim; ++k) {
|
|
1030
|
+
NTT += (Xb(i,k) & Xb(j,k)) ;
|
|
1031
|
+
}
|
|
1032
|
+
}
|
|
1033
|
+
|
|
1034
|
+
t_float yule(const t_index i, const t_index j) const {
|
|
1035
|
+
nbool_correspond_tfft(i, j);
|
|
1036
|
+
return (NTFFT==0) ? 0 :
|
|
1037
|
+
static_cast<t_float>(2*NTFFT) / static_cast<t_float>(NTFFT + NFFTT);
|
|
1038
|
+
}
|
|
1039
|
+
|
|
1040
|
+
// Prevent a zero denominator for equal vectors.
|
|
1041
|
+
t_float dice(const t_index i, const t_index j) const {
|
|
1042
|
+
nbool_correspond(i, j);
|
|
1043
|
+
return (NXO==0) ? 0 :
|
|
1044
|
+
static_cast<t_float>(NXO) / static_cast<t_float>(NXO+2*NTT);
|
|
1045
|
+
}
|
|
1046
|
+
|
|
1047
|
+
t_float rogerstanimoto(const t_index i, const t_index j) const {
|
|
1048
|
+
nbool_correspond_xo(i, j);
|
|
1049
|
+
return static_cast<t_float>(2*NXO) / static_cast<t_float>(NXO+dim);
|
|
1050
|
+
}
|
|
1051
|
+
|
|
1052
|
+
t_float russellrao(const t_index i, const t_index j) const {
|
|
1053
|
+
nbool_correspond_tt(i, j);
|
|
1054
|
+
return static_cast<t_float>(dim-NTT);
|
|
1055
|
+
}
|
|
1056
|
+
|
|
1057
|
+
// Prevent a zero denominator for equal vectors.
|
|
1058
|
+
t_float sokalsneath(const t_index i, const t_index j) const {
|
|
1059
|
+
nbool_correspond(i, j);
|
|
1060
|
+
return (NXO==0) ? 0 :
|
|
1061
|
+
static_cast<t_float>(2*NXO) / static_cast<t_float>(NTT+2*NXO);
|
|
1062
|
+
}
|
|
1063
|
+
|
|
1064
|
+
t_float kulsinski(const t_index i, const t_index j) const {
|
|
1065
|
+
nbool_correspond_tt(i, j);
|
|
1066
|
+
return static_cast<t_float>(NTT) * (precomputed[i] + precomputed[j]);
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
// 'matching' distance = Hamming distance
|
|
1070
|
+
t_float matching(const t_index i, const t_index j) const {
|
|
1071
|
+
nbool_correspond_xo(i, j);
|
|
1072
|
+
return static_cast<t_float>(NXO);
|
|
1073
|
+
}
|
|
1074
|
+
|
|
1075
|
+
// Prevent a zero denominator for equal vectors.
|
|
1076
|
+
t_float jaccard_bool(const t_index i, const t_index j) const {
|
|
1077
|
+
nbool_correspond(i, j);
|
|
1078
|
+
return (NXO==0) ? 0 :
|
|
1079
|
+
static_cast<t_float>(NXO) / static_cast<t_float>(NXO+NTT);
|
|
1080
|
+
}
|
|
1081
|
+
};
|
|
1082
|
+
|
|
1083
|
+
static PyObject *linkage_vector_wrap(PyObject * const, PyObject * const args) {
|
|
1084
|
+
PyArrayObject * X, * Z;
|
|
1085
|
+
unsigned char method, metric;
|
|
1086
|
+
PyObject * extraarg;
|
|
1087
|
+
|
|
1088
|
+
try{
|
|
1089
|
+
// Parse the input arguments
|
|
1090
|
+
#if HAVE_DIAGNOSTIC
|
|
1091
|
+
#pragma GCC diagnostic push
|
|
1092
|
+
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
|
1093
|
+
#endif
|
|
1094
|
+
if (!PyArg_ParseTuple(args, "O!O!bbO",
|
|
1095
|
+
&PyArray_Type, &X, // NumPy array
|
|
1096
|
+
&PyArray_Type, &Z, // NumPy array
|
|
1097
|
+
&method, // unsigned char
|
|
1098
|
+
&metric, // unsigned char
|
|
1099
|
+
&extraarg )) { // Python object
|
|
1100
|
+
return NULL;
|
|
1101
|
+
}
|
|
1102
|
+
#if HAVE_DIAGNOSTIC
|
|
1103
|
+
#pragma GCC diagnostic pop
|
|
1104
|
+
#endif
|
|
1105
|
+
|
|
1106
|
+
if (PyArray_NDIM(X) != 2) {
|
|
1107
|
+
PyErr_SetString(PyExc_ValueError,
|
|
1108
|
+
"The input array must be two-dimensional.");
|
|
1109
|
+
}
|
|
1110
|
+
npy_intp const N_ = PyArray_DIM(X, 0);
|
|
1111
|
+
if (N_ < 1 ) {
|
|
1112
|
+
// N must be at least 1.
|
|
1113
|
+
PyErr_SetString(PyExc_ValueError,
|
|
1114
|
+
"At least one element is needed for clustering.");
|
|
1115
|
+
return NULL;
|
|
1116
|
+
}
|
|
1117
|
+
|
|
1118
|
+
npy_intp const dim = PyArray_DIM(X, 1);
|
|
1119
|
+
if (dim < 1 ) {
|
|
1120
|
+
PyErr_SetString(PyExc_ValueError,
|
|
1121
|
+
"Invalid dimension of the data set.");
|
|
1122
|
+
return NULL;
|
|
1123
|
+
}
|
|
1124
|
+
|
|
1125
|
+
/*
|
|
1126
|
+
(1)
|
|
1127
|
+
The biggest index used below is 4*(N-2)+3, as an index to Z. This must
|
|
1128
|
+
fit into the data type used for indices.
|
|
1129
|
+
(2)
|
|
1130
|
+
The largest representable integer, without loss of precision, by a
|
|
1131
|
+
floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we
|
|
1132
|
+
make sure that all cluster labels from 0 to 2N-2 in the output can be
|
|
1133
|
+
accurately represented by a floating point number.
|
|
1134
|
+
|
|
1135
|
+
Conversion of N to 64 bits below is not really necessary but it prevents
|
|
1136
|
+
a warning ("shift count >= width of type") on systems where "int" is 32
|
|
1137
|
+
bits wide.
|
|
1138
|
+
*/
|
|
1139
|
+
if (N_ > MAX_INDEX/4 || dim > MAX_INDEX ||
|
|
1140
|
+
static_cast<int64_t>(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) {
|
|
1141
|
+
PyErr_SetString(PyExc_ValueError,
|
|
1142
|
+
"Data is too big, index overflow.");
|
|
1143
|
+
return NULL;
|
|
1144
|
+
}
|
|
1145
|
+
t_index N = static_cast<t_index>(N_);
|
|
1146
|
+
|
|
1147
|
+
cluster_result Z2(N-1);
|
|
1148
|
+
|
|
1149
|
+
auto_array_ptr<t_index> members;
|
|
1150
|
+
if (method==METHOD_METR_WARD || method==METHOD_METR_WARD_D2 || method==METHOD_METR_CENTROID) {
|
|
1151
|
+
members.init(2*N-1, 1);
|
|
1152
|
+
}
|
|
1153
|
+
|
|
1154
|
+
if ((method!=METHOD_METR_SINGLE && metric!=METRIC_EUCLIDEAN) ||
|
|
1155
|
+
metric>=METRIC_INVALID) {
|
|
1156
|
+
PyErr_SetString(PyExc_IndexError, "Invalid metric index.");
|
|
1157
|
+
return NULL;
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1160
|
+
if (PyArray_ISBOOL(X)) {
|
|
1161
|
+
if (metric==METRIC_HAMMING) {
|
|
1162
|
+
metric = METRIC_MATCHING; // Alias
|
|
1163
|
+
}
|
|
1164
|
+
if (metric==METRIC_JACCARD) {
|
|
1165
|
+
metric = METRIC_JACCARD_BOOL;
|
|
1166
|
+
}
|
|
1167
|
+
}
|
|
1168
|
+
|
|
1169
|
+
if (extraarg!=Py_None &&
|
|
1170
|
+
metric!=METRIC_MINKOWSKI &&
|
|
1171
|
+
metric!=METRIC_SEUCLIDEAN &&
|
|
1172
|
+
metric!=METRIC_MAHALANOBIS &&
|
|
1173
|
+
metric!=METRIC_USER) {
|
|
1174
|
+
PyErr_SetString(PyExc_TypeError,
|
|
1175
|
+
"No extra parameter is allowed for this metric.");
|
|
1176
|
+
return NULL;
|
|
1177
|
+
}
|
|
1178
|
+
|
|
1179
|
+
/* temp_point_array must be true if the alternative algorithm
|
|
1180
|
+
is used below (currently for the centroid and median methods). */
|
|
1181
|
+
bool temp_point_array = (method==METHOD_METR_CENTROID ||
|
|
1182
|
+
method==METHOD_METR_MEDIAN);
|
|
1183
|
+
|
|
1184
|
+
python_dissimilarity dist(X, members, static_cast<method_codes>(method),
|
|
1185
|
+
static_cast<metric_codes>(metric), extraarg,
|
|
1186
|
+
temp_point_array);
|
|
1187
|
+
|
|
1188
|
+
if (method!=METHOD_METR_SINGLE &&
|
|
1189
|
+
method!=METHOD_METR_WARD &&
|
|
1190
|
+
method!=METHOD_METR_WARD_D2 &&
|
|
1191
|
+
method!=METHOD_METR_CENTROID &&
|
|
1192
|
+
method!=METHOD_METR_MEDIAN) {
|
|
1193
|
+
PyErr_SetString(PyExc_IndexError, "Invalid method index.");
|
|
1194
|
+
return NULL;
|
|
1195
|
+
}
|
|
1196
|
+
|
|
1197
|
+
// Allow threads if the metric is not "user"!
|
|
1198
|
+
GIL_release G(metric!=METRIC_USER);
|
|
1199
|
+
|
|
1200
|
+
switch (method) {
|
|
1201
|
+
case METHOD_METR_SINGLE:
|
|
1202
|
+
MST_linkage_core_vector(N, dist, Z2);
|
|
1203
|
+
break;
|
|
1204
|
+
case METHOD_METR_WARD:
|
|
1205
|
+
generic_linkage_vector<METHOD_VECTOR_WARD>(N, dist, Z2);
|
|
1206
|
+
break;
|
|
1207
|
+
case METHOD_METR_WARD_D2:
|
|
1208
|
+
generic_linkage_vector<METHOD_VECTOR_WARD_D2>(N, dist, Z2);
|
|
1209
|
+
break;
|
|
1210
|
+
case METHOD_METR_CENTROID:
|
|
1211
|
+
generic_linkage_vector_alternative<METHOD_VECTOR_CENTROID>(N, dist, Z2);
|
|
1212
|
+
break;
|
|
1213
|
+
default: // case METHOD_METR_MEDIAN:
|
|
1214
|
+
generic_linkage_vector_alternative<METHOD_VECTOR_MEDIAN>(N, dist, Z2);
|
|
1215
|
+
}
|
|
1216
|
+
|
|
1217
|
+
if (method==METHOD_METR_WARD ||
|
|
1218
|
+
method==METHOD_METR_WARD_D2 ||
|
|
1219
|
+
method==METHOD_METR_CENTROID) {
|
|
1220
|
+
members.free();
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
dist.postprocess(Z2);
|
|
1224
|
+
|
|
1225
|
+
t_float * const Z_ = reinterpret_cast<t_float *>(PyArray_DATA(Z));
|
|
1226
|
+
if (method!=METHOD_METR_SINGLE) {
|
|
1227
|
+
generate_SciPy_dendrogram<true>(Z_, Z2, N);
|
|
1228
|
+
}
|
|
1229
|
+
else {
|
|
1230
|
+
generate_SciPy_dendrogram<false>(Z_, Z2, N);
|
|
1231
|
+
}
|
|
1232
|
+
} // try
|
|
1233
|
+
catch (const std::bad_alloc&) {
|
|
1234
|
+
return PyErr_NoMemory();
|
|
1235
|
+
}
|
|
1236
|
+
catch(const std::exception& e){
|
|
1237
|
+
PyErr_SetString(PyExc_EnvironmentError, e.what());
|
|
1238
|
+
return NULL;
|
|
1239
|
+
}
|
|
1240
|
+
catch(const nan_error&){
|
|
1241
|
+
PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value.");
|
|
1242
|
+
return NULL;
|
|
1243
|
+
}
|
|
1244
|
+
catch(const pythonerror){
|
|
1245
|
+
return NULL;
|
|
1246
|
+
}
|
|
1247
|
+
catch(...){
|
|
1248
|
+
PyErr_SetString(PyExc_EnvironmentError,
|
|
1249
|
+
"C++ exception (unknown reason). Please send a bug report.");
|
|
1250
|
+
return NULL;
|
|
1251
|
+
}
|
|
1252
|
+
#if HAVE_DIAGNOSTIC
|
|
1253
|
+
#pragma GCC diagnostic push
|
|
1254
|
+
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
|
1255
|
+
#endif
|
|
1256
|
+
Py_RETURN_NONE;
|
|
1257
|
+
#if HAVE_DIAGNOSTIC
|
|
1258
|
+
#pragma GCC diagnostic pop
|
|
1259
|
+
#endif
|
|
1260
|
+
}
|
|
1261
|
+
|
|
1262
|
+
#if HAVE_VISIBILITY
|
|
1263
|
+
#pragma GCC visibility pop
|
|
1264
|
+
#endif
|