sequenzo 0.1.24__cp311-cp311-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sequenzo might be problematic. Click here for more details.
- _sequenzo_fastcluster.cpython-311-darwin.so +0 -0
- sequenzo/__init__.py +240 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +474 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +178 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1256 -0
- sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
- sequenzo/clustering/src/KMedoid.cpp +263 -0
- sequenzo/clustering/src/PAM.cpp +237 -0
- sequenzo/clustering/src/PAMonce.cpp +265 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +20 -0
- sequenzo/data_preprocessing/helpers.py +256 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_family.csv +1867 -0
- sequenzo/datasets/polyadic_samplec1.csv +61 -0
- sequenzo/datasets/polyadic_samplep1.csv +61 -0
- sequenzo/datasets/polyadic_seqc1.csv +61 -0
- sequenzo/datasets/polyadic_seqp1.csv +61 -0
- sequenzo/define_sequence_data.py +609 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +34 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +431 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +89 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +43 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
- sequenzo/prefix_tree/system_level_indicators.py +465 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +48 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
- sequenzo/suffix_tree/system_level_indicators.py +456 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +194 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +404 -0
- sequenzo/visualization/plot_sequence_index.py +951 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +627 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.24.dist-info/METADATA +255 -0
- sequenzo-0.1.24.dist-info/RECORD +264 -0
- sequenzo-0.1.24.dist-info/WHEEL +5 -0
- sequenzo-0.1.24.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.24.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1877 @@
|
|
|
1
|
+
/*
|
|
2
|
+
fastcluster: Fast hierarchical clustering routines for R and Python
|
|
3
|
+
|
|
4
|
+
Copyright:
|
|
5
|
+
* Until package version 1.1.23: © 2011 Daniel Müllner <https://danifold.net>
|
|
6
|
+
* All changes from version 1.1.24 on: © Google Inc. <https://www.google.com>
|
|
7
|
+
|
|
8
|
+
This library implements various fast algorithms for hierarchical,
|
|
9
|
+
agglomerative clustering methods:
|
|
10
|
+
|
|
11
|
+
(1) Algorithms for the "stored matrix approach": the input is the array of
|
|
12
|
+
pairwise dissimilarities.
|
|
13
|
+
|
|
14
|
+
MST_linkage_core: single linkage clustering with the "minimum spanning
|
|
15
|
+
tree algorithm" (Rohlfs)
|
|
16
|
+
|
|
17
|
+
NN_chain_core: nearest-neighbor-chain algorithm, suitable for single,
|
|
18
|
+
complete, average, weighted and Ward linkage (Murtagh)
|
|
19
|
+
|
|
20
|
+
generic_linkage: generic algorithm, suitable for all distance update
|
|
21
|
+
formulas (Müllner)
|
|
22
|
+
|
|
23
|
+
(2) Algorithms for the "stored data approach": the input is a set of points in a
|
|
24
|
+
vector space.
|
|
25
|
+
|
|
26
|
+
MST_linkage_core_vector: single linkage clustering for vector data
|
|
27
|
+
|
|
28
|
+
generic_linkage_vector: generic algorithm for vector data, suitable for
|
|
29
|
+
the Ward, centroid and median methods.
|
|
30
|
+
|
|
31
|
+
generic_linkage_vector_alternative: alternative scheme for updating the
|
|
32
|
+
nearest neighbors. This method seems faster than "generic_linkage_vector"
|
|
33
|
+
for the centroid and median methods but slower for the Ward method.
|
|
34
|
+
|
|
35
|
+
All these implementations treat infinity values correctly. They throw an
|
|
36
|
+
exception if a NaN distance value occurs.
|
|
37
|
+
*/
|
|
38
|
+
|
|
39
|
+
// Older versions of Microsoft Visual Studio do not have the fenv header.
|
|
40
|
+
#ifdef _MSC_VER
|
|
41
|
+
#if (_MSC_VER == 1500 || _MSC_VER == 1600)
|
|
42
|
+
#define NO_INCLUDE_FENV
|
|
43
|
+
#endif
|
|
44
|
+
#endif
|
|
45
|
+
// NaN detection via fenv might not work on systems with software
|
|
46
|
+
// floating-point emulation (bug report for Debian armel).
|
|
47
|
+
#ifdef __SOFTFP__
|
|
48
|
+
#define NO_INCLUDE_FENV
|
|
49
|
+
#endif
|
|
50
|
+
#ifdef NO_INCLUDE_FENV
|
|
51
|
+
#pragma message("Do not use fenv header.")
|
|
52
|
+
#else
|
|
53
|
+
#pragma message("Use fenv header.")
|
|
54
|
+
/* The following #pragma is necessary even if it generates a warning in many
|
|
55
|
+
compilers. Quoting https://en.cppreference.com/w/cpp/numeric/fenv:
|
|
56
|
+
"The floating-point environment access and modification is only meaningful
|
|
57
|
+
when #pragma STDC FENV_ACCESS is supported and is set to ON. [...]
|
|
58
|
+
In practice, few current compilers, such as HP aCC, Oracle Studio, or IBM XL,
|
|
59
|
+
support the #pragma explicitly, but most compilers allow meaningful access
|
|
60
|
+
to the floating-point environment anyway."
|
|
61
|
+
*/
|
|
62
|
+
#pragma STDC FENV_ACCESS ON
// The directive below was previously spelled "messag", which is not a known
// pragma, so the hint about the FENV_ACCESS warning was never printed.
#pragma message("If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.")
#include <fenv.h>
|
|
65
|
+
#endif
|
|
66
|
+
|
|
67
|
+
#include <cmath> // for std::pow, std::sqrt
|
|
68
|
+
#include <cstddef> // for std::ptrdiff_t
|
|
69
|
+
#include <limits> // for std::numeric_limits<...>::infinity()
|
|
70
|
+
#include <algorithm> // for std::fill_n
|
|
71
|
+
#include <stdexcept> // for std::runtime_error
|
|
72
|
+
#include <string> // for std::string
|
|
73
|
+
|
|
74
|
+
#include <cfloat> // also for DBL_MAX, DBL_MIN
|
|
75
|
+
#ifndef DBL_MANT_DIG
|
|
76
|
+
#error The constant DBL_MANT_DIG could not be defined.
|
|
77
|
+
#endif
|
|
78
|
+
#define T_FLOAT_MANT_DIG DBL_MANT_DIG
|
|
79
|
+
|
|
80
|
+
#ifndef LONG_MAX
|
|
81
|
+
#include <climits>
|
|
82
|
+
#endif
|
|
83
|
+
#ifndef LONG_MAX
|
|
84
|
+
#error The constant LONG_MAX could not be defined.
|
|
85
|
+
#endif
|
|
86
|
+
#ifndef INT_MAX
|
|
87
|
+
#error The constant INT_MAX could not be defined.
|
|
88
|
+
#endif
|
|
89
|
+
|
|
90
|
+
#ifndef INT32_MAX
|
|
91
|
+
#ifdef _MSC_VER
|
|
92
|
+
#if _MSC_VER >= 1600
|
|
93
|
+
#define __STDC_LIMIT_MACROS
|
|
94
|
+
#include <stdint.h>
|
|
95
|
+
#else
|
|
96
|
+
typedef __int32 int_fast32_t;
|
|
97
|
+
typedef __int64 int64_t;
|
|
98
|
+
#endif
|
|
99
|
+
#else
|
|
100
|
+
#define __STDC_LIMIT_MACROS
|
|
101
|
+
#include <stdint.h>
|
|
102
|
+
#endif
|
|
103
|
+
#endif
|
|
104
|
+
|
|
105
|
+
#define FILL_N std::fill_n
|
|
106
|
+
#ifdef _MSC_VER
|
|
107
|
+
#if _MSC_VER < 1600
|
|
108
|
+
#undef FILL_N
|
|
109
|
+
#define FILL_N stdext::unchecked_fill_n
|
|
110
|
+
#endif
|
|
111
|
+
#endif
|
|
112
|
+
|
|
113
|
+
// Suppress warnings about (potentially) uninitialized variables.
|
|
114
|
+
#ifdef _MSC_VER
|
|
115
|
+
#pragma warning (disable:4700)
|
|
116
|
+
#endif
|
|
117
|
+
|
|
118
|
+
#ifndef HAVE_DIAGNOSTIC
|
|
119
|
+
#if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
|
|
120
|
+
#define HAVE_DIAGNOSTIC 1
|
|
121
|
+
#endif
|
|
122
|
+
#endif
|
|
123
|
+
|
|
124
|
+
#ifndef HAVE_VISIBILITY
|
|
125
|
+
#if __GNUC__ >= 4
|
|
126
|
+
#define HAVE_VISIBILITY 1
|
|
127
|
+
#endif
|
|
128
|
+
#endif
|
|
129
|
+
|
|
130
|
+
/* Since the public interface is given by the Python respectively R interface,
|
|
131
|
+
* we do not want other symbols than the interface initialization routines to be
|
|
132
|
+
* visible in the shared object file. The "visibility" switch is a GCC concept.
|
|
133
|
+
* Hiding symbols keeps the relocation table small and decreases startup time.
|
|
134
|
+
* See http://gcc.gnu.org/wiki/Visibility
|
|
135
|
+
*/
|
|
136
|
+
#if HAVE_VISIBILITY
|
|
137
|
+
#pragma GCC visibility push(hidden)
|
|
138
|
+
#endif
|
|
139
|
+
|
|
140
|
+
typedef int_fast32_t t_index;
|
|
141
|
+
#ifndef INT32_MAX
|
|
142
|
+
#define MAX_INDEX 0x7fffffffL
|
|
143
|
+
#else
|
|
144
|
+
#define MAX_INDEX INT32_MAX
|
|
145
|
+
#endif
|
|
146
|
+
#if (LONG_MAX < MAX_INDEX)
|
|
147
|
+
#error The integer format "t_index" must not have a greater range than "long int".
|
|
148
|
+
#endif
|
|
149
|
+
#if (INT_MAX > MAX_INDEX)
|
|
150
|
+
#error The integer format "int" must not have a greater range than "t_index".
|
|
151
|
+
#endif
|
|
152
|
+
typedef double t_float;
|
|
153
|
+
|
|
154
|
+
/* Method codes.
|
|
155
|
+
|
|
156
|
+
These codes must agree with the METHODS array in fastcluster.R and the
|
|
157
|
+
dictionary mthidx in fastcluster.py.
|
|
158
|
+
*/
|
|
159
|
+
enum method_codes {
  // Smallest and largest code of this enumeration.
  MIN_METHOD_CODE      = 0,
  MAX_METHOD_CODE      = 7,

  // non-Euclidean methods
  METHOD_METR_SINGLE   = MIN_METHOD_CODE,
  METHOD_METR_COMPLETE = 1,
  METHOD_METR_AVERAGE  = 2,
  METHOD_METR_WEIGHTED = 3,
  METHOD_METR_WARD     = 4,
  METHOD_METR_WARD_D   = METHOD_METR_WARD, // second name for the same code
  METHOD_METR_CENTROID = 5,
  METHOD_METR_MEDIAN   = 6,
  METHOD_METR_WARD_D2  = MAX_METHOD_CODE
};
|
|
174
|
+
|
|
175
|
+
enum method_codes_vector {
  // Euclidean methods, listed in numeric order of their codes
  METHOD_VECTOR_SINGLE   = 0,
  METHOD_VECTOR_WARD     = 1,
  METHOD_VECTOR_CENTROID = 2,
  METHOD_VECTOR_MEDIAN   = 3,
  METHOD_VECTOR_WARD_D2  = 4,

  MIN_METHOD_VECTOR_CODE = METHOD_VECTOR_SINGLE,
  // NOTE(review): the upper bound is 3 although METHOD_VECTOR_WARD_D2 is 4.
  // Any range check against this bound would reject WARD_D2 -- confirm
  // whether that is intended before changing the value.
  MAX_METHOD_VECTOR_CODE = 3
};
|
|
186
|
+
|
|
187
|
+
// Self-destructing array pointer: holds an array obtained with new[] and
// releases it with delete[] in the destructor. Copying is disabled by
// declaring the copy constructor and copy assignment private and leaving
// them undefined.
template <typename type>
class auto_array_ptr{
private:
  type * ptr;  // owned array, or NULL when nothing is held
  auto_array_ptr(auto_array_ptr const &); // non construction-copyable
  auto_array_ptr& operator=(auto_array_ptr const &); // non copyable
public:
  // Create an empty holder (no allocation).
  auto_array_ptr()
    : ptr(NULL)
  { }
  // Allocate an array of "size" elements; the elements are not assigned here.
  template <typename index>
  auto_array_ptr(index const size)
    : ptr(new type[size])
  { }
  // Allocate an array of "size" elements and set each of them to "val".
  template <typename index, typename value>
  auto_array_ptr(index const size, value const val)
    : ptr(new type[size])
  {
    FILL_N(ptr, size, val);
  }
  ~auto_array_ptr() {
    delete [] ptr; }
  // Release the array now and return to the empty state. Calling this
  // repeatedly is harmless because delete[] on NULL does nothing.
  void free() {
    delete [] ptr;
    ptr = NULL;
  }
  // (Re)allocate an array of "size" elements. An array that is already held
  // is released instead of being leaked. The new array is allocated first so
  // that the old pointer stays valid if the allocation throws.
  template <typename index>
  void init(index const size) {
    type * const fresh = new type [size];
    delete [] ptr;
    ptr = fresh;
  }
  // (Re)allocate an array of "size" elements and set each of them to "val".
  template <typename index, typename value>
  void init(index const size, value const val) {
    init(size);
    FILL_N(ptr, size, val);
  }
  // Implicit conversion to the raw pointer, so the object can be indexed and
  // used in pointer arithmetic like a plain array.
  inline operator type *() const { return ptr; }
};
|
|
225
|
+
|
|
226
|
+
struct node {
|
|
227
|
+
t_index node1, node2;
|
|
228
|
+
t_float dist;
|
|
229
|
+
};
|
|
230
|
+
|
|
231
|
+
inline bool operator< (const node a, const node b) {
|
|
232
|
+
return (a.dist < b.dist);
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
class cluster_result {
|
|
236
|
+
private:
|
|
237
|
+
auto_array_ptr<node> Z;
|
|
238
|
+
t_index pos;
|
|
239
|
+
|
|
240
|
+
public:
|
|
241
|
+
cluster_result(const t_index size)
|
|
242
|
+
: Z(size)
|
|
243
|
+
, pos(0)
|
|
244
|
+
{}
|
|
245
|
+
|
|
246
|
+
void append(const t_index node1, const t_index node2, const t_float dist) {
|
|
247
|
+
Z[pos].node1 = node1;
|
|
248
|
+
Z[pos].node2 = node2;
|
|
249
|
+
Z[pos].dist = dist;
|
|
250
|
+
++pos;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
node * operator[] (const t_index idx) const { return Z + idx; }
|
|
254
|
+
|
|
255
|
+
/* Define several methods to postprocess the distances. All these functions
|
|
256
|
+
are monotone, so they do not change the sorted order of distances. */
|
|
257
|
+
|
|
258
|
+
void sqrt() const {
|
|
259
|
+
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
|
|
260
|
+
ZZ->dist = std::sqrt(ZZ->dist);
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
void sqrt(const t_float) const { // ignore the argument
|
|
265
|
+
sqrt();
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
void sqrtdouble(const t_float) const { // ignore the argument
|
|
269
|
+
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
|
|
270
|
+
ZZ->dist = std::sqrt(2*ZZ->dist);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
void sqrtward(const t_float) const { // ignore the argument
|
|
275
|
+
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
|
|
276
|
+
ZZ->dist = 2*ZZ->dist;
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
#ifdef R_pow
|
|
281
|
+
#define my_pow R_pow
|
|
282
|
+
#else
|
|
283
|
+
#define my_pow std::pow
|
|
284
|
+
#endif
|
|
285
|
+
|
|
286
|
+
void power(const t_float p) const {
|
|
287
|
+
t_float const q = 1/p;
|
|
288
|
+
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
|
|
289
|
+
ZZ->dist = my_pow(ZZ->dist,q);
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
void plusone(const t_float) const { // ignore the argument
|
|
294
|
+
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
|
|
295
|
+
ZZ->dist += 1;
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
void divide(const t_float denom) const {
|
|
300
|
+
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
|
|
301
|
+
ZZ->dist /= denom;
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
};
|
|
305
|
+
|
|
306
|
+
class doubly_linked_list {
|
|
307
|
+
/*
|
|
308
|
+
Class for a doubly linked list. Initially, the list is the integer range
|
|
309
|
+
[0, size]. We provide a forward iterator and a method to delete an index
|
|
310
|
+
from the list.
|
|
311
|
+
|
|
312
|
+
Typical use: for (i=L.start; L<size; i=L.succ[I])
|
|
313
|
+
or
|
|
314
|
+
for (i=somevalue; L<size; i=L.succ[I])
|
|
315
|
+
*/
|
|
316
|
+
public:
|
|
317
|
+
t_index start;
|
|
318
|
+
auto_array_ptr<t_index> succ;
|
|
319
|
+
|
|
320
|
+
private:
|
|
321
|
+
auto_array_ptr<t_index> pred;
|
|
322
|
+
// Not necessarily private, we just do not need it in this instance.
|
|
323
|
+
|
|
324
|
+
public:
|
|
325
|
+
doubly_linked_list(const t_index size)
|
|
326
|
+
// Initialize to the given size.
|
|
327
|
+
: start(0)
|
|
328
|
+
, succ(size+1)
|
|
329
|
+
, pred(size+1)
|
|
330
|
+
{
|
|
331
|
+
for (t_index i=0; i<size; ++i) {
|
|
332
|
+
pred[i+1] = i;
|
|
333
|
+
succ[i] = i+1;
|
|
334
|
+
}
|
|
335
|
+
// pred[0] is never accessed!
|
|
336
|
+
//succ[size] is never accessed!
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
~doubly_linked_list() {}
|
|
340
|
+
|
|
341
|
+
void remove(const t_index idx) {
|
|
342
|
+
// Remove an index from the list.
|
|
343
|
+
if (idx==start) {
|
|
344
|
+
start = succ[idx];
|
|
345
|
+
}
|
|
346
|
+
else {
|
|
347
|
+
succ[pred[idx]] = succ[idx];
|
|
348
|
+
pred[succ[idx]] = pred[idx];
|
|
349
|
+
}
|
|
350
|
+
succ[idx] = 0; // Mark as inactive
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
bool is_inactive(t_index idx) const {
|
|
354
|
+
return (succ[idx]==0);
|
|
355
|
+
}
|
|
356
|
+
};
|
|
357
|
+
|
|
358
|
+
// Indexing functions
|
|
359
|
+
// D is the upper triangular part of a symmetric (NxN)-matrix
|
|
360
|
+
// We require r_ < c_ !
|
|
361
|
+
#define D_(r_,c_) ( D[(static_cast<std::ptrdiff_t>(2*N-3-(r_))*(r_)>>1)+(c_)-1] )
|
|
362
|
+
// Z is an ((N-1)x4)-array
|
|
363
|
+
#define Z_(_r, _c) (Z[(_r)*4 + (_c)])
|
|
364
|
+
|
|
365
|
+
/*
|
|
366
|
+
Lookup function for a union-find data structure.
|
|
367
|
+
|
|
368
|
+
The function finds the root of idx by going iteratively through all
|
|
369
|
+
parent elements until a root is found. An element i is a root if
|
|
370
|
+
parent[i] is zero. To make subsequent searches faster, the entry for
|
|
371
|
+
idx and all its parents is updated with the root element.
|
|
372
|
+
*/
|
|
373
|
+
class union_find {
|
|
374
|
+
private:
|
|
375
|
+
auto_array_ptr<t_index> parent;
|
|
376
|
+
t_index nextparent;
|
|
377
|
+
|
|
378
|
+
public:
|
|
379
|
+
union_find(const t_index size)
|
|
380
|
+
: parent(size>0 ? 2*size-1 : 0, 0)
|
|
381
|
+
, nextparent(size)
|
|
382
|
+
{ }
|
|
383
|
+
|
|
384
|
+
t_index Find (t_index idx) const {
|
|
385
|
+
if (parent[idx] != 0 ) { // a → b
|
|
386
|
+
t_index p = idx;
|
|
387
|
+
idx = parent[idx];
|
|
388
|
+
if (parent[idx] != 0 ) { // a → b → c
|
|
389
|
+
do {
|
|
390
|
+
idx = parent[idx];
|
|
391
|
+
} while (parent[idx] != 0);
|
|
392
|
+
do {
|
|
393
|
+
t_index tmp = parent[p];
|
|
394
|
+
parent[p] = idx;
|
|
395
|
+
p = tmp;
|
|
396
|
+
} while (parent[p] != idx);
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
return idx;
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
void Union (const t_index node1, const t_index node2) {
|
|
403
|
+
parent[node1] = parent[node2] = nextparent++;
|
|
404
|
+
}
|
|
405
|
+
};
|
|
406
|
+
|
|
407
|
+
// Empty tag type thrown when a NaN is found among the dissimilarities.
class nan_error {};
#ifdef FE_INVALID
// Empty tag type thrown when the floating-point environment reports
// FE_INVALID (or when clearing that flag fails).
class fenv_error {};
#endif
|
|
411
|
+
|
|
412
|
+
static void MST_linkage_core(const t_index N, const t_float * const D,
|
|
413
|
+
cluster_result & Z2) {
|
|
414
|
+
/*
|
|
415
|
+
N: integer, number of data points
|
|
416
|
+
D: condensed distance matrix N*(N-1)/2
|
|
417
|
+
Z2: output data structure
|
|
418
|
+
|
|
419
|
+
The basis of this algorithm is an algorithm by Rohlf:
|
|
420
|
+
|
|
421
|
+
F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
|
|
422
|
+
The Computer Journal, vol. 16, 1973, p. 93–95.
|
|
423
|
+
*/
|
|
424
|
+
t_index i;
|
|
425
|
+
t_index idx2;
|
|
426
|
+
doubly_linked_list active_nodes(N);
|
|
427
|
+
auto_array_ptr<t_float> d(N);
|
|
428
|
+
|
|
429
|
+
t_index prev_node;
|
|
430
|
+
t_float min;
|
|
431
|
+
|
|
432
|
+
// first iteration
|
|
433
|
+
idx2 = 1;
|
|
434
|
+
min = std::numeric_limits<t_float>::infinity();
|
|
435
|
+
for (i=1; i<N; ++i) {
|
|
436
|
+
d[i] = D[i-1];
|
|
437
|
+
#if HAVE_DIAGNOSTIC
|
|
438
|
+
#pragma GCC diagnostic push
|
|
439
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
440
|
+
#endif
|
|
441
|
+
if (d[i] < min) {
|
|
442
|
+
min = d[i];
|
|
443
|
+
idx2 = i;
|
|
444
|
+
}
|
|
445
|
+
else if (fc_isnan(d[i]))
|
|
446
|
+
throw (nan_error());
|
|
447
|
+
#if HAVE_DIAGNOSTIC
|
|
448
|
+
#pragma GCC diagnostic pop
|
|
449
|
+
#endif
|
|
450
|
+
}
|
|
451
|
+
Z2.append(0, idx2, min);
|
|
452
|
+
|
|
453
|
+
for (t_index j=1; j<N-1; ++j) {
|
|
454
|
+
prev_node = idx2;
|
|
455
|
+
active_nodes.remove(prev_node);
|
|
456
|
+
|
|
457
|
+
idx2 = active_nodes.succ[0];
|
|
458
|
+
min = d[idx2];
|
|
459
|
+
for (i=idx2; i<prev_node; i=active_nodes.succ[i]) {
|
|
460
|
+
t_float tmp = D_(i, prev_node);
|
|
461
|
+
#if HAVE_DIAGNOSTIC
|
|
462
|
+
#pragma GCC diagnostic push
|
|
463
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
464
|
+
#endif
|
|
465
|
+
if (tmp < d[i])
|
|
466
|
+
d[i] = tmp;
|
|
467
|
+
else if (fc_isnan(tmp))
|
|
468
|
+
throw (nan_error());
|
|
469
|
+
#if HAVE_DIAGNOSTIC
|
|
470
|
+
#pragma GCC diagnostic pop
|
|
471
|
+
#endif
|
|
472
|
+
if (d[i] < min) {
|
|
473
|
+
min = d[i];
|
|
474
|
+
idx2 = i;
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
for (; i<N; i=active_nodes.succ[i]) {
|
|
478
|
+
t_float tmp = D_(prev_node, i);
|
|
479
|
+
#if HAVE_DIAGNOSTIC
|
|
480
|
+
#pragma GCC diagnostic push
|
|
481
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
482
|
+
#endif
|
|
483
|
+
if (d[i] > tmp)
|
|
484
|
+
d[i] = tmp;
|
|
485
|
+
else if (fc_isnan(tmp))
|
|
486
|
+
throw (nan_error());
|
|
487
|
+
#if HAVE_DIAGNOSTIC
|
|
488
|
+
#pragma GCC diagnostic pop
|
|
489
|
+
#endif
|
|
490
|
+
if (d[i] < min) {
|
|
491
|
+
min = d[i];
|
|
492
|
+
idx2 = i;
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
Z2.append(prev_node, idx2, min);
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
/* Functions for the update of the dissimilarity array */
|
|
500
|
+
|
|
501
|
+
// Single-linkage update: keep the smaller of *b and a in *b.
inline static void f_single( t_float * const b, const t_float a ) {
  if (a < *b) {
    *b = a;
  }
}
|
|
504
|
+
// Complete-linkage update: keep the larger of *b and a in *b.
inline static void f_complete( t_float * const b, const t_float a ) {
  if (a > *b) {
    *b = a;
  }
}
|
|
507
|
+
// Average-linkage update: *b becomes the weighted sum s*a + t*(*b).
// When FE_INVALID is not available, a NaN result raises nan_error here.
inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) {
  t_float & cell = *b;
  cell = s*a + t*cell;
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  if (fc_isnan(cell)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
|
|
522
|
+
// Weighted-linkage update: *b becomes the plain mean of a and *b.
// When FE_INVALID is not available, a NaN result raises nan_error here.
inline static void f_weighted( t_float * const b, const t_float a) {
  t_float & cell = *b;
  cell = (a+cell)*.5;
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  if (fc_isnan(cell)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
|
|
537
|
+
// Ward update: *b becomes ((v+s)*a - v*c + (v+t)*(*b)) / (s+t+v).
// An algebraically rearranged form of the same update is
//   a + (*b) - (t*a + s*(*b) + v*c)/(s+t+v).
// When FE_INVALID is not available, a NaN result raises nan_error here.
inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) {
  t_float & cell = *b;
  cell = ( (v+s)*a - v*c + (v+t)*cell ) / (s+t+v);
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  if (fc_isnan(cell)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
|
|
553
|
+
// Centroid update: *b becomes s*a - stc + t*(*b).
// When FE_INVALID is not available, a NaN result raises nan_error here.
//
// The diagnostic push/ignored pair below was missing: the function only had
// the closing "#pragma GCC diagnostic pop", which pops a state that was never
// pushed. It now brackets the NaN test in the same way as the other update
// functions in this file.
inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) {
  *b = s*a - stc + t*(*b);
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  if (fc_isnan(*b)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
|
|
564
|
+
// Median update: *b becomes the mean of a and *b, minus c_4.
// When FE_INVALID is not available, a NaN result raises nan_error here.
inline static void f_median( t_float * const b, const t_float a, const t_float c_4) {
  t_float & cell = *b;
  cell = (a+cell)*.5 - c_4;
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  if (fc_isnan(cell)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
|
|
579
|
+
|
|
580
|
+
template <method_codes method, typename t_members>
|
|
581
|
+
static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
|
|
582
|
+
/*
|
|
583
|
+
N: integer
|
|
584
|
+
D: condensed distance matrix N*(N-1)/2
|
|
585
|
+
Z2: output data structure
|
|
586
|
+
|
|
587
|
+
This is the NN-chain algorithm, described on page 86 in the following book:
|
|
588
|
+
|
|
589
|
+
Fionn Murtagh, Multidimensional Clustering Algorithms,
|
|
590
|
+
Vienna, Würzburg: Physica-Verlag, 1985.
|
|
591
|
+
*/
|
|
592
|
+
t_index i;
|
|
593
|
+
|
|
594
|
+
auto_array_ptr<t_index> NN_chain(N);
|
|
595
|
+
t_index NN_chain_tip = 0;
|
|
596
|
+
|
|
597
|
+
t_index idx1, idx2;
|
|
598
|
+
|
|
599
|
+
t_float size1, size2;
|
|
600
|
+
doubly_linked_list active_nodes(N);
|
|
601
|
+
|
|
602
|
+
t_float min;
|
|
603
|
+
|
|
604
|
+
for (t_float const * DD=D; DD!=D+(static_cast<std::ptrdiff_t>(N)*(N-1)>>1);
|
|
605
|
+
++DD) {
|
|
606
|
+
#if HAVE_DIAGNOSTIC
|
|
607
|
+
#pragma GCC diagnostic push
|
|
608
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
609
|
+
#endif
|
|
610
|
+
if (fc_isnan(*DD)) {
|
|
611
|
+
throw(nan_error());
|
|
612
|
+
}
|
|
613
|
+
#if HAVE_DIAGNOSTIC
|
|
614
|
+
#pragma GCC diagnostic pop
|
|
615
|
+
#endif
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
#ifdef FE_INVALID
|
|
619
|
+
if (feclearexcept(FE_INVALID)) throw fenv_error();
|
|
620
|
+
#endif
|
|
621
|
+
|
|
622
|
+
for (t_index j=0; j<N-1; ++j) {
|
|
623
|
+
if (NN_chain_tip <= 3) {
|
|
624
|
+
NN_chain[0] = idx1 = active_nodes.start;
|
|
625
|
+
NN_chain_tip = 1;
|
|
626
|
+
|
|
627
|
+
idx2 = active_nodes.succ[idx1];
|
|
628
|
+
min = D_(idx1,idx2);
|
|
629
|
+
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
|
|
630
|
+
if (D_(idx1,i) < min) {
|
|
631
|
+
min = D_(idx1,i);
|
|
632
|
+
idx2 = i;
|
|
633
|
+
}
|
|
634
|
+
}
|
|
635
|
+
} // a: idx1 b: idx2
|
|
636
|
+
else {
|
|
637
|
+
NN_chain_tip -= 3;
|
|
638
|
+
idx1 = NN_chain[NN_chain_tip-1];
|
|
639
|
+
idx2 = NN_chain[NN_chain_tip];
|
|
640
|
+
min = idx1<idx2 ? D_(idx1,idx2) : D_(idx2,idx1);
|
|
641
|
+
} // a: idx1 b: idx2
|
|
642
|
+
|
|
643
|
+
do {
|
|
644
|
+
NN_chain[NN_chain_tip] = idx2;
|
|
645
|
+
|
|
646
|
+
for (i=active_nodes.start; i<idx2; i=active_nodes.succ[i]) {
|
|
647
|
+
if (D_(i,idx2) < min) {
|
|
648
|
+
min = D_(i,idx2);
|
|
649
|
+
idx1 = i;
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
|
|
653
|
+
if (D_(idx2,i) < min) {
|
|
654
|
+
min = D_(idx2,i);
|
|
655
|
+
idx1 = i;
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
idx2 = idx1;
|
|
660
|
+
idx1 = NN_chain[NN_chain_tip++];
|
|
661
|
+
|
|
662
|
+
} while (idx2 != NN_chain[NN_chain_tip-2]);
|
|
663
|
+
|
|
664
|
+
Z2.append(idx1, idx2, min);
|
|
665
|
+
|
|
666
|
+
if (idx1>idx2) {
|
|
667
|
+
t_index tmp = idx1;
|
|
668
|
+
idx1 = idx2;
|
|
669
|
+
idx2 = tmp;
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
if (method==METHOD_METR_AVERAGE ||
|
|
673
|
+
method==METHOD_METR_WARD ||
|
|
674
|
+
method==METHOD_METR_WARD_D2) {
|
|
675
|
+
size1 = static_cast<t_float>(members[idx1]);
|
|
676
|
+
size2 = static_cast<t_float>(members[idx2]);
|
|
677
|
+
members[idx2] += members[idx1];
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
// Remove the smaller index from the valid indices (active_nodes).
|
|
681
|
+
active_nodes.remove(idx1);
|
|
682
|
+
|
|
683
|
+
switch (method) {
|
|
684
|
+
case METHOD_METR_SINGLE:
|
|
685
|
+
/*
|
|
686
|
+
Single linkage.
|
|
687
|
+
|
|
688
|
+
Characteristic: new distances are never longer than the old distances.
|
|
689
|
+
*/
|
|
690
|
+
// Update the distance matrix in the range [start, idx1).
|
|
691
|
+
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
|
|
692
|
+
f_single(&D_(i, idx2), D_(i, idx1) );
|
|
693
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
694
|
+
for (; i<idx2; i=active_nodes.succ[i])
|
|
695
|
+
f_single(&D_(i, idx2), D_(idx1, i) );
|
|
696
|
+
// Update the distance matrix in the range (idx2, N).
|
|
697
|
+
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
|
|
698
|
+
f_single(&D_(idx2, i), D_(idx1, i) );
|
|
699
|
+
break;
|
|
700
|
+
|
|
701
|
+
case METHOD_METR_COMPLETE:
|
|
702
|
+
/*
|
|
703
|
+
Complete linkage.
|
|
704
|
+
|
|
705
|
+
Characteristic: new distances are never shorter than the old distances.
|
|
706
|
+
*/
|
|
707
|
+
// Update the distance matrix in the range [start, idx1).
|
|
708
|
+
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
|
|
709
|
+
f_complete(&D_(i, idx2), D_(i, idx1) );
|
|
710
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
711
|
+
for (; i<idx2; i=active_nodes.succ[i])
|
|
712
|
+
f_complete(&D_(i, idx2), D_(idx1, i) );
|
|
713
|
+
// Update the distance matrix in the range (idx2, N).
|
|
714
|
+
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
|
|
715
|
+
f_complete(&D_(idx2, i), D_(idx1, i) );
|
|
716
|
+
break;
|
|
717
|
+
|
|
718
|
+
case METHOD_METR_AVERAGE: {
|
|
719
|
+
/*
|
|
720
|
+
Average linkage.
|
|
721
|
+
|
|
722
|
+
Shorter and longer distances can occur.
|
|
723
|
+
*/
|
|
724
|
+
// Update the distance matrix in the range [start, idx1).
|
|
725
|
+
t_float s = size1/(size1+size2);
|
|
726
|
+
t_float t = size2/(size1+size2);
|
|
727
|
+
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
|
|
728
|
+
f_average(&D_(i, idx2), D_(i, idx1), s, t );
|
|
729
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
730
|
+
for (; i<idx2; i=active_nodes.succ[i])
|
|
731
|
+
f_average(&D_(i, idx2), D_(idx1, i), s, t );
|
|
732
|
+
// Update the distance matrix in the range (idx2, N).
|
|
733
|
+
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
|
|
734
|
+
f_average(&D_(idx2, i), D_(idx1, i), s, t );
|
|
735
|
+
break;
|
|
736
|
+
}
|
|
737
|
+
|
|
738
|
+
case METHOD_METR_WEIGHTED:
|
|
739
|
+
/*
|
|
740
|
+
Weighted linkage.
|
|
741
|
+
|
|
742
|
+
Shorter and longer distances can occur.
|
|
743
|
+
*/
|
|
744
|
+
// Update the distance matrix in the range [start, idx1).
|
|
745
|
+
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
|
|
746
|
+
f_weighted(&D_(i, idx2), D_(i, idx1) );
|
|
747
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
748
|
+
for (; i<idx2; i=active_nodes.succ[i])
|
|
749
|
+
f_weighted(&D_(i, idx2), D_(idx1, i) );
|
|
750
|
+
// Update the distance matrix in the range (idx2, N).
|
|
751
|
+
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
|
|
752
|
+
f_weighted(&D_(idx2, i), D_(idx1, i) );
|
|
753
|
+
break;
|
|
754
|
+
|
|
755
|
+
case METHOD_METR_WARD:
|
|
756
|
+
/*
|
|
757
|
+
Ward linkage.
|
|
758
|
+
|
|
759
|
+
Shorter and longer distances can occur, not smaller than min(d1,d2)
|
|
760
|
+
but maybe bigger than max(d1,d2).
|
|
761
|
+
*/
|
|
762
|
+
// Update the distance matrix in the range [start, idx1).
|
|
763
|
+
//t_float v = static_cast<t_float>(members[i]);
|
|
764
|
+
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
|
|
765
|
+
f_ward(&D_(i, idx2), D_(i, idx1), min,
|
|
766
|
+
size1, size2, static_cast<t_float>(members[i]) );
|
|
767
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
768
|
+
for (; i<idx2; i=active_nodes.succ[i])
|
|
769
|
+
f_ward(&D_(i, idx2), D_(idx1, i), min,
|
|
770
|
+
size1, size2, static_cast<t_float>(members[i]) );
|
|
771
|
+
// Update the distance matrix in the range (idx2, N).
|
|
772
|
+
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
|
|
773
|
+
f_ward(&D_(idx2, i), D_(idx1, i), min,
|
|
774
|
+
size1, size2, static_cast<t_float>(members[i]) );
|
|
775
|
+
break;
|
|
776
|
+
|
|
777
|
+
case METHOD_METR_WARD_D2:
|
|
778
|
+
/*
|
|
779
|
+
Ward D2 linkage (with squared Euclidean distances).
|
|
780
|
+
|
|
781
|
+
Shorter and longer distances can occur, not smaller than min(d1,d2)
|
|
782
|
+
but maybe bigger than max(d1,d2).
|
|
783
|
+
Uses the same update formula as Ward, but with different post-processing.
|
|
784
|
+
*/
|
|
785
|
+
// Update the distance matrix in the range [start, idx1).
|
|
786
|
+
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
|
|
787
|
+
f_ward(&D_(i, idx2), D_(i, idx1), min,
|
|
788
|
+
size1, size2, static_cast<t_float>(members[i]) );
|
|
789
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
790
|
+
for (; i<idx2; i=active_nodes.succ[i])
|
|
791
|
+
f_ward(&D_(i, idx2), D_(idx1, i), min,
|
|
792
|
+
size1, size2, static_cast<t_float>(members[i]) );
|
|
793
|
+
// Update the distance matrix in the range (idx2, N).
|
|
794
|
+
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
|
|
795
|
+
f_ward(&D_(idx2, i), D_(idx1, i), min,
|
|
796
|
+
size1, size2, static_cast<t_float>(members[i]) );
|
|
797
|
+
break;
|
|
798
|
+
|
|
799
|
+
default:
|
|
800
|
+
throw std::runtime_error(std::string("Invalid method."));
|
|
801
|
+
}
|
|
802
|
+
}
|
|
803
|
+
#ifdef FE_INVALID
|
|
804
|
+
if (fetestexcept(FE_INVALID)) throw fenv_error();
|
|
805
|
+
#endif
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
class binary_min_heap {
|
|
809
|
+
/*
|
|
810
|
+
Class for a binary min-heap. The data resides in an array A. The elements of
|
|
811
|
+
A are not changed but two lists I and R of indices are generated which point
|
|
812
|
+
to elements of A and backwards.
|
|
813
|
+
|
|
814
|
+
The heap tree structure is
|
|
815
|
+
|
|
816
|
+
H[2*i+1] H[2*i+2]
|
|
817
|
+
\ /
|
|
818
|
+
\ /
|
|
819
|
+
≤ ≤
|
|
820
|
+
\ /
|
|
821
|
+
\ /
|
|
822
|
+
H[i]
|
|
823
|
+
|
|
824
|
+
where the children must be less or equal than their parent. Thus, H[0]
|
|
825
|
+
contains the minimum. The lists I and R are made such that H[i] = A[I[i]]
|
|
826
|
+
and R[I[i]] = i.
|
|
827
|
+
|
|
828
|
+
This implementation is not designed to handle NaN values.
|
|
829
|
+
*/
|
|
830
|
+
private:
|
|
831
|
+
t_float * const A;
|
|
832
|
+
t_index size;
|
|
833
|
+
auto_array_ptr<t_index> I;
|
|
834
|
+
auto_array_ptr<t_index> R;
|
|
835
|
+
|
|
836
|
+
// no default constructor
|
|
837
|
+
binary_min_heap();
|
|
838
|
+
// noncopyable
|
|
839
|
+
binary_min_heap(binary_min_heap const &);
|
|
840
|
+
binary_min_heap & operator=(binary_min_heap const &);
|
|
841
|
+
|
|
842
|
+
public:
|
|
843
|
+
binary_min_heap(t_float * const A_, const t_index size_)
|
|
844
|
+
: A(A_), size(size_), I(size), R(size)
|
|
845
|
+
{ // Allocate memory and initialize the lists I and R to the identity. This
|
|
846
|
+
// does not make it a heap. Call heapify afterwards!
|
|
847
|
+
for (t_index i=0; i<size; ++i)
|
|
848
|
+
R[i] = I[i] = i;
|
|
849
|
+
}
|
|
850
|
+
|
|
851
|
+
binary_min_heap(t_float * const A_, const t_index size1, const t_index size2,
|
|
852
|
+
const t_index start)
|
|
853
|
+
: A(A_), size(size1), I(size1), R(size2)
|
|
854
|
+
{ // Allocate memory and initialize the lists I and R to the identity. This
|
|
855
|
+
// does not make it a heap. Call heapify afterwards!
|
|
856
|
+
for (t_index i=0; i<size; ++i) {
|
|
857
|
+
R[i+start] = i;
|
|
858
|
+
I[i] = i + start;
|
|
859
|
+
}
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
~binary_min_heap() {}
|
|
863
|
+
|
|
864
|
+
void heapify() {
|
|
865
|
+
// Arrange the indices I and R so that H[i] := A[I[i]] satisfies the heap
|
|
866
|
+
// condition H[i] < H[2*i+1] and H[i] < H[2*i+2] for each i.
|
|
867
|
+
//
|
|
868
|
+
// Complexity: Θ(size)
|
|
869
|
+
// Reference: Cormen, Leiserson, Rivest, Stein, Introduction to Algorithms,
|
|
870
|
+
// 3rd ed., 2009, Section 6.3 “Building a heap”
|
|
871
|
+
t_index idx;
|
|
872
|
+
for (idx=(size>>1); idx>0; ) {
|
|
873
|
+
--idx;
|
|
874
|
+
update_geq_(idx);
|
|
875
|
+
}
|
|
876
|
+
}
|
|
877
|
+
|
|
878
|
+
inline t_index argmin() const {
|
|
879
|
+
// Return the minimal element.
|
|
880
|
+
return I[0];
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
void heap_pop() {
|
|
884
|
+
// Remove the minimal element from the heap.
|
|
885
|
+
--size;
|
|
886
|
+
I[0] = I[size];
|
|
887
|
+
R[I[0]] = 0;
|
|
888
|
+
update_geq_(0);
|
|
889
|
+
}
|
|
890
|
+
|
|
891
|
+
void remove(t_index idx) {
|
|
892
|
+
// Remove an element from the heap.
|
|
893
|
+
--size;
|
|
894
|
+
R[I[size]] = R[idx];
|
|
895
|
+
I[R[idx]] = I[size];
|
|
896
|
+
if ( H(size)<=A[idx] ) {
|
|
897
|
+
update_leq_(R[idx]);
|
|
898
|
+
}
|
|
899
|
+
else {
|
|
900
|
+
update_geq_(R[idx]);
|
|
901
|
+
}
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
void replace ( const t_index idxold, const t_index idxnew,
|
|
905
|
+
const t_float val) {
|
|
906
|
+
R[idxnew] = R[idxold];
|
|
907
|
+
I[R[idxnew]] = idxnew;
|
|
908
|
+
if (val<=A[idxold])
|
|
909
|
+
update_leq(idxnew, val);
|
|
910
|
+
else
|
|
911
|
+
update_geq(idxnew, val);
|
|
912
|
+
}
|
|
913
|
+
|
|
914
|
+
void update ( const t_index idx, const t_float val ) const {
|
|
915
|
+
// Update the element A[i] with val and re-arrange the indices to preserve
|
|
916
|
+
// the heap condition.
|
|
917
|
+
if (val<=A[idx])
|
|
918
|
+
update_leq(idx, val);
|
|
919
|
+
else
|
|
920
|
+
update_geq(idx, val);
|
|
921
|
+
}
|
|
922
|
+
|
|
923
|
+
void update_leq ( const t_index idx, const t_float val ) const {
|
|
924
|
+
// Use this when the new value is not more than the old value.
|
|
925
|
+
A[idx] = val;
|
|
926
|
+
update_leq_(R[idx]);
|
|
927
|
+
}
|
|
928
|
+
|
|
929
|
+
void update_geq ( const t_index idx, const t_float val ) const {
|
|
930
|
+
// Use this when the new value is not less than the old value.
|
|
931
|
+
A[idx] = val;
|
|
932
|
+
update_geq_(R[idx]);
|
|
933
|
+
}
|
|
934
|
+
|
|
935
|
+
private:
|
|
936
|
+
void update_leq_ (t_index i) const {
|
|
937
|
+
t_index j;
|
|
938
|
+
for ( ; (i>0) && ( H(i)<H(j=(i-1)>>1) ); i=j)
|
|
939
|
+
heap_swap(i,j);
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
void update_geq_ (t_index i) const {
|
|
943
|
+
t_index j;
|
|
944
|
+
for ( ; (j=2*i+1)<size; i=j) {
|
|
945
|
+
if ( H(j)>=H(i) ) {
|
|
946
|
+
++j;
|
|
947
|
+
if ( j>=size || H(j)>=H(i) ) break;
|
|
948
|
+
}
|
|
949
|
+
else if ( j+1<size && H(j+1)<H(j) ) ++j;
|
|
950
|
+
heap_swap(i, j);
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
void heap_swap(const t_index i, const t_index j) const {
|
|
955
|
+
// Swap two indices.
|
|
956
|
+
t_index tmp = I[i];
|
|
957
|
+
I[i] = I[j];
|
|
958
|
+
I[j] = tmp;
|
|
959
|
+
R[I[i]] = i;
|
|
960
|
+
R[I[j]] = j;
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
inline t_float H(const t_index i) const {
|
|
964
|
+
return A[I[i]];
|
|
965
|
+
}
|
|
966
|
+
|
|
967
|
+
};
|
|
968
|
+
|
|
969
|
+
template <method_codes method, typename t_members>
|
|
970
|
+
static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
|
|
971
|
+
/*
|
|
972
|
+
N: integer, number of data points
|
|
973
|
+
D: condensed distance matrix N*(N-1)/2
|
|
974
|
+
Z2: output data structure
|
|
975
|
+
*/
|
|
976
|
+
|
|
977
|
+
const t_index N_1 = N-1;
|
|
978
|
+
t_index i, j; // loop variables
|
|
979
|
+
t_index idx1, idx2; // row and column indices
|
|
980
|
+
|
|
981
|
+
auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
|
|
982
|
+
auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
|
|
983
|
+
auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
|
|
984
|
+
// i-th row represents
|
|
985
|
+
doubly_linked_list active_nodes(N);
|
|
986
|
+
binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
|
|
987
|
+
// the distance to the nearest neighbor of each point
|
|
988
|
+
t_index node1, node2; // node numbers in the output
|
|
989
|
+
t_float size1, size2; // and their cardinalities
|
|
990
|
+
|
|
991
|
+
t_float min; // minimum and row index for nearest-neighbor search
|
|
992
|
+
t_index idx;
|
|
993
|
+
|
|
994
|
+
for (i=0; i<N; ++i)
|
|
995
|
+
// Build a list of row ↔ node label assignments.
|
|
996
|
+
// Initially i ↦ i
|
|
997
|
+
row_repr[i] = i;
|
|
998
|
+
|
|
999
|
+
// Initialize the minimal distances:
|
|
1000
|
+
// Find the nearest neighbor of each point.
|
|
1001
|
+
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
|
|
1002
|
+
t_float const * DD = D;
|
|
1003
|
+
for (i=0; i<N_1; ++i) {
|
|
1004
|
+
min = std::numeric_limits<t_float>::infinity();
|
|
1005
|
+
for (idx=j=i+1; j<N; ++j, ++DD) {
|
|
1006
|
+
#if HAVE_DIAGNOSTIC
|
|
1007
|
+
#pragma GCC diagnostic push
|
|
1008
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
1009
|
+
#endif
|
|
1010
|
+
if (*DD<min) {
|
|
1011
|
+
min = *DD;
|
|
1012
|
+
idx = j;
|
|
1013
|
+
}
|
|
1014
|
+
else if (fc_isnan(*DD))
|
|
1015
|
+
throw(nan_error());
|
|
1016
|
+
}
|
|
1017
|
+
#if HAVE_DIAGNOSTIC
|
|
1018
|
+
#pragma GCC diagnostic pop
|
|
1019
|
+
#endif
|
|
1020
|
+
mindist[i] = min;
|
|
1021
|
+
n_nghbr[i] = idx;
|
|
1022
|
+
}
|
|
1023
|
+
|
|
1024
|
+
// Put the minimal distances into a heap structure to make the repeated
|
|
1025
|
+
// global minimum searches fast.
|
|
1026
|
+
nn_distances.heapify();
|
|
1027
|
+
|
|
1028
|
+
#ifdef FE_INVALID
|
|
1029
|
+
if (feclearexcept(FE_INVALID)) throw fenv_error();
|
|
1030
|
+
#endif
|
|
1031
|
+
|
|
1032
|
+
// Main loop: We have N-1 merging steps.
|
|
1033
|
+
for (i=0; i<N_1; ++i) {
|
|
1034
|
+
/*
|
|
1035
|
+
Here is a special feature that allows fast bookkeeping and updates of the
|
|
1036
|
+
minimal distances.
|
|
1037
|
+
|
|
1038
|
+
mindist[i] stores a lower bound on the minimum distance of the point i to
|
|
1039
|
+
all points of higher index:
|
|
1040
|
+
|
|
1041
|
+
mindist[i] ≥ min_{j>i} D(i,j)
|
|
1042
|
+
|
|
1043
|
+
Normally, we have equality. However, this minimum may become invalid due
|
|
1044
|
+
to the updates in the distance matrix. The rules are:
|
|
1045
|
+
|
|
1046
|
+
1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct
|
|
1047
|
+
minimum and n_nghbr[i] is a nearest neighbor.
|
|
1048
|
+
|
|
1049
|
+
2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the
|
|
1050
|
+
correct minimum. The minimum needs to be recomputed.
|
|
1051
|
+
|
|
1052
|
+
3) mindist[i] is never bigger than the true minimum. Hence, we never
|
|
1053
|
+
miss the true minimum if we take the smallest mindist entry,
|
|
1054
|
+
re-compute the value if necessary (thus maybe increasing it) and
|
|
1055
|
+
looking for the now smallest mindist entry until a valid minimal
|
|
1056
|
+
entry is found. This step is done in the lines below.
|
|
1057
|
+
|
|
1058
|
+
The update process for D below takes care that these rules are
|
|
1059
|
+
fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are
|
|
1060
|
+
re-calculated when necessary but re-calculation is avoided whenever
|
|
1061
|
+
possible.
|
|
1062
|
+
|
|
1063
|
+
The re-calculation of the minima makes the worst-case runtime of this
|
|
1064
|
+
algorithm cubic in N. We avoid this whenever possible, and in most cases
|
|
1065
|
+
the runtime appears to be quadratic.
|
|
1066
|
+
*/
|
|
1067
|
+
idx1 = nn_distances.argmin();
|
|
1068
|
+
if (method != METHOD_METR_SINGLE) {
|
|
1069
|
+
while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) {
|
|
1070
|
+
// Recompute the minimum mindist[idx1] and n_nghbr[idx1].
|
|
1071
|
+
n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
|
|
1072
|
+
min = D_(idx1,j);
|
|
1073
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1074
|
+
if (D_(idx1,j)<min) {
|
|
1075
|
+
min = D_(idx1,j);
|
|
1076
|
+
n_nghbr[idx1] = j;
|
|
1077
|
+
}
|
|
1078
|
+
}
|
|
1079
|
+
/* Update the heap with the new true minimum and search for the
|
|
1080
|
+
(possibly different) minimal entry. */
|
|
1081
|
+
nn_distances.update_geq(idx1, min);
|
|
1082
|
+
idx1 = nn_distances.argmin();
|
|
1083
|
+
}
|
|
1084
|
+
}
|
|
1085
|
+
|
|
1086
|
+
nn_distances.heap_pop(); // Remove the current minimum from the heap.
|
|
1087
|
+
idx2 = n_nghbr[idx1];
|
|
1088
|
+
|
|
1089
|
+
// Write the newly found minimal pair of nodes to the output array.
|
|
1090
|
+
node1 = row_repr[idx1];
|
|
1091
|
+
node2 = row_repr[idx2];
|
|
1092
|
+
|
|
1093
|
+
if (method==METHOD_METR_AVERAGE ||
|
|
1094
|
+
method==METHOD_METR_WARD ||
|
|
1095
|
+
method==METHOD_METR_WARD_D2 ||
|
|
1096
|
+
method==METHOD_METR_CENTROID) {
|
|
1097
|
+
size1 = static_cast<t_float>(members[idx1]);
|
|
1098
|
+
size2 = static_cast<t_float>(members[idx2]);
|
|
1099
|
+
members[idx2] += members[idx1];
|
|
1100
|
+
}
|
|
1101
|
+
Z2.append(node1, node2, mindist[idx1]);
|
|
1102
|
+
|
|
1103
|
+
// Remove idx1 from the list of active indices (active_nodes).
|
|
1104
|
+
active_nodes.remove(idx1);
|
|
1105
|
+
// Index idx2 now represents the new (merged) node with label N+i.
|
|
1106
|
+
row_repr[idx2] = N+i;
|
|
1107
|
+
|
|
1108
|
+
// Update the distance matrix
|
|
1109
|
+
switch (method) {
|
|
1110
|
+
case METHOD_METR_SINGLE:
|
|
1111
|
+
/*
|
|
1112
|
+
Single linkage.
|
|
1113
|
+
|
|
1114
|
+
Characteristic: new distances are never longer than the old distances.
|
|
1115
|
+
*/
|
|
1116
|
+
// Update the distance matrix in the range [start, idx1).
|
|
1117
|
+
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
|
|
1118
|
+
f_single(&D_(j, idx2), D_(j, idx1));
|
|
1119
|
+
if (n_nghbr[j] == idx1)
|
|
1120
|
+
n_nghbr[j] = idx2;
|
|
1121
|
+
}
|
|
1122
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
1123
|
+
for (; j<idx2; j=active_nodes.succ[j]) {
|
|
1124
|
+
f_single(&D_(j, idx2), D_(idx1, j));
|
|
1125
|
+
// If the new value is below the old minimum in a row, update
|
|
1126
|
+
// the mindist and n_nghbr arrays.
|
|
1127
|
+
if (D_(j, idx2) < mindist[j]) {
|
|
1128
|
+
nn_distances.update_leq(j, D_(j, idx2));
|
|
1129
|
+
n_nghbr[j] = idx2;
|
|
1130
|
+
}
|
|
1131
|
+
}
|
|
1132
|
+
// Update the distance matrix in the range (idx2, N).
|
|
1133
|
+
// Recompute the minimum mindist[idx2] and n_nghbr[idx2].
|
|
1134
|
+
if (idx2<N_1) {
|
|
1135
|
+
min = mindist[idx2];
|
|
1136
|
+
for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j]) {
|
|
1137
|
+
f_single(&D_(idx2, j), D_(idx1, j) );
|
|
1138
|
+
if (D_(idx2, j) < min) {
|
|
1139
|
+
n_nghbr[idx2] = j;
|
|
1140
|
+
min = D_(idx2, j);
|
|
1141
|
+
}
|
|
1142
|
+
}
|
|
1143
|
+
nn_distances.update_leq(idx2, min);
|
|
1144
|
+
}
|
|
1145
|
+
break;
|
|
1146
|
+
|
|
1147
|
+
case METHOD_METR_COMPLETE:
|
|
1148
|
+
/*
|
|
1149
|
+
Complete linkage.
|
|
1150
|
+
|
|
1151
|
+
Characteristic: new distances are never shorter than the old distances.
|
|
1152
|
+
*/
|
|
1153
|
+
// Update the distance matrix in the range [start, idx1).
|
|
1154
|
+
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
|
|
1155
|
+
f_complete(&D_(j, idx2), D_(j, idx1) );
|
|
1156
|
+
if (n_nghbr[j] == idx1)
|
|
1157
|
+
n_nghbr[j] = idx2;
|
|
1158
|
+
}
|
|
1159
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
1160
|
+
for (; j<idx2; j=active_nodes.succ[j])
|
|
1161
|
+
f_complete(&D_(j, idx2), D_(idx1, j) );
|
|
1162
|
+
// Update the distance matrix in the range (idx2, N).
|
|
1163
|
+
for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j])
|
|
1164
|
+
f_complete(&D_(idx2, j), D_(idx1, j) );
|
|
1165
|
+
break;
|
|
1166
|
+
|
|
1167
|
+
case METHOD_METR_AVERAGE: {
|
|
1168
|
+
/*
|
|
1169
|
+
Average linkage.
|
|
1170
|
+
|
|
1171
|
+
Shorter and longer distances can occur.
|
|
1172
|
+
*/
|
|
1173
|
+
// Update the distance matrix in the range [start, idx1).
|
|
1174
|
+
t_float s = size1/(size1+size2);
|
|
1175
|
+
t_float t = size2/(size1+size2);
|
|
1176
|
+
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
|
|
1177
|
+
f_average(&D_(j, idx2), D_(j, idx1), s, t);
|
|
1178
|
+
if (n_nghbr[j] == idx1)
|
|
1179
|
+
n_nghbr[j] = idx2;
|
|
1180
|
+
}
|
|
1181
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
1182
|
+
for (; j<idx2; j=active_nodes.succ[j]) {
|
|
1183
|
+
f_average(&D_(j, idx2), D_(idx1, j), s, t);
|
|
1184
|
+
if (D_(j, idx2) < mindist[j]) {
|
|
1185
|
+
nn_distances.update_leq(j, D_(j, idx2));
|
|
1186
|
+
n_nghbr[j] = idx2;
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
// Update the distance matrix in the range (idx2, N).
|
|
1190
|
+
if (idx2<N_1) {
|
|
1191
|
+
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
|
|
1192
|
+
f_average(&D_(idx2, j), D_(idx1, j), s, t);
|
|
1193
|
+
min = D_(idx2,j);
|
|
1194
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1195
|
+
f_average(&D_(idx2, j), D_(idx1, j), s, t);
|
|
1196
|
+
if (D_(idx2,j) < min) {
|
|
1197
|
+
min = D_(idx2,j);
|
|
1198
|
+
n_nghbr[idx2] = j;
|
|
1199
|
+
}
|
|
1200
|
+
}
|
|
1201
|
+
nn_distances.update(idx2, min);
|
|
1202
|
+
}
|
|
1203
|
+
break;
|
|
1204
|
+
}
|
|
1205
|
+
|
|
1206
|
+
case METHOD_METR_WEIGHTED:
|
|
1207
|
+
/*
|
|
1208
|
+
Weighted linkage.
|
|
1209
|
+
|
|
1210
|
+
Shorter and longer distances can occur.
|
|
1211
|
+
*/
|
|
1212
|
+
// Update the distance matrix in the range [start, idx1).
|
|
1213
|
+
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
|
|
1214
|
+
f_weighted(&D_(j, idx2), D_(j, idx1) );
|
|
1215
|
+
if (n_nghbr[j] == idx1)
|
|
1216
|
+
n_nghbr[j] = idx2;
|
|
1217
|
+
}
|
|
1218
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
1219
|
+
for (; j<idx2; j=active_nodes.succ[j]) {
|
|
1220
|
+
f_weighted(&D_(j, idx2), D_(idx1, j) );
|
|
1221
|
+
if (D_(j, idx2) < mindist[j]) {
|
|
1222
|
+
nn_distances.update_leq(j, D_(j, idx2));
|
|
1223
|
+
n_nghbr[j] = idx2;
|
|
1224
|
+
}
|
|
1225
|
+
}
|
|
1226
|
+
// Update the distance matrix in the range (idx2, N).
|
|
1227
|
+
if (idx2<N_1) {
|
|
1228
|
+
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
|
|
1229
|
+
f_weighted(&D_(idx2, j), D_(idx1, j) );
|
|
1230
|
+
min = D_(idx2,j);
|
|
1231
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1232
|
+
f_weighted(&D_(idx2, j), D_(idx1, j) );
|
|
1233
|
+
if (D_(idx2,j) < min) {
|
|
1234
|
+
min = D_(idx2,j);
|
|
1235
|
+
n_nghbr[idx2] = j;
|
|
1236
|
+
}
|
|
1237
|
+
}
|
|
1238
|
+
nn_distances.update(idx2, min);
|
|
1239
|
+
}
|
|
1240
|
+
break;
|
|
1241
|
+
|
|
1242
|
+
case METHOD_METR_WARD:
|
|
1243
|
+
/*
|
|
1244
|
+
Ward linkage.
|
|
1245
|
+
|
|
1246
|
+
Shorter and longer distances can occur, not smaller than min(d1,d2)
|
|
1247
|
+
but maybe bigger than max(d1,d2).
|
|
1248
|
+
*/
|
|
1249
|
+
// Update the distance matrix in the range [start, idx1).
|
|
1250
|
+
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
|
|
1251
|
+
f_ward(&D_(j, idx2), D_(j, idx1), mindist[idx1],
|
|
1252
|
+
size1, size2, static_cast<t_float>(members[j]) );
|
|
1253
|
+
if (n_nghbr[j] == idx1)
|
|
1254
|
+
n_nghbr[j] = idx2;
|
|
1255
|
+
}
|
|
1256
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
1257
|
+
for (; j<idx2; j=active_nodes.succ[j]) {
|
|
1258
|
+
f_ward(&D_(j, idx2), D_(idx1, j), mindist[idx1], size1, size2,
|
|
1259
|
+
static_cast<t_float>(members[j]) );
|
|
1260
|
+
if (D_(j, idx2) < mindist[j]) {
|
|
1261
|
+
nn_distances.update_leq(j, D_(j, idx2));
|
|
1262
|
+
n_nghbr[j] = idx2;
|
|
1263
|
+
}
|
|
1264
|
+
}
|
|
1265
|
+
// Update the distance matrix in the range (idx2, N).
|
|
1266
|
+
if (idx2<N_1) {
|
|
1267
|
+
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
|
|
1268
|
+
f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
|
|
1269
|
+
size1, size2, static_cast<t_float>(members[j]) );
|
|
1270
|
+
min = D_(idx2,j);
|
|
1271
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1272
|
+
f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
|
|
1273
|
+
size1, size2, static_cast<t_float>(members[j]) );
|
|
1274
|
+
if (D_(idx2,j) < min) {
|
|
1275
|
+
min = D_(idx2,j);
|
|
1276
|
+
n_nghbr[idx2] = j;
|
|
1277
|
+
}
|
|
1278
|
+
}
|
|
1279
|
+
nn_distances.update(idx2, min);
|
|
1280
|
+
}
|
|
1281
|
+
break;
|
|
1282
|
+
|
|
1283
|
+
case METHOD_METR_WARD_D2:
|
|
1284
|
+
/*
|
|
1285
|
+
Ward D2 linkage (with squared Euclidean distances).
|
|
1286
|
+
|
|
1287
|
+
Shorter and longer distances can occur, not smaller than min(d1,d2)
|
|
1288
|
+
but maybe bigger than max(d1,d2).
|
|
1289
|
+
Uses the same update formula as Ward, but with different post-processing.
|
|
1290
|
+
*/
|
|
1291
|
+
// Update the distance matrix in the range [start, idx1).
|
|
1292
|
+
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
|
|
1293
|
+
f_ward(&D_(j, idx2), D_(j, idx1), mindist[idx1],
|
|
1294
|
+
size1, size2, static_cast<t_float>(members[j]) );
|
|
1295
|
+
if (n_nghbr[j] == idx1)
|
|
1296
|
+
n_nghbr[j] = idx2;
|
|
1297
|
+
}
|
|
1298
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
1299
|
+
for (; j<idx2; j=active_nodes.succ[j]) {
|
|
1300
|
+
f_ward(&D_(j, idx2), D_(idx1, j), mindist[idx1], size1, size2,
|
|
1301
|
+
static_cast<t_float>(members[j]) );
|
|
1302
|
+
if (D_(j, idx2) < mindist[j]) {
|
|
1303
|
+
nn_distances.update_leq(j, D_(j, idx2));
|
|
1304
|
+
n_nghbr[j] = idx2;
|
|
1305
|
+
}
|
|
1306
|
+
}
|
|
1307
|
+
// Update the distance matrix in the range (idx2, N).
|
|
1308
|
+
if (idx2<N_1) {
|
|
1309
|
+
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
|
|
1310
|
+
f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
|
|
1311
|
+
size1, size2, static_cast<t_float>(members[j]) );
|
|
1312
|
+
min = D_(idx2,j);
|
|
1313
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1314
|
+
f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
|
|
1315
|
+
size1, size2, static_cast<t_float>(members[j]) );
|
|
1316
|
+
if (D_(idx2,j) < min) {
|
|
1317
|
+
min = D_(idx2,j);
|
|
1318
|
+
n_nghbr[idx2] = j;
|
|
1319
|
+
}
|
|
1320
|
+
}
|
|
1321
|
+
nn_distances.update(idx2, min);
|
|
1322
|
+
}
|
|
1323
|
+
break;
|
|
1324
|
+
|
|
1325
|
+
case METHOD_METR_CENTROID: {
|
|
1326
|
+
/*
|
|
1327
|
+
Centroid linkage.
|
|
1328
|
+
|
|
1329
|
+
Shorter and longer distances can occur, not bigger than max(d1,d2)
|
|
1330
|
+
but maybe smaller than min(d1,d2).
|
|
1331
|
+
*/
|
|
1332
|
+
// Update the distance matrix in the range [start, idx1).
|
|
1333
|
+
t_float s = size1/(size1+size2);
|
|
1334
|
+
t_float t = size2/(size1+size2);
|
|
1335
|
+
t_float stc = s*t*mindist[idx1];
|
|
1336
|
+
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
|
|
1337
|
+
f_centroid(&D_(j, idx2), D_(j, idx1), stc, s, t);
|
|
1338
|
+
if (D_(j, idx2) < mindist[j]) {
|
|
1339
|
+
nn_distances.update_leq(j, D_(j, idx2));
|
|
1340
|
+
n_nghbr[j] = idx2;
|
|
1341
|
+
}
|
|
1342
|
+
else if (n_nghbr[j] == idx1)
|
|
1343
|
+
n_nghbr[j] = idx2;
|
|
1344
|
+
}
|
|
1345
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
1346
|
+
for (; j<idx2; j=active_nodes.succ[j]) {
|
|
1347
|
+
f_centroid(&D_(j, idx2), D_(idx1, j), stc, s, t);
|
|
1348
|
+
if (D_(j, idx2) < mindist[j]) {
|
|
1349
|
+
nn_distances.update_leq(j, D_(j, idx2));
|
|
1350
|
+
n_nghbr[j] = idx2;
|
|
1351
|
+
}
|
|
1352
|
+
}
|
|
1353
|
+
// Update the distance matrix in the range (idx2, N).
|
|
1354
|
+
if (idx2<N_1) {
|
|
1355
|
+
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
|
|
1356
|
+
f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
|
|
1357
|
+
min = D_(idx2,j);
|
|
1358
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1359
|
+
f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
|
|
1360
|
+
if (D_(idx2,j) < min) {
|
|
1361
|
+
min = D_(idx2,j);
|
|
1362
|
+
n_nghbr[idx2] = j;
|
|
1363
|
+
}
|
|
1364
|
+
}
|
|
1365
|
+
nn_distances.update(idx2, min);
|
|
1366
|
+
}
|
|
1367
|
+
break;
|
|
1368
|
+
}
|
|
1369
|
+
|
|
1370
|
+
case METHOD_METR_MEDIAN: {
|
|
1371
|
+
/*
|
|
1372
|
+
Median linkage.
|
|
1373
|
+
|
|
1374
|
+
Shorter and longer distances can occur, not bigger than max(d1,d2)
|
|
1375
|
+
but maybe smaller than min(d1,d2).
|
|
1376
|
+
*/
|
|
1377
|
+
// Update the distance matrix in the range [start, idx1).
|
|
1378
|
+
t_float c_4 = mindist[idx1]*.25;
|
|
1379
|
+
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
|
|
1380
|
+
f_median(&D_(j, idx2), D_(j, idx1), c_4 );
|
|
1381
|
+
if (D_(j, idx2) < mindist[j]) {
|
|
1382
|
+
nn_distances.update_leq(j, D_(j, idx2));
|
|
1383
|
+
n_nghbr[j] = idx2;
|
|
1384
|
+
}
|
|
1385
|
+
else if (n_nghbr[j] == idx1)
|
|
1386
|
+
n_nghbr[j] = idx2;
|
|
1387
|
+
}
|
|
1388
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
1389
|
+
for (; j<idx2; j=active_nodes.succ[j]) {
|
|
1390
|
+
f_median(&D_(j, idx2), D_(idx1, j), c_4 );
|
|
1391
|
+
if (D_(j, idx2) < mindist[j]) {
|
|
1392
|
+
nn_distances.update_leq(j, D_(j, idx2));
|
|
1393
|
+
n_nghbr[j] = idx2;
|
|
1394
|
+
}
|
|
1395
|
+
}
|
|
1396
|
+
// Update the distance matrix in the range (idx2, N).
|
|
1397
|
+
if (idx2<N_1) {
|
|
1398
|
+
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
|
|
1399
|
+
f_median(&D_(idx2, j), D_(idx1, j), c_4 );
|
|
1400
|
+
min = D_(idx2,j);
|
|
1401
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1402
|
+
f_median(&D_(idx2, j), D_(idx1, j), c_4 );
|
|
1403
|
+
if (D_(idx2,j) < min) {
|
|
1404
|
+
min = D_(idx2,j);
|
|
1405
|
+
n_nghbr[idx2] = j;
|
|
1406
|
+
}
|
|
1407
|
+
}
|
|
1408
|
+
nn_distances.update(idx2, min);
|
|
1409
|
+
}
|
|
1410
|
+
break;
|
|
1411
|
+
}
|
|
1412
|
+
|
|
1413
|
+
default:
|
|
1414
|
+
throw std::runtime_error(std::string("Invalid method."));
|
|
1415
|
+
}
|
|
1416
|
+
}
|
|
1417
|
+
#ifdef FE_INVALID
|
|
1418
|
+
if (fetestexcept(FE_INVALID)) throw fenv_error();
|
|
1419
|
+
#endif
|
|
1420
|
+
}
|
|
1421
|
+
|
|
1422
|
+
/*
|
|
1423
|
+
Clustering methods for vector data
|
|
1424
|
+
*/
|
|
1425
|
+
|
|
1426
|
+
template <typename t_dissimilarity>
|
|
1427
|
+
static void MST_linkage_core_vector(const t_index N,
|
|
1428
|
+
t_dissimilarity & dist,
|
|
1429
|
+
cluster_result & Z2) {
|
|
1430
|
+
/*
|
|
1431
|
+
N: integer, number of data points
|
|
1432
|
+
dist: function pointer to the metric
|
|
1433
|
+
Z2: output data structure
|
|
1434
|
+
|
|
1435
|
+
The basis of this algorithm is an algorithm by Rohlf:
|
|
1436
|
+
|
|
1437
|
+
F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
|
|
1438
|
+
The Computer Journal, vol. 16, 1973, p. 93–95.
|
|
1439
|
+
*/
|
|
1440
|
+
t_index i;
|
|
1441
|
+
t_index idx2;
|
|
1442
|
+
doubly_linked_list active_nodes(N);
|
|
1443
|
+
auto_array_ptr<t_float> d(N);
|
|
1444
|
+
|
|
1445
|
+
t_index prev_node;
|
|
1446
|
+
t_float min;
|
|
1447
|
+
|
|
1448
|
+
// first iteration
|
|
1449
|
+
idx2 = 1;
|
|
1450
|
+
min = std::numeric_limits<t_float>::infinity();
|
|
1451
|
+
for (i=1; i<N; ++i) {
|
|
1452
|
+
d[i] = dist(0,i);
|
|
1453
|
+
#if HAVE_DIAGNOSTIC
|
|
1454
|
+
#pragma GCC diagnostic push
|
|
1455
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
1456
|
+
#endif
|
|
1457
|
+
if (d[i] < min) {
|
|
1458
|
+
min = d[i];
|
|
1459
|
+
idx2 = i;
|
|
1460
|
+
}
|
|
1461
|
+
else if (fc_isnan(d[i]))
|
|
1462
|
+
throw (nan_error());
|
|
1463
|
+
#if HAVE_DIAGNOSTIC
|
|
1464
|
+
#pragma GCC diagnostic pop
|
|
1465
|
+
#endif
|
|
1466
|
+
}
|
|
1467
|
+
|
|
1468
|
+
Z2.append(0, idx2, min);
|
|
1469
|
+
|
|
1470
|
+
for (t_index j=1; j<N-1; ++j) {
|
|
1471
|
+
prev_node = idx2;
|
|
1472
|
+
active_nodes.remove(prev_node);
|
|
1473
|
+
|
|
1474
|
+
idx2 = active_nodes.succ[0];
|
|
1475
|
+
min = d[idx2];
|
|
1476
|
+
|
|
1477
|
+
for (i=idx2; i<N; i=active_nodes.succ[i]) {
|
|
1478
|
+
t_float tmp = dist(i, prev_node);
|
|
1479
|
+
#if HAVE_DIAGNOSTIC
|
|
1480
|
+
#pragma GCC diagnostic push
|
|
1481
|
+
#pragma GCC diagnostic ignored "-Wfloat-equal"
|
|
1482
|
+
#endif
|
|
1483
|
+
if (d[i] > tmp)
|
|
1484
|
+
d[i] = tmp;
|
|
1485
|
+
else if (fc_isnan(tmp))
|
|
1486
|
+
throw (nan_error());
|
|
1487
|
+
#if HAVE_DIAGNOSTIC
|
|
1488
|
+
#pragma GCC diagnostic pop
|
|
1489
|
+
#endif
|
|
1490
|
+
if (d[i] < min) {
|
|
1491
|
+
min = d[i];
|
|
1492
|
+
idx2 = i;
|
|
1493
|
+
}
|
|
1494
|
+
}
|
|
1495
|
+
Z2.append(prev_node, idx2, min);
|
|
1496
|
+
}
|
|
1497
|
+
}
|
|
1498
|
+
|
|
1499
|
+
template <method_codes_vector method, typename t_dissimilarity>
|
|
1500
|
+
static void generic_linkage_vector(const t_index N,
|
|
1501
|
+
t_dissimilarity & dist,
|
|
1502
|
+
cluster_result & Z2) {
|
|
1503
|
+
/*
|
|
1504
|
+
N: integer, number of data points
|
|
1505
|
+
dist: function pointer to the metric
|
|
1506
|
+
Z2: output data structure
|
|
1507
|
+
|
|
1508
|
+
This algorithm is valid for the distance update methods
|
|
1509
|
+
"Ward", "centroid" and "median" only!
|
|
1510
|
+
*/
|
|
1511
|
+
const t_index N_1 = N-1;
|
|
1512
|
+
t_index i, j; // loop variables
|
|
1513
|
+
t_index idx1, idx2; // row and column indices
|
|
1514
|
+
|
|
1515
|
+
auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
|
|
1516
|
+
auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
|
|
1517
|
+
auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
|
|
1518
|
+
// i-th row represents
|
|
1519
|
+
doubly_linked_list active_nodes(N);
|
|
1520
|
+
binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
|
|
1521
|
+
// the distance to the nearest neighbor of each point
|
|
1522
|
+
t_index node1, node2; // node numbers in the output
|
|
1523
|
+
t_float min; // minimum and row index for nearest-neighbor search
|
|
1524
|
+
|
|
1525
|
+
for (i=0; i<N; ++i)
|
|
1526
|
+
// Build a list of row ↔ node label assignments.
|
|
1527
|
+
// Initially i ↦ i
|
|
1528
|
+
row_repr[i] = i;
|
|
1529
|
+
|
|
1530
|
+
// Initialize the minimal distances:
|
|
1531
|
+
// Find the nearest neighbor of each point.
|
|
1532
|
+
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
|
|
1533
|
+
for (i=0; i<N_1; ++i) {
|
|
1534
|
+
min = std::numeric_limits<t_float>::infinity();
|
|
1535
|
+
t_index idx;
|
|
1536
|
+
for (idx=j=i+1; j<N; ++j) {
|
|
1537
|
+
t_float tmp;
|
|
1538
|
+
switch (method) {
|
|
1539
|
+
case METHOD_VECTOR_WARD:
|
|
1540
|
+
tmp = dist.ward_initial(i,j);
|
|
1541
|
+
break;
|
|
1542
|
+
default:
|
|
1543
|
+
tmp = dist.template sqeuclidean<true>(i,j);
|
|
1544
|
+
}
|
|
1545
|
+
if (tmp<min) {
|
|
1546
|
+
min = tmp;
|
|
1547
|
+
idx = j;
|
|
1548
|
+
}
|
|
1549
|
+
}
|
|
1550
|
+
switch (method) {
|
|
1551
|
+
case METHOD_VECTOR_WARD:
|
|
1552
|
+
mindist[i] = t_dissimilarity::ward_initial_conversion(min);
|
|
1553
|
+
break;
|
|
1554
|
+
default:
|
|
1555
|
+
mindist[i] = min;
|
|
1556
|
+
}
|
|
1557
|
+
n_nghbr[i] = idx;
|
|
1558
|
+
}
|
|
1559
|
+
|
|
1560
|
+
// Put the minimal distances into a heap structure to make the repeated
|
|
1561
|
+
// global minimum searches fast.
|
|
1562
|
+
nn_distances.heapify();
|
|
1563
|
+
|
|
1564
|
+
// Main loop: We have N-1 merging steps.
|
|
1565
|
+
for (i=0; i<N_1; ++i) {
|
|
1566
|
+
idx1 = nn_distances.argmin();
|
|
1567
|
+
|
|
1568
|
+
while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
|
|
1569
|
+
// Recompute the minimum mindist[idx1] and n_nghbr[idx1].
|
|
1570
|
+
n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
|
|
1571
|
+
switch (method) {
|
|
1572
|
+
case METHOD_VECTOR_WARD:
|
|
1573
|
+
min = dist.ward(idx1,j);
|
|
1574
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1575
|
+
t_float const tmp = dist.ward(idx1,j);
|
|
1576
|
+
if (tmp<min) {
|
|
1577
|
+
min = tmp;
|
|
1578
|
+
n_nghbr[idx1] = j;
|
|
1579
|
+
}
|
|
1580
|
+
}
|
|
1581
|
+
break;
|
|
1582
|
+
default:
|
|
1583
|
+
min = dist.template sqeuclidean<true>(idx1,j);
|
|
1584
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1585
|
+
t_float const tmp = dist.template sqeuclidean<true>(idx1,j);
|
|
1586
|
+
if (tmp<min) {
|
|
1587
|
+
min = tmp;
|
|
1588
|
+
n_nghbr[idx1] = j;
|
|
1589
|
+
}
|
|
1590
|
+
}
|
|
1591
|
+
}
|
|
1592
|
+
/* Update the heap with the new true minimum and search for the (possibly
|
|
1593
|
+
different) minimal entry. */
|
|
1594
|
+
nn_distances.update_geq(idx1, min);
|
|
1595
|
+
idx1 = nn_distances.argmin();
|
|
1596
|
+
}
|
|
1597
|
+
|
|
1598
|
+
nn_distances.heap_pop(); // Remove the current minimum from the heap.
|
|
1599
|
+
idx2 = n_nghbr[idx1];
|
|
1600
|
+
|
|
1601
|
+
// Write the newly found minimal pair of nodes to the output array.
|
|
1602
|
+
node1 = row_repr[idx1];
|
|
1603
|
+
node2 = row_repr[idx2];
|
|
1604
|
+
|
|
1605
|
+
Z2.append(node1, node2, mindist[idx1]);
|
|
1606
|
+
|
|
1607
|
+
switch (method) {
|
|
1608
|
+
case METHOD_VECTOR_WARD:
|
|
1609
|
+
case METHOD_VECTOR_CENTROID:
|
|
1610
|
+
dist.merge_inplace(idx1, idx2);
|
|
1611
|
+
break;
|
|
1612
|
+
case METHOD_VECTOR_MEDIAN:
|
|
1613
|
+
dist.merge_inplace_weighted(idx1, idx2);
|
|
1614
|
+
break;
|
|
1615
|
+
default:
|
|
1616
|
+
throw std::runtime_error(std::string("Invalid method."));
|
|
1617
|
+
}
|
|
1618
|
+
|
|
1619
|
+
// Index idx2 now represents the new (merged) node with label N+i.
|
|
1620
|
+
row_repr[idx2] = N+i;
|
|
1621
|
+
// Remove idx1 from the list of active indices (active_nodes).
|
|
1622
|
+
active_nodes.remove(idx1); // TBD later!!!
|
|
1623
|
+
|
|
1624
|
+
// Update the distance matrix
|
|
1625
|
+
switch (method) {
|
|
1626
|
+
case METHOD_VECTOR_WARD:
|
|
1627
|
+
/*
|
|
1628
|
+
Ward linkage.
|
|
1629
|
+
|
|
1630
|
+
Shorter and longer distances can occur, not smaller than min(d1,d2)
|
|
1631
|
+
but maybe bigger than max(d1,d2).
|
|
1632
|
+
*/
|
|
1633
|
+
// Update the distance matrix in the range [start, idx1).
|
|
1634
|
+
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
|
|
1635
|
+
if (n_nghbr[j] == idx2) {
|
|
1636
|
+
n_nghbr[j] = idx1; // invalidate
|
|
1637
|
+
}
|
|
1638
|
+
}
|
|
1639
|
+
// Update the distance matrix in the range (idx1, idx2).
|
|
1640
|
+
for ( ; j<idx2; j=active_nodes.succ[j]) {
|
|
1641
|
+
t_float const tmp = dist.ward(j, idx2);
|
|
1642
|
+
if (tmp < mindist[j]) {
|
|
1643
|
+
nn_distances.update_leq(j, tmp);
|
|
1644
|
+
n_nghbr[j] = idx2;
|
|
1645
|
+
}
|
|
1646
|
+
else if (n_nghbr[j]==idx2) {
|
|
1647
|
+
n_nghbr[j] = idx1; // invalidate
|
|
1648
|
+
}
|
|
1649
|
+
}
|
|
1650
|
+
// Find the nearest neighbor for idx2.
|
|
1651
|
+
if (idx2<N_1) {
|
|
1652
|
+
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
|
|
1653
|
+
min = dist.ward(idx2,j);
|
|
1654
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1655
|
+
t_float const tmp = dist.ward(idx2,j);
|
|
1656
|
+
if (tmp < min) {
|
|
1657
|
+
min = tmp;
|
|
1658
|
+
n_nghbr[idx2] = j;
|
|
1659
|
+
}
|
|
1660
|
+
}
|
|
1661
|
+
nn_distances.update(idx2, min);
|
|
1662
|
+
}
|
|
1663
|
+
break;
|
|
1664
|
+
|
|
1665
|
+
default:
|
|
1666
|
+
/*
|
|
1667
|
+
Centroid and median linkage.
|
|
1668
|
+
|
|
1669
|
+
Shorter and longer distances can occur, not bigger than max(d1,d2)
|
|
1670
|
+
but maybe smaller than min(d1,d2).
|
|
1671
|
+
*/
|
|
1672
|
+
for (j=active_nodes.start; j<idx2; j=active_nodes.succ[j]) {
|
|
1673
|
+
t_float const tmp = dist.template sqeuclidean<true>(j, idx2);
|
|
1674
|
+
if (tmp < mindist[j]) {
|
|
1675
|
+
nn_distances.update_leq(j, tmp);
|
|
1676
|
+
n_nghbr[j] = idx2;
|
|
1677
|
+
}
|
|
1678
|
+
else if (n_nghbr[j] == idx2)
|
|
1679
|
+
n_nghbr[j] = idx1; // invalidate
|
|
1680
|
+
}
|
|
1681
|
+
// Find the nearest neighbor for idx2.
|
|
1682
|
+
if (idx2<N_1) {
|
|
1683
|
+
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
|
|
1684
|
+
min = dist.template sqeuclidean<true>(idx2,j);
|
|
1685
|
+
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
|
|
1686
|
+
t_float const tmp = dist.template sqeuclidean<true>(idx2, j);
|
|
1687
|
+
if (tmp < min) {
|
|
1688
|
+
min = tmp;
|
|
1689
|
+
n_nghbr[idx2] = j;
|
|
1690
|
+
}
|
|
1691
|
+
}
|
|
1692
|
+
nn_distances.update(idx2, min);
|
|
1693
|
+
}
|
|
1694
|
+
}
|
|
1695
|
+
}
|
|
1696
|
+
}
|
|
1697
|
+
|
|
1698
|
+
template <method_codes_vector method, typename t_dissimilarity>
|
|
1699
|
+
static void generic_linkage_vector_alternative(const t_index N,
|
|
1700
|
+
t_dissimilarity & dist,
|
|
1701
|
+
cluster_result & Z2) {
|
|
1702
|
+
/*
|
|
1703
|
+
N: integer, number of data points
|
|
1704
|
+
dist: function pointer to the metric
|
|
1705
|
+
Z2: output data structure
|
|
1706
|
+
|
|
1707
|
+
This algorithm is valid for the distance update methods
|
|
1708
|
+
"Ward", "centroid" and "median" only!
|
|
1709
|
+
*/
|
|
1710
|
+
const t_index N_1 = N-1;
|
|
1711
|
+
t_index i, j=0; // loop variables
|
|
1712
|
+
t_index idx1, idx2; // row and column indices
|
|
1713
|
+
|
|
1714
|
+
auto_array_ptr<t_index> n_nghbr(2*N-2); // array of nearest neighbors
|
|
1715
|
+
auto_array_ptr<t_float> mindist(2*N-2); // distances to the nearest neighbors
|
|
1716
|
+
|
|
1717
|
+
doubly_linked_list active_nodes(N+N_1);
|
|
1718
|
+
binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap
|
|
1719
|
+
// structure for the distance to the nearest neighbor of each point
|
|
1720
|
+
|
|
1721
|
+
t_float min; // minimum for nearest-neighbor searches
|
|
1722
|
+
|
|
1723
|
+
// Initialize the minimal distances:
|
|
1724
|
+
// Find the nearest neighbor of each point.
|
|
1725
|
+
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
|
|
1726
|
+
for (i=1; i<N; ++i) {
|
|
1727
|
+
min = std::numeric_limits<t_float>::infinity();
|
|
1728
|
+
t_index idx;
|
|
1729
|
+
for (idx=j=0; j<i; ++j) {
|
|
1730
|
+
t_float tmp;
|
|
1731
|
+
switch (method) {
|
|
1732
|
+
case METHOD_VECTOR_WARD:
|
|
1733
|
+
tmp = dist.ward_initial(i,j);
|
|
1734
|
+
break;
|
|
1735
|
+
default:
|
|
1736
|
+
tmp = dist.template sqeuclidean<true>(i,j);
|
|
1737
|
+
}
|
|
1738
|
+
if (tmp<min) {
|
|
1739
|
+
min = tmp;
|
|
1740
|
+
idx = j;
|
|
1741
|
+
}
|
|
1742
|
+
}
|
|
1743
|
+
switch (method) {
|
|
1744
|
+
case METHOD_VECTOR_WARD:
|
|
1745
|
+
mindist[i] = t_dissimilarity::ward_initial_conversion(min);
|
|
1746
|
+
break;
|
|
1747
|
+
default:
|
|
1748
|
+
mindist[i] = min;
|
|
1749
|
+
}
|
|
1750
|
+
n_nghbr[i] = idx;
|
|
1751
|
+
}
|
|
1752
|
+
|
|
1753
|
+
// Put the minimal distances into a heap structure to make the repeated
|
|
1754
|
+
// global minimum searches fast.
|
|
1755
|
+
nn_distances.heapify();
|
|
1756
|
+
|
|
1757
|
+
// Main loop: We have N-1 merging steps.
|
|
1758
|
+
for (i=N; i<N+N_1; ++i) {
|
|
1759
|
+
/*
|
|
1760
|
+
The bookkeeping is different from the "stored matrix approach" algorithm
|
|
1761
|
+
generic_linkage.
|
|
1762
|
+
|
|
1763
|
+
mindist[i] stores a lower bound on the minimum distance of the point i to
|
|
1764
|
+
all points of *lower* index:
|
|
1765
|
+
|
|
1766
|
+
mindist[i] ≥ min_{j<i} D(i,j)
|
|
1767
|
+
|
|
1768
|
+
Moreover, new nodes do not re-use one of the old indices, but they are
|
|
1769
|
+
given a new, unique index (SciPy convention: initial nodes are 0,…,N−1,
|
|
1770
|
+
new nodes are N,…,2N−2).
|
|
1771
|
+
|
|
1772
|
+
Invalid nearest neighbors are not recognized by the fact that the stored
|
|
1773
|
+
distance is smaller than the actual distance, but the list active_nodes
|
|
1774
|
+
maintains a flag whether a node is inactive. If n_nghbr[i] points to an
|
|
1775
|
+
active node, the entries nn_distances[i] and n_nghbr[i] are valid,
|
|
1776
|
+
otherwise they must be recomputed.
|
|
1777
|
+
*/
|
|
1778
|
+
idx1 = nn_distances.argmin();
|
|
1779
|
+
while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
|
|
1780
|
+
// Recompute the minimum mindist[idx1] and n_nghbr[idx1].
|
|
1781
|
+
n_nghbr[idx1] = j = active_nodes.start;
|
|
1782
|
+
switch (method) {
|
|
1783
|
+
case METHOD_VECTOR_WARD:
|
|
1784
|
+
min = dist.ward_extended(idx1,j);
|
|
1785
|
+
for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
|
|
1786
|
+
t_float tmp = dist.ward_extended(idx1,j);
|
|
1787
|
+
if (tmp<min) {
|
|
1788
|
+
min = tmp;
|
|
1789
|
+
n_nghbr[idx1] = j;
|
|
1790
|
+
}
|
|
1791
|
+
}
|
|
1792
|
+
break;
|
|
1793
|
+
default:
|
|
1794
|
+
min = dist.sqeuclidean_extended(idx1,j);
|
|
1795
|
+
for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
|
|
1796
|
+
t_float const tmp = dist.sqeuclidean_extended(idx1,j);
|
|
1797
|
+
if (tmp<min) {
|
|
1798
|
+
min = tmp;
|
|
1799
|
+
n_nghbr[idx1] = j;
|
|
1800
|
+
}
|
|
1801
|
+
}
|
|
1802
|
+
}
|
|
1803
|
+
/* Update the heap with the new true minimum and search for the (possibly
|
|
1804
|
+
different) minimal entry. */
|
|
1805
|
+
nn_distances.update_geq(idx1, min);
|
|
1806
|
+
idx1 = nn_distances.argmin();
|
|
1807
|
+
}
|
|
1808
|
+
|
|
1809
|
+
idx2 = n_nghbr[idx1];
|
|
1810
|
+
active_nodes.remove(idx1);
|
|
1811
|
+
active_nodes.remove(idx2);
|
|
1812
|
+
|
|
1813
|
+
Z2.append(idx1, idx2, mindist[idx1]);
|
|
1814
|
+
|
|
1815
|
+
if (i<2*N_1) {
|
|
1816
|
+
switch (method) {
|
|
1817
|
+
case METHOD_VECTOR_WARD:
|
|
1818
|
+
case METHOD_VECTOR_CENTROID:
|
|
1819
|
+
dist.merge(idx1, idx2, i);
|
|
1820
|
+
break;
|
|
1821
|
+
|
|
1822
|
+
case METHOD_VECTOR_MEDIAN:
|
|
1823
|
+
dist.merge_weighted(idx1, idx2, i);
|
|
1824
|
+
break;
|
|
1825
|
+
|
|
1826
|
+
default:
|
|
1827
|
+
throw std::runtime_error(std::string("Invalid method."));
|
|
1828
|
+
}
|
|
1829
|
+
|
|
1830
|
+
n_nghbr[i] = active_nodes.start;
|
|
1831
|
+
if (method==METHOD_VECTOR_WARD) {
|
|
1832
|
+
/*
|
|
1833
|
+
Ward linkage.
|
|
1834
|
+
|
|
1835
|
+
Shorter and longer distances can occur, not smaller than min(d1,d2)
|
|
1836
|
+
but maybe bigger than max(d1,d2).
|
|
1837
|
+
*/
|
|
1838
|
+
min = dist.ward_extended(active_nodes.start, i);
|
|
1839
|
+
for (j=active_nodes.succ[active_nodes.start]; j<i;
|
|
1840
|
+
j=active_nodes.succ[j]) {
|
|
1841
|
+
t_float tmp = dist.ward_extended(j, i);
|
|
1842
|
+
if (tmp < min) {
|
|
1843
|
+
min = tmp;
|
|
1844
|
+
n_nghbr[i] = j;
|
|
1845
|
+
}
|
|
1846
|
+
}
|
|
1847
|
+
}
|
|
1848
|
+
else {
|
|
1849
|
+
/*
|
|
1850
|
+
Centroid and median linkage.
|
|
1851
|
+
|
|
1852
|
+
Shorter and longer distances can occur, not bigger than max(d1,d2)
|
|
1853
|
+
but maybe smaller than min(d1,d2).
|
|
1854
|
+
*/
|
|
1855
|
+
min = dist.sqeuclidean_extended(active_nodes.start, i);
|
|
1856
|
+
for (j=active_nodes.succ[active_nodes.start]; j<i;
|
|
1857
|
+
j=active_nodes.succ[j]) {
|
|
1858
|
+
t_float tmp = dist.sqeuclidean_extended(j, i);
|
|
1859
|
+
if (tmp < min) {
|
|
1860
|
+
min = tmp;
|
|
1861
|
+
n_nghbr[i] = j;
|
|
1862
|
+
}
|
|
1863
|
+
}
|
|
1864
|
+
}
|
|
1865
|
+
if (idx2<active_nodes.start) {
|
|
1866
|
+
nn_distances.remove(active_nodes.start);
|
|
1867
|
+
} else {
|
|
1868
|
+
nn_distances.remove(idx2);
|
|
1869
|
+
}
|
|
1870
|
+
nn_distances.replace(idx1, i, min);
|
|
1871
|
+
}
|
|
1872
|
+
}
|
|
1873
|
+
}
|
|
1874
|
+
|
|
1875
|
+
#if HAVE_VISIBILITY
|
|
1876
|
+
#pragma GCC visibility pop
|
|
1877
|
+
#endif
|