sequenzo 0.1.24__cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (264) hide show
  1. _sequenzo_fastcluster.cpython-311-darwin.so +0 -0
  2. sequenzo/__init__.py +240 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +474 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +20 -0
  30. sequenzo/data_preprocessing/helpers.py +256 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/mvad.csv +713 -0
  44. sequenzo/datasets/pairfam_family.csv +1867 -0
  45. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  46. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  47. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  48. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  49. sequenzo/define_sequence_data.py +609 -0
  50. sequenzo/dissimilarity_measures/__init__.py +31 -0
  51. sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
  52. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  53. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  54. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  55. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  56. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  57. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  58. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  59. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  60. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  61. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  62. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  63. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  214. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  215. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  216. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
  217. sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
  218. sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
  219. sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
  220. sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
  221. sequenzo/multidomain/__init__.py +23 -0
  222. sequenzo/multidomain/association_between_domains.py +311 -0
  223. sequenzo/multidomain/cat.py +431 -0
  224. sequenzo/multidomain/combt.py +519 -0
  225. sequenzo/multidomain/dat.py +89 -0
  226. sequenzo/multidomain/idcd.py +139 -0
  227. sequenzo/multidomain/linked_polyad.py +292 -0
  228. sequenzo/openmp_setup.py +233 -0
  229. sequenzo/prefix_tree/__init__.py +43 -0
  230. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  231. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  232. sequenzo/prefix_tree/utils.py +54 -0
  233. sequenzo/sequence_characteristics/__init__.py +40 -0
  234. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  235. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  236. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  237. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  238. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  239. sequenzo/sequence_characteristics/turbulence.py +155 -0
  240. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  241. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  242. sequenzo/suffix_tree/__init__.py +48 -0
  243. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  244. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  245. sequenzo/suffix_tree/utils.py +56 -0
  246. sequenzo/visualization/__init__.py +29 -0
  247. sequenzo/visualization/plot_mean_time.py +194 -0
  248. sequenzo/visualization/plot_modal_state.py +276 -0
  249. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  250. sequenzo/visualization/plot_relative_frequency.py +404 -0
  251. sequenzo/visualization/plot_sequence_index.py +951 -0
  252. sequenzo/visualization/plot_single_medoid.py +153 -0
  253. sequenzo/visualization/plot_state_distribution.py +627 -0
  254. sequenzo/visualization/plot_transition_matrix.py +190 -0
  255. sequenzo/visualization/utils/__init__.py +23 -0
  256. sequenzo/visualization/utils/utils.py +310 -0
  257. sequenzo/with_event_history_analysis/__init__.py +35 -0
  258. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  259. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  260. sequenzo-0.1.24.dist-info/METADATA +255 -0
  261. sequenzo-0.1.24.dist-info/RECORD +264 -0
  262. sequenzo-0.1.24.dist-info/WHEEL +5 -0
  263. sequenzo-0.1.24.dist-info/licenses/LICENSE +28 -0
  264. sequenzo-0.1.24.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1877 @@
1
+ /*
2
+ fastcluster: Fast hierarchical clustering routines for R and Python
3
+
4
+ Copyright:
5
+ * Until package version 1.1.23: © 2011 Daniel Müllner <https://danifold.net>
6
+ * All changes from version 1.1.24 on: © Google Inc. <https://www.google.com>
7
+
8
+ This library implements various fast algorithms for hierarchical,
9
+ agglomerative clustering methods:
10
+
11
+ (1) Algorithms for the "stored matrix approach": the input is the array of
12
+ pairwise dissimilarities.
13
+
14
+ MST_linkage_core: single linkage clustering with the "minimum spanning
15
+ tree" algorithm (Rohlf)
16
+
17
+ NN_chain_core: nearest-neighbor-chain algorithm, suitable for single,
18
+ complete, average, weighted and Ward linkage (Murtagh)
19
+
20
+ generic_linkage: generic algorithm, suitable for all distance update
21
+ formulas (Müllner)
22
+
23
+ (2) Algorithms for the "stored data approach": the input are points in a
24
+ vector space.
25
+
26
+ MST_linkage_core_vector: single linkage clustering for vector data
27
+
28
+ generic_linkage_vector: generic algorithm for vector data, suitable for
29
+ the Ward, centroid and median methods.
30
+
31
+ generic_linkage_vector_alternative: alternative scheme for updating the
32
+ nearest neighbors. This method seems faster than "generic_linkage_vector"
33
+ for the centroid and median methods but slower for the Ward method.
34
+
35
+ All these implementations treat infinity values correctly. They throw an
36
+ exception if a NaN distance value occurs.
37
+ */
38
+
39
// Decide whether <fenv.h> may be used. NO_INCLUDE_FENV is defined for the
// toolchains listed below; otherwise the header is included.

// Older versions of Microsoft Visual Studio do not have the fenv header.
#ifdef _MSC_VER
#if (_MSC_VER == 1500 || _MSC_VER == 1600)
#define NO_INCLUDE_FENV
#endif
#endif
// NaN detection via fenv might not work on systems with software
// floating-point emulation (bug report for Debian armel).
#ifdef __SOFTFP__
#define NO_INCLUDE_FENV
#endif
#ifdef NO_INCLUDE_FENV
#pragma message("Do not use fenv header.")
#else
#pragma message("Use fenv header.")
/* The following #pragma is necessary even if it generates a warning in many
   compilers. Quoting https://en.cppreference.com/w/cpp/numeric/fenv:
   "The floating-point environment access and modification is only meaningful
   when #pragma STDC FENV_ACCESS is supported and is set to ON. [...]
   In practice, few current compilers, such as HP aCC, Oracle Studio, or IBM XL,
   support the #pragma explicitly, but most compilers allow meaningful access
   to the floating-point environment anyway."
*/
#pragma STDC FENV_ACCESS ON
// Note: the directive is "message"; a misspelled pragma name is an unknown
// pragma and the hint below would never be shown.
#pragma message("If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.")
#include <fenv.h>
#endif
66
+
67
+ #include <cmath> // for std::pow, std::sqrt
68
+ #include <cstddef> // for std::ptrdiff_t
69
+ #include <limits> // for std::numeric_limits<...>::infinity()
70
+ #include <algorithm> // for std::fill_n
71
+ #include <stdexcept> // for std::runtime_error
72
+ #include <string> // for std::string
73
+
74
+ #include <cfloat> // also for DBL_MAX, DBL_MIN
75
+ #ifndef DBL_MANT_DIG
76
+ #error The constant DBL_MANT_DIG could not be defined.
77
+ #endif
78
+ #define T_FLOAT_MANT_DIG DBL_MANT_DIG
79
+
80
+ #ifndef LONG_MAX
81
+ #include <climits>
82
+ #endif
83
+ #ifndef LONG_MAX
84
+ #error The constant LONG_MAX could not be defined.
85
+ #endif
86
+ #ifndef INT_MAX
87
+ #error The constant INT_MAX could not be defined.
88
+ #endif
89
+
90
+ #ifndef INT32_MAX
91
+ #ifdef _MSC_VER
92
+ #if _MSC_VER >= 1600
93
+ #define __STDC_LIMIT_MACROS
94
+ #include <stdint.h>
95
+ #else
96
+ typedef __int32 int_fast32_t;
97
+ typedef __int64 int64_t;
98
+ #endif
99
+ #else
100
+ #define __STDC_LIMIT_MACROS
101
+ #include <stdint.h>
102
+ #endif
103
+ #endif
104
+
105
+ #define FILL_N std::fill_n
106
+ #ifdef _MSC_VER
107
+ #if _MSC_VER < 1600
108
+ #undef FILL_N
109
+ #define FILL_N stdext::unchecked_fill_n
110
+ #endif
111
+ #endif
112
+
113
+ // Suppress warnings about (potentially) uninitialized variables.
114
+ #ifdef _MSC_VER
115
+ #pragma warning (disable:4700)
116
+ #endif
117
+
118
+ #ifndef HAVE_DIAGNOSTIC
119
+ #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
120
+ #define HAVE_DIAGNOSTIC 1
121
+ #endif
122
+ #endif
123
+
124
+ #ifndef HAVE_VISIBILITY
125
+ #if __GNUC__ >= 4
126
+ #define HAVE_VISIBILITY 1
127
+ #endif
128
+ #endif
129
+
130
+ /* Since the public interface is given by the Python respectively R interface,
131
+ * we do not want other symbols than the interface initialization routines to be
132
+ * visible in the shared object file. The "visibility" switch is a GCC concept.
133
+ * Hiding symbols keeps the relocation table small and decreases startup time.
134
+ * See http://gcc.gnu.org/wiki/Visibility
135
+ */
136
+ #if HAVE_VISIBILITY
137
+ #pragma GCC visibility push(hidden)
138
+ #endif
139
+
140
+ typedef int_fast32_t t_index;
141
+ #ifndef INT32_MAX
142
+ #define MAX_INDEX 0x7fffffffL
143
+ #else
144
+ #define MAX_INDEX INT32_MAX
145
+ #endif
146
+ #if (LONG_MAX < MAX_INDEX)
147
+ #error The integer format "t_index" must not have a greater range than "long int".
148
+ #endif
149
+ #if (INT_MAX > MAX_INDEX)
150
+ #error The integer format "int" must not have a greater range than "t_index".
151
+ #endif
152
+ typedef double t_float;
153
+
154
/* Linkage method identifiers for the dissimilarity-based algorithms.

   The numeric values are part of the external interface: they must agree
   with the METHODS array in fastcluster.R and the dictionary mthidx in
   fastcluster.py.
*/
enum method_codes {
  // Smallest and largest valid code.
  MIN_METHOD_CODE      = 0,
  MAX_METHOD_CODE      = 7,

  // non-Euclidean methods
  METHOD_METR_SINGLE   = 0,
  METHOD_METR_COMPLETE = 1,
  METHOD_METR_AVERAGE  = 2,
  METHOD_METR_WEIGHTED = 3,
  METHOD_METR_WARD     = 4,
  METHOD_METR_CENTROID = 5,
  METHOD_METR_MEDIAN   = 6,
  METHOD_METR_WARD_D2  = 7,

  // Second name for the same code as METHOD_METR_WARD.
  METHOD_METR_WARD_D   = METHOD_METR_WARD
};
174
+
175
// Method identifiers for the algorithms that work on vector data.
enum method_codes_vector {
  // Smallest and largest code of the declared range.
  MIN_METHOD_VECTOR_CODE = 0,
  MAX_METHOD_VECTOR_CODE = 3,

  // Euclidean methods
  METHOD_VECTOR_SINGLE   = 0,
  METHOD_VECTOR_WARD     = 1,
  METHOD_VECTOR_CENTROID = 2,
  METHOD_VECTOR_MEDIAN   = 3,
  // NOTE(review): this value lies above MAX_METHOD_VECTOR_CODE; confirm
  // whether range checks elsewhere are meant to admit it.
  METHOD_VECTOR_WARD_D2  = 4
};
186
+
187
// Fallback so that this class also compiles when the FILL_N macro has not
// been defined beforehand; an existing definition always takes precedence.
#ifndef FILL_N
#define FILL_N std::fill_n
#endif

// self-destructing array pointer
//
// Owns an array allocated with new[] and releases it with delete[] in the
// destructor. Copying is disabled (the copy constructor and copy assignment
// are declared private and never defined). The object converts implicitly to
// a raw "type *", which is how elements are accessed.
template <typename type>
class auto_array_ptr{
private:
  type * ptr;  // owned array, or NULL when nothing is held
  auto_array_ptr(auto_array_ptr const &); // non construction-copyable
  auto_array_ptr& operator=(auto_array_ptr const &); // non copyable
public:
  // Create an empty holder that owns nothing.
  auto_array_ptr()
    : ptr(NULL)
  { }
  // Allocate "size" elements; they are default-initialized by new[].
  template <typename index>
  auto_array_ptr(index const size)
    : ptr(new type[size])
  { }
  // Allocate "size" elements and assign "val" to each of them.
  template <typename index, typename value>
  auto_array_ptr(index const size, value const val)
    : ptr(new type[size])
  {
    FILL_N(ptr, size, val);
  }
  ~auto_array_ptr() {
    delete [] ptr; }
  // Release the array now and return to the empty state.
  void free() {
    delete [] ptr;
    ptr = NULL;
  }
  // (Re)allocate "size" elements. An array that is already owned is released
  // instead of being leaked. The new array is allocated first, so if new[]
  // throws, the object still owns its previous array.
  template <typename index>
  void init(index const size) {
    type * const fresh = new type [size];
    delete [] ptr;
    ptr = fresh;
  }
  // (Re)allocate "size" elements and assign "val" to each of them.
  template <typename index, typename value>
  void init(index const size, value const val) {
    init(size);
    FILL_N(ptr, size, val);
  }
  // Implicit access to the raw array; ownership stays with this object.
  inline operator type *() const { return ptr; }
};
225
+
226
+ struct node {
227
+ t_index node1, node2;
228
+ t_float dist;
229
+ };
230
+
231
+ inline bool operator< (const node a, const node b) {
232
+ return (a.dist < b.dist);
233
+ }
234
+
235
+ class cluster_result {
236
+ private:
237
+ auto_array_ptr<node> Z;
238
+ t_index pos;
239
+
240
+ public:
241
+ cluster_result(const t_index size)
242
+ : Z(size)
243
+ , pos(0)
244
+ {}
245
+
246
+ void append(const t_index node1, const t_index node2, const t_float dist) {
247
+ Z[pos].node1 = node1;
248
+ Z[pos].node2 = node2;
249
+ Z[pos].dist = dist;
250
+ ++pos;
251
+ }
252
+
253
+ node * operator[] (const t_index idx) const { return Z + idx; }
254
+
255
+ /* Define several methods to postprocess the distances. All these functions
256
+ are monotone, so they do not change the sorted order of distances. */
257
+
258
+ void sqrt() const {
259
+ for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
260
+ ZZ->dist = std::sqrt(ZZ->dist);
261
+ }
262
+ }
263
+
264
+ void sqrt(const t_float) const { // ignore the argument
265
+ sqrt();
266
+ }
267
+
268
+ void sqrtdouble(const t_float) const { // ignore the argument
269
+ for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
270
+ ZZ->dist = std::sqrt(2*ZZ->dist);
271
+ }
272
+ }
273
+
274
+ void sqrtward(const t_float) const { // ignore the argument
275
+ for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
276
+ ZZ->dist = 2*ZZ->dist;
277
+ }
278
+ }
279
+
280
+ #ifdef R_pow
281
+ #define my_pow R_pow
282
+ #else
283
+ #define my_pow std::pow
284
+ #endif
285
+
286
+ void power(const t_float p) const {
287
+ t_float const q = 1/p;
288
+ for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
289
+ ZZ->dist = my_pow(ZZ->dist,q);
290
+ }
291
+ }
292
+
293
+ void plusone(const t_float) const { // ignore the argument
294
+ for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
295
+ ZZ->dist += 1;
296
+ }
297
+ }
298
+
299
+ void divide(const t_float denom) const {
300
+ for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
301
+ ZZ->dist /= denom;
302
+ }
303
+ }
304
+ };
305
+
306
+ class doubly_linked_list {
307
+ /*
308
+ Class for a doubly linked list. Initially, the list is the integer range
309
+ [0, size]. We provide a forward iterator and a method to delete an index
310
+ from the list.
311
+
312
+ Typical use: for (i=L.start; L<size; i=L.succ[I])
313
+ or
314
+ for (i=somevalue; L<size; i=L.succ[I])
315
+ */
316
+ public:
317
+ t_index start;
318
+ auto_array_ptr<t_index> succ;
319
+
320
+ private:
321
+ auto_array_ptr<t_index> pred;
322
+ // Not necessarily private, we just do not need it in this instance.
323
+
324
+ public:
325
+ doubly_linked_list(const t_index size)
326
+ // Initialize to the given size.
327
+ : start(0)
328
+ , succ(size+1)
329
+ , pred(size+1)
330
+ {
331
+ for (t_index i=0; i<size; ++i) {
332
+ pred[i+1] = i;
333
+ succ[i] = i+1;
334
+ }
335
+ // pred[0] is never accessed!
336
+ //succ[size] is never accessed!
337
+ }
338
+
339
+ ~doubly_linked_list() {}
340
+
341
+ void remove(const t_index idx) {
342
+ // Remove an index from the list.
343
+ if (idx==start) {
344
+ start = succ[idx];
345
+ }
346
+ else {
347
+ succ[pred[idx]] = succ[idx];
348
+ pred[succ[idx]] = pred[idx];
349
+ }
350
+ succ[idx] = 0; // Mark as inactive
351
+ }
352
+
353
+ bool is_inactive(t_index idx) const {
354
+ return (succ[idx]==0);
355
+ }
356
+ };
357
+
358
// Indexing macros
//
// D_(r_,c_): entry (r_,c_) of a symmetric (NxN)-matrix of which only the
// upper triangular part is stored, row after row, in the array D.
// The macro uses the names D and N from the scope where it is expanded.
// We require r_ < c_ !
#define D_(r_,c_) \
  ( D[(static_cast<std::ptrdiff_t>(2*N-3-(r_))*(r_)>>1)+(c_)-1] )
// Z_(_r,_c): entry (_r,_c) of Z, where Z is an ((N-1)x4)-array stored row
// after row.
#define Z_(_r, _c) \
  (Z[(_r)*4 + (_c)])
364
+
365
+ /*
366
+ Lookup function for a union-find data structure.
367
+
368
+ The function finds the root of idx by going iteratively through all
369
+ parent elements until a root is found. An element i is a root if
370
+ nodes[i] is zero. To make subsequent searches faster, the entry for
371
+ idx and all its parents is updated with the root element.
372
+ */
373
+ class union_find {
374
+ private:
375
+ auto_array_ptr<t_index> parent;
376
+ t_index nextparent;
377
+
378
+ public:
379
+ union_find(const t_index size)
380
+ : parent(size>0 ? 2*size-1 : 0, 0)
381
+ , nextparent(size)
382
+ { }
383
+
384
+ t_index Find (t_index idx) const {
385
+ if (parent[idx] != 0 ) { // a → b
386
+ t_index p = idx;
387
+ idx = parent[idx];
388
+ if (parent[idx] != 0 ) { // a → b → c
389
+ do {
390
+ idx = parent[idx];
391
+ } while (parent[idx] != 0);
392
+ do {
393
+ t_index tmp = parent[p];
394
+ parent[p] = idx;
395
+ p = tmp;
396
+ } while (parent[p] != idx);
397
+ }
398
+ }
399
+ return idx;
400
+ }
401
+
402
+ void Union (const t_index node1, const t_index node2) {
403
+ parent[node1] = parent[node2] = nextparent++;
404
+ }
405
+ };
406
+
407
// Exception type thrown by the clustering routines in this file when a NaN
// value is found among the dissimilarities.
class nan_error {};
#if defined(FE_INVALID)
// Exception type thrown when feclearexcept(FE_INVALID) fails or
// fetestexcept(FE_INVALID) reports a raised flag.
class fenv_error {};
#endif
411
+
412
+ static void MST_linkage_core(const t_index N, const t_float * const D,
413
+ cluster_result & Z2) {
414
+ /*
415
+ N: integer, number of data points
416
+ D: condensed distance matrix N*(N-1)/2
417
+ Z2: output data structure
418
+
419
+ The basis of this algorithm is an algorithm by Rohlf:
420
+
421
+ F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
422
+ The Computer Journal, vol. 16, 1973, p. 93–95.
423
+ */
424
+ t_index i;
425
+ t_index idx2;
426
+ doubly_linked_list active_nodes(N);
427
+ auto_array_ptr<t_float> d(N);
428
+
429
+ t_index prev_node;
430
+ t_float min;
431
+
432
+ // first iteration
433
+ idx2 = 1;
434
+ min = std::numeric_limits<t_float>::infinity();
435
+ for (i=1; i<N; ++i) {
436
+ d[i] = D[i-1];
437
+ #if HAVE_DIAGNOSTIC
438
+ #pragma GCC diagnostic push
439
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
440
+ #endif
441
+ if (d[i] < min) {
442
+ min = d[i];
443
+ idx2 = i;
444
+ }
445
+ else if (fc_isnan(d[i]))
446
+ throw (nan_error());
447
+ #if HAVE_DIAGNOSTIC
448
+ #pragma GCC diagnostic pop
449
+ #endif
450
+ }
451
+ Z2.append(0, idx2, min);
452
+
453
+ for (t_index j=1; j<N-1; ++j) {
454
+ prev_node = idx2;
455
+ active_nodes.remove(prev_node);
456
+
457
+ idx2 = active_nodes.succ[0];
458
+ min = d[idx2];
459
+ for (i=idx2; i<prev_node; i=active_nodes.succ[i]) {
460
+ t_float tmp = D_(i, prev_node);
461
+ #if HAVE_DIAGNOSTIC
462
+ #pragma GCC diagnostic push
463
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
464
+ #endif
465
+ if (tmp < d[i])
466
+ d[i] = tmp;
467
+ else if (fc_isnan(tmp))
468
+ throw (nan_error());
469
+ #if HAVE_DIAGNOSTIC
470
+ #pragma GCC diagnostic pop
471
+ #endif
472
+ if (d[i] < min) {
473
+ min = d[i];
474
+ idx2 = i;
475
+ }
476
+ }
477
+ for (; i<N; i=active_nodes.succ[i]) {
478
+ t_float tmp = D_(prev_node, i);
479
+ #if HAVE_DIAGNOSTIC
480
+ #pragma GCC diagnostic push
481
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
482
+ #endif
483
+ if (d[i] > tmp)
484
+ d[i] = tmp;
485
+ else if (fc_isnan(tmp))
486
+ throw (nan_error());
487
+ #if HAVE_DIAGNOSTIC
488
+ #pragma GCC diagnostic pop
489
+ #endif
490
+ if (d[i] < min) {
491
+ min = d[i];
492
+ idx2 = i;
493
+ }
494
+ }
495
+ Z2.append(prev_node, idx2, min);
496
+ }
497
+ }
498
+
499
+ /* Functions for the update of the dissimilarity array */
500
+
501
+ inline static void f_single( t_float * const b, const t_float a ) {
502
+ if (*b > a) *b = a;
503
+ }
504
+ inline static void f_complete( t_float * const b, const t_float a ) {
505
+ if (*b < a) *b = a;
506
+ }
507
+ inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) {
508
+ *b = s*a + t*(*b);
509
+ #ifndef FE_INVALID
510
+ #if HAVE_DIAGNOSTIC
511
+ #pragma GCC diagnostic push
512
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
513
+ #endif
514
+ if (fc_isnan(*b)) {
515
+ throw(nan_error());
516
+ }
517
+ #if HAVE_DIAGNOSTIC
518
+ #pragma GCC diagnostic pop
519
+ #endif
520
+ #endif
521
+ }
522
+ inline static void f_weighted( t_float * const b, const t_float a) {
523
+ *b = (a+*b)*.5;
524
+ #ifndef FE_INVALID
525
+ #if HAVE_DIAGNOSTIC
526
+ #pragma GCC diagnostic push
527
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
528
+ #endif
529
+ if (fc_isnan(*b)) {
530
+ throw(nan_error());
531
+ }
532
+ #if HAVE_DIAGNOSTIC
533
+ #pragma GCC diagnostic pop
534
+ #endif
535
+ #endif
536
+ }
537
+ inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) {
538
+ *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v);
539
+ //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v);
540
+ #ifndef FE_INVALID
541
+ #if HAVE_DIAGNOSTIC
542
+ #pragma GCC diagnostic push
543
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
544
+ #endif
545
+ if (fc_isnan(*b)) {
546
+ throw(nan_error());
547
+ }
548
+ #if HAVE_DIAGNOSTIC
549
+ #pragma GCC diagnostic pop
550
+ #endif
551
+ #endif
552
+ }
553
+ inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) {
554
+ *b = s*a - stc + t*(*b);
555
+ #ifndef FE_INVALID
556
+ if (fc_isnan(*b)) {
557
+ throw(nan_error());
558
+ }
559
+ #if HAVE_DIAGNOSTIC
560
+ #pragma GCC diagnostic pop
561
+ #endif
562
+ #endif
563
+ }
564
+ inline static void f_median( t_float * const b, const t_float a, const t_float c_4) {
565
+ *b = (a+(*b))*.5 - c_4;
566
+ #ifndef FE_INVALID
567
+ #if HAVE_DIAGNOSTIC
568
+ #pragma GCC diagnostic push
569
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
570
+ #endif
571
+ if (fc_isnan(*b)) {
572
+ throw(nan_error());
573
+ }
574
+ #if HAVE_DIAGNOSTIC
575
+ #pragma GCC diagnostic pop
576
+ #endif
577
+ #endif
578
+ }
579
+
580
+ template <method_codes method, typename t_members>
581
+ static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
582
+ /*
583
+ N: integer
584
+ D: condensed distance matrix N*(N-1)/2
585
+ Z2: output data structure
586
+
587
+ This is the NN-chain algorithm, described on page 86 in the following book:
588
+
589
+ Fionn Murtagh, Multidimensional Clustering Algorithms,
590
+ Vienna, Würzburg: Physica-Verlag, 1985.
591
+ */
592
+ t_index i;
593
+
594
+ auto_array_ptr<t_index> NN_chain(N);
595
+ t_index NN_chain_tip = 0;
596
+
597
+ t_index idx1, idx2;
598
+
599
+ t_float size1, size2;
600
+ doubly_linked_list active_nodes(N);
601
+
602
+ t_float min;
603
+
604
+ for (t_float const * DD=D; DD!=D+(static_cast<std::ptrdiff_t>(N)*(N-1)>>1);
605
+ ++DD) {
606
+ #if HAVE_DIAGNOSTIC
607
+ #pragma GCC diagnostic push
608
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
609
+ #endif
610
+ if (fc_isnan(*DD)) {
611
+ throw(nan_error());
612
+ }
613
+ #if HAVE_DIAGNOSTIC
614
+ #pragma GCC diagnostic pop
615
+ #endif
616
+ }
617
+
618
+ #ifdef FE_INVALID
619
+ if (feclearexcept(FE_INVALID)) throw fenv_error();
620
+ #endif
621
+
622
+ for (t_index j=0; j<N-1; ++j) {
623
+ if (NN_chain_tip <= 3) {
624
+ NN_chain[0] = idx1 = active_nodes.start;
625
+ NN_chain_tip = 1;
626
+
627
+ idx2 = active_nodes.succ[idx1];
628
+ min = D_(idx1,idx2);
629
+ for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
630
+ if (D_(idx1,i) < min) {
631
+ min = D_(idx1,i);
632
+ idx2 = i;
633
+ }
634
+ }
635
+ } // a: idx1 b: idx2
636
+ else {
637
+ NN_chain_tip -= 3;
638
+ idx1 = NN_chain[NN_chain_tip-1];
639
+ idx2 = NN_chain[NN_chain_tip];
640
+ min = idx1<idx2 ? D_(idx1,idx2) : D_(idx2,idx1);
641
+ } // a: idx1 b: idx2
642
+
643
+ do {
644
+ NN_chain[NN_chain_tip] = idx2;
645
+
646
+ for (i=active_nodes.start; i<idx2; i=active_nodes.succ[i]) {
647
+ if (D_(i,idx2) < min) {
648
+ min = D_(i,idx2);
649
+ idx1 = i;
650
+ }
651
+ }
652
+ for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
653
+ if (D_(idx2,i) < min) {
654
+ min = D_(idx2,i);
655
+ idx1 = i;
656
+ }
657
+ }
658
+
659
+ idx2 = idx1;
660
+ idx1 = NN_chain[NN_chain_tip++];
661
+
662
+ } while (idx2 != NN_chain[NN_chain_tip-2]);
663
+
664
+ Z2.append(idx1, idx2, min);
665
+
666
+ if (idx1>idx2) {
667
+ t_index tmp = idx1;
668
+ idx1 = idx2;
669
+ idx2 = tmp;
670
+ }
671
+
672
+ if (method==METHOD_METR_AVERAGE ||
673
+ method==METHOD_METR_WARD ||
674
+ method==METHOD_METR_WARD_D2) {
675
+ size1 = static_cast<t_float>(members[idx1]);
676
+ size2 = static_cast<t_float>(members[idx2]);
677
+ members[idx2] += members[idx1];
678
+ }
679
+
680
+ // Remove the smaller index from the valid indices (active_nodes).
681
+ active_nodes.remove(idx1);
682
+
683
+ switch (method) {
684
+ case METHOD_METR_SINGLE:
685
+ /*
686
+ Single linkage.
687
+
688
+ Characteristic: new distances are never longer than the old distances.
689
+ */
690
+ // Update the distance matrix in the range [start, idx1).
691
+ for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
692
+ f_single(&D_(i, idx2), D_(i, idx1) );
693
+ // Update the distance matrix in the range (idx1, idx2).
694
+ for (; i<idx2; i=active_nodes.succ[i])
695
+ f_single(&D_(i, idx2), D_(idx1, i) );
696
+ // Update the distance matrix in the range (idx2, N).
697
+ for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
698
+ f_single(&D_(idx2, i), D_(idx1, i) );
699
+ break;
700
+
701
+ case METHOD_METR_COMPLETE:
702
+ /*
703
+ Complete linkage.
704
+
705
+ Characteristic: new distances are never shorter than the old distances.
706
+ */
707
+ // Update the distance matrix in the range [start, idx1).
708
+ for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
709
+ f_complete(&D_(i, idx2), D_(i, idx1) );
710
+ // Update the distance matrix in the range (idx1, idx2).
711
+ for (; i<idx2; i=active_nodes.succ[i])
712
+ f_complete(&D_(i, idx2), D_(idx1, i) );
713
+ // Update the distance matrix in the range (idx2, N).
714
+ for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
715
+ f_complete(&D_(idx2, i), D_(idx1, i) );
716
+ break;
717
+
718
+ case METHOD_METR_AVERAGE: {
719
+ /*
720
+ Average linkage.
721
+
722
+ Shorter and longer distances can occur.
723
+ */
724
+ // Update the distance matrix in the range [start, idx1).
725
+ t_float s = size1/(size1+size2);
726
+ t_float t = size2/(size1+size2);
727
+ for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
728
+ f_average(&D_(i, idx2), D_(i, idx1), s, t );
729
+ // Update the distance matrix in the range (idx1, idx2).
730
+ for (; i<idx2; i=active_nodes.succ[i])
731
+ f_average(&D_(i, idx2), D_(idx1, i), s, t );
732
+ // Update the distance matrix in the range (idx2, N).
733
+ for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
734
+ f_average(&D_(idx2, i), D_(idx1, i), s, t );
735
+ break;
736
+ }
737
+
738
+ case METHOD_METR_WEIGHTED:
739
+ /*
740
+ Weighted linkage.
741
+
742
+ Shorter and longer distances can occur.
743
+ */
744
+ // Update the distance matrix in the range [start, idx1).
745
+ for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
746
+ f_weighted(&D_(i, idx2), D_(i, idx1) );
747
+ // Update the distance matrix in the range (idx1, idx2).
748
+ for (; i<idx2; i=active_nodes.succ[i])
749
+ f_weighted(&D_(i, idx2), D_(idx1, i) );
750
+ // Update the distance matrix in the range (idx2, N).
751
+ for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
752
+ f_weighted(&D_(idx2, i), D_(idx1, i) );
753
+ break;
754
+
755
+ case METHOD_METR_WARD:
756
+ /*
757
+ Ward linkage.
758
+
759
+ Shorter and longer distances can occur, not smaller than min(d1,d2)
760
+ but maybe bigger than max(d1,d2).
761
+ */
762
+ // Update the distance matrix in the range [start, idx1).
763
+ //t_float v = static_cast<t_float>(members[i]);
764
+ for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
765
+ f_ward(&D_(i, idx2), D_(i, idx1), min,
766
+ size1, size2, static_cast<t_float>(members[i]) );
767
+ // Update the distance matrix in the range (idx1, idx2).
768
+ for (; i<idx2; i=active_nodes.succ[i])
769
+ f_ward(&D_(i, idx2), D_(idx1, i), min,
770
+ size1, size2, static_cast<t_float>(members[i]) );
771
+ // Update the distance matrix in the range (idx2, N).
772
+ for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
773
+ f_ward(&D_(idx2, i), D_(idx1, i), min,
774
+ size1, size2, static_cast<t_float>(members[i]) );
775
+ break;
776
+
777
+ case METHOD_METR_WARD_D2:
778
+ /*
779
+ Ward D2 linkage (with squared Euclidean distances).
780
+
781
+ Shorter and longer distances can occur, not smaller than min(d1,d2)
782
+ but maybe bigger than max(d1,d2).
783
+ Uses the same update formula as Ward, but with different post-processing.
784
+ */
785
+ // Update the distance matrix in the range [start, idx1).
786
+ for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
787
+ f_ward(&D_(i, idx2), D_(i, idx1), min,
788
+ size1, size2, static_cast<t_float>(members[i]) );
789
+ // Update the distance matrix in the range (idx1, idx2).
790
+ for (; i<idx2; i=active_nodes.succ[i])
791
+ f_ward(&D_(i, idx2), D_(idx1, i), min,
792
+ size1, size2, static_cast<t_float>(members[i]) );
793
+ // Update the distance matrix in the range (idx2, N).
794
+ for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
795
+ f_ward(&D_(idx2, i), D_(idx1, i), min,
796
+ size1, size2, static_cast<t_float>(members[i]) );
797
+ break;
798
+
799
+ default:
800
+ throw std::runtime_error(std::string("Invalid method."));
801
+ }
802
+ }
803
+ #ifdef FE_INVALID
804
+ if (fetestexcept(FE_INVALID)) throw fenv_error();
805
+ #endif
806
+ }
807
+
808
+ class binary_min_heap {
809
+ /*
810
+ Class for a binary min-heap. The data resides in an array A. The elements of
811
+ A are not changed but two lists I and R of indices are generated which point
812
+ to elements of A and backwards.
813
+
814
+ The heap tree structure is
815
+
816
+ H[2*i+1] H[2*i+2]
817
+ \ /
818
+ \ /
819
+ ≤ ≤
820
+ \ /
821
+ \ /
822
+ H[i]
823
+
824
+ where the children must be less or equal than their parent. Thus, H[0]
825
+ contains the minimum. The lists I and R are made such that H[i] = A[I[i]]
826
+ and R[I[i]] = i.
827
+
828
+ This implementation is not designed to handle NaN values.
829
+ */
830
+ private:
831
+ t_float * const A;
832
+ t_index size;
833
+ auto_array_ptr<t_index> I;
834
+ auto_array_ptr<t_index> R;
835
+
836
+ // no default constructor
837
+ binary_min_heap();
838
+ // noncopyable
839
+ binary_min_heap(binary_min_heap const &);
840
+ binary_min_heap & operator=(binary_min_heap const &);
841
+
842
+ public:
843
+ binary_min_heap(t_float * const A_, const t_index size_)
844
+ : A(A_), size(size_), I(size), R(size)
845
+ { // Allocate memory and initialize the lists I and R to the identity. This
846
+ // does not make it a heap. Call heapify afterwards!
847
+ for (t_index i=0; i<size; ++i)
848
+ R[i] = I[i] = i;
849
+ }
850
+
851
+ binary_min_heap(t_float * const A_, const t_index size1, const t_index size2,
852
+ const t_index start)
853
+ : A(A_), size(size1), I(size1), R(size2)
854
+ { // Allocate memory and initialize the lists I and R to the identity. This
855
+ // does not make it a heap. Call heapify afterwards!
856
+ for (t_index i=0; i<size; ++i) {
857
+ R[i+start] = i;
858
+ I[i] = i + start;
859
+ }
860
+ }
861
+
862
+ ~binary_min_heap() {}
863
+
864
+ void heapify() {
865
+ // Arrange the indices I and R so that H[i] := A[I[i]] satisfies the heap
866
+ // condition H[i] < H[2*i+1] and H[i] < H[2*i+2] for each i.
867
+ //
868
+ // Complexity: Θ(size)
869
+ // Reference: Cormen, Leiserson, Rivest, Stein, Introduction to Algorithms,
870
+ // 3rd ed., 2009, Section 6.3 “Building a heap”
871
+ t_index idx;
872
+ for (idx=(size>>1); idx>0; ) {
873
+ --idx;
874
+ update_geq_(idx);
875
+ }
876
+ }
877
+
878
+ inline t_index argmin() const {
879
+ // Return the minimal element.
880
+ return I[0];
881
+ }
882
+
883
+ void heap_pop() {
884
+ // Remove the minimal element from the heap.
885
+ --size;
886
+ I[0] = I[size];
887
+ R[I[0]] = 0;
888
+ update_geq_(0);
889
+ }
890
+
891
+ void remove(t_index idx) {
892
+ // Remove an element from the heap.
893
+ --size;
894
+ R[I[size]] = R[idx];
895
+ I[R[idx]] = I[size];
896
+ if ( H(size)<=A[idx] ) {
897
+ update_leq_(R[idx]);
898
+ }
899
+ else {
900
+ update_geq_(R[idx]);
901
+ }
902
+ }
903
+
904
+ void replace ( const t_index idxold, const t_index idxnew,
905
+ const t_float val) {
906
+ R[idxnew] = R[idxold];
907
+ I[R[idxnew]] = idxnew;
908
+ if (val<=A[idxold])
909
+ update_leq(idxnew, val);
910
+ else
911
+ update_geq(idxnew, val);
912
+ }
913
+
914
+ void update ( const t_index idx, const t_float val ) const {
915
+ // Update the element A[i] with val and re-arrange the indices to preserve
916
+ // the heap condition.
917
+ if (val<=A[idx])
918
+ update_leq(idx, val);
919
+ else
920
+ update_geq(idx, val);
921
+ }
922
+
923
+ void update_leq ( const t_index idx, const t_float val ) const {
924
+ // Use this when the new value is not more than the old value.
925
+ A[idx] = val;
926
+ update_leq_(R[idx]);
927
+ }
928
+
929
+ void update_geq ( const t_index idx, const t_float val ) const {
930
+ // Use this when the new value is not less than the old value.
931
+ A[idx] = val;
932
+ update_geq_(R[idx]);
933
+ }
934
+
935
+ private:
936
+ void update_leq_ (t_index i) const {
937
+ t_index j;
938
+ for ( ; (i>0) && ( H(i)<H(j=(i-1)>>1) ); i=j)
939
+ heap_swap(i,j);
940
+ }
941
+
942
+ void update_geq_ (t_index i) const {
943
+ t_index j;
944
+ for ( ; (j=2*i+1)<size; i=j) {
945
+ if ( H(j)>=H(i) ) {
946
+ ++j;
947
+ if ( j>=size || H(j)>=H(i) ) break;
948
+ }
949
+ else if ( j+1<size && H(j+1)<H(j) ) ++j;
950
+ heap_swap(i, j);
951
+ }
952
+ }
953
+
954
+ void heap_swap(const t_index i, const t_index j) const {
955
+ // Swap two indices.
956
+ t_index tmp = I[i];
957
+ I[i] = I[j];
958
+ I[j] = tmp;
959
+ R[I[i]] = i;
960
+ R[I[j]] = j;
961
+ }
962
+
963
+ inline t_float H(const t_index i) const {
964
+ return A[I[i]];
965
+ }
966
+
967
+ };
968
+
969
+ template <method_codes method, typename t_members>
970
+ static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
971
+ /*
972
+ N: integer, number of data points
973
+ D: condensed distance matrix N*(N-1)/2
974
+ Z2: output data structure
975
+ */
976
+
977
+ const t_index N_1 = N-1;
978
+ t_index i, j; // loop variables
979
+ t_index idx1, idx2; // row and column indices
980
+
981
+ auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
982
+ auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
983
+ auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
984
+ // i-th row represents
985
+ doubly_linked_list active_nodes(N);
986
+ binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
987
+ // the distance to the nearest neighbor of each point
988
+ t_index node1, node2; // node numbers in the output
989
+ t_float size1, size2; // and their cardinalities
990
+
991
+ t_float min; // minimum and row index for nearest-neighbor search
992
+ t_index idx;
993
+
994
+ for (i=0; i<N; ++i)
995
+ // Build a list of row ↔ node label assignments.
996
+ // Initially i ↦ i
997
+ row_repr[i] = i;
998
+
999
+ // Initialize the minimal distances:
1000
+ // Find the nearest neighbor of each point.
1001
+ // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1002
+ t_float const * DD = D;
1003
+ for (i=0; i<N_1; ++i) {
1004
+ min = std::numeric_limits<t_float>::infinity();
1005
+ for (idx=j=i+1; j<N; ++j, ++DD) {
1006
+ #if HAVE_DIAGNOSTIC
1007
+ #pragma GCC diagnostic push
1008
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
1009
+ #endif
1010
+ if (*DD<min) {
1011
+ min = *DD;
1012
+ idx = j;
1013
+ }
1014
+ else if (fc_isnan(*DD))
1015
+ throw(nan_error());
1016
+ }
1017
+ #if HAVE_DIAGNOSTIC
1018
+ #pragma GCC diagnostic pop
1019
+ #endif
1020
+ mindist[i] = min;
1021
+ n_nghbr[i] = idx;
1022
+ }
1023
+
1024
+ // Put the minimal distances into a heap structure to make the repeated
1025
+ // global minimum searches fast.
1026
+ nn_distances.heapify();
1027
+
1028
+ #ifdef FE_INVALID
1029
+ if (feclearexcept(FE_INVALID)) throw fenv_error();
1030
+ #endif
1031
+
1032
+ // Main loop: We have N-1 merging steps.
1033
+ for (i=0; i<N_1; ++i) {
1034
+ /*
1035
+ Here is a special feature that allows fast bookkeeping and updates of the
1036
+ minimal distances.
1037
+
1038
+ mindist[i] stores a lower bound on the minimum distance of the point i to
1039
+ all points of higher index:
1040
+
1041
+ mindist[i] ≤ min_{j>i} D(i,j)
1042
+
1043
+ Normally, we have equality. However, this minimum may become invalid due
1044
+ to the updates in the distance matrix. The rules are:
1045
+
1046
+ 1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct
1047
+ minimum and n_nghbr[i] is a nearest neighbor.
1048
+
1049
+ 2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the
1050
+ correct minimum. The minimum needs to be recomputed.
1051
+
1052
+ 3) mindist[i] is never bigger than the true minimum. Hence, we never
1053
+ miss the true minimum if we take the smallest mindist entry,
1054
+ re-compute the value if necessary (thus maybe increasing it) and
1055
+ looking for the now smallest mindist entry until a valid minimal
1056
+ entry is found. This step is done in the lines below.
1057
+
1058
+ The update process for D below takes care that these rules are
1059
+ fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are
1060
+ re-calculated when necessary but re-calculation is avoided whenever
1061
+ possible.
1062
+
1063
+ The re-calculation of the minima makes the worst-case runtime of this
1064
+ algorithm cubic in N. We avoid this whenever possible, and in most cases
1065
+ the runtime appears to be quadratic.
1066
+ */
1067
+ idx1 = nn_distances.argmin();
1068
+ if (method != METHOD_METR_SINGLE) {
1069
+ while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) {
1070
+ // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1071
+ n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1072
+ min = D_(idx1,j);
1073
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1074
+ if (D_(idx1,j)<min) {
1075
+ min = D_(idx1,j);
1076
+ n_nghbr[idx1] = j;
1077
+ }
1078
+ }
1079
+ /* Update the heap with the new true minimum and search for the
1080
+ (possibly different) minimal entry. */
1081
+ nn_distances.update_geq(idx1, min);
1082
+ idx1 = nn_distances.argmin();
1083
+ }
1084
+ }
1085
+
1086
+ nn_distances.heap_pop(); // Remove the current minimum from the heap.
1087
+ idx2 = n_nghbr[idx1];
1088
+
1089
+ // Write the newly found minimal pair of nodes to the output array.
1090
+ node1 = row_repr[idx1];
1091
+ node2 = row_repr[idx2];
1092
+
1093
+ if (method==METHOD_METR_AVERAGE ||
1094
+ method==METHOD_METR_WARD ||
1095
+ method==METHOD_METR_WARD_D2 ||
1096
+ method==METHOD_METR_CENTROID) {
1097
+ size1 = static_cast<t_float>(members[idx1]);
1098
+ size2 = static_cast<t_float>(members[idx2]);
1099
+ members[idx2] += members[idx1];
1100
+ }
1101
+ Z2.append(node1, node2, mindist[idx1]);
1102
+
1103
+ // Remove idx1 from the list of active indices (active_nodes).
1104
+ active_nodes.remove(idx1);
1105
+ // Index idx2 now represents the new (merged) node with label N+i.
1106
+ row_repr[idx2] = N+i;
1107
+
1108
+ // Update the distance matrix
1109
+ switch (method) {
1110
+ case METHOD_METR_SINGLE:
1111
+ /*
1112
+ Single linkage.
1113
+
1114
+ Characteristic: new distances are never longer than the old distances.
1115
+ */
1116
+ // Update the distance matrix in the range [start, idx1).
1117
+ for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1118
+ f_single(&D_(j, idx2), D_(j, idx1));
1119
+ if (n_nghbr[j] == idx1)
1120
+ n_nghbr[j] = idx2;
1121
+ }
1122
+ // Update the distance matrix in the range (idx1, idx2).
1123
+ for (; j<idx2; j=active_nodes.succ[j]) {
1124
+ f_single(&D_(j, idx2), D_(idx1, j));
1125
+ // If the new value is below the old minimum in a row, update
1126
+ // the mindist and n_nghbr arrays.
1127
+ if (D_(j, idx2) < mindist[j]) {
1128
+ nn_distances.update_leq(j, D_(j, idx2));
1129
+ n_nghbr[j] = idx2;
1130
+ }
1131
+ }
1132
+ // Update the distance matrix in the range (idx2, N).
1133
+ // Recompute the minimum mindist[idx2] and n_nghbr[idx2].
1134
+ if (idx2<N_1) {
1135
+ min = mindist[idx2];
1136
+ for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j]) {
1137
+ f_single(&D_(idx2, j), D_(idx1, j) );
1138
+ if (D_(idx2, j) < min) {
1139
+ n_nghbr[idx2] = j;
1140
+ min = D_(idx2, j);
1141
+ }
1142
+ }
1143
+ nn_distances.update_leq(idx2, min);
1144
+ }
1145
+ break;
1146
+
1147
+ case METHOD_METR_COMPLETE:
1148
+ /*
1149
+ Complete linkage.
1150
+
1151
+ Characteristic: new distances are never shorter than the old distances.
1152
+ */
1153
+ // Update the distance matrix in the range [start, idx1).
1154
+ for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1155
+ f_complete(&D_(j, idx2), D_(j, idx1) );
1156
+ if (n_nghbr[j] == idx1)
1157
+ n_nghbr[j] = idx2;
1158
+ }
1159
+ // Update the distance matrix in the range (idx1, idx2).
1160
+ for (; j<idx2; j=active_nodes.succ[j])
1161
+ f_complete(&D_(j, idx2), D_(idx1, j) );
1162
+ // Update the distance matrix in the range (idx2, N).
1163
+ for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j])
1164
+ f_complete(&D_(idx2, j), D_(idx1, j) );
1165
+ break;
1166
+
1167
+ case METHOD_METR_AVERAGE: {
1168
+ /*
1169
+ Average linkage.
1170
+
1171
+ Shorter and longer distances can occur.
1172
+ */
1173
+ // Update the distance matrix in the range [start, idx1).
1174
+ t_float s = size1/(size1+size2);
1175
+ t_float t = size2/(size1+size2);
1176
+ for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1177
+ f_average(&D_(j, idx2), D_(j, idx1), s, t);
1178
+ if (n_nghbr[j] == idx1)
1179
+ n_nghbr[j] = idx2;
1180
+ }
1181
+ // Update the distance matrix in the range (idx1, idx2).
1182
+ for (; j<idx2; j=active_nodes.succ[j]) {
1183
+ f_average(&D_(j, idx2), D_(idx1, j), s, t);
1184
+ if (D_(j, idx2) < mindist[j]) {
1185
+ nn_distances.update_leq(j, D_(j, idx2));
1186
+ n_nghbr[j] = idx2;
1187
+ }
1188
+ }
1189
+ // Update the distance matrix in the range (idx2, N).
1190
+ if (idx2<N_1) {
1191
+ n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1192
+ f_average(&D_(idx2, j), D_(idx1, j), s, t);
1193
+ min = D_(idx2,j);
1194
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1195
+ f_average(&D_(idx2, j), D_(idx1, j), s, t);
1196
+ if (D_(idx2,j) < min) {
1197
+ min = D_(idx2,j);
1198
+ n_nghbr[idx2] = j;
1199
+ }
1200
+ }
1201
+ nn_distances.update(idx2, min);
1202
+ }
1203
+ break;
1204
+ }
1205
+
1206
+ case METHOD_METR_WEIGHTED:
1207
+ /*
1208
+ Weighted linkage.
1209
+
1210
+ Shorter and longer distances can occur.
1211
+ */
1212
+ // Update the distance matrix in the range [start, idx1).
1213
+ for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1214
+ f_weighted(&D_(j, idx2), D_(j, idx1) );
1215
+ if (n_nghbr[j] == idx1)
1216
+ n_nghbr[j] = idx2;
1217
+ }
1218
+ // Update the distance matrix in the range (idx1, idx2).
1219
+ for (; j<idx2; j=active_nodes.succ[j]) {
1220
+ f_weighted(&D_(j, idx2), D_(idx1, j) );
1221
+ if (D_(j, idx2) < mindist[j]) {
1222
+ nn_distances.update_leq(j, D_(j, idx2));
1223
+ n_nghbr[j] = idx2;
1224
+ }
1225
+ }
1226
+ // Update the distance matrix in the range (idx2, N).
1227
+ if (idx2<N_1) {
1228
+ n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1229
+ f_weighted(&D_(idx2, j), D_(idx1, j) );
1230
+ min = D_(idx2,j);
1231
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1232
+ f_weighted(&D_(idx2, j), D_(idx1, j) );
1233
+ if (D_(idx2,j) < min) {
1234
+ min = D_(idx2,j);
1235
+ n_nghbr[idx2] = j;
1236
+ }
1237
+ }
1238
+ nn_distances.update(idx2, min);
1239
+ }
1240
+ break;
1241
+
1242
+ case METHOD_METR_WARD:
1243
+ /*
1244
+ Ward linkage.
1245
+
1246
+ Shorter and longer distances can occur, not smaller than min(d1,d2)
1247
+ but maybe bigger than max(d1,d2).
1248
+ */
1249
+ // Update the distance matrix in the range [start, idx1).
1250
+ for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1251
+ f_ward(&D_(j, idx2), D_(j, idx1), mindist[idx1],
1252
+ size1, size2, static_cast<t_float>(members[j]) );
1253
+ if (n_nghbr[j] == idx1)
1254
+ n_nghbr[j] = idx2;
1255
+ }
1256
+ // Update the distance matrix in the range (idx1, idx2).
1257
+ for (; j<idx2; j=active_nodes.succ[j]) {
1258
+ f_ward(&D_(j, idx2), D_(idx1, j), mindist[idx1], size1, size2,
1259
+ static_cast<t_float>(members[j]) );
1260
+ if (D_(j, idx2) < mindist[j]) {
1261
+ nn_distances.update_leq(j, D_(j, idx2));
1262
+ n_nghbr[j] = idx2;
1263
+ }
1264
+ }
1265
+ // Update the distance matrix in the range (idx2, N).
1266
+ if (idx2<N_1) {
1267
+ n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1268
+ f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
1269
+ size1, size2, static_cast<t_float>(members[j]) );
1270
+ min = D_(idx2,j);
1271
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1272
+ f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
1273
+ size1, size2, static_cast<t_float>(members[j]) );
1274
+ if (D_(idx2,j) < min) {
1275
+ min = D_(idx2,j);
1276
+ n_nghbr[idx2] = j;
1277
+ }
1278
+ }
1279
+ nn_distances.update(idx2, min);
1280
+ }
1281
+ break;
1282
+
1283
+ case METHOD_METR_WARD_D2:
1284
+ /*
1285
+ Ward D2 linkage (with squared Euclidean distances).
1286
+
1287
+ Shorter and longer distances can occur, not smaller than min(d1,d2)
1288
+ but maybe bigger than max(d1,d2).
1289
+ Uses the same update formula as Ward, but with different post-processing.
1290
+ */
1291
+ // Update the distance matrix in the range [start, idx1).
1292
+ for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1293
+ f_ward(&D_(j, idx2), D_(j, idx1), mindist[idx1],
1294
+ size1, size2, static_cast<t_float>(members[j]) );
1295
+ if (n_nghbr[j] == idx1)
1296
+ n_nghbr[j] = idx2;
1297
+ }
1298
+ // Update the distance matrix in the range (idx1, idx2).
1299
+ for (; j<idx2; j=active_nodes.succ[j]) {
1300
+ f_ward(&D_(j, idx2), D_(idx1, j), mindist[idx1], size1, size2,
1301
+ static_cast<t_float>(members[j]) );
1302
+ if (D_(j, idx2) < mindist[j]) {
1303
+ nn_distances.update_leq(j, D_(j, idx2));
1304
+ n_nghbr[j] = idx2;
1305
+ }
1306
+ }
1307
+ // Update the distance matrix in the range (idx2, N).
1308
+ if (idx2<N_1) {
1309
+ n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1310
+ f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
1311
+ size1, size2, static_cast<t_float>(members[j]) );
1312
+ min = D_(idx2,j);
1313
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1314
+ f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
1315
+ size1, size2, static_cast<t_float>(members[j]) );
1316
+ if (D_(idx2,j) < min) {
1317
+ min = D_(idx2,j);
1318
+ n_nghbr[idx2] = j;
1319
+ }
1320
+ }
1321
+ nn_distances.update(idx2, min);
1322
+ }
1323
+ break;
1324
+
1325
+ case METHOD_METR_CENTROID: {
1326
+ /*
1327
+ Centroid linkage.
1328
+
1329
+ Shorter and longer distances can occur, not bigger than max(d1,d2)
1330
+ but maybe smaller than min(d1,d2).
1331
+ */
1332
+ // Update the distance matrix in the range [start, idx1).
1333
+ t_float s = size1/(size1+size2);
1334
+ t_float t = size2/(size1+size2);
1335
+ t_float stc = s*t*mindist[idx1];
1336
+ for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1337
+ f_centroid(&D_(j, idx2), D_(j, idx1), stc, s, t);
1338
+ if (D_(j, idx2) < mindist[j]) {
1339
+ nn_distances.update_leq(j, D_(j, idx2));
1340
+ n_nghbr[j] = idx2;
1341
+ }
1342
+ else if (n_nghbr[j] == idx1)
1343
+ n_nghbr[j] = idx2;
1344
+ }
1345
+ // Update the distance matrix in the range (idx1, idx2).
1346
+ for (; j<idx2; j=active_nodes.succ[j]) {
1347
+ f_centroid(&D_(j, idx2), D_(idx1, j), stc, s, t);
1348
+ if (D_(j, idx2) < mindist[j]) {
1349
+ nn_distances.update_leq(j, D_(j, idx2));
1350
+ n_nghbr[j] = idx2;
1351
+ }
1352
+ }
1353
+ // Update the distance matrix in the range (idx2, N).
1354
+ if (idx2<N_1) {
1355
+ n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1356
+ f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1357
+ min = D_(idx2,j);
1358
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1359
+ f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1360
+ if (D_(idx2,j) < min) {
1361
+ min = D_(idx2,j);
1362
+ n_nghbr[idx2] = j;
1363
+ }
1364
+ }
1365
+ nn_distances.update(idx2, min);
1366
+ }
1367
+ break;
1368
+ }
1369
+
1370
+ case METHOD_METR_MEDIAN: {
1371
+ /*
1372
+ Median linkage.
1373
+
1374
+ Shorter and longer distances can occur, not bigger than max(d1,d2)
1375
+ but maybe smaller than min(d1,d2).
1376
+ */
1377
+ // Update the distance matrix in the range [start, idx1).
1378
+ t_float c_4 = mindist[idx1]*.25;
1379
+ for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1380
+ f_median(&D_(j, idx2), D_(j, idx1), c_4 );
1381
+ if (D_(j, idx2) < mindist[j]) {
1382
+ nn_distances.update_leq(j, D_(j, idx2));
1383
+ n_nghbr[j] = idx2;
1384
+ }
1385
+ else if (n_nghbr[j] == idx1)
1386
+ n_nghbr[j] = idx2;
1387
+ }
1388
+ // Update the distance matrix in the range (idx1, idx2).
1389
+ for (; j<idx2; j=active_nodes.succ[j]) {
1390
+ f_median(&D_(j, idx2), D_(idx1, j), c_4 );
1391
+ if (D_(j, idx2) < mindist[j]) {
1392
+ nn_distances.update_leq(j, D_(j, idx2));
1393
+ n_nghbr[j] = idx2;
1394
+ }
1395
+ }
1396
+ // Update the distance matrix in the range (idx2, N).
1397
+ if (idx2<N_1) {
1398
+ n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1399
+ f_median(&D_(idx2, j), D_(idx1, j), c_4 );
1400
+ min = D_(idx2,j);
1401
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1402
+ f_median(&D_(idx2, j), D_(idx1, j), c_4 );
1403
+ if (D_(idx2,j) < min) {
1404
+ min = D_(idx2,j);
1405
+ n_nghbr[idx2] = j;
1406
+ }
1407
+ }
1408
+ nn_distances.update(idx2, min);
1409
+ }
1410
+ break;
1411
+ }
1412
+
1413
+ default:
1414
+ throw std::runtime_error(std::string("Invalid method."));
1415
+ }
1416
+ }
1417
+ #ifdef FE_INVALID
1418
+ if (fetestexcept(FE_INVALID)) throw fenv_error();
1419
+ #endif
1420
+ }
1421
+
1422
+ /*
1423
+ Clustering methods for vector data
1424
+ */
1425
+
1426
+ template <typename t_dissimilarity>
1427
+ static void MST_linkage_core_vector(const t_index N,
1428
+ t_dissimilarity & dist,
1429
+ cluster_result & Z2) {
1430
+ /*
1431
+ N: integer, number of data points
1432
+ dist: function pointer to the metric
1433
+ Z2: output data structure
1434
+
1435
+ The basis of this algorithm is an algorithm by Rohlf:
1436
+
1437
+ F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
1438
+ The Computer Journal, vol. 16, 1973, p. 93–95.
1439
+ */
1440
+ t_index i;
1441
+ t_index idx2;
1442
+ doubly_linked_list active_nodes(N);
1443
+ auto_array_ptr<t_float> d(N);
1444
+
1445
+ t_index prev_node;
1446
+ t_float min;
1447
+
1448
+ // first iteration
1449
+ idx2 = 1;
1450
+ min = std::numeric_limits<t_float>::infinity();
1451
+ for (i=1; i<N; ++i) {
1452
+ d[i] = dist(0,i);
1453
+ #if HAVE_DIAGNOSTIC
1454
+ #pragma GCC diagnostic push
1455
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
1456
+ #endif
1457
+ if (d[i] < min) {
1458
+ min = d[i];
1459
+ idx2 = i;
1460
+ }
1461
+ else if (fc_isnan(d[i]))
1462
+ throw (nan_error());
1463
+ #if HAVE_DIAGNOSTIC
1464
+ #pragma GCC diagnostic pop
1465
+ #endif
1466
+ }
1467
+
1468
+ Z2.append(0, idx2, min);
1469
+
1470
+ for (t_index j=1; j<N-1; ++j) {
1471
+ prev_node = idx2;
1472
+ active_nodes.remove(prev_node);
1473
+
1474
+ idx2 = active_nodes.succ[0];
1475
+ min = d[idx2];
1476
+
1477
+ for (i=idx2; i<N; i=active_nodes.succ[i]) {
1478
+ t_float tmp = dist(i, prev_node);
1479
+ #if HAVE_DIAGNOSTIC
1480
+ #pragma GCC diagnostic push
1481
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
1482
+ #endif
1483
+ if (d[i] > tmp)
1484
+ d[i] = tmp;
1485
+ else if (fc_isnan(tmp))
1486
+ throw (nan_error());
1487
+ #if HAVE_DIAGNOSTIC
1488
+ #pragma GCC diagnostic pop
1489
+ #endif
1490
+ if (d[i] < min) {
1491
+ min = d[i];
1492
+ idx2 = i;
1493
+ }
1494
+ }
1495
+ Z2.append(prev_node, idx2, min);
1496
+ }
1497
+ }
1498
+
1499
+ template <method_codes_vector method, typename t_dissimilarity>
1500
+ static void generic_linkage_vector(const t_index N,
1501
+ t_dissimilarity & dist,
1502
+ cluster_result & Z2) {
1503
+ /*
1504
+ N: integer, number of data points
1505
+ dist: function pointer to the metric
1506
+ Z2: output data structure
1507
+
1508
+ This algorithm is valid for the distance update methods
1509
+ "Ward", "centroid" and "median" only!
1510
+ */
1511
+ const t_index N_1 = N-1;
1512
+ t_index i, j; // loop variables
1513
+ t_index idx1, idx2; // row and column indices
1514
+
1515
+ auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
1516
+ auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
1517
+ auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
1518
+ // i-th row represents
1519
+ doubly_linked_list active_nodes(N);
1520
+ binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
1521
+ // the distance to the nearest neighbor of each point
1522
+ t_index node1, node2; // node numbers in the output
1523
+ t_float min; // minimum and row index for nearest-neighbor search
1524
+
1525
+ for (i=0; i<N; ++i)
1526
+ // Build a list of row ↔ node label assignments.
1527
+ // Initially i ↦ i
1528
+ row_repr[i] = i;
1529
+
1530
+ // Initialize the minimal distances:
1531
+ // Find the nearest neighbor of each point.
1532
+ // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1533
+ for (i=0; i<N_1; ++i) {
1534
+ min = std::numeric_limits<t_float>::infinity();
1535
+ t_index idx;
1536
+ for (idx=j=i+1; j<N; ++j) {
1537
+ t_float tmp;
1538
+ switch (method) {
1539
+ case METHOD_VECTOR_WARD:
1540
+ tmp = dist.ward_initial(i,j);
1541
+ break;
1542
+ default:
1543
+ tmp = dist.template sqeuclidean<true>(i,j);
1544
+ }
1545
+ if (tmp<min) {
1546
+ min = tmp;
1547
+ idx = j;
1548
+ }
1549
+ }
1550
+ switch (method) {
1551
+ case METHOD_VECTOR_WARD:
1552
+ mindist[i] = t_dissimilarity::ward_initial_conversion(min);
1553
+ break;
1554
+ default:
1555
+ mindist[i] = min;
1556
+ }
1557
+ n_nghbr[i] = idx;
1558
+ }
1559
+
1560
+ // Put the minimal distances into a heap structure to make the repeated
1561
+ // global minimum searches fast.
1562
+ nn_distances.heapify();
1563
+
1564
+ // Main loop: We have N-1 merging steps.
1565
+ for (i=0; i<N_1; ++i) {
1566
+ idx1 = nn_distances.argmin();
1567
+
1568
+ while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
1569
+ // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1570
+ n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1571
+ switch (method) {
1572
+ case METHOD_VECTOR_WARD:
1573
+ min = dist.ward(idx1,j);
1574
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1575
+ t_float const tmp = dist.ward(idx1,j);
1576
+ if (tmp<min) {
1577
+ min = tmp;
1578
+ n_nghbr[idx1] = j;
1579
+ }
1580
+ }
1581
+ break;
1582
+ default:
1583
+ min = dist.template sqeuclidean<true>(idx1,j);
1584
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1585
+ t_float const tmp = dist.template sqeuclidean<true>(idx1,j);
1586
+ if (tmp<min) {
1587
+ min = tmp;
1588
+ n_nghbr[idx1] = j;
1589
+ }
1590
+ }
1591
+ }
1592
+ /* Update the heap with the new true minimum and search for the (possibly
1593
+ different) minimal entry. */
1594
+ nn_distances.update_geq(idx1, min);
1595
+ idx1 = nn_distances.argmin();
1596
+ }
1597
+
1598
+ nn_distances.heap_pop(); // Remove the current minimum from the heap.
1599
+ idx2 = n_nghbr[idx1];
1600
+
1601
+ // Write the newly found minimal pair of nodes to the output array.
1602
+ node1 = row_repr[idx1];
1603
+ node2 = row_repr[idx2];
1604
+
1605
+ Z2.append(node1, node2, mindist[idx1]);
1606
+
1607
+ switch (method) {
1608
+ case METHOD_VECTOR_WARD:
1609
+ case METHOD_VECTOR_CENTROID:
1610
+ dist.merge_inplace(idx1, idx2);
1611
+ break;
1612
+ case METHOD_VECTOR_MEDIAN:
1613
+ dist.merge_inplace_weighted(idx1, idx2);
1614
+ break;
1615
+ default:
1616
+ throw std::runtime_error(std::string("Invalid method."));
1617
+ }
1618
+
1619
+ // Index idx2 now represents the new (merged) node with label N+i.
1620
+ row_repr[idx2] = N+i;
1621
+ // Remove idx1 from the list of active indices (active_nodes).
1622
+ active_nodes.remove(idx1); // TBD later!!!
1623
+
1624
+ // Update the distance matrix
1625
+ switch (method) {
1626
+ case METHOD_VECTOR_WARD:
1627
+ /*
1628
+ Ward linkage.
1629
+
1630
+ Shorter and longer distances can occur, not smaller than min(d1,d2)
1631
+ but maybe bigger than max(d1,d2).
1632
+ */
1633
+ // Update the distance matrix in the range [start, idx1).
1634
+ for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
1635
+ if (n_nghbr[j] == idx2) {
1636
+ n_nghbr[j] = idx1; // invalidate
1637
+ }
1638
+ }
1639
+ // Update the distance matrix in the range (idx1, idx2).
1640
+ for ( ; j<idx2; j=active_nodes.succ[j]) {
1641
+ t_float const tmp = dist.ward(j, idx2);
1642
+ if (tmp < mindist[j]) {
1643
+ nn_distances.update_leq(j, tmp);
1644
+ n_nghbr[j] = idx2;
1645
+ }
1646
+ else if (n_nghbr[j]==idx2) {
1647
+ n_nghbr[j] = idx1; // invalidate
1648
+ }
1649
+ }
1650
+ // Find the nearest neighbor for idx2.
1651
+ if (idx2<N_1) {
1652
+ n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1653
+ min = dist.ward(idx2,j);
1654
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1655
+ t_float const tmp = dist.ward(idx2,j);
1656
+ if (tmp < min) {
1657
+ min = tmp;
1658
+ n_nghbr[idx2] = j;
1659
+ }
1660
+ }
1661
+ nn_distances.update(idx2, min);
1662
+ }
1663
+ break;
1664
+
1665
+ default:
1666
+ /*
1667
+ Centroid and median linkage.
1668
+
1669
+ Shorter and longer distances can occur, not bigger than max(d1,d2)
1670
+ but maybe smaller than min(d1,d2).
1671
+ */
1672
+ for (j=active_nodes.start; j<idx2; j=active_nodes.succ[j]) {
1673
+ t_float const tmp = dist.template sqeuclidean<true>(j, idx2);
1674
+ if (tmp < mindist[j]) {
1675
+ nn_distances.update_leq(j, tmp);
1676
+ n_nghbr[j] = idx2;
1677
+ }
1678
+ else if (n_nghbr[j] == idx2)
1679
+ n_nghbr[j] = idx1; // invalidate
1680
+ }
1681
+ // Find the nearest neighbor for idx2.
1682
+ if (idx2<N_1) {
1683
+ n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1684
+ min = dist.template sqeuclidean<true>(idx2,j);
1685
+ for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
1686
+ t_float const tmp = dist.template sqeuclidean<true>(idx2, j);
1687
+ if (tmp < min) {
1688
+ min = tmp;
1689
+ n_nghbr[idx2] = j;
1690
+ }
1691
+ }
1692
+ nn_distances.update(idx2, min);
1693
+ }
1694
+ }
1695
+ }
1696
+ }
1697
+
1698
+ template <method_codes_vector method, typename t_dissimilarity>
1699
+ static void generic_linkage_vector_alternative(const t_index N,
1700
+ t_dissimilarity & dist,
1701
+ cluster_result & Z2) {
1702
+ /*
1703
+ N: integer, number of data points
1704
+ dist: function pointer to the metric
1705
+ Z2: output data structure
1706
+
1707
+ This algorithm is valid for the distance update methods
1708
+ "Ward", "centroid" and "median" only!
1709
+ */
1710
+ const t_index N_1 = N-1;
1711
+ t_index i, j=0; // loop variables
1712
+ t_index idx1, idx2; // row and column indices
1713
+
1714
+ auto_array_ptr<t_index> n_nghbr(2*N-2); // array of nearest neighbors
1715
+ auto_array_ptr<t_float> mindist(2*N-2); // distances to the nearest neighbors
1716
+
1717
+ doubly_linked_list active_nodes(N+N_1);
1718
+ binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap
1719
+ // structure for the distance to the nearest neighbor of each point
1720
+
1721
+ t_float min; // minimum for nearest-neighbor searches
1722
+
1723
+ // Initialize the minimal distances:
1724
+ // Find the nearest neighbor of each point.
1725
+ // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1726
+ for (i=1; i<N; ++i) {
1727
+ min = std::numeric_limits<t_float>::infinity();
1728
+ t_index idx;
1729
+ for (idx=j=0; j<i; ++j) {
1730
+ t_float tmp;
1731
+ switch (method) {
1732
+ case METHOD_VECTOR_WARD:
1733
+ tmp = dist.ward_initial(i,j);
1734
+ break;
1735
+ default:
1736
+ tmp = dist.template sqeuclidean<true>(i,j);
1737
+ }
1738
+ if (tmp<min) {
1739
+ min = tmp;
1740
+ idx = j;
1741
+ }
1742
+ }
1743
+ switch (method) {
1744
+ case METHOD_VECTOR_WARD:
1745
+ mindist[i] = t_dissimilarity::ward_initial_conversion(min);
1746
+ break;
1747
+ default:
1748
+ mindist[i] = min;
1749
+ }
1750
+ n_nghbr[i] = idx;
1751
+ }
1752
+
1753
+ // Put the minimal distances into a heap structure to make the repeated
1754
+ // global minimum searches fast.
1755
+ nn_distances.heapify();
1756
+
1757
+ // Main loop: We have N-1 merging steps.
1758
+ for (i=N; i<N+N_1; ++i) {
1759
+ /*
1760
+ The bookkeeping is different from the "stored matrix approach" algorithm
1761
+ generic_linkage.
1762
+
1763
+ mindist[i] stores a lower bound on the minimum distance of the point i to
1764
+ all points of *lower* index:
1765
+
1766
+ mindist[i] ≥ min_{j<i} D(i,j)
1767
+
1768
+ Moreover, new nodes do not re-use one of the old indices, but they are
1769
+ given a new, unique index (SciPy convention: initial nodes are 0,…,N−1,
1770
+ new nodes are N,…,2N−2).
1771
+
1772
+ Invalid nearest neighbors are not recognized by the fact that the stored
1773
+ distance is smaller than the actual distance, but the list active_nodes
1774
+ maintains a flag whether a node is inactive. If n_nghbr[i] points to an
1775
+ active node, the entries nn_distances[i] and n_nghbr[i] are valid,
1776
+ otherwise they must be recomputed.
1777
+ */
1778
+ idx1 = nn_distances.argmin();
1779
+ while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
1780
+ // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1781
+ n_nghbr[idx1] = j = active_nodes.start;
1782
+ switch (method) {
1783
+ case METHOD_VECTOR_WARD:
1784
+ min = dist.ward_extended(idx1,j);
1785
+ for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
1786
+ t_float tmp = dist.ward_extended(idx1,j);
1787
+ if (tmp<min) {
1788
+ min = tmp;
1789
+ n_nghbr[idx1] = j;
1790
+ }
1791
+ }
1792
+ break;
1793
+ default:
1794
+ min = dist.sqeuclidean_extended(idx1,j);
1795
+ for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
1796
+ t_float const tmp = dist.sqeuclidean_extended(idx1,j);
1797
+ if (tmp<min) {
1798
+ min = tmp;
1799
+ n_nghbr[idx1] = j;
1800
+ }
1801
+ }
1802
+ }
1803
+ /* Update the heap with the new true minimum and search for the (possibly
1804
+ different) minimal entry. */
1805
+ nn_distances.update_geq(idx1, min);
1806
+ idx1 = nn_distances.argmin();
1807
+ }
1808
+
1809
+ idx2 = n_nghbr[idx1];
1810
+ active_nodes.remove(idx1);
1811
+ active_nodes.remove(idx2);
1812
+
1813
+ Z2.append(idx1, idx2, mindist[idx1]);
1814
+
1815
+ if (i<2*N_1) {
1816
+ switch (method) {
1817
+ case METHOD_VECTOR_WARD:
1818
+ case METHOD_VECTOR_CENTROID:
1819
+ dist.merge(idx1, idx2, i);
1820
+ break;
1821
+
1822
+ case METHOD_VECTOR_MEDIAN:
1823
+ dist.merge_weighted(idx1, idx2, i);
1824
+ break;
1825
+
1826
+ default:
1827
+ throw std::runtime_error(std::string("Invalid method."));
1828
+ }
1829
+
1830
+ n_nghbr[i] = active_nodes.start;
1831
+ if (method==METHOD_VECTOR_WARD) {
1832
+ /*
1833
+ Ward linkage.
1834
+
1835
+ Shorter and longer distances can occur, not smaller than min(d1,d2)
1836
+ but maybe bigger than max(d1,d2).
1837
+ */
1838
+ min = dist.ward_extended(active_nodes.start, i);
1839
+ for (j=active_nodes.succ[active_nodes.start]; j<i;
1840
+ j=active_nodes.succ[j]) {
1841
+ t_float tmp = dist.ward_extended(j, i);
1842
+ if (tmp < min) {
1843
+ min = tmp;
1844
+ n_nghbr[i] = j;
1845
+ }
1846
+ }
1847
+ }
1848
+ else {
1849
+ /*
1850
+ Centroid and median linkage.
1851
+
1852
+ Shorter and longer distances can occur, not bigger than max(d1,d2)
1853
+ but maybe smaller than min(d1,d2).
1854
+ */
1855
+ min = dist.sqeuclidean_extended(active_nodes.start, i);
1856
+ for (j=active_nodes.succ[active_nodes.start]; j<i;
1857
+ j=active_nodes.succ[j]) {
1858
+ t_float tmp = dist.sqeuclidean_extended(j, i);
1859
+ if (tmp < min) {
1860
+ min = tmp;
1861
+ n_nghbr[i] = j;
1862
+ }
1863
+ }
1864
+ }
1865
+ if (idx2<active_nodes.start) {
1866
+ nn_distances.remove(active_nodes.start);
1867
+ } else {
1868
+ nn_distances.remove(idx2);
1869
+ }
1870
+ nn_distances.replace(idx1, i, min);
1871
+ }
1872
+ }
1873
+ }
1874
+
1875
+ #if HAVE_VISIBILITY
1876
+ #pragma GCC visibility pop
1877
+ #endif