sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1264 @@
1
+ /*
2
+ fastcluster: Fast hierarchical clustering routines for R and Python
3
+
4
+ Copyright:
5
+ * Until package version 1.1.23: © 2011 Daniel Müllner <https://danifold.net>
6
+ * All changes from version 1.1.24 on: © Google Inc. <https://www.google.com>
7
+ */
8
+
9
+ // for INT32_MAX in fastcluster.cpp
10
+ // This must be defined here since Python.h loads the header file pyport.h,
11
+ // and from this stdint.h. INT32_MAX is defined in stdint.h, but only if
12
+ // __STDC_LIMIT_MACROS is defined.
13
+ #define __STDC_LIMIT_MACROS
14
+
15
+ #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
16
+
17
+ #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
18
+ #define HAVE_DIAGNOSTIC 1
19
+ #endif
20
+
21
+ #if HAVE_DIAGNOSTIC
22
+ #pragma GCC diagnostic push
23
+ #pragma GCC diagnostic ignored "-Wswitch-default"
24
+ #pragma GCC diagnostic ignored "-Wpadded"
25
+ #pragma GCC diagnostic ignored "-Wlong-long"
26
+ #pragma GCC diagnostic ignored "-Wformat"
27
+ #endif
28
+ #include <Python.h>
29
+ #if HAVE_DIAGNOSTIC
30
+ #pragma GCC diagnostic pop
31
+ #endif
32
+ #if HAVE_DIAGNOSTIC
33
+ #pragma GCC diagnostic push
34
+ #pragma GCC diagnostic ignored "-Wlong-long"
35
+ #pragma GCC diagnostic ignored "-Wpedantic"
36
+ #pragma GCC diagnostic ignored "-Wpadded"
37
+ #pragma GCC diagnostic ignored "-Wcast-qual"
38
+ #endif
39
+ #include <numpy/arrayobject.h>
40
+ #if HAVE_DIAGNOSTIC
41
+ #pragma GCC diagnostic pop
42
+ #endif
43
+
44
+ /* It's complicated, but if I do not include the C++ math headers, GCC
45
+ will complain about conversions from 'double' to 'float', whenever 'isnan'
46
+ is called in a templated function (but not outside templates).
47
+
48
+ The '#include <cmath>' seems to cure the problem.
49
+ */
50
+ //#include <cmath>
51
+ #define fc_isnan(X) ((X)!=(X))
52
+
53
+ // There is Py_IS_NAN but it is so much slower on my x86_64 system with GCC!
54
+
55
+ #include <cmath> // for std::abs, std::pow, std::sqrt
56
+ #include <cstddef> // for std::ptrdiff_t
57
+ #include <limits> // for std::numeric_limits<...>::infinity()
58
+ #include <algorithm> // for std::stable_sort
59
+ #include <new> // for std::bad_alloc
60
+ #include <exception> // for std::exception
61
+
62
+ #include "fastcluster.cpp"
63
+
64
+ // backwards compatibility
65
+ #ifndef NPY_ARRAY_CARRAY_RO
66
+ #define NPY_ARRAY_CARRAY_RO NPY_CARRAY_RO
67
+ #endif
68
+
69
+ /* Since the public interface is given by the Python or the R interface,
70
+ * we do not want other symbols than the interface initialization routines to be
71
+ * visible in the shared object file. The "visibility" switch is a GCC concept.
72
+ * Hiding symbols keeps the relocation table small and decreases startup time.
73
+ * See http://gcc.gnu.org/wiki/Visibility
74
+ */
75
+ #if HAVE_VISIBILITY
76
+ #pragma GCC visibility push(hidden)
77
+ #endif
78
+
79
+ /*
80
+ Convenience class for the output array: automatic counter.
81
+ */
82
+ class linkage_output {
83
+ private:
84
+ t_float * Z;
85
+
86
+ public:
87
+ linkage_output(t_float * const Z_)
88
+ : Z(Z_)
89
+ {}
90
+
91
+ void append(const t_index node1, const t_index node2, const t_float dist,
92
+ const t_float size) {
93
+ if (node1<node2) {
94
+ *(Z++) = static_cast<t_float>(node1);
95
+ *(Z++) = static_cast<t_float>(node2);
96
+ }
97
+ else {
98
+ *(Z++) = static_cast<t_float>(node2);
99
+ *(Z++) = static_cast<t_float>(node1);
100
+ }
101
+ *(Z++) = dist;
102
+ *(Z++) = size;
103
+ }
104
+ };
105
+
106
+ /*
107
+ Generate the SciPy-specific output format for a dendrogram from the
108
+ clustering output.
109
+
110
+ The list of merging steps can be sorted or unsorted.
111
+ */
112
+ // The size of a node is either 1 (a single point) or is looked up from
113
+ // one of the clusters.
114
// Size lookup for node r_: an index below N denotes a single point (size 1);
// any other index reads column 3 of row (r_ - N) through the Z_ accessor.
// Both N and Z_ must be visible where the macro is expanded. Every use of the
// argument is parenthesized so that compound expressions expand correctly.
#define size_(r_) ( (((r_)<N) ? 1 : Z_((r_)-N,3)) )
115
+
116
+ template <const bool sorted>
117
+ static void generate_SciPy_dendrogram(t_float * const Z, cluster_result & Z2, const t_index N) {
118
+ // The array "nodes" is a union-find data structure for the cluster
119
+ // identities (only needed for unsorted cluster_result input).
120
+ union_find nodes(sorted ? 0 : N);
121
+ if (!sorted) {
122
+ std::stable_sort(Z2[0], Z2[N-1]);
123
+ }
124
+
125
+ linkage_output output(Z);
126
+ t_index node1, node2;
127
+
128
+ for (node const * NN=Z2[0]; NN!=Z2[N-1]; ++NN) {
129
+ // Get two data points whose clusters are merged in step i.
130
+ if (sorted) {
131
+ node1 = NN->node1;
132
+ node2 = NN->node2;
133
+ }
134
+ else {
135
+ // Find the cluster identifiers for these points.
136
+ node1 = nodes.Find(NN->node1);
137
+ node2 = nodes.Find(NN->node2);
138
+ // Merge the nodes in the union-find data structure by making them
139
+ // children of a new node.
140
+ nodes.Union(node1, node2);
141
+ }
142
+ output.append(node1, node2, NN->dist, size_(node1)+size_(node2));
143
+ }
144
+ }
145
+
146
+ /*
147
+ Python interface code
148
+ */
149
+ static PyObject * linkage_wrap(PyObject * const self, PyObject * const args);
150
+ static PyObject * linkage_vector_wrap(PyObject * const self, PyObject * const args);
151
+
152
+ // List the C++ methods that this extension provides.
153
+ static PyMethodDef _fastclusterWrapMethods[] = {
154
+ {"linkage_wrap", linkage_wrap, METH_VARARGS, NULL},
155
+ {"linkage_vector_wrap", linkage_vector_wrap, METH_VARARGS, NULL},
156
+ {NULL, NULL, 0, NULL} /* Sentinel - marks the end of this structure */
157
+ };
158
+
159
+ /* Tell Python about these methods.
160
+
161
+ Python 2.x and 3.x differ in their C APIs for this part.
162
+ */
163
+ #if PY_VERSION_HEX >= 0x03000000
164
+
165
+ static struct PyModuleDef fastclustermodule = {
166
+ PyModuleDef_HEAD_INIT,
167
+ "_sequenzo_fastcluster",
168
+ NULL, // no module documentation
169
+ -1, /* size of per-interpreter state of the module,
170
+ or -1 if the module keeps state in global variables. */
171
+ _fastclusterWrapMethods,
172
+ NULL, NULL, NULL, NULL
173
+ };
174
+
175
+ /* Make the interface initialization routines visible in the shared object
176
+ * file.
177
+ */
178
+ #if HAVE_VISIBILITY
179
+ #pragma GCC visibility push(default)
180
+ #endif
181
+
182
+ PyMODINIT_FUNC PyInit__sequenzo_fastcluster(void) {
183
+ PyObject * m;
184
+ m = PyModule_Create(&fastclustermodule);
185
+ if (!m) {
186
+ return NULL;
187
+ }
188
+ import_array(); // Must be present for NumPy. Called first after above line.
189
+ return m;
190
+ }
191
+
192
+ #if HAVE_VISIBILITY
193
+ #pragma GCC visibility pop
194
+ #endif
195
+
196
+ # else // Python 2.x
197
+
198
+ #if HAVE_VISIBILITY
199
+ #pragma GCC visibility push(default)
200
+ #endif
201
+
202
+ PyMODINIT_FUNC init_sequenzo_fastcluster(void) {
203
+ (void) Py_InitModule("_sequenzo_fastcluster", _fastclusterWrapMethods);
204
+ import_array(); // Must be present for NumPy. Called first after above line.
205
+ }
206
+
207
+ #if HAVE_VISIBILITY
208
+ #pragma GCC visibility pop
209
+ #endif
210
+
211
+ #endif // PY_VERSION
212
+
213
+ class GIL_release
214
+ {
215
+ private:
216
+ // noncopyable
217
+ GIL_release(GIL_release const &);
218
+ GIL_release & operator=(GIL_release const &);
219
+ public:
220
+ inline
221
+ GIL_release(bool really = true)
222
+ : _save(really ? PyEval_SaveThread() : NULL)
223
+ {
224
+ }
225
+
226
+ inline
227
+ ~GIL_release()
228
+ {
229
+ if (_save)
230
+ PyEval_RestoreThread(_save);
231
+ }
232
+
233
+ private:
234
+ PyThreadState * _save;
235
+ };
236
+
237
+ /*
238
+ Interface to Python, part 1:
239
+ The input is a dissimilarity matrix.
240
+ */
241
+
242
+ static PyObject *linkage_wrap(PyObject * const, PyObject * const args) {
243
+ PyArrayObject * D, * Z;
244
+ long int N_ = 0;
245
+ unsigned char method;
246
+
247
+ try{
248
+ #if HAVE_DIAGNOSTIC
249
+ #pragma GCC diagnostic push
250
+ #pragma GCC diagnostic ignored "-Wold-style-cast"
251
+ #endif
252
+ // Parse the input arguments
253
+ if (!PyArg_ParseTuple(args, "lO!O!b",
254
+ &N_, // signed long integer
255
+ &PyArray_Type, &D, // NumPy array
256
+ &PyArray_Type, &Z, // NumPy array
257
+ &method)) { // unsigned char
258
+ return NULL; // Error if the arguments have the wrong type.
259
+ }
260
+ #if HAVE_DIAGNOSTIC
261
+ #pragma GCC diagnostic pop
262
+ #endif
263
+ if (N_ < 1 ) {
264
+ // N must be at least 1.
265
+ PyErr_SetString(PyExc_ValueError,
266
+ "At least one element is needed for clustering.");
267
+ return NULL;
268
+ }
269
+
270
+ /*
271
+ (1)
272
+ The biggest index used below is 4*(N-2)+3, as an index to Z. This must
273
+ fit into the data type used for indices.
274
+ (2)
275
+ The largest representable integer, without loss of precision, by a
276
+ floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we
277
+ make sure that all cluster labels from 0 to 2N-2 in the output can be
278
+ accurately represented by a floating point number.
279
+
280
+ Conversion of N to 64 bits below is not really necessary but it prevents
281
+ a warning ("shift count >= width of type") on systems where "long int"
282
+ is 32 bits wide.
283
+ */
284
+ if (N_ > MAX_INDEX/4 ||
285
+ static_cast<int64_t>(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) {
286
+ PyErr_SetString(PyExc_ValueError,
287
+ "Data is too big, index overflow.");
288
+ return NULL;
289
+ }
290
+ t_index N = static_cast<t_index>(N_);
291
+
292
+ // Allow threads!
293
+ GIL_release G;
294
+
295
+ t_float * const D_ = reinterpret_cast<t_float *>(PyArray_DATA(D));
296
+ cluster_result Z2(N-1);
297
+ auto_array_ptr<t_index> members;
298
+ // For these methods, the distance update formula needs the number of
299
+ // data points in a cluster.
300
+ if (method==METHOD_METR_AVERAGE ||
301
+ method==METHOD_METR_WARD ||
302
+ method==METHOD_METR_WARD_D2 ||
303
+ method==METHOD_METR_CENTROID) {
304
+ members.init(N, 1);
305
+ }
306
+ // Operate on squared distances for these methods.
307
+ if (method==METHOD_METR_WARD ||
308
+ method==METHOD_METR_WARD_D2 ||
309
+ method==METHOD_METR_CENTROID ||
310
+ method==METHOD_METR_MEDIAN) {
311
+ for (t_float * DD = D_; DD!=D_+static_cast<std::ptrdiff_t>(N)*(N-1)/2;
312
+ ++DD)
313
+ *DD *= *DD;
314
+ }
315
+
316
+ switch (method) {
317
+ case METHOD_METR_SINGLE:
318
+ MST_linkage_core(N, D_, Z2);
319
+ break;
320
+ case METHOD_METR_COMPLETE:
321
+ NN_chain_core<METHOD_METR_COMPLETE, t_index>(N, D_, NULL, Z2);
322
+ break;
323
+ case METHOD_METR_AVERAGE:
324
+ NN_chain_core<METHOD_METR_AVERAGE, t_index>(N, D_, members, Z2);
325
+ break;
326
+ case METHOD_METR_WEIGHTED:
327
+ NN_chain_core<METHOD_METR_WEIGHTED, t_index>(N, D_, NULL, Z2);
328
+ break;
329
+ case METHOD_METR_WARD:
330
+ NN_chain_core<METHOD_METR_WARD, t_index>(N, D_, members, Z2);
331
+ break;
332
+ case METHOD_METR_WARD_D2:
333
+ NN_chain_core<METHOD_METR_WARD_D2, t_index>(N, D_, members, Z2);
334
+ break;
335
+ case METHOD_METR_CENTROID:
336
+ generic_linkage<METHOD_METR_CENTROID, t_index>(N, D_, members, Z2);
337
+ break;
338
+ case METHOD_METR_MEDIAN:
339
+ generic_linkage<METHOD_METR_MEDIAN, t_index>(N, D_, NULL, Z2);
340
+ break;
341
+ default:
342
+ throw std::runtime_error(std::string("Invalid method index."));
343
+ }
344
+
345
+ if (method==METHOD_METR_WARD_D2 ||
346
+ method==METHOD_METR_CENTROID ||
347
+ method==METHOD_METR_MEDIAN) {
348
+ Z2.sqrt();
349
+ }
350
+
351
+ t_float * const Z_ = reinterpret_cast<t_float *>(PyArray_DATA(Z));
352
+ if (method==METHOD_METR_CENTROID ||
353
+ method==METHOD_METR_MEDIAN) {
354
+ generate_SciPy_dendrogram<true>(Z_, Z2, N);
355
+ }
356
+ else {
357
+ generate_SciPy_dendrogram<false>(Z_, Z2, N);
358
+ }
359
+ } // try
360
+ catch (const std::bad_alloc&) {
361
+ return PyErr_NoMemory();
362
+ }
363
+ catch(const std::exception& e){
364
+ PyErr_SetString(PyExc_EnvironmentError, e.what());
365
+ return NULL;
366
+ }
367
+ catch(const nan_error&){
368
+ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value.");
369
+ return NULL;
370
+ }
371
+ #ifdef FE_INVALID
372
+ catch(const fenv_error&){
373
+ PyErr_SetString(PyExc_FloatingPointError,
374
+ "NaN dissimilarity value in intermediate results.");
375
+ return NULL;
376
+ }
377
+ #endif
378
+ catch(...){
379
+ PyErr_SetString(PyExc_EnvironmentError,
380
+ "C++ exception (unknown reason). Please send a bug report.");
381
+ return NULL;
382
+ }
383
+ #if HAVE_DIAGNOSTIC
384
+ #pragma GCC diagnostic push
385
+ #pragma GCC diagnostic ignored "-Wold-style-cast"
386
+ #endif
387
+ Py_RETURN_NONE;
388
+ #if HAVE_DIAGNOSTIC
389
+ #pragma GCC diagnostic pop
390
+ #endif
391
+ }
392
+
393
+ /*
394
+ Part 2: Clustering on vector data
395
+ */
396
+
397
/*
  Numeric codes for the supported metrics.

  These codes must agree with the dictionary mtridx in fastcluster.py.
*/
enum metric_codes {
  // metrics
  METRIC_EUCLIDEAN      =  0,
  METRIC_MINKOWSKI      =  1,
  METRIC_CITYBLOCK      =  2,
  METRIC_SEUCLIDEAN     =  3,
  METRIC_SQEUCLIDEAN    =  4,
  METRIC_COSINE         =  5,
  METRIC_HAMMING        =  6,
  METRIC_JACCARD        =  7,
  METRIC_CHEBYCHEV      =  8,
  METRIC_CANBERRA       =  9,
  METRIC_BRAYCURTIS     = 10,
  METRIC_MAHALANOBIS    = 11,
  METRIC_YULE           = 12,
  METRIC_MATCHING       = 13,
  METRIC_DICE           = 14,
  METRIC_ROGERSTANIMOTO = 15,
  METRIC_RUSSELLRAO     = 16,
  METRIC_SOKALSNEATH    = 17,
  METRIC_KULSINSKI      = 18,
  METRIC_USER           = 19,
  // sentinel
  METRIC_INVALID        = 20,
  // separate function for the Jaccard metric on Boolean input data
  METRIC_JACCARD_BOOL   = 21
};
426
+
427
/*
  Empty tag type used as a C++ exception.

  Throw this if calling the Python interpreter from within C returned an
  error. The object itself carries no information.
*/
class pythonerror {
};
432
+
433
+ /*
434
+ This class handles all the information about the dissimilarity
435
+ computation.
436
+ */
437
+
438
+ class python_dissimilarity {
439
+ private:
440
+ t_float * Xa;
441
+ std::ptrdiff_t dim; // size_t saves many statis_cast<> in products
442
+ t_index N;
443
+ auto_array_ptr<t_float> Xnew;
444
+ t_index * members;
445
+ void (cluster_result::*postprocessfn) (const t_float) const;
446
+ t_float postprocessarg;
447
+
448
+ t_float (python_dissimilarity::*distfn) (const t_index, const t_index) const;
449
+
450
+ // for user-defined metrics
451
+ PyObject * X_Python;
452
+ PyObject * userfn;
453
+
454
+ auto_array_ptr<t_float> precomputed;
455
+ t_float * precomputed2;
456
+
457
+ PyArrayObject * V;
458
+ const t_float * V_data;
459
+
460
+ // noncopyable
461
+ python_dissimilarity();
462
+ python_dissimilarity(python_dissimilarity const &);
463
+ python_dissimilarity & operator=(python_dissimilarity const &);
464
+
465
+ public:
466
+ // Ignore warning about uninitialized member variables. I know what I am
467
+ // doing here, and some member variables are only used for certain metrics.
468
+ #if HAVE_DIAGNOSTIC
469
+ #pragma GCC diagnostic push
470
+ #pragma GCC diagnostic ignored "-Weffc++"
471
+ #endif
472
+ python_dissimilarity (PyArrayObject * const Xarg,
473
+ t_index * const members_,
474
+ const method_codes method,
475
+ const metric_codes metric,
476
+ PyObject * const extraarg,
477
+ bool temp_point_array)
478
+ : Xa(reinterpret_cast<t_float *>(PyArray_DATA(Xarg))),
479
+ dim(PyArray_DIM(Xarg, 1)),
480
+ N(static_cast<t_index>(PyArray_DIM(Xarg, 0))),
481
+ Xnew(temp_point_array ? (N-1)*dim : 0),
482
+ members(members_),
483
+ postprocessfn(NULL),
484
+ V(NULL)
485
+ {
486
+ switch (method) {
487
+ case METHOD_METR_SINGLE:
488
+ postprocessfn = NULL; // default
489
+ switch (metric) {
490
+ case METRIC_EUCLIDEAN:
491
+ set_euclidean();
492
+ break;
493
+ case METRIC_SEUCLIDEAN:
494
+ if (extraarg==NULL) {
495
+ PyErr_SetString(PyExc_TypeError,
496
+ "The 'seuclidean' metric needs a variance parameter.");
497
+ throw pythonerror();
498
+ }
499
+ #if HAVE_DIAGNOSTIC
500
+ #pragma GCC diagnostic push
501
+ #pragma GCC diagnostic ignored "-Wold-style-cast"
502
+ #endif
503
+ V = reinterpret_cast<PyArrayObject *>(PyArray_FromAny(extraarg,
504
+ PyArray_DescrFromType(NPY_DOUBLE),
505
+ 1, 1,
506
+ NPY_ARRAY_CARRAY_RO,
507
+ NULL));
508
+ #if HAVE_DIAGNOSTIC
509
+ #pragma GCC diagnostic pop
510
+ #endif
511
+ if (PyErr_Occurred()) {
512
+ throw pythonerror();
513
+ }
514
+ if (PyArray_DIM(V, 0)!=dim) {
515
+ PyErr_SetString(PyExc_ValueError,
516
+ "The variance vector must have the same dimensionality as the data.");
517
+ throw pythonerror();
518
+ }
519
+ V_data = reinterpret_cast<t_float *>(PyArray_DATA(V));
520
+ distfn = &python_dissimilarity::seuclidean;
521
+ postprocessfn = &cluster_result::sqrt;
522
+ break;
523
+ case METRIC_SQEUCLIDEAN:
524
+ distfn = &python_dissimilarity::sqeuclidean<false>;
525
+ break;
526
+ case METRIC_CITYBLOCK:
527
+ set_cityblock();
528
+ break;
529
+ case METRIC_CHEBYCHEV:
530
+ set_chebychev();
531
+ break;
532
+ case METRIC_MINKOWSKI:
533
+ set_minkowski(extraarg);
534
+ break;
535
+ case METRIC_COSINE:
536
+ distfn = &python_dissimilarity::cosine;
537
+ postprocessfn = &cluster_result::plusone;
538
+ // precompute norms
539
+ precomputed.init(N);
540
+ for (t_index i=0; i<N; ++i) {
541
+ t_float sum=0;
542
+ for (t_index k=0; k<dim; ++k) {
543
+ sum += X(i,k)*X(i,k);
544
+ }
545
+ precomputed[i] = 1/std::sqrt(sum);
546
+ }
547
+ break;
548
+ case METRIC_HAMMING:
549
+ distfn = &python_dissimilarity::hamming;
550
+ postprocessfn = &cluster_result::divide;
551
+ postprocessarg = static_cast<t_float>(dim);
552
+ break;
553
+ case METRIC_JACCARD:
554
+ distfn = &python_dissimilarity::jaccard;
555
+ break;
556
+ case METRIC_CANBERRA:
557
+ distfn = &python_dissimilarity::canberra;
558
+ break;
559
+ case METRIC_BRAYCURTIS:
560
+ distfn = &python_dissimilarity::braycurtis;
561
+ break;
562
+ case METRIC_MAHALANOBIS:
563
+ if (extraarg==NULL) {
564
+ PyErr_SetString(PyExc_TypeError,
565
+ "The 'mahalanobis' metric needs a parameter for the inverse covariance.");
566
+ throw pythonerror();
567
+ }
568
+ #if HAVE_DIAGNOSTIC
569
+ #pragma GCC diagnostic push
570
+ #pragma GCC diagnostic ignored "-Wold-style-cast"
571
+ #endif
572
+ V = reinterpret_cast<PyArrayObject *>(PyArray_FromAny(extraarg,
573
+ PyArray_DescrFromType(NPY_DOUBLE),
574
+ 2, 2,
575
+ NPY_ARRAY_CARRAY_RO,
576
+ NULL));
577
+ #if HAVE_DIAGNOSTIC
578
+ #pragma GCC diagnostic pop
579
+ #endif
580
+ if (PyErr_Occurred()) {
581
+ throw pythonerror();
582
+ }
583
+ if (PyArray_DIM(V, 0)!=N || PyArray_DIM(V, 1)!=dim) {
584
+ PyErr_SetString(PyExc_ValueError,
585
+ "The inverse covariance matrix has the wrong size.");
586
+ throw pythonerror();
587
+ }
588
+ V_data = reinterpret_cast<t_float *>(PyArray_DATA(V));
589
+ distfn = &python_dissimilarity::mahalanobis;
590
+ postprocessfn = &cluster_result::sqrt;
591
+ break;
592
+ case METRIC_YULE:
593
+ distfn = &python_dissimilarity::yule;
594
+ break;
595
+ case METRIC_MATCHING:
596
+ distfn = &python_dissimilarity::matching;
597
+ postprocessfn = &cluster_result::divide;
598
+ postprocessarg = static_cast<t_float>(dim);
599
+ break;
600
+ case METRIC_DICE:
601
+ distfn = &python_dissimilarity::dice;
602
+ break;
603
+ case METRIC_ROGERSTANIMOTO:
604
+ distfn = &python_dissimilarity::rogerstanimoto;
605
+ break;
606
+ case METRIC_RUSSELLRAO:
607
+ distfn = &python_dissimilarity::russellrao;
608
+ postprocessfn = &cluster_result::divide;
609
+ postprocessarg = static_cast<t_float>(dim);
610
+ break;
611
+ case METRIC_SOKALSNEATH:
612
+ distfn = &python_dissimilarity::sokalsneath;
613
+ break;
614
+ case METRIC_KULSINSKI:
615
+ distfn = &python_dissimilarity::kulsinski;
616
+ postprocessfn = &cluster_result::plusone;
617
+ precomputed.init(N);
618
+ for (t_index i=0; i<N; ++i) {
619
+ t_index sum=0;
620
+ for (t_index k=0; k<dim; ++k) {
621
+ sum += Xb(i,k);
622
+ }
623
+ precomputed[i] = -.5/static_cast<t_float>(sum);
624
+ }
625
+ break;
626
+ case METRIC_USER:
627
+ X_Python = reinterpret_cast<PyObject *>(Xarg);
628
+ this->userfn = extraarg;
629
+ distfn = &python_dissimilarity::user;
630
+ break;
631
+ default: // case METRIC_JACCARD_BOOL:
632
+ distfn = &python_dissimilarity::jaccard_bool;
633
+ }
634
+ break;
635
+
636
+ case METHOD_METR_WARD:
637
+ postprocessfn = &cluster_result::sqrtward;
638
+ break;
639
+
640
+ case METHOD_METR_WARD_D2:
641
+ postprocessfn = &cluster_result::sqrtdouble;
642
+ break;
643
+
644
+ default:
645
+ postprocessfn = &cluster_result::sqrt;
646
+ }
647
+ }
648
+ #if HAVE_DIAGNOSTIC
649
+ #pragma GCC diagnostic pop
650
+ #endif
651
+
652
+ ~python_dissimilarity() {
653
+ #if HAVE_DIAGNOSTIC
654
+ #pragma GCC diagnostic push
655
+ #pragma GCC diagnostic ignored "-Wold-style-cast"
656
+ #endif
657
+ Py_XDECREF(V);
658
+ #if HAVE_DIAGNOSTIC
659
+ #pragma GCC diagnostic pop
660
+ #endif
661
+ }
662
+
663
+ inline t_float operator () (const t_index i, const t_index j) const {
664
+ return (this->*distfn)(i,j);
665
+ }
666
+
667
+ inline t_float X (const t_index i, const t_index j) const {
668
+ return Xa[i*dim+j];
669
+ }
670
+
671
+ inline bool Xb (const t_index i, const t_index j) const {
672
+ return reinterpret_cast<bool *>(Xa)[i*dim+j];
673
+ }
674
+
675
+ inline t_float * Xptr(const t_index i, const t_index j) const {
676
+ return Xa+i*dim+j;
677
+ }
678
+
679
+ void merge(const t_index i, const t_index j, const t_index newnode) const {
680
+ t_float const * const Pi = i<N ? Xa+i*dim : Xnew+(i-N)*dim;
681
+ t_float const * const Pj = j<N ? Xa+j*dim : Xnew+(j-N)*dim;
682
+ for(t_index k=0; k<dim; ++k) {
683
+ Xnew[(newnode-N)*dim+k] = (Pi[k]*static_cast<t_float>(members[i]) +
684
+ Pj[k]*static_cast<t_float>(members[j])) /
685
+ static_cast<t_float>(members[i]+members[j]);
686
+ }
687
+ members[newnode] = members[i]+members[j];
688
+ }
689
+
690
+ void merge_weighted(const t_index i, const t_index j, const t_index newnode)
691
+ const {
692
+ t_float const * const Pi = i<N ? Xa+i*dim : Xnew+(i-N)*dim;
693
+ t_float const * const Pj = j<N ? Xa+j*dim : Xnew+(j-N)*dim;
694
+ for(t_index k=0; k<dim; ++k) {
695
+ Xnew[(newnode-N)*dim+k] = (Pi[k]+Pj[k])*.5;
696
+ }
697
+ }
698
+
699
+ void merge_inplace(const t_index i, const t_index j) const {
700
+ t_float const * const Pi = Xa+i*dim;
701
+ t_float * const Pj = Xa+j*dim;
702
+ for(t_index k=0; k<dim; ++k) {
703
+ Pj[k] = (Pi[k]*static_cast<t_float>(members[i]) +
704
+ Pj[k]*static_cast<t_float>(members[j])) /
705
+ static_cast<t_float>(members[i]+members[j]);
706
+ }
707
+ members[j] += members[i];
708
+ }
709
+
710
+ void merge_inplace_weighted(const t_index i, const t_index j) const {
711
+ t_float const * const Pi = Xa+i*dim;
712
+ t_float * const Pj = Xa+j*dim;
713
+ for(t_index k=0; k<dim; ++k) {
714
+ Pj[k] = (Pi[k]+Pj[k])*.5;
715
+ }
716
+ }
717
+
718
+ void postprocess(cluster_result & Z2) const {
719
+ if (postprocessfn!=NULL) {
720
+ (Z2.*postprocessfn)(postprocessarg);
721
+ }
722
+ }
723
+
724
+ inline t_float ward(const t_index i, const t_index j) const {
725
+ t_float mi = static_cast<t_float>(members[i]);
726
+ t_float mj = static_cast<t_float>(members[j]);
727
+ return sqeuclidean<true>(i,j)*mi*mj/(mi+mj);
728
+ }
729
+
730
+ inline t_float ward_initial(const t_index i, const t_index j) const {
731
+ // alias for sqeuclidean
732
+ // Factor 2!!!
733
+ return sqeuclidean<true>(i,j);
734
+ }
735
+
736
+ // This method must not produce NaN if the input is non-NaN.
737
+ inline static t_float ward_initial_conversion(const t_float min) {
738
+ return min*.5;
739
+ }
740
+
741
+ inline t_float ward_extended(const t_index i, const t_index j) const {
742
+ t_float mi = static_cast<t_float>(members[i]);
743
+ t_float mj = static_cast<t_float>(members[j]);
744
+ return sqeuclidean_extended(i,j)*mi*mj/(mi+mj);
745
+ }
746
+
747
+ /* We need two variants of the Euclidean metric: one that does not check
748
+ for a NaN result, which is used for the initial distances, and one which
749
+ does, for the updated distances during the clustering procedure.
750
+ */
751
+ template <const bool check_NaN>
752
+ t_float sqeuclidean(const t_index i, const t_index j) const {
753
+ t_float sum = 0;
754
+ /*
755
+ for (t_index k=0; k<dim; ++k) {
756
+ t_float diff = X(i,k) - X(j,k);
757
+ sum += diff*diff;
758
+ }
759
+ */
760
+ // faster
761
+ t_float const * Pi = Xa+i*dim;
762
+ t_float const * Pj = Xa+j*dim;
763
+ for (t_index k=0; k<dim; ++k) {
764
+ t_float diff = Pi[k] - Pj[k];
765
+ sum += diff*diff;
766
+ }
767
+ if (check_NaN) {
768
+ #if HAVE_DIAGNOSTIC
769
+ #pragma GCC diagnostic push
770
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
771
+ #endif
772
+ if (fc_isnan(sum))
773
+ #if HAVE_DIAGNOSTIC
774
+ #pragma GCC diagnostic pop
775
+ #endif
776
+ throw(nan_error());
777
+ }
778
+ return sum;
779
+ }
780
+
781
+ t_float sqeuclidean_extended(const t_index i, const t_index j) const {
782
+ t_float sum = 0;
783
+ t_float const * Pi = i<N ? Xa+i*dim : Xnew+(i-N)*dim; // TBD
784
+ t_float const * Pj = j<N ? Xa+j*dim : Xnew+(j-N)*dim;
785
+ for (t_index k=0; k<dim; ++k) {
786
+ t_float diff = Pi[k] - Pj[k];
787
+ sum += diff*diff;
788
+ }
789
+ #if HAVE_DIAGNOSTIC
790
+ #pragma GCC diagnostic push
791
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
792
+ #endif
793
+ if (fc_isnan(sum))
794
+ throw(nan_error());
795
+ #if HAVE_DIAGNOSTIC
796
+ #pragma GCC diagnostic pop
797
+ #endif
798
+ return sum;
799
+ }
800
+
801
+ private:
802
+ void set_minkowski(PyObject * extraarg) {
803
+ if (extraarg==NULL) {
804
+ PyErr_SetString(PyExc_TypeError,
805
+ "The Minkowski metric needs a parameter.");
806
+ throw pythonerror();
807
+ }
808
+ postprocessarg = PyFloat_AsDouble(extraarg);
809
+ if (PyErr_Occurred()) {
810
+ throw pythonerror();
811
+ }
812
+
813
+ #if HAVE_DIAGNOSTIC
814
+ #pragma GCC diagnostic push
815
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
816
+ #endif
817
+ if (postprocessarg==std::numeric_limits<t_float>::infinity()) {
818
+ set_chebychev();
819
+ }
820
+ else if (postprocessarg==1.0){
821
+ set_cityblock();
822
+ }
823
+ else if (postprocessarg==2.0){
824
+ set_euclidean();
825
+ }
826
+ else {
827
+ distfn = &python_dissimilarity::minkowski;
828
+ postprocessfn = &cluster_result::power;
829
+ }
830
+ #if HAVE_DIAGNOSTIC
831
+ #pragma GCC diagnostic pop
832
+ #endif
833
+ }
834
+
835
+ void set_euclidean() {
836
+ distfn = &python_dissimilarity::sqeuclidean<false>;
837
+ postprocessfn = &cluster_result::sqrt;
838
+ }
839
+
840
+ void set_cityblock() {
841
+ distfn = &python_dissimilarity::cityblock;
842
+ }
843
+
844
+ void set_chebychev() {
845
+ distfn = &python_dissimilarity::chebychev;
846
+ }
847
+
848
+ t_float seuclidean(const t_index i, const t_index j) const {
849
+ t_float sum = 0;
850
+ for (t_index k=0; k<dim; ++k) {
851
+ t_float diff = X(i,k)-X(j,k);
852
+ sum += diff*diff/V_data[k];
853
+ }
854
+ return sum;
855
+ }
856
+
857
+ t_float cityblock(const t_index i, const t_index j) const {
858
+ t_float sum = 0;
859
+ for (t_index k=0; k<dim; ++k) {
860
+ sum += std::abs(X(i,k)-X(j,k));
861
+ }
862
+ return sum;
863
+ }
864
+
865
+ t_float minkowski(const t_index i, const t_index j) const {
866
+ t_float sum = 0;
867
+ for (t_index k=0; k<dim; ++k) {
868
+ sum += std::pow(std::abs(X(i,k)-X(j,k)),postprocessarg);
869
+ }
870
+ return sum;
871
+ }
872
+
873
+ t_float chebychev(const t_index i, const t_index j) const {
874
+ t_float max = 0;
875
+ for (t_index k=0; k<dim; ++k) {
876
+ t_float diff = std::abs(X(i,k)-X(j,k));
877
+ if (diff>max) {
878
+ max = diff;
879
+ }
880
+ }
881
+ return max;
882
+ }
883
+
884
+ t_float cosine(const t_index i, const t_index j) const {
885
+ t_float sum = 0;
886
+ for (t_index k=0; k<dim; ++k) {
887
+ sum -= X(i,k)*X(j,k);
888
+ }
889
+ return sum*precomputed[i]*precomputed[j];
890
+ }
891
+
892
+ t_float hamming(const t_index i, const t_index j) const {
893
+ t_float sum = 0;
894
+ for (t_index k=0; k<dim; ++k) {
895
+ #if HAVE_DIAGNOSTIC
896
+ #pragma GCC diagnostic push
897
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
898
+ #endif
899
+ sum += (X(i,k)!=X(j,k));
900
+ #if HAVE_DIAGNOSTIC
901
+ #pragma GCC diagnostic pop
902
+ #endif
903
+ }
904
+ return sum;
905
+ }
906
+
907
+ // Differs from scipy.spatial.distance: equal vectors correctly
908
+ // return distance 0.
909
+ t_float jaccard(const t_index i, const t_index j) const {
910
+ t_index sum1 = 0;
911
+ t_index sum2 = 0;
912
+ for (t_index k=0; k<dim; ++k) {
913
+ #if HAVE_DIAGNOSTIC
914
+ #pragma GCC diagnostic push
915
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
916
+ #endif
917
+ sum1 += (X(i,k)!=X(j,k));
918
+ sum2 += ((X(i,k)!=0) || (X(j,k)!=0));
919
+ #if HAVE_DIAGNOSTIC
920
+ #pragma GCC diagnostic pop
921
+ #endif
922
+ }
923
+ return sum1==0 ? 0 : static_cast<t_float>(sum1) / static_cast<t_float>(sum2);
924
+ }
925
+
926
+ t_float canberra(const t_index i, const t_index j) const {
927
+ t_float sum = 0;
928
+ for (t_index k=0; k<dim; ++k) {
929
+ t_float numerator = std::abs(X(i,k)-X(j,k));
930
+ #if HAVE_DIAGNOSTIC
931
+ #pragma GCC diagnostic push
932
+ #pragma GCC diagnostic ignored "-Wfloat-equal"
933
+ #endif
934
+ sum += numerator==0 ? 0 : numerator / (std::abs(X(i,k)) + std::abs(X(j,k)));
935
+ #if HAVE_DIAGNOSTIC
936
+ #pragma GCC diagnostic pop
937
+ #endif
938
+ }
939
+ return sum;
940
+ }
941
+
942
+ t_float user(const t_index i, const t_index j) const {
943
+ #if HAVE_DIAGNOSTIC
944
+ #pragma GCC diagnostic push
945
+ #pragma GCC diagnostic ignored "-Wold-style-cast"
946
+ #endif
947
+ PyObject * u = PySequence_ITEM(X_Python, i);
948
+ PyObject * v = PySequence_ITEM(X_Python, j);
949
+ PyObject * result = PyObject_CallFunctionObjArgs(userfn, u, v, NULL);
950
+ Py_DECREF(u);
951
+ Py_DECREF(v);
952
+ #if HAVE_DIAGNOSTIC
953
+ #pragma GCC diagnostic pop
954
+ #endif
955
+ if (result==NULL) {
956
+ throw pythonerror();
957
+ }
958
+ #if HAVE_DIAGNOSTIC
959
+ #pragma GCC diagnostic push
960
+ #pragma GCC diagnostic ignored "-Wold-style-cast"
961
+ #endif
962
+ const t_float C_result = PyFloat_AsDouble(result);
963
+ Py_DECREF(result);
964
+ #if HAVE_DIAGNOSTIC
965
+ #pragma GCC diagnostic pop
966
+ #endif
967
+ if (PyErr_Occurred()) {
968
+ throw pythonerror();
969
+ }
970
+ return C_result;
971
+ }
972
+
973
+ t_float braycurtis(const t_index i, const t_index j) const {
974
+ t_float sum1 = 0;
975
+ t_float sum2 = 0;
976
+ for (t_index k=0; k<dim; ++k) {
977
+ sum1 += std::abs(X(i,k)-X(j,k));
978
+ sum2 += std::abs(X(i,k)+X(j,k));
979
+ }
980
+ return sum1/sum2;
981
+ }
982
+
983
+ t_float mahalanobis(const t_index i, const t_index j) const {
984
+ // V_data contains the product X*VI
985
+ t_float sum = 0;
986
+ for (t_index k=0; k<dim; ++k) {
987
+ sum += (V_data[i*dim+k]-V_data[j*dim+k])*(X(i,k)-X(j,k));
988
+ }
989
+ return sum;
990
+ }
991
+
992
+ t_index mutable NTT; // 'local' variables
993
+ t_index mutable NXO;
994
+ t_index mutable NTF;
995
+ #define NTFFT NTF
996
+ #define NFFTT NTT
997
+
998
+ void nbool_correspond(const t_index i, const t_index j) const {
999
+ NTT = 0;
1000
+ NXO = 0;
1001
+ for (t_index k=0; k<dim; ++k) {
1002
+ NTT += (Xb(i,k) & Xb(j,k)) ;
1003
+ NXO += (Xb(i,k) ^ Xb(j,k)) ;
1004
+ }
1005
+ }
1006
+
1007
+ void nbool_correspond_tfft(const t_index i, const t_index j) const {
1008
+ NTT = 0;
1009
+ NXO = 0;
1010
+ NTF = 0;
1011
+ for (t_index k=0; k<dim; ++k) {
1012
+ NTT += (Xb(i,k) & Xb(j,k)) ;
1013
+ NXO += (Xb(i,k) ^ Xb(j,k)) ;
1014
+ NTF += (Xb(i,k) & !Xb(j,k)) ;
1015
+ }
1016
+ NTF *= (NXO-NTF); // NTFFT
1017
+ NTT *= (static_cast<t_index>(dim)-NTT-NXO); // NFFTT
1018
+ }
1019
+
1020
+ void nbool_correspond_xo(const t_index i, const t_index j) const {
1021
+ NXO = 0;
1022
+ for (t_index k=0; k<dim; ++k) {
1023
+ NXO += (Xb(i,k) ^ Xb(j,k)) ;
1024
+ }
1025
+ }
1026
+
1027
+ void nbool_correspond_tt(const t_index i, const t_index j) const {
1028
+ NTT = 0;
1029
+ for (t_index k=0; k<dim; ++k) {
1030
+ NTT += (Xb(i,k) & Xb(j,k)) ;
1031
+ }
1032
+ }
1033
+
1034
+ t_float yule(const t_index i, const t_index j) const {
1035
+ nbool_correspond_tfft(i, j);
1036
+ return (NTFFT==0) ? 0 :
1037
+ static_cast<t_float>(2*NTFFT) / static_cast<t_float>(NTFFT + NFFTT);
1038
+ }
1039
+
1040
+ // Prevent a zero denominator for equal vectors.
1041
+ t_float dice(const t_index i, const t_index j) const {
1042
+ nbool_correspond(i, j);
1043
+ return (NXO==0) ? 0 :
1044
+ static_cast<t_float>(NXO) / static_cast<t_float>(NXO+2*NTT);
1045
+ }
1046
+
1047
+ t_float rogerstanimoto(const t_index i, const t_index j) const {
1048
+ nbool_correspond_xo(i, j);
1049
+ return static_cast<t_float>(2*NXO) / static_cast<t_float>(NXO+dim);
1050
+ }
1051
+
1052
+ t_float russellrao(const t_index i, const t_index j) const {
1053
+ nbool_correspond_tt(i, j);
1054
+ return static_cast<t_float>(dim-NTT);
1055
+ }
1056
+
1057
+ // Prevent a zero denominator for equal vectors.
1058
+ t_float sokalsneath(const t_index i, const t_index j) const {
1059
+ nbool_correspond(i, j);
1060
+ return (NXO==0) ? 0 :
1061
+ static_cast<t_float>(2*NXO) / static_cast<t_float>(NTT+2*NXO);
1062
+ }
1063
+
1064
+ t_float kulsinski(const t_index i, const t_index j) const {
1065
+ nbool_correspond_tt(i, j);
1066
+ return static_cast<t_float>(NTT) * (precomputed[i] + precomputed[j]);
1067
+ }
1068
+
1069
+ // 'matching' distance = Hamming distance
1070
+ t_float matching(const t_index i, const t_index j) const {
1071
+ nbool_correspond_xo(i, j);
1072
+ return static_cast<t_float>(NXO);
1073
+ }
1074
+
1075
+ // Prevent a zero denominator for equal vectors.
1076
+ t_float jaccard_bool(const t_index i, const t_index j) const {
1077
+ nbool_correspond(i, j);
1078
+ return (NXO==0) ? 0 :
1079
+ static_cast<t_float>(NXO) / static_cast<t_float>(NXO+NTT);
1080
+ }
1081
+ };
1082
+
1083
+ static PyObject *linkage_vector_wrap(PyObject * const, PyObject * const args) {
1084
+ PyArrayObject * X, * Z;
1085
+ unsigned char method, metric;
1086
+ PyObject * extraarg;
1087
+
1088
+ try{
1089
+ // Parse the input arguments
1090
+ #if HAVE_DIAGNOSTIC
1091
+ #pragma GCC diagnostic push
1092
+ #pragma GCC diagnostic ignored "-Wold-style-cast"
1093
+ #endif
1094
+ if (!PyArg_ParseTuple(args, "O!O!bbO",
1095
+ &PyArray_Type, &X, // NumPy array
1096
+ &PyArray_Type, &Z, // NumPy array
1097
+ &method, // unsigned char
1098
+ &metric, // unsigned char
1099
+ &extraarg )) { // Python object
1100
+ return NULL;
1101
+ }
1102
+ #if HAVE_DIAGNOSTIC
1103
+ #pragma GCC diagnostic pop
1104
+ #endif
1105
+
1106
+ if (PyArray_NDIM(X) != 2) {
1107
+ PyErr_SetString(PyExc_ValueError,
1108
+ "The input array must be two-dimensional.");
1109
+ }
1110
+ npy_intp const N_ = PyArray_DIM(X, 0);
1111
+ if (N_ < 1 ) {
1112
+ // N must be at least 1.
1113
+ PyErr_SetString(PyExc_ValueError,
1114
+ "At least one element is needed for clustering.");
1115
+ return NULL;
1116
+ }
1117
+
1118
+ npy_intp const dim = PyArray_DIM(X, 1);
1119
+ if (dim < 1 ) {
1120
+ PyErr_SetString(PyExc_ValueError,
1121
+ "Invalid dimension of the data set.");
1122
+ return NULL;
1123
+ }
1124
+
1125
+ /*
1126
+ (1)
1127
+ The biggest index used below is 4*(N-2)+3, as an index to Z. This must
1128
+ fit into the data type used for indices.
1129
+ (2)
1130
+ The largest representable integer, without loss of precision, by a
1131
+ floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we
1132
+ make sure that all cluster labels from 0 to 2N-2 in the output can be
1133
+ accurately represented by a floating point number.
1134
+
1135
+ Conversion of N to 64 bits below is not really necessary but it prevents
1136
+ a warning ("shift count >= width of type") on systems where "int" is 32
1137
+ bits wide.
1138
+ */
1139
+ if (N_ > MAX_INDEX/4 || dim > MAX_INDEX ||
1140
+ static_cast<int64_t>(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) {
1141
+ PyErr_SetString(PyExc_ValueError,
1142
+ "Data is too big, index overflow.");
1143
+ return NULL;
1144
+ }
1145
+ t_index N = static_cast<t_index>(N_);
1146
+
1147
+ cluster_result Z2(N-1);
1148
+
1149
+ auto_array_ptr<t_index> members;
1150
+ if (method==METHOD_METR_WARD || method==METHOD_METR_WARD_D2 || method==METHOD_METR_CENTROID) {
1151
+ members.init(2*N-1, 1);
1152
+ }
1153
+
1154
+ if ((method!=METHOD_METR_SINGLE && metric!=METRIC_EUCLIDEAN) ||
1155
+ metric>=METRIC_INVALID) {
1156
+ PyErr_SetString(PyExc_IndexError, "Invalid metric index.");
1157
+ return NULL;
1158
+ }
1159
+
1160
+ if (PyArray_ISBOOL(X)) {
1161
+ if (metric==METRIC_HAMMING) {
1162
+ metric = METRIC_MATCHING; // Alias
1163
+ }
1164
+ if (metric==METRIC_JACCARD) {
1165
+ metric = METRIC_JACCARD_BOOL;
1166
+ }
1167
+ }
1168
+
1169
+ if (extraarg!=Py_None &&
1170
+ metric!=METRIC_MINKOWSKI &&
1171
+ metric!=METRIC_SEUCLIDEAN &&
1172
+ metric!=METRIC_MAHALANOBIS &&
1173
+ metric!=METRIC_USER) {
1174
+ PyErr_SetString(PyExc_TypeError,
1175
+ "No extra parameter is allowed for this metric.");
1176
+ return NULL;
1177
+ }
1178
+
1179
+ /* temp_point_array must be true if the alternative algorithm
1180
+ is used below (currently for the centroid and median methods). */
1181
+ bool temp_point_array = (method==METHOD_METR_CENTROID ||
1182
+ method==METHOD_METR_MEDIAN);
1183
+
1184
+ python_dissimilarity dist(X, members, static_cast<method_codes>(method),
1185
+ static_cast<metric_codes>(metric), extraarg,
1186
+ temp_point_array);
1187
+
1188
+ if (method!=METHOD_METR_SINGLE &&
1189
+ method!=METHOD_METR_WARD &&
1190
+ method!=METHOD_METR_WARD_D2 &&
1191
+ method!=METHOD_METR_CENTROID &&
1192
+ method!=METHOD_METR_MEDIAN) {
1193
+ PyErr_SetString(PyExc_IndexError, "Invalid method index.");
1194
+ return NULL;
1195
+ }
1196
+
1197
+ // Allow threads if the metric is not "user"!
1198
+ GIL_release G(metric!=METRIC_USER);
1199
+
1200
+ switch (method) {
1201
+ case METHOD_METR_SINGLE:
1202
+ MST_linkage_core_vector(N, dist, Z2);
1203
+ break;
1204
+ case METHOD_METR_WARD:
1205
+ generic_linkage_vector<METHOD_VECTOR_WARD>(N, dist, Z2);
1206
+ break;
1207
+ case METHOD_METR_WARD_D2:
1208
+ generic_linkage_vector<METHOD_VECTOR_WARD_D2>(N, dist, Z2);
1209
+ break;
1210
+ case METHOD_METR_CENTROID:
1211
+ generic_linkage_vector_alternative<METHOD_VECTOR_CENTROID>(N, dist, Z2);
1212
+ break;
1213
+ default: // case METHOD_METR_MEDIAN:
1214
+ generic_linkage_vector_alternative<METHOD_VECTOR_MEDIAN>(N, dist, Z2);
1215
+ }
1216
+
1217
+ if (method==METHOD_METR_WARD ||
1218
+ method==METHOD_METR_WARD_D2 ||
1219
+ method==METHOD_METR_CENTROID) {
1220
+ members.free();
1221
+ }
1222
+
1223
+ dist.postprocess(Z2);
1224
+
1225
+ t_float * const Z_ = reinterpret_cast<t_float *>(PyArray_DATA(Z));
1226
+ if (method!=METHOD_METR_SINGLE) {
1227
+ generate_SciPy_dendrogram<true>(Z_, Z2, N);
1228
+ }
1229
+ else {
1230
+ generate_SciPy_dendrogram<false>(Z_, Z2, N);
1231
+ }
1232
+ } // try
1233
+ catch (const std::bad_alloc&) {
1234
+ return PyErr_NoMemory();
1235
+ }
1236
+ catch(const std::exception& e){
1237
+ PyErr_SetString(PyExc_EnvironmentError, e.what());
1238
+ return NULL;
1239
+ }
1240
+ catch(const nan_error&){
1241
+ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value.");
1242
+ return NULL;
1243
+ }
1244
+ catch(const pythonerror){
1245
+ return NULL;
1246
+ }
1247
+ catch(...){
1248
+ PyErr_SetString(PyExc_EnvironmentError,
1249
+ "C++ exception (unknown reason). Please send a bug report.");
1250
+ return NULL;
1251
+ }
1252
+ #if HAVE_DIAGNOSTIC
1253
+ #pragma GCC diagnostic push
1254
+ #pragma GCC diagnostic ignored "-Wold-style-cast"
1255
+ #endif
1256
+ Py_RETURN_NONE;
1257
+ #if HAVE_DIAGNOSTIC
1258
+ #pragma GCC diagnostic pop
1259
+ #endif
1260
+ }
1261
+
1262
+ #if HAVE_VISIBILITY
1263
+ #pragma GCC visibility pop
1264
+ #endif