sequenzo 0.1.21__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (260) hide show
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-312-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-312-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-312-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-312-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-312-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-312-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-312-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-312-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1380 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : hierarchical_clustering.py
4
+ @Time : 18/12/2024 17:59
5
+ @Desc :
6
+ This module provides a flexible and user-friendly implementation of hierarchical clustering,
7
+ along with tools to evaluate cluster quality and analyze clustering results.
8
+
9
+ It supports common hierarchical clustering methods and evaluation metrics,
10
+ designed for social sequence analysis and other research applications.
11
+
12
+ This module leverages fastcluster, a tool specifically designed to enhance the efficiency of large-scale hierarchical clustering.
13
+ Unlike native Python tools such as SciPy, fastcluster optimizes linkage matrix computations,
14
+ enabling it to handle datasets with millions of entries more efficiently.
15
+
16
+ It has three main components:
17
+ 1. Cluster Class: Performs hierarchical clustering on a precomputed distance matrix.
18
+ 2. ClusterQuality Class: Evaluates the quality of clustering for different numbers of clusters using various metrics.
19
+ 3. ClusterResults Class: Analyzes and visualizes the clustering results (e.g., membership tables and cluster distributions).
20
+
21
+ WEIGHTED CLUSTERING SUPPORT:
22
+ All classes now support weighted data analysis:
23
+ - Cluster: Hierarchical linkage is computed on the given distance matrix (unweighted). Optional weights are applied to evaluation and summaries
24
+ - ClusterQuality: Computes weighted versions of quality metrics (ASWw, HG, R2, HC)
25
+ - ClusterResults: Provides weighted cluster distribution statistics and visualizations
26
+
27
+ Weighted metrics account for sequence importance when calculating clustering quality,
28
+ making the analysis more representative when sequences have different sampling weights
29
+ or population sizes.
30
+
31
+ WARD METHOD VARIANTS:
32
+ The module supports two Ward linkage variants:
33
+ - 'ward_d' (Ward D): Classic Ward method using squared Euclidean distances ÷ 2
34
+ - 'ward_d2' (Ward D2): Ward method using squared Euclidean distances
35
+ For backward compatibility, 'ward' maps to 'ward_d'.
36
+
37
+ The difference affects clustering results and dendrogram heights:
38
+ - Ward D produces smaller distances in the linkage matrix
39
+ - Ward D2 produces distances equal to the increase in cluster variance
40
+ - Both methods produce identical cluster assignments, only distances differ
41
+
42
+ ROBUSTNESS AND VALIDATION FEATURES:
43
+ - Ward Method Validation: Automatic detection of non-Euclidean distance matrices
44
+ - One-time Warning System: Alerts users when Ward methods are used with potentially incompatible distances
45
+ - Robust Matrix Cleanup: Handles NaN/Inf values using 95th percentile replacement
46
+ - Distance Matrix Validation: Ensures zero diagonal and non-negativity
47
+ - Symmetry Handling: Automatically symmetrizes matrices when required by clustering algorithms
48
+ - Method Recommendations: Suggests alternative methods for sequence distances
49
+
50
+ For sequence distances (OM, LCS, etc.), Ward linkage methods may produce suboptimal results.
51
+ Consider using alternative methods like 'average' (UPGMA) for better theoretical validity.
52
+
53
+ Original code references:
54
+ Cluster(): Derived from `hclust`, a key function from fastcluster
55
+ R code: https://github.com/cran/fastcluster/blob/master/R/fastcluster.R
56
+ Python code: https://github.com/fastcluster/fastcluster/blob/master/src/fastcluster.cpp
57
+ The Python version of fastcluster does not support the Ward D method but only Ward D2, whereas R supports both.
58
+ Thus, we provide Ward D by ourselves here.
59
+
60
+ ClusterQuality(): Derived from `wcClusterQuality()`, a key function from the WeightedCluster R package
61
+ CQI equivalence of R is here (two files):
62
+ https://github.com/cran/WeightedCluster/blob/master/src/clusterquality.cpp
63
+ https://github.com/cran/WeightedCluster/blob/master/src/clusterqualitybody.cpp
64
+ plot_cqi_scores(): `wcCmpCluster()` produces `clustrangefamily` object + `plot.clustrangefamily()` for plotting
65
+ """
66
+ import matplotlib.pyplot as plt
67
+ import seaborn as sns
68
+ import warnings
69
+ from matplotlib.ticker import MaxNLocator
70
+
71
+ import pandas as pd
72
+ import numpy as np
73
+ from scipy.cluster.hierarchy import fcluster, dendrogram
74
+ from scipy.spatial.distance import squareform
75
+ # sklearn metrics no longer needed - using C++ implementation
76
+ from fastcluster import linkage
77
+
78
+ # Optional R integration for Ward D method
79
+ _RPY2_AVAILABLE = False
80
+ ro = None
81
+ importr = None
82
+
83
+ try:
84
+ import rpy2.robjects as ro
85
+ from rpy2.robjects.packages import importr, PackageNotInstalledError
86
+ _RPY2_AVAILABLE = True
87
+ except Exception:
88
+ # Catch any error during rpy2 import/initialization (e.g., R not installed / R_HOME missing on Windows)
89
+ _RPY2_AVAILABLE = False
90
+
91
+ if not _RPY2_AVAILABLE:
92
+ print("[!] Warning: rpy2 not available or R not properly configured. Ward D clustering method will not be supported.")
93
+ print(" To use Ward D: please install R and properly configure rpy2 (make sure `R --version` works).")
94
+ print(" use 'ward_d2', 'average', 'complete', or 'single' methods.")
95
+ print(" See CRAN: https://cloud.r-project.org.")
96
+ print(" Alternatively, use the 'ward_d2', 'average', 'complete', or 'single' method instead.")
97
+
98
+ # Import C++ cluster quality functions
99
+ try:
100
+ from . import clustering_c_code
101
+ _CPP_AVAILABLE = True
102
+ except ImportError:
103
+ _CPP_AVAILABLE = False
104
+ print("[!] Warning: C++ cluster quality functions not available. Using Python fallback.")
105
+
106
+
107
+ # Corrected imports: Use relative imports *within* the package.
108
+ from sequenzo.visualization.utils import save_and_show_results
109
+
110
+ # Global flag to ensure Ward warning is only shown once per session
111
+ _WARD_WARNING_SHOWN = False
112
+
113
def _r_installation_guide(platform):
    """Return platform-specific instructions for installing and configuring R.

    Parameters
    ----------
    platform : str
        Value of ``sys.platform`` (e.g. 'darwin', 'linux', 'win32').

    Returns
    -------
    str
        Human-readable installation guidance for the given platform.
    """
    if platform == 'darwin':
        return (
            "No available R runtime environment detected.\n"
            "Please install and configure R first:\n"
            "1) Installation: Install R from CRAN (recommended macOS package): https://cloud.r-project.org \n"
            "   Or use Homebrew: brew install r\n"
            "2) Verification: Run `R --version` in the terminal to check the version information.\n"
            "3) If the error persists, you can set the environment variable manually:\n"
            "   export R_HOME=/Library/Frameworks/R.framework/Resources\n"
            "   and ensure that `which R` can locate the R executable."
        )
    if platform.startswith('linux'):
        return (
            "No available R runtime environment detected.\n"
            "Please install and configure R first:\n"
            "1) Installation: Use your distribution's package manager "
            "(e.g., Ubuntu: sudo apt-get install -y r-base; CentOS/RHEL: sudo yum install -y R)\n"
            "   or download from CRAN: https://cloud.r-project.org \n"
            "2) Verification: Run `R --version` in the terminal.\n"
            "3) If necessary, set R_HOME to point to the R installation directory (e.g., /usr/lib/R)."
        )
    if platform == 'win32':
        return (
            "No available R runtime environment detected.\n"
            "Please install and configure R first:\n"
            "1) Installation: Download and install R for Windows from CRAN: https://cloud.r-project.org \n"
            "2) Verification: Run `R --version` in PowerShell.\n"
            "3) If the error persists, add the R bin directory to your system PATH environment variable,\n"
            "   e.g., C:\\Program Files\\R\\R-x.y.z\\bin; if needed, set R_HOME to that R directory."
        )
    return (
        "No available R runtime environment detected. "
        "Please install R and ensure that `R --version` can be executed from the command line.\n"
        "CRAN: https://cloud.r-project.org"
    )


def _ensure_r_environment_and_fastcluster():
    """
    Ensure an R runtime is discoverable via rpy2 and that the R package
    'fastcluster' is installed.

    Steps:
        1. Probe the R runtime by importing R's 'utils' package; on failure,
           raise RuntimeError with platform-specific installation guidance.
        2. Make sure a CRAN mirror is configured (best effort; falls back to
           the cloud mirror, and silently continues if that also fails).
        3. Import R's 'fastcluster', auto-installing it from CRAN if missing.

    Raises
    ------
    RuntimeError
        If no usable R runtime is found, or if 'fastcluster' cannot be
        imported nor installed automatically.
    """
    # Local import: 'sys' is only needed for the guidance text and is not
    # imported in this module's top-level import block.
    import sys

    print(' - Checking R runtime environment and fastcluster.')

    # Probe the R runtime through rpy2. Any failure here (R not installed,
    # R_HOME missing, rpy2 misconfigured, or importr being None because rpy2
    # never imported) is translated into actionable, platform-specific help.
    try:
        utils = importr('utils')
    except Exception as e:
        raise RuntimeError(
            f"{_r_installation_guide(sys.platform)}\nOriginal error: {e}"
        ) from e

    # Ensure a CRAN mirror is selected so install.packages() can run
    # non-interactively. Best effort only: failures here are non-fatal.
    try:
        # If a mirror is not chosen, choose the first (may be reset by user later)
        utils.chooseCRANmirror(ind=1)
    except Exception:
        # Fallback to the cloud mirror; swallow errors deliberately since a
        # mirror may already be configured in the user's .Rprofile.
        try:
            ro.r('options(repos = c(CRAN = "https://cloud.r-project.org"))')
        except Exception:
            pass

    # Ensure the R 'fastcluster' package is available, installing on demand.
    try:
        importr('fastcluster')
    except PackageNotInstalledError:
        try:
            # Try install with explicit repo and limited parallelism
            utils.install_packages('fastcluster', repos='https://cloud.r-project.org', Ncpus=2)
            importr('fastcluster')
        except Exception as install_err:
            raise RuntimeError(
                "Failed to install R package 'fastcluster' automatically. "
                "Please ensure internet access and a working R toolchain are available. "
                f"Original error: {install_err}"
            ) from install_err
189
+
190
+
191
+
192
+ def _check_euclidean_compatibility(matrix, method):
193
+ """
194
+ Check if a distance matrix is likely compatible with Euclidean-based methods like Ward.
195
+
196
+ This performs heuristic checks rather than exact validation since perfect validation
197
+ would be computationally expensive for large matrices.
198
+
199
+ Parameters:
200
+ -----------
201
+ matrix : np.ndarray
202
+ Distance matrix to check
203
+ method : str
204
+ Clustering method name
205
+
206
+ Returns:
207
+ --------
208
+ bool
209
+ True if matrix appears Euclidean-compatible, False otherwise
210
+ """
211
+ # Check for Ward methods (both Ward D and Ward D2 require Euclidean distances)
212
+ if method.lower() not in ["ward", "ward_d", "ward_d2"]:
213
+ return True # Other methods don't require Euclidean distances
214
+
215
+ # Basic checks for Euclidean properties
216
+ n = matrix.shape[0]
217
+
218
+ # Check 1: Triangle inequality violations (sample a subset for large matrices)
219
+ sample_size = min(50, n) # Sample up to 50 points for efficiency
220
+ if n > sample_size:
221
+ indices = np.random.choice(n, sample_size, replace=False)
222
+ sample_matrix = matrix[np.ix_(indices, indices)]
223
+ else:
224
+ sample_matrix = matrix
225
+ indices = np.arange(n)
226
+
227
+ sample_n = sample_matrix.shape[0]
228
+ violations = 0
229
+ total_checks = 0
230
+
231
+ # Check triangle inequality: d(i,k) <= d(i,j) + d(j,k)
232
+ for i in range(sample_n):
233
+ for j in range(i + 1, sample_n):
234
+ for k in range(j + 1, sample_n):
235
+ dij = sample_matrix[i, j]
236
+ dik = sample_matrix[i, k]
237
+ djk = sample_matrix[j, k]
238
+
239
+ # Check all three triangle inequalities
240
+ if (dik > dij + djk + 1e-10 or
241
+ dij > dik + djk + 1e-10 or
242
+ djk > dij + dik + 1e-10):
243
+ violations += 1
244
+ total_checks += 1
245
+
246
+ if total_checks > 0:
247
+ violation_rate = violations / total_checks
248
+ if violation_rate > 0.1: # More than 10% violations suggests non-Euclidean
249
+ return False
250
+
251
+ # Check 2: Negative eigenvalues in distance matrix (indicates non-Euclidean)
252
+ # Use double centering to convert distances to inner products
253
+ try:
254
+ # For efficiency, only check this for smaller matrices
255
+ if sample_n <= 100:
256
+ H = np.eye(sample_n) - np.ones((sample_n, sample_n)) / sample_n
257
+ B = -0.5 * H @ (sample_matrix ** 2) @ H
258
+ eigenvals = np.linalg.eigvals(B)
259
+
260
+ # Check if there are significant negative eigenvalues
261
+ negative_eigenvals = eigenvals[eigenvals < -1e-10]
262
+ if len(negative_eigenvals) > 0:
263
+ neg_energy = -np.sum(negative_eigenvals)
264
+ total_energy = np.sum(np.abs(eigenvals))
265
+ if neg_energy / total_energy > 0.1: # > 10% negative energy
266
+ return False
267
+ except np.linalg.LinAlgError:
268
+ # If eigenvalue computation fails, assume potentially problematic
269
+ pass
270
+
271
+ return True
272
+
273
+
274
def _warn_ward_usage_once(matrix, method):
    """
    Issue a one-time warning about using Ward with potentially non-Euclidean distances.

    Parameters:
    -----------
    matrix : np.ndarray
        Distance matrix about to be clustered.
    method : str
        Clustering method name; only Ward variants trigger the check.
    """
    global _WARD_WARNING_SHOWN

    # Check for both Ward D and Ward D2 methods.
    # NOTE(review): the flag is only latched when a warning actually fires,
    # so the (sampled, randomized) compatibility heuristic re-runs on every
    # Ward call until an incompatible matrix is seen.
    if not _WARD_WARNING_SHOWN and method.lower() in ["ward", "ward_d", "ward_d2"]:
        if not _check_euclidean_compatibility(matrix, method):
            warnings.warn(
                "\n[!] Ward linkage method detected with potentially non-Euclidean distance matrix!\n"
                "    Ward clustering (both Ward D and Ward D2) assumes Euclidean distances for theoretical validity.\n"
                "    \n"
                "    Ward method variants:\n"
                "    - 'ward_d' (classic): Uses squared Euclidean distances ÷ 2\n"
                "    - 'ward_d2': Uses squared Euclidean distances\n"
                "    \n"
                "    For sequence distances (OM, LCS, etc.), consider using:\n"
                "    - method='average' (UPGMA)\n"
                "    - method='complete' (complete linkage)\n"
                "    - method='single' (single linkage)\n"
                "    \n"
                "    Note: 'centroid' and 'median' methods may also produce inversions\n"
                "    (non-monotonic dendrograms) with non-Euclidean distances.\n"
                "    \n"
                "    This warning is shown only once per session.",
                UserWarning,
                # stacklevel=3 points the warning at the user's call site,
                # skipping this helper and its caller.
                stacklevel=3
            )
            _WARD_WARNING_SHOWN = True
304
+
305
+
306
+ def _clean_distance_matrix(matrix):
307
+ """
308
+ Clean and validate a distance matrix for hierarchical clustering.
309
+
310
+ This function:
311
+ 1. Handles NaN/Inf values using robust percentile-based replacement
312
+ 2. Sets diagonal to zero
313
+ 3. Ensures non-negativity
314
+
315
+ Note: Symmetry is NOT enforced at this stage since distance matrices may legitimately
316
+ be asymmetric (e.g., directed sequence distances, time-dependent measures, etc.).
317
+ However, symmetrization will be performed later in linkage computation when required
318
+ by clustering algorithms.
319
+
320
+ Parameters:
321
+ -----------
322
+ matrix : np.ndarray
323
+ Input distance matrix
324
+
325
+ Returns:
326
+ --------
327
+ np.ndarray
328
+ Cleaned distance matrix
329
+ """
330
+ matrix = matrix.copy() # Don't modify the original
331
+
332
+ # Step 1: Handle NaN/Inf values with percentile-based replacement
333
+ if np.any(np.isnan(matrix)) or np.any(np.isinf(matrix)):
334
+ print("[!] Warning: Distance matrix contains NaN or Inf values.")
335
+
336
+ # Get finite values for percentile calculation
337
+ finite_vals = matrix[np.isfinite(matrix)]
338
+
339
+ if len(finite_vals) > 0:
340
+ # Use 95th percentile as replacement value (more conservative than max)
341
+ replacement_val = np.percentile(finite_vals, 95)
342
+ print(f" Replacing with 95th percentile value: {replacement_val:.6f}")
343
+ else:
344
+ # If no finite values, use 1.0 as default
345
+ replacement_val = 1.0
346
+ print(f" No finite values found, using default: {replacement_val}")
347
+
348
+ matrix[~np.isfinite(matrix)] = replacement_val
349
+
350
+ # Step 2: Force diagonal to be exactly zero (self-distance should be zero)
351
+ np.fill_diagonal(matrix, 0.0)
352
+
353
+ # Step 3: Ensure non-negativity (distance matrices should be non-negative)
354
+ if np.any(matrix < 0):
355
+ print("[!] Warning: Distance matrix contains negative values. Clipping to zero...")
356
+ matrix = np.maximum(matrix, 0.0)
357
+
358
+ return matrix
359
+
360
+
361
+ def _hclust_to_linkage_matrix(linkage_matrix):
362
+ """
363
+ Convert an R `hclust` object to a SciPy-compatible linkage matrix.
364
+
365
+ This function takes an `hclust` object returned by R (e.g., from
366
+ `fastcluster::hclust`) and converts it into the standard linkage matrix
367
+ format used by SciPy (`scipy.cluster.hierarchy.linkage`), which can be
368
+ used for dendrogram plotting or further clustering analysis in Python.
369
+
370
+ Parameters
371
+ ----------
372
+ linkage_matrix : rpy2.robjects.ListVector
373
+ An R `hclust` object. Expected to contain at least the following fields:
374
+ - 'merge': ndarray of shape (n-1, 2), indicating which clusters are merged
375
+ at each step (negative indices for original observations,
376
+ positive indices for previously merged clusters).
377
+ - 'height': ndarray of shape (n-1,), distances at which merges occur.
378
+ - 'order': ordering of the leaves.
379
+
380
+ Returns
381
+ -------
382
+ Z : numpy.ndarray, shape (n-1, 4), dtype=float
383
+ A SciPy-compatible linkage matrix where each row represents a merge:
384
+ - Z[i, 0] : index of the first cluster (0-based)
385
+ - Z[i, 1] : index of the second cluster (0-based)
386
+ - Z[i, 2] : distance between the merged clusters
387
+ - Z[i, 3] : total number of original samples in the newly formed cluster
388
+
389
+ Notes
390
+ -----
391
+ - The conversion handles the difference in indexing:
392
+ - In R's `hclust`, negative numbers in 'merge' indicate original samples
393
+ and positive numbers indicate previously merged clusters (1-based).
394
+ - In the returned SciPy linkage matrix, all indices are converted to 0-based.
395
+ - The function iteratively tracks cluster sizes to populate the fourth column
396
+ (sample counts) required by SciPy.
397
+ """
398
+
399
+ n = len(linkage_matrix.rx2("order")) # 样本数
400
+ merge = np.array(linkage_matrix.rx2("merge"), dtype=int) # (n-1, 2)
401
+ height = np.array(linkage_matrix.rx2("height"), dtype=float)
402
+
403
+ cluster_sizes = np.ones(n, dtype=int) # 单个样本初始大小 = 1
404
+ Z = np.zeros((n - 1, 4), dtype=float)
405
+
406
+ for i in range(n - 1):
407
+ a, b = merge[i]
408
+
409
+ # R hclust 编号负数表示原始样本
410
+ if a < 0:
411
+ idx1 = -a - 1 # 转成 0-based
412
+ size1 = 1
413
+ else:
414
+ idx1 = n + a - 1 # 已合并簇,0-based
415
+ size1 = cluster_sizes[idx1]
416
+
417
+ if b < 0:
418
+ idx2 = -b - 1
419
+ size2 = 1
420
+ else:
421
+ idx2 = n + b - 1
422
+ size2 = cluster_sizes[idx2]
423
+
424
+ Z[i, 0] = idx1
425
+ Z[i, 1] = idx2
426
+ Z[i, 2] = height[i]
427
+ Z[i, 3] = size1 + size2
428
+
429
+ # 更新 cluster_sizes,用于后续簇
430
+ cluster_sizes = np.append(cluster_sizes, size1 + size2)
431
+
432
+ return Z
433
+
434
+ class Cluster:
435
+ def __init__(self,
436
+ matrix,
437
+ entity_ids,
438
+ clustering_method="ward",
439
+ weights=None):
440
+ """
441
+ A class to handle hierarchical clustering operations using fastcluster for improved performance.
442
+
443
+ :param matrix: Precomputed distance matrix (full square form).
444
+ :param entity_ids: List of IDs corresponding to the entities in the matrix.
445
+ :param clustering_method: Clustering algorithm to use. Options include:
446
+ - "ward" or "ward_d": Classic Ward method (squared Euclidean distances ÷ 2) [default]
447
+ - "ward_d2": Ward method with squared Euclidean distances
448
+ - "single": Single linkage (minimum method)
449
+ - "complete": Complete linkage (maximum method)
450
+ - "average": Average linkage (UPGMA)
451
+ - "centroid": Centroid linkage
452
+ - "median": Median linkage
453
+ :param weights: Optional array of weights for each entity (default: None for equal weights).
454
+ """
455
+ # Ensure entity_ids is a numpy array for consistent processing
456
+ self.entity_ids = np.array(entity_ids)
457
+
458
+ # Check if entity_ids is valid
459
+ if len(self.entity_ids) != len(matrix):
460
+ raise ValueError("Length of entity_ids must match the size of the matrix.")
461
+
462
+ # Optional: Check uniqueness of entity_ids
463
+ if len(np.unique(self.entity_ids)) != len(self.entity_ids):
464
+ raise ValueError("entity_ids must contain unique values.")
465
+
466
+ # Initialize and validate weights
467
+ if weights is not None:
468
+ self.weights = np.array(weights, dtype=np.float64)
469
+ if len(self.weights) != len(matrix):
470
+ raise ValueError("Length of weights must match the size of the matrix.")
471
+ if np.any(self.weights < 0):
472
+ raise ValueError("All weights must be non-negative.")
473
+ if np.sum(self.weights) == 0:
474
+ raise ValueError("Sum of weights must be greater than zero.")
475
+ else:
476
+ # Default to equal weights (all ones)
477
+ self.weights = np.ones(len(matrix), dtype=np.float64)
478
+
479
+ # Convert matrix to numpy array if it's a DataFrame
480
+ if isinstance(matrix, pd.DataFrame):
481
+ print("[>] Converting DataFrame to NumPy array...")
482
+ self.full_matrix = matrix.values
483
+ else:
484
+ self.full_matrix = matrix
485
+
486
+ # Verify matrix is in square form
487
+ if len(self.full_matrix.shape) != 2 or self.full_matrix.shape[0] != self.full_matrix.shape[1]:
488
+ raise ValueError("Input must be a full square-form distance matrix.")
489
+
490
+ self.clustering_method = clustering_method.lower()
491
+
492
+ # Supported clustering methods
493
+ supported_methods = ["ward", "ward_d", "ward_d2", "single", "complete", "average", "centroid", "median"]
494
+ if self.clustering_method not in supported_methods:
495
+ raise ValueError(
496
+ f"Unsupported clustering method '{clustering_method}'. Supported methods: {supported_methods}")
497
+
498
+ # Handle backward compatibility: 'ward' maps to 'ward_d' (classic Ward method)
499
+ if self.clustering_method == "ward":
500
+ self.clustering_method = "ward_d"
501
+ print("[>] Note: 'ward' method maps to 'ward_d' (classic Ward method).")
502
+ print(" Use 'ward_d2' for Ward method with squared Euclidean distances.")
503
+
504
+ # Compute linkage matrix using fastcluster
505
+ self.linkage_matrix = self._compute_linkage()
506
+
507
+ def _compute_linkage(self):
508
+ """
509
+ Compute the linkage matrix using fastcluster for improved performance.
510
+ Supports both Ward D (classic) and Ward D2 methods.
511
+ """
512
+ # Clean and validate the distance matrix using robust methods
513
+ self.full_matrix = _clean_distance_matrix(self.full_matrix)
514
+
515
+ # Check Ward compatibility and issue one-time warning if needed
516
+ _warn_ward_usage_once(self.full_matrix, self.clustering_method)
517
+
518
+ # Check symmetry before converting to condensed form
519
+ # squareform() requires symmetric matrices
520
+ if not np.allclose(self.full_matrix, self.full_matrix.T, rtol=1e-5, atol=1e-8):
521
+ print("[!] Warning: Distance matrix is not symmetric.")
522
+ print(" Hierarchical clustering algorithms require symmetric distance matrices.")
523
+ print(" Automatically symmetrizing using (matrix + matrix.T) / 2")
524
+ print(" If this is not appropriate for your data, please provide a symmetric matrix.")
525
+ self.full_matrix = (self.full_matrix + self.full_matrix.T) / 2
526
+
527
+ # Convert square matrix to condensed form
528
+ self.condensed_matrix = squareform(self.full_matrix)
529
+
530
+ try:
531
+ # Map our method names to fastcluster's expected method names
532
+ fastcluster_method = self._map_method_name(self.clustering_method)
533
+
534
+ if self.clustering_method == "ward_d" or self.clustering_method == "ward":
535
+ if not _RPY2_AVAILABLE:
536
+ raise ImportError(
537
+ "rpy2 is required for Ward D clustering method but is not available or R is not properly configured.\n"
538
+ "Install rpy2 and ensure R is properly set up, or install with: pip install sequenzo[r]\n"
539
+ "Alternatively, use 'ward_d2', 'average', 'complete', or 'single' methods."
540
+ )
541
+ # Ensure R and R package fastcluster are available (auto-install if needed)
542
+ _ensure_r_environment_and_fastcluster()
543
+
544
+ fastcluster_r = importr("fastcluster")
545
+
546
+ # 将 full_matrix 转换为 R 矩阵(直接从 Python 数组创建),避免 rpy2 对大向量长度出错
547
+ # 用‘F’强制按列展开,符合 R 的内存布局(列优先)
548
+ full_matrix_r = ro.r.matrix(ro.FloatVector(self.full_matrix.flatten('F')),
549
+ nrow=self.full_matrix.shape[0], ncol=self.full_matrix.shape[1])
550
+ r_om = ro.r['as.dist'](full_matrix_r)
551
+
552
+ linkage_matrix = fastcluster_r.hclust(r_om, method="ward.D")
553
+
554
+ linkage_matrix = _hclust_to_linkage_matrix(linkage_matrix)
555
+
556
+ else:
557
+ linkage_matrix = linkage(self.condensed_matrix, method=fastcluster_method)
558
+
559
+ # Apply Ward D correction if needed (divide distances by 2 for classic Ward)
560
+ # if self.clustering_method == "ward_d":
561
+ # linkage_matrix = self._apply_ward_d_correction(linkage_matrix)
562
+
563
+ except Exception as e:
564
+ raise RuntimeError(
565
+ f"Failed to compute linkage with method '{self.clustering_method}'. "
566
+ "Check that the distance matrix is square, symmetric, finite, non-negative, and has a zero diagonal. "
567
+ "For sequence distances, consider using 'average', 'complete', or 'single' instead of Ward methods. "
568
+ f"Original error: {e}"
569
+ )
570
+ return linkage_matrix
571
+
572
+ def _map_method_name(self, method):
573
+ """
574
+ Map our internal method names to fastcluster's expected method names.
575
+ """
576
+ method_mapping = {
577
+ "ward_d": "ward", # Classic Ward (will be corrected later)
578
+ "ward_d2": "ward", # Ward D2 (no correction needed)
579
+ "single": "single",
580
+ "complete": "complete",
581
+ "average": "average",
582
+ "centroid": "centroid",
583
+ "median": "median"
584
+ }
585
+ return method_mapping.get(method, method)
586
+
587
+ def _apply_ward_d_correction(self, linkage_matrix):
588
+ """
589
+ Apply Ward D correction by dividing distances by 2.
590
+ This converts Ward D2 results to classic Ward D results.
591
+ """
592
+ linkage_corrected = linkage_matrix.copy()
593
+ linkage_corrected[:, 2] = linkage_corrected[:, 2] / 2.0
594
+ print("[>] Applied Ward D correction: distances divided by 2 for classic Ward method.")
595
+ return linkage_corrected
596
+
597
+ def plot_dendrogram(self,
598
+ save_as=None,
599
+ style="whitegrid",
600
+ title="Dendrogram",
601
+ xlabel="Entities",
602
+ ylabel="Distance",
603
+ grid=False,
604
+ dpi=200,
605
+ figsize=(12, 8)):
606
+ """
607
+ Plot a dendrogram of the hierarchical clustering with optional high-resolution output.
608
+
609
+ :param save_as: File path to save the plot. If None, the plot will be shown.
610
+ :param style: Seaborn style for the plot.
611
+ :param title: Title of the plot.
612
+ :param xlabel: X-axis label.
613
+ :param ylabel: Y-axis label.
614
+ :param grid: Whether to display grid lines.
615
+ :param dpi: Dots per inch for the saved image (default: 300 for high resolution).
616
+ :param figsize: Tuple specifying the figure size in inches (default: (12, 8)).
617
+ """
618
+ if self.linkage_matrix is None:
619
+ raise ValueError("Linkage matrix is not computed.")
620
+
621
+ sns.set(style=style)
622
+ plt.figure(figsize=figsize)
623
+ dendrogram(self.linkage_matrix, labels=None) # Do not plot labels for large datasets
624
+ plt.xticks([])
625
+ plt.title(title, fontsize=14, fontweight="bold")
626
+ plt.xlabel(xlabel)
627
+ plt.ylabel(ylabel)
628
+ if not grid:
629
+ plt.grid(False)
630
+
631
+ save_and_show_results(save_as, dpi=200)
632
+
633
+ def get_cluster_labels(self, num_clusters):
634
+ """
635
+ Get cluster labels for a specified number of clusters.
636
+
637
+ There is a common point of confusion because
638
+ k is typically used to represent the number of clusters in clustering algorithms (e.g., k-means).
639
+
640
+ However, SciPy's hierarchical clustering API specifically uses t as the parameter name.
641
+
642
+ :param num_clusters: The number of clusters to create.
643
+ :return: Array of cluster labels corresponding to entity_ids.
644
+ """
645
+ if self.linkage_matrix is None:
646
+ raise ValueError("Linkage matrix is not computed.")
647
+
648
+ cluster_labels = fcluster(self.linkage_matrix, t=num_clusters, criterion="maxclust")
649
+
650
+ return cluster_labels
651
+
652
+
653
+ class ClusterQuality:
654
    def __init__(self, matrix_or_cluster, max_clusters=20, clustering_method=None, weights=None):
        """
        Initialize the ClusterQuality class for precomputed distance matrices or a Cluster instance.

        Allow the ClusterQuality class to directly accept a Cluster instance
        and internally extract the relevant matrix (cluster.full_matrix)
        and clustering method (cluster.clustering_method).

        This keeps the user interface clean and simple while handling the logic under the hood.

        :param matrix_or_cluster: The precomputed distance matrix (full square form or condensed form)
                                  or an instance of the Cluster class.
        :param max_clusters: Maximum number of clusters to evaluate (default: 20).
        :param clustering_method: Clustering algorithm to use. If None, inherit from Cluster instance.
        :param weights: Optional array of weights for each entity. If None and using Cluster instance,
                        weights will be extracted from the Cluster object.
        """
        if isinstance(matrix_or_cluster, Cluster):
            # Reuse everything the Cluster instance already computed,
            # including its linkage matrix (no recomputation needed).
            # NOTE(review): on this path `clustering_method` and `weights`
            # arguments are silently ignored in favor of the Cluster's own.
            self.matrix = matrix_or_cluster.full_matrix
            self.clustering_method = matrix_or_cluster.clustering_method
            self.linkage_matrix = matrix_or_cluster.linkage_matrix
            self.weights = matrix_or_cluster.weights

        elif isinstance(matrix_or_cluster, (np.ndarray, pd.DataFrame)):
            # Handle direct matrix input
            if isinstance(matrix_or_cluster, pd.DataFrame):
                print("[>] Detected Pandas DataFrame. Converting to NumPy array...")
                matrix_or_cluster = matrix_or_cluster.values
            self.matrix = matrix_or_cluster
            self.clustering_method = clustering_method or "ward_d"  # Default to classic Ward

            # Initialize weights for direct matrix input
            if weights is not None:
                self.weights = np.array(weights, dtype=np.float64)
                if len(self.weights) != len(self.matrix):
                    raise ValueError("Length of weights must match the size of the matrix.")
            else:
                # Equal weights when none are supplied
                self.weights = np.ones(len(self.matrix), dtype=np.float64)

            # Compute linkage matrix for direct input (needed for clustering operations)
            self.linkage_matrix = self._compute_linkage_for_direct_input()

        else:
            raise ValueError(
                "Input must be a Cluster instance, a NumPy array, or a Pandas DataFrame."
            )

        # Square-form check applies to both input paths
        if self.matrix.shape[0] != self.matrix.shape[1]:
            raise ValueError("Matrix must be a full square-form distance matrix.")

        self.max_clusters = max_clusters
        # Fixed display/reporting order for the quality indicators
        # (presumably mirrors the R WeightedCluster output order — verify).
        self.metric_order = [
            "PBC",
            "HG",
            "HGSD",
            "ASW",
            "ASWw",
            "CH",
            "R2",
            "CHsq",
            "R2sq",
            "HC",
        ]
        # One list of per-k values per metric, filled by
        # compute_cluster_quality_scores().
        self.scores = {metric: [] for metric in self.metric_order}

        # Store original scores separately to preserve raw values
        self.original_scores = None
722
+
723
+ def _compute_linkage_for_direct_input(self):
724
+ """
725
+ Compute linkage matrix for direct matrix input (similar to Cluster class logic).
726
+ Supports both Ward D and Ward D2 methods.
727
+ """
728
+ # Handle backward compatibility: 'ward' maps to 'ward_d'
729
+ if self.clustering_method == "ward":
730
+ self.clustering_method = "ward_d"
731
+ print("[>] Note: 'ward' method maps to 'ward_d' (classic Ward method).")
732
+ print(" Use 'ward_d2' for Ward method with squared Euclidean distances.")
733
+
734
+ # Clean and validate the distance matrix using robust methods
735
+ self.matrix = _clean_distance_matrix(self.matrix)
736
+
737
+ # Check Ward compatibility and issue one-time warning if needed
738
+ _warn_ward_usage_once(self.matrix, self.clustering_method)
739
+
740
+ # Check symmetry before converting to condensed form
741
+ # squareform() requires symmetric matrices
742
+ if not np.allclose(self.matrix, self.matrix.T, rtol=1e-5, atol=1e-8):
743
+ print("[!] Warning: Distance matrix is not symmetric.")
744
+ print(" Hierarchical clustering algorithms require symmetric distance matrices.")
745
+ print(" Automatically symmetrizing using (matrix + matrix.T) / 2")
746
+ print(" If this is not appropriate for your data, please provide a symmetric matrix.")
747
+ self.matrix = (self.matrix + self.matrix.T) / 2
748
+
749
+ # Convert square matrix to condensed form for linkage computation
750
+ condensed_matrix = squareform(self.matrix)
751
+
752
+ try:
753
+ # Map our method names to fastcluster's expected method names
754
+ fastcluster_method = self._map_method_name(self.clustering_method)
755
+ linkage_matrix = linkage(condensed_matrix, method=fastcluster_method)
756
+
757
+ # Apply Ward D correction if needed
758
+ if self.clustering_method == "ward_d":
759
+ linkage_matrix = self._apply_ward_d_correction(linkage_matrix)
760
+
761
+ except Exception as e:
762
+ raise RuntimeError(
763
+ f"Failed to compute linkage with method '{self.clustering_method}'. "
764
+ "Check that the distance matrix is square, symmetric, finite, non-negative, and has a zero diagonal. "
765
+ "For sequence distances, consider using 'average', 'complete', or 'single' instead of Ward methods. "
766
+ f"Original error: {e}"
767
+ )
768
+ return linkage_matrix
769
+
770
+ def _map_method_name(self, method):
771
+ """
772
+ Map our internal method names to fastcluster's expected method names.
773
+ """
774
+ method_mapping = {
775
+ "ward_d": "ward", # Classic Ward (will be corrected later)
776
+ "ward_d2": "ward", # Ward D2 (no correction needed)
777
+ "single": "single",
778
+ "complete": "complete",
779
+ "average": "average",
780
+ "centroid": "centroid",
781
+ "median": "median"
782
+ }
783
+ return method_mapping.get(method, method)
784
+
785
+ def _apply_ward_d_correction(self, linkage_matrix):
786
+ """
787
+ Apply Ward D correction by dividing distances by 2.
788
+ This converts Ward D2 results to classic Ward D results.
789
+ """
790
+ linkage_corrected = linkage_matrix.copy()
791
+ linkage_corrected[:, 2] = linkage_corrected[:, 2] / 2.0
792
+ print("[>] Applied Ward D correction: distances divided by 2 for classic Ward method.")
793
+ return linkage_corrected
794
+
795
+ def compute_cluster_quality_scores(self):
796
+ """
797
+ Compute clustering quality scores for different numbers of clusters.
798
+
799
+ Uses C++ implementation for accuracy and performance.
800
+ This implementation aligns with R WeightedCluster package results.
801
+ """
802
+ if not _CPP_AVAILABLE:
803
+ raise RuntimeError(
804
+ "C++ cluster quality implementation is not available. "
805
+ "Please ensure the C++ extensions are properly compiled."
806
+ )
807
+ self._compute_cluster_quality_scores_cpp()
808
+
809
+ # Save original scores immediately after computation
810
+ self.original_scores = {}
811
+ for metric, values in self.scores.items():
812
+ self.original_scores[metric] = np.array(values).copy()
813
+
814
+ def _compute_cluster_quality_scores_cpp(self):
815
+ """
816
+ Compute clustering quality scores using C++ implementation (matches R WeightedCluster).
817
+ """
818
+ # Convert matrix to format expected by C++
819
+ # Ensure we have a full square matrix
820
+ if self.matrix.shape[0] != self.matrix.shape[1]:
821
+ raise ValueError("Matrix must be square for C++ implementation")
822
+
823
+ # Convert to condensed once to reduce per-call overhead in C++
824
+ condensed = squareform(self.matrix)
825
+
826
+ for k in range(2, self.max_clusters + 1):
827
+ # Get cluster labels (fcluster returns 1-based labels, which C++ expects)
828
+ labels = fcluster(self.linkage_matrix, k, criterion="maxclust")
829
+
830
+ try:
831
+ # Call C++ function (condensed) - expects 1-based labels
832
+ result = clustering_c_code.cluster_quality_condensed(
833
+ condensed.astype(np.float64, copy=False),
834
+ labels.astype(np.int32, copy=False),
835
+ self.weights.astype(np.float64, copy=False),
836
+ self.matrix.shape[0],
837
+ k
838
+ )
839
+
840
+ # Extract results from C++ (mapping to match R WeightedCluster exactly)
841
+ for metric in self.metric_order:
842
+ self.scores[metric].append(result.get(metric, np.nan))
843
+
844
+ except Exception as e:
845
+ print(f"[!] Error: C++ computation failed for k={k}: {e}")
846
+ print(" Python fallback has been removed due to accuracy issues.")
847
+ # Insert NaN values for failed computation
848
+ for metric in self.metric_order:
849
+ self.scores[metric].append(np.nan)
850
+ raise RuntimeError(f"C++ cluster quality computation failed for k={k}. "
851
+ "Python fallback is not available.")
852
+
853
+ def _compute_cluster_quality_scores_python(self):
854
+ """
855
+ Python fallback implementation has been removed.
856
+ Only C++ implementation is available for accuracy and performance.
857
+ """
858
+ raise NotImplementedError(
859
+ "Python cluster quality implementation has been removed due to accuracy issues. "
860
+ "Please use C++ implementation by setting use_cpp=True (default)."
861
+ )
862
+
863
+ def _normalize_scores(self, method="zscore") -> None:
864
+ """
865
+ Normalize each metric independently.
866
+
867
+ :param method: Normalization method. Options are "zscore" or "range".
868
+ """
869
+ for metric in self.scores:
870
+ values = np.array(self.scores[metric])
871
+ if method == "zscore":
872
+ mean_val = np.nanmean(values)
873
+ std_val = np.nanstd(values)
874
+ if std_val > 0:
875
+ self.scores[metric] = (values - mean_val) / std_val
876
+ elif method == "range":
877
+ min_val = np.nanmin(values)
878
+ max_val = np.nanmax(values)
879
+ if max_val > min_val:
880
+ self.scores[metric] = (values - min_val) / (max_val - min_val)
881
+
882
+ def get_cluster_range_table(self) -> pd.DataFrame:
883
+ """
884
+ Return a metrics-by-cluster table mirroring R's `as.clustrange()` output.
885
+
886
+ :return: DataFrame indexed by cluster count ("cluster2", ...)
887
+ with raw metric values for each quality indicator.
888
+ """
889
+ # Prefer preserved raw scores to avoid normalization side-effects
890
+ if self.original_scores is not None:
891
+ scores_to_use = self.original_scores
892
+ else:
893
+ scores_to_use = self.scores
894
+
895
+ # Ensure metrics are available
896
+ if not scores_to_use or not any(len(scores_to_use[m]) for m in self.metric_order):
897
+ raise ValueError("Cluster quality scores are empty. Run `compute_cluster_quality_scores()` first.")
898
+
899
+ # Determine number of evaluated cluster counts
900
+ lengths = [len(scores_to_use[metric]) for metric in self.metric_order if metric in scores_to_use]
901
+ if not lengths:
902
+ raise ValueError("No recognized metrics found in scores.")
903
+
904
+ if len(set(lengths)) != 1:
905
+ raise ValueError("Inconsistent metric lengths detected. Please recompute cluster quality scores.")
906
+
907
+ n_rows = lengths[0]
908
+ if n_rows == 0:
909
+ raise ValueError("Cluster quality scores contain no entries.")
910
+
911
+ # Build DataFrame matching R output ordering
912
+ data = {}
913
+ for metric in self.metric_order:
914
+ values = scores_to_use.get(metric)
915
+ if values is None:
916
+ continue
917
+ data[metric] = np.array(values, dtype=np.float64)
918
+
919
+ index_labels = [f"cluster{k}" for k in range(2, 2 + n_rows)]
920
+ table = pd.DataFrame(data, index=index_labels)
921
+ table.index.name = "Cluster"
922
+
923
+ return table
924
+
925
+ def get_cqi_table(self):
926
+ """
927
+ Generate a summary table of clustering quality indicators with concise column names.
928
+
929
+ :return: Pandas DataFrame summarizing the optimal number of clusters (N groups),
930
+ the corresponding raw metric values, and z-score normalized values.
931
+ """
932
+ # Use original scores if available, otherwise fall back to current scores
933
+ if self.original_scores is not None:
934
+ scores_to_use = self.original_scores
935
+ else:
936
+ scores_to_use = self.scores
937
+
938
+ # Deep copy to avoid overwriting during normalization
939
+ original_scores = {}
940
+ for metric, values in scores_to_use.items():
941
+ original_scores[metric] = np.array(values).copy()
942
+
943
+ # Create temporary copy for z-score normalization
944
+ temp_scores = {}
945
+ for metric, values in original_scores.items():
946
+ temp_scores[metric] = values.copy()
947
+
948
+ # Apply z-score normalization to temp copy
949
+ zscore_normalized = {}
950
+ for metric in temp_scores:
951
+ values = temp_scores[metric]
952
+ mean_val = np.nanmean(values)
953
+ std_val = np.nanstd(values)
954
+ if std_val > 0:
955
+ zscore_normalized[metric] = (values - mean_val) / std_val
956
+ else:
957
+ zscore_normalized[metric] = values.copy()
958
+
959
+ # Generate summary table (removed redundant Min-Max Norm column)
960
+ summary = {
961
+ "Metric": [],
962
+ "Opt. Clusters": [], # Abbreviated from "Optimal Clusters"
963
+ "Raw Value": [], # Raw optimal value (not normalized)
964
+ "Z-Score Norm.": [], # Z-Score normalized optimal value
965
+ }
966
+
967
+ # Get maximum value and its position from original scores
968
+ for metric in self.metric_order:
969
+ values = original_scores.get(metric)
970
+ if values is None:
971
+ continue
972
+
973
+ if np.all(np.isnan(values)):
974
+ optimal_k, raw_value, z_val = np.nan, np.nan, np.nan
975
+ else:
976
+ pos = np.nanargmax(values)
977
+ optimal_k = pos + 2
978
+ raw_value = values[pos] # Use raw original value
979
+ z_val = zscore_normalized[metric][pos]
980
+
981
+ # Add data to the summary table
982
+ summary["Metric"].append(metric)
983
+ summary["Opt. Clusters"].append(optimal_k)
984
+ summary["Raw Value"].append(raw_value) # Raw value, not normalized
985
+ summary["Z-Score Norm."].append(z_val)
986
+
987
+ return pd.DataFrame(summary)
988
+
989
+ def plot_cqi_scores(self,
990
+ metrics_list=None,
991
+ norm="zscore",
992
+ palette="husl",
993
+ line_width=2,
994
+ style="whitegrid",
995
+ title=None,
996
+ xlabel="Number of Clusters",
997
+ ylabel="Normalized Score",
998
+ grid=True,
999
+ save_as=None,
1000
+ dpi=200,
1001
+ figsize=(12, 8),
1002
+ show=True
1003
+ ):
1004
+ """
1005
+ Plot combined scores for clustering quality indicators with customizable parameters.
1006
+
1007
+ This function displays normalized metric values for easier comparison while preserving
1008
+ the original statistical properties in the legend.
1009
+
1010
+ It first calculates raw means and standard deviations from the original data before applying any normalization,
1011
+ then uses these raw statistics in the legend labels to provide context about the actual scale and
1012
+ distribution of each metric.
1013
+
1014
+ :param metrics_list: List of metrics to plot (default: all available metrics)
1015
+ :param norm: Normalization method for plotting ("zscore", "range", or "none")
1016
+ :param palette: Color palette for the plot
1017
+ :param line_width: Width of plotted lines
1018
+ :param style: Seaborn style for the plot
1019
+ :param title: Plot title
1020
+ :param xlabel: X-axis label
1021
+ :param ylabel: Y-axis label
1022
+ :param grid: Whether to show grid lines
1023
+ :param save_as: File path to save the plot
1024
+ :param dpi: DPI for saved image
1025
+ :param figsize: Figure size in inches
1026
+ :param show: Whether to display the figure (default: True)
1027
+
1028
+ :return: The figure object
1029
+ """
1030
+ # Store original scores before normalization
1031
+ original_scores = self.scores.copy()
1032
+
1033
+ # Calculate statistics from original data
1034
+ original_stats = {}
1035
+ for metric in metrics_list or self.metric_order:
1036
+ values = np.array(original_scores[metric])
1037
+ original_stats[metric] = {
1038
+ 'mean': np.nanmean(values),
1039
+ 'std': np.nanstd(values)
1040
+ }
1041
+
1042
+ # Apply normalization if requested
1043
+ if norm != "none":
1044
+ self._normalize_scores(method=norm)
1045
+
1046
+ # Set up plot
1047
+ sns.set(style=style)
1048
+ palette_colors = sns.color_palette(palette, len(metrics_list) if metrics_list else len(self.scores))
1049
+ plt.figure(figsize=figsize)
1050
+
1051
+ if metrics_list is None:
1052
+ metrics_list = list(self.metric_order)
1053
+ else:
1054
+ metrics_list = [metric for metric in metrics_list if metric in self.metric_order]
1055
+
1056
+ # Plot each metric
1057
+ for idx, metric in enumerate(metrics_list):
1058
+ values = np.array(self.scores[metric])
1059
+
1060
+ # Use original statistics for legend
1061
+ mean_val = original_stats[metric]['mean']
1062
+ std_val = original_stats[metric]['std']
1063
+ legend_label = f"{metric} ({mean_val:.2f} / {std_val:.2f})"
1064
+
1065
+ plt.plot(
1066
+ range(2, self.max_clusters + 1),
1067
+ values,
1068
+ label=legend_label,
1069
+ color=palette_colors[idx],
1070
+ linewidth=line_width,
1071
+ )
1072
+
1073
+ # Set title and labels
1074
+ if title is None:
1075
+ title = "Cluster Quality Metrics"
1076
+
1077
+ plt.title(title, fontsize=14, fontweight="bold")
1078
+ plt.xlabel(xlabel, fontsize=12)
1079
+ plt.ylabel(ylabel, fontsize=12)
1080
+
1081
+ # Configure ticks and legend
1082
+ plt.xticks(ticks=range(2, self.max_clusters + 1), fontsize=10)
1083
+ plt.yticks(fontsize=10)
1084
+ plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
1085
+ plt.legend(title="Metrics (Raw Mean / Std Dev)", fontsize=10, title_fontsize=12)
1086
+
1087
+ # Add a note about normalization
1088
+ norm_note = f"Note: Lines show {norm} normalized values; legend shows raw statistics"
1089
+ plt.figtext(0.5, 0.01, norm_note, ha='center', fontsize=10, style='italic')
1090
+
1091
+ # Configure grid
1092
+ if grid:
1093
+ plt.grid(True, linestyle="--", alpha=0.7)
1094
+ else:
1095
+ plt.grid(False)
1096
+
1097
+ # Adjust layout to make room for the note
1098
+ plt.tight_layout()
1099
+ plt.subplots_adjust(bottom=0.1)
1100
+
1101
+ # Save and show the plot
1102
+ return save_and_show_results(save_as, dpi, show=show)
1103
+
1104
+
1105
class ClusterResults:
    """
    Convenience accessor for the results of a hierarchical clustering run.

    Wraps a fitted Cluster instance and exposes membership tables, cluster
    distribution summaries (optionally weighted), and a distribution bar chart.
    """

    def __init__(self, cluster):
        """
        Initialize the ClusterResults class.

        :param cluster: An instance of the Cluster class.
        :raises ValueError: If `cluster` is not a Cluster instance.
        """
        if not isinstance(cluster, Cluster):
            raise ValueError("Input must be an instance of the Cluster class.")

        self.linkage_matrix = cluster.linkage_matrix
        self.entity_ids = cluster.entity_ids  # Retrieve entity IDs from Cluster class
        self.weights = cluster.weights  # Retrieve weights from Cluster class

    def get_cluster_memberships(self, num_clusters) -> pd.DataFrame:
        """
        Generate a table mapping entity IDs to their corresponding cluster IDs.
        Based on this table, users later can link this to the original dataframe
        for further regression models.

        There is a common point of confusion because k is typically used to
        represent the number of clusters in clustering algorithms (e.g., k-means).
        However, SciPy's hierarchical clustering API specifically uses t as the
        parameter name.

        :param num_clusters: The number of clusters to create.
        :return: Pandas DataFrame with entity IDs and cluster memberships.
        :raises ValueError: If the linkage matrix has not been computed.
        """
        if self.linkage_matrix is None:
            raise ValueError("Linkage matrix is not computed.")

        # Cut the dendrogram into exactly `num_clusters` flat clusters
        cluster_labels = fcluster(self.linkage_matrix, t=num_clusters, criterion="maxclust")
        return pd.DataFrame({"Entity ID": self.entity_ids, "Cluster": cluster_labels})

    def get_cluster_distribution(self, num_clusters, weighted=False) -> pd.DataFrame:
        """
        Generate a distribution summary of clusters showing counts, percentages,
        and optionally weighted statistics.

        This function calculates how many entities belong to each cluster and what
        percentage of the total they represent. When weighted=True, it also provides
        weight-based statistics.

        :param num_clusters: The number of clusters to create.
        :param weighted: If True, include weighted statistics in the distribution.
        :return: DataFrame with columns Cluster, Count, Percentage and, when
                 weighted=True, additionally Weight_Sum and Weight_Percentage.
        """
        # Get cluster memberships
        memberships_df = self.get_cluster_memberships(num_clusters)

        # Count entities in each cluster (ordered by cluster id)
        cluster_counts = memberships_df['Cluster'].value_counts().sort_index()

        # Share of all entities falling into each cluster
        total_entities = len(memberships_df)
        cluster_percentages = (cluster_counts / total_entities * 100).round(2)

        # Basic (unweighted) distribution table
        distribution = pd.DataFrame({
            'Cluster': cluster_counts.index,
            'Count': cluster_counts.values,
            'Percentage': cluster_percentages.values
        }).sort_values('Cluster')

        # Add weighted statistics if requested
        if weighted:
            cluster_weights = []
            weighted_percentages = []
            total_weight = np.sum(self.weights)

            for cluster_id in distribution['Cluster']:
                # Row positions of the entities in this cluster (memberships_df has
                # a fresh RangeIndex, so the index doubles as a position into weights)
                cluster_mask = memberships_df['Cluster'] == cluster_id
                cluster_entity_indices = memberships_df.index[cluster_mask]

                # Sum weights for entities in this cluster
                cluster_weight = np.sum(self.weights[cluster_entity_indices])
                cluster_weights.append(cluster_weight)

                # Guard against a zero total weight to avoid division by zero
                weighted_pct = (cluster_weight / total_weight * 100) if total_weight > 0 else 0.0
                weighted_percentages.append(round(weighted_pct, 2))

            distribution['Weight_Sum'] = cluster_weights
            distribution['Weight_Percentage'] = weighted_percentages

        return distribution

    def plot_cluster_distribution(self, num_clusters, save_as=None, title=None,
                                  style="whitegrid", dpi=200, figsize=(10, 6), weighted=False,
                                  show=True):
        """
        Plot the distribution of entities across clusters as a bar chart.

        This visualization shows how many entities belong to each cluster, providing
        insight into the balance and size distribution of the clustering result.
        When weighted=True, displays weight-based percentages.

        :param num_clusters: The number of clusters to create.
        :param save_as: File path to save the plot. If None, the plot will be shown.
        :param title: Title for the plot. If None, a default title will be used.
        :param style: Seaborn style for the plot.
        :param dpi: DPI for saved image.
        :param figsize: Figure size in inches.
        :param weighted: If True, display weighted percentages instead of entity
                         count percentages.
        :param show: Whether to display the figure (default: True). Added for
                     consistency with plot_cqi_scores; existing callers are
                     unaffected.
        :return: The result of save_and_show_results (the figure object).
        """
        # Get cluster distribution data (include weights if needed)
        distribution = self.get_cluster_distribution(num_clusters, weighted=weighted)

        # Set up plot
        sns.set(style=style)
        plt.figure(figsize=figsize)

        # Choose what to plot based on weighted parameter
        if weighted and 'Weight_Sum' in distribution.columns:
            y_column = 'Weight_Sum'
            percentage_column = 'Weight_Percentage'
            ylabel = "Total Weight"
            note_text = "Y-axis shows weight sums; percentages above bars indicate weight-based relative frequency."
        else:
            y_column = 'Count'
            percentage_column = 'Percentage'
            ylabel = "Number of Entities"
            note_text = "Y-axis shows entity counts; percentages above bars indicate their relative frequency."

        # Create bar plot with a more poetic, fresh color palette
        # 'muted', 'pastel', and 'husl' are good options for fresher colors
        ax = sns.barplot(x='Cluster', y=y_column, data=distribution, palette='pastel')

        # Set the Y-axis range to prevent text overflow
        ax.set_ylim(0, distribution[y_column].max() * 1.2)

        # Ensure Y-axis uses integer ticks when plotting raw counts
        if not weighted:
            plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))

        # Add percentage labels on top of bars (bars and rows share the same
        # cluster-id order, so zip pairs them correctly)
        for p, (_, row) in zip(ax.patches, distribution.iterrows()):
            height = p.get_height()
            percentage = row[percentage_column]
            ax.text(p.get_x() + p.get_width() / 2., height + max(height * 0.02, 0.5),
                    f'{percentage:.1f}%', ha="center", fontsize=9)

        # Set a simple label for entity count at the top
        if title is None:
            if weighted:
                title = f"N = {len(self.entity_ids)}, Total Weight = {np.sum(self.weights):.1f}"
            else:
                title = f"N = {len(self.entity_ids)}"

        # Use a lighter, non-bold title style
        plt.title(title, fontsize=12, fontweight="normal", loc='right')

        plt.xlabel("Cluster ID", fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        plt.xticks(fontsize=10)
        plt.yticks(fontsize=10)

        # Ensure integer ticks for cluster IDs
        plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

        # Add grid for better readability but make it lighter
        plt.grid(axis='y', linestyle='--', alpha=0.4)

        # Adjust layout
        plt.tight_layout()

        # Adjust layout to make room for the note
        plt.subplots_adjust(bottom=0.13)

        # Add a note about what is being displayed
        plt.figtext(0.5, 0.01, note_text, ha='center', fontsize=10, style='italic')

        # Save and show the plot (CONSISTENCY FIX: forward `show` and return the
        # result, matching plot_cqi_scores)
        return save_and_show_results(save_as, dpi, show=show)
1277
+
1278
+
1279
+ # NOTE: Kept as a script-level manual test for Xinyi, who cannot debug this in Jupyter;
+ # the PyCharm debugger raised the following error while inspecting this module:
1280
+ # Traceback (most recent call last):
1281
+ # File "/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 736, in make_thread_stack_str
1282
+ # append('file="%s" line="%s">' % (make_valid_xml_value(my_file), lineno))
1283
+ # File "/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/pydev/_pydevd_bundle/pydevd_xml.py", line 36, in make_valid_xml_value
1284
+ # return s.replace("&", "&amp;").replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
1285
+ # AttributeError: 'tuple' object has no attribute 'replace'
1286
+
1287
if __name__ == '__main__':
    # Manual test driver: load the MVAD dataset, build a SequenceData object,
    # compute an OM distance matrix, run Ward clustering, and report
    # cluster-quality metrics.

    from sequenzo import *  # Import the package API (list_datasets, load_dataset, SequenceData, ...)
    import pandas as pd  # Data manipulation
    import numpy as np

    # List all the available datasets in Sequenzo
    print('Available datasets in Sequenzo: ', list_datasets())

    # Load the data that we would like to explore in this tutorial
    # `df` is short for `dataframe`, a common variable name for a dataset
    # df = load_dataset('country_co2_emissions')
    df = load_dataset('mvad')

    # Monthly time-span columns covering Jul.93 - Jun.99
    # (defined once; the original script duplicated this literal)
    time_list = ['Jul.93', 'Aug.93', 'Sep.93', 'Oct.93', 'Nov.93', 'Dec.93',
                 'Jan.94', 'Feb.94', 'Mar.94', 'Apr.94', 'May.94', 'Jun.94', 'Jul.94',
                 'Aug.94', 'Sep.94', 'Oct.94', 'Nov.94', 'Dec.94', 'Jan.95', 'Feb.95',
                 'Mar.95', 'Apr.95', 'May.95', 'Jun.95', 'Jul.95', 'Aug.95', 'Sep.95',
                 'Oct.95', 'Nov.95', 'Dec.95', 'Jan.96', 'Feb.96', 'Mar.96', 'Apr.96',
                 'May.96', 'Jun.96', 'Jul.96', 'Aug.96', 'Sep.96', 'Oct.96', 'Nov.96',
                 'Dec.96', 'Jan.97', 'Feb.97', 'Mar.97', 'Apr.97', 'May.97', 'Jun.97',
                 'Jul.97', 'Aug.97', 'Sep.97', 'Oct.97', 'Nov.97', 'Dec.97', 'Jan.98',
                 'Feb.98', 'Mar.98', 'Apr.98', 'May.98', 'Jun.98', 'Jul.98', 'Aug.98',
                 'Sep.98', 'Oct.98', 'Nov.98', 'Dec.98', 'Jan.99', 'Feb.99', 'Mar.99',
                 'Apr.99', 'May.99', 'Jun.99']

    # Collect every unique state observed across all time columns
    all_unique_states = set()
    for col in time_list:
        unique_vals = df[col].dropna().unique()  # Remove NaN values
        all_unique_states.update(unique_vals)

    # Convert to a sorted list and report what was found in the data
    states = sorted(all_unique_states)
    print("All unique states:")
    for i, state in enumerate(states, 1):
        print(f"{i:2d}. {state}")

    print("\nstates list:")
    print(f"states = {states}")

    # Create a SequenceData object: define the state alphabet explicitly
    # (overrides the data-derived list above) plus human-readable labels
    states = ['FE', 'HE', 'employment', 'joblessness', 'school', 'training']
    labels = ['further education', 'higher education', 'employment', 'joblessness', 'school', 'training']

    # TODO: add parameter validation: if an unknown parameter is passed, raise a helpful error
    # sequence_data = SequenceData(df, time=time, id_col="country", ids=df['country'].values, states=states)

    sequence_data = SequenceData(df,
                                 time=time_list,
                                 id_col="id",
                                 states=states,
                                 labels=labels,
                                 )

    # Optimal-matching distance matrix with a constant substitution cost
    om = get_distance_matrix(sequence_data,
                             method="OM",
                             sm="CONSTANT",
                             indel=1)

    cluster = Cluster(om, sequence_data.ids, clustering_method='ward_d')
    cluster.plot_dendrogram(xlabel="Individuals", ylabel="Distance")

    # Create a ClusterQuality object to evaluate clustering quality
    cluster_quality = ClusterQuality(cluster)
    cluster_quality.compute_cluster_quality_scores()
    cluster_quality.plot_cqi_scores(norm='zscore')
    summary_table = cluster_quality.get_cqi_table()
    print(summary_table)

    table = cluster_quality.get_cluster_range_table()
    # table.to_csv("cluster_quality_table.csv")
    print(table)