sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1256 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : hierarchical_clustering.py
4
+ @Time : 18/12/2024 17:59
5
+ @Desc :
6
+ This module provides a flexible and user-friendly implementation of hierarchical clustering,
7
+ along with tools to evaluate cluster quality and analyze clustering results.
8
+
9
+ It supports common hierarchical clustering methods and evaluation metrics,
10
+ designed for social sequence analysis and other research applications.
11
+
12
+ This module leverages fastcluster, a tool specifically designed to enhance the efficiency of large-scale hierarchical clustering.
13
+ Unlike native Python tools such as SciPy, fastcluster optimizes linkage matrix computations,
14
+ enabling it to handle datasets with millions of entries more efficiently.
15
+
16
+ It has three main components:
17
+ 1. Cluster Class: Performs hierarchical clustering on a precomputed distance matrix.
18
+ 2. ClusterQuality Class: Evaluates the quality of clustering for different numbers of clusters using various metrics.
19
+ 3. ClusterResults Class: Analyzes and visualizes the clustering results (e.g., membership tables and cluster distributions).
20
+
21
+ WEIGHTED CLUSTERING SUPPORT:
22
+ All classes now support weighted data analysis:
23
+ - Cluster: Hierarchical linkage is computed on the given distance matrix (unweighted). Optional weights are applied to evaluation and summaries
24
+ - ClusterQuality: Computes weighted versions of quality metrics (ASWw, HG, R2, HC)
25
+ - ClusterResults: Provides weighted cluster distribution statistics and visualizations
26
+
27
+ Weighted metrics account for sequence importance when calculating clustering quality,
28
+ making the analysis more representative when sequences have different sampling weights
29
+ or population sizes.
30
+
31
+ WARD METHOD VARIANTS:
32
+ The module supports two Ward linkage variants:
33
+ - 'ward_d' (Ward D): Classic Ward method using squared Euclidean distances ÷ 2
34
+ - 'ward_d2' (Ward D2): Ward method using squared Euclidean distances
35
+ For backward compatibility, 'ward' maps to 'ward_d'.
36
+
37
+ The difference affects clustering results and dendrogram heights:
38
+ - Ward D produces smaller distances in the linkage matrix
39
+ - Ward D2 produces distances equal to the increase in cluster variance
40
+ - Both methods produce identical cluster assignments, only distances differ
41
+
42
+ ROBUSTNESS AND VALIDATION FEATURES:
43
+ - Ward Method Validation: Automatic detection of non-Euclidean distance matrices
44
+ - One-time Warning System: Alerts users when Ward methods are used with potentially incompatible distances
45
+ - Robust Matrix Cleanup: Handles NaN/Inf values using 95th percentile replacement
46
+ - Distance Matrix Validation: Ensures zero diagonal and non-negativity
47
+ - Symmetry Handling: Automatically symmetrizes matrices when required by clustering algorithms
48
+ - Method Recommendations: Suggests alternative methods for sequence distances
49
+
50
+ For sequence distances (OM, LCS, etc.), Ward linkage methods may produce suboptimal results.
51
+ Consider using alternative methods like 'average' (UPGMA) for better theoretical validity.
52
+
53
+ Original code references:
54
+ Cluster(): Derived from `hclust`, a key function from fastcluster
55
+ R code: https://github.com/cran/fastcluster/blob/master/R/fastcluster.R
56
+ Python code: https://github.com/fastcluster/fastcluster/blob/master/src/fastcluster.cpp
57
+ The Python version of fastcluster does not support Ward D method but only Ward D2, whereas R supports both.
58
+ Thus, we provide Ward D by ourselves here.
59
+
60
+ ClusterQuality(): Derived from ``, a key function from weightedcluster
61
+ CQI equivalence of R is here (two files):
62
+ https://github.com/cran/WeightedCluster/blob/master/src/clusterquality.cpp
63
+ https://github.com/cran/WeightedCluster/blob/master/src/clusterqualitybody.cpp
64
+ plot_cqi_scores(): `wcCmpCluster()` produces `clustrangefamily` object + `plot.clustrangefamily()` for plotting
65
+ """
66
+ import matplotlib.pyplot as plt
67
+ import seaborn as sns
68
+ import warnings
69
+ from matplotlib.ticker import MaxNLocator
70
+
71
+ import pandas as pd
72
+ import numpy as np
73
+ from scipy.cluster.hierarchy import fcluster, dendrogram
74
+ from scipy.spatial.distance import squareform
75
+ # sklearn metrics no longer needed - using C++ implementation
76
+ # Import from sequenzo_fastcluster (our custom fastcluster with ward_d and ward_d2 support)
77
+ try:
78
+ from sequenzo.clustering.sequenzo_fastcluster.fastcluster import linkage
79
+ except ImportError:
80
+ # Fallback: try absolute import
81
+ try:
82
+ from sequenzo_fastcluster.fastcluster import linkage
83
+ except ImportError:
84
+ # Last resort: try relative import
85
+ from .sequenzo_fastcluster.fastcluster import linkage
86
+
87
+ # Import C++ cluster quality functions
88
+ try:
89
+ from . import clustering_c_code
90
+ _CPP_AVAILABLE = True
91
+ except ImportError:
92
+ _CPP_AVAILABLE = False
93
+ print("[!] Warning: C++ cluster quality functions not available. Using Python fallback.")
94
+
95
+
96
+ # Corrected imports: Use relative imports *within* the package.
97
+ from sequenzo.visualization.utils import save_and_show_results
98
+
99
+ # Global flag to ensure Ward warning is only shown once per session
100
+ _WARD_WARNING_SHOWN = False
101
+
102
+ def _check_euclidean_compatibility(matrix, method):
103
+ """
104
+ Check if a distance matrix is likely compatible with Euclidean-based methods like Ward.
105
+
106
+ This performs heuristic checks rather than exact validation since perfect validation
107
+ would be computationally expensive for large matrices.
108
+
109
+ Parameters:
110
+ -----------
111
+ matrix : np.ndarray
112
+ Distance matrix to check
113
+ method : str
114
+ Clustering method name
115
+
116
+ Returns:
117
+ --------
118
+ bool
119
+ True if matrix appears Euclidean-compatible, False otherwise
120
+ """
121
+ # Check for Ward methods (both Ward D and Ward D2 require Euclidean distances)
122
+ if method.lower() not in ["ward", "ward_d", "ward_d2"]:
123
+ return True # Other methods don't require Euclidean distances
124
+
125
+ # Basic checks for Euclidean properties
126
+ n = matrix.shape[0]
127
+
128
+ # Check 1: Triangle inequality violations (sample a subset for large matrices)
129
+ sample_size = min(50, n) # Sample up to 50 points for efficiency
130
+ if n > sample_size:
131
+ indices = np.random.choice(n, sample_size, replace=False)
132
+ sample_matrix = matrix[np.ix_(indices, indices)]
133
+ else:
134
+ sample_matrix = matrix
135
+ indices = np.arange(n)
136
+
137
+ sample_n = sample_matrix.shape[0]
138
+ violations = 0
139
+ total_checks = 0
140
+
141
+ # Check triangle inequality: d(i,k) <= d(i,j) + d(j,k)
142
+ for i in range(sample_n):
143
+ for j in range(i + 1, sample_n):
144
+ for k in range(j + 1, sample_n):
145
+ dij = sample_matrix[i, j]
146
+ dik = sample_matrix[i, k]
147
+ djk = sample_matrix[j, k]
148
+
149
+ # Check all three triangle inequalities
150
+ if (dik > dij + djk + 1e-10 or
151
+ dij > dik + djk + 1e-10 or
152
+ djk > dij + dik + 1e-10):
153
+ violations += 1
154
+ total_checks += 1
155
+
156
+ if total_checks > 0:
157
+ violation_rate = violations / total_checks
158
+ if violation_rate > 0.1: # More than 10% violations suggests non-Euclidean
159
+ return False
160
+
161
+ # Check 2: Negative eigenvalues in distance matrix (indicates non-Euclidean)
162
+ # Use double centering to convert distances to inner products
163
+ try:
164
+ # For efficiency, only check this for smaller matrices
165
+ if sample_n <= 100:
166
+ H = np.eye(sample_n) - np.ones((sample_n, sample_n)) / sample_n
167
+ B = -0.5 * H @ (sample_matrix ** 2) @ H
168
+ eigenvals = np.linalg.eigvals(B)
169
+
170
+ # Check if there are significant negative eigenvalues
171
+ negative_eigenvals = eigenvals[eigenvals < -1e-10]
172
+ if len(negative_eigenvals) > 0:
173
+ neg_energy = -np.sum(negative_eigenvals)
174
+ total_energy = np.sum(np.abs(eigenvals))
175
+ if neg_energy / total_energy > 0.1: # > 10% negative energy
176
+ return False
177
+ except np.linalg.LinAlgError:
178
+ # If eigenvalue computation fails, assume potentially problematic
179
+ pass
180
+
181
+ return True
182
+
183
+
184
def _warn_ward_usage_once(matrix, method):
    """
    Issue a one-time warning about using Ward with potentially non-Euclidean distances.

    Runs the Euclidean-compatibility heuristic only for Ward-family methods
    ('ward', 'ward_d', 'ward_d2', case-insensitive) and emits a single
    UserWarning per Python session, tracked via the module-level
    _WARD_WARNING_SHOWN flag. Calls with non-Ward methods return silently.

    Parameters:
    -----------
    matrix : np.ndarray
        Distance matrix that is about to be clustered.
    method : str
        Linkage method name (case-insensitive).
    """
    global _WARD_WARNING_SHOWN

    # Check for both Ward D and Ward D2 methods; skip the (potentially
    # costly) compatibility heuristic once the warning has already fired.
    if not _WARD_WARNING_SHOWN and method.lower() in ["ward", "ward_d", "ward_d2"]:
        if not _check_euclidean_compatibility(matrix, method):
            warnings.warn(
                "\n[!] Ward linkage method detected with potentially non-Euclidean distance matrix!\n"
                " Ward clustering (both Ward D and Ward D2) assumes Euclidean distances for theoretical validity.\n"
                " \n"
                " Ward method variants:\n"
                " - 'ward_d' (classic): Uses squared Euclidean distances ÷ 2\n"
                " - 'ward_d2': Uses squared Euclidean distances\n"
                " \n"
                " For sequence distances (OM, LCS, etc.), consider using:\n"
                " - method='average' (UPGMA)\n"
                " - method='complete' (complete linkage)\n"
                " - method='single' (single linkage)\n"
                " \n"
                " Note: 'centroid' and 'median' methods may also produce inversions\n"
                " (non-monotonic dendrograms) with non-Euclidean distances.\n"
                " \n"
                " This warning is shown only once per session.",
                UserWarning,
                stacklevel=3  # attribute the warning to the user's call site
            )
            # Remember that the warning fired so it is not repeated this
            # session. NOTE(review): flag appears to be set only when a
            # warning actually fires, so compatible matrices keep being
            # re-checked on later Ward calls — confirm this is intended.
            _WARD_WARNING_SHOWN = True
214
+
215
+
216
+ def _clean_distance_matrix(matrix):
217
+ """
218
+ Clean and validate a distance matrix for hierarchical clustering.
219
+
220
+ This function:
221
+ 1. Handles NaN/Inf values using robust percentile-based replacement
222
+ 2. Sets diagonal to zero
223
+ 3. Ensures non-negativity
224
+
225
+ Note: Symmetry is NOT enforced at this stage since distance matrices may legitimately
226
+ be asymmetric (e.g., directed sequence distances, time-dependent measures, etc.).
227
+ However, symmetrization will be performed later in linkage computation when required
228
+ by clustering algorithms.
229
+
230
+ Parameters:
231
+ -----------
232
+ matrix : np.ndarray
233
+ Input distance matrix
234
+
235
+ Returns:
236
+ --------
237
+ np.ndarray
238
+ Cleaned distance matrix
239
+ """
240
+ matrix = matrix.copy() # Don't modify the original
241
+
242
+ # Step 1: Handle NaN/Inf values with percentile-based replacement
243
+ if np.any(np.isnan(matrix)) or np.any(np.isinf(matrix)):
244
+ print("[!] Warning: Distance matrix contains NaN or Inf values.")
245
+
246
+ # Get finite values for percentile calculation
247
+ finite_vals = matrix[np.isfinite(matrix)]
248
+
249
+ if len(finite_vals) > 0:
250
+ # Use 95th percentile as replacement value (more conservative than max)
251
+ replacement_val = np.percentile(finite_vals, 95)
252
+ print(f" Replacing with 95th percentile value: {replacement_val:.6f}")
253
+ else:
254
+ # If no finite values, use 1.0 as default
255
+ replacement_val = 1.0
256
+ print(f" No finite values found, using default: {replacement_val}")
257
+
258
+ matrix[~np.isfinite(matrix)] = replacement_val
259
+
260
+ # Step 2: Force diagonal to be exactly zero (self-distance should be zero)
261
+ np.fill_diagonal(matrix, 0.0)
262
+
263
+ # Step 3: Ensure non-negativity (distance matrices should be non-negative)
264
+ if np.any(matrix < 0):
265
+ print("[!] Warning: Distance matrix contains negative values. Clipping to zero...")
266
+ matrix = np.maximum(matrix, 0.0)
267
+
268
+ return matrix
269
+
270
+
271
+ def _hclust_to_linkage_matrix(linkage_matrix):
272
+ """
273
+ Convert an R `hclust` object to a SciPy-compatible linkage matrix.
274
+
275
+ This function takes an `hclust` object returned by R (e.g., from
276
+ `fastcluster::hclust`) and converts it into the standard linkage matrix
277
+ format used by SciPy (`scipy.cluster.hierarchy.linkage`), which can be
278
+ used for dendrogram plotting or further clustering analysis in Python.
279
+
280
+ Parameters
281
+ ----------
282
+ linkage_matrix : rpy2.robjects.ListVector
283
+ An R `hclust` object. Expected to contain at least the following fields:
284
+ - 'merge': ndarray of shape (n-1, 2), indicating which clusters are merged
285
+ at each step (negative indices for original observations,
286
+ positive indices for previously merged clusters).
287
+ - 'height': ndarray of shape (n-1,), distances at which merges occur.
288
+ - 'order': ordering of the leaves.
289
+
290
+ Returns
291
+ -------
292
+ Z : numpy.ndarray, shape (n-1, 4), dtype=float
293
+ A SciPy-compatible linkage matrix where each row represents a merge:
294
+ - Z[i, 0] : index of the first cluster (0-based)
295
+ - Z[i, 1] : index of the second cluster (0-based)
296
+ - Z[i, 2] : distance between the merged clusters
297
+ - Z[i, 3] : total number of original samples in the newly formed cluster
298
+
299
+ Notes
300
+ -----
301
+ - The conversion handles the difference in indexing:
302
+ - In R's `hclust`, negative numbers in 'merge' indicate original samples
303
+ and positive numbers indicate previously merged clusters (1-based).
304
+ - In the returned SciPy linkage matrix, all indices are converted to 0-based.
305
+ - The function iteratively tracks cluster sizes to populate the fourth column
306
+ (sample counts) required by SciPy.
307
+ """
308
+
309
+ n = len(linkage_matrix.rx2("order")) # 样本数
310
+ merge = np.array(linkage_matrix.rx2("merge"), dtype=int) # (n-1, 2)
311
+ height = np.array(linkage_matrix.rx2("height"), dtype=float)
312
+
313
+ cluster_sizes = np.ones(n, dtype=int) # 单个样本初始大小 = 1
314
+ Z = np.zeros((n - 1, 4), dtype=float)
315
+
316
+ for i in range(n - 1):
317
+ a, b = merge[i]
318
+
319
+ # R hclust 编号负数表示原始样本
320
+ if a < 0:
321
+ idx1 = -a - 1 # 转成 0-based
322
+ size1 = 1
323
+ else:
324
+ idx1 = n + a - 1 # 已合并簇,0-based
325
+ size1 = cluster_sizes[idx1]
326
+
327
+ if b < 0:
328
+ idx2 = -b - 1
329
+ size2 = 1
330
+ else:
331
+ idx2 = n + b - 1
332
+ size2 = cluster_sizes[idx2]
333
+
334
+ Z[i, 0] = idx1
335
+ Z[i, 1] = idx2
336
+ Z[i, 2] = height[i]
337
+ Z[i, 3] = size1 + size2
338
+
339
+ # 更新 cluster_sizes,用于后续簇
340
+ cluster_sizes = np.append(cluster_sizes, size1 + size2)
341
+
342
+ return Z
343
+
344
+ class Cluster:
345
+ def __init__(self,
346
+ matrix,
347
+ entity_ids,
348
+ clustering_method="ward",
349
+ weights=None):
350
+ """
351
+ A class to handle hierarchical clustering operations using fastcluster for improved performance.
352
+
353
+ :param matrix: Precomputed distance matrix (full square form).
354
+ :param entity_ids: List of IDs corresponding to the entities in the matrix.
355
+ :param clustering_method: Clustering algorithm to use. Options include:
356
+ - "ward" or "ward_d": Classic Ward method (squared Euclidean distances ÷ 2) [default]
357
+ - "ward_d2": Ward method with squared Euclidean distances
358
+ - "single": Single linkage (minimum method)
359
+ - "complete": Complete linkage (maximum method)
360
+ - "average": Average linkage (UPGMA)
361
+ - "centroid": Centroid linkage
362
+ - "median": Median linkage
363
+ :param weights: Optional array of weights for each entity (default: None for equal weights).
364
+ """
365
+ # Ensure entity_ids is a numpy array for consistent processing
366
+ self.entity_ids = np.array(entity_ids)
367
+
368
+ # Check if entity_ids is valid
369
+ if len(self.entity_ids) != len(matrix):
370
+ raise ValueError("Length of entity_ids must match the size of the matrix.")
371
+
372
+ # Optional: Check uniqueness of entity_ids
373
+ if len(np.unique(self.entity_ids)) != len(self.entity_ids):
374
+ raise ValueError("entity_ids must contain unique values.")
375
+
376
+ # Initialize and validate weights
377
+ if weights is not None:
378
+ self.weights = np.array(weights, dtype=np.float64)
379
+ if len(self.weights) != len(matrix):
380
+ raise ValueError("Length of weights must match the size of the matrix.")
381
+ if np.any(self.weights < 0):
382
+ raise ValueError("All weights must be non-negative.")
383
+ if np.sum(self.weights) == 0:
384
+ raise ValueError("Sum of weights must be greater than zero.")
385
+ else:
386
+ # Default to equal weights (all ones)
387
+ self.weights = np.ones(len(matrix), dtype=np.float64)
388
+
389
+ # Convert matrix to numpy array if it's a DataFrame
390
+ if isinstance(matrix, pd.DataFrame):
391
+ print("[>] Converting DataFrame to NumPy array...")
392
+ self.full_matrix = matrix.values
393
+ else:
394
+ self.full_matrix = matrix
395
+
396
+ # Verify matrix is in square form
397
+ if len(self.full_matrix.shape) != 2 or self.full_matrix.shape[0] != self.full_matrix.shape[1]:
398
+ raise ValueError("Input must be a full square-form distance matrix.")
399
+
400
+ self.clustering_method = clustering_method.lower()
401
+
402
+ # Supported clustering methods
403
+ supported_methods = ["ward", "ward_d", "ward_d2", "single", "complete", "average", "centroid", "median"]
404
+ if self.clustering_method not in supported_methods:
405
+ raise ValueError(
406
+ f"Unsupported clustering method '{clustering_method}'. Supported methods: {supported_methods}")
407
+
408
+ # Handle backward compatibility: 'ward' maps to 'ward_d' (classic Ward method)
409
+ if self.clustering_method == "ward":
410
+ self.clustering_method = "ward_d"
411
+ print("[>] Note: 'ward' method maps to 'ward_d' (classic Ward method).")
412
+ print(" Use 'ward_d2' for Ward method with squared Euclidean distances.")
413
+
414
+ # Compute linkage matrix using fastcluster
415
+ self.linkage_matrix = self._compute_linkage()
416
+
417
+ def _compute_linkage(self):
418
+ """
419
+ Compute the linkage matrix using fastcluster for improved performance.
420
+ Supports both Ward D (classic) and Ward D2 methods.
421
+ """
422
+ # Clean and validate the distance matrix using robust methods
423
+ self.full_matrix = _clean_distance_matrix(self.full_matrix)
424
+
425
+ # Check Ward compatibility and issue one-time warning if needed
426
+ _warn_ward_usage_once(self.full_matrix, self.clustering_method)
427
+
428
+ # Check symmetry before converting to condensed form
429
+ # squareform() requires symmetric matrices
430
+ if not np.allclose(self.full_matrix, self.full_matrix.T, rtol=1e-5, atol=1e-8):
431
+ print("[!] Warning: Distance matrix is not symmetric.")
432
+ print(" Hierarchical clustering algorithms require symmetric distance matrices.")
433
+ print(" Automatically symmetrizing using (matrix + matrix.T) / 2")
434
+ print(" If this is not appropriate for your data, please provide a symmetric matrix.")
435
+ self.full_matrix = (self.full_matrix + self.full_matrix.T) / 2
436
+
437
+ # Convert square matrix to condensed form
438
+ self.condensed_matrix = squareform(self.full_matrix)
439
+
440
+ # Map our method names to fastcluster's expected method names
441
+ fastcluster_method = self._map_method_name(self.clustering_method)
442
+
443
+ linkage_matrix = linkage(self.condensed_matrix, method=fastcluster_method)
444
+
445
+ return linkage_matrix
446
+
447
+ def _map_method_name(self, method):
448
+ """
449
+ Map our internal method names to fastcluster's expected method names.
450
+ """
451
+ method_mapping = {
452
+ "ward_d": "ward", # Classic Ward (will be corrected later) (updated: it was solved on Nov.15, 2025 by Xinyi)
453
+ "ward_d2": "ward_d2", # Ward D2 (no correction needed)
454
+ "single": "single",
455
+ "complete": "complete",
456
+ "average": "average",
457
+ "centroid": "centroid",
458
+ "median": "median"
459
+ }
460
+ return method_mapping.get(method, method)
461
+
462
+ def _apply_ward_d_correction(self, linkage_matrix):
463
+ """
464
+ Apply Ward D correction by dividing distances by 2.
465
+ This converts Ward D2 results to classic Ward D results.
466
+ """
467
+ linkage_corrected = linkage_matrix.copy()
468
+ linkage_corrected[:, 2] = linkage_corrected[:, 2] / 2.0
469
+ print("[>] Applied Ward D correction: distances divided by 2 for classic Ward method.")
470
+ return linkage_corrected
471
+
472
+ def plot_dendrogram(self,
473
+ save_as=None,
474
+ style="whitegrid",
475
+ title="Dendrogram",
476
+ xlabel="Entities",
477
+ ylabel="Distance",
478
+ grid=False,
479
+ dpi=200,
480
+ figsize=(12, 8)):
481
+ """
482
+ Plot a dendrogram of the hierarchical clustering with optional high-resolution output.
483
+
484
+ :param save_as: File path to save the plot. If None, the plot will be shown.
485
+ :param style: Seaborn style for the plot.
486
+ :param title: Title of the plot.
487
+ :param xlabel: X-axis label.
488
+ :param ylabel: Y-axis label.
489
+ :param grid: Whether to display grid lines.
490
+ :param dpi: Dots per inch for the saved image (default: 300 for high resolution).
491
+ :param figsize: Tuple specifying the figure size in inches (default: (12, 8)).
492
+ """
493
+ if self.linkage_matrix is None:
494
+ raise ValueError("Linkage matrix is not computed.")
495
+
496
+ sns.set(style=style)
497
+ plt.figure(figsize=figsize)
498
+ dendrogram(self.linkage_matrix, labels=None) # Do not plot labels for large datasets
499
+ plt.xticks([])
500
+ plt.title(title, fontsize=14, fontweight="bold")
501
+ plt.xlabel(xlabel)
502
+ plt.ylabel(ylabel)
503
+ if not grid:
504
+ plt.grid(False)
505
+
506
+ save_and_show_results(save_as, dpi=200)
507
+
508
+ def get_cluster_labels(self, num_clusters):
509
+ """
510
+ Get cluster labels for a specified number of clusters.
511
+
512
+ There is a common point of confusion because
513
+ k is typically used to represent the number of clusters in clustering algorithms (e.g., k-means).
514
+
515
+ However, SciPy's hierarchical clustering API specifically uses t as the parameter name.
516
+
517
+ :param num_clusters: The number of clusters to create.
518
+ :return: Array of cluster labels corresponding to entity_ids.
519
+ """
520
+ if self.linkage_matrix is None:
521
+ raise ValueError("Linkage matrix is not computed.")
522
+
523
+ cluster_labels = fcluster(self.linkage_matrix, t=num_clusters, criterion="maxclust")
524
+
525
+ return cluster_labels
526
+
527
+
528
class ClusterQuality:
    def __init__(self, matrix_or_cluster, max_clusters=20, clustering_method=None, weights=None):
        """
        Initialize the ClusterQuality class for precomputed distance matrices or a Cluster instance.

        Allow the ClusterQuality class to directly accept a Cluster instance
        and internally extract the relevant matrix (cluster.full_matrix)
        and clustering method (cluster.clustering_method).

        This keeps the user interface clean and simple while handling the logic under the hood.

        :param matrix_or_cluster: The precomputed distance matrix (full square form or condensed form)
                                  or an instance of the Cluster class.
        :param max_clusters: Maximum number of clusters to evaluate (default: 20).
        :param clustering_method: Clustering algorithm to use. If None, inherit from Cluster instance.
        :param weights: Optional array of weights for each entity. If None and using Cluster instance,
                        weights will be extracted from the Cluster object.
        :raises ValueError: If the input is neither a Cluster nor an array/DataFrame,
                            weights are inconsistent, or the matrix is not square.
        """
        if isinstance(matrix_or_cluster, Cluster):
            # Extract matrix, clustering method, and weights from the Cluster instance.
            # The Cluster already computed its linkage matrix, so reuse it as-is.
            self.matrix = matrix_or_cluster.full_matrix
            self.clustering_method = matrix_or_cluster.clustering_method
            self.linkage_matrix = matrix_or_cluster.linkage_matrix
            self.weights = matrix_or_cluster.weights

        elif isinstance(matrix_or_cluster, (np.ndarray, pd.DataFrame)):
            # Handle direct matrix input
            if isinstance(matrix_or_cluster, pd.DataFrame):
                print("[>] Detected Pandas DataFrame. Converting to NumPy array...")
                matrix_or_cluster = matrix_or_cluster.values
            self.matrix = matrix_or_cluster
            self.clustering_method = clustering_method or "ward_d"  # Default to classic Ward

            # Initialize weights for direct matrix input
            if weights is not None:
                self.weights = np.array(weights, dtype=np.float64)
                if len(self.weights) != len(self.matrix):
                    raise ValueError("Length of weights must match the size of the matrix.")
            else:
                self.weights = np.ones(len(self.matrix), dtype=np.float64)

            # Compute linkage matrix for direct input (needed for clustering operations)
            self.linkage_matrix = self._compute_linkage_for_direct_input()

        else:
            raise ValueError(
                "Input must be a Cluster instance, a NumPy array, or a Pandas DataFrame."
            )

        if self.matrix.shape[0] != self.matrix.shape[1]:
            raise ValueError("Matrix must be a full square-form distance matrix.")

        self.max_clusters = max_clusters
        # Metric names, in the same order the C++ backend / R WeightedCluster report them.
        self.metric_order = [
            "PBC",
            "HG",
            "HGSD",
            "ASW",
            "ASWw",
            "CH",
            "R2",
            "CHsq",
            "R2sq",
            "HC",
        ]
        # One score list per metric; index 0 corresponds to k=2 clusters.
        self.scores = {metric: [] for metric in self.metric_order}

        # Store original scores separately to preserve raw values
        # (self.scores may later be normalized in place by _normalize_scores).
        self.original_scores = None

    def _compute_linkage_for_direct_input(self):
        """
        Compute linkage matrix for direct matrix input (similar to Cluster class logic).
        Supports both Ward D and Ward D2 methods.

        :return: SciPy-format linkage matrix (n-1, 4).
        :raises RuntimeError: If linkage computation fails for the given method/matrix.
        """
        # Handle backward compatibility: 'ward' maps to 'ward_d'
        if self.clustering_method == "ward":
            self.clustering_method = "ward_d"
            print("[>] Note: 'ward' method maps to 'ward_d' (classic Ward method).")
            print(" Use 'ward_d2' for Ward method with squared Euclidean distances.")

        # Clean and validate the distance matrix using robust methods
        self.matrix = _clean_distance_matrix(self.matrix)

        # Check Ward compatibility and issue one-time warning if needed
        _warn_ward_usage_once(self.matrix, self.clustering_method)

        # Check symmetry before converting to condensed form
        # squareform() requires symmetric matrices
        if not np.allclose(self.matrix, self.matrix.T, rtol=1e-5, atol=1e-8):
            print("[!] Warning: Distance matrix is not symmetric.")
            print(" Hierarchical clustering algorithms require symmetric distance matrices.")
            print(" Automatically symmetrizing using (matrix + matrix.T) / 2")
            print(" If this is not appropriate for your data, please provide a symmetric matrix.")
            self.matrix = (self.matrix + self.matrix.T) / 2

        # Convert square matrix to condensed form for linkage computation
        condensed_matrix = squareform(self.matrix)

        try:
            # Map our method names to fastcluster's expected method names
            fastcluster_method = self._map_method_name(self.clustering_method)
            linkage_matrix = linkage(condensed_matrix, method=fastcluster_method)

            # Apply Ward D correction if needed
            # NOTE(review): this correction is applied here but NOT in
            # Cluster._compute_linkage — confirm the two paths are meant to differ.
            if self.clustering_method == "ward_d":
                linkage_matrix = self._apply_ward_d_correction(linkage_matrix)

        except Exception as e:
            raise RuntimeError(
                f"Failed to compute linkage with method '{self.clustering_method}'. "
                "Check that the distance matrix is square, symmetric, finite, non-negative, and has a zero diagonal. "
                "For sequence distances, consider using 'average', 'complete', or 'single' instead of Ward methods. "
                f"Original error: {e}"
            )
        return linkage_matrix

    def _map_method_name(self, method):
        """
        Map our internal method names to fastcluster's expected method names.

        NOTE(review): here 'ward_d2' maps to 'ward', whereas
        Cluster._map_method_name passes 'ward_d2' through unchanged — verify
        which mapping the bundled fastcluster expects; the two classes may
        produce different heights for the same method name.
        """
        method_mapping = {
            "ward_d": "ward",  # Classic Ward (will be corrected later)
            "ward_d2": "ward",  # Ward D2 (no correction needed)
            "single": "single",
            "complete": "complete",
            "average": "average",
            "centroid": "centroid",
            "median": "median"
        }
        return method_mapping.get(method, method)

    def _apply_ward_d_correction(self, linkage_matrix):
        """
        Apply Ward D correction by dividing distances by 2.
        This converts Ward D2 results to classic Ward D results.

        :param linkage_matrix: SciPy-format linkage matrix.
        :return: Copy with the distance column (index 2) halved.
        """
        linkage_corrected = linkage_matrix.copy()
        linkage_corrected[:, 2] = linkage_corrected[:, 2] / 2.0
        print("[>] Applied Ward D correction: distances divided by 2 for classic Ward method.")
        return linkage_corrected

    def compute_cluster_quality_scores(self):
        """
        Compute clustering quality scores for different numbers of clusters.

        Uses C++ implementation for accuracy and performance.
        This implementation aligns with R WeightedCluster package results.

        :raises RuntimeError: If the C++ extension is unavailable or fails.
        """
        if not _CPP_AVAILABLE:
            raise RuntimeError(
                "C++ cluster quality implementation is not available. "
                "Please ensure the C++ extensions are properly compiled."
            )
        self._compute_cluster_quality_scores_cpp()

        # Save original scores immediately after computation, before any
        # normalization can overwrite self.scores in place.
        self.original_scores = {}
        for metric, values in self.scores.items():
            self.original_scores[metric] = np.array(values).copy()

    def _compute_cluster_quality_scores_cpp(self):
        """
        Compute clustering quality scores using C++ implementation (matches R WeightedCluster).

        Appends one value per metric to self.scores for each k in [2, max_clusters].
        :raises RuntimeError: If the C++ call fails for any k.
        """
        # Convert matrix to format expected by C++
        # Ensure we have a full square matrix
        if self.matrix.shape[0] != self.matrix.shape[1]:
            raise ValueError("Matrix must be square for C++ implementation")

        # Convert to condensed once to reduce per-call overhead in C++
        condensed = squareform(self.matrix)

        for k in range(2, self.max_clusters + 1):
            # Get cluster labels (fcluster returns 1-based labels, which C++ expects)
            labels = fcluster(self.linkage_matrix, k, criterion="maxclust")

            try:
                # Call C++ function (condensed) - expects 1-based labels
                result = clustering_c_code.cluster_quality_condensed(
                    condensed.astype(np.float64, copy=False),
                    labels.astype(np.int32, copy=False),
                    self.weights.astype(np.float64, copy=False),
                    self.matrix.shape[0],
                    k
                )

                # Extract results from C++ (mapping to match R WeightedCluster exactly)
                for metric in self.metric_order:
                    self.scores[metric].append(result.get(metric, np.nan))

            except Exception as e:
                print(f"[!] Error: C++ computation failed for k={k}: {e}")
                print(" Python fallback has been removed due to accuracy issues.")
                # Insert NaN values for failed computation
                for metric in self.metric_order:
                    self.scores[metric].append(np.nan)
                raise RuntimeError(f"C++ cluster quality computation failed for k={k}. "
                                   "Python fallback is not available.")

    def _compute_cluster_quality_scores_python(self):
        """
        Python fallback implementation has been removed.
        Only C++ implementation is available for accuracy and performance.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Python cluster quality implementation has been removed due to accuracy issues. "
            "Please use C++ implementation by setting use_cpp=True (default)."
        )

    def _normalize_scores(self, method="zscore") -> None:
        """
        Normalize each metric independently.

        Mutates self.scores in place; raw values survive only in
        self.original_scores (if compute_cluster_quality_scores saved them).

        :param method: Normalization method. Options are "zscore" or "range".
        """
        for metric in self.scores:
            values = np.array(self.scores[metric])
            if method == "zscore":
                mean_val = np.nanmean(values)
                std_val = np.nanstd(values)
                # Constant metrics (std == 0) are left unnormalized.
                if std_val > 0:
                    self.scores[metric] = (values - mean_val) / std_val
            elif method == "range":
                min_val = np.nanmin(values)
                max_val = np.nanmax(values)
                if max_val > min_val:
                    self.scores[metric] = (values - min_val) / (max_val - min_val)

    def get_cluster_range_table(self) -> pd.DataFrame:
        """
        Return a metrics-by-cluster table mirroring R's `as.clustrange()` output.

        :return: DataFrame indexed by cluster count ("cluster2", ...)
                 with raw metric values for each quality indicator.
        :raises ValueError: If scores are empty or inconsistent in length.
        """
        # Prefer preserved raw scores to avoid normalization side-effects
        if self.original_scores is not None:
            scores_to_use = self.original_scores
        else:
            scores_to_use = self.scores

        # Ensure metrics are available
        if not scores_to_use or not any(len(scores_to_use[m]) for m in self.metric_order):
            raise ValueError("Cluster quality scores are empty. Run `compute_cluster_quality_scores()` first.")

        # Determine number of evaluated cluster counts
        lengths = [len(scores_to_use[metric]) for metric in self.metric_order if metric in scores_to_use]
        if not lengths:
            raise ValueError("No recognized metrics found in scores.")

        if len(set(lengths)) != 1:
            raise ValueError("Inconsistent metric lengths detected. Please recompute cluster quality scores.")

        n_rows = lengths[0]
        if n_rows == 0:
            raise ValueError("Cluster quality scores contain no entries.")

        # Build DataFrame matching R output ordering
        data = {}
        for metric in self.metric_order:
            values = scores_to_use.get(metric)
            if values is None:
                continue
            data[metric] = np.array(values, dtype=np.float64)

        # Row i corresponds to k = i + 2 clusters.
        index_labels = [f"cluster{k}" for k in range(2, 2 + n_rows)]
        table = pd.DataFrame(data, index=index_labels)
        table.index.name = "Cluster"

        return table

    def get_cqi_table(self):
        """
        Generate a summary table of clustering quality indicators with concise column names.

        :return: Pandas DataFrame summarizing the optimal number of clusters (N groups),
                 the corresponding raw metric values, and z-score normalized values.
        """
        # Use original scores if available, otherwise fall back to current scores
        if self.original_scores is not None:
            scores_to_use = self.original_scores
        else:
            scores_to_use = self.scores

        # Deep copy to avoid overwriting during normalization
        original_scores = {}
        for metric, values in scores_to_use.items():
            original_scores[metric] = np.array(values).copy()

        # Create temporary copy for z-score normalization
        temp_scores = {}
        for metric, values in original_scores.items():
            temp_scores[metric] = values.copy()

        # Apply z-score normalization to temp copy
        zscore_normalized = {}
        for metric in temp_scores:
            values = temp_scores[metric]
            mean_val = np.nanmean(values)
            std_val = np.nanstd(values)
            if std_val > 0:
                zscore_normalized[metric] = (values - mean_val) / std_val
            else:
                zscore_normalized[metric] = values.copy()

        # Generate summary table (removed redundant Min-Max Norm column)
        summary = {
            "Metric": [],
            "Opt. Clusters": [],  # Abbreviated from "Optimal Clusters"
            "Raw Value": [],  # Raw optimal value (not normalized)
            "Z-Score Norm.": [],  # Z-Score normalized optimal value
        }

        # Get maximum value and its position from original scores
        for metric in self.metric_order:
            values = original_scores.get(metric)
            if values is None:
                continue

            if np.all(np.isnan(values)):
                optimal_k, raw_value, z_val = np.nan, np.nan, np.nan
            else:
                pos = np.nanargmax(values)
                # Index 0 corresponds to k=2 clusters, hence the +2 offset.
                optimal_k = pos + 2
                raw_value = values[pos]  # Use raw original value
                z_val = zscore_normalized[metric][pos]

            # Add data to the summary table
            summary["Metric"].append(metric)
            summary["Opt. Clusters"].append(optimal_k)
            summary["Raw Value"].append(raw_value)  # Raw value, not normalized
            summary["Z-Score Norm."].append(z_val)

        return pd.DataFrame(summary)

    def plot_cqi_scores(self,
                        metrics_list=None,
                        norm="zscore",
                        palette="husl",
                        line_width=2,
                        style="whitegrid",
                        title=None,
                        xlabel="Number of Clusters",
                        ylabel="Normalized Score",
                        grid=True,
                        save_as=None,
                        dpi=200,
                        figsize=(12, 8),
                        show=True
                        ):
        """
        Plot combined scores for clustering quality indicators with customizable parameters.

        This function displays normalized metric values for easier comparison while preserving
        the original statistical properties in the legend.

        It first calculates raw means and standard deviations from the original data before applying any normalization,
        then uses these raw statistics in the legend labels to provide context about the actual scale and
        distribution of each metric.

        :param metrics_list: List of metrics to plot (default: all available metrics)
        :param norm: Normalization method for plotting ("zscore", "range", or "none")
        :param palette: Color palette for the plot
        :param line_width: Width of plotted lines
        :param style: Seaborn style for the plot
        :param title: Plot title
        :param xlabel: X-axis label
        :param ylabel: Y-axis label
        :param grid: Whether to show grid lines
        :param save_as: File path to save the plot
        :param dpi: DPI for saved image
        :param figsize: Figure size in inches
        :param show: Whether to display the figure (default: True)

        :return: The figure object
        """
        # Store original scores before normalization
        # NOTE(review): this is a shallow dict copy — after _normalize_scores
        # rebinds self.scores[metric] to new arrays, the old lists referenced
        # here keep the raw values, which is what original_stats relies on.
        original_scores = self.scores.copy()

        # Calculate statistics from original data
        original_stats = {}
        for metric in metrics_list or self.metric_order:
            values = np.array(original_scores[metric])
            original_stats[metric] = {
                'mean': np.nanmean(values),
                'std': np.nanstd(values)
            }

        # Apply normalization if requested (mutates self.scores in place)
        if norm != "none":
            self._normalize_scores(method=norm)

        # Set up plot
        sns.set(style=style)
        palette_colors = sns.color_palette(palette, len(metrics_list) if metrics_list else len(self.scores))
        plt.figure(figsize=figsize)

        if metrics_list is None:
            metrics_list = list(self.metric_order)
        else:
            # Silently drop unrecognized metric names.
            metrics_list = [metric for metric in metrics_list if metric in self.metric_order]

        # Plot each metric
        for idx, metric in enumerate(metrics_list):
            values = np.array(self.scores[metric])

            # Use original statistics for legend
            mean_val = original_stats[metric]['mean']
            std_val = original_stats[metric]['std']
            legend_label = f"{metric} ({mean_val:.2f} / {std_val:.2f})"

            plt.plot(
                range(2, self.max_clusters + 1),
                values,
                label=legend_label,
                color=palette_colors[idx],
                linewidth=line_width,
            )

        # Set title and labels
        if title is None:
            title = "Cluster Quality Metrics"

        plt.title(title, fontsize=14, fontweight="bold")
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel(ylabel, fontsize=12)

        # Configure ticks and legend
        plt.xticks(ticks=range(2, self.max_clusters + 1), fontsize=10)
        plt.yticks(fontsize=10)
        plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
        plt.legend(title="Metrics (Raw Mean / Std Dev)", fontsize=10, title_fontsize=12)

        # Add a note about normalization
        norm_note = f"Note: Lines show {norm} normalized values; legend shows raw statistics"
        plt.figtext(0.5, 0.01, norm_note, ha='center', fontsize=10, style='italic')

        # Configure grid
        if grid:
            plt.grid(True, linestyle="--", alpha=0.7)
        else:
            plt.grid(False)

        # Adjust layout to make room for the note
        plt.tight_layout()
        plt.subplots_adjust(bottom=0.1)

        # Save and show the plot
        return save_and_show_results(save_as, dpi, show=show)
978
+
979
+
980
class ClusterResults:
    """
    Post-processing helper for a fitted Cluster object.

    Derives flat cluster memberships from the hierarchical linkage matrix and
    provides tabular and graphical summaries of the resulting cluster sizes.
    """

    def __init__(self, cluster):
        """
        Initialize the ClusterResults class.

        :param cluster: An instance of the Cluster class.
        :raises ValueError: If `cluster` is not a Cluster instance.
        """
        if not isinstance(cluster, Cluster):
            raise ValueError("Input must be an instance of the Cluster class.")

        self.linkage_matrix = cluster.linkage_matrix
        self.entity_ids = cluster.entity_ids  # Retrieve entity IDs from Cluster class
        self.weights = cluster.weights  # Retrieve weights from Cluster class (may be None)

    def get_cluster_memberships(self, num_clusters) -> pd.DataFrame:
        """
        Generate a table mapping entity IDs to their corresponding cluster IDs.
        Based on this table, users later can link this to the original dataframe
        for further regression models.

        There is a common point of confusion because k is typically used to
        represent the number of clusters in clustering algorithms (e.g., k-means).
        However, SciPy's hierarchical clustering API specifically uses t as the
        parameter name.

        :param num_clusters: The number of clusters to create.
        :return: Pandas DataFrame with entity IDs and cluster memberships.
        :raises ValueError: If the linkage matrix is not available.
        """
        if self.linkage_matrix is None:
            raise ValueError("Linkage matrix is not computed.")

        # Cut the dendrogram into exactly `num_clusters` flat clusters
        cluster_labels = fcluster(self.linkage_matrix, t=num_clusters, criterion="maxclust")
        return pd.DataFrame({"Entity ID": self.entity_ids, "Cluster": cluster_labels})

    def get_cluster_distribution(self, num_clusters, weighted=False) -> pd.DataFrame:
        """
        Generate a distribution summary of clusters showing counts, percentages,
        and optionally weighted statistics.

        This function calculates how many entities belong to each cluster and what
        percentage of the total they represent. When weighted=True, it also
        provides weight-based statistics.

        :param num_clusters: The number of clusters to create.
        :param weighted: If True, include weighted statistics in the distribution.
        :return: DataFrame with cluster distribution information.
        :raises ValueError: If weighted=True but no weights are available.
        """
        # Get cluster memberships
        memberships_df = self.get_cluster_memberships(num_clusters)

        # Count entities in each cluster and express counts as percentages
        cluster_counts = memberships_df['Cluster'].value_counts().sort_index()
        total_entities = len(memberships_df)
        cluster_percentages = (cluster_counts / total_entities * 100).round(2)

        # Create basic distribution dataframe
        distribution = pd.DataFrame({
            'Cluster': cluster_counts.index,
            'Count': cluster_counts.values,
            'Percentage': cluster_percentages.values
        }).sort_values('Cluster')

        # Add weighted statistics if requested
        if weighted:
            # Fail fast with a clear message instead of letting np.sum(None)
            # raise an opaque TypeError further down.
            if self.weights is None:
                raise ValueError("Weights are not available; cannot compute weighted statistics.")

            weights = np.asarray(self.weights)  # tolerate list-like weights
            total_weight = np.sum(weights)

            cluster_weights = []
            weighted_percentages = []
            for cluster_id in distribution['Cluster']:
                # Find entities in this cluster (positions in memberships_df,
                # which uses a default RangeIndex aligned with the weights)
                cluster_mask = memberships_df['Cluster'] == cluster_id
                cluster_entity_indices = memberships_df.index[cluster_mask]

                # Sum weights for entities in this cluster
                cluster_weight = np.sum(weights[cluster_entity_indices])
                cluster_weights.append(cluster_weight)

                # Guard against a zero total weight to avoid division by zero
                weighted_pct = (cluster_weight / total_weight * 100) if total_weight > 0 else 0.0
                weighted_percentages.append(round(weighted_pct, 2))

            distribution['Weight_Sum'] = cluster_weights
            distribution['Weight_Percentage'] = weighted_percentages

        return distribution

    def plot_cluster_distribution(self, num_clusters, save_as=None, title=None,
                                  style="whitegrid", dpi=200, figsize=(10, 6), weighted=False,
                                  show=True):
        """
        Plot the distribution of entities across clusters as a bar chart.

        This visualization shows how many entities belong to each cluster,
        providing insight into the balance and size distribution of the
        clustering result. When weighted=True, displays weight-based percentages.

        :param num_clusters: The number of clusters to create.
        :param save_as: File path to save the plot. If None, the plot will be shown.
        :param title: Title for the plot. If None, a default title will be used.
        :param style: Seaborn style for the plot.
        :param dpi: DPI for saved image.
        :param figsize: Figure size in inches.
        :param weighted: If True, display weighted percentages instead of entity
                         count percentages.
        :param show: Whether to display the plot (forwarded to
                     save_and_show_results, matching the other plot methods).
        :return: Whatever save_and_show_results returns.
        """
        # Get cluster distribution data (include weights if needed)
        distribution = self.get_cluster_distribution(num_clusters, weighted=weighted)

        # Set up plot
        sns.set(style=style)
        plt.figure(figsize=figsize)

        # Choose what to plot based on the weighted parameter
        if weighted and 'Weight_Sum' in distribution.columns:
            y_column = 'Weight_Sum'
            percentage_column = 'Weight_Percentage'
            ylabel = "Total Weight"
            note_text = "Y-axis shows weight sums; percentages above bars indicate weight-based relative frequency."
        else:
            y_column = 'Count'
            percentage_column = 'Percentage'
            ylabel = "Number of Entities"
            note_text = "Y-axis shows entity counts; percentages above bars indicate their relative frequency."

        # Create bar plot with a more poetic, fresh color palette
        # 'muted', 'pastel', and 'husl' are good options for fresher colors
        ax = sns.barplot(x='Cluster', y=y_column, data=distribution, palette='pastel')

        # Set the Y-axis range to prevent text overflow
        ax.set_ylim(0, distribution[y_column].max() * 1.2)

        # Ensure Y-axis uses integer ticks for raw entity counts
        if not weighted:
            plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))

        # Add percentage labels on top of bars
        for p, (_, row) in zip(ax.patches, distribution.iterrows()):
            height = p.get_height()
            percentage = row[percentage_column]
            ax.text(p.get_x() + p.get_width() / 2., height + max(height * 0.02, 0.5),
                    f'{percentage:.1f}%', ha="center", fontsize=9)

        # Set a simple label for entity count at the top
        if title is None:
            if weighted:
                title = f"N = {len(self.entity_ids)}, Total Weight = {np.sum(self.weights):.1f}"
            else:
                title = f"N = {len(self.entity_ids)}"

        # Use a lighter, non-bold title style
        plt.title(title, fontsize=12, fontweight="normal", loc='right')

        plt.xlabel("Cluster ID", fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        plt.xticks(fontsize=10)
        plt.yticks(fontsize=10)

        # Ensure integer ticks for cluster IDs
        plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

        # Add grid for better readability but make it lighter
        plt.grid(axis='y', linestyle='--', alpha=0.4)

        # Adjust layout, leaving room for the note at the bottom
        plt.tight_layout()
        plt.subplots_adjust(bottom=0.13)

        # Add a note about what is being displayed
        plt.figtext(0.5, 0.01, note_text, ha='center', fontsize=10, style='italic')

        # Save and show the plot; return the result for consistency with the
        # other plotting methods in this module.
        return save_and_show_results(save_as, dpi, show=show)
1152
+
1153
+
1154
# Kept for Xinyi's testing, since she cannot debug in Jupyter; the traceback
# below shows the PyCharm debugger error she hits there:
+ # Traceback (most recent call last):
1156
+ # File "/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 736, in make_thread_stack_str
1157
+ # append('file="%s" line="%s">' % (make_valid_xml_value(my_file), lineno))
1158
+ # File "/Applications/PyCharm.app/Contents/plugins/python-ce/helpers/pydev/_pydevd_bundle/pydevd_xml.py", line 36, in make_valid_xml_value
1159
+ # return s.replace("&", "&amp;").replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
1160
+ # AttributeError: 'tuple' object has no attribute 'replace'
1161
+
1162
if __name__ == '__main__':
    # Demo / smoke-test script: load the MVAD data, build a SequenceData
    # object, compute an OM distance matrix, cluster it, and report cluster
    # quality. Kept under __main__ so importing this module has no side effects.

    from sequenzo import *  # Import the package
    import pandas as pd  # Data manipulation
    import numpy as np

    # List all the available datasets in Sequenzo
    print('Available datasets in Sequenzo: ', list_datasets())

    # Load the data that we would like to explore in this tutorial.
    # `df` is short for `dataframe`, a common variable name for a dataset.
    # df = load_dataset('country_co2_emissions')
    # df = load_dataset('mvad')
    df = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data/mvad.csv")

    # Monthly time columns spanning Jul.93 - Jun.99 (defined once; the original
    # script duplicated this list verbatim further down).
    time_list = ['Jul.93', 'Aug.93', 'Sep.93', 'Oct.93', 'Nov.93', 'Dec.93',
                 'Jan.94', 'Feb.94', 'Mar.94', 'Apr.94', 'May.94', 'Jun.94', 'Jul.94',
                 'Aug.94', 'Sep.94', 'Oct.94', 'Nov.94', 'Dec.94', 'Jan.95', 'Feb.95',
                 'Mar.95', 'Apr.95', 'May.95', 'Jun.95', 'Jul.95', 'Aug.95', 'Sep.95',
                 'Oct.95', 'Nov.95', 'Dec.95', 'Jan.96', 'Feb.96', 'Mar.96', 'Apr.96',
                 'May.96', 'Jun.96', 'Jul.96', 'Aug.96', 'Sep.96', 'Oct.96', 'Nov.96',
                 'Dec.96', 'Jan.97', 'Feb.97', 'Mar.97', 'Apr.97', 'May.97', 'Jun.97',
                 'Jul.97', 'Aug.97', 'Sep.97', 'Oct.97', 'Nov.97', 'Dec.97', 'Jan.98',
                 'Feb.98', 'Mar.98', 'Apr.98', 'May.98', 'Jun.98', 'Jul.98', 'Aug.98',
                 'Sep.98', 'Oct.98', 'Nov.98', 'Dec.98', 'Jan.99', 'Feb.99', 'Mar.99',
                 'Apr.99', 'May.99', 'Jun.99']

    # Collect every unique state observed across the time columns (NaN excluded)
    all_unique_states = set()
    for col in time_list:
        unique_vals = df[col].dropna().unique()  # Remove NaN values
        all_unique_states.update(unique_vals)

    # Convert to a sorted list for stable, readable output
    states = sorted(all_unique_states)
    print("All unique states:")
    for i, state in enumerate(states, 1):
        print(f"{i:2d}. {state}")

    print(f"\nstates list:")
    print(f"states = {states}")

    # Create a SequenceData object using explicit state codes and
    # human-readable labels (overriding the discovered states above).
    states = ['FE', 'HE', 'employment', 'joblessness', 'school', 'training']
    labels = ['further education', 'higher education', 'employment', 'joblessness', 'school', 'training']

    # TODO: write a try and error: if no such a parameter, then ask to pass the right ones
    # sequence_data = SequenceData(df, time=time, id_col="country", ids=df['country'].values, states=states)

    sequence_data = SequenceData(df,
                                 time=time_list,
                                 id_col="id",
                                 states=states,
                                 labels=labels,
                                 )

    # Optimal Matching distance with a constant substitution cost matrix
    om = get_distance_matrix(sequence_data,
                             method="OM",
                             sm="CONSTANT",
                             indel=1)

    cluster = Cluster(om, sequence_data.ids, clustering_method='ward_d')
    cluster.plot_dendrogram(xlabel="Individuals", ylabel="Distance")

    # Create a ClusterQuality object to evaluate clustering quality
    cluster_quality = ClusterQuality(cluster)
    cluster_quality.compute_cluster_quality_scores()
    cluster_quality.plot_cqi_scores(norm='zscore')
    summary_table = cluster_quality.get_cqi_table()
    print(summary_table)

    table = cluster_quality.get_cluster_range_table()
    # table.to_csv("cluster_quality_table.csv")

    print(table)