sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,496 @@
1
+ #include "cluster_quality.h"
2
+ #include <iostream>
3
+ #include <limits>
4
+ #include <cstring>
5
+
6
+ #ifdef _OPENMP
7
+ #include <omp.h>
8
+ #endif
9
+
10
+ /**
11
+ * Implementation matching R WeightedCluster exactly
12
+ * Based on clusterqualitybody.cpp from R package
13
+ */
14
+
15
+ void resetKendallTree(KendallTree& kendall) {
16
+ for (auto& pair : kendall) {
17
+ pair.second->clustDist0 = 0.0;
18
+ pair.second->clustDist1 = 0.0;
19
+ }
20
+ }
21
+
22
+ void finalizeKendall(KendallTree& kendall) {
23
+ for (auto& pair : kendall) {
24
+ delete pair.second;
25
+ }
26
+ kendall.clear();
27
+ }
28
+
29
+ /**
30
+ * Core function exactly matching R WeightedCluster implementation
31
+ */
32
+ template<bool UseCondensed>
33
+ void compute_cluster_quality_core(const double* diss, const int* cluster, const double* weights,
34
+ int n, double* stats, int nclusters, double* asw,
35
+ KendallTree& kendall) {
36
+
37
+ // Initialize all statistics to NaN
38
+ std::fill(stats, stats + ClusterQualNumStat, std::numeric_limits<double>::quiet_NaN());
39
+ std::fill(asw, asw + 2 * nclusters, std::numeric_limits<double>::quiet_NaN());
40
+
41
+ // Variables following R implementation exactly - use double like R
42
+ double totweights = 0.0, wxy = 0.0, wx = 0.0, wy = 0.0, wx2 = 0.0;
43
+ double ww, xx, covxy, covx, covy, pearson, xb, yb, xw, xxw;
44
+ int ij = 0;
45
+
46
+ // Allocate arrays like R version (0-based indexing)
47
+ std::vector<double> errors(nclusters, 0.0);
48
+ std::vector<double> errors2(nclusters, 0.0);
49
+ std::vector<double> sizes(nclusters, 0.0);
50
+
51
+ // Initialize ASW arrays (output)
52
+ for (int i = 0; i < nclusters; i++) {
53
+ asw[i] = 0.0;
54
+ asw[i + nclusters] = 0.0;
55
+ }
56
+
57
+ // Initialize Kendall tree with zero distance node (like R)
58
+ CmpCluster* ZeroDist;
59
+ auto it_zero = kendall.find(0.0);
60
+ if (it_zero != kendall.end()) {
61
+ ZeroDist = it_zero->second;
62
+ } else {
63
+ ZeroDist = new CmpCluster();
64
+ kendall[0.0] = ZeroDist;
65
+ }
66
+
67
+ // Main computation loop following R version exactly
68
+ if constexpr (UseCondensed) {
69
+ ij = -n; // Condensed version initialization
70
+ }
71
+
72
+ for (int i = 0; i < n; i++) {
73
+ int iclustIndex = cluster[i] - 1; // Convert to 0-based for array access
74
+ if (iclustIndex >= 0 && iclustIndex < nclusters) {
75
+ sizes[iclustIndex] += weights[i];
76
+ }
77
+
78
+ if constexpr (!UseCondensed) {
79
+ ij = i * n; // Full matrix version
80
+ } else {
81
+ ij += n - i - 1; // Condensed version offset
82
+ }
83
+
84
+ if (weights[i] > 0) {
85
+ // Diagonal term (distance to self = 0)
86
+ ww = weights[i] * weights[i];
87
+ wy += ww;
88
+ ZeroDist->clustDist0 += ww;
89
+ totweights += ww;
90
+
91
+ for (int j = i + 1; j < n; j++) {
92
+ if (weights[j] > 0) {
93
+ ww = 2.0 * weights[i] * weights[j]; // Factor of 2 like R
94
+
95
+ if constexpr (UseCondensed) {
96
+ // Use explicit condensed indexing to avoid stride/layout issues
97
+ xx = diss[getCondensedIndex(i, j, n)];
98
+ } else {
99
+ // Full square matrix (row-major) indexing
100
+ xx = diss[ij + j];
101
+ }
102
+
103
+ // Find or create Kendall tree node
104
+ auto it = kendall.find(xx);
105
+ CmpCluster* cmpclust;
106
+ if (it != kendall.end()) {
107
+ cmpclust = it->second;
108
+ } else {
109
+ cmpclust = new CmpCluster();
110
+ kendall[xx] = cmpclust;
111
+ }
112
+
113
+ xw = ww * xx;
114
+ xxw = xw * xx;
115
+ wx += xw;
116
+ wx2 += xxw;
117
+
118
+ if (cluster[i] == cluster[j]) {
119
+ // Same cluster
120
+ if (iclustIndex >= 0 && iclustIndex < nclusters) {
121
+ errors[iclustIndex] += xw;
122
+ errors2[iclustIndex] += xxw; // Add errors2 calculation like R
123
+ }
124
+ wxy += xw;
125
+ wy += ww;
126
+ cmpclust->clustDist0 += ww;
127
+ } else {
128
+ // Different clusters
129
+ cmpclust->clustDist1 += ww;
130
+ }
131
+
132
+ totweights += ww;
133
+ }
134
+ }
135
+ }
136
+ }
137
+
138
+ // Calculate Pearson correlation (HPG) exactly like R
139
+ if (totweights > 0) {
140
+ xb = wx / totweights;
141
+ yb = wy / totweights;
142
+ covx = wx2 / totweights - xb * xb;
143
+ covy = wy / totweights - yb * yb;
144
+ covxy = wxy / totweights - yb * xb;
145
+
146
+ // Debug: Print intermediate values
147
+ #ifdef DEBUG_PBC
148
+ std::cout << "DEBUG PBC: totweights=" << totweights << ", wx=" << wx << ", wy=" << wy << ", wxy=" << wxy << ", wx2=" << wx2 << std::endl;
149
+ std::cout << "DEBUG PBC: xb=" << xb << ", yb=" << yb << std::endl;
150
+ std::cout << "DEBUG PBC: covx=" << covx << ", covy=" << covy << ", covxy=" << covxy << std::endl;
151
+ #endif
152
+
153
+ if (covx > 0 && covy > 0) {
154
+ pearson = covxy / std::sqrt(covx * covy);
155
+ double pbc_value = -1.0 * static_cast<double>(pearson); // Apply negative to get positive PBC
156
+ stats[ClusterQualHPG] = pbc_value;
157
+
158
+ // Debug: Print final calculation
159
+ #ifdef DEBUG_PBC
160
+ std::cout << "DEBUG PBC: pearson=" << pearson << ", pbc_value=" << pbc_value << std::endl;
161
+ #endif
162
+ }
163
+ }
164
+
165
+ // Compute Kendall statistics (HG, HGSD, HC) exactly like R
166
+ double nc = 0.0, nd = 0.0, currentclustdist0 = 0.0, currentclustdist1 = 0.0;
167
+ double totdist0 = wy, totdist1 = totweights - wy, ntiesdist = 0.0;
168
+ double Smin = 0.0, wSmin = wy, Smax = 0.0, wSmax = totdist1, currentww = 0.0;
169
+
170
+ for (auto it = kendall.begin(); it != kendall.end(); ++it) {
171
+ CmpCluster* cmpclust = it->second;
172
+ ww = cmpclust->clustDist1 + cmpclust->clustDist0;
173
+
174
+ if (ww > 0) {
175
+ // Smin calculation
176
+ if (currentww <= wSmin) {
177
+ if (currentww + ww > wSmin) {
178
+ Smin += (wSmin - currentww) * it->first;
179
+ } else {
180
+ Smin += ww * it->first;
181
+ }
182
+ }
183
+ currentww += ww;
184
+
185
+ // Smax calculation
186
+ if (currentww > wSmax) {
187
+ if (currentww - ww < wSmax) {
188
+ Smax += (currentww - wSmax) * it->first;
189
+ } else {
190
+ Smax += ww * it->first;
191
+ }
192
+ }
193
+
194
+ // Count ties
195
+ ntiesdist += cmpclust->clustDist1 * cmpclust->clustDist0;
196
+
197
+ // Concordant and discordant pairs - exactly like R
198
+ nc += cmpclust->clustDist1 * currentclustdist0; // Bottom of table
199
+ nd += cmpclust->clustDist0 * currentclustdist1;
200
+
201
+ // Update running totals
202
+ currentclustdist0 += cmpclust->clustDist0;
203
+ currentclustdist1 += cmpclust->clustDist1;
204
+
205
+ // Top of table
206
+ nc += cmpclust->clustDist0 * (totdist1 - currentclustdist1);
207
+ nd += cmpclust->clustDist1 * (totdist0 - currentclustdist0);
208
+ }
209
+ }
210
+
211
+ // Compute final Kendall statistics (guard divisions to avoid NaN while matching R behavior)
212
+ double denom_hg = (nc + nd);
213
+ if (denom_hg > 0) {
214
+ stats[ClusterQualHG] = static_cast<double>((nc - nd) / denom_hg); // Gamma
215
+ }
216
+
217
+ // HGSD (Somers' D)
218
+ double denom_hgsd = (nc + nd + ntiesdist);
219
+ if (denom_hgsd > 0) {
220
+ stats[ClusterQualHGSD] = (nc - nd) / denom_hgsd;
221
+ } else {
222
+ stats[ClusterQualHGSD] = 0.0; // avoid NaN in degenerate cases
223
+ }
224
+
225
+ // HC (Hierarchical Criterion)
226
+ double denom_hc = (Smax - Smin);
227
+ if (denom_hc > 0) {
228
+ stats[ClusterQualHC] = (wxy - Smin) / denom_hc;
229
+ } else {
230
+ stats[ClusterQualHC] = 0.0; // avoid NaN when Smax == Smin
231
+ }
232
+
233
+
234
+ // Compute F and R statistics exactly like R
235
+ double SSres = 0.0;
236
+ double SS2res = 0.0;
237
+ double total_cluster_weights = 0.0;
238
+
239
+ for (int i = 0; i < nclusters; i++) {
240
+ if (sizes[i] > 0) {
241
+ SSres += errors[i] / sizes[i];
242
+ SS2res += errors2[i] / sizes[i];
243
+ total_cluster_weights += sizes[i];
244
+ }
245
+ }
246
+
247
+ if (total_cluster_weights > 0) {
248
+ double SSexpl = wx / total_cluster_weights - SSres;
249
+ double SS2expl = wx2 / total_cluster_weights - SS2res;
250
+ double dncluster = static_cast<double>(nclusters);
251
+
252
+ if (total_cluster_weights > dncluster && SSres > 0) {
253
+ stats[ClusterQualF] = (SSexpl / (dncluster - 1.0)) / (SSres / (total_cluster_weights - dncluster));
254
+ stats[ClusterQualR] = SSexpl / (SSres + SSexpl);
255
+ // F2 and R2 should be based on SS2, not squares of F and R
256
+ stats[ClusterQualF2] = (SS2expl / (dncluster - 1.0)) / (SS2res / (total_cluster_weights - dncluster));
257
+ stats[ClusterQualR2] = SS2expl / (SS2res + SS2expl);
258
+ }
259
+ }
260
+
261
+ // Compute ASW exactly like R version
262
+ double asw_i = 0.0;
263
+ double asw_w = 0.0;
264
+
265
+ // Reset ASW arrays
266
+ for (int j = 0; j < nclusters; j++) {
267
+ asw[j] = 0.0;
268
+ asw[j + nclusters] = 0.0;
269
+ }
270
+
271
+ for (int i = 0; i < n; i++) {
272
+ if (weights[i] > 0) {
273
+ int iclustIndex = cluster[i] - 1; // Convert to 0-based
274
+ if (iclustIndex < 0 || iclustIndex >= nclusters) continue;
275
+
276
+ double aik = 0.0;
277
+ std::vector<double> othergroups(nclusters, 0.0);
278
+
279
+ // Calculate distances to all other points
280
+ if constexpr (!UseCondensed) {
281
+ ij = i * n;
282
+ for (int j = 0; j < n; j++) {
283
+ if (i == j) continue;
284
+ int jclustIndex = cluster[j] - 1;
285
+ if (jclustIndex < 0 || jclustIndex >= nclusters) continue;
286
+
287
+ if (iclustIndex == jclustIndex) {
288
+ aik += weights[j] * diss[ij + j];
289
+ } else {
290
+ othergroups[jclustIndex] += weights[j] * diss[ij + j];
291
+ }
292
+ }
293
+ } else {
294
+ // Condensed version
295
+ for (int j = 0; j < n; j++) {
296
+ if (i == j) continue;
297
+ int jclustIndex = cluster[j] - 1;
298
+ if (jclustIndex < 0 || jclustIndex >= nclusters) continue;
299
+
300
+ double dist_val = (i < j) ? diss[getCondensedIndex(i, j, n)] : diss[getCondensedIndex(j, i, n)];
301
+
302
+ if (iclustIndex == jclustIndex) {
303
+ aik += weights[j] * dist_val;
304
+ } else {
305
+ othergroups[jclustIndex] += weights[j] * dist_val;
306
+ }
307
+ }
308
+ }
309
+
310
+ // Find minimum average distance to other clusters
311
+ double bik = std::numeric_limits<double>::max();
312
+ for (int j = 0; j < nclusters; j++) {
313
+ if (j != iclustIndex && sizes[j] > 0) {
314
+ double avg_dist = othergroups[j] / sizes[j];
315
+ if (bik >= avg_dist) {
316
+ bik = avg_dist;
317
+ }
318
+ }
319
+ }
320
+
321
+ // Calculate ASW values like R
322
+ double aik_w = aik / sizes[iclustIndex]; // Weighted version
323
+ if (sizes[iclustIndex] <= 1.0) {
324
+ aik = 0.0; // Avoid division by zero for singletons
325
+ } else {
326
+ aik /= (sizes[iclustIndex] - 1.0); // Unweighted version
327
+ }
328
+
329
+ if (bik != std::numeric_limits<double>::max()) {
330
+ double sik_i = weights[i] * ((bik - aik) / std::max(aik, bik));
331
+ double sik_w = weights[i] * ((bik - aik_w) / std::max(aik_w, bik));
332
+
333
+ asw[iclustIndex] += sik_i;
334
+ asw[iclustIndex + nclusters] += sik_w;
335
+ asw_i += sik_i;
336
+ asw_w += sik_w;
337
+ }
338
+ }
339
+ }
340
+
341
+ // Normalize cluster ASW by cluster sizes
342
+ for (int j = 0; j < nclusters; j++) {
343
+ if (sizes[j] > 0) {
344
+ asw[j] /= sizes[j];
345
+ asw[j + nclusters] /= sizes[j];
346
+ }
347
+ }
348
+
349
+ if (total_cluster_weights > 0) {
350
+ stats[ClusterQualASWi] = asw_i / total_cluster_weights;
351
+ stats[ClusterQualASWw] = asw_w / total_cluster_weights;
352
+ }
353
+ }
354
+
355
+ // Template instantiations
356
+ void clusterquality(const double* diss, const int* cluster, const double* weights,
357
+ int n, double* stats, int nclusters, double* asw,
358
+ KendallTree& kendall) {
359
+ compute_cluster_quality_core<false>(diss, cluster, weights, n, stats, nclusters, asw, kendall);
360
+ }
361
+
362
+ void clusterquality_dist(const double* diss, const int* cluster, const double* weights,
363
+ int n, double* stats, int nclusters, double* asw,
364
+ KendallTree& kendall) {
365
+ compute_cluster_quality_core<true>(diss, cluster, weights, n, stats, nclusters, asw, kendall);
366
+ }
367
+
368
// Per-sample silhouette values for a full, row-major n x n dissimilarity
// matrix. This is a self-contained computation; it does not go through
// clusterquality().
//
// @param diss      Square dissimilarity matrix, element (i, j) at diss[i * n + j]
// @param cluster   1-based cluster label per sample; labels outside
//                  1..nclusters are ignored (sample keeps NaN, and it does not
//                  contribute to any other sample's distances)
// @param weights   Weight per sample
// @param n         Number of samples; n <= 0 is a no-op
// @param nclusters Number of clusters; nclusters <= 0 leaves every output NaN
// @param asw_i     Output [n]: silhouette with the within-cluster mean
//                  divided by (cluster weight - 1)
// @param asw_w     Output [n]: silhouette with the within-cluster mean
//                  divided by the cluster weight
//
// Outputs stay NaN for samples whose cluster has total weight <= 1 or for
// which no other non-empty cluster exists.
void indiv_asw(const double* diss, const int* cluster, const double* weights,
               int n, int nclusters, double* asw_i, double* asw_w) {
    // Nothing to write; also keeps std::fill from seeing a negative range.
    if (n <= 0) return;

    const double kNaN = std::numeric_limits<double>::quiet_NaN();
    std::fill(asw_i, asw_i + n, kNaN);
    std::fill(asw_w, asw_w + n, kNaN);

    // With no clusters every label is out of range, so all outputs stay NaN.
    if (nclusters <= 0) return;

    // Total weight of each cluster.
    std::vector<double> sizes(nclusters, 0.0);
    for (int i = 0; i < n; i++) {
        const int clustIndex = cluster[i] - 1;
        if (clustIndex >= 0 && clustIndex < nclusters) {
            sizes[clustIndex] += weights[i];
        }
    }

    // Scratch buffer reused for every sample instead of reallocating it.
    std::vector<double> othergroups(nclusters, 0.0);
    const double kNoNeighbour = std::numeric_limits<double>::max();

    for (int i = 0; i < n; i++) {
        const int iclustIndex = cluster[i] - 1;
        if (iclustIndex < 0 || iclustIndex >= nclusters || sizes[iclustIndex] <= 1.0) {
            continue;
        }

        std::fill(othergroups.begin(), othergroups.end(), 0.0);

        // Widen before multiplying: i * n in int overflows once n > 46340.
        const double* row = diss + static_cast<long long>(i) * n;

        // Weighted distance sums: own cluster (aik) and every other cluster.
        double aik = 0.0;
        for (int j = 0; j < n; j++) {
            if (i == j) continue;
            const int jclustIndex = cluster[j] - 1;
            if (jclustIndex < 0 || jclustIndex >= nclusters) continue;

            const double dist = row[j];
            if (iclustIndex == jclustIndex) {
                aik += weights[j] * dist;
            } else {
                othergroups[jclustIndex] += weights[j] * dist;
            }
        }

        // Smallest mean distance to any other non-empty cluster.
        double bik = kNoNeighbour;
        for (int j = 0; j < nclusters; j++) {
            if (j != iclustIndex && sizes[j] > 0) {
                const double avg_dist = othergroups[j] / sizes[j];
                if (bik >= avg_dist) {
                    bik = avg_dist;
                }
            }
        }

        const double aik_w = aik / sizes[iclustIndex];
        aik /= (sizes[iclustIndex] - 1.0);

        // bik is untouched when no other non-empty cluster exists; keep NaN then.
        if (bik != kNoNeighbour) {
            asw_i[i] = (bik - aik) / std::max(aik, bik);
            asw_w[i] = (bik - aik_w) / std::max(aik_w, bik);
        }
    }
}
425
+
426
// Per-sample silhouette values for a condensed dissimilarity array (upper
// triangle, row by row, length n * (n - 1) / 2).
//
// @param diss      Condensed dissimilarities; pair (lo, hi) with lo < hi is at
//                  offset lo * n - lo * (lo + 1) / 2 + hi - lo - 1
// @param cluster   1-based cluster label per sample; labels outside
//                  1..nclusters are ignored (sample keeps NaN, and it does not
//                  contribute to any other sample's distances)
// @param weights   Weight per sample
// @param n         Number of samples; n <= 0 is a no-op
// @param nclusters Number of clusters; nclusters <= 0 leaves every output NaN
// @param asw_i     Output [n]: silhouette with the within-cluster mean
//                  divided by (cluster weight - 1)
// @param asw_w     Output [n]: silhouette with the within-cluster mean
//                  divided by the cluster weight
//
// Outputs stay NaN for samples whose cluster has total weight <= 1 or for
// which no other non-empty cluster exists.
void indiv_asw_dist(const double* diss, const int* cluster, const double* weights,
                    int n, int nclusters, double* asw_i, double* asw_w) {
    // Nothing to write; also keeps std::fill from seeing a negative range.
    if (n <= 0) return;

    const double kNaN = std::numeric_limits<double>::quiet_NaN();
    std::fill(asw_i, asw_i + n, kNaN);
    std::fill(asw_w, asw_w + n, kNaN);

    // With no clusters every label is out of range, so all outputs stay NaN.
    if (nclusters <= 0) return;

    // Total weight of each cluster.
    std::vector<double> sizes(nclusters, 0.0);
    for (int i = 0; i < n; i++) {
        const int clustIndex = cluster[i] - 1;
        if (clustIndex >= 0 && clustIndex < nclusters) {
            sizes[clustIndex] += weights[i];
        }
    }

    // Scratch buffer reused for every sample instead of reallocating it.
    std::vector<double> othergroups(nclusters, 0.0);
    const double kNoNeighbour = std::numeric_limits<double>::max();
    const long long n64 = n;

    for (int i = 0; i < n; i++) {
        const int iclustIndex = cluster[i] - 1;
        if (iclustIndex < 0 || iclustIndex >= nclusters || sizes[iclustIndex] <= 1.0) {
            continue;
        }

        std::fill(othergroups.begin(), othergroups.end(), 0.0);

        // Weighted distance sums: own cluster (aik) and every other cluster.
        double aik = 0.0;
        for (int j = 0; j < n; j++) {
            if (i == j) continue;
            const int jclustIndex = cluster[j] - 1;
            if (jclustIndex < 0 || jclustIndex >= nclusters) continue;

            // Same offset formula as getCondensedIndex, evaluated in 64-bit:
            // an int offset cannot address the array once n exceeds 65536.
            const long long lo = (i < j) ? i : j;
            const long long hi = (i < j) ? j : i;
            const double dist = diss[lo * n64 - lo * (lo + 1) / 2 + hi - lo - 1];

            if (iclustIndex == jclustIndex) {
                aik += weights[j] * dist;
            } else {
                othergroups[jclustIndex] += weights[j] * dist;
            }
        }

        // Smallest mean distance to any other non-empty cluster.
        double bik = kNoNeighbour;
        for (int j = 0; j < nclusters; j++) {
            if (j != iclustIndex && sizes[j] > 0) {
                const double avg_dist = othergroups[j] / sizes[j];
                if (bik >= avg_dist) {
                    bik = avg_dist;
                }
            }
        }

        const double aik_w = aik / sizes[iclustIndex];
        aik /= (sizes[iclustIndex] - 1.0);

        // bik is untouched when no other non-empty cluster exists; keep NaN then.
        if (bik != kNoNeighbour) {
            asw_i[i] = (bik - aik) / std::max(aik, bik);
            asw_w[i] = (bik - aik_w) / std::max(aik_w, bik);
        }
    }
}
482
+
483
+ // Simplified versions
484
+ void clusterqualitySimple(const double* diss, const int* cluster, const double* weights,
485
+ int n, double* stats, int nclusters, double* asw) {
486
+ KendallTree kendall;
487
+ clusterquality(diss, cluster, weights, n, stats, nclusters, asw, kendall);
488
+ finalizeKendall(kendall);
489
+ }
490
+
491
+ void clusterqualitySimple_dist(const double* diss, const int* cluster, const double* weights,
492
+ int n, double* stats, int nclusters, double* asw) {
493
+ KendallTree kendall;
494
+ clusterquality_dist(diss, cluster, weights, n, stats, nclusters, asw, kendall);
495
+ finalizeKendall(kendall);
496
+ }
@@ -0,0 +1,128 @@
1
+ #pragma once
2
+
3
+ #include <pybind11/pybind11.h>
4
+ #include <pybind11/numpy.h>
5
+ #include <pybind11/stl.h>
6
+ #include <vector>
7
+ #include <map>
8
+ #include <cmath>
9
+ #include <algorithm>
10
+ #include <numeric>
11
+
12
+ namespace py = pybind11;
13
+
14
+ // Indices into the stats[] output array (order matches the R WeightedCluster package)
15
+ #define ClusterQualHPG 0 // Point-biserial correlation, i.e. Hubert's Gamma in Pearson form; "PBC" in WeightedCluster (not implemented in this version)
16
+ #define ClusterQualHG 1 // Hubert's Gamma
17
+ #define ClusterQualHGSD 2 // Hubert's Gamma, Somers' D variant (not a standard deviation)
18
+ #define ClusterQualASWi 3 // Average Silhouette Width; within-cluster mean divides by (cluster weight - 1)
19
+ #define ClusterQualASWw 4 // Average Silhouette Width (weighted); within-cluster mean divides by cluster weight
20
+ #define ClusterQualF 5 // Calinski-Harabasz (pseudo F statistic)
21
+ #define ClusterQualR 6 // Pseudo R-squared
22
+ #define ClusterQualF2 7 // Calinski-Harabasz computed on squared dissimilarities (not the square of F)
23
+ #define ClusterQualR2 8 // Pseudo R-squared computed on squared dissimilarities (not the square of R)
24
+ #define ClusterQualHC 9 // Hubert's C index: (wxy - Smin) / (Smax - Smin)
25
+ #define ClusterQualNumStat 10 // Number of statistics, i.e. required length of stats[]
26
+
27
/**
 * Pair of double accumulators stored per KendallTree node; both start at zero.
 * Counterpart of the CmpCluster class in R's implementation, where it caches
 * pairwise distance comparisons for the Kendall's-tau style statistics.
 */
class CmpCluster {
public:
    CmpCluster() {}
    ~CmpCluster() {}

    double clustDist0 = 0.0;
    double clustDist1 = 0.0;
};

// Ordered map from a double key to a raw CmpCluster pointer.
// NOTE(review): presumably the key is a dissimilarity value and the pointees
// are released by finalizeKendall() - confirm against the implementation.
using KendallTree = std::map<double, CmpCluster*>;
41
+
42
+ /**
43
+ * Core cluster quality computation functions
44
+ * These match the R WeightedCluster package implementation
45
+ */
46
+
47
+ /**
48
+ * Compute all cluster quality indicators for a full (square) distance matrix
49
+ *
50
+ * @param diss Distance matrix (full square form, n x n, element (i, j) at diss[i * n + j])
51
+ * @param cluster Cluster labels (1-based, as in R); in the silhouette pass, samples labelled outside 1..nclusters are skipped
52
+ * @param weights Sample weights; a sample with weight <= 0 gets no silhouette term of its own
53
+ * @param n Number of samples
54
+ * @param stats Output array for statistics [ClusterQualNumStat], indexed by the ClusterQual* constants
55
+ * @param nclusters Number of clusters
56
+ * @param asw Output array for cluster-level ASW [2 * nclusters]: entries [0, nclusters) hold the ASWi variant, entries [nclusters, 2 * nclusters) the ASWw variant
57
+ * @param kendall Reference to Kendall tree for caching
58
+ */
59
+ void clusterquality(const double* diss, const int* cluster, const double* weights,
60
+ int n, double* stats, int nclusters, double* asw,
61
+ KendallTree& kendall);
62
+
63
+ /**
64
+ * Compute all cluster quality indicators for a condensed distance array
65
+ *
66
+ * @param diss Condensed distance array (upper triangle, length n*(n-1)/2), addressed through getCondensedIndex
67
+ * @param cluster Cluster labels (1-based, as in R); in the silhouette pass, samples labelled outside 1..nclusters are skipped
68
+ * @param weights Sample weights; a sample with weight <= 0 gets no silhouette term of its own
69
+ * @param n Number of samples
70
+ * @param stats Output array for statistics [ClusterQualNumStat], indexed by the ClusterQual* constants
71
+ * @param nclusters Number of clusters
72
+ * @param asw Output array for cluster-level ASW [2 * nclusters]: entries [0, nclusters) hold the ASWi variant, entries [nclusters, 2 * nclusters) the ASWw variant
73
+ * @param kendall Reference to Kendall tree for caching
74
+ */
75
+ void clusterquality_dist(const double* diss, const int* cluster, const double* weights,
76
+ int n, double* stats, int nclusters, double* asw,
77
+ KendallTree& kendall);
78
+
79
+ /**
80
+ * Compute individual ASW scores for all samples. Both outputs are pre-filled with NaN and stay NaN for a sample whose label is outside 1..nclusters, whose cluster has total weight <= 1, or for which no other non-empty cluster exists.
81
+ *
82
+ * @param diss Distance matrix (full square form, n x n)
83
+ * @param cluster Cluster labels (1-based, as in R)
84
+ * @param weights Sample weights
85
+ * @param n Number of samples
86
+ * @param nclusters Number of clusters
87
+ * @param asw_i Output array for individual ASW [n]; within-cluster mean distance divides by (cluster weight - 1)
88
+ * @param asw_w Output array for weighted individual ASW [n]; within-cluster mean distance divides by the cluster weight
89
+ */
90
+ void indiv_asw(const double* diss, const int* cluster, const double* weights,
91
+ int n, int nclusters, double* asw_i, double* asw_w);
92
+
93
+ /**
94
+ * Compute individual ASW scores for condensed distance array: same parameters and NaN conventions as indiv_asw, except that diss is the condensed upper-triangle array of length n*(n-1)/2
95
+ */
96
+ void indiv_asw_dist(const double* diss, const int* cluster, const double* weights,
97
+ int n, int nclusters, double* asw_i, double* asw_w);
98
+
99
+ /**
100
+ * Convenience wrappers around clusterquality / clusterquality_dist for callers that do not keep a KendallTree: each creates a temporary tree, runs the full computation, then passes the tree to finalizeKendall.
+ * NOTE(review): this comment used to say "computes only basic statistics (without HG/HGSD)", but the definitions forward to the same core routine as the non-simple functions - confirm which behaviour is intended.
101
+ */
102
+ void clusterqualitySimple(const double* diss, const int* cluster, const double* weights,
103
+ int n, double* stats, int nclusters, double* asw);
104
+
105
+ void clusterqualitySimple_dist(const double* diss, const int* cluster, const double* weights,
106
+ int n, double* stats, int nclusters, double* asw);
107
+
108
+ /**
109
+ * Helper functions for Kendall tree management. finalizeKendall is what the clusterqualitySimple* wrappers call on their temporary tree once the statistics are computed.
+ * NOTE(review): the definitions are not in this header; presumably finalizeKendall releases the CmpCluster objects the tree points to and resetKendallTree clears cached values for reuse - confirm against the implementation.
110
+ */
111
+ void resetKendallTree(KendallTree& kendall);
112
+ void finalizeKendall(KendallTree& kendall);
113
+
114
+ /**
115
+ * Utility functions
116
+ */
117
/**
 * Offset of the unordered pair (i, j), i != j, in a condensed upper-triangle
 * distance array for n samples (row-by-row ordering, as used by SciPy).
 *
 * The arguments may be given in either order. The return type is int, so the
 * result is only meaningful while n * (n - 1) / 2 fits in int (n <= 65536).
 *
 * @param i First sample index (0-based)
 * @param j Second sample index (0-based), different from i
 * @param n Number of samples
 * @return Offset of the pair inside the condensed array
 */
inline int getCondensedIndex(int i, int j, int n) {
    if (i > j) std::swap(i, j);  // normalise so that i < j
    // Widen before multiplying: the intermediates i * n and i * (i + 1)
    // exceed INT_MAX for n above roughly 46k even though the final offset
    // still fits in int.
    const long long row = i;
    const long long offset = row * n - row * (row + 1) / 2 + j - row - 1;
    return static_cast<int>(offset);
}
123
+
124
+ inline double getDistanceFromCondensed(const double* diss, int i, int j, int n) {
125
+ if (i == j) return 0.0;
126
+ // No need to swap here since getCondensedIndex handles it
127
+ return diss[getCondensedIndex(i, j, n)];
128
+ }