sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,476 @@
+ """
+ @Author : 李欣怡
+ @File : clara.py
+ @Time : 2024/12/27 12:04
+ @Desc :
+ """
+
+ import gc
+ import os
+ from contextlib import redirect_stdout
+ import warnings
+
+ from joblib import Parallel, delayed
+
+ # from Tutorials.test import result
+ from sequenzo.clustering.sequenzo_fastcluster.fastcluster import linkage
+ from scipy.special import comb
+ from itertools import product
+
+ from sequenzo.big_data.clara.utils.aggregatecases import *
+ from sequenzo.big_data.clara.utils.davies_bouldin import *
+ from sequenzo.clustering.KMedoids import *
+ from sequenzo.big_data.clara.utils.get_weighted_diss import *
+
+ from sequenzo.define_sequence_data import SequenceData
+ from sequenzo.dissimilarity_measures.get_distance_matrix import get_distance_matrix
+
+
+ def adjustedRandIndex(x, y=None):
+     if isinstance(x, np.ndarray):
+         x = np.array(x)
+         y = np.array(y)
+         if len(x) != len(y):
+             raise ValueError("Arguments must be vectors of the same length")
+
+         tab = pd.crosstab(x, y)
+     else:
+         tab = x
+
+     if tab.shape == (1, 1):
+         return 1
+
+     # Compute the four components of the ARI: a, b, c, d
+     a = np.sum(comb(tab.to_numpy(), 2))  # number of pairs within each cell of the table
+     b = np.sum(comb(np.sum(tab.to_numpy(), axis=1), 2)) - a
+     c = np.sum(comb(np.sum(tab.to_numpy(), axis=0), 2)) - a
+     d = comb(np.sum(tab.to_numpy()), 2) - a - b - c
+
+     ARI = (a - (a + b) * (a + c) / (a + b + c + d)) / ((a + b + a + c) / 2 - (a + b) * (a + c) / (a + b + c + d))
+     return ARI
+
+
+ def jaccardCoef(tab):
+     if tab.shape == (1, 1):
+         return 1
+
+     # Compute the intersection term (n11) and the marginal terms (n01 and n10)
+     n11 = np.sum(tab.to_numpy() ** 2)  # sum of squared cell counts
+     n01 = np.sum(np.sum(tab.to_numpy(), axis=0) ** 2)  # sum of squared column sums
+     n10 = np.sum(np.sum(tab.to_numpy(), axis=1) ** 2)  # sum of squared row sums
+
+     return n11 / (n01 + n10 - n11)
+
+
+ def clara(seqdata, R=100, kvals=None, sample_size=None, method="crisp", dist_args=None,
+           criteria=["distance"], stability=False, max_dist=None):
+
+     # ==================
+     # Parameter checking
+     # ==================
+     if kvals is None:
+         kvals = range(2, 11)
+
+     if sample_size is None:
+         sample_size = 40 + 2 * max(kvals)
+
+     print("[>] Starting generalized CLARA for sequence analysis.")
+
+     # Check for input data type (should be a sequence object)
+     if not isinstance(seqdata, SequenceData):
+         raise ValueError("[!] 'seqdata' should be SequenceData, check the input format.")
+
+     if max(kvals) > sample_size:
+         raise ValueError("[!] More clusters than the size of the sample requested.")
+
+     allmethods = ["crisp"]
+     if method.lower() not in [m.lower() for m in allmethods]:
+         raise ValueError(f"[!] Unknown method {method}. Please specify one of the following: {', '.join(allmethods)}")
+
+     if method.lower() == "representativeness" and max_dist is None:
+         raise ValueError("[!] You need to set max_dist when using the representativeness method.")
+
+     allcriteria = ["distance", "db", "xb", "pbm", "ams"]
+     criteria = [c.lower() for c in criteria]
+     if not all(c in allcriteria for c in criteria):
+         raise ValueError(
+             f"[!] Unknown criteria among {', '.join(criteria)}. Please specify at least one among {', '.join(allcriteria)}.")
+
+     if dist_args is None:
+         raise ValueError("[!] You need to set the 'dist_args' for get_distance_matrix function.")
+
+     print(f"[>] Using {method} clustering optimizing the following criterion: {', '.join(criteria)}.")
+
+     # FIXME : Add coherence check between method and criteria
+
+     # ===========
+     # Aggregation
+     # ===========
+     number_seq = len(seqdata.seqdata)
+     print(f" - Aggregating {number_seq} sequences...")
+
+     ac = DataFrameAggregator().aggregate(seqdata.seqdata)
+     agseqdata = seqdata.seqdata.iloc[ac['aggIndex'], :]
+     # agseqdata.attrs['weights'] = None
+     ac['probs'] = ac['aggWeights'] / number_seq
+     print(f" - OK ({len(ac['aggWeights'])} unique cases).")
+
+     # Memory cleanup before parallel computation
+     gc.collect()
+     print("[>] Starting iterations...")
+
+     def calc_pam_iter(circle, agseqdata, sample_size, kvals, ac):
+         # Sampling with replacement allows the process to proceed normally
+         # even when the sample size exceeds the dataset size, as samples can be repeatedly drawn.
+         mysample = np.random.choice(len(agseqdata), size=sample_size, p=ac['probs'], replace=True)
+         mysample = pd.DataFrame({'id': mysample})
+
+         # Re-aggregate!
+         ac2 = DataFrameAggregator().aggregate(mysample)
+         data_subset = agseqdata.iloc[mysample.iloc[ac2['aggIndex'], 0], :]
+
+         with open(os.devnull, 'w') as fnull:
+             with redirect_stdout(fnull):
+                 states = np.arange(1, len(seqdata.states) + 1).tolist()
+                 data_subset = SequenceData(data_subset,
+                                            time=seqdata.time,
+                                            states=states)
+                 dist_args['seqdata'] = data_subset
+                 diss = get_distance_matrix(opts=dist_args)
+
+         diss = diss.values
+         _diss = diss.copy()
+         _diss = get_weighted_diss(_diss, ac2['aggWeights'])
+         hc = linkage(_diss, method='ward')
+         del _diss
+
+         # For each number of clusters
+         allclust = []
+
+         for k in kvals:
+             # Weighted PAM clustering on subsample
+             # TODO : hc already encodes the chosen centers, so why is clusterid initialized with -1?
+             # Reusing the existing assignment would also avoid out-of-bounds access on -1 when the if branch is not entered.
+             clustering = KMedoids(diss=diss, k=k, cluster_only=True, initialclust=hc, weights=ac2['aggWeights'], verbose=False)
+             medoids = mysample.iloc[ac2['aggIndex'][np.unique(clustering)], :]
+             medoids = medoids.to_numpy().flatten()
+
+             del clustering
+
+             # ======================================================
+             # Compute Distances Between All Sequences and the Medoids
+             # ======================================================
+             refseq = [list(range(0, len(agseqdata))), medoids.tolist()]
+             with open(os.devnull, 'w') as fnull:
+                 with redirect_stdout(fnull):
+                     states = np.arange(1, len(seqdata.states) + 1).tolist()
+                     agseqdata = SequenceData(agseqdata,
+                                              time=seqdata.time,
+                                              states=states)
+                     dist_args['seqdata'] = agseqdata
+                     dist_args['refseq'] = refseq
+                     diss2 = get_distance_matrix(opts=dist_args)
+                     del dist_args['refseq']
+                     agseqdata = agseqdata.seqdata  # Restore the original DataFrame
+
+             # The two smallest distances are used for the silhouette width
+             # and the other criteria
+             diss2 = diss2.to_numpy()
+             alphabeta = np.array([np.sort(row)[:2] for row in diss2])
+             sil = (alphabeta[:, 1] - alphabeta[:, 0]) / np.maximum(alphabeta[:, 1], alphabeta[:, 0])
+
+             # Allocate to clusters
+             memb = np.argmin(diss2, axis=1)  # Each data point is assigned to its nearest medoid
+
+             mean_diss = np.sum(alphabeta[:, 0] * ac['probs'])
+
+             warnings.filterwarnings('ignore', category=RuntimeWarning)  # Ignore division-by-zero warnings
+             db = davies_bouldin_internal(diss=diss2, clustering=memb, medoids=medoids, weights=ac['aggWeights'])['db']
+             warnings.resetwarnings()
+             pbm = ((1 / len(medoids)) * (np.max(diss2[medoids]) / mean_diss)) ** 2
+             ams = np.sum(sil * ac['probs'])
+
+             distmed = diss2[medoids, :]
+             distmed_flat = distmed[np.triu_indices_from(distmed, k=1)]  # Take the upper triangular part
+             minsep = np.min(distmed_flat)
+
+             xb = mean_diss / minsep
+
+             del alphabeta
+             del sil
+             del diss2
+             del distmed
+             del minsep
+
+             allclust.append({
+                 'mean_diss': mean_diss,
+                 'db': db,
+                 'pbm': pbm,
+                 'ams': ams,
+                 'xb': xb,
+                 'clustering': memb,
+                 'medoids': medoids
+             })
+
+         del diss
+         gc.collect()
+
+         return allclust
+
+     # Compute in parallel using joblib
+     # Example structure of `results`:
+     # results[0] = all iter1's = [{k=2's}, {k=3's}, ... , {k=10's}]
+     # results[1] = all iter2's = [{k=2's}, {k=3's}, ... , {k=10's}]
+     results = Parallel(n_jobs=-1)(
+         delayed(calc_pam_iter)(circle=i, agseqdata=agseqdata, sample_size=sample_size, kvals=kvals, ac=ac) for i in range(R))
+     # results = []
+     # for i in range(R):
+     #     res = calc_pam_iter(circle=i,
+     #                         agseqdata=agseqdata,
+     #                         sample_size=sample_size,
+     #                         kvals=kvals,
+     #                         ac=ac)
+     #     results.append(res)
+
+     print(" - Done.")
+     print("[>] Aggregating iterations for each k value...")
+
+     # Example structure of the aggregated output:
+     # data[0] = all k=2's = [{when iter1, k=2's}, {when iter2, k=2's}, ... , {when iter100, k=2's}]
+     # data[1] = all k=3's = [{when iter1, k=3's}, {when iter2, k=3's}, ... , {when iter100, k=3's}]
+     collected_data = [[] for _ in kvals]
+     for iter_result in results:
+         k = 0
+         for item in iter_result:
+             collected_data[k].append(item)
+             k += 1
+
+     kvalscriteria = list(product(range(len(kvals)), criteria))
+     kret = []
+     for item in kvalscriteria:
+         k = item[0]
+         _criteria = item[1]
+
+         mean_all_diss = [d['mean_diss'] for d in collected_data[k]]
+         db_all = [d['db'] for d in collected_data[k]]
+         pbm_all = [d['pbm'] for d in collected_data[k]]
+         ams_all = [d['ams'] for d in collected_data[k]]
+         xb_all = [d['xb'] for d in collected_data[k]]
+         clustering_all_diss = [d['clustering'] for d in collected_data[k]]
+         med_all_diss = [d['medoids'] for d in collected_data[k]]
+
+         # Find best clustering
+         objective = {
+             "distance": mean_all_diss,
+             "pbm": pbm_all,
+             "db": db_all,
+             "ams": ams_all,
+             "xb": xb_all
+         }
+         objective = objective[_criteria]
+         best = np.argmax(objective) if _criteria in ["ams", "pbm"] else np.argmin(objective)
+
+         # Compute clustering stability of the best partition
+         if stability:
+             def process_task(j, clustering_all_diss, ac, best):
+                 df = pd.DataFrame({
+                     'clustering_j': clustering_all_diss[j],  # The j-th clustering
+                     'clustering_best': clustering_all_diss[best],  # The best clustering
+                     'aggWeights': ac['aggWeights']
+                 })
+                 tab = df.groupby(['clustering_j', 'clustering_best'])['aggWeights'].sum().unstack(fill_value=0)
+
+                 val = [adjustedRandIndex(tab), jaccardCoef(tab)]
+                 return val
+
+             arilist = []
+
+             if method in ["noise", "fuzzy"]:
+                 for j in range(R):
+                     val = process_task(j, clustering_all_diss, ac, best)
+                     arilist.append(val)
+             else:
+                 arilist = Parallel(n_jobs=-1)(
+                     delayed(process_task)(j, clustering_all_diss, ac, best) for j in range(R))
+
+             arimatrix = np.vstack(arilist)
+             arimatrix = pd.DataFrame(arimatrix, columns=["ARI", "JC"])
+             ari08 = np.sum(arimatrix.iloc[:, 0] >= 0.8)
+             jc08 = np.sum(arimatrix.iloc[:, 1] >= 0.8)
+
+         else:
+             arimatrix = np.nan
+             ari08 = np.nan
+             jc08 = np.nan
+
+         _clustering = clustering_all_diss[best]
+
+         disagclust = np.full(seqdata.seqdata.shape[0], -1)
+         for i, index in enumerate(ac["disaggIndex"]):
+             disagclust[i] = _clustering[index] + 1  # 1-based index for clusters
+
+         evol_diss = np.maximum.accumulate(objective) if _criteria in ["ams", "pbm"] else np.minimum.accumulate(objective)
+
+         # Store the best solution and evaluations of the others
+         bestcluster = {
+             "medoids": ac["aggIndex"][med_all_diss[best]],
+             "clustering": disagclust,
+             "evol_diss": evol_diss,
+             "iter_objective": objective,
+             "objective": objective[best],
+             "iteration": best,
+             "arimatrix": arimatrix,
+             "criteria": _criteria,
+             "method": method,
+             "avg_dist": mean_all_diss[best],
+             "pbm": pbm_all[best],
+             "db": db_all[best],
+             "xb": xb_all[best],
+             "ams": ams_all[best],
+             "ari08": ari08,
+             "jc08": jc08,
+             "R": R,
+             "k": k
+         }
+
+         # Store computed cluster quality
+         kresult = {
+             "k": k + 2,
+             "criteria": criteria,
+             "stats": [bestcluster["avg_dist"], bestcluster["pbm"], bestcluster["db"], bestcluster["xb"],
+                       bestcluster["ams"], bestcluster["ari08"], bestcluster["jc08"], best],
+             "bestcluster": bestcluster
+         }
+
+         kret.append(kresult)
+
+     def claraObj(kretlines, method, kvals, kret, seqdata):
+         clustering = np.full((seqdata.seqdata.shape[0], len(kvals)), -1)
+         clustering = pd.DataFrame(clustering)
+         clustering.columns = [f"Cluster {val}" for val in kvals]
+         clustering.index = seqdata.ids
+
+         ret = {
+             "kvals": kvals,
+             "clara": {},
+             "clustering": clustering,
+             "stats": np.full((len(kvals), 8), -1, dtype=float)
+         }
+
+         for i in kretlines:
+             k = kret[i]['k'] - 2  # start from 0, not 2
+             ret['stats'][k, :] = np.array(kret[i]['stats'])
+             ret['clara'][k] = kret[i]['bestcluster']
+
+             ret['clustering'].iloc[:, k] = kret[i]['bestcluster']['clustering']
+
+         ret['stats'] = pd.DataFrame(ret['stats'],
+                                     columns=["Avg dist", "PBM", "DB", "XB", "AMS", "ARI>0.8", "JC>0.8", "Best iter"])
+         ret['stats'].insert(0, "Number of Clusters", [f"Cluster {k}" for k in kvals])
+         ret['stats']["k_num"] = kvals
+
+         return ret
+
+     if len(criteria) > 1:
+         ret = {
+             'param': {
+                 'criteria': criteria,
+                 'pam_combine': False,
+                 'all_criterias': criteria,
+                 'kvals': kvals,
+                 'method': method,
+                 'stability': stability
+             }
+         }
+
+         for meth in criteria:
+             indices = np.where(np.array([tup[1] for tup in kvalscriteria]) == meth)[0]
+             ret[meth] = claraObj(kretlines=indices, method=method, kvals=kvals, kret=kret, seqdata=seqdata)
+
+         allstats = {}
+
+         for meth in criteria:
+             stats = pd.DataFrame(ret[meth]['stats'])
+             stats['criteria'] = meth
+
+             allstats[meth] = stats
+
+         ret['allstats'] = pd.concat(allstats.values(), ignore_index=False)
+     else:
+
+         ret = claraObj(kretlines=range(len(kvalscriteria)), method=method, kvals=kvals, kret=kret, seqdata=seqdata)
+
+     print(" - Done.")
+
+     return ret
+
+
+ if __name__ == '__main__':
+     from sequenzo import *  # Social sequence analysis
+     import pandas as pd  # Import necessary packages
+
+     # TODO : the membership matrix returned by clara needs to be transposed, because plot_sequence_index's id_group_df parameter expects cluster ids as rows and sequence ids as columns
+
+     # ===============================
+     # Sohee
+     # ===============================
+     # df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
+     # time_list = list(df.columns)[1:133]
+     # states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
+     # # states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
+     # labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
+     # sequence_data = SequenceData(df, time=time_list, time_type="age", states=states, labels=labels, id_col="PID")
+
+     # om.to_csv("D:/college/research/QiQi/sequenzo/files/sequenzo_Sohee_string_OM_TRATE.csv", index=True)
+
+     # ===============================
+     # kass
+     # ===============================
+     # df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
+     # time_list = list(df.columns)[1:]
+     # states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
+     #           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
+     # sequence_data = SequenceData(df, time=time_list, time_type="year", states=states, id_col="COUNTRY")
+
+     # ===============================
+     # CO2
+     # ===============================
+     # df = pd.read_csv("D:/country_co2_emissions_missing.csv")
+     # time = list(df.columns)[1:]
+     # states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
+     # sequence_data = SequenceData(df, time_type="age", time=time, id_col="country", states=states)
+
+     # ===============================
+     # detailed
+     # ===============================
+     # df = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
+     # time = list(df.columns)[4:]
+     # states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
+     # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
+     #                              time=time, id_col="worker_id", states=states)
+
+     # ===============================
+     # broad
+     # ===============================
+     # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
+     # time = list(df.columns)[4:]
+     # states = ['Non-computing', 'Non-technical computing', 'Technical computing']
+     # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
+     #                              time_type="age", time=time, id_col="worker_id", states=states)
+
+     df = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data/not_real_detailed_data/synthetic_detailed_U5_N1000.csv")
+     _time = list(df.columns)[2:]
+     states = ["Data", "Data science", "Hardware", "Research", "Software", "Support & test", "Systems & infrastructure"]
+     df = df[['id', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']]
+     sequence_data = SequenceData(df, time=_time, id_col="id", states=states)
+
+     result = clara(sequence_data,
+                    R=250,
+                    sample_size=500,
+                    kvals=range(2, 6),
+                    criteria=['distance'],
+                    dist_args={"method": "OM", "sm": "CONSTANT", "indel": 1},
+                    stability=True)
+
+     # print(result)
+     print(result['stats'])
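Reading note (not part of the packaged file above). The adjustedRandIndex helper near the top of clara.py implements the usual pair-counting form of the adjusted Rand index. Writing the contingency table between the two clusterings as n_{ij} with grand total n, the quantities a, b, c, d in the code correspond to

a = \sum_{i,j} \binom{n_{ij}}{2}, \qquad
b = \sum_{i} \binom{n_{i\cdot}}{2} - a, \qquad
c = \sum_{j} \binom{n_{\cdot j}}{2} - a, \qquad
d = \binom{n}{2} - a - b - c,

\mathrm{ARI} = \frac{a - \dfrac{(a+b)(a+c)}{a+b+c+d}}{\dfrac{(a+b)+(a+c)}{2} - \dfrac{(a+b)(a+c)}{a+b+c+d}} .

The companion jaccardCoef helper works on the same table and returns n_{11} / (n_{01} + n_{10} - n_{11}) with n_{11} = \sum_{i,j} n_{ij}^2, n_{01} = \sum_{j} n_{\cdot j}^2 and n_{10} = \sum_{i} n_{i\cdot}^2, exactly as written in the code.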
@@ -0,0 +1,27 @@
+ """
+ @Author : 李欣怡
+ @File : __init__.py
+ @Time : 2025/2/28 00:30
+ @Desc :
+ """
+ from .aggregatecases import *
+ from .davies_bouldin import *
+ from .wfcmdd import *
+ from sequenzo.clustering.KMedoids import KMedoids
+
+
+ def _import_c_code():
+     """Lazily import the c_code module to avoid circular dependencies during installation"""
+     try:
+         from sequenzo.clustering import clustering_c_code
+         return clustering_c_code
+     except ImportError:
+         # If the C extension cannot be imported, return None
+         print(
+             "Warning: The C++ extension (c_code) could not be imported. Please ensure the extension module is compiled correctly.")
+         return None
+
+
+ __all__ = [
+     'KMedoids'
+ ]
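Reading note (not part of the packaged file above). The _import_c_code helper deliberately returns None instead of raising when the compiled clustering_c_code extension is missing. A minimal sketch of how such a guard is typically consumed; pairwise_sum and the some_fast_sum attribute are hypothetical and only illustrate the fallback pattern:

# Illustrative sketch only; not part of this package.
from sequenzo.big_data.clara.utils import _import_c_code

def pairwise_sum(values):
    # Prefer the compiled extension when it imported cleanly, otherwise fall back to pure Python.
    c_code = _import_c_code()  # module object, or None if the extension is unavailable
    fast = getattr(c_code, "some_fast_sum", None) if c_code is not None else None  # hypothetical symbol
    if fast is not None:
        return fast(values)
    return sum(values)  # pure-Python fallback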
@@ -0,0 +1,92 @@
+ """
+ @Author : 李欣怡
+ @File : aggregatecases.py
+ @Time : 2024/12/27 10:12
+ @Desc :
+ """
+ import pandas as pd
+ import numpy as np
+
+
+ class WcAggregateCases:
+     def aggregate(self, x, weights=None, **kwargs):
+         """
+         The appropriate aggregation method is invoked dynamically depending on the type of x
+         """
+         method_name = f"aggregate_{type(x).__name__}"
+         method = getattr(self, method_name, None)
+
+         if method is None:
+             raise NotImplementedError(f"No aggregation method for type {type(x).__name__}")
+
+         return method(x, weights, **kwargs)
+
+
+ class WcAggregateCasesInternal:
+     def aggregate(self, x, weights=None):
+         x = pd.DataFrame(x)
+         lx = len(x)
+
+         if weights is None:
+             weights = np.ones(lx)
+
+         ids = x.apply(lambda row: "@@@WC_SEP@@".join(row.astype(str)), axis=1)
+
+         mcorr = [np.nan] * lx
+
+         def _compute_weight_each_group_and_sum(group):
+             first_element = group.iloc[0]
+
+             for idx in group:
+                 mcorr[idx] = first_element
+             weighted_sum = np.sum(weights[group])
+             return [first_element, weighted_sum]
+
+         df = pd.DataFrame({
+             'index': range(0, lx),
+             'id': ids
+         })
+
+         grouped = df.groupby('id')['index'].apply(_compute_weight_each_group_and_sum)
+
+         agg_df = pd.DataFrame(grouped.tolist(), columns=['aggIndex', 'aggWeights'])
+
+         aggIndex = agg_df['aggIndex']
+         mcorr2 = [aggIndex[aggIndex == val].index[0] if val in aggIndex.values else -1 for val in mcorr]
+
+         ret = {
+             "aggIndex": agg_df['aggIndex'].values,
+             "aggWeights": agg_df['aggWeights'].values,
+             "disaggIndex": mcorr2,
+             "disaggWeights": weights
+         }
+
+         return ret
+
+
+ class DataFrameAggregator(WcAggregateCases):
+     def aggregate_DataFrame(self, x, weights=None, **kwargs):
+         internal = WcAggregateCasesInternal()
+         return internal.aggregate(x, weights)
+
+
+ class MatrixAggregator(WcAggregateCases):
+     def aggregate_ndarray(self, x, weights=None, **kwargs):
+         internal = WcAggregateCasesInternal()
+         return internal.aggregate(x, weights)
+
+
+ class StsListAggregator(WcAggregateCases):
+     def aggregate_stslist(self, x, weights=None, weighted=True, **kwargs):
+         if weights is None and weighted:
+             weights = getattr(x, "weights", None)
+         internal = WcAggregateCasesInternal()
+         return internal.aggregate(x, weights)
+
+
+ # Print function (for output)
+ def print_wcAggregateCases(result):
+     print(f"Number of disaggregated cases: {len(result['disaggWeights'])}")
+     print(f"Number of aggregated cases: {len(result['aggWeights'])}")
+     print(f"Average aggregated cases: {len(result['disaggWeights']) / len(result['aggWeights'])}")
+     print(f"Average (weighted) aggregation: {np.mean(result['aggWeights'])}")
@@ -0,0 +1,91 @@
+ """
+ @Author : 李欣怡
+ @File : davies_bouldin.py
+ @Time : 2024/12/27 17:56
+ @Desc :
+     :param
+         diss : 2D numpy array, the distance matrix
+         clustering : 1D numpy array, cluster membership of each data point (for example, at initialization every point forms its own cluster, so each point belongs to itself)
+         medoids : 1D numpy array, indices of the cluster medoids
+ """
+ import numpy as np
+
+
+ def davies_bouldin_internal(diss, clustering, medoids, p=1, weights=None, medoidclust=False):
+     # If weights are not provided, use uniform weights
+     if weights is None:
+         weights = np.ones(diss.shape[0])
+
+     list_diam = np.zeros(len(medoids))
+
+     # Calculate the diameter for each medoid
+     for i in range(len(medoids)):
+         medi = medoids[i] if medoidclust else i
+         cond = (clustering == medi)
+
+         # Calculate the diameter (weighted distance)
+         list_diam[i] = (np.sum(weights[cond] * diss[cond, i] ** p) / np.sum(weights[cond])) ** (1 / p)
+
+     maximum = np.zeros(len(medoids))
+
+     # Calculate the maximum ratio for each medoid
+     for i in range(len(medoids)):
+         # Calculate the distance to other medoids
+         maximum2 = (list_diam[i] + list_diam) / diss[medoids[i], :]
+
+         # Take the maximum of the valid (finite) values
+         # ensure values for "same" medoids
+         maximum[i] = np.max(maximum2[np.isfinite(maximum2)])
+
+     # Calculate the final Davies-Bouldin index (average of maximum values)
+     final_db = np.mean(maximum)
+
+     return {'db': final_db, 'per_cluster': maximum}
+
+
+ def fuzzy_davies_bouldin_internal(diss, memb, medoids, weights=None):
+     if weights is None:
+         weights = np.ones(diss.shape[0])
+
+     # Defined but never used in the R version; it is overwritten with another value below
+     list_diam = np.zeros(len(medoids))
+
+     # Defined but never used in the R version
+     # n = np.sum(weights)
+
+     mw = memb * weights[:, None]
+     list_diam = np.sum(mw * diss, axis=0) / np.sum(mw, axis=0)
+
+     # Initialize an array to store the maximum ratio for each cluster
+     maximum = np.zeros(len(medoids))
+
+     # For each cluster, compute its ratio against every other cluster
+     for i in range(len(medoids)):
+         maximum2 = (list_diam[i] + list_diam) / diss[medoids[i], :]
+
+         maximum[i] = np.max(maximum2[np.isfinite(maximum2)])
+
+     final_db = np.mean(maximum)
+
+     return {'db': final_db, 'per_cluster': maximum}
+
+
+ def adjpbm_internal(diss, clustering, medoids, p=1, weights=None, medoidclust=False):
+     if weights is None:
+         weights = np.ones(diss.shape[0])
+
+     # Calculate internal distance
+     internaldist = [
+         (sum(weights[clustering == (medoids[i] if medoidclust else i)] * diss[
+             clustering == (medoids[i] if medoidclust else i), i] ** p) /
+          sum(weights[clustering == (medoids[i] if medoidclust else i)])) ** (1 / p)
+         for i in range(len(medoids))
+     ]
+
+     # Calculate the minimum separation distance between medoids
+     separation = np.nanmin(diss[medoids, :][:, medoids])
+
+     # Calculate the PBM-style index value
+     pbm = (1 / len(medoids)) * (separation / np.sum(internaldist))
+
+     return pbm
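Reading note (not part of the packaged file above). With member weights w_x, medoids m_i and power parameter p, davies_bouldin_internal computes a weighted variant of the Davies-Bouldin index of the form

S_i = \left( \frac{\sum_{x \in C_i} w_x \, d(x, m_i)^p}{\sum_{x \in C_i} w_x} \right)^{1/p},
\qquad
\mathrm{DB} = \frac{1}{k} \sum_{i=1}^{k} \max_{j \neq i} \frac{S_i + S_j}{d(m_i, m_j)},

where the maximum over j ≠ i is obtained in the code by discarding the non-finite ratio produced when d(m_i, m_i) = 0.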