sequenzo-0.1.24-cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sequenzo has been flagged as possibly problematic.

Files changed (264)
  1. _sequenzo_fastcluster.cpython-311-darwin.so +0 -0
  2. sequenzo/__init__.py +240 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +474 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +20 -0
  30. sequenzo/data_preprocessing/helpers.py +256 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/mvad.csv +713 -0
  44. sequenzo/datasets/pairfam_family.csv +1867 -0
  45. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  46. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  47. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  48. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  49. sequenzo/define_sequence_data.py +609 -0
  50. sequenzo/dissimilarity_measures/__init__.py +31 -0
  51. sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
  52. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  53. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  54. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  55. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  56. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  57. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  58. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  59. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  60. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  61. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  62. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  63. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  214. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  215. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  216. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
  217. sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
  218. sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
  219. sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
  220. sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
  221. sequenzo/multidomain/__init__.py +23 -0
  222. sequenzo/multidomain/association_between_domains.py +311 -0
  223. sequenzo/multidomain/cat.py +431 -0
  224. sequenzo/multidomain/combt.py +519 -0
  225. sequenzo/multidomain/dat.py +89 -0
  226. sequenzo/multidomain/idcd.py +139 -0
  227. sequenzo/multidomain/linked_polyad.py +292 -0
  228. sequenzo/openmp_setup.py +233 -0
  229. sequenzo/prefix_tree/__init__.py +43 -0
  230. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  231. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  232. sequenzo/prefix_tree/utils.py +54 -0
  233. sequenzo/sequence_characteristics/__init__.py +40 -0
  234. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  235. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  236. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  237. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  238. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  239. sequenzo/sequence_characteristics/turbulence.py +155 -0
  240. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  241. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  242. sequenzo/suffix_tree/__init__.py +48 -0
  243. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  244. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  245. sequenzo/suffix_tree/utils.py +56 -0
  246. sequenzo/visualization/__init__.py +29 -0
  247. sequenzo/visualization/plot_mean_time.py +194 -0
  248. sequenzo/visualization/plot_modal_state.py +276 -0
  249. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  250. sequenzo/visualization/plot_relative_frequency.py +404 -0
  251. sequenzo/visualization/plot_sequence_index.py +951 -0
  252. sequenzo/visualization/plot_single_medoid.py +153 -0
  253. sequenzo/visualization/plot_state_distribution.py +627 -0
  254. sequenzo/visualization/plot_transition_matrix.py +190 -0
  255. sequenzo/visualization/utils/__init__.py +23 -0
  256. sequenzo/visualization/utils/utils.py +310 -0
  257. sequenzo/with_event_history_analysis/__init__.py +35 -0
  258. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  259. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  260. sequenzo-0.1.24.dist-info/METADATA +255 -0
  261. sequenzo-0.1.24.dist-info/RECORD +264 -0
  262. sequenzo-0.1.24.dist-info/WHEEL +5 -0
  263. sequenzo-0.1.24.dist-info/licenses/LICENSE +28 -0
  264. sequenzo-0.1.24.dist-info/top_level.txt +2 -0
sequenzo/big_data/clara/clara.py
@@ -0,0 +1,474 @@
+ """
+ @Author : 李欣怡
+ @File : clara.py
+ @Time : 2024/12/27 12:04
+ @Desc :
+ """
+
+ import gc
+ import os
+ from contextlib import redirect_stdout
+ import warnings
+
+ from joblib import Parallel, delayed
+ from sequenzo.clustering.sequenzo_fastcluster.fastcluster import linkage
+ from scipy.special import comb
+ from itertools import product
+
+ from sequenzo.big_data.clara.utils.aggregatecases import *
+ from sequenzo.big_data.clara.utils.davies_bouldin import *
+ from sequenzo.clustering.KMedoids import *
+ from sequenzo.big_data.clara.utils.get_weighted_diss import *
+
+ from sequenzo.define_sequence_data import SequenceData
+ from sequenzo.dissimilarity_measures.get_distance_matrix import get_distance_matrix
+
+
+ def adjustedRandIndex(x, y=None):
+     if isinstance(x, np.ndarray):
+         x = np.array(x)
+         y = np.array(y)
+         if len(x) != len(y):
+             raise ValueError("Arguments must be vectors of the same length")
+
+         tab = pd.crosstab(x, y)
+     else:
+         tab = x
+
+     if tab.shape == (1, 1):
+         return 1
+
+     # Compute the four components of the ARI: a, b, c, d
+     a = np.sum(comb(tab.to_numpy(), 2))  # number of pairs within each cell
+     b = np.sum(comb(np.sum(tab.to_numpy(), axis=1), 2)) - a
+     c = np.sum(comb(np.sum(tab.to_numpy(), axis=0), 2)) - a
+     d = comb(np.sum(tab.to_numpy()), 2) - a - b - c
+
+     ARI = (a - (a + b) * (a + c) / (a + b + c + d)) / ((a + b + a + c) / 2 - (a + b) * (a + c) / (a + b + c + d))
+     return ARI
+
+
+ def jaccardCoef(tab):
+     if tab.shape == (1, 1):
+         return 1
+
+     # Compute the intersection (n11) and the union terms (n01 and n10)
+     n11 = np.sum(tab.to_numpy() ** 2)  # intersection
+     n01 = np.sum(np.sum(tab.to_numpy(), axis=0) ** 2)  # squared column sums
+     n10 = np.sum(np.sum(tab.to_numpy(), axis=1) ** 2)  # squared row sums
+
+     return n11 / (n01 + n10 - n11)
+
+
+ def clara(seqdata, R=100, kvals=None, sample_size=None, method="crisp", dist_args=None,
+           criteria=["distance"], stability=False, max_dist=None):
+
+     # ==================
+     # Parameter checking
+     # ==================
+     if kvals is None:
+         kvals = range(2, 11)
+
+     if sample_size is None:
+         sample_size = 40 + 2 * max(kvals)
+
+     print("[>] Starting generalized CLARA for sequence analysis.")
+
+     # Check for input data type (should be a sequence object)
+     if not isinstance(seqdata, SequenceData):
+         raise ValueError("[!] 'seqdata' should be SequenceData, check the input format.")
+
+     if max(kvals) > sample_size:
+         raise ValueError("[!] More clusters than the size of the sample requested.")
+
+     allmethods = ["crisp"]
+     if method.lower() not in [m.lower() for m in allmethods]:
+         raise ValueError(f"[!] Unknown method {method}. Please specify one of the following: {', '.join(allmethods)}")
+
+     if method.lower() == "representativeness" and max_dist is None:
+         raise ValueError("[!] You need to set max_dist when using the representativeness method.")
+
+     allcriteria = ["distance", "db", "xb", "pbm", "ams"]
+     criteria = [c.lower() for c in criteria]
+     if not all(c in allcriteria for c in criteria):
+         raise ValueError(
+             f"[!] Unknown criteria among {', '.join(criteria)}. Please specify at least one among {', '.join(allcriteria)}.")
+
+     if dist_args is None:
+         raise ValueError("[!] You need to set the 'dist_args' for get_distance_matrix function.")
+
+     print(f"[>] Using {method} clustering optimizing the following criteria: {', '.join(criteria)}.")
+
+     # FIXME: Add coherence check between method and criteria
+
+     # ===========
+     # Aggregation
+     # ===========
+     number_seq = len(seqdata.seqdata)
+     print(f" - Aggregating {number_seq} sequences...")
+
+     ac = DataFrameAggregator().aggregate(seqdata.seqdata)
+     agseqdata = seqdata.seqdata.iloc[ac['aggIndex'], :]
+     # agseqdata.attrs['weights'] = None
+     ac['probs'] = ac['aggWeights'] / number_seq
+     print(f" - OK ({len(ac['aggWeights'])} unique cases).")
+
+     # Memory cleanup before parallel computation
+     gc.collect()
+     print("[>] Starting iterations...")
+
+     def calc_pam_iter(circle, agseqdata, sample_size, kvals, ac):
+         # Sampling with replacement allows the process to proceed normally
+         # even when the sample size exceeds the dataset size, as samples can be repeatedly drawn.
+         mysample = np.random.choice(len(agseqdata), size=sample_size, p=ac['probs'], replace=True)
+         mysample = pd.DataFrame({'id': mysample})
+
+         # Re-aggregate!
+         ac2 = DataFrameAggregator().aggregate(mysample)
+         data_subset = agseqdata.iloc[mysample.iloc[ac2['aggIndex'], 0], :]
+
+         with open(os.devnull, 'w') as fnull:
+             with redirect_stdout(fnull):
+                 states = np.arange(1, len(seqdata.states)).tolist()
+                 data_subset = SequenceData(data_subset,
+                                            time=seqdata.time,
+                                            states=states)
+                 dist_args['seqdata'] = data_subset
+                 diss = get_distance_matrix(opts=dist_args)
+
+         diss = diss.values
+         _diss = diss.copy()
+         _diss = get_weighted_diss(_diss, ac2['aggWeights'])
+         hc = linkage(_diss, method='ward')
+         del _diss
+
+         # For each number of clusters
+         allclust = []
+
+         for k in kvals:
+             # Weighted PAM clustering on subsample
+             # TODO: hc already provides the chosen medoids, so why is clusterid initialised to -1?
+             #       Reusing the original values seems simpler and, especially when the if branch is
+             #       not entered, it would also avoid out-of-bounds access caused by the -1 entries.
+             clustering = KMedoids(diss=diss, k=k, cluster_only=True, initialclust=hc, weights=ac2['aggWeights'], verbose=False)
+             medoids = mysample.iloc[ac2['aggIndex'][np.unique(clustering)], :]
+             medoids = medoids.to_numpy().flatten()
+
+             del clustering
+
+             # ========================================================
+             # Compute distances between all sequences and the medoids
+             # ========================================================
+             refseq = [list(range(0, len(agseqdata))), medoids.tolist()]
+             with open(os.devnull, 'w') as fnull:
+                 with redirect_stdout(fnull):
+                     states = np.arange(1, len(seqdata.states)).tolist()
+                     agseqdata = SequenceData(agseqdata,
+                                              time=seqdata.time,
+                                              states=states)
+                     dist_args['seqdata'] = agseqdata
+                     dist_args['refseq'] = refseq
+                     diss2 = get_distance_matrix(opts=dist_args)
+                     del dist_args['refseq']
+                     agseqdata = agseqdata.seqdata  # Restore the original DataFrame
+
+             # The two smallest distances are used for the silhouette width
+             # and the other criteria
+             diss2 = diss2.to_numpy()
+             alphabeta = np.array([np.sort(row)[:2] for row in diss2])
+             sil = (alphabeta[:, 1] - alphabeta[:, 0]) / np.maximum(alphabeta[:, 1], alphabeta[:, 0])
+
+             # Allocate to clusters
+             memb = np.argmin(diss2, axis=1)  # Each data point is assigned to its nearest cluster
+
+             mean_diss = np.sum(alphabeta[:, 0] * ac['probs'])
+
+             warnings.filterwarnings('ignore', category=RuntimeWarning)  # Ignore divide-by-zero warnings
+             db = davies_bouldin_internal(diss=diss2, clustering=memb, medoids=medoids, weights=ac['aggWeights'])['db']
+             warnings.resetwarnings()
+             pbm = ((1 / len(medoids)) * (np.max(diss2[medoids]) / mean_diss)) ** 2
+             ams = np.sum(sil * ac['probs'])
+
+             distmed = diss2[medoids, :]
+             distmed_flat = distmed[np.triu_indices_from(distmed, k=1)]  # Take the upper triangular part
+             minsep = np.min(distmed_flat)
+
+             xb = mean_diss / minsep
+
+             del alphabeta
+             del sil
+             del diss2
+             del distmed
+             del minsep
+
+             allclust.append({
+                 'mean_diss': mean_diss,
+                 'db': db,
+                 'pbm': pbm,
+                 'ams': ams,
+                 'xb': xb,
+                 'clustering': memb,
+                 'medoids': medoids
+             })
+
+         del diss
+         gc.collect()
+
+         return allclust
+
+     # Compute in parallel using joblib
+     # the output example of `results`:
+     #   results[0] = all iter1's = [{k=2's}, {k=3's}, ... , {k=10's}]
+     #   results[1] = all iter2's = [{k=2's}, {k=3's}, ... , {k=10's}]
+     results = Parallel(n_jobs=-1)(
+         delayed(calc_pam_iter)(circle=i, agseqdata=agseqdata, sample_size=sample_size, kvals=kvals, ac=ac) for i in range(R))
+     # results = []
+     # for i in range(R):
+     #     res = calc_pam_iter(circle=i,
+     #                         agseqdata=agseqdata,
+     #                         sample_size=sample_size,
+     #                         kvals=kvals,
+     #                         ac=ac)
+     #     results.append(res)
+
+     print(" - Done.")
+     print("[>] Aggregating iterations for each k value...")
+
+     # aggregated output example:
+     #   data[0] = all k=2's = [{when iter1, k=2's}, {when iter2, k=2's}, ... , {when iter100, k=2's}]
+     #   data[1] = all k=3's = [{when iter1, k=3's}, {when iter2, k=3's}, ... , {when iter100, k=3's}]
+     collected_data = [[] for _ in kvals]
+     for iter_result in results:
+         k = 0
+         for item in iter_result:
+             collected_data[k].append(item)
+             k += 1
+
+     kvalscriteria = list(product(range(len(kvals)), criteria))
+     kret = []
+     for item in kvalscriteria:
+         k = item[0]
+         _criteria = item[1]
+
+         mean_all_diss = [d['mean_diss'] for d in collected_data[k]]
+         db_all = [d['db'] for d in collected_data[k]]
+         pbm_all = [d['pbm'] for d in collected_data[k]]
+         ams_all = [d['ams'] for d in collected_data[k]]
+         xb_all = [d['xb'] for d in collected_data[k]]
+         clustering_all_diss = [d['clustering'] for d in collected_data[k]]
+         med_all_diss = [d['medoids'] for d in collected_data[k]]
+
+         # Find best clustering
+         objective = {
+             "distance": mean_all_diss,
+             "pbm": pbm_all,
+             "db": db_all,
+             "ams": ams_all,
+             "xb": xb_all
+         }
+         objective = objective[_criteria]
+         best = np.argmax(objective) if _criteria in ["ams", "pbm"] else np.argmin(objective)
+
+         # Compute clustering stability of the best partition
+         if stability:
+             def process_task(j, clustering_all_diss, ac, best):
+                 df = pd.DataFrame({
+                     'clustering_j': clustering_all_diss[j],        # the j-th clustering
+                     'clustering_best': clustering_all_diss[best],  # the best clustering
+                     'aggWeights': ac['aggWeights']
+                 })
+                 tab = df.groupby(['clustering_j', 'clustering_best'])['aggWeights'].sum().unstack(fill_value=0)
+
+                 val = [adjustedRandIndex(tab), jaccardCoef(tab)]
+                 return val
+
+             arilist = []
+
+             if method in ["noise", "fuzzy"]:
+                 for j in range(R):
+                     val = process_task(j, clustering_all_diss, ac, best)
+                     arilist.append(val)
+             else:
+                 arilist = Parallel(n_jobs=-1)(
+                     delayed(process_task)(j, clustering_all_diss, ac, best) for j in range(R))
+
+             arimatrix = np.vstack(arilist)
+             arimatrix = pd.DataFrame(arimatrix, columns=["ARI", "JC"])
+             ari08 = np.sum(arimatrix.iloc[:, 0] >= 0.8)
+             jc08 = np.sum(arimatrix.iloc[:, 1] >= 0.8)
+
+         else:
+             arimatrix = np.nan
+             ari08 = np.nan
+             jc08 = np.nan
+
+         _clustering = clustering_all_diss[best]
+
+         disagclust = np.full(seqdata.seqdata.shape[0], -1)
+         for i, index in enumerate(ac["disaggIndex"]):
+             disagclust[i] = _clustering[index] + 1  # 1-based index for clusters
+
+         evol_diss = np.maximum.accumulate(objective) if _criteria in ["ams", "pbm"] else np.minimum.accumulate(objective)
+
+         # Store the best solution and evaluations of the others
+         bestcluster = {
+             "medoids": ac["aggIndex"][med_all_diss[best]],
+             "clustering": disagclust,
+             "evol_diss": evol_diss,
+             "iter_objective": objective,
+             "objective": objective[best],
+             "iteration": best,
+             "arimatrix": arimatrix,
+             "criteria": _criteria,
+             "method": method,
+             "avg_dist": mean_all_diss[best],
+             "pbm": pbm_all[best],
+             "db": db_all[best],
+             "xb": xb_all[best],
+             "ams": ams_all[best],
+             "ari08": ari08,
+             "jc08": jc08,
+             "R": R,
+             "k": k
+         }
+
+         # Store computed cluster quality
+         kresult = {
+             "k": k + 2,
+             "criteria": criteria,
+             "stats": [bestcluster["avg_dist"], bestcluster["pbm"], bestcluster["db"], bestcluster["xb"],
+                       bestcluster["ams"], bestcluster["ari08"], bestcluster["jc08"], best],
+             "bestcluster": bestcluster
+         }
+
+         kret.append(kresult)
+
+     def claraObj(kretlines, method, kvals, kret, seqdata):
+         clustering = np.full((seqdata.seqdata.shape[0], len(kvals)), -1)
+         clustering = pd.DataFrame(clustering)
+         clustering.columns = [f"Cluster {val}" for val in kvals]
+         clustering.index = seqdata.ids
+
+         ret = {
+             "kvals": kvals,
+             "clara": {},
+             "clustering": clustering,
+             "stats": np.full((len(kvals), 8), -1, dtype=float)
+         }
+
+         for i in kretlines:
+             k = kret[i]['k'] - 2  # start from 0, not 2
+             ret['stats'][k, :] = np.array(kret[i]['stats'])
+             ret['clara'][k] = kret[i]['bestcluster']
+
+             ret['clustering'].iloc[:, k] = kret[i]['bestcluster']['clustering']
+
+         ret['stats'] = pd.DataFrame(ret['stats'],
+                                     columns=["Avg dist", "PBM", "DB", "XB", "AMS", "ARI>0.8", "JC>0.8", "Best iter"])
+         ret['stats'].insert(0, "Number of Clusters", [f"Cluster {k}" for k in kvals])
+         ret['stats']["k_num"] = kvals
+
+         return ret
+
+     if len(criteria) > 1:
+         ret = {
+             'param': {
+                 'criteria': criteria,
+                 'pam_combine': False,
+                 'all_criterias': criteria,
+                 'kvals': kvals,
+                 'method': method,
+                 'stability': stability
+             }
+         }
+
+         for meth in criteria:
+             indices = np.where(np.array([tup[1] for tup in kvalscriteria]) == meth)[0]
+             ret[meth] = claraObj(kretlines=indices, method=method, kvals=kvals, kret=kret, seqdata=seqdata)
+
+         allstats = {}
+
+         for meth in criteria:
+             stats = pd.DataFrame(ret[meth]['stats'])
+             stats['criteria'] = meth
+
+             allstats[meth] = stats
+
+         ret['allstats'] = pd.concat(allstats.values(), ignore_index=False)
+     else:
+         ret = claraObj(kretlines=range(len(kvalscriteria)), method=method, kvals=kvals, kret=kret, seqdata=seqdata)
+
+     print(" - Done.")
+
+     return ret
+
+
+ if __name__ == '__main__':
+     from sequenzo import *  # Social sequence analysis
+     import pandas as pd     # Import necessary packages
+
+     # TODO: the membership matrix returned by clara needs to be transposed, because the
+     #       id_group_df parameter of plot_sequence_index expects cluster ids as rows and ids as columns.
+
+     # ===============================
+     # Sohee
+     # ===============================
+     # df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
+     # time_list = list(df.columns)[1:133]
+     # states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
+     # # states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
+     # labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
+     # sequence_data = SequenceData(df, time=time_list, time_type="age", states=states, labels=labels, id_col="PID")
+
+     # om.to_csv("D:/college/research/QiQi/sequenzo/files/sequenzo_Sohee_string_OM_TRATE.csv", index=True)
+
+     # ===============================
+     # kass
+     # ===============================
+     # df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
+     # time_list = list(df.columns)[1:]
+     # states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
+     #           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
+     # sequence_data = SequenceData(df, time=time_list, time_type="year", states=states, id_col="COUNTRY")
+
+     # ===============================
+     # CO2
+     # ===============================
+     # df = pd.read_csv("D:/country_co2_emissions_missing.csv")
+     # time = list(df.columns)[1:]
+     # states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
+     # sequence_data = SequenceData(df, time_type="age", time=time, id_col="country", states=states)
+
+     # ===============================
+     # detailed
+     # ===============================
+     # df = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
+     # time = list(df.columns)[4:]
+     # states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
+     # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
+     #                              time=time, id_col="worker_id", states=states)
+
+     # ===============================
+     # broad
+     # ===============================
+     # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
+     # time = list(df.columns)[4:]
+     # states = ['Non-computing', 'Non-technical computing', 'Technical computing']
+     # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
+     #                              time_type="age", time=time, id_col="worker_id", states=states)
+
+     df = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data/not_real_detailed_data/synthetic_detailed_U5_N1000.csv")
+     _time = list(df.columns)[2:]
+     states = ["Data", "Data science", "Hardware", "Research", "Software", "Support & test", "Systems & infrastructure"]
+     df = df[['id', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']]
+     sequence_data = SequenceData(df, time=_time, id_col="id", states=states)
+
+     result = clara(sequence_data,
+                    R=250,
+                    sample_size=500,
+                    kvals=range(2, 6),
+                    criteria=['distance'],
+                    dist_args={"method": "OM", "sm": "CONSTANT", "indel": 1},
+                    stability=True)
+
+     # print(result)
+     print(result['stats'])
sequenzo/big_data/clara/utils/__init__.py
@@ -0,0 +1,27 @@
+ """
+ @Author : 李欣怡
+ @File : __init__.py
+ @Time : 2025/2/28 00:30
+ @Desc :
+ """
+ from .aggregatecases import *
+ from .davies_bouldin import *
+ from .wfcmdd import *
+ from sequenzo.clustering.KMedoids import KMedoids
+
+
+ def _import_c_code():
+     """Lazily import the c_code module to avoid circular dependencies during installation"""
+     try:
+         from sequenzo.clustering import clustering_c_code
+         return clustering_c_code
+     except ImportError:
+         # If the C extension cannot be imported, return None
+         print(
+             "Warning: The C++ extension (c_code) could not be imported. Please ensure the extension module is compiled correctly.")
+         return None
+
+
+ __all__ = [
+     'KMedoids'
+ ]
sequenzo/big_data/clara/utils/aggregatecases.py
@@ -0,0 +1,92 @@
+ """
+ @Author : 李欣怡
+ @File : aggregatecases.py
+ @Time : 2024/12/27 10:12
+ @Desc :
+ """
+ import pandas as pd
+ import numpy as np
+
+
+ class WcAggregateCases:
+     def aggregate(self, x, weights=None, **kwargs):
+         """
+         The appropriate aggregation method is invoked dynamically depending on the type of x
+         """
+         method_name = f"aggregate_{type(x).__name__}"
+         method = getattr(self, method_name, None)
+
+         if method is None:
+             raise NotImplementedError(f"No aggregation method for type {type(x).__name__}")
+
+         return method(x, weights, **kwargs)
+
+
+ class WcAggregateCasesInternal:
+     def aggregate(self, x, weights=None):
+         x = pd.DataFrame(x)
+         lx = len(x)
+
+         if weights is None:
+             weights = np.ones(lx)
+
+         ids = x.apply(lambda row: "@@@WC_SEP@@".join(row.astype(str)), axis=1)
+
+         mcorr = [np.nan] * lx
+
+         def _compute_weight_each_group_and_sum(group):
+             first_element = group.iloc[0]
+
+             # Map every row in the group to the group's first (representative) row
+             for idx in group:
+                 mcorr[idx] = first_element
+             weighted_sum = np.sum(weights[group])
+             return [first_element, weighted_sum]
+
+         df = pd.DataFrame({
+             'index': range(0, lx),
+             'id': ids
+         })
+
+         grouped = df.groupby('id')['index'].apply(_compute_weight_each_group_and_sum)
+
+         agg_df = pd.DataFrame(grouped.tolist(), columns=['aggIndex', 'aggWeights'])
+
+         aggIndex = agg_df['aggIndex']
+         mcorr2 = [aggIndex[aggIndex == val].index[0] if val in aggIndex.values else -1 for val in mcorr]
+
+         ret = {
+             "aggIndex": agg_df['aggIndex'].values,
+             "aggWeights": agg_df['aggWeights'].values,
+             "disaggIndex": mcorr2,
+             "disaggWeights": weights
+         }
+
+         return ret
+
+
+ class DataFrameAggregator(WcAggregateCases):
+     def aggregate_DataFrame(self, x, weights=None, **kwargs):
+         internal = WcAggregateCasesInternal()
+         return internal.aggregate(x, weights)
+
+
+ class MatrixAggregator(WcAggregateCases):
+     def aggregate_ndarray(self, x, weights=None, **kwargs):
+         internal = WcAggregateCasesInternal()
+         return internal.aggregate(x, weights)
+
+
+ class StsListAggregator(WcAggregateCases):
+     def aggregate_stslist(self, x, weights=None, weighted=True, **kwargs):
+         if weights is None and weighted:
+             weights = getattr(x, "weights", None)
+         internal = WcAggregateCasesInternal()
+         return internal.aggregate(x, weights)
+
+
+ # Print function (for output)
+ def print_wcAggregateCases(result):
+     print(f"Number of disaggregated cases: {len(result['disaggWeights'])}")
+     print(f"Number of aggregated cases: {len(result['aggWeights'])}")
+     print(f"Average aggregated cases: {len(result['disaggWeights']) / len(result['aggWeights'])}")
+     print(f"Average (weighted) aggregation: {np.mean(result['aggWeights'])}")
sequenzo/big_data/clara/utils/davies_bouldin.py
@@ -0,0 +1,91 @@
+ """
+ @Author : 李欣怡
+ @File : davies_bouldin.py
+ @Time : 2024/12/27 17:56
+ @Desc :
+     :param
+         diss : 2D numpy array, the distance matrix
+         clustering : 1D numpy array, the cluster membership of each data point
+                      (one possibility: at initialisation each point forms its own cluster, so every point belongs to itself)
+         medoids : 1D numpy array, the medoids of the clusters
+ """
+ import numpy as np
+
+
+ def davies_bouldin_internal(diss, clustering, medoids, p=1, weights=None, medoidclust=False):
+     # If weights are not provided, use uniform weights
+     if weights is None:
+         weights = np.ones(diss.shape[0])
+
+     list_diam = np.zeros(len(medoids))
+
+     # Calculate the diameter for each medoid
+     for i in range(len(medoids)):
+         medi = medoids[i] if medoidclust else i
+         cond = (clustering == medi)
+
+         # Calculate the diameter (weighted distance)
+         list_diam[i] = (np.sum(weights[cond] * diss[cond, i] ** p) / np.sum(weights[cond])) ** (1 / p)
+
+     maximum = np.zeros(len(medoids))
+
+     # Calculate the maximum ratio for each medoid
+     for i in range(len(medoids)):
+         # Calculate the distance to other medoids
+         maximum2 = (list_diam[i] + list_diam) / diss[medoids[i], :]
+
+         # Take the maximum of the valid (finite) values
+         # ensure values for "same" medoids
+         maximum[i] = np.max(maximum2[np.isfinite(maximum2)])
+
+     # Calculate the final Davies-Bouldin index (average of maximum values)
+     final_db = np.mean(maximum)
+
+     return {'db': final_db, 'per_cluster': maximum}
+
+
+ def fuzzy_davies_bouldin_internal(diss, memb, medoids, weights=None):
+     if weights is None:
+         weights = np.ones(diss.shape[0])
+
+     # Defined in the R code but never used as such; it is overwritten by the assignment below
+     list_diam = np.zeros(len(medoids))
+
+     # Defined but unused in the R code
+     # n = np.sum(weights)
+
+     mw = memb * weights[:, None]
+     list_diam = np.sum(mw * diss, axis=0) / np.sum(mw, axis=0)
+
+     # Array holding the maximum ratio for each cluster
+     maximum = np.zeros(len(medoids))
+
+     # For each cluster, compute its similarity to the other clusters
+     for i in range(len(medoids)):
+         maximum2 = (list_diam[i] + list_diam) / diss[medoids[i], :]
+
+         maximum[i] = np.max(maximum2[np.isfinite(maximum2)])
+
+     final_db = np.mean(maximum)
+
+     return {'db': final_db, 'per_cluster': maximum}
+
+
+ def adjpbm_internal(diss, clustering, medoids, p=1, weights=None, medoidclust=False):
+     if weights is None:
+         weights = np.ones(diss.shape[0])
+
+     # Calculate internal distance
+     internaldist = [
+         (sum(weights[clustering == (medoids[i] if medoidclust else i)] * diss[
+             clustering == (medoids[i] if medoidclust else i), i] ** p) /
+          sum(weights[clustering == (medoids[i] if medoidclust else i)])) ** (1 / p)
+         for i in range(len(medoids))
+     ]
+
+     # Calculate the minimum separation distance between medoids
+     separation = np.nanmin(diss[medoids, :][:, medoids])
+
+     # Calculate the PBM index
+     pbm = (1 / len(medoids)) * (separation / np.sum(internaldist))
+
+     return pbm