sequenzo 0.1.21__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (260) hide show
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,404 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : plot_relative_frequency.py
4
+ @Time : 06/02/2025 10:17
5
+ @Desc :
6
+ Generate sequence relative frequency plots with medoids and dissimilarities.
7
+ TODO: Update the xticks.
8
+ """
9
+ import pandas as pd
10
+ import numpy as np
11
+ from scipy.stats import f_oneway
12
+ # from sklearn.preprocessing import StandardScaler
13
+
14
+ import matplotlib.pyplot as plt
15
+ from matplotlib.patches import Rectangle
16
+ import seaborn as sns
17
+
18
+ from sequenzo.define_sequence_data import SequenceData
19
+ from sequenzo.visualization.utils import (
20
+ save_and_show_results,
21
+ set_up_time_labels_for_x_axis,
22
+ show_plot_title
23
+ )
24
+
25
+
26
+ # Delay imports to avoid circular dependency issues during installation
27
def _get_standard_scaler():
    """
    Lazily import and return scikit-learn's ``StandardScaler`` class.

    The import happens at call time instead of module import time (the file
    notes this is to avoid circular dependency issues during installation).

    :return: The ``StandardScaler`` class, or ``None`` (after printing a warning)
             when scikit-learn cannot be imported.
    """
    try:
        from sklearn.preprocessing import StandardScaler
    except ImportError:
        # The failure here is an import failure, not an installation step,
        # so the message says so.
        print(
            "Warning: Unable to import StandardScaler. "
            "Please ensure that you have installed scikit-learn successfully.")
        return None
    return StandardScaler
35
+
36
+
37
def plot_relative_frequency(seqdata: SequenceData,
                            distance_matrix: np.ndarray,
                            num_groups: int = 12,
                            weights="auto",
                            grouping_method="first",
                            fontsize=12,
                            save_as=None,
                            dpi=200,
                            show_title: bool = True):
    """
    Generate a sequence relative frequency (seqrf) plot.

    The figure has two panels: the medoid sequence of each frequency group on
    the left, and a horizontal box plot of the within-group dissimilarities to
    that medoid on the right. A state legend and a line of representation
    quality statistics are drawn below the panels.

    :param seqdata: (SequenceData) The SequenceData object.
    :param distance_matrix: (np.ndarray or pd.DataFrame) A 2D pairwise distance matrix.
        A DataFrame is converted to a NumPy array.
    :param num_groups: (int) Number of frequency groups.
    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses
        seqdata.weights if available.
    :param grouping_method: (str) Grouping method: "first" (equal size) or "prop"
        (weighted grouping). When weights are in effect, "first" is switched to "prop".
    :param fontsize: (int) Base font size; titles use ``fontsize + 2`` and tick
        labels use ``fontsize - 2``.
    :param save_as: (str, optional) File path to save the plot.
    :param dpi: (int) Resolution of the saved plot.
    :param show_title: (bool) Whether to draw the title of the medoid panel.

    :raises ValueError: If the number of weights differs from the number of sequences.
    """
    if isinstance(distance_matrix, pd.DataFrame):
        distance_matrix = distance_matrix.to_numpy()

    # Process weights
    if isinstance(weights, str) and weights == "auto":
        weights = getattr(seqdata, "weights", None)

    if weights is not None:
        weights = np.asarray(weights, dtype=float).reshape(-1)
        if len(weights) != len(seqdata.values):
            raise ValueError("Length of weights must equal number of sequences.")

    # Auto-switch to weighted grouping if weights are provided
    if weights is not None and grouping_method == "first":
        grouping_method = "prop"

    # Compute medoids and dissimilarities
    rep_sequences, dissimilarities, group_labels = _compute_seqrf(
        seqdata, distance_matrix, num_groups,
        weights=weights, grouping_method=grouping_method
    )

    # Fixed width; the height grows with the number of representative sequences
    num_seq = len(rep_sequences)
    fig_width = 14
    fig_height = max(6, num_seq / 20)

    fig, axes = plt.subplots(1, 2, figsize=(fig_width, fig_height), gridspec_kw={'width_ratios': [2.5, 1]})
    sns.set_palette("muted")

    # Use color mapping stored in SequenceData
    state_palette = seqdata.color_map

    # Both panels show one y tick every `tick_step` groups, labelled from 1
    tick_step = max(1, num_groups // 10)
    group_tick_labels = range(1, num_groups + 1, tick_step)

    # ---- LEFT PLOT: group medoids (sequence index plot) ----
    ax = axes[0]
    for i, seq in enumerate(rep_sequences):
        for t, state_idx in enumerate(seq):
            # Look the color up directly by the integer state code
            color = state_palette.get(state_idx, "gray")
            ax.add_patch(Rectangle((t, i), 1, 1, color=color))

    ax.set_xlim(0, seqdata.values.shape[1])
    ax.set_ylim(0, len(rep_sequences))

    # Add weight information to the title if non-unit weights are used
    if show_title:
        if weights is not None and not np.allclose(weights, 1.0):
            total_w = float(np.sum(weights))
            title_text = f"Group Medoids (n={len(seqdata.values)}, total weight={total_w:.1f})"
        else:
            title_text = f"Group Medoids (n={len(seqdata.values)})"
        show_plot_title(ax, title_text, show=True, fontsize=fontsize+2)
    ax.set_xlabel("Time", fontsize=fontsize)
    ax.set_ylabel("Frequency Group", fontsize=fontsize)

    # X-axis labels
    # TODO: stop-gap. Per the original author's note, time labels are numeric in
    # one tutorial (quickstart) and strings in another (multidomain
    # main_tutorial), so the ticks are built by hand here.
    # Only show a subset of the xticks to avoid overcrowding.
    xtick_positions = np.arange(len(seqdata.cleaned_time))
    skip = max(1, len(seqdata.cleaned_time) // 8)  # show every `skip`-th label (tunable)
    visible_positions = xtick_positions[::skip]
    visible_labels = [seqdata.cleaned_time[i] for i in visible_positions]

    ax.set_xticks(visible_positions)
    ax.set_xticklabels(visible_labels, fontsize=fontsize-2, rotation=0, ha='right', color='gray')

    # Y-axis labels
    ax.set_yticks(range(0, num_groups, tick_step))
    ax.set_yticklabels(group_tick_labels, fontsize=fontsize-2, color='gray')

    # Remove unwanted black outlines
    for side in ("top", "right", "left", "bottom"):
        ax.spines[side].set_visible(False)

    # ---- RIGHT PLOT: dissimilarity box plot ----
    box_ax = axes[1]
    box_ax.boxplot(
        dissimilarities,
        vert=False,  # Horizontal box plot
        patch_artist=True,  # Allow fill color
        boxprops=dict(facecolor='lightblue', edgecolor='gray', linewidth=1),  # Box style
        whiskerprops=dict(color='gray', linewidth=1),  # Whisker style
        capprops=dict(color='gray', linewidth=1),  # Cap line style
        medianprops=dict(color='red', linewidth=2),  # Median line style
        flierprops=dict(marker='o', markerfacecolor='gray', markersize=5, markeredgecolor='none')  # Outlier style
    )

    # Y-axis labels. boxplot() draws the boxes at positions 1..N by default, so
    # the ticks must sit at 1..N too; starting them at 0 would label every box
    # one group too high and leave the last box without a tick.
    box_ax.set_yticks(range(1, num_groups + 1, tick_step))
    box_ax.set_yticklabels(group_tick_labels, fontsize=fontsize-2, color='black')

    # Keep only the left and bottom axes visible
    box_ax.spines["top"].set_visible(False)
    box_ax.spines["right"].set_visible(False)
    box_ax.spines["left"].set_visible(True)
    box_ax.spines["bottom"].set_visible(True)

    # Set titles and labels
    box_ax.set_title("Dissimilarities to Medoid", fontsize=fontsize+2)
    box_ax.set_xlabel("Dissimilarity", fontsize=fontsize)
    box_ax.set_ylabel("Group", fontsize=fontsize)

    # Adjust layout
    # TODO: known trouble spot per the original author's note — the layout
    # misbehaves with many states (quickstart) but is fine with few states
    # (Tutorial/multidomain/main_tutorial).
    num_legend_items = len(state_palette)
    bottom_margin = min(0.33, 0.17 + num_legend_items * 0.015)
    plt.subplots_adjust(bottom=bottom_margin, wspace=0.4)

    # ---- Representation quality stats ----
    r_squared, f_statistic, p_value = _compute_r2_f_statistic(distance_matrix, group_labels)

    # Compute significance level for p-value (show as *, **, ***)
    def get_p_value_stars(p_value):
        if p_value < 0.001:
            return "***"
        elif p_value < 0.01:
            return "**"
        elif p_value < 0.05:
            return "*"
        else:
            return ""

    # Format p-value for display
    p_value_stars = get_p_value_stars(p_value)
    p_value_text = f"p = {p_value:.2e} {p_value_stars}"

    # Explanation of p-value significance levels
    stars_explanation = "*: p < 0.05, **: p < 0.01, ***: p < 0.001"

    stats_text = (f"Representation quality: Pseudo/medoid-based R² = {r_squared:.2f}, F statistic = {f_statistic:.2f}, "
                  f"{p_value_text} ({stars_explanation})")

    # ---- Legend below the plots ----
    legend_patches = [
        Rectangle((0, 0), 1, 1, color=seqdata.color_map_by_label[label], label=label)
        for label in seqdata.labels
    ]

    # Estimate how many rows are needed for the legend
    max_items_per_row = 5
    n_states = len(seqdata.states)
    ncol = min(max_items_per_row, n_states)
    nrow = (n_states + max_items_per_row - 1) // max_items_per_row  # ceiling division

    fig.legend(
        handles=legend_patches,
        loc='lower center',
        ncol=ncol,
        fontsize=fontsize,
        frameon=False,
        # Shift the legend up as rows are added so it does not cover the stats text
        bbox_to_anchor=(0.5, 0.05 + 0.015 * (nrow - 1))
    )

    # Display statistical information below the legend
    plt.figtext(
        0.5, 0.02,  # Adjust position, place below the legend
        stats_text,
        ha="center",
        fontsize=fontsize,
        color="black"
    )

    # ---- Save or show plot ----
    save_and_show_results(save_as, dpi)
236
+
237
+
238
def _compute_seqrf(seqdata: SequenceData, distance_matrix: np.ndarray, n_groups: int = 10,
                   weights: np.ndarray = None, grouping_method: str = "first"):
    """
    Compute the representative sequences (medoids) for each frequency group in a SequenceData object.

    Sequences are ordered along the first classical-MDS axis of the distance
    matrix, cut into ``n_groups`` consecutive groups, and the weighted medoid
    of every group is selected as its representative.

    :param seqdata: A SequenceData object; only ``seqdata.values`` is read.
    :param distance_matrix: A 2D pairwise distance matrix.
    :param n_groups: The number of frequency groups to divide sequences into.
        Must lie between 1 and the number of sequences.
    :param weights: Optional weight vector for sequences (one entry per sequence).
        Defaults to equal weights.
    :param grouping_method: Grouping method, either "first" (equal size, the last
        group absorbs the remainder) or "prop" (groups of roughly equal total weight).

    :return: A tuple ``(rep_sequences, dissimilarities, group_labels)``
        - rep_sequences: Rows of ``seqdata.values`` for the medoid of each group.
        - dissimilarities: A list with one 1D array per group, holding the distances
          of the group's sequences to the group medoid.
        - group_labels: Float array with the group number of each sequence.

    :raises ValueError: If ``n_groups`` is out of range, ``weights`` has the wrong
        length, ``grouping_method`` is unknown, or a group ends up empty.
    :raises ImportError: If scikit-learn's StandardScaler is not available.
    """
    n_sequences = seqdata.values.shape[0]

    # Fail early with a readable message; an out-of-range n_groups previously
    # surfaced as ZeroDivisionError / "argmin of an empty sequence".
    if n_groups < 1 or n_groups > n_sequences:
        raise ValueError(
            f"n_groups must be between 1 and the number of sequences ({n_sequences}), got {n_groups}."
        )

    if weights is None:
        weights = np.ones(n_sequences)  # Default to equal weights
    else:
        weights = np.asarray(weights, dtype=float)
        if weights.shape[0] != n_sequences:
            raise ValueError(
                f"weights must contain one entry per sequence ({n_sequences}), got {weights.shape[0]}."
            )

    # **Step 1: Compute MDS using cmdscale()**
    mds_coords = _cmdscale(distance_matrix)  # Classic MDS
    if mds_coords.shape[1] == 0:
        # No positive eigenvalue (e.g. all distances are zero): every sequence
        # sits at the same position on the axis.
        mds_coords_1d = np.zeros(n_sequences)
    else:
        mds_coords_1d = mds_coords[:, 0]  # Take only 1D result

    # **Step 2: Standardize MDS coordinates and sort**
    # Fetch the StandardScaler class through the module's lazy accessor.
    scaler_class = _get_standard_scaler()
    if scaler_class is None:
        raise ImportError("需要 scikit-learn 来执行此功能。请安装: pip install scikit-learn")
    scaler = scaler_class()  # Instantiate the scaler
    mds_coords_1d = scaler.fit_transform(mds_coords_1d.reshape(-1, 1)).flatten()

    # Eigenvector direction in np.linalg.eigh() may differ from R, causing cmdscale() to output reversed coordinates.
    # NOTE(review): the sign of an eigenvector is arbitrary, so this unconditional
    # flip is a convention rather than a guarantee of matching R - confirm.
    mds_coords_1d = -mds_coords_1d  # Reverse direction
    sorted_indices = np.argsort(mds_coords_1d)  # Sort in ascending order

    # **Step 3: Perform grouping based on different methods**
    if grouping_method == "first":
        # **Divide evenly; the last group also takes the remainder**
        group_size = n_sequences // n_groups
        frequency_groups = [
            sorted_indices[i * group_size:(i + 1) * group_size] for i in range(n_groups - 1)
        ]
        frequency_groups.append(sorted_indices[(n_groups - 1) * group_size:])

    elif grouping_method == "prop":
        # **Divide based on weights**
        cumweights = np.cumsum(weights[sorted_indices])
        wsum = np.sum(weights)
        gsize = wsum / n_groups  # Target weight for each group

        frequency_groups = []
        start_idx = 0
        for i in range(1, n_groups):
            # side="right" keeps a sequence whose cumulative weight lands exactly
            # on the boundary inside the group it completes; the default "left"
            # pushed it into the next group (e.g. 4/6 instead of 5/5 for ten
            # unit weights and two groups).
            end_idx = np.searchsorted(cumweights, i * gsize, side="right")
            frequency_groups.append(sorted_indices[start_idx:end_idx])
            start_idx = end_idx
        frequency_groups.append(sorted_indices[start_idx:])  # Last group includes remaining data

    else:
        raise ValueError("Invalid grouping_method! Use 'first' or 'prop'.")

    # A medoid cannot be computed for an empty group; report it explicitly.
    if any(len(group) == 0 for group in frequency_groups):
        raise ValueError(
            "At least one frequency group is empty; reduce n_groups or check the weights."
        )

    # **Step 4: Compute the medoid for each group**
    medoid_indices = np.array([
        _compute_group_medoid(distance_matrix, group, weights[group]) for group in frequency_groups
    ])
    rep_sequences = seqdata.values[medoid_indices]

    # **Step 5: Compute distances to medoid for each group**
    dissimilarities = [
        distance_matrix[group, medoid_idx]
        for group, medoid_idx in zip(frequency_groups, medoid_indices)
    ]

    # **Step 6: Assign group labels**
    group_labels = np.zeros(n_sequences)
    for i, group in enumerate(frequency_groups):
        group_labels[group] = i

    return rep_sequences, dissimilarities, group_labels
321
+
322
+
323
def _cmdscale(D):
    """
    Classic Multidimensional Scaling (MDS), equivalent to R's cmdscale()
    How Traminer uses cmdscale(): https://github.com/cran/TraMineR/blob/master/R/dissrf.R

    :param D: A NxN symmetric distance matrix (any array-like convertible to a
        float ndarray).
    :return: Y, a Nxd coordinate matrix, where d is the number of strictly
        positive eigenvalues of the double-centered matrix. Columns are ordered
        by decreasing eigenvalue, so column 0 is the principal axis.
    """
    D = np.asarray(D, dtype=float)
    n = D.shape[0]

    # Step 1: Compute the centering matrix
    H = np.eye(n) - np.ones((n, n)) / n

    # Step 2: Compute the double centered (squared) distance matrix
    B = -0.5 * H @ (D ** 2) @ H

    # Step 3: Compute eigenvalues and eigenvectors (eigh returns them ascending)
    eigvals, eigvecs = np.linalg.eigh(B)

    # Step 4: Sort eigenvalues and eigenvectors in descending order
    order = np.argsort(eigvals)[::-1]
    eigvals = eigvals[order]
    eigvecs = eigvecs[:, order]

    # Step 5: Keep only the axes with positive eigenvalues and scale each
    # eigenvector by sqrt(eigenvalue). Broadcasting over columns is equivalent
    # to multiplying by diag(sqrt(eigvals)) without building the dense matrix.
    positive = eigvals > 0
    return eigvecs[:, positive] * np.sqrt(eigvals[positive])
354
+
355
+
356
def _compute_group_medoid(distance_matrix: np.ndarray, group_indices: np.ndarray, weights: np.ndarray = None) -> int:
    """Return the weighted medoid of one frequency group.

    The medoid is the group member whose weighted sum of distances to all
    members of the same group is smallest (mirrors R's disscenter()).

    :param distance_matrix: (np.ndarray) A 2D symmetric pairwise distance matrix.
    :param group_indices: (np.ndarray) Indices of the sequences that form the group.
    :param weights: (np.ndarray, optional) One weight per entry of ``group_indices``.
        Equal weights are used when omitted.

    :return: (int)
        The element of ``group_indices`` with the minimum weighted distance sum.
    """
    member_weights = np.ones(len(group_indices)) if weights is None else weights

    # Restrict the matrix to the rows and columns of the group members.
    within = distance_matrix[np.ix_(group_indices, group_indices)]

    # Entry m holds sum_i w_i * D(i, m), i.e. the cost of choosing member m.
    candidate_costs = within.T @ member_weights

    best_position = np.argmin(candidate_costs)
    return group_indices[best_position]
378
+
379
+
380
def _compute_r2_f_statistic(distance_matrix: np.ndarray, group_labels: np.ndarray):
    """
    Compute the pseudo R² and F-statistic for sequence frequency grouping.

    Each group is represented by the rows of ``distance_matrix`` that belong to
    its members; the one-way ANOVA is run on the flattened rows of every group
    that has more than one member.

    NOTE(review): the between-group sum weights each group by its number of
    rows, while the denominator is the variance over all matrix entries, so the
    returned ratio is not a conventional SS_between / SS_total and is not
    guaranteed to lie in [0, 1] - confirm the intended definition.

    :param distance_matrix: (np.ndarray) A 2D pairwise distance matrix.
    :param group_labels: (np.ndarray) Group assignment of each sequence, aligned
        with the rows of ``distance_matrix``.

    :return: (Tuple[float, float, float]) ``(r_squared, f_statistic, p_value)``.
        ``f_statistic`` and ``p_value`` are NaN when fewer than two groups have
        more than one member; ``r_squared`` is 0.0 when the matrix has no variance.
    """
    # A plain list compared with ``==`` would yield a single bool, not a mask.
    group_labels = np.asarray(group_labels)

    grand_mean = np.mean(distance_matrix)
    total_var = np.var(distance_matrix)

    # Select the rows of every group once and reuse them below.
    group_rows = [distance_matrix[group_labels == g] for g in np.unique(group_labels)]

    ss_between = sum(len(rows) * (np.mean(rows) - grand_mean) ** 2 for rows in group_rows)

    # Ensure valid ANOVA conditions: only groups with more than one member
    valid_groups = [rows.flatten() for rows in group_rows if len(rows) > 1]
    if len(valid_groups) > 1:
        f_statistic, p_value = f_oneway(*valid_groups)
    else:
        f_statistic, p_value = np.nan, np.nan

    r_squared = float(ss_between / total_var) if total_var > 0 else 0.0
    return r_squared, float(f_statistic), float(p_value)