sequenzo 0.1.21__cp39-cp39-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. More details are available from the link on the original registry diff page.

Files changed (260)
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-39-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,431 @@
1
+ """
2
+ @Author : 李欣怡
3
+ @File : cat.py
4
+ @Time : 2025/4/8 09:06
5
+ @Desc : Build multidomain (MD) sequences of combined individual domain states (expanded alphabet),
6
+ derive multidomain indel and substitution costs from domain costs by means of an additive trick (CAT),
7
+ and compute OM pairwise distances using CAT costs.
8
+ """
9
+ import numpy as np
10
+ import pandas as pd
11
+ from typing import List, Union, Optional
12
+ import contextlib
13
+ import io
14
+
15
+ from sequenzo.define_sequence_data import SequenceData
16
+ from sequenzo.dissimilarity_measures.utils import seqlength
17
+ from sequenzo.dissimilarity_measures import get_distance_matrix, get_substitution_cost_matrix
18
+
19
+
20
+ def compute_cat_distance_matrix(channels: List[SequenceData],
21
+ method: Optional[str] = None,
22
+ norm: str = "none",
23
+ indel: Union[float, np.ndarray, List[Union[float, List[float]]]] = "auto",
24
+ sm: Optional[Union[List[str], List[np.ndarray]]] = None,
25
+ with_missing: Optional[Union[bool, List[bool]]] = None,
26
+ full_matrix: bool = True,
27
+ link: str = "sum",
28
+ cval: float = 2,
29
+ miss_cost: float = 2,
30
+ cweight: Optional[List[float]] = None,
31
+ what: str = "MDseq",
32
+ ch_sep: str = "+"):
33
+ """
34
+ mulitdomain sequences analysis, you can get:
35
+ - multi-domain sequences ('MDseq')
36
+ - multi-domain substitution and indel costs ('cost')
37
+ - multi-domain distance_matrix ('diss')
38
+
39
+ :param channels: A list of domain state sequence stslist objects defined with the define_sequences_data function
40
+ :param method: Dissimilarity measure between sequences.
41
+ :param norm: The normalization method to use. Ignored if what is not "diss".
42
+ :param indel: An insertion/deletion cost or a vector of state dependent indel costs for each domain.
43
+ :param sm: A list with a substitution-cost matrix for each domain,
44
+ or a list of method names for generating the domain substitution costs
45
+ :param with_missing: Whether consider missing values
46
+ :param full_matrix: the full distance matrix between MD sequences is returned.
47
+ :param link: Method to compute the "link" between domains.
48
+ :param cval: Domain substitution cost for "CONSTANT" matrix, for seqcost
49
+ :param miss_cost: Cost to substitute missing values at domain level, for seqcost
50
+ :param cweight: A vector of domain weights.
51
+ :param what: What output should be returned?
52
+ :param ch_sep: Separator used for building state names of the expanded alphabet.
53
+ """
54
+
55
+ # ==================
56
+ # Checking Arguments
57
+ # ==================
58
+ if what == "sm":
59
+ print("[!] what='sm' deprecated! Use what='cost' instead.")
60
+ what = "cost"
61
+ elif what == "seqmc":
62
+ print("[!] what='seqmc' deprecated! Use what='MDseq' instead.")
63
+ what = "MDseq"
64
+
65
+ valid_whats = ["MDseq", "cost", "diss"]
66
+ if what not in valid_whats:
67
+ raise ValueError(f"[!] 'what' should be one of {valid_whats}.")
68
+
69
+ if what == "diss" and not method:
70
+ raise ValueError("[!] A valid 'method' must be provided when what = 'diss'.")
71
+ if what == "cost" and sm is None:
72
+ raise ValueError("[!] 'sm' cannot be NULL when what = 'cost'.")
73
+
74
+ nchannels = len(channels)
75
+ if nchannels < 2:
76
+ raise ValueError("[!] Please specify at least two domains.")
77
+
78
+ # Check cweight
79
+ if cweight is None:
80
+ cweight = np.repeat(1.0, nchannels)
81
+
82
+ # If time varying sm are provided, all sm must be 3-dimensional
83
+ timeVarying = False
84
+ if isinstance(sm, list) and isinstance(sm[0], np.ndarray):
85
+ ndims = [arr.ndim for arr in sm]
86
+ if any(d == 3 for d in ndims) and not all(d == 3 for d in ndims):
87
+ raise ValueError("[!] One sm is 3-dimensional and some are not.")
88
+
89
+ if ndims[0] == 3:
90
+ timeVarying = True
91
+
92
+ # Check indel
93
+ # Convert all elements in indel(list) to list
94
+ if isinstance(indel, (float, int)):
95
+ indel = [indel] * nchannels
96
+ indel = [[x] for x in indel]
97
+
98
+ if isinstance(indel, np.ndarray):
99
+ indel = [[x] for x in indel.tolist()]
100
+
101
+ if isinstance(indel, list) and isinstance(indel[0], (float, int)):
102
+ indel = [[x] for x in indel]
103
+
104
+ if len(indel) > 1 and any(indel == "auto" for indel in indel):
105
+ raise ValueError("[!] 'auto' not allowed in vector or list indel.")
106
+
107
+ if isinstance(indel, list) and len(indel) == 1:
108
+ raise ValueError("[!] When a list or vector, indel must be of length equal to number of domains.")
109
+
110
+ if isinstance(indel, list) and len(indel) != nchannels:
111
+ raise ValueError("[!] When a list or vector, indel must be of length equal to number of domains.")
112
+
113
+ # Check missing
114
+ has_miss = np.repeat(False, nchannels)
115
+
116
+ for i in range(nchannels):
117
+ channel = channels[i]
118
+ alphabet = channel.states
119
+
120
+ # Check separator
121
+ if any(ch_sep in str(s) for s in alphabet):
122
+ raise ValueError(f"[!] 'ch.sep' symbol ({ch_sep}) occurs in alphabet of at least one channel.")
123
+
124
+ has_miss[i] = channel.ismissing
125
+ if with_missing is not None and has_miss[i] != with_missing[i]:
126
+ with_missing[i] = has_miss[i]
127
+ print(f"[!] Bad with.missing value for domain {i + 1}. I set it as {has_miss[i]}.")
128
+
129
+ if with_missing is None:
130
+ with_missing = has_miss
131
+
132
+ if isinstance(with_missing, bool) or len(with_missing) == 1:
133
+ with_missing = np.repeat(with_missing, nchannels)
134
+
135
+ if len(with_missing) > 1 and len(with_missing) != nchannels:
136
+ raise ValueError("[!] When a vector, with.missing must be of length equal to number of domains.")
137
+
138
+ # Check number of sequences for each channel
139
+ first_len = channels[0].seqdata.shape[0]
140
+ if not all(channel.seqdata.shape[0] == first_len for channel in channels):
141
+ raise ValueError("[!] sequence objects have different numbers of rows.")
142
+
143
+ numseq = first_len
144
+
145
+ print(f"[>] {nchannels} domains with {numseq} sequences.")
146
+ # Actually LCP and RLCP are not included
147
+
148
+ # Check what : method, sm
149
+ if what == "diss":
150
+ metlist = ["OM", "LCS", "DHD", "HAM"]
151
+
152
+ if method not in metlist:
153
+ raise ValueError(f"[!] 'method' should be one of {metlist}.")
154
+ if not isinstance(sm, list):
155
+ raise ValueError(f"[!] 'sm' should be a list.")
156
+
157
+ if method == "LCS":
158
+ method = "OM"
159
+ sm = "CONSTANT"
160
+ indel = list(np.repeat(indel, nchannels))
161
+ cval = 2
162
+ miss_cost = 2
163
+
164
+ timeVarying = method == "DHD"
165
+
166
+ if sm is None:
167
+ costmethod = "CONSTANT"
168
+ if method == "DHD":
169
+ costmethod = "TRATE"
170
+ sm = list(np.repeat(costmethod, nchannels))
171
+
172
+ if len(sm) == 1 and sm[0] in ["CONSTANT", "TRATE", "INDELS", "INDELSLOG"]:
173
+ sm = list(np.repeat(sm, nchannels))
174
+
175
+ # Checking correct numbers of info per channel
176
+ if what != "MDseq":
177
+ if len(sm) != nchannels or len(cweight) != nchannels:
178
+ raise ValueError("[!] You must supply one weight, one substitution matrix, and one indel per domain.")
179
+
180
+ # Checking that all channels have the same length
181
+ slength1 = seqlength(channels[1])
182
+ for i in range(1, nchannels):
183
+ if not np.array_equal(slength1, seqlength(channels[i])):
184
+ print("[!] Cases with sequences of different length across domains.")
185
+ break
186
+
187
+ substmat_list = [] # subsitution matrix
188
+ indel_list = [] # indels per channel
189
+ alphabet_list = [] # alphabet for each channel
190
+ alphsize_list = [] # alphabet size per channel
191
+ maxlength_list = np.zeros(nchannels) # seqlenth of each channels
192
+
193
+ # Storing number of columns and cnames
194
+ for i in range(nchannels):
195
+ maxlength_list[i] = channels[i].seqdata.shape[1]
196
+ max_index = np.argmax(maxlength_list)
197
+ md_cnames = channels[max_index].seqdata.columns
198
+
199
+ print("[>] Building MD sequences of combined states.")
200
+
201
+ # ================================
202
+ # Building the New Sequence Object
203
+ # ================================
204
+ maxlength = int(np.max(maxlength_list))
205
+ newseqdata = np.full((numseq, maxlength), "", dtype='U256')
206
+
207
+ for i in range(nchannels):
208
+ seqchan = channels[i].values.copy()
209
+
210
+ for j in range(maxlength):
211
+ if j < maxlength_list[i]:
212
+ newcol = seqchan[:, j].astype(str)
213
+
214
+ # TraMineR default missing value is legal, and we already do this.
215
+ # newseqdataNA[,j] <- newseqdataNA[,j] & newCol == void
216
+
217
+ # SequenceData has no attributes void, so we default fill with missing value (np.nan)
218
+ # if (fill.with.miss == TRUE & has.miss[i] & any(newCol == void)) {
219
+ # newCol[newCol == void] < - nr
220
+ # }
221
+
222
+ else:
223
+ newcol = np.repeat("", numseq)
224
+
225
+ if i > 0:
226
+ newseqdata[:, j] = np.char.add(np.char.add(newseqdata[:, j], ch_sep), newcol)
227
+ else:
228
+ newseqdata[:, j] = newcol
229
+
230
+ states_space = list(np.unique(newseqdata))
231
+
232
+ print(" - OK.")
233
+
234
+ if what == "MDseq":
235
+ return newseqdata
236
+ else:
237
+ # ==================================================
238
+ # Building Substitution Matrix and Indel Per Channel
239
+ # ==================================================
240
+ for i in range(nchannels):
241
+ channel = channels[i]
242
+
243
+ if not isinstance(channel, SequenceData):
244
+ raise ValueError("[!] Channel ", i,
245
+ " is not a state sequence object, use 'seqdef' function to create one.")
246
+
247
+ # Since states is prepared for the upcoming MD states
248
+ # And MD uses numeric representations, we use numbers here instead of the original string states.
249
+ states = np.arange(1, len(channel.states) + 1).astype(str).tolist()
250
+
251
+ # Checking missing values
252
+ if with_missing[i]:
253
+ print("[>] Including missing value as an additional state.")
254
+ else:
255
+ if channel.ismissing:
256
+ raise ValueError("[!] Found missing values in channel ", i,
257
+ ", set with.missing as TRUE for that channel.")
258
+
259
+ # Check states
260
+ alphabet_list.append(states)
261
+ alphsize_list.append(len(states))
262
+
263
+ # Pre-progress indel
264
+ if indel != "auto" and len(indel[i]) == 1:
265
+ indel[i] = np.repeat(indel[i], alphsize_list[i])
266
+
267
+ # Substitution matrix generation method is given
268
+ if isinstance(sm[i], str):
269
+ print(f"[>] Computing substitution cost matrix for domain {i}.")
270
+
271
+ with contextlib.redirect_stdout(io.StringIO()):
272
+ costs = get_substitution_cost_matrix(channel, sm[i],
273
+ with_missing=has_miss[i],
274
+ time_varying=timeVarying, cval=cval,
275
+ miss_cost=miss_cost)
276
+ substmat_list.append(costs['sm'])
277
+
278
+ if "auto" == indel:
279
+ costs['indel'] = np.repeat(costs['indel'], alphsize_list[i])
280
+ indel_list.append(costs['indel'])
281
+ else:
282
+ indel_list.append(indel[i])
283
+
284
+ else: # Provided sm
285
+ substmat_list.append(sm[i])
286
+
287
+ if "auto" == indel:
288
+ indel_list.append(np.repeat(np.max(sm[i]) / 2, alphsize_list[i]))
289
+ else:
290
+ indel_list.append(indel[i])
291
+
292
+ # Multiply by channel weight
293
+ substmat_list[i] = cweight[i] * substmat_list[i]
294
+
295
+ if "auto" == indel:
296
+ indel = indel_list
297
+
298
+ # =============================================
299
+ # Building the New CAT Substitution Cost Matrix
300
+ # =============================================
301
+ print("[>] Computing MD substitution and indel costs with additive trick.")
302
+
303
+ # Build new substitution matrix and new alphabet
304
+ alphabet = states_space
305
+ alphabet_size = len(alphabet)
306
+ newindel = None
307
+
308
+ # Recomputing the substitution matrix
309
+ if not timeVarying:
310
+ newsm = np.zeros((alphabet_size, alphabet_size))
311
+ newindel = np.zeros(alphabet_size)
312
+
313
+ # To reduce redundancy, we simply merged the code for retrieving sm and indel
314
+ statelisti = alphabet[alphabet_size - 1].split(ch_sep)
315
+ for i in range(nchannels):
316
+ state = statelisti[i]
317
+ ipos = alphabet_list[i].index(state)
318
+
319
+ newindel[alphabet_size - 1] += indel[i][ipos] * cweight[i]
320
+
321
+ for i in range(alphabet_size - 1):
322
+ statelisti = alphabet[i].split(ch_sep)
323
+
324
+ for chan in range(nchannels):
325
+ state = statelisti[chan]
326
+ ipos = alphabet_list[chan].index(state)
327
+
328
+ newindel[i] += indel[chan][ipos] * cweight[chan]
329
+
330
+ for j in range(i + 1, alphabet_size):
331
+ cost = 0
332
+ statelistj = alphabet[j].split(ch_sep)
333
+
334
+ for chan in range(nchannels):
335
+ ipos = alphabet_list[chan].index(statelisti[chan]) + 1
336
+ jpos = alphabet_list[chan].index(statelistj[chan]) + 1
337
+ cost += substmat_list[chan].iloc[ipos, jpos]
338
+
339
+ newsm[i, j] = cost
340
+ newsm[j, i] = cost
341
+
342
+ else:
343
+ # Recomputing time varying substitution
344
+ newsm = np.zeros((maxlength, alphabet_size, alphabet_size))
345
+
346
+ for t in range(maxlength):
347
+ for i in range(alphabet_size - 1):
348
+ statelisti = alphabet[i].split(ch_sep)
349
+
350
+ for j in range(i + 1, alphabet_size):
351
+ cost = 0
352
+ statelistj = alphabet[j].split(ch_sep)
353
+
354
+ for chan in range(nchannels):
355
+ ipos = alphabet_list[chan].index(statelisti[chan])
356
+ jpos = alphabet_list[chan].index(statelistj[chan])
357
+
358
+ cost += substmat_list[chan][t, ipos, jpos]
359
+
360
+ newsm[t, i, j] = cost
361
+ newsm[t, j, i] = cost
362
+
363
+ print(" - OK.")
364
+
365
+ # Indel as sum
366
+ if newindel is None:
367
+ newindel = np.sum(cweight * cweight[:, np.newaxis], axis=0)
368
+
369
+ # If we want the mean of cost
370
+ if link == "mean":
371
+ newindel = newindel / np.sum(cweight)
372
+ newsm = newsm / np.sum(cweight)
373
+
374
+ if what == "cost":
375
+ return {
376
+ "sm": newsm,
377
+ "indel": newindel,
378
+ "alphabet": alphabet,
379
+ "cweight": cweight
380
+ }
381
+
382
+ if what == "diss":
383
+ if np.any(np.isnan(newsm)) or np.any(np.isnan(newindel)):
384
+ raise ValueError("NA values found in substitution or indel costs. Cannot compute MD distances.")
385
+
386
+ print("[>] Computing MD distances using additive trick.")
387
+
388
+ # This step will hide the state concatenation,
389
+ # And the returned result will convert the MD strings into numbers.
390
+ # for example : '1+2+3' --> 1, '1+4+6' --> 2
391
+ newseqdata_df = pd.DataFrame(newseqdata, columns=md_cnames)
392
+ newseqdata_df.insert(0, channels[0].id_col, channels[0].ids)
393
+
394
+ # Reconstruct multi-domain labels for composite states
395
+ domain_labels = [channel.labels for channel in
396
+ channels] # e.g., [["At home", "Left home"], ["No child", "Child"]]
397
+
398
+ md_labels = []
399
+ for md_state in states_space:
400
+ parts = md_state.split(ch_sep) # e.g., ["0", "1"]
401
+ if len(parts) != len(domain_labels):
402
+ md_labels.append(md_state) # fallback if structure doesn't match
403
+ else:
404
+ label_parts = []
405
+ for val, dom_lab in zip(parts, domain_labels):
406
+ try:
407
+ label_parts.append(dom_lab[int(val)])
408
+ except (ValueError, IndexError):
409
+ label_parts.append(str(val)) # fallback if unexpected value
410
+ md_labels.append(" + ".join(label_parts))
411
+
412
+ with contextlib.redirect_stdout(io.StringIO()):
413
+ newseqdata_seq = SequenceData(newseqdata_df,
414
+ time=md_cnames,
415
+ states=states_space,
416
+ labels=md_labels,
417
+ id_col=channels[0].id_col)
418
+
419
+ newindel = np.max(newindel)
420
+ with contextlib.redirect_stdout(io.StringIO()):
421
+ diss_matrix = get_distance_matrix(newseqdata_seq,
422
+ method=method,
423
+ norm=norm,
424
+ indel=newindel,
425
+ sm=newsm,
426
+ with_missing=False,
427
+ full_matrix=full_matrix)
428
+ print(" - OK.")
429
+
430
+ diss_matrix = pd.DataFrame(diss_matrix, index=channels[0].ids, columns=channels[0].ids)
431
+ return diss_matrix