sequenzo 0.1.21__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (260) hide show
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,609 @@
1
+ """
2
+ @Author : 梁彧祺 Yuqi Liang
3
+ @File : define_sequence_data.py
4
+ @Time : 05/02/2025 12:47
5
+ @Desc :
6
+
7
+ Optimized SequenceData class with integrated color scheme & legend handling.
8
+
9
+ Note on `states` and `alphabet`:
10
+
11
+ In traditional sequence analysis tools (e.g., TraMineR), the `alphabet` refers to the full set of distinct states
12
+ found in the data and is often inferred automatically from the observed sequences.
13
+
14
+ However, in this implementation, we require the user to explicitly provide the set of `states`. This explicit control
15
+ is essential for ensuring consistent ordering of states, reproducibility of visualizations, and compatibility across
16
+ sequence datasets - especially when certain states may not appear in a given subset of the data.
17
+
18
+ As a result, `alphabet` is automatically set to `states` upon initialization, and kept as a semantic alias for clarity
19
+ and potential compatibility. Users should treat `states` as the definitive state space and are not required to provide
20
+ `alphabet` separately.
21
+
22
+ # ----------------------------------------------------------------------
23
+ # [Hint] Handling the ID column for sequence analysis
24
+ # ----------------------------------------------------------------------
25
+
26
+ # STEP 1: Check if your DataFrame already has a column representing unique entity IDs
27
+ # For example, check if "Entity ID" or "country" or any other identifier exists:
28
+ print(df.columns)
29
+
30
+ # If your data already has an ID column (e.g., 'Entity ID'), you can directly use it:
31
+ seq = SequenceData(df, id_col='Entity ID', time=..., states=...)
32
+
33
+ # ----------------------------------------------------------------------
34
+ # STEP 2: If your data has NO ID column, use the helper function below
35
+ # ----------------------------------------------------------------------
36
+ from sequenzo.utils import assign_unique_ids
37
+
38
+ # This will insert a new ID column named 'Entity ID' as the first column
39
+ df = assign_unique_ids(df, id_col_name='Entity ID')
40
+
41
+ # Optional: Save it for future use to avoid repeating this step
42
+ df.to_csv('your_dataset_with_ids.csv', index=False)
43
+
44
+ # Then you can use it like this:
45
+ seq = SequenceData(df, id_col='Entity ID', time=..., states=...)
46
+
47
+ """
48
+ # Only applicable to Python 3.7+, add this line to defer type annotation evaluation
49
+ from __future__ import annotations
50
+ # Define the public API at the top of the file
51
+ __all__ = ['SequenceData']
52
+
53
+ # Module-level third-party and standard-library imports (pandas included) are placed here
54
+ import numpy as np
55
+ import seaborn as sns
56
+ import matplotlib.pyplot as plt
57
+ import pandas as pd
58
+ from docutils.parsers.rst import states
59
+ from matplotlib.colors import ListedColormap
60
+ import re
61
+
62
+
63
+ class SequenceData:
64
+ """
65
+ A class for defining and processing a sequence dataset for social sequence analysis.
66
+
67
+ This class provides:
68
+ - Sequence extraction & missing value handling.
69
+ - Automatic alphabet (state space) management.
70
+ - Efficient sequence-to-numeric conversion.
71
+ - Color mapping & legend storage for visualization.
72
+ """
73
+
74
def __init__(
        self,
        data: pd.DataFrame,
        time: list,
        states: list,
        labels: list = None,
        id_col: str = None,
        weights: np.ndarray = None,
        start: int = 1,
        custom_colors: list = None
):
    """
    Initialize the SequenceData object.

    The constructor stores the inputs, validates them, extracts the sequence
    columns, registers a missing state when empty cells are present, encodes
    the states numerically, assigns colors and finally prints a summary.

    :param data: DataFrame containing sequence data. A copy is stored in ``self.data``.
    :param time: List of the columns of ``data`` that hold the sequence (one column per time point).
    :param states: List of unique states (categories). The list is copied; its order is the
        order used for the numeric encoding. ``alphabet`` is derived from it.
    :param labels: Labels for states (optional, for visualization). Defaults to ``str(state)``
        for every state. The list is copied so the caller's list is never shared with the object.
    :param id_col: Column name for row identifiers, which is very important for hierarchical
        clustering. When omitted, the DataFrame index is used as the IDs.
    :param weights: Sequence weights (optional). When omitted, validation fills in a weight of 1
        per row; ``self._weights_provided`` records whether the caller supplied them.
    :param start: Starting time index (default: 1).
    :param custom_colors: Custom color palette for visualization.
    """
    self.data = data.copy()
    self.time = time

    # Time labels are used exactly as given. Stripping non-numeric characters
    # (e.g. "Year2020" -> "2020") is no longer done here; users are encouraged
    # to clean their time variables beforehand.
    # TODO: might implement a helper function for users to clean up their time variables.
    self.cleaned_time = time

    self.states = states.copy()
    self.alphabet = states.copy() or sorted(set(data[time].stack().unique()))
    # Copy the labels (as is done for the states) so later changes to
    # self.labels can never leak into the list the caller passed in.
    self.labels = list(labels) if labels else [str(s) for s in states]

    self.id_col = id_col
    self.ids = np.array(self.data[self.id_col].values) if self.id_col else data.index

    self.weights = weights
    self._weights_provided = weights is not None  # Track if weights were originally provided
    self.start = start
    self.custom_colors = custom_colors

    # Validate parameters
    self._validate_parameters()

    # Extract & process sequences
    self.seqdata = self._extract_sequences()
    self._process_missing_values()

    # Lookup tables between states and their display labels, used for visualization
    self.state_to_label = dict(zip(self.states, self.labels))
    self.label_to_state = dict(zip(self.labels, self.states))

    self._convert_states()

    # Assign colors & save legend
    self._assign_colors()

    # Automatically print dataset overview
    print("\n[>] SequenceData initialized successfully! Here's a summary:")
    self.describe()
141
+
142
+ @property
143
+ def values(self):
144
+ """Returns sequence data as a NumPy array, similar to xinyi_original_seqdef()."""
145
+ return self.seqdata.to_numpy(dtype=np.int32)
146
+
147
+ def __repr__(self):
148
+ return f"SequenceData({len(self.seqdata)} sequences, States: {self.states})"
149
+
150
+ def _validate_parameters(self):
151
+ """Ensures correct input parameters and checks consistency with data."""
152
+ # Check states, alphabet, labels
153
+ if not self.states:
154
+ raise ValueError("'states' must be provided.")
155
+
156
+ # Validate that states are present in the actual data values
157
+ data_values = set(self.data[self.time].stack().unique())
158
+ states_clean = [s for s in self.states if not pd.isna(s)] # stack() 会去掉 nan 值,因此如果传进来的 states 有 np.nan,则会报错
159
+ unmatched_states = [s for s in states_clean if s not in data_values]
160
+
161
+ if unmatched_states:
162
+ raise ValueError(
163
+ f"[!] The following provided 'states' are not found in the data: {unmatched_states}\n"
164
+ f" Hint: Check spelling or formatting. Data contains these unique values: {sorted(data_values)}"
165
+ )
166
+
167
+ # ----------------
168
+ # Check if ID column is provided and valid
169
+ if self.id_col is not None and self.id_col not in self.data.columns:
170
+ raise ValueError(
171
+ f"[!] You must specify a valid `id_col` parameter that exists in your dataset.\n"
172
+ f" ID is required to uniquely identify each sequence (e.g., individuals).\n"
173
+ f" -> Hint: If your data does not have an ID column yet, you can use the helper function:\n\n"
174
+ f" from sequenzo.utils import assign_unique_ids\n"
175
+ f" df = assign_unique_ids(df, id_col_name='Entity ID')\n"
176
+ f" df.to_csv('your_dataset_with_ids.csv', index=False)\n\n"
177
+ f" This will permanently assign unique IDs to your dataset for future use."
178
+ )
179
+
180
+ # Because it is already implemented at initialization time
181
+ # self.ids = np.array(self.data[self.id_col].values)
182
+
183
+ # Validate ID uniqueness and length
184
+ if len(self.ids) != len(self.data):
185
+ raise ValueError(f"[!] Length of ID column ('{self.id_col}') must match number of rows in the dataset.")
186
+ if len(np.unique(self.ids)) != len(self.ids):
187
+ raise ValueError(f"[!] IDs in column '{self.id_col}' must be unique.")
188
+
189
+ # ----------------
190
+ if self.alphabet and set(self.alphabet) != set(self.states):
191
+ raise ValueError("'alphabet' must match 'states'.")
192
+
193
+ if self.labels:
194
+ if len(self.labels) != len(self.states):
195
+ raise ValueError("'labels' must match the length of 'states'.")
196
+
197
+ # Ensure labels are all strings
198
+ non_string_labels = [label for label in self.labels if not isinstance(label, str)]
199
+ if non_string_labels:
200
+ raise TypeError(
201
+ f"[!] All elements in 'labels' must be strings for proper visualization (e.g., for legends or annotations).\n"
202
+ f" Detected non-string labels: {non_string_labels}\n"
203
+ f" Example fix: instead of using `labels = [1, 2, 3]`, use `labels = ['Single', 'Married', 'Divorced']`."
204
+ )
205
+
206
+ # Check weights
207
+ if self.weights is not None:
208
+ if len(self.weights) != len(self.data):
209
+ raise ValueError("'weights' must match the length of 'data'.")
210
+ else:
211
+ self.weights = np.ones(self.data.shape[0])
212
+
213
+ def _extract_sequences(self) -> pd.DataFrame:
214
+ """Extracts only relevant sequence columns."""
215
+ return self.data[self.time].copy()
216
+
217
def _process_missing_values(self):
    """Detect missing cells and make sure a missing state and label exist.

    Sets ``self.ismissing`` to whether ``self.seqdata`` contains any NaN.
    When it does, and ``self.states`` holds neither a NaN entry nor a string
    equal to "missing" (case-insensitive), a notice is printed and a missing
    state is appended to ``self.states``: ``"Missing"`` if every state is a
    string, otherwise ``np.nan``. In that same situation the label
    ``"Missing"`` is added to ``self.labels`` unless a case-insensitive
    "missing" label is already present.
    """

    def _is_missing_text(value):
        # True for any capitalisation of the word "missing".
        return isinstance(value, str) and value.lower() == "missing"

    self.ismissing = self.seqdata.isna().any().any()
    if not self.ismissing:
        return

    # Accept the different ways a user may already have declared "missing":
    # a NaN entry or the word itself in any capitalisation.
    has_missing_state = (
        any(pd.isna(state) for state in self.states)
        or any(_is_missing_text(state) for state in self.states)
    )
    has_missing_label = any(_is_missing_text(label) for label in self.labels)
    if has_missing_state:
        return

    # Decide whether the states are text-coded or not; this picks both the
    # value that is appended and how the example in the notice is quoted.
    text_states = all(isinstance(state, str) for state in self.states)
    if text_states:
        example_missing, quote = "'Missing'", "'"
    else:
        example_missing, quote = "np.nan", ""

    print(
        "[!] Detected missing values (empty cells) in the sequence data.\n"
        f"    -> Automatically added {example_missing} to `states` and `labels` for compatibility.\n"
        "       However, it's strongly recommended to manually include it when defining `states` and `labels`.\n"
        "       For example:\n\n"
        f"           states = [{quote}At Home{quote}, {quote}Left Home{quote}, {example_missing}]\n"
        f"           labels = [{quote}At Home{quote}, {quote}Left Home{quote}, {quote}Missing{quote}]\n\n"
        "       This ensures consistent color mapping and avoids unexpected visualization errors."
    )

    # Register the missing state.
    self.states.append("Missing" if text_states else np.nan)

    # Only add the label when no "missing" label exists yet; the list is
    # rebuilt (dropping any capitalisation of "missing") before appending.
    if not has_missing_label:
        kept_labels = [label for label in self.labels if not _is_missing_text(label)]
        kept_labels.append("Missing")
        self.labels = kept_labels
267
+
268
def _convert_states(self):
    """
    Encode every cell of ``self.seqdata`` as a 1-based integer state code.

    Codes follow the order of ``self.states`` exactly as the user defined it
    (first state -> 1, second -> 2, ...). Keeping that order matters for
    visualization: with any other order (e.g. alphabetical) the colors would
    be attached to the wrong states.

    Cells whose value is not a key of the mapping receive
    ``len(self.states)``, i.e. the code of the last state. That is the slot
    where ``_process_missing_values`` appends the missing state.

    Side effects:
        - ``self.state_mapping``: state -> code
        - ``self.inverse_state_mapping``: code -> state, kept so legends and
          plots can work with the numeric encoding
        - ``self.seqdata`` is replaced by the encoded frame, and its index
          is set to ``self.ids`` when ``self.ids`` is not None
    """
    self.state_mapping = {state: code for code, state in enumerate(self.states, start=1)}
    self.inverse_state_mapping = {code: state for state, code in self.state_mapping.items()}

    fallback_code = len(self.states)
    lookup = self.state_mapping.get

    def encode(value):
        return lookup(value, fallback_code)

    # DataFrame.map is the element-wise API in newer pandas; older releases
    # only provide applymap. Checking for the attribute (instead of catching
    # AttributeError around the call) keeps an AttributeError raised while
    # mapping from being mistaken for a missing method.
    if hasattr(type(self.seqdata), "map"):
        self.seqdata = self.seqdata.map(encode)
    else:
        self.seqdata = self.seqdata.applymap(encode)

    if self.ids is not None:
        self.seqdata.index = self.ids
295
+
296
def _assign_colors(self, reverse_colors=True):
    """Assign one color per state, from custom colors or a default palette.

    When ``self.ismissing`` is true, the last state is treated as the
    missing state and gets a fixed gray (#cfcccc); the remaining states are
    colored from the palette.

    Parameters:
        reverse_colors: reverse the generated default palette. Colors given
            in ``self.custom_colors`` are used in the order provided.

    Raises:
        ValueError: if ``self.custom_colors`` has a length that matches
            neither the number of states nor (when missing values are
            present) the number of non-missing states.

    Side effects:
        - ``self.color_map``: state code (1..N) -> color, so the keys line
          up with ``imshow(vmin=1, vmax=N)``
        - ``self.color_map_by_label``: label -> color, used for legends
    """
    num_states = len(self.states)
    missing_gray_color = (0.811765, 0.8, 0.8)  # fixed gray for missing values (#cfcccc)

    def _default_palette(n_colors):
        # Build the default palette for n_colors states, honoring reverse_colors.
        if n_colors <= 20:
            palette = sns.color_palette("Spectral", n_colors)
        elif n_colors <= 40:
            palette = sns.color_palette("viridis", n_colors)
        else:
            # Very large alphabets: stitch several palettes together and
            # cycle through the result until there are enough colors.
            combined = (
                sns.color_palette("viridis", min(n_colors // 2, 20))
                + sns.color_palette("Set3", min(n_colors // 2, 12))
                + sns.color_palette("tab20", min(n_colors // 3, 20))
            )
            while len(combined) < n_colors:
                combined.extend(combined[:n_colors - len(combined)])
            palette = combined[:n_colors]
        return list(reversed(palette)) if reverse_colors else list(palette)

    if self.ismissing:
        non_missing_states = num_states - 1
        if self.custom_colors:
            n_custom = len(self.custom_colors)
            if n_custom == num_states:
                # Colors cover every state, including the missing one.
                color_list = self.custom_colors
            elif n_custom == non_missing_states:
                # Colors cover the real states only; gray is added for missing.
                color_list = list(self.custom_colors) + [missing_gray_color]
            else:
                raise ValueError(f"Length of custom_colors ({len(self.custom_colors)}) must match "
                                 f"either total states ({num_states}) or non-missing states ({non_missing_states}).")
        else:
            color_list = _default_palette(non_missing_states) + [missing_gray_color]
    elif self.custom_colors:
        if len(self.custom_colors) != num_states:
            raise ValueError("Length of custom_colors must match number of states.")
        color_list = self.custom_colors
    else:
        color_list = _default_palette(num_states)

    self.color_map = {code: color_list[code - 1] for code in range(1, num_states + 1)}

    self.color_map_by_label = {
        self.state_to_label[state]: self.color_map[self.state_mapping[state]]
        for state in self.states
    }
391
+
392
def get_colormap(self):
    """Return a ListedColormap with the state colors in code order.

    The i-th color of the colormap is ``self.color_map[i + 1]``, for as many
    entries as there are states.
    """
    ordered_colors = []
    for code in range(1, len(self.states) + 1):
        ordered_colors.append(self.color_map[code])
    return ListedColormap(ordered_colors)
396
+
397
def describe(self):
    """
    Print an overview of the sequence dataset.

    Reports the number of sequences and time points, the minimum and maximum
    sequence length, the states and labels, and a weights summary. When
    ``self.ismissing`` is true, cells equal to ``len(self.states)`` are
    counted as missing and a short missing-value report is printed as well.

    NOTE: only the first 10 IDs of sequences with missing values are shown.
    Printing the full list can be very long (thousands of IDs) and, in
    Jupyter Notebook/Lab, may exceed the IOPub data rate limit and interrupt
    the output.
    """
    print(f"[>] Number of sequences: {len(self.seqdata)}")
    print(f"[>] Number of time points: {self.n_steps}")

    if self.ismissing:
        # One boolean mask drives every statistic below.
        missing_code = len(self.states)
        is_missing = self.seqdata == missing_code
        missing_per_sequence = is_missing.sum(axis=1)

        lengths = (~is_missing).sum(axis=1)
        print(f"[>] Min/Max sequence length: {lengths.min()} / {lengths.max()}")

        # Missing values and the IDs of the sequences that contain them
        affected = missing_per_sequence[missing_per_sequence > 0]
        missing_count = int(missing_per_sequence.sum())
        unique_missing_ids = affected.index.unique().tolist()
        print(f"[>] There are {missing_count} missing values across {len(unique_missing_ids)} sequences.")
        print(f"    First few missing sequence IDs: {unique_missing_ids[:10]} ...")

        # Sequences with the most missing time points
        most_missing = affected.sort_values(ascending=False).head(5)
        print("[>] Top sequences with the most missing time points:")
        print("    (Each row shows a sequence ID and its number of missing values)\n")
        print(most_missing.rename("Missing Count").to_frame().rename_axis("Sequence ID"))

    else:
        observed = self.seqdata.notna().sum(axis=1)
        print(
            f"[>] Min/Max sequence length: {observed.min()} / {observed.max()}")

    print(f"[>] States: {self.states}")
    print(f"[>] Labels: {self.labels}")

    # Weight summary only when the user originally supplied weights
    if self._weights_provided:
        weight_mean = np.mean(self.weights)
        weight_std = np.std(self.weights)
        print(f"[>] Weights: Provided (total weight={sum(self.weights):.3f}, mean={weight_mean:.3f}, std={weight_std:.3f})")
    else:
        print("[>] Weights: Not provided")
442
+
443
def get_legend(self):
    """Build the legend patches and return them together with the labels.

    One ``plt.Rectangle`` is created per state, colored with
    ``self.color_map[code]`` (codes start at 1) and labelled with the label
    at the same position in ``self.labels``. The patches are also stored in
    ``self.legend_handles``.

    Returns:
        tuple: (list of patches, ``self.labels``)
    """
    patches = []
    for position in range(len(self.states)):
        patch = plt.Rectangle(
            (0, 0), 1, 1,
            color=self.color_map[position + 1],
            label=self.labels[position],
        )
        patches.append(patch)

    self.legend_handles = patches
    return self.legend_handles, self.labels
458
+
459
def to_dataframe(self) -> pd.DataFrame:
    """Return the processed sequence table stored in ``self.seqdata``.

    The stored frame itself is returned, not a copy.
    """
    processed = self.seqdata
    return processed
462
+
463
def plot_legend(self, save_as=None, dpi=200):
    """Draw a stand-alone legend of the state colors and show it.

    Parameters:
        save_as: if truthy, the figure is saved to this path before it is
            shown; otherwise ``plt.tight_layout()`` is applied instead.
        dpi: resolution passed to ``plt.savefig``.
    """
    # Build the patches here when get_legend() has not populated them yet.
    handles = getattr(self, "legend_handles", None)
    if not handles:
        handles = []
        for position in range(len(self.states)):
            handles.append(
                plt.Rectangle((0, 0), 1, 1, color=self.color_map[position + 1], label=self.labels[position])
            )
        self.legend_handles = handles

    _fig, ax = plt.subplots(figsize=(2, 2))
    ax.legend(handles=handles, loc='center', title="States", fontsize=10)
    ax.axis('off')

    if save_as:
        plt.savefig(save_as, dpi=dpi)
    else:
        plt.tight_layout()
    plt.show()
484
+
485
+ # ------------------------------
486
+ # The following are for multidomain sequence analysis, especially for seqdomassoc()
487
+
488
@property
def n_sequences(self):
    """Number of sequences, i.e. the row count of ``self.seqdata``."""
    dimensions = self.seqdata.shape
    return dimensions[0]
492
+
493
@property
def n_steps(self):
    """Sequence length, i.e. the column count of ``self.seqdata``."""
    dimensions = self.seqdata.shape
    return dimensions[1]
497
+
498
@property
def alphabet(self):
    """State alphabet, read from ``self._alphabet``."""
    current = self._alphabet
    return current


@alphabet.setter
def alphabet(self, val):
    # Store the new alphabet as-is; no validation is performed here.
    self._alphabet = val
506
+
507
@property
def weights(self):
    """Sequence weights, read from ``self._weights``."""
    current = self._weights
    return current


@weights.setter
def weights(self, val):
    # Store the new weights as-is; no validation is performed here.
    self._weights = val
514
+
515
def flatten(self) -> np.ndarray:
    """Return all cells of ``self.seqdata`` as one 1D array, row by row."""
    cells = self.seqdata.values
    return cells.flatten()
518
+
519
def flatten_weights(self) -> np.ndarray:
    """
    Expand the sequence weights so they align element-wise with flatten().

    Each weight is repeated ``self.n_steps`` times in place, e.g. for
    5 sequences x 10 steps every weight appears 10 consecutive times.
    """
    repeats_per_sequence = self.n_steps
    return np.repeat(self.weights, repeats_per_sequence)
525
+
526
def to_numeric(self) -> np.ndarray:
    """Return ``self.seqdata`` as a NumPy array converted to ``np.int32``."""
    target_dtype = np.int32
    return self.seqdata.to_numpy(dtype=target_dtype)
529
+
530
def get_xtabs(self, other: SequenceData, weighted=True) -> np.ndarray:
    """
    Cross-tabulate the state codes of two sequence objects (NumPy only).

    Every cell position contributes one (self code, other code) pair. The
    table size comes from ``len(self.states)`` and ``len(other.states)``
    rather than from the alphabet, because the alphabet does not account for
    missing values.

    Parameters:
        other: second sequence object; must have the same number of
            sequences and steps as ``self``.
        weighted: add ``self``'s sequence weight for each pair when True,
            otherwise add 1.

    Returns:
        float64 array of shape (len(self.states), len(other.states)).

    Raises:
        ValueError: if the two objects differ in shape.
    """
    same_shape = self.n_sequences == other.n_sequences and self.n_steps == other.n_steps
    if not same_shape:
        raise ValueError("Both SequenceData objects must have same shape.")

    # State codes start at 1 while NumPy indices start at 0, hence the shift.
    row_index = self.flatten() - 1
    col_index = other.flatten() - 1

    table = np.zeros((len(self.states), len(other.states)), dtype=np.float64)

    # np.add.at accumulates correctly when the same (row, col) pair repeats.
    increments = self.flatten_weights() if weighted else 1
    np.add.at(table, (row_index, col_index), increments)

    return table
557
+
558
def uniqueness_stats(self, weighted: bool = False):
    """
    Compute uniqueness statistics of the sequences.

    Two sequences count as the same pattern when their integer-coded rows
    (from ``self.to_numeric()``) are identical.

    Parameters:
        weighted: if True, additionally report statistics based on the sum
            of ``self.weights``; if False (default, kept for backward
            compatibility), only the unweighted statistics are returned.

    Returns:
        dict with keys:
            - n_sequences: total number of sequences (unweighted count)
            - n_unique: number of unique sequence patterns
            - uniqueness_rate: n_unique / n_sequences (NaN when there are
              no sequences)
            - weighted_total: sum of the weights (only if weighted=True)
            - weighted_uniqueness_rate: n_unique / weighted_total, NaN when
              weighted_total is not positive (only if weighted=True)
    """
    coded = self.to_numeric()  # shape (n, m)
    n = coded.shape[0]

    # Row-wise deduplication; with no rows there is nothing to deduplicate.
    n_unique = int(np.unique(coded, axis=0).shape[0]) if n > 0 else 0
    uniqueness_rate = float(n_unique) / float(n) if n > 0 else np.nan

    result = {
        "n_sequences": int(n),
        "n_unique": n_unique,
        "uniqueness_rate": uniqueness_rate
    }

    if weighted:
        weighted_total = float(np.sum(self.weights))
        result["weighted_total"] = weighted_total
        result["weighted_uniqueness_rate"] = (
            float(n_unique) / weighted_total if weighted_total > 0 else np.nan
        )

    return result
605
+
606
+
607
+
608
+
609
+
@@ -0,0 +1,31 @@
1
+ """
2
+ @Author : 李欣怡
3
+ @File : __init__.py
4
+ @Time : 2025/2/26 23:19
5
+ @Desc :
6
+ """
7
+ from .utils import get_sm_trate_substitution_cost_matrix, seqconc, seqdss, seqdur, seqlength
8
+ from .utils import get_LCP_length_for_2_seq
9
+ from .get_distance_matrix import get_distance_matrix
10
+ from .get_substitution_cost_matrix import get_substitution_cost_matrix
11
+
12
+
13
def _import_c_code():
    """Import the compiled ``c_code`` extension on demand and return it.

    The import happens inside the function (lazily), which the original
    author notes is meant to avoid circular dependencies during installation.

    Returns:
        The ``c_code`` module, or None (after printing a warning) when the
        import raises ImportError.
    """
    try:
        from sequenzo.dissimilarity_measures import c_code
    except ImportError:
        # The extension is unavailable: tell the user and signal it with None.
        print(
            "Warning: The C++ extension (c_code) could not be imported. Please ensure the extension module is compiled correctly.")
        return None
    else:
        return c_code
23
+
24
+
25
# Names exported by ``from <package> import *``; extend as needed.
__all__ = [
    "get_distance_matrix",
    "get_substitution_cost_matrix",
    "get_LCP_length_for_2_seq",
]
31
+