sequenzo 0.1.21__cp39-cp39-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (260) hide show
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-39-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,256 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : helpers.py
4
+ @Time : 01/05/2025 09:27
5
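+ @Desc : Data preprocessing helpers: assign unique IDs, convert between long and wide formats, summarize missing values, and replace cluster IDs with labels.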
6
+ """
7
+ import pandas as pd
8
+ import numpy as np
9
+ import matplotlib.pyplot as plt
10
+ import missingno as msno
11
+ from typing import Union, List
12
+
13
+
14
def assign_unique_ids(df: pd.DataFrame, id_col_name: str = "Entity ID") -> pd.DataFrame:
    """
    Return a copy of the DataFrame with a sequential integer ID column prepended.

    IDs run from 0 to ``len(df) - 1`` in row order; the input DataFrame is left untouched.

    :param df: Input DataFrame.
    :param id_col_name: Label of the generated ID column (default = "Entity ID").
    :return: Copy of ``df`` whose first column holds the generated IDs.
    :raises ValueError: If ``id_col_name`` is already a column of ``df``.
    """
    name_is_free = id_col_name not in df.columns
    if name_is_free:
        with_ids = df.copy()
        row_ids = np.arange(len(with_ids))
        with_ids.insert(loc=0, column=id_col_name, value=row_ids)
        return with_ids

    raise ValueError(f"[!] Column '{id_col_name}' already exists in the DataFrame.")
28
+
29
+
30
def long_to_wide_format_data(df: pd.DataFrame,
                             id_col: str,
                             time_col: str,
                             value_col: Union[str, List[str]]) -> pd.DataFrame:
    """
    Convert a long-format DataFrame to wide format.

    Each value column is pivoted so that every unique time point becomes its own
    column, and each row corresponds to one unique sequence (identified by `id_col`).

    Parameters:
    ----------
    df : pd.DataFrame
        Input DataFrame in long format.

    id_col : str
        The name of the column representing unique entity IDs.

    time_col : str
        The name of the column containing time points (must be a string, not a list).

    value_col : Union[str, List[str]]
        The name(s) of the column(s) containing state values.
        Can be a single string or a non-empty list of strings.

    Returns:
    -------
    pd.DataFrame
        A wide-format DataFrame with one row per ID. The first column is `id_col`;
        the remaining columns are flat (single-level) and named
        ``"<value column>_<time point>"``, grouped by value column in the order given.

    Raises:
    ------
    ValueError
        If `value_col` is an empty list. Errors raised by `DataFrame.pivot`
        (for example when an (id, time) pair occurs more than once) propagate unchanged.

    Notes:
    -----
    - This function assumes `df` is already in long format.
    - `time_col` must be a column *name* (string), not a list.
    - The column index name left behind by the pivot is removed for clean output.
    """
    # Normalize to a list so single and multiple value columns share one code path
    value_cols = [value_col] if isinstance(value_col, str) else list(value_col)
    if not value_cols:
        raise ValueError("[!] `value_col` must name at least one column.")

    wide = None
    for col in value_cols:
        pivoted = (
            df.pivot(index=id_col, columns=time_col, values=col)
            .add_prefix(f'{col}_')
            .reset_index()
        )
        # Outer merge keeps every ID even if one of the pivots lacks it
        wide = pivoted if wide is None else pd.merge(wide, pivoted, on=id_col, how='outer')

    wide.columns.name = None  # Remove residual column group name from pivot
    return wide
86
+
87
+
88
def wide_to_long_format_data(df: pd.DataFrame,
                             id_col: str,
                             time_cols: Union[List[str], List[List[str]]],
                             var_name: str = "time",
                             value_name: str = "state") -> pd.DataFrame:
    """
    Convert a wide-format DataFrame to long format.

    When `time_cols` is a flat list of column names, those columns are melted into
    one (`var_name`, `value_name`) pair per row. When it is a list of lists, each
    inner list is melted separately and the results are stacked in the given order
    under the same `var_name` / `value_name` columns.

    :param df: Wide-format DataFrame.
    :param id_col: Column with unique IDs.
    :param time_cols: List of time columns or a list of lists if multiple value columns.
    :param var_name: Name for the time variable in long format.
    :param value_name: Name for the state value.
    :return: Long-format DataFrame with a fresh 0..n-1 index.
    :raises ValueError: If `time_cols` is empty.
    """
    # len() instead of truthiness so array-like inputs are handled too
    if len(time_cols) == 0:
        raise ValueError("[!] `time_cols` must contain at least one column name.")

    def _melt(cols):
        return df.melt(id_vars=[id_col], value_vars=cols, var_name=var_name, value_name=value_name)

    # The type of the first element decides between the flat and the nested form
    if isinstance(time_cols[0], str):
        return _melt(time_cols)

    # ignore_index=True already yields a clean RangeIndex; no extra reset needed
    return pd.concat([_melt(cols) for cols in time_cols], ignore_index=True)
111
+
112
+
113
def summarize_missing_values(df: pd.DataFrame,
                             plot: bool = True,
                             top_n: int = 5,
                             columns: list = None,
                             mode: str = 'matrix',  # 'matrix' or 'bar'
                             figsize=(10, 5),
                             save_as: str = None,
                             show: bool = True) -> None:
    """
    Summarize missing values in a DataFrame, with optional visualization.

    Prints a per-column table of missing counts and percentages, then the rows
    with the most missing values, and finally (optionally) draws a missingno plot.

    :param df: Input DataFrame
    :param plot: Whether to visualize missing values
    :param top_n: Number of rows with most missing values to show
    :param columns: Columns to limit analysis to (None or empty = all columns)
    :param mode: 'matrix' or 'bar' for visualization mode
    :param figsize: Figure size for visualization
    :param save_as: Path to save figure
    :param show: Whether to display the figure; when False the figure is closed
    :raises ValueError: If plotting is requested and `mode` is not 'matrix' or 'bar'
    """
    print("🔍 Missing Value Summary")
    print("-" * 40)

    if columns:
        df = df[columns]

    # 1. Summary per column
    missing_per_column = df.isnull().sum()
    percent_missing = (missing_per_column / len(df)) * 100
    summary_df = pd.DataFrame({
        'Missing Count': missing_per_column,
        'Missing (%)': percent_missing.round(2)
    }).sort_values('Missing Count', ascending=False)

    print("[Columns with Missing Values]")
    print(summary_df[summary_df['Missing Count'] > 0])

    # 2. Summary per row
    row_missing = df.isnull().sum(axis=1)
    if row_missing.max() > 0:
        print(f"\n[Top {top_n} Rows with Most Missing Values]")
        print(row_missing.sort_values(ascending=False).head(top_n).rename("Missing Count").to_frame())

    # 3. Visualization
    if plot and not summary_df.empty:
        # Validate before touching matplotlib so a bad mode leaves no figure behind
        if mode not in ('matrix', 'bar'):
            raise ValueError("mode must be either 'matrix' or 'bar'")

        if mode == 'matrix':
            # msno.matrix opens its own figure when no axes are given, so the size
            # is passed through instead of pre-creating a figure that stays empty
            ax = msno.matrix(df, figsize=figsize)
        else:
            # msno.bar draws on the current axes; start from a fresh figure so an
            # unrelated figure that happens to be current is not overwritten
            plt.figure(figsize=figsize)
            ax = msno.bar(df, figsize=figsize)

        # Both missingno calls return an Axes; act on its parent figure explicitly
        figure = ax.figure
        if save_as:
            figure.savefig(save_as, bbox_inches='tight', dpi=200)
        if show:
            plt.show()
        else:
            plt.close(figure)
170
+
171
+
172
def replace_cluster_id_by_labels(df, mapping=None, new_cluster_column_name='Cluster', new_id_column_name='Entity ID'):
    """
    Once users have gotten the membership table,
    this function helps replace cluster IDs in a DataFrame with user-defined labels and updates column names.

    The input DataFrame is never modified; all changes are made on a copy.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing 'Entity ID' and 'Cluster' columns.
        mapping (dict, optional): A dictionary where keys are cluster IDs (e.g., 1, 2, 3, 4)
                                  and values are the corresponding labels. Cluster IDs that are
                                  not keys of the mapping are kept unchanged. Default is None,
                                  which is treated as an empty dictionary (no relabeling).
        new_cluster_column_name (str): The name of the new cluster column. Default is 'Cluster'.
        new_id_column_name (str): The name of the new entity ID column. Default is 'Entity ID'.

    Returns:
        pd.DataFrame: A new DataFrame with cluster IDs replaced by labels and updated column names.

    Raises:
        ValueError: If 'Entity ID' or 'Cluster' is missing from `df`, or if a key of
                    `mapping` is not a cluster ID present in the 'Cluster' column.

    Example:
        original_df = pd.DataFrame({'Entity ID': [1, 2, 3], 'Cluster': [1, 2, 3]})
        mapping = {1: 'A', 2: 'B', 3: 'C'}
        new_df = replace_cluster_id_by_labels(original_df, mapping, 'New Cluster', 'New ID')
    """
    if mapping is None:
        mapping = {}

    # Check if the necessary columns exist in the DataFrame
    if 'Entity ID' not in df.columns or 'Cluster' not in df.columns:
        raise ValueError("The input DataFrame must contain 'Entity ID' and 'Cluster' columns.")

    # Check if all keys in the mapping are valid cluster IDs in the DataFrame
    unique_clusters = set(df['Cluster'].unique())
    for cluster_id in mapping:
        if cluster_id not in unique_clusters:
            raise ValueError(f"Cluster ID {cluster_id} from the mapping does not exist in the DataFrame.")

    # Work on a copy so the caller's DataFrame keeps its values and column names
    relabeled = df.copy()

    # Skip relabeling for an empty mapping so the cluster column keeps its dtype;
    # otherwise look each ID up and fall back to the ID itself when it has no label
    if mapping:
        relabeled['Cluster'] = relabeled['Cluster'].map(lambda cluster_id: mapping.get(cluster_id, cluster_id))

    # Rename the columns
    return relabeled.rename(columns={'Entity ID': new_id_column_name, 'Cluster': new_cluster_column_name})
212
+
213
+
214
if __name__ == '__main__':
    # Small long-format sample: three entities observed at up to three time points
    sample_long = pd.DataFrame({
        'id': ['A', 'A', 'A', 'B', 'B', 'C', 'C', 'C'],
        'time': ['T1', 'T2', 'T3', 'T1', 'T2', 'T1', 'T2', 'T3'],
        'value1': [10, 20, 30, 40, 50, 60, 70, 80],
        'value2': [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8],
    })
    print(sample_long)

    # Run the long-to-wide conversion once per demo case and show each result
    demo_cases = (
        ("\nTest with a single value column:", 'value1'),
        ("\nTest with multiple value columns:", ['value1', 'value2']),
    )
    for heading, demo_value_cols in demo_cases:
        print(heading)
        print(long_to_wide_format_data(sample_long, 'id', 'time', demo_value_cols))
    print('end')

    # ------------------------------------
    # Disabled demo for the reverse (wide-to-long) conversion, kept for reference:
    #
    # data = {
    #     'id': ['A', 'B', 'C'],
    #     'T1_value1': [10, 40, 60],
    #     'T2_value1': [20, 50, 70],
    #     'T3_value1': [30, None, 80],
    #     'T1_value2': [1.1, 4.4, 6.6],
    #     'T2_value2': [2.2, 5.5, 7.7],
    #     'T3_value2': [3.3, None, 8.8]
    # }
    # df = pd.DataFrame(data)
    #
    # print(df)
    #
    # print("\nTest with single value column:")
    # print(wide_to_long_format_data(df, 'id', ['T1_value1', 'T2_value1', 'T3_value1']))
    #
    # print("\nTest with multiple value columns:")
    # print(wide_to_long_format_data(df, 'id',
    #     [['T1_value1', 'T2_value1', 'T3_value1'], ['T1_value2', 'T2_value2', 'T3_value2']]))
@@ -0,0 +1,41 @@
1
+ # This file makes 'datasets' a Python package
2
+
3
+
4
def list_datasets():
    """
    List all available datasets in the `datasets` package.

    Returns:
        list[str]: Names of the bundled CSV files, without the `.csv` extension,
        sorted alphabetically so the result is stable across platforms.
    """
    # Delay imports to avoid circular dependency issues during installation
    from importlib.resources import files
    from pathlib import PurePosixPath

    # Enumerate the package's resources directly instead of deriving the folder
    # from the location of __init__.py, which only works for packages unpacked on disk
    resource_names = (PurePosixPath(entry.name) for entry in files("sequenzo.datasets").iterdir())
    return sorted(resource.stem for resource in resource_names if resource.suffix == ".csv")
12
+
13
+
14
def load_dataset(name):
    """
    Load a built-in dataset from the sequenzo package dynamically.

    Parameters:
        name (str): The name of the dataset (without `.csv`).

    Returns:
        pd.DataFrame: Loaded dataset as a pandas DataFrame.

    Raises:
        ValueError: If `name` is not one of the names returned by `list_datasets()`.
    """
    # Import pandas only when the function is called, not when the module is loaded
    import pandas as pd
    # Import resources management module
    from importlib.resources import files

    available_datasets = list_datasets()  # Get the dynamic dataset list

    if name not in available_datasets:
        raise ValueError(f"Dataset '{name}' not found. Available datasets: {available_datasets}")

    # Load the dataset from the package; UTF-8 is stated explicitly so decoding
    # does not depend on the platform's default encoding
    dataset_file = files("sequenzo.datasets").joinpath(f"{name}.csv")
    with dataset_file.open("r", encoding="utf-8") as f:
        return pd.read_csv(f)
38
+
39
+
40
+ # Declare the public API: these are the names exported by `from sequenzo.datasets import *`
41
+ __all__ = ["load_dataset", "list_datasets"]