sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,303 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : helpers.py
4
+ @Time : 01/05/2025 09:27
5
+ @Desc :
6
+ """
7
+ import pandas as pd
8
+ import numpy as np
9
+ import re
10
+ import matplotlib.pyplot as plt
11
+ import missingno as msno
12
+ from typing import Union, List
13
+
14
+
15
def clean_time_columns_auto(df: pd.DataFrame, prefix_patterns: list = None) -> pd.DataFrame:
    """
    Clean column names by extracting the numeric part (e.g. status15 -> 15, pstatus16 -> 16).

    Columns selected for processing are renamed to the *last* run of digits found in
    their name (as a string). Other columns (e.g. id, sex) are left unchanged. Useful
    for time-series or panel data where columns are named like status1, status2, ...
    or pstatus15, ...

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame. It is not modified.
    prefix_patterns : list, optional
        Prefixes to match (e.g. ['status'], ['pstatus']). Only columns whose name starts
        with one of these prefixes are candidates for renaming. A single string is
        treated as one prefix. If None, every column is a candidate.

    Returns
    -------
    pd.DataFrame
        New DataFrame with cleaned column names (same data).

    Notes
    -----
    - A candidate is renamed only if its name contains at least one digit and at least
      one ASCII letter; purely numeric names such as "2020" are kept as they are.
    - Non-string column labels (e.g. integers) are left untouched.
    - Two columns that end in the same number (e.g. status15 and pstatus15) are both
      renamed to "15", producing duplicate column names; restrict `prefix_patterns`
      to avoid this.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(prefix_patterns, str):
        prefix_patterns = [prefix_patterns]
    prefixes = None if prefix_patterns is None else tuple(prefix_patterns)

    rename_map = {}
    for col in df.columns:
        # Regex / startswith only make sense on string labels.
        if not isinstance(col, str):
            continue
        if prefixes is not None and not col.startswith(prefixes):
            continue

        numbers = re.findall(r"\d+", col)
        if numbers and re.search(r"[a-zA-Z]", col):
            rename_map[col] = numbers[-1]

    # rename() returns a new DataFrame; labels missing from the map are kept.
    return df.rename(columns=rename_map)
59
+
60
+
61
def assign_unique_ids(df: pd.DataFrame, id_col_name: str = "Entity ID") -> pd.DataFrame:
    """
    Return a copy of the DataFrame with a running integer ID as its first column.

    IDs start at 0 and follow the current row order.

    :param df: Input DataFrame (left unmodified).
    :param id_col_name: Name of the new ID column (default = "Entity ID").
    :return: Copy of `df` with the ID column inserted at position 0.
    :raises ValueError: If a column named `id_col_name` already exists.
    """
    if id_col_name in df.columns:
        raise ValueError(f"[!] Column '{id_col_name}' already exists in the DataFrame.")

    with_ids = df.copy()
    row_ids = np.arange(len(with_ids))
    with_ids.insert(loc=0, column=id_col_name, value=row_ids)
    return with_ids
75
+
76
+
77
def long_to_wide_format_data(df: pd.DataFrame,
                             id_col: str,
                             time_col: str,
                             value_col: Union[str, List[str]]) -> pd.DataFrame:
    """
    Convert a long-format DataFrame to wide format.

    Each value column is pivoted so that every unique time point becomes a separate
    column, and each row corresponds to one unique sequence (identified by `id_col`).
    The per-value-column results are then joined on `id_col`.

    Parameters:
    ----------
    df : pd.DataFrame
        Input DataFrame in long format.

    id_col : str
        The name of the column representing unique entity IDs.

    time_col : str
        The name of the column containing time points (must be a string, not a list).

    value_col : Union[str, List[str]]
        The name(s) of the column(s) containing state values.
        Can be a single string or a non-empty list of strings.

    Returns:
    -------
    pd.DataFrame
        A wide-format DataFrame with `id_col` as the first column, followed by one
        column per (value column, time point) pair named "<value_col>_<time>".
        (id, time) combinations absent from the input are filled with NaN.

    Raises:
    ------
    ValueError
        If `value_col` is empty, or (raised by pandas) if an (id, time) pair occurs
        more than once so the data cannot be pivoted.

    Notes:
    -----
    - This function assumes `df` is already in long format.
    - `time_col` must be a column *name* (string), not a list.
    - The columns are always a flat, single-level index, also when several value
      columns are given; the residual column-index name left by pivot is removed.
    """
    # Normalise to a list so single and multiple value columns share one code path.
    value_cols = [value_col] if isinstance(value_col, str) else list(value_col)
    if not value_cols:
        raise ValueError("value_col must contain at least one column name.")

    wide = None
    for col in value_cols:
        pivoted = (
            df.pivot(index=id_col, columns=time_col, values=col)
            .add_prefix(f'{col}_')
            .reset_index()
        )
        # Outer join keeps IDs that only appear for some of the value columns.
        wide = pivoted if wide is None else pd.merge(wide, pivoted, on=id_col, how='outer')

    wide.columns.name = None  # Remove residual column group name from pivot
    return wide
133
+
134
+
135
def wide_to_long_format_data(df: pd.DataFrame,
                             id_col: str,
                             time_cols: Union[List[str], List[List[str]]],
                             var_name="time",
                             value_name="state") -> pd.DataFrame:
    """
    Reshape a wide-format DataFrame into long format.

    When `time_cols` is a flat list of column names the frame is melted once.
    When it is a list of lists, each inner list is melted separately and the
    pieces are stacked on top of each other under the same `var_name` /
    `value_name` columns.

    :param df: Wide-format DataFrame.
    :param id_col: Column with unique IDs.
    :param time_cols: List of time columns, or a list of such lists.
    :param var_name: Name for the time variable in long format.
    :param value_name: Name for the state value.
    :return: Long-format DataFrame with columns [id_col, var_name, value_name].
    """
    def _melt(columns):
        # One melt call per group of time columns.
        return df.melt(id_vars=[id_col], value_vars=columns,
                       var_name=var_name, value_name=value_name)

    # The type of the first entry decides between flat and nested input.
    first_entry = time_cols[0]
    if not isinstance(first_entry, str):
        stacked = pd.concat([_melt(group) for group in time_cols], ignore_index=True)
        return stacked.reset_index(drop=True)

    return _melt(time_cols)
158
+
159
+
160
def summarize_missing_values(df: pd.DataFrame,
                             plot: bool = True,
                             top_n: int = 5,
                             columns: list = None,
                             mode: str = 'matrix',  # 'matrix' or 'bar'
                             figsize=(10, 5),
                             save_as: str = None,
                             show: bool = True) -> None:
    """
    Summarize missing values in a DataFrame, with optional visualization.

    Prints (1) the count and percentage of missing values for every column that
    has any, and (2) the `top_n` rows with the most missing values. Optionally
    draws a missingno nullity plot of the analysed columns.

    :param df: Input DataFrame
    :param plot: Whether to visualize missing values
    :param top_n: Number of rows with most missing values to show
    :param columns: Columns to limit analysis to
    :param mode: 'matrix' or 'bar' for visualization mode
    :param figsize: Figure size for visualization (forwarded to missingno)
    :param save_as: Path to save figure
    :param show: If False the figure is closed before returning; if True it is
                 left open for the active matplotlib backend to display
    :raises ValueError: If plotting is requested and `mode` is not 'matrix' or 'bar'
    """
    print("🔍 Missing Value Summary")
    print("-" * 40)

    if columns:
        df = df[columns]

    # 1. Summary per column
    missing_per_column = df.isnull().sum()
    percent_missing = (missing_per_column / len(df)) * 100
    summary_df = pd.DataFrame({
        'Missing Count': missing_per_column,
        'Missing (%)': percent_missing.round(2)
    }).sort_values('Missing Count', ascending=False)

    print("[Columns with Missing Values]")
    print(summary_df[summary_df['Missing Count'] > 0])

    # 2. Summary per row
    row_missing = df.isnull().sum(axis=1)
    if row_missing.max() > 0:
        print(f"\n[Top {top_n} Rows with Most Missing Values]")
        print(row_missing.sort_values(ascending=False).head(top_n).rename("Missing Count").to_frame())

    # 3. Visualization
    if not plot or summary_df.empty:
        return

    # Validate before any figure is created so a bad mode leaves nothing behind.
    if mode not in ('matrix', 'bar'):
        raise ValueError("mode must be either 'matrix' or 'bar'")

    # Start from a fresh figure so an unrelated current figure is never drawn on.
    # missingno may draw on this figure or open its own, depending on the plot type.
    placeholder = plt.figure(figsize=figsize)
    draw = msno.matrix if mode == 'matrix' else msno.bar
    # figsize must be forwarded explicitly; otherwise missingno applies its own default.
    ax = draw(df, figsize=figsize)
    if ax.figure is not placeholder:
        # missingno opened its own figure: drop the unused empty one.
        plt.close(placeholder)

    if save_as:
        ax.figure.savefig(save_as, bbox_inches='tight', dpi=200)
    if not show:
        plt.close(ax.figure)
217
+
218
+
219
def replace_cluster_id_by_labels(df, mapping=None, new_cluster_column_name='Cluster', new_id_column_name='Entity ID'):
    """
    Once users have gotten the membership table,
    this function helps replace cluster IDs in a DataFrame with user-defined labels and updates column names.

    The input DataFrame is not modified; all changes are made on a copy.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing 'Entity ID' and 'Cluster' columns.
    mapping (dict, optional): A dictionary where keys are cluster IDs (e.g., 1, 2, 3, 4)
                    and values are the corresponding labels. Cluster IDs without an entry
                    keep their original value. Default is an empty dictionary.
    new_cluster_column_name (str): The name of the new cluster column. Default is 'Cluster'.
    new_id_column_name (str): The name of the new entity ID column. Default is 'Entity ID'.

    Returns:
    pd.DataFrame: A new DataFrame with cluster IDs replaced by labels and updated column names.

    Raises:
    ValueError: If 'Entity ID' or 'Cluster' is missing from `df`, or if `mapping`
                contains a key that is not a cluster ID present in `df`.

    Example:
    original_df = pd.DataFrame({'Entity ID': [1, 2, 3], 'Cluster': [1, 2, 3]})
    mapping = {1: 'A', 2: 'B', 3: 'C'}
    new_df = replace_cluster_id_by_labels(original_df, mapping, 'New Cluster', 'New ID')
    """
    if mapping is None:
        mapping = {}

    # Check if the necessary columns exist in the DataFrame
    if 'Entity ID' not in df.columns or 'Cluster' not in df.columns:
        raise ValueError("The input DataFrame must contain 'Entity ID' and 'Cluster' columns.")

    # Check if all keys in the mapping are valid cluster IDs in the DataFrame
    unique_clusters = set(df['Cluster'].unique())
    for cluster_id in mapping:
        if cluster_id not in unique_clusters:
            raise ValueError(f"Cluster ID {cluster_id} from the mapping does not exist in the DataFrame.")

    # Work on a copy so the caller's membership table stays intact and the
    # function can be called again on the same input.
    relabelled = df.copy()

    # Look up each ID and fall back to the ID itself, so unmapped values are not
    # routed through NaN (which would upcast integer IDs to float).
    relabelled['Cluster'] = relabelled['Cluster'].map(lambda cluster_id: mapping.get(cluster_id, cluster_id))

    # Rename the columns
    return relabelled.rename(columns={'Entity ID': new_id_column_name, 'Cluster': new_cluster_column_name})
259
+
260
+
261
if __name__ == '__main__':
    # Manual smoke test: build a small long-format table and pivot it to wide
    # format, first with one value column and then with two.
    demo_long = pd.DataFrame({
        'id': ['A', 'A', 'A', 'B', 'B', 'C', 'C', 'C'],
        'time': ['T1', 'T2', 'T3', 'T1', 'T2', 'T1', 'T2', 'T3'],
        'value1': [10, 20, 30, 40, 50, 60, 70, 80],
        'value2': [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8],
    })
    print(demo_long)

    # Single value column
    print("\nTest with a single value column:")
    single_value_wide = long_to_wide_format_data(demo_long, 'id', 'time', 'value1')
    print(single_value_wide)

    # Several value columns
    print("\nTest with multiple value columns:")
    multi_value_wide = long_to_wide_format_data(demo_long, 'id', 'time', ['value1', 'value2'])
    print(multi_value_wide)
    print('end')
283
+
284
+ # ------------------------------------
285
+ # data = {
286
+ # 'id': ['A', 'B', 'C'],
287
+ # 'T1_value1': [10, 40, 60],
288
+ # 'T2_value1': [20, 50, 70],
289
+ # 'T3_value1': [30, None, 80],
290
+ # 'T1_value2': [1.1, 4.4, 6.6],
291
+ # 'T2_value2': [2.2, 5.5, 7.7],
292
+ # 'T3_value2': [3.3, None, 8.8]
293
+ # }
294
+ # df = pd.DataFrame(data)
295
+ #
296
+ # print(df)
297
+ #
298
+ # print("\nTest with single value column:")
299
+ # print(wide_to_long_format_data(df, 'id', ['T1_value1', 'T2_value1', 'T3_value1']))
300
+ #
301
+ # print("\nTest with multiple value columns:")
302
+ # print(wide_to_long_format_data(df, 'id',
303
+ # [['T1_value1', 'T2_value1', 'T3_value1'], ['T1_value2', 'T2_value2', 'T3_value2']]))
@@ -0,0 +1,41 @@
1
+ # This file makes 'datasets' a Python package
2
+
3
+
4
def list_datasets():
    """
    List all available datasets in the `datasets` package.

    Returns:
    list[str]: Alphabetically sorted names (file names without the `.csv`
               extension) of the CSV files shipped in `sequenzo.datasets`.
    """
    # Delay imports to avoid circular dependency issues during installation
    from importlib import resources
    from pathlib import PurePath

    # files() replaces the path()/open_text() helpers deprecated in Python 3.11.
    entries = resources.files("sequenzo.datasets").iterdir()
    entry_names = (PurePath(entry.name) for entry in entries)

    # Sorted so the result does not depend on filesystem listing order.
    return sorted(path.stem for path in entry_names if path.suffix == ".csv")
12
+
13
+
14
def load_dataset(name):
    """
    Load a built-in dataset from the sequenzo package dynamically.

    Parameters:
    name (str): The name of the dataset (without `.csv`).

    Returns:
    pd.DataFrame: Loaded dataset as a pandas DataFrame.

    Raises:
    ValueError: If `name` is not one of the names returned by `list_datasets()`.
    """
    # Import pandas only when the function is called, not when the module is loaded
    import pandas as pd
    # Import resources management module
    from importlib import resources

    available_datasets = list_datasets()  # Get the dynamic dataset list

    if name not in available_datasets:
        raise ValueError(f"Dataset '{name}' not found. Available datasets: {available_datasets}")

    # Load the dataset from the package. files() replaces open_text(), which is
    # deprecated since Python 3.11; UTF-8 matches open_text()'s default encoding.
    csv_resource = resources.files("sequenzo.datasets").joinpath(f"{name}.csv")
    with csv_resource.open("r", encoding="utf-8") as f:
        return pd.read_csv(f)
38
+
39
+
40
+ # Public API of this package: the names exported by `from sequenzo.datasets import *`
41
+ __all__ = ["load_dataset", "list_datasets"]