sequenzo 0.1.21__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (260) hide show
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,283 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : sequence_history_analysis.py
4
+ @Time : 30/09/2025 21:08
5
+ @Desc : Sequence History Analysis - Convert person-level sequence data to person-period format
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+
12
def person_level_to_person_period(data, id_col="id", period_col="time", event_col="event"):
    """
    Convert person-level data to person-period format.

    Each input row is repeated once per observed period. In the output the
    period column counts 1, 2, 3, ... within each ID, and the event column is
    False everywhere except (possibly) on a person's final period, where it
    takes the person's original event value.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data with one row per person
    id_col : str, optional
        Name of the ID column (default: "id")
    period_col : str, optional
        Name of the time period column (default: "time"). Values are the
        number of periods each person is observed; they must be non-negative
        whole numbers. A person with 0 periods contributes no rows.
    event_col : str, optional
        Name of the event indicator column (default: "event")

    Returns
    -------
    pandas.DataFrame
        Expanded data with one row per person-period

    Raises
    ------
    ValueError
        If the id, period or event column contains missing values, or if the
        period column contains negative, non-finite or fractional durations.

    Examples
    --------
    >>> data = pd.DataFrame({'id': [1, 2], 'time': [3, 2], 'event': [True, False]})
    >>> person_level_to_person_period(data)
       id  time  event
    0   1     1  False
    1   1     2  False
    2   1     3   True
    3   2     1  False
    4   2     2  False
    """
    # Check for missing values in critical columns
    if data[[id_col, period_col, event_col]].isna().any().any():
        raise ValueError("Cannot handle missing data in the time or event variables")

    # np.repeat only accepts integer repeat counts, so whole-number floats
    # (e.g. 3.0) are coerced and anything fractional is rejected up front
    # with a readable message instead of an opaque casting error.
    durations = np.asarray(data[period_col].values)
    if durations.dtype.kind not in "iu":
        as_float = durations.astype(float)
        whole = np.all(np.isfinite(as_float)) and np.all(as_float == np.floor(as_float))
        if not whole:
            raise ValueError(f"Column '{period_col}' must contain whole-number durations")
        durations = as_float
    durations = durations.astype(np.int64)
    if np.any(durations < 0):
        raise ValueError(f"Column '{period_col}' must not contain negative durations")

    # Row i of the input is repeated durations[i] times
    # (e.g. time=3 -> that row appears 3 times).
    index = np.repeat(np.arange(len(data)), durations)

    # Expand the data by repeating rows
    dat = data.iloc[index].copy()
    dat.reset_index(drop=True, inplace=True)

    # Create sequential time periods for each ID (1, 2, 3, ...)
    dat[period_col] = dat.groupby(id_col).cumcount() + 1

    # Set all events to False initially
    dat[event_col] = False

    # The last expanded row of person i sits at cumsum(durations)[i] - 1.
    # Persons with a duration of 0 have no rows at all, so they must be
    # skipped: otherwise their "last row" index would alias the previous
    # person's final row (or -1 for the first person).
    observed = durations > 0
    last_rows = np.cumsum(durations)[observed] - 1
    if last_rows.size:
        # Convert to bool to avoid dtype incompatibility warning
        final_events = data[event_col].values.astype(bool)[observed]
        dat.loc[last_rows, event_col] = final_events

    return dat
72
+
73
+
74
+ def _extract_sequence_dataframe(seqdata):
75
+ """
76
+ Extract sequence DataFrame from various input types.
77
+
78
+ Parameters
79
+ ----------
80
+ seqdata : SequenceData, pandas.DataFrame, or numpy.ndarray
81
+ Input sequence data
82
+
83
+ Returns
84
+ -------
85
+ pandas.DataFrame
86
+ Sequence data as a DataFrame
87
+ """
88
+ # Check if input is a SequenceData object
89
+ if hasattr(seqdata, 'seqdata'):
90
+ # This is a SequenceData object
91
+ return seqdata.seqdata.copy()
92
+ elif isinstance(seqdata, pd.DataFrame):
93
+ return seqdata.copy()
94
+ else:
95
+ # Assume it's array-like
96
+ return pd.DataFrame(seqdata)
97
+
98
+
99
def seqsha(seqdata, time, event, include_present=False, align_end=False, covar=None):
    """
    Sequence History Analysis: Create person-period format with sequence history.

    This function converts sequence data into a person-period format where each
    row represents a time point for a person, with columns showing their sequence
    history up to that point.

    Parameters
    ----------
    seqdata : SequenceData, pandas.DataFrame, or numpy.ndarray
        Sequence data where each row is a person and each column is a time point.
        Can be a SequenceData object, DataFrame, or array.
    time : array-like
        Duration or time until event for each person. Length should equal the
        number of sequences. Each value indicates how many time periods that
        person is observed. For example, if all persons are observed for the
        full sequence length, use: np.full(n_persons, sequence_length)
    event : array-like
        Event indicator for each person (True/False or 1/0). Length should
        equal the number of sequences.
    include_present : bool, optional
        If True, include the current time point in the history (default: False)
        If False, only include past time points (recommended for most analyses)
    align_end : bool, optional
        If True, align sequences from the end (right-aligned) (default: False)
        If False, align sequences from the start (left-aligned)
    covar : pandas.DataFrame or numpy.ndarray, optional
        Additional covariates to merge with the output (default: None)
        Must have the same number of rows as seqdata

    Returns
    -------
    pandas.DataFrame
        Person-period data with the following columns:
        - id: Person identifier (1-based position of the person in seqdata)
        - time: Time period within person
        - event: Event indicator (True only at the final period for each person)
        - Sequence history columns (varies based on align_end parameter);
          cells outside a row's history are NaN
        - Additional covariate columns (if covar is provided)

    Raises
    ------
    ValueError
        If 'time', 'event' or 'covar' do not have one entry per sequence, or
        if maximum time exceeds the length of the longest sequence

    Examples
    --------
    Example 1: Basic usage with DataFrame
    >>> import pandas as pd
    >>> import numpy as np
    >>> seqdata = pd.DataFrame([[1, 2, 3, 4], [1, 1, 2, 2]])
    >>> time = np.array([3, 2])
    >>> event = np.array([True, False])
    >>> result = seqsha(seqdata, time, event)

    Example 2: Usage with SequenceData object (recommended)
    >>> from sequenzo import SequenceData, load_dataset
    >>> df = load_dataset('pairfam_family')
    >>> time_cols = [str(i) for i in range(1, 265)]
    >>> seq_data = SequenceData(df, time=time_cols, id_col='id',
    ...                         states=list(range(1, 10)))
    >>> # All persons observed for 264 months
    >>> time = np.full(len(df), 264)
    >>> event = df['highschool'].values
    >>> result = seqsha(seq_data, time, event)

    Example 3: With covariates
    >>> covar = df[['sex', 'yeduc', 'east']]
    >>> result = seqsha(seq_data, time, event, covar=covar)

    Example 4: Right-aligned sequences
    >>> result = seqsha(seq_data, time, event, align_end=True)

    Notes
    -----
    - The time parameter represents observation duration, not calendar time
    - When include_present=False (default), only past states are included:
      the row for period t holds the states of periods 1 .. t-1
    - When include_present=True, the row for period t holds periods 1 .. t
    - Use align_end=True when analyzing sequences leading up to an event
    - Missing values in the original sequence are converted to "NA_orig"
    """
    # Extract sequence DataFrame from input (handles SequenceData, DataFrame, or array)
    seq_df = _extract_sequence_dataframe(seqdata)

    # Convert time and event to numpy arrays for consistency
    time_array = np.asarray(time)
    event_array = np.asarray(event)

    # Check that dimensions match
    n_sequences, n_positions = seq_df.shape
    if len(time_array) != n_sequences:
        raise ValueError(
            f"Length of 'time' ({len(time_array)}) must match number of sequences ({n_sequences})"
        )
    if len(event_array) != n_sequences:
        raise ValueError(
            f"Length of 'event' ({len(event_array)}) must match number of sequences ({n_sequences})"
        )
    # Covariates are merged by row position, so a different row count would
    # either fail late with an IndexError or silently drop trailing rows.
    if covar is not None and len(covar) != n_sequences:
        raise ValueError(
            f"Number of rows in 'covar' ({len(covar)}) must match number of sequences ({n_sequences})"
        )

    # Create base time data: one row per person with their time and event
    basetime = pd.DataFrame({
        'id': np.arange(1, n_sequences + 1),
        'time': time_array,
        'event': event_array
    })

    # Convert to person-period format (expand rows)
    persper = person_level_to_person_period(basetime, "id", "time", "event")

    # Convert sequence data to a string matrix and mark missing values.
    # The intermediate str array has a fixed character width; going through
    # object dtype keeps "NA_orig" from being truncated to that width
    # (e.g. to "NA_" when every state label is at most three characters).
    raw_values = seq_df.values
    sdata = raw_values.astype(str).astype(object)
    sdata[pd.isna(raw_values)] = "NA_orig"

    # Get the time periods for each row in person-period data
    age = persper['time'].values
    # No person-period rows (e.g. no sequences) -> nothing to fill
    ma = int(np.max(age)) if len(age) else 0

    # Check if time values are valid
    if ma > n_positions:
        raise ValueError("Maximum time of event occurrence is higher than the longest sequence!")

    # Create empty matrix to store past sequence states
    past = np.full((len(persper), n_positions), np.nan, dtype=object)

    # 0-based row of each person-period row's person in sdata
    person_rows = persper['id'].values - 1

    if align_end:
        # Right-align the sequences (align from the end)
        start = 1 if include_present else 2

        for aa in range(start, ma + 1):
            # Find rows where time equals aa
            cond = age == aa
            ids_a = person_rows[cond]

            if include_present:
                # Include current time point: the first aa states fill the last aa columns
                past[cond, (n_positions - aa):n_positions] = sdata[ids_a, 0:aa]
            else:
                # Exclude current time point: the first aa-1 states fill the last aa-1 columns
                past[cond, (n_positions - aa + 1):n_positions] = sdata[ids_a, 0:(aa - 1)]

        # Create column names counting backwards
        col_names = [f"Tm{i}" for i in range(n_positions, 0, -1)]
    else:
        # Left-align the sequences (align from the start)
        for aa in range(1, ma + 1):
            # Position aa belongs to the history of a period-t row when
            # aa <= t (present included) or aa < t (strictly past only).
            # This matches the right-aligned branch and the documented
            # meaning of include_present.
            if include_present:
                cond = age >= aa
            else:
                cond = age > aa

            # Fill in the sequence state at position aa-1 (0-based)
            past[cond, aa - 1] = sdata[person_rows[cond], aa - 1]

        # Use original column names or create default ones
        if seq_df.columns is not None and len(seq_df.columns) > 0:
            col_names = [str(col) for col in seq_df.columns[:ma]]
            # Columns beyond the maximum observed time stay empty and get placeholder names
            col_names += [f"col_{i}" for i in range(ma, n_positions)]
        else:
            col_names = [f"col_{i}" for i in range(n_positions)]

    # Convert past matrix to DataFrame
    past_df = pd.DataFrame(past, columns=col_names)

    # Combine person-period data with sequence history
    alldata = pd.concat([persper.reset_index(drop=True), past_df], axis=1)

    # Add covariates if provided
    if covar is not None:
        # Look up each row's covariates by person position (id is 1-based)
        if isinstance(covar, pd.DataFrame):
            covar_subset = covar.iloc[alldata['id'].values - 1].reset_index(drop=True)
            alldata = pd.concat([alldata, covar_subset], axis=1)
        else:
            covar_array = np.array(covar)
            covar_subset = covar_array[alldata['id'].values - 1]
            alldata = pd.concat([alldata, pd.DataFrame(covar_subset)], axis=1)

    return alldata
@@ -0,0 +1,308 @@
1
+ Metadata-Version: 2.4
2
+ Name: sequenzo
3
+ Version: 0.1.21
4
+ Summary: A fast, scalable and intuitive Python package for social sequence analysis.
5
+ Author-email: Yuqi Liang <yuqi.liang.1900@gmail.com>, Xinyi Li <1836724126@qq.com>, Jan Heinrich Ernst Meyerhoff-Liang <jan.meyerhoff1@gmail.com>
6
+ License: BSD 3-Clause License
7
+
8
+ Copyright (c) 2025, Yuqi Liang
9
+
10
+ Redistribution and use in source and binary forms, with or without
11
+ modification, are permitted provided that the following conditions are met:
12
+
13
+ 1. Redistributions of source code must retain the above copyright notice, this
14
+ list of conditions and the following disclaimer.
15
+
16
+ 2. Redistributions in binary form must reproduce the above copyright notice,
17
+ this list of conditions and the following disclaimer in the documentation
18
+ and/or other materials provided with the distribution.
19
+
20
+ 3. Neither the name of the copyright holder nor the names of its
21
+ contributors may be used to endorse or promote products derived from
22
+ this software without specific prior written permission.
23
+
24
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
28
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
+
35
+ Project-URL: Homepage, https://github.com/Liang-Team/Sequenzo
36
+ Project-URL: Documentation, https://sequenzo.yuqi-liang.tech
37
+ Classifier: Development Status :: 3 - Alpha
38
+ Classifier: Intended Audience :: Science/Research
39
+ Classifier: Intended Audience :: Developers
40
+ Classifier: Programming Language :: Python :: 3.9
41
+ Classifier: Programming Language :: Python :: 3.10
42
+ Classifier: Programming Language :: Python :: 3.11
43
+ Classifier: Programming Language :: Python :: 3.12
44
+ Requires-Python: <3.13,>=3.9
45
+ Description-Content-Type: text/markdown
46
+ License-File: LICENSE
47
+ Requires-Dist: numpy>=2.0.0
48
+ Requires-Dist: pandas>=1.2.5
49
+ Requires-Dist: matplotlib>=3.4.3
50
+ Requires-Dist: seaborn>=0.11.2
51
+ Requires-Dist: Pillow>=8.3.2
52
+ Requires-Dist: pybind11>=2.6.0
53
+ Requires-Dist: cython>=0.29.21
54
+ Requires-Dist: scipy>=1.6.3
55
+ Requires-Dist: scikit-learn>=0.24.2
56
+ Requires-Dist: fastcluster>=1.2.6
57
+ Requires-Dist: joblib>=1.0.1
58
+ Requires-Dist: docutils>=0.17
59
+ Requires-Dist: tqdm<5.0.0,>=4.62.3
60
+ Requires-Dist: missingno<0.6.0,>=0.5.2
61
+ Requires-Dist: cffi>=1.15.0
62
+ Provides-Extra: r
63
+ Requires-Dist: rpy2>=3.5.12; python_version >= "3.12" and extra == "r"
64
+ Requires-Dist: rpy2>=3.5.6; python_version == "3.11" and extra == "r"
65
+ Requires-Dist: rpy2>=3.5.6; python_version == "3.10" and extra == "r"
66
+ Requires-Dist: rpy2>=3.5.6; python_version == "3.9" and extra == "r"
67
+ Provides-Extra: dev
68
+ Requires-Dist: pytest>=6.2.5; extra == "dev"
69
+ Requires-Dist: flake8>=3.9.2; extra == "dev"
70
+ Dynamic: license-file
71
+
72
+ <p align="center">
73
+ <img src="https://raw.githubusercontent.com/Liang-Team/Sequenzo/main/assets/logo/FullLogo_NoBuffer.jpg" alt="Sequenzo Logo" width="300">
74
+ </p>
75
+
76
+ <p align="center">
77
+ <!-- ✅ PyPI Latest Version Badge -->
78
+ <a href="https://pypi.org/project/sequenzo/">
79
+ <img alt="PyPI - Version" src="https://img.shields.io/pypi/v/sequenzo?color=blue">
80
+ </a>
81
+
82
+ <!-- 📦 Downloads Badge (optional) -->
83
+ <a href="https://pypi.org/project/sequenzo/">
84
+ <img alt="Downloads" src="https://static.pepy.tech/badge/sequenzo">
85
+ </a>
86
+
87
+ <!-- 📄 License Badge -->
88
+ <a href="https://github.com/Liang-Team/Sequenzo/blob/main/LICENSE">
89
+ <img alt="License" src="https://img.shields.io/github/license/Liang-Team/Sequenzo">
90
+ </a>
91
+ </p>
92
+
93
+ # Sequenzo: Fast, scalable, and intuitive social sequence analysis in Python
94
+
95
+ Sequenzo is a high-performance Python package designed for social sequence analysis. It is built to analyze **any sequence of categorical events**, from individual career paths and migration patterns to corporate growth and urban development.
96
+ Whether you are working with **people, places, or policies**, Sequenzo helps uncover meaningful patterns efficiently.
97
+
98
+ Sequenzo outperforms traditional R-based tools in social sequence analysis, delivering faster processing and superior efficiency, especially for large-scale datasets. **No big data? No problem. You don’t need big data to benefit, as Sequenzo is designed to enhance sequence analysis at any scale, making complex methods accessible to everyone.**
99
+
100
+ > 🚀 **Explore the official documentation at [sequenzo.yuqi-liang.tech](https://sequenzo.yuqi-liang.tech/en/)**
101
+ > with tutorials, practical examples, and API references to help you get started quickly.
102
+ >
103
+ > 📖 Available in **English and Chinese**, our docs are written to be approachable, practical, and easy to follow.
104
+
105
+ ## ✨ Be part of the Sequenzo community
106
+ Join our Discord channel to discuss ideas, get help, and hear about upcoming Sequenzo versions, tutorials, and workshops first.
107
+
108
+ ➡️ https://discord.gg/3bMDKRHW
109
+
110
+ ## Target Users
111
+
112
+ Sequenzo is designed for:
113
+
114
+ - Quantitative researchers in sociology, demography, political science, economics, management, etc.
115
+ - Data scientists, data analysts, and business analysts working on trajectory/time-series clustering
116
+ - Educators teaching courses involving social sequence data
117
+ - Users familiar with R packages such as `TraMineR` who want a Python-native alternative
118
+
119
+ ## Why Choose Sequenzo?
120
+
121
+ 🚀 **High Performance**
122
+
123
+ Leverages Python’s computational power to achieve 8× faster processing than traditional R-based tools like TraMineR.
124
+
125
+ 🎯 **Easy-to-Use API**
126
+
127
+ Designed with simplicity in mind: intuitive functions streamline complex sequence analysis without compromising flexibility.
128
+
129
+ 🌍 **Flexible for Any Scenario**
130
+
131
+ Perfect for research, policy, and business, enabling seamless analysis of categorical data and its evolution over time.
132
+
133
+ ## Platform Compatibility
134
+
135
+ Sequenzo provides pre-built Python wheels for maximum compatibility — no need to compile from source.
136
+
137
+ | Platform | Architecture | Python Versions | Status |
138
+ |------------------|-------------------------------|-----------------------|-------------------|
139
+ | **macOS** | `universal2` (Intel + Apple Silicon) | 3.9, 3.10, 3.11, 3.12 | ✅ Pre-built wheel |
140
+ | **Windows** | `AMD64` (64-bit) | 3.9, 3.10, 3.11, 3.12 | ✅ Pre-built wheel |
141
+ | **Linux (glibc)**| `x86_64` (standard Linux) | 3.9, 3.10, 3.11, 3.12 | ✅ Pre-built wheel |
142
+ | **Linux (musl)** | `x86_64` (Alpine Linux) | 3.9, 3.10, 3.11, 3.12 | ✅ Pre-built wheel |
143
+
144
+
145
+ What do these terms mean?
146
+ - **universal2 (macOS)**: One wheel supports both Intel (x86_64) and Apple Silicon (arm64) Macs.
147
+ - **manylinux2014 (glibc-based Linux)**: Compatible with most mainstream Linux distributions (e.g., Ubuntu, Debian, CentOS).
148
+ - **musllinux_1_2 (musl-based Linux)**: For lightweight Alpine Linux environments, common in Docker containers.
149
+ - **AMD64 (Windows)**: Standard 64-bit Windows system architecture.
150
+
151
+ All of these wheels are pre-built and available on PyPI — so `pip install sequenzo` should work on supported platforms, without needing a compiler.
152
+
153
+ **Windows (win32)** and **Linux (i686)** are dropped due to:
154
+
155
+ - Extremely low usage in modern systems (post-2020)
156
+ - Memory limitations (≤ 4GB) unsuitable for scientific computing workloads
157
+ - Increasing incompatibility with packages such as `numpy`, `scipy`, and `pybind11`
158
+ - Frequent build failures and maintenance overhead in CI/CD pipelines
159
+
160
+
161
+ ## Installation
162
+
163
+ If you haven't installed Python, please follow [Yuqi's tutorial about how to set up Python and your virtual environment](https://www.yuqi-liang.tech/blog/setup-python-virtual-environment/).
164
+
165
+ Once Python is installed, we highly recommend using [PyCharm](https://www.jetbrains.com/pycharm/download/) as your IDE (Integrated Development Environment — the place where you open your folder and files to work with Python), rather than Visual Studio. PyCharm has excellent built-in support for managing virtual environments, making your workflow much easier and more reliable.
166
+
167
+ In PyCharm, please make sure to select a virtual environment using Python 3.9, 3.10, 3.11, or 3.12, as these versions are fully supported by `sequenzo`.
168
+
169
+ Then, you can open the built-in terminal by clicking the Terminal icon
170
+ <img src="https://github.com/user-attachments/assets/1e9e3af0-4286-47ba-aa88-29c3288cb7cb" alt="terminal icon" width="30" style="display:inline; vertical-align:middle;">
171
+ in the left sidebar (usually near the bottom). It looks like a small command-line window icon.
172
+
173
+ Once it’s open, type the following to install `sequenzo`:
174
+
175
+ ```
176
+ pip install sequenzo
177
+ ```
178
+
179
+ If you have some issues with the installation, it might be because you have both Python 2 and Python 3 installed on your computer. In this case, you can try to use `pip3` instead of `pip` to install the package.
180
+
181
+ ```
182
+ pip3 install sequenzo
183
+ ```
184
+
185
+ ### ⚠️ Having Installation or Import Issues?
186
+
187
+ **Error:** `ImportError: numpy.core.multiarray failed to import` or `ValueError: numpy.dtype size changed`
188
+
189
+ **Cause:** NumPy version incompatibility. Sequenzo 0.1.21+ requires NumPy 2.x.
190
+
191
+ **Quick Fix** (copy-paste these commands):
192
+ ```bash
193
+ # Check your NumPy version first
194
+ python -c "import numpy; print(f'NumPy: {numpy.__version__}')"
195
+
196
+ # If you see 1.x.x, upgrade to 2.x:
197
+ pip install --upgrade "numpy>=2.0.0"
198
+ pip uninstall sequenzo -y
199
+ pip install --no-cache-dir sequenzo
200
+ ```
201
+
202
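+ **Note:** NumPy 2.x is largely backward compatible with code written for NumPy 1.x, so upgrading is usually safe. If other packages in your environment still require NumPy 1.x, consult the NumPy 2.0 migration guide before upgrading.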
203
+
204
+ 📖 **Still having issues?**
205
+ 1. Run our diagnostic tool to identify the problem:
206
+ ```bash
207
+ curl -O https://raw.githubusercontent.com/Liang-Team/Sequenzo/main/diagnose.py
208
+ python diagnose.py
209
+ ```
210
+ 2. See our detailed guides:
211
+ - **[QUICK_FIX.md](QUICK_FIX.md)** - Simple step-by-step solutions
212
+ - **[TROUBLESHOOTING.md](TROUBLESHOOTING.md)** - Comprehensive troubleshooting
213
+ - **[docs/WHY_IMPORT_FAILS.md](docs/WHY_IMPORT_FAILS.md)** - Technical explanation
214
+
215
+ ### Optional R Integration
216
+
217
+ Sequenzo now checks the system environment variables before running ward.D hierarchical clustering.
218
+
219
+ If R is missing, a relevant prompt will be displayed along with specific installation instructions. If `fastcluster` is missing, Sequenzo will automatically download `fastcluster`.
220
+
221
+ Before automatically downloading `fastcluster`, Sequenzo checks whether R is available; if R is not installed, Sequenzo will not automatically download `fastcluster`.
222
+
223
+ Sequenzo supports advanced Ward clustering methods that require R integration. If you need to use the `ward_d` clustering method, install with R support:
224
+
225
+ ```
226
+ pip install sequenzo[r]
227
+ ```
228
+
229
+ This will install the optional `rpy2` dependency, which provides Python-R interoperability. Note that R must also be installed on your system for `rpy2` to work.
230
+
231
+ For more information about the latest stable release and required dependencies, please refer to [PyPI](https://pypi.org/project/sequenzo/).
232
+
233
+ ## Documentation
234
+
235
+ Explore the full Sequenzo documentation [here](https://sequenzo.yuqi-liang.tech). Even though the documentation website is still under construction, you can already find some useful information there.
236
+
237
+ **Where to start on the documentation website?**
238
+ * New to Sequenzo or social sequence analysis? Begin with "About Sequenzo" → "Quickstart Guide" for a smooth introduction.
239
+ * Got your own data? After going through "About Sequenzo" and "Quickstart Guide", you are ready to dive in and start analyzing.
240
+ * Looking for more? Check out our example datasets and tutorials to deepen your understanding.
241
+
242
+ For Chinese users, additional tutorials are available on [Yuqi's video tutorials on Bilibili](https://space.bilibili.com/263594713/lists/4147974).
243
+
244
+ ## Join the Community
245
+
246
+ 💬 **Have a question or found a bug?**
247
+
248
+ Please submit an issue on [GitHub Issues](https://github.com/Liang-Team/Sequenzo/issues) by following [this instruction](https://sequenzo.yuqi-liang.tech/en/faq/bug_reports_and_feature_requests).
249
+
250
+ * We will respond as quickly as possible.
251
+ * For requests that are not too large, we aim to fix or implement the feature **within one week** from our response time.
252
+ * Timeline may vary depending on how many requests we receive.
253
+
254
+ 🌟 **Enjoying Sequenzo?**
255
+
256
+ Support the project by starring ⭐ the GitHub repo and spreading the word!
257
+
258
+ 🛠 **Interested in contributing?**
259
+
260
+ Check out our contribution guide for more details (work in progress).
261
+
262
+ * Write code? Submit a pull request to enhance Sequenzo.
263
+ * Testing? Try Sequenzo and share your feedback. Every suggestion counts!
264
+
265
+ If you're contributing or debugging, use:
266
+
267
+ ```bash
268
+ pip install -r requirements/requirements-3.10.txt # Or matching your Python version
269
+ ```
270
+
271
+ For standard installation, use:
272
+
273
+ ```bash
274
+ pip install . # Uses pyproject.toml
275
+ ```
276
+
277
+ ## Team
278
+
279
+ **Paper Authors**
280
+ * [Yuqi Liang, University of Oxford](https://www.yuqi-liang.tech/)
281
+ * [Xinyi Li, Northeastern University](https://github.com/Fantasy201)
282
+ * [Jan Heinrich Ernst Meyerhoff-Liang, Institute for New Economic Thinking Oxford](https://www.inet.ox.ac.uk/people/jan-meyerhoff-liang)
283
+
284
+ **Package Contributors**
285
+
286
+ Coding contributors:
287
+ * [Sebastian Daza](https://sdaza.com/)
288
+ * [Cheng Deng](https://github.com/de-de-de-de-de)
289
+ * [Liangxingyun He, Stockholm School of Economics, Sweden](https://www.linkedin.com/in/liangxingyun-he-6aa128304/)
290
+
291
+ Documentation contributors:
292
+ * [Liangxingyun He, Stockholm School of Economics, Sweden](https://www.linkedin.com/in/liangxingyun-he-6aa128304/)
293
+ * [Yukun Ming, Universidad Carlos III de Madrid (Spain)](https://www.linkedin.com/in/yukun)
294
+ * [Sizhu Qu, Northeastern University (US)](https://www.linkedin.com/in/sizhuq)
295
+ * [Ziting Yang, Rochester University (US)](https://www.linkedin.com/in/ziting-yang-7b33832bb)
296
+
297
+ Others
298
+ * With special thanks to our initial testers (alphabetically ordered): [Joji Chia](https://sociology.illinois.edu/directory/profile/jbchia2), [Kass Gonzalez](https://www.linkedin.com/in/kass-gonzalez-72a778276/), [Sinyee Lu](https://sociology.illinois.edu/directory/profile/qianyil4), [Sohee Shin](https://sociology.illinois.edu/directory/profile/sohees2)
299
+ * Website and related technical support: [Mactavish](https://github.com/mactavishz)
300
+ * Sequence data sources compilation - History: Jingrui Chen
301
+ * Visual design consultant: Changyu Yi
302
+
303
+ **Acknowledgements**
304
+
305
+ * Methodological advisor in sequence analysis: [Professor Tim Liao (University of Illinois Urbana-Champaign)](https://sociology.illinois.edu/directory/profile/tfliao)
306
+ * Yuqi's PhD advisor [Professor Ridhi Kashyap (University of Oxford)](https://www.nuffield.ox.ac.uk/people/profiles/ridhi-kashyap/), and mentor [Charles Rahal (University of Oxford)](https://crahal.com/)
307
+ * Yuqi's original programming mentor: [JiangHuShiNian](https://github.com/jianghushinian)
308
+