sequenzo 0.1.24__cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic. Click here for more details.

Files changed (264):
  1. _sequenzo_fastcluster.cpython-311-darwin.so +0 -0
  2. sequenzo/__init__.py +240 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +474 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +20 -0
  30. sequenzo/data_preprocessing/helpers.py +256 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/mvad.csv +713 -0
  44. sequenzo/datasets/pairfam_family.csv +1867 -0
  45. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  46. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  47. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  48. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  49. sequenzo/define_sequence_data.py +609 -0
  50. sequenzo/dissimilarity_measures/__init__.py +31 -0
  51. sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
  52. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  53. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  54. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  55. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  56. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  57. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  58. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  59. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  60. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  61. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  62. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  63. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  214. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  215. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  216. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
  217. sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
  218. sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
  219. sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
  220. sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
  221. sequenzo/multidomain/__init__.py +23 -0
  222. sequenzo/multidomain/association_between_domains.py +311 -0
  223. sequenzo/multidomain/cat.py +431 -0
  224. sequenzo/multidomain/combt.py +519 -0
  225. sequenzo/multidomain/dat.py +89 -0
  226. sequenzo/multidomain/idcd.py +139 -0
  227. sequenzo/multidomain/linked_polyad.py +292 -0
  228. sequenzo/openmp_setup.py +233 -0
  229. sequenzo/prefix_tree/__init__.py +43 -0
  230. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  231. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  232. sequenzo/prefix_tree/utils.py +54 -0
  233. sequenzo/sequence_characteristics/__init__.py +40 -0
  234. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  235. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  236. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  237. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  238. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  239. sequenzo/sequence_characteristics/turbulence.py +155 -0
  240. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  241. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  242. sequenzo/suffix_tree/__init__.py +48 -0
  243. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  244. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  245. sequenzo/suffix_tree/utils.py +56 -0
  246. sequenzo/visualization/__init__.py +29 -0
  247. sequenzo/visualization/plot_mean_time.py +194 -0
  248. sequenzo/visualization/plot_modal_state.py +276 -0
  249. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  250. sequenzo/visualization/plot_relative_frequency.py +404 -0
  251. sequenzo/visualization/plot_sequence_index.py +951 -0
  252. sequenzo/visualization/plot_single_medoid.py +153 -0
  253. sequenzo/visualization/plot_state_distribution.py +627 -0
  254. sequenzo/visualization/plot_transition_matrix.py +190 -0
  255. sequenzo/visualization/utils/__init__.py +23 -0
  256. sequenzo/visualization/utils/utils.py +310 -0
  257. sequenzo/with_event_history_analysis/__init__.py +35 -0
  258. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  259. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  260. sequenzo-0.1.24.dist-info/METADATA +255 -0
  261. sequenzo-0.1.24.dist-info/RECORD +264 -0
  262. sequenzo-0.1.24.dist-info/WHEEL +5 -0
  263. sequenzo-0.1.24.dist-info/licenses/LICENSE +28 -0
  264. sequenzo-0.1.24.dist-info/top_level.txt +2 -0
@@ -0,0 +1,311 @@
1
+ """
2
+ @Author : 梁彧祺
3
+ @File : simple_characteristics.py
4
+ @Time : 22/09/2025 22:40
5
+ @Desc : Simple sequence characteristics functions
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from typing import Union, List
11
+
12
+ from sequenzo.define_sequence_data import SequenceData
13
+ from sequenzo.dissimilarity_measures.utils.seqdss import seqdss
14
+ from sequenzo.dissimilarity_measures.utils.seqlength import seqlength
15
+ from sequenzo.dissimilarity_measures.utils.get_sm_trate_substitution_cost_matrix import get_sm_trate_substitution_cost_matrix
16
+
17
+
18
def get_subsequences_in_single_sequence(x: np.ndarray, nbstat: int, statlist: List, void=None, nr=None, with_missing: bool = False) -> int:
    """
    Count the distinct subsequences (the empty one included) of one sequence.

    Dynamic programme: appending an element doubles the number of
    subsequences built so far; if the same state was appended before, the
    subsequences that already existed just before its previous occurrence
    are subtracted so they are not counted twice.

    Args:
        x (np.ndarray): Single sequence (e.g., [1, 2, 1, 3]). Any array-like is
            accepted; it is converted with ``np.asarray``.
        nbstat (int): Number of distinct states. Kept for interface
            compatibility; the bookkeeping is keyed on ``statlist`` itself, so
            a value smaller than ``len(statlist)`` no longer raises IndexError.
        statlist (List): All recognised states. If a state is listed more than
            once, its first position is used (as ``list.index`` would).
        void: Value marking void/padding cells; such cells are removed first.
            ``None`` disables the filter.
        nr: Value marking missing cells; removed unless ``with_missing`` is True.
            ``None`` disables the filter.
        with_missing (bool): Keep cells equal to ``nr`` in the sequence.

    Returns:
        int: Number of distinct subsequences (1 for an empty sequence). A Python
        int is returned, so the count cannot overflow.

    Note:
        Elements that do not appear in ``statlist`` still double the count but
        are never de-duplicated; this mirrors the previous behaviour.
    """
    x = np.asarray(x)

    # Drop void cells, then missing cells unless they take part in the count.
    if void is not None:
        x = x[x != void]
    if not with_missing and nr is not None:
        x = x[x != nr]

    # The empty sequence has exactly one subsequence: the empty one.
    if len(x) == 0:
        return 1

    # First position of every state in statlist, for O(1) lookups.
    state_index = {}
    for position, state in enumerate(statlist):
        state_index.setdefault(state, position)

    # counts[k] = number of distinct subsequences of the first k elements.
    # Plain Python ints are used so large counts stay exact.
    counts = [1]
    # last_seen[state position] = index k such that the state was element k+1.
    last_seen = {}

    for pos, state in enumerate(x):
        total = 2 * counts[pos]
        cidx = state_index.get(state)
        if cidx is not None:
            previous = last_seen.get(cidx)
            if previous is not None:
                # Remove subsequences already produced by the previous
                # occurrence of this state.
                total -= counts[previous]
            last_seen[cidx] = pos
        counts.append(total)

    return counts[-1]
81
+
82
+
83
def get_subsequences_all_sequences(seqdata, dss: bool = True, with_missing: bool = False) -> pd.DataFrame:
    """
    Count the distinct subsequences of every sequence in a dataset.

    Args:
        seqdata: One of
            * a 2-D ``np.ndarray`` (converted to a DataFrame),
            * a ``pd.DataFrame`` with one sequence per row; the state list is
              inferred from the non-NaN values and cells are compared as-is
              (string states are supported),
            * an object exposing ``.seqdata`` and ``.states`` (a SequenceData
              object); cells are treated as integer codes ``1..len(states)``,
              and ``len(states) + 1`` is used as the missing code when the
              object has a truthy ``ismissing`` attribute.
        dss (bool): Collapse runs of identical consecutive cells before
            counting (e.g. [1, 1, 2, 2] -> [1, 2]).
        with_missing (bool): Let the missing code take part in the count
            (SequenceData input only).

    Returns:
        pd.DataFrame: One column ``'Subseq.'`` with the count for each
        sequence, indexed like the input sequences.

    Raises:
        ValueError: If ``seqdata`` is none of the supported types.

    Examples:
        >>> result = get_subsequences_all_sequences(seq_data, dss=True, with_missing=False)
        >>> print(result.head())

    Note:
        NaN cells are dropped after the run-collapsing step, so two equal
        states separated only by NaN are not merged.
    """
    if isinstance(seqdata, np.ndarray):
        seqdata = pd.DataFrame(seqdata)

    # Test for DataFrame first: attribute sniffing alone would misclassify a
    # DataFrame that happens to have a column named 'seqdata' or 'states'.
    if isinstance(seqdata, pd.DataFrame):
        sequences = seqdata
        raw = sequences.to_numpy()
        observed = pd.unique(raw.ravel())
        statlist = sorted(observed[~pd.isna(observed)])
        nr_code = None
        # Cells are matched against the observed values directly; casting to
        # int would crash on strings and truncate non-integer numbers.
        cast_to_int = False
    elif hasattr(seqdata, 'seqdata'):
        sequences = seqdata.seqdata
        raw = sequences.to_numpy()
        states = seqdata.states
        nr_code = len(states) + 1 if getattr(seqdata, 'ismissing', False) else None
        statlist = list(range(1, len(states) + 1))
        if with_missing and nr_code is not None:
            statlist.append(nr_code)
        cast_to_int = True
    else:
        raise ValueError("seqdata must be a SequenceData object or pandas DataFrame")

    nbstat = len(statlist)

    # Work on positional numpy rows: no DataFrame mutation, no dtype
    # upcasting from NaN padding, and duplicate index labels are harmless.
    results = []
    for row in raw:
        if dss and len(row) > 1:
            starts_run = np.ones(len(row), dtype=bool)
            starts_run[1:] = row[1:] != row[:-1]
            row = row[starts_run]

        row = row[~pd.isna(row)]

        if len(row) == 0:
            results.append(1)  # Empty sequence has 1 subsequence
            continue

        if cast_to_int:
            row = row.astype(int)

        results.append(
            get_subsequences_in_single_sequence(
                row,
                nbstat,
                statlist,
                void=None,
                nr=nr_code,
                with_missing=with_missing,
            )
        )

    return pd.DataFrame(results, columns=['Subseq.'], index=sequences.index)
200
+
201
def cut_prefix(row, x=0):
    """
    Return the leading part of ``row`` that precedes the first value below ``x``.

    Args:
        row: Object with a ``to_numpy()`` method (e.g. a pandas Series).
        x: Threshold; the array is truncated just before the first element
            that is strictly smaller than it.

    Returns:
        np.ndarray: The truncated values. Non-numeric arrays, and numeric
        arrays without any element below ``x``, are returned whole.
    """
    values = row.to_numpy()

    # Only numeric data can be compared against the threshold.
    if not np.issubdtype(values.dtype, np.number):
        return values

    below_threshold = np.where(values < x)[0]
    if len(below_threshold) == 0:
        return values

    first_cut = below_threshold[0]
    return values[:first_cut]
208
+
209
def seqsubsn(seqdata, DSS=True, with_missing=False) -> pd.DataFrame:
    """
    Count the distinct subsequences of each sequence.

    Args:
        seqdata: ``np.ndarray``, ``pd.DataFrame`` or ``SequenceData``. For the
            first two the state list is every unique cell value; for
            ``SequenceData`` it is the integer codes ``1..len(states)``.
        DSS: When truthy, the data is first passed through ``seqdss`` and the
            result is wrapped in a new DataFrame.
        with_missing: Accepted for interface compatibility; not used by this
            function.

    Returns:
        pd.DataFrame: One column ``'Subseq.'``, indexed like the (possibly
        DSS-transformed) data.

    Raises:
        ValueError: If ``seqdata`` is none of the supported types.
    """
    if isinstance(seqdata, np.ndarray):
        alphabet = pd.unique(seqdata.ravel())
        seqdata = pd.DataFrame(seqdata)
        codes = alphabet.tolist()
    elif isinstance(seqdata, pd.DataFrame):
        alphabet = pd.unique(seqdata.values.ravel())
        codes = alphabet.tolist()
    elif isinstance(seqdata, SequenceData):
        alphabet = seqdata.states.copy()
        seqdata = seqdata.seqdata
        codes = list(range(1, len(alphabet) + 1))
    else:
        raise ValueError("[!] seqdata must be a SequenceData object, see SequenceData function to create one.")

    if DSS:
        # NOTE(review): presumably seqdss pads shortened rows with negative
        # values, which cut_prefix then trims - confirm against seqdss.
        seqdata = pd.DataFrame(seqdss(seqdata))

    n_states = len(alphabet)

    def _count_row(row):
        # Trim the row at its first negative cell, then count.
        return get_subsequences_in_single_sequence(
            cut_prefix(row),
            nbstat=n_states,
            statlist=codes,
        )

    per_sequence = seqdata.apply(_count_row, axis=1)

    return pd.DataFrame(per_sequence, columns=['Subseq.'], index=seqdata.index)
239
+
240
def get_number_of_transitions(seqdata, norm=False, pwight=False) -> pd.DataFrame:
    """
    Count the state changes (transitions) in each sequence.

    The count is derived from the distinct-state sequence returned by
    ``seqdss``: a sequence whose distinct-state form has length ``k``
    contributes ``k - 1`` transitions.

    Args:
        seqdata: Object exposing a ``.seqdata`` attribute (a SequenceData
            object). Anything else is rejected, including plain DataFrames.
        norm: If True, divide the result by (sequence length - 1), where the
            length comes from ``seqlength(seqdata)``; entries whose length is
            <= 1 are meant to be set to 0.
        pwight: If True, instead of counting, sum ``tr[state_from, state_to]``
            over consecutive distinct states, where ``tr`` is returned by
            ``get_sm_trate_substitution_cost_matrix``. Per the original
            author's notes this weights each transition by its probability of
            not occurring, so rare transitions weigh more.

    Returns:
        pd.DataFrame: The former index as a regular column (named ``'ID'``
        when the index was unnamed) followed by a ``'Transitions'`` column.

    Raises:
        ValueError: If ``seqdata`` has no ``seqdata`` attribute.

    Examples:
        >>> result = get_number_of_transitions(seq_data)
        >>> print(result.head())

        >>> # Example: sequence [1, 1, 2, 2, 1, 3] has 3 transitions:
        >>> # 1->2 (position 3), 2->1 (position 5), 1->3 (position 6)
    """
    # Check if input is a SequenceData object
    if not hasattr(seqdata, 'seqdata'):
        raise ValueError("[!] seqdata must be a SequenceData object, see SequenceData function to create one.")

    # Distinct-state sequences and their lengths.
    # NOTE(review): both are used with .iloc below, so they are presumably
    # DataFrames - confirm against seqdss / seqlength.
    dss = seqdss(seqdata)
    dss_length = seqlength(dss)
    number_seq = seqdata.seqdata.shape[0]

    if pwight:
        # For each sequence id: the accumulated sum, over its time points, of
        # the probabilities that the state changes do not occur.
        tr = get_sm_trate_substitution_cost_matrix(seqdata)
        # NOTE(review): the +1 shift presumably aligns state codes with the
        # row/column layout of tr - confirm.
        dss = dss + 1
        trans = np.zeros((number_seq, 1))

        for i in range(number_seq):
            if dss_length.iloc[i, 0] > 1:
                for j in range(1, dss_length.iloc[i, 0]):
                    state_from = dss.iloc[i, j-1]
                    state_to = dss.iloc[i, j]
                    trans[i, 0] += tr[state_from, state_to]

    else:
        # Number of transitions for each sequence id, consistent with the
        # docstring example above.
        trans = dss_length - 1
        # NOTE(review): the built-in any() iterates over whatever the
        # comparison result yields; if dss_length is a DataFrame that is its
        # column labels, not its values - confirm this guard works as intended.
        if any(dss_length==0):
            trans[dss_length==0] = 0

    if norm:
        seq_length = seqlength(seqdata)
        # Length-1 sequences divide by zero here; they are reset just below.
        trans = trans / (seq_length-1)
        # NOTE(review): same any() caveat as above.
        if any(seq_length<=1):
            trans[seq_length<=1] = 0

    trans = pd.DataFrame(trans, index=seqdata.seqdata.index, columns=['Transitions'])
    # Expose the index as a column; an unnamed index ('index') is renamed to 'ID'.
    trans = trans.reset_index().rename(columns={'index': 'ID'})

    return trans
@@ -0,0 +1,39 @@
1
+ """
2
+ @Author : 李欣怡
3
+ @File : state_frequencies_and_entropy_per_sequence.py
4
+ @Time : 2025/9/23 19:34
5
+ @Desc : State distribution for each individual
6
+
7
+ The corresponding function name in TraMineR is seqistatd.R,
8
+ with the source code available at: https://github.com/cran/TraMineR/blob/master/R/seqistatd.R
9
+ """
10
+ import numpy as np
11
+ import pandas as pd
12
+
13
+ from sequenzo.define_sequence_data import SequenceData
14
+
15
def get_state_freq_and_entropy_per_seq(seqdata, prop=False):
    """
    Count, for every sequence, how many positions are spent in each state.

    Parameters
    ----------
    seqdata : SequenceData
        Sequence object. ``seqdata.seqdata`` is compared against the integer
        codes ``1..n_states``; code ``i + 1`` is counted under the i-th entry
        of ``seqdata.labels`` (or of ``seqdata.states`` when labels is None).
    prop : bool, default False
        If True, every row is divided by its row total so that the counts
        become proportions. A row whose total is 0 yields NaN.

    Returns
    -------
    pd.DataFrame
        First column 'ID' (the index of ``seqdata.seqdata``), followed by one
        column per state label.

    Raises
    ------
    ValueError
        If ``seqdata`` is not a SequenceData instance.
    """
    if not isinstance(seqdata, SequenceData):
        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")

    states = seqdata.labels if seqdata.labels is not None else seqdata.states

    number_states = len(states)
    number_seq = seqdata.seqdata.shape[0]

    iseqtab = pd.DataFrame(
        np.zeros((number_seq, number_states)),
        index=seqdata.seqdata.index,
        columns=states,
    )

    print(f"[>] Computing state distribution for {number_seq} sequences and {number_states} states ...")

    # Column i holds the per-row count of the integer code i + 1.
    for i in range(number_states):
        iseqtab.iloc[:, i] = (seqdata.seqdata == (i + 1)).sum(axis=1)

    if prop:
        iseqtab = iseqtab.div(iseqtab.sum(axis=1), axis=0)

    # Name the index before resetting it so the identifier column is always
    # called 'ID', even when the original index already carries a name.
    return iseqtab.rename_axis('ID').reset_index()
@@ -0,0 +1,155 @@
1
+ """
2
+ @Author : Xinyi Li, Yuqi Liang
3
+ @File : turbulence.py
4
+ @Time : 2025/9/24 14:09
5
+ @Desc : Computes the sequence turbulence measure
6
+
7
+ The corresponding function name in TraMineR is seqST.R,
8
+ with the source code available at: https://github.com/cran/TraMineR/blob/master/R/seqST.R
9
+
10
+ """
11
+ import os
12
+ from contextlib import redirect_stdout
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+ from sequenzo.define_sequence_data import SequenceData
17
+ from sequenzo.dissimilarity_measures.utils.seqdss import seqdss
18
+ from sequenzo.dissimilarity_measures.utils.seqlength import seqlength
19
+ from .simple_characteristics import seqsubsn
20
+ from .variance_of_spell_durations import get_spell_duration_variance
21
+
22
def turb(x):
    """
    Evaluate the turbulence formula ``log2(phi * (s2max + 1) / (s2_tx + 1))``.

    ``x`` is an indexable triple: ``x[0]`` is phi, ``x[1]`` is s2_tx and
    ``x[2]`` is s2max.
    """
    phi, s2_tx, s2max = x[0], x[1], x[2]
    variance_ratio = (s2max + 1) / (s2_tx + 1)
    return np.log2(phi * variance_ratio)
29
+
30
def _column_as_float_1d(obj, col=0):
    """Return column ``col`` of a 2-D pandas object, or ``obj`` itself, as a flat float array."""
    if hasattr(obj, 'iloc') and getattr(obj, 'ndim', 1) == 2:
        obj = obj.iloc[:, col]
    # flatten() always copies, so callers may modify the result freely.
    return np.asarray(obj, dtype=float).flatten()


def get_turbulence(seqdata, norm=False, silent=True, type=1, id_as_column=True):
    """
    Computes the sequence turbulence measure

    Parameters
    ----------
    seqdata : SequenceData
        A sequence object created by the SequenceData function.
    norm : bool, default False
        If True, the turbulence T is rescaled to (T - 1) / (Tmax - 1), where
        Tmax is the turbulence of a reference sequence that cycles through
        the states up to the longest observed sequence length. Sequences
        whose raw turbulence is below 1 are set to 0.
    silent : bool, default True
        If True, suppresses the progress messages.
    type : int, default 1
        Type of spell duration variance to be used. Can be either 1 or 2.
        (The name shadows the builtin; it is kept for interface compatibility.)
    id_as_column : bool, default True
        If True, the ID will be included as a separate column instead of as the index.

    Returns
    -------
    pd.DataFrame
        A DataFrame with one column 'Turbulence' containing the measure for each sequence.
        If id_as_column=True, also includes an ID column; otherwise the index is named 'ID'.

    Raises
    ------
    ValueError
        If ``seqdata`` has no ``seqdata`` attribute.
    """
    if not hasattr(seqdata, 'seqdata'):
        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")

    if not silent:
        print(" - extracting symbols and durations ...")
    spells = seqdss(seqdata)

    if not silent:
        print(f" - computing turbulence type {type} for {seqdata.seqdata.shape[0]} sequence(s) ...")
    phi_values = _column_as_float_1d(seqsubsn(spells, DSS=False, with_missing=True))

    # Element-wise NaN test on the extracted array: iterating a DataFrame with
    # any() would walk its column labels rather than its values.
    missing_phi = np.isnan(phi_values)
    if missing_phi.any():
        # Use a large but finite value; np.finfo(float).max triggers
        # "invalid value encountered in cast" warnings on NumPy 1.24+.
        large_but_finite = 1e15
        phi_values = np.where(missing_phi, large_but_finite, phi_values)
        print("[!] One or more missing values were found after calculating the number of distinct subsequences. They have been replaced with a large number of 1e15 to ensure the calculation continues.")

    spell_var = get_spell_duration_variance(seqdata=seqdata, type=type)
    # Column 0 of these frames is the ID; column 1 carries the value.
    s2_tx_values = _column_as_float_1d(spell_var['result'], col=1)
    s2_tx_max_values = _column_as_float_1d(spell_var['vmax'], col=1)

    # turb() only uses element-wise operations, so it can take whole arrays.
    Tx = turb([phi_values, s2_tx_values, s2_tx_max_values])

    if norm:
        alph = seqdata.states.copy()
        # np.asarray makes this the maximum over the values whatever container
        # seqlength returns (array, Series or DataFrame).
        maxlength = int(np.max(np.asarray(seqlength(seqdata))))
        nrep = -(-maxlength // len(alph))  # Ceiling division

        turb_seq = pd.DataFrame(np.array((alph * nrep)[:maxlength]).reshape(1, -1))
        with open(os.devnull, 'w') as fnull:
            with redirect_stdout(fnull):
                # Build a label for every state; a NaN state needs special handling.
                turb_labels = [
                    "Missing" if pd.isna(state) else f"State_{i}"
                    for i, state in enumerate(alph)
                ]
                turb_seq = SequenceData(turb_seq, time=list(range(turb_seq.shape[1])), states=alph, labels=turb_labels)

        if len(alph) > 1:
            turb_phi = seqsubsn(turb_seq, DSS=False, with_missing=True)
        else:
            turb_phi = 2

        phi_value = _column_as_float_1d(turb_phi)
        missing_max_phi = np.isnan(phi_value)
        if missing_max_phi.any():
            # Large finite stand-in, for the same reason as above.
            phi_value = np.where(missing_max_phi, 1e15, phi_value)
            print("[!] phi set as max float due to exceeding value when computing max turbulence.")

        turb_spell_var = get_spell_duration_variance(turb_seq, type=type)
        turb_s2_values = _column_as_float_1d(turb_spell_var['result'], col=1)
        turb_s2_max_values = _column_as_float_1d(turb_spell_var['vmax'], col=1)

        maxT = turb([phi_value, turb_s2_values, turb_s2_max_values])

        # Remember which raw values were below 1 before rescaling.
        below_one = Tx < 1
        Tx = (Tx - 1) / (maxT - 1)
        # Tx is one-dimensional, so it must be indexed with a single mask/index.
        Tx[below_one] = 0

    Tx_df = pd.DataFrame(Tx, index=seqdata.seqdata.index, columns=['Turbulence'])

    # Handle ID display options
    if id_as_column:
        # Add ID as a separate column and reset index to numeric
        Tx_df['ID'] = Tx_df.index
        Tx_df = Tx_df[['ID', 'Turbulence']].reset_index(drop=True)
    else:
        # Always set index name to 'ID' for clarity
        Tx_df.index.name = 'ID'

    return Tx_df
@@ -0,0 +1,86 @@
1
+ """
2
+ @Author : Xinyi Li, Yuqi Liang
3
+ @File : variance_of_spell_durations.py
4
+ @Time : 2025/9/24 14:22
5
+ @Desc : Variance of spell durations of individual state sequences.
6
+
7
+ The corresponding function name in TraMineR is seqivardur,
8
+ with the source code available at: https://github.com/cran/TraMineR/blob/master/R/seqivardur.R
9
+
10
+ """
11
+ import os
12
+ from contextlib import redirect_stdout
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ from sequenzo.dissimilarity_measures.utils.seqdss import seqdss
18
+ from sequenzo.dissimilarity_measures.utils.seqlength import seqlength
19
+ from sequenzo.dissimilarity_measures.utils.seqdur import seqdur
20
+ from .state_frequencies_and_entropy_per_sequence import get_state_freq_and_entropy_per_seq
21
+ from .simple_characteristics import cut_prefix
22
+
23
def get_spell_duration_variance(seqdata, type=1):
    """
    Variance of the spell durations of each individual state sequence.

    Parameters
    ----------
    seqdata : SequenceData
        Sequence object (anything exposing ``seqdata`` and ``states`` that the
        helper functions accept).
    type : int, default 1
        1: variance of the observed spell durations only.
        2: non-visited states additionally enter the mean and the variance as
        spells of duration 0.
        (The name shadows the builtin; it is kept for interface compatibility.)

    Returns
    -------
    dict of pd.DataFrame
        Keys 'meand' (mean duration), 'result' (variance, column
        'var_spell_dur') and 'vmax' (maximum variance, column 'var_max').
        Each frame has an 'ID' column first, followed by the value column.

    Raises
    ------
    ValueError
        If ``seqdata`` has no ``seqdata`` attribute or ``type`` is not 1 or 2.
    """
    if not hasattr(seqdata, 'seqdata'):
        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
    if type not in [1, 2]:
        raise ValueError("[!] type must be 1 or 2.")

    with open(os.devnull, 'w') as fnull:
        with redirect_stdout(fnull):
            dss = seqdss(seqdata)
            # This helper prints a progress line unconditionally; keep it
            # quiet here, as get_within_sequence_entropy does.
            sdist = get_state_freq_and_entropy_per_seq(seqdata)

    lgth = seqlength(seqdata)
    dlgth = seqlength(dss)
    # Number of states never visited by each sequence (column 0 is the ID).
    nnvisit = (sdist.iloc[:, 1:] == 0).sum(axis=1)

    def realvar(x):
        # Population variance over the non-missing durations, so that NaN
        # entries are ignored in the same way as by np.nanmean below.
        values = np.asarray(x, dtype=float)
        values = values[~np.isnan(values)]
        n = values.size
        if n == 0:
            # No spell at all: the variance is undefined rather than an error.
            return np.nan
        return 1 / n * np.sum((values - np.mean(values)) ** 2)

    dur = pd.DataFrame(seqdur(seqdata)).apply(lambda row: cut_prefix(row, 1), axis=1)

    if type == 1:
        ret = dur.apply(realvar)
        meand = dur.apply(np.nanmean)
        var_max = (dlgth - 1) * (1 - meand) ** 2

    elif type == 2:
        meand = dur.apply(lambda arr: np.nansum(arr))
        meand /= dlgth + nnvisit.to_numpy()

        ddur = dur.to_frame("arr").join(meand.rename("m")).apply(
            lambda row: (np.array(row["arr"]) - row["m"]) ** 2, axis=1
        )
        # Formula: (sum of squared deviations + nnvisit * meand^2) / (dlgth + nnvisit)
        ddur = pd.DataFrame(ddur.tolist())
        sum_sqdiff = np.nansum(ddur.to_numpy(), axis=1)
        ret_values = (sum_sqdiff + nnvisit.to_numpy() * (meand.to_numpy() ** 2)) / (dlgth + nnvisit.to_numpy())
        ret = pd.Series(ret_values, index=meand.index)

        alph = seqdata.states.copy()
        alph_size = len(alph)
        if alph_size < 2:
            maxnnv = 0
        else:
            maxnnv = np.where(dlgth == 1, alph_size - 1, alph_size - 2)

        meand_max = meand.to_numpy() * (dlgth + nnvisit.to_numpy()) / (dlgth + maxnnv)
        var_max_values = ((dlgth - 1) * (1 - meand_max) ** 2 + (lgth - dlgth + 1 - meand_max) ** 2 + maxnnv * meand_max ** 2) / (dlgth + maxnnv)
        var_max = pd.Series(var_max_values, index=meand.index)

    meand.index = seqdata.seqdata.index
    ret.index = seqdata.seqdata.index
    var_max.index = seqdata.seqdata.index

    # Name the index before resetting it so the identifier column is always
    # called 'ID', even when the original index already carries a name.
    return {
        "meand": meand.to_frame("meand").rename_axis("ID").reset_index(),
        "result": ret.to_frame("var_spell_dur").rename_axis("ID").reset_index(),
        "vmax": var_max.to_frame("var_max").rename_axis("ID").reset_index(),
    }
@@ -0,0 +1,43 @@
1
+ """
2
+ @Author : 李欣怡
3
+ @File : within_sequence_entropy.py
4
+ @Time : 2025/9/23 19:44
5
+ @Desc : Within Sequence Entropy
6
+
7
+ The corresponding function name in TraMineR is seqient.R,
8
+ with the source code available at: https://github.com/cran/TraMineR/blob/master/R/seqient.R
9
+ """
10
+ import os
11
+ from contextlib import redirect_stdout
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ from scipy.stats import entropy
16
+
17
+ from sequenzo.define_sequence_data import SequenceData
18
+ from .state_frequencies_and_entropy_per_sequence import get_state_freq_and_entropy_per_seq
19
+
20
def get_within_sequence_entropy(seqdata, norm=True, base=np.e, silent=True):
    """
    Entropy of the state distribution within each individual sequence.

    Parameters
    ----------
    seqdata : SequenceData
        A sequence object created by the SequenceData function.
    norm : bool, default True
        If True, each entropy is divided by the maximum entropy for the
        alphabet, i.e. the entropy of a uniform distribution over
        ``len(seqdata.states)`` states expressed in the same ``base``.
        With a single state this maximum is 0, so the division is 0/0.
    base : float, default np.e
        Logarithm base passed to ``scipy.stats.entropy``.
    silent : bool, default True
        If True, suppresses the progress message.

    Returns
    -------
    pd.DataFrame
        Columns 'ID' (the index of ``seqdata.seqdata``) and 'Entropy'.

    Raises
    ------
    ValueError
        If ``seqdata`` is not a SequenceData instance.
    """
    if not isinstance(seqdata, SequenceData):
        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")

    number_states = len(seqdata.states)

    if not silent:
        print(f" - computing within sequence entropy for {seqdata.seqdata.shape[0]} sequences and {number_states} states ...")

    # The helper prints its own progress line; keep it quiet.
    with open(os.devnull, 'w') as fnull:
        with redirect_stdout(fnull):
            iseqtab = get_state_freq_and_entropy_per_seq(seqdata=seqdata)
    iseqtab.index = seqdata.seqdata.index

    # Column 0 is the ID; the remaining columns are the per-state counts.
    ient = iseqtab.iloc[:, 1:].apply(lambda row: entropy(row, base=base), axis=1)

    if norm:
        # The maximum must be expressed in the same base as the entropies,
        # otherwise the ratio is off by a factor of log(base).
        maxent = np.log(number_states) / np.log(base)
        ient = ient / maxent

    ient = pd.DataFrame(ient, index=seqdata.seqdata.index, columns=['Entropy'])

    # Name the index before resetting it so the identifier column is always
    # called 'ID', even when the original index already carries a name.
    return ient.rename_axis('ID').reset_index()
@@ -0,0 +1,48 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : __init__.py
4
+ @Time : 08/08/2025 15:50
5
+ @Desc :
6
+ Suffix Tree Framework - exposes core indicators and utilities for sequence convergence analysis.
7
+ """
8
+ from .system_level_indicators import (
9
+ build_suffix_tree,
10
+ compute_suffix_count,
11
+ compute_merging_factor,
12
+ compute_js_convergence,
13
+ plot_system_indicators,
14
+ plot_system_indicators_multiple_comparison,
15
+ )
16
+
17
+ from .individual_level_indicators import (
18
+ IndividualConvergence,
19
+ compute_path_uniqueness_by_group,
20
+ plot_suffix_rarity_distribution,
21
+ )
22
+
23
+ from .utils import (
24
+ extract_sequences,
25
+ get_state_space,
26
+ convert_to_suffix_tree_data
27
+ )
28
+
29
+ __all__ = [
30
+ # System-level
31
+ "build_suffix_tree",
32
+ "compute_suffix_count",
33
+ "compute_merging_factor",
34
+ "compute_js_convergence",
35
+ # plotting
36
+ "plot_system_indicators",
37
+ "plot_system_indicators_multiple_comparison",
38
+
39
+ # Individual-level
40
+ "IndividualConvergence",
41
+ "compute_path_uniqueness_by_group",
42
+ "plot_suffix_rarity_distribution",
43
+
44
+ # Utilities
45
+ "extract_sequences",
46
+ "get_state_space",
47
+ "convert_to_suffix_tree_data",
48
+ ]