sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
sequenzo/prefix_tree/individual_level_indicators.py
@@ -0,0 +1,1321 @@
+ """
+ @Author : Yuqi Liang 梁彧祺
+ @File : individual_level_indicators.py
+ @Time : 01/30/2026 11:07
+ @Desc :
+ Individual-level indicators for the position-based prefix tree (level = time index).
+
+ This module provides per-sequence (per-individual) divergence and rarity measures when
+ the unit of analysis is TIME INDEX: level t = states from the start up to time t.
+ Higher prefix rarity means a more atypical (rarer) path from the start.
+
+ Usage (position-based: list of sequences, same length)
+ ------------------------------------------------------
+ from sequenzo import IndividualDivergence, extract_sequences
+
+ # sequences: list of lists, e.g. [[1,2,3], [2,3,1], ...], all same length T
+ sequences = extract_sequences(df, time_cols, id_col, states)  # or from build_prefix_tree(..., mode="position").sequences
+ ind = IndividualDivergence(sequences)
+
+ # Per-year rarity: (N x T) matrix or DataFrame. rarity_{i,t} = -log(freq(prefix_{i,t})/N)
+ rarity_df = ind.compute_prefix_rarity_per_year(as_dataframe=True, zscore=False)
+
+ # One score per individual: sum over t of rarity, or standardized (max over windows of min z)
+ scores = ind.compute_prefix_rarity_score()
+ std_scores = ind.compute_standardized_rarity_score(min_t=2, window=1)
+
+ # Binary divergence (0/1) and first divergence year (1-indexed, or None)
+ diverged = ind.compute_diverged(method="zscore", z_threshold=1.5, min_t=2, window=1)
+ first_year = ind.compute_first_divergence_year(method="zscore", z_threshold=1.5, min_t=2)
+
+ # Methods: "zscore" (window of high z), "top_proportion" (top p% most atypical), "quantile" (above quantile)
+ # With group_labels, top_proportion/quantile are applied within each group.
+
+ # Path uniqueness: count of time steps at which the prefix is unique (freq == 1)
+ uniqueness = ind.compute_path_uniqueness()
+
+ Spell-based indicators (level = spell index) are in spell_individual_level_indicators.SpellIndividualDivergence;
+ use build_spell_prefix_tree(seqdata), then SpellIndividualDivergence(tree).
+ """
+ from collections import defaultdict, Counter
+ from typing import Optional
+ import numpy as np
+ import pandas as pd
+
+
+ class IndividualDivergence:
+     """
+     Individual-level divergence and prefix rarity for position-based prefix trees.
+
+     Input: sequences — a list of sequences (list of lists), all of the same length T.
+     Each sequence is the list of states at time 1, 2, ..., T. Level t corresponds
+     to the prefix (states from start up to time t). Rarity at (i, t) is
+     -log(freq(prefix_{i,t})/N); higher rarity = more atypical path.
+
+     Main methods:
+     - compute_prefix_rarity_per_year: (N x T) rarity matrix or DataFrame.
+     - compute_prefix_rarity_score: one aggregated rarity score per individual (sum over t).
+     - compute_standardized_rarity_score: z-based score for classification (higher = more atypical).
+     - compute_diverged: binary 0/1 per individual (method: zscore, top_proportion, quantile).
+     - compute_first_divergence_year: first year (1-indexed) at which diverged, or None.
+     - compute_path_uniqueness: count of time steps with unique prefix per individual.
+     - diagnose_divergence_calculation: diagnostic dict (variance by year, count diverged, etc.).
+
+     Plotting: plot_prefix_rarity_distribution, plot_individual_indicators_correlation (in this module).
+     """
+
+     def __init__(self, sequences):
+         # Handle case where sequences might already be an IndividualDivergence object
+         if isinstance(sequences, IndividualDivergence):
+             # Extract sequences from existing object
+             self.sequences = sequences.sequences
+         elif hasattr(sequences, 'sequences'):
+             # Handle case where input might be another object with sequences attribute
+             self.sequences = sequences.sequences
+         else:
+             # Normal case: sequences is a list of sequences
+             self.sequences = sequences
+
+         # Validate input
+         if not self.sequences or len(self.sequences) == 0:
+             raise ValueError("sequences cannot be empty")
+         if not hasattr(self.sequences[0], '__len__') and not hasattr(self.sequences[0], '__iter__'):
+             raise ValueError("sequences must be a list of sequences (e.g., [[1,2,3], [2,3,1], ...])")
+
+         self.T = len(self.sequences[0])
+         self.prefix_freq_by_year = self._build_prefix_frequencies()
+
+     def _build_prefix_frequencies(self):
+         freq_by_year = [defaultdict(int) for _ in range(self.T)]
+         for seq in self.sequences:
+             prefix = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq_by_year[t][tuple(prefix)] += 1
+         return freq_by_year
+
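# A minimal sketch (toy data, not from the package) of what _build_prefix_frequencies
# produces: one dict per time index, keyed by the tuple of states observed so far.
from collections import defaultdict

sequences = [[1, 2], [1, 3], [1, 2]]
T = len(sequences[0])
freq_by_year = [defaultdict(int) for _ in range(T)]
for seq in sequences:
    prefix = []
    for t in range(T):
        prefix.append(seq[t])
        freq_by_year[t][tuple(prefix)] += 1

assert freq_by_year[0] == {(1,): 3}
assert freq_by_year[1] == {(1, 2): 2, (1, 3): 1}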
+     def compute_diverged(
+         self,
+         z_threshold=1.5,
+         min_t=2,
+         window=1,
+         inclusive=False,
+         group_labels=None,
+         *,
+         method: str = "zscore",
+         proportion: Optional[float] = None,
+         quantile_p: Optional[float] = None,
+         min_count: int = 1,
+     ):
+         """
+         Compute binary divergence flags with multiple selection methods.
+
+         Definition (common intuition): higher prefix rarity implies more atypical behavior.
+         We compute per-year rarity via prefix frequencies and then detect divergence using
+         one of the following methods:
+
+         Methods
+         -------
+         - "zscore" (window-based, default):
+           Uses per-year z-scores of rarity. A person is diverged if there exists a window
+           of length `window` starting between years `[min_t, max_t]` where all z-scores are
+           above `z_threshold` (use `inclusive=True` for `>=`). Zero-variance years remain
+           NaN and any window containing NaN is skipped.
+
+         - "top_proportion" (aka "topk"/"proportion"/"rank"):
+           Uses the aggregated standardized score from `compute_standardized_rarity_score`
+           (higher = more atypical). Selects the most atypical fraction `proportion` within
+           each group if `group_labels` is provided, otherwise globally. `min_count` ensures
+           at least the specified number per group.
+
+         - "quantile":
+           Uses a quantile threshold (`quantile_p`) on the aggregated standardized score,
+           within each group (or globally if no `group_labels`). Individuals at or above the
+           threshold are marked diverged.
+
+         Parameters
+         ----------
+         z_threshold : float, default 1.5
+             zscore method only. Diverged when z > z_threshold (or >= if inclusive=True).
+         min_t : int, default 2
+             First starting year (1-indexed) considered. The last starting year,
+             max_t = T - window + 1, is derived internally.
+         window : int, default 1
+             Number of consecutive years required in the zscore method; also used in the
+             standardized aggregation.
+         inclusive : bool, default False
+             zscore method only. If True, use >= comparisons.
+         group_labels : array-like or None
+             If provided, proportion/quantile selections are computed within each group.
+         method : str, default "zscore"
+             One of {"zscore", "top_proportion" (aliases: "topk", "proportion", "rank"), "quantile"}.
+         proportion : float or None
+             For top_proportion. Fraction in (0, 1) to select as diverged. Defaults to 0.10 if None.
+         quantile_p : float or None
+             For quantile. Quantile in (0, 1) used as threshold. Defaults to 0.90 if None.
+         min_count : int, default 1
+             For top_proportion. Lower bound for the number selected per group.
+
+         Returns
+         -------
+         List[int]
+             0/1 indicator for each individual.
+         """
+         N = len(self.sequences)
+         method_norm = (method or "zscore").lower()
+         max_t = self.T - window + 1
+
+         # Branch: rank/quantile style selections using aggregated standardized scores
+         if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
+             p = proportion if proportion is not None else 0.10
+             scores = np.asarray(
+                 self.compute_standardized_rarity_score(min_t=min_t, window=window), dtype=float
+             )
+             if group_labels is None:
+                 vals = scores
+                 finite_mask = np.isfinite(vals)
+                 n_valid = int(np.sum(finite_mask))
+                 if n_valid == 0:
+                     return [0] * N
+                 k = int(np.floor(p * n_valid))
+                 if k < int(min_count):
+                     k = int(min_count)
+                 if k > n_valid:
+                     k = n_valid
+                 # For divergence: higher scores = more atypical, so take the largest k values
+                 order = np.argsort(np.where(np.isfinite(vals), vals, -np.inf), kind="mergesort")
+                 flags = np.zeros(N, dtype=int)
+                 if k >= 1:
+                     selected = order[-k:]  # Take the k largest (most divergent)
+                     flags[selected] = 1
+                 return flags.tolist()
+             else:
+                 # Group-wise selection (mirrors the suffix_tree implementation)
+                 labels = np.asarray(group_labels)
+                 flags = np.zeros(N, dtype=int)
+                 for g in pd.unique(labels):
+                     idx = np.where(labels == g)[0]
+                     vals = scores[idx]
+                     finite_mask = np.isfinite(vals)
+                     n_valid = int(np.sum(finite_mask))
+                     if n_valid == 0:
+                         continue
+                     k = int(np.floor(p * n_valid))
+                     if k < int(min_count):
+                         k = int(min_count)
+                     if k > n_valid:
+                         k = n_valid
+                     order_local = np.argsort(np.where(np.isfinite(vals), vals, -np.inf), kind="mergesort")
+                     if k >= 1:
+                         selected_local = order_local[-k:]  # Take k largest within group
+                         selected_global = idx[selected_local]
+                         flags[selected_global] = 1
+                 return flags.tolist()
+
+         if method_norm == "quantile":
+             q = quantile_p if quantile_p is not None else 0.90  # High quantile for divergence
+             scores = np.asarray(
+                 self.compute_standardized_rarity_score(min_t=min_t, window=window), dtype=float
+             )
+             flags = np.zeros(N, dtype=int)
+             if group_labels is None:
+                 # Global quantile
+                 valid = scores[np.isfinite(scores)]
+                 if valid.size == 0:
+                     return flags.tolist()
+                 try:
+                     xq = float(np.nanquantile(scores, q))
+                 except Exception:
+                     xq = float(np.quantile(valid, q))
+                 flags[np.where(scores >= xq)[0]] = 1
+                 return flags.tolist()
+             else:
+                 labels = np.asarray(group_labels)
+                 for g in pd.unique(labels):
+                     idx = np.where(labels == g)[0]
+                     vals = scores[idx]
+                     valid = vals[np.isfinite(vals)]
+                     if valid.size == 0:
+                         continue
+                     try:
+                         xq = float(np.nanquantile(vals, q))
+                     except Exception:
+                         xq = float(np.quantile(valid, q))
+                     local = np.where(vals >= xq)[0]
+                     flags[idx[local]] = 1
+                 return flags.tolist()
+
+         # Default branch: z-score window logic
+         rarity_matrix = []
+         for seq in self.sequences:
+             prefix = []
+             score = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score)
+
+         rarity_df = pd.DataFrame(rarity_matrix)
+         rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+         # Keep NaNs for zero-variance years and skip NaN windows
+         rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
+
+         flags = []
+         for i in range(N):
+             z = rarity_z.iloc[i]
+             diverged = 0
+             for t in range(min_t - 1, max_t):
+                 # Skip windows containing NaN (zero-variance years)
+                 vals = [z.iloc[t + k] for k in range(window)]
+                 if not np.all(np.isfinite(vals)):
+                     continue
+                 # Divergence = high rarity (more atypical)
+                 if inclusive:
+                     condition = all(v >= z_threshold for v in vals)
+                 else:
+                     condition = all(v > z_threshold for v in vals)
+                 if condition:
+                     diverged = 1
+                     break
+             flags.append(diverged)
+         return flags
+
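# A hedged usage sketch (toy data, not from the package): five length-3 sequences,
# four identical and one outlier. For a column with N-1 equal values and one
# outlier, the outlier's sample z-score is (N-1)/sqrt(N) = 4/sqrt(5) ≈ 1.79 > 1.5,
# so both selection methods flag only that individual.
ind = IndividualDivergence([[1, 2, 3]] * 4 + [[2, 9, 9]])
flags_z = ind.compute_diverged(method="zscore", z_threshold=1.5, min_t=2, window=1)
flags_top = ind.compute_diverged(method="top_proportion", proportion=0.2, min_t=2)
# flags_z == flags_top == [0, 0, 0, 0, 1]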
+     def compute_first_divergence_year(
+         self,
+         z_threshold=1.5,
+         min_t=2,
+         window=1,
+         inclusive=False,
+         group_labels=None,
+         *,
+         method: str = "zscore",
+         proportion: Optional[float] = None,
+         quantile_p: Optional[float] = None,
+         min_count: int = 1,
+     ):
+         """
+         Compute the first divergence year per individual with multiple selection methods.
+
+         Methods
+         -------
+         - "zscore" (default):
+           Find the earliest starting year t in [min_t, max_t] such that all z-scores in the
+           length-`window` block are above `z_threshold` (or >= if inclusive=True). Zero-variance
+           years are NaN; windows containing NaN are skipped.
+
+         - "top_proportion" (aka "topk"/"proportion"/"rank"):
+           Use aggregated standardized scores to pick the most atypical `proportion` within each group
+           (or globally). For the selected individuals, return the earliest t where the per-window
+           max z-score is >= the selection threshold; others return None. `min_count` is respected.
+
+         - "quantile":
+           Use a per-group (or global) quantile threshold `quantile_p` on aggregated standardized scores;
+           individuals at or above the threshold return the earliest qualifying year; others return None.
+
+         Parameters
+         ----------
+         z_threshold, min_t, window, inclusive, group_labels
+             Same definitions as in `compute_diverged` for the zscore method.
+         method : str, default "zscore"
+             One of {"zscore", "top_proportion" (aliases: "topk", "proportion", "rank"), "quantile"}.
+         proportion : float or None
+             For top_proportion. Fraction in (0, 1) to select as diverged. Defaults to 0.10 if None.
+         quantile_p : float or None
+             For quantile. Quantile in (0, 1) used as threshold. Defaults to 0.90 if None.
+         min_count : int, default 1
+             For top_proportion. Lower bound for the number selected per group.
+
+         Returns
+         -------
+         List[Optional[int]]
+             First divergence years (1-indexed). None indicates no divergence.
+         """
+         N = len(self.sequences)
+         method_norm = (method or "zscore").lower()
+         max_t = self.T - window + 1
+
+         # Helper: standardized z matrix and per-t window maxima per individual
+         def _compute_window_max_list():
+             # Build rarity matrix and columnwise z (global standardization)
+             rarity_matrix = []
+             for seq in self.sequences:
+                 prefix = []
+                 score = []
+                 for t in range(self.T):
+                     prefix.append(seq[t])
+                     freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                     score.append(-np.log(freq + 1e-10))
+                 rarity_matrix.append(score)
+             rarity_arr = np.asarray(rarity_matrix, dtype=float)
+             col_means = np.nanmean(rarity_arr, axis=0)
+             col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
+             with np.errstate(invalid='ignore', divide='ignore'):
+                 rarity_z = (rarity_arr - col_means) / col_stds
+             rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
+             # Compute the per-individual sequence of window maxima over t
+             window_maxes = []  # one list per individual
+             for i in range(N):
+                 z_scores = rarity_z[i, :]
+                 vals_per_t = []
+                 for t0 in range(min_t - 1, max_t):
+                     vals = [z_scores[t0 + k] for k in range(window)]
+                     if not np.all(np.isfinite(vals)):
+                         vals_per_t.append(np.nan)
+                     else:
+                         vals_per_t.append(float(np.max(vals)))
+                 window_maxes.append(vals_per_t)
+             return np.asarray(window_maxes, dtype=float)
+
+         # Branches for rank/quantile-style thresholds
+         if method_norm in {"top_proportion", "topk", "proportion", "rank", "quantile"}:
+             # Compute aggregated scores for thresholding
+             agg_scores = np.asarray(
+                 self.compute_standardized_rarity_score(min_t=min_t, window=window), dtype=float
+             )
+             per_t_window_max = _compute_window_max_list()
+
+             if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
+                 p = proportion if proportion is not None else 0.10
+                 if group_labels is None:
+                     vals = agg_scores
+                     finite_mask = np.isfinite(vals)
+                     n_valid = int(np.sum(finite_mask))
+                     if n_valid == 0:
+                         return [None] * N
+                     k = int(np.floor(p * n_valid))
+                     if k < int(min_count):
+                         k = int(min_count)
+                     if k > n_valid:
+                         k = n_valid
+                     # For divergence: take the highest scores (most atypical)
+                     order = np.argsort(np.where(np.isfinite(vals), vals, -np.inf), kind="mergesort")
+                     selected_idx = set(order[-k:].tolist()) if k >= 1 else set()
+                     years = []
+                     for i in range(N):
+                         if i not in selected_idx:
+                             years.append(None)
+                             continue
+                         wm = per_t_window_max[i]
+                         # The threshold value is the kth largest score
+                         thresh_val = vals[order[-k]] if k >= 1 else np.nan
+                         if not np.isfinite(thresh_val):
+                             years.append(None)
+                             continue
+                         # Earliest t where window_max >= threshold
+                         yr = None
+                         for t_idx, wv in enumerate(wm):
+                             if np.isfinite(wv) and wv >= float(thresh_val):
+                                 yr = t_idx + min_t  # Convert back to 1-indexed
+                                 break
+                         years.append(yr)
+                     return years
+                 else:
+                     labels = np.asarray(group_labels)
+                     years = [None] * N
+                     for g in pd.unique(labels):
+                         idx = np.where(labels == g)[0]
+                         vals = agg_scores[idx]
+                         finite_mask = np.isfinite(vals)
+                         n_valid = int(np.sum(finite_mask))
+                         if n_valid == 0:
+                             continue
+                         k = int(np.floor(p * n_valid))
+                         if k < int(min_count):
+                             k = int(min_count)
+                         if k > n_valid:
+                             k = n_valid
+                         order_local = np.argsort(np.where(np.isfinite(vals), vals, -np.inf), kind="mergesort")
+                         selected_local = set(order_local[-k:].tolist()) if k >= 1 else set()
+                         thresh_val = vals[order_local[-k]] if k >= 1 else np.nan
+                         for j_local, i_global in enumerate(idx):
+                             if j_local not in selected_local or not np.isfinite(thresh_val):
+                                 continue
+                             wm = per_t_window_max[i_global]
+                             for t_idx, wv in enumerate(wm):
+                                 if np.isfinite(wv) and wv >= float(thresh_val):
+                                     years[i_global] = int(t_idx + min_t)
+                                     break
+                     return years
+
+             # quantile branch
+             q = quantile_p if quantile_p is not None else 0.90
+             years = [None] * N
+             if group_labels is None:
+                 valid = agg_scores[np.isfinite(agg_scores)]
+                 if valid.size == 0:
+                     return years
+                 try:
+                     xq = float(np.nanquantile(agg_scores, q))
+                 except Exception:
+                     xq = float(np.quantile(valid, q))
+                 for i in range(N):
+                     if not np.isfinite(agg_scores[i]) or agg_scores[i] < xq:
+                         continue
+                     wm = per_t_window_max[i]
+                     for t_idx, wv in enumerate(wm):
+                         if np.isfinite(wv) and wv >= xq:
+                             years[i] = int(t_idx + min_t)
+                             break
+                 return years
+             else:
+                 labels = np.asarray(group_labels)
+                 for g in pd.unique(labels):
+                     idx = np.where(labels == g)[0]
+                     vals = agg_scores[idx]
+                     valid = vals[np.isfinite(vals)]
+                     if valid.size == 0:
+                         continue
+                     try:
+                         xq = float(np.nanquantile(vals, q))
+                     except Exception:
+                         xq = float(np.quantile(valid, q))
+                     for j_local, i_global in enumerate(idx):
+                         if not np.isfinite(vals[j_local]) or vals[j_local] < xq:
+                             continue
+                         wm = per_t_window_max[i_global]
+                         for t_idx, wv in enumerate(wm):
+                             if np.isfinite(wv) and wv >= xq:
+                                 years[i_global] = int(t_idx + min_t)
+                                 break
+                 return years
+
+         # Default branch: z-score window logic
+         rarity_matrix = []
+         for seq in self.sequences:
+             prefix = []
+             score = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score)
+
+         rarity_df = pd.DataFrame(rarity_matrix)
+         rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+         # Keep NaNs for zero-variance years and skip NaN windows
+         rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
+
+         years = []
+         for i in range(N):
+             z = rarity_z.iloc[i]
+             year = None
+             for t in range(min_t - 1, max_t):
+                 # Skip windows containing NaN (zero-variance years)
+                 vals = [z.iloc[t + k] for k in range(window)]
+                 if not np.all(np.isfinite(vals)):
+                     continue
+                 # Divergence = high rarity (more atypical)
+                 if inclusive:
+                     condition = all(v >= z_threshold for v in vals)
+                 else:
+                     condition = all(v > z_threshold for v in vals)
+                 if condition:
+                     year = int(t + 1)  # Convert to 1-indexed integer
+                     break
+             years.append(year)
+         return years
+
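# Continuing the toy sketch above: with min_t=2 the search starts at year 2, where
# the outlier's z-score already clears the threshold; everyone else gets None.
first_year = ind.compute_first_divergence_year(z_threshold=1.5, min_t=2, window=1)
# first_year == [None, None, None, None, 2]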
+     def compute_prefix_rarity_per_year(self, as_dataframe: bool = True, column_prefix: str = "t", zscore: bool = False):
+         """
+         Compute per-year prefix rarity scores for each individual.
+
+         For each individual i and year t (1..T), the rarity score is defined as:
+             rarity_{i,t} = -log( freq(prefix_{i,t}) / N )
+         where prefix_{i,t} is the sequence of observed states up to year t for individual i,
+         freq(prefix) counts how many individuals share that exact prefix up to year t,
+         and N is the total number of individuals.
+
+         Parameters
+         ----------
+         as_dataframe : bool, default True
+             If True, returns a pandas DataFrame with columns f"{column_prefix}1"..f"{column_prefix}T".
+             If False, returns a NumPy array of shape (N, T).
+         column_prefix : str, default "t"
+             Column name prefix when returning a DataFrame.
+         zscore : bool, default False
+             If True, z-standardize the rarity scores column-wise (by year) using
+             sample standard deviation (ddof=1).
+
+         Returns
+         -------
+         pandas.DataFrame or np.ndarray
+             Per-year rarity scores (optionally z-scored).
+         """
+         N = len(self.sequences)
+         rarity_matrix = []
+
+         for seq in self.sequences:
+             prefix = []
+             score_list = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score_list.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score_list)
+
+         rarity_arr = np.array(rarity_matrix, dtype=float)
+
+         if zscore:
+             # Column-wise z-score; handle zero-std columns gracefully (leave as NaN)
+             col_means = np.nanmean(rarity_arr, axis=0)
+             col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)  # Use sample std for consistency with pandas
+             with np.errstate(invalid='ignore', divide='ignore'):
+                 rarity_arr = (rarity_arr - col_means) / col_stds
+
+         if not as_dataframe:
+             return rarity_arr
+
+         columns = [f"{column_prefix}{t+1}" for t in range(self.T)]
+         return pd.DataFrame(rarity_arr, columns=columns)
+
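# Worked numbers for rarity_{i,t} = -log(freq(prefix)/N), assuming a toy N = 4:
# a prefix shared by three of four individuals scores low; a unique prefix scores high.
import numpy as np
common = -np.log(3 / 4)  # ≈ 0.288
unique = -np.log(1 / 4)  # ≈ 1.386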
+     def compute_prefix_rarity_score(self):
+         rarity_scores = []
+         N = len(self.sequences)
+
+         for seq in self.sequences:
+             prefix = []
+             score = 0.0
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score += -np.log(freq + 1e-10)  # small constant to avoid log(0)
+             rarity_scores.append(score)
+         return rarity_scores
+
+     def diagnose_divergence_calculation(self, z_threshold=1.5, min_t=2, window=1):
+         """
+         Diagnostic function to analyze the divergence-year calculation and identify
+         years with insufficient variance (std ≈ 0) that cannot trigger divergence.
+
+         This is methodologically appropriate: when all individuals follow similar
+         trajectories in a given year, no divergence should be detected.
+
+         Returns
+         -------
+         dict
+             Diagnostic information including:
+             - years_with_zero_variance: List of years where std ≈ 0
+             - rarity_std_by_year: Standard deviation of rarity scores per year
+             - n_individuals_with_divergence: Count of individuals with any divergence
+             - divergence_year_distribution: Value counts of divergence years
+         """
+         N = len(self.sequences)
+         rarity_matrix = []
+
+         # Calculate rarity scores (same as in compute_first_divergence_year)
+         for seq in self.sequences:
+             prefix = []
+             score = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score)
+
+         rarity_df = pd.DataFrame(rarity_matrix)
+
+         # Calculate standard deviations by year
+         rarity_std_by_year = rarity_df.std(axis=0)
+         years_with_zero_variance = []
+
+         # Identify years with near-zero variance (threshold can be adjusted)
+         for t, std_val in enumerate(rarity_std_by_year):
+             if pd.isna(std_val) or std_val < 1e-10:
+                 years_with_zero_variance.append(t + 1)  # 1-indexed
+
+         # Calculate z-scores
+         rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+
+         # Count individuals with divergence
+         divergence_years = self.compute_first_divergence_year(z_threshold, min_t, window)
+         n_individuals_with_divergence = sum(1 for year in divergence_years if year is not None)
+
+         # Distribution of divergence years
+         divergence_year_counts = pd.Series(divergence_years).value_counts(dropna=False).sort_index()
+
+         return {
+             'years_with_zero_variance': years_with_zero_variance,
+             'rarity_std_by_year': rarity_std_by_year.tolist(),
+             'n_individuals_with_divergence': n_individuals_with_divergence,
+             'divergence_year_distribution': divergence_year_counts.to_dict(),
+             'total_individuals': N,
+             'parameters_used': {
+                 'z_threshold': z_threshold,
+                 'min_t': min_t,
+                 'window': window
+             }
+         }
+
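# Hedged sketch of reading the diagnostic dict (keys as returned above), reusing
# the toy `ind` from the earlier sketch:
report = ind.diagnose_divergence_calculation(z_threshold=1.5, min_t=2, window=1)
# 1-indexed years whose rarity has ~zero variance can never trigger divergence
print("No-variance years:", report["years_with_zero_variance"])
print(report["n_individuals_with_divergence"], "of", report["total_individuals"], "diverged")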
+     def compute_standardized_rarity_score(self, min_t=2, window=1):
+         """
+         Compute standardized rarity scores for divergence classification and visualization,
+         using true statistical z-scores.
+
+         This method computes the standardized rarity score used for individual-level
+         divergence classification:
+
+             standardized_score_i = max over t of ( min_{k=0..window-1} z_{i,t+k} )
+
+         where z_{i,t} are the year-wise standardized prefix rarity scores, using column-wise
+         standardization with sample standard deviation (ddof=1, as computed by pandas).
+
+         The standardized scores can be used with a threshold (e.g., z >= 1.5) to classify
+         individuals as diverged/not diverged, and are particularly useful for visualization.
+
+         Parameters
+         ----------
+         min_t : int, default 2
+             Minimum year (1-indexed) after which divergence is considered valid.
+         window : int, default 1
+             Number of consecutive high-z years required.
+
+         Returns
+         -------
+         List[float]
+             Standardized rarity scores for each individual. Values at or above the chosen
+             z-threshold (e.g., 1.5) indicate divergence.
+
+         Notes
+         -----
+         The standardization uses sample standard deviation (ddof=1) for each year column,
+         which is consistent with pandas' default behavior for DataFrame.std().
+         This is essentially the z-score normalized version of the prefix rarity scores.
+         """
+         N = len(self.sequences)
+         # Step 1: Compute rarity matrix (same as in compute_diverged)
+         rarity_matrix = []
+
+         for seq in self.sequences:
+             prefix = []
+             score = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score)
+
+         # Step 2: Column-wise true z-score standardization (by year, ddof=1)
+         rarity_arr = np.asarray(rarity_matrix, dtype=float)
+         col_means = np.nanmean(rarity_arr, axis=0)
+         col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
+         with np.errstate(invalid='ignore', divide='ignore'):
+             rarity_z = (rarity_arr - col_means) / col_stds
+         # Keep NaN for zero-variance years to allow window skipping downstream
+         rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
+
+         # Step 3: Compute the standardized rarity score for each individual
+         standardized_scores = []
+         for i in range(N):
+             z_scores = rarity_z[i, :]
+             candidate_values = []
+
+             # For each possible starting time t
+             for t in range(min_t - 1, self.T - window + 1):
+                 vals = [z_scores[t + k] for k in range(window)]
+                 # Skip windows containing NaN (e.g., zero-variance years)
+                 if not np.all(np.isfinite(vals)):
+                     continue
+                 # For divergence, take the minimum within the window (all values must be high)
+                 window_min = float(np.min(vals))
+                 candidate_values.append(window_min)
+
+             # Take the maximum across all starting times
+             if candidate_values:
+                 standardized_score = np.nanmax(candidate_values)
+             else:
+                 standardized_score = np.nan
+
+             standardized_scores.append(standardized_score)
+
+         return standardized_scores
+
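# Sketch of the max-over-windows-of-min-z aggregation on an assumed z row,
# with window=2 and min_t=2 (so 0-indexed starting positions 1 .. T-2):
import numpy as np
z_row = np.array([0.2, 1.8, 1.6, np.nan, 2.5])
window, min_t, T = 2, 2, 5
candidates = []
for t in range(min_t - 1, T - window + 1):
    vals = z_row[t:t + window]
    if np.all(np.isfinite(vals)):          # windows touching NaN are skipped
        candidates.append(float(np.min(vals)))
score = max(candidates) if candidates else float("nan")  # -> 1.6 here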
+     def compute_path_uniqueness(self):
+         uniqueness_scores = []
+         for seq in self.sequences:
+             prefix = []
+             count = 0
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 if self.prefix_freq_by_year[t][tuple(prefix)] == 1:
+                     count += 1
+             uniqueness_scores.append(count)
+         return uniqueness_scores
+
+
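# Quick check of path uniqueness on toy data: only the outlier's prefixes are
# unique (frequency 1), at each of the T = 3 time steps.
ind = IndividualDivergence([[1, 2, 3]] * 4 + [[2, 9, 9]])
print(ind.compute_path_uniqueness())  # -> [0, 0, 0, 0, 3]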
741
+ def plot_prefix_rarity_distribution(
742
+ data,
743
+ # === Core Parameters ===
744
+ group_names=None,
745
+ colors=None,
746
+ # === Threshold Settings ===
747
+ show_threshold=True,
748
+ threshold_method="zscore", # Changed default to top_proportion
749
+ proportion_p=0.07, # Simplified parameter name, default 7%
750
+ # === Plotting Options ===
751
+ figsize=(10, 6),
752
+ kde_bw=None,
753
+ # === Export Options ===
754
+ save_as=None,
755
+ dpi=300,
756
+ show=True,
757
+ # === Parameters for Different Methods ===
758
+ z_threshold=1.5,
759
+ is_standardized_score=False,
760
+ quantile_p=0.90
761
+ ):
762
+ """
763
+ Plot prefix rarity score distribution(s) with clean threshold lines.
764
+
765
+ Parameters
766
+ ----------
767
+ data : dict or array-like
768
+ Data to plot. If dict: {"group1": scores1, "group2": scores2}
769
+ If array-like: single group data
770
+ group_names : list, optional
771
+ Custom group names. Auto-detected from dict keys if not provided
772
+ colors : dict or list, optional
773
+ Colors for groups. If None, uses default palette
774
+
775
+ show_threshold : bool, default True
776
+ Whether to show threshold vertical lines
777
+ threshold_method : str, default "top_proportion"
778
+ Threshold method:
779
+ - "top_proportion": Select top proportion_p% most extreme values
780
+ - "quantile": Use quantile_p percentile
781
+ - "zscore": Use z-score threshold (for standardized data)
782
+ proportion_p : float, default 0.05
783
+ Proportion for top_proportion method (e.g., 0.05 = top 5%)
784
+
785
+ figsize : tuple, default (10, 6)
786
+ Figure size (width, height)
787
+ kde_bw : float, optional
788
+ KDE bandwidth adjustment. If None, uses seaborn default
789
+
790
+ save_as : str, optional
791
+ Save path (without extension)
792
+ dpi : int, default 300
793
+ Resolution for saved figure
794
+ show : bool, default True
795
+ Whether to display plot
796
+
797
+ Returns
798
+ -------
799
+ dict
800
+ Statistics including threshold values per group
801
+
802
+ Examples
803
+ --------
804
+ # Basic usage - top 5% threshold (default)
805
+ >>> plot_prefix_rarity_distribution({"India": india_scores, "US": us_scores})
806
+
807
+ # Custom threshold proportion
808
+ >>> plot_prefix_rarity_distribution(
809
+ ... data={"India": india_scores, "US": us_scores},
810
+ ... proportion_p=0.03, # top 3%
811
+ ... save_as="rarity_comparison"
812
+ ... )
813
+
814
+ # Quantile-based threshold
815
+ >>> plot_prefix_rarity_distribution(
816
+ ... data={"India": india_scores, "US": us_scores},
817
+ ... threshold_method="quantile",
818
+ ... quantile_p=0.90, # 90th percentile
819
+ ... )
820
+
821
+ # Clean plot without thresholds
822
+ >>> plot_prefix_rarity_distribution(
823
+ ... data,
824
+ ... show_threshold=False,
825
+ ... colors={"India": "#E8B88A", "US": "#A3BFD9"}
826
+ ... )
827
+ """
828
+ import matplotlib.pyplot as plt
829
+ import seaborn as sns
830
+ import numpy as np
831
+
832
+ # Process input data
833
+ if isinstance(data, dict):
834
+ # Multi-group case
835
+ groups = data
836
+ if group_names is None:
837
+ group_names = list(groups.keys())
838
+ else:
839
+ # Single group case
840
+ if group_names is None:
841
+ group_names = ["Group"]
842
+ groups = {group_names[0]: data}
843
+
844
+ # Set up colors (simplified)
845
+ if colors is None:
846
+ default_colors = ["#A3BFD9", "#E8B88A", "#C6A5CF", "#A6C1A9", "#F4A460", "#87CEEB"]
847
+ color_map = dict(zip(group_names, default_colors[:len(group_names)]))
848
+ elif isinstance(colors, dict):
849
+ color_map = colors
850
+ else:
851
+ color_map = dict(zip(group_names, colors))
852
+
853
+ # Normalize method and prepare stats
854
+ threshold_method = (threshold_method or "top_proportion").lower()
855
+
856
+ # Handle legacy parameter mapping
857
+ if threshold_method in {"top_proportion", "topk", "proportion", "rank"}:
858
+ # Use the simplified proportion_p parameter
859
+ top_proportion_p = proportion_p
860
+ topk_min_count = 1
861
+ elif threshold_method == "quantile":
862
+ # Use quantile_p for quantile method
863
+ pass
864
+ elif threshold_method in {"zscore", "z"} and is_standardized_score:
865
+ # Auto-handle standardized scores
866
+ pass
867
+
868
+ stats = {"per_group": {}, "threshold_method": threshold_method}
869
+
870
+     # Validate quantiles if needed
+     def _check_q(q: float):
+         if not (0 < float(q) < 1):
+             raise ValueError(f"quantile must be in (0,1), got {q}")
+ 
+     quantiles_to_draw = None
+     if threshold_method == "quantile":
+         _check_q(quantile_p)
+         quantiles_to_draw = [quantile_p]  # Simplified - no additional_quantiles
+         # Per-group quantile(s)
+         for g in group_names:
+             if g in groups:
+                 arr = np.asarray(groups[g], dtype=float)
+                 # Compute requested quantiles with NaN handling
+                 valid = arr[~np.isnan(arr)]
+                 thresholds_g = {}
+                 if valid.size > 0:
+                     for q in quantiles_to_draw:
+                         try:
+                             xq = float(np.nanquantile(arr, q))
+                         except Exception:
+                             xq = float(np.quantile(valid, q))
+                         thresholds_g[f"p{int(round(q*100)):02d}"] = xq
+                 else:
+                     for q in quantiles_to_draw:
+                         thresholds_g[f"p{int(round(q*100)):02d}"] = np.nan
+                 # Primary threshold (for backward compatibility)
+                 primary_label = f"p{int(round(quantile_p*100)):02d}"
+                 primary_value = thresholds_g.get(primary_label, np.nan)
+                 # Proportion of scores at or below the primary threshold
+                 vals = valid
+                 prop_below = float(np.nanmean(vals <= primary_value)) if vals.size > 0 and not np.isnan(primary_value) else np.nan
+                 stats["per_group"][g] = {
+                     "threshold_values": thresholds_g,
+                     "is_group_relative": True,
+                     "threshold_value": primary_value,
+                     "primary_quantile": primary_label,
+                     "prop_below": prop_below
+                 }
+ elif threshold_method in {"zscore", "z"}:
909
+ # z-score method (backward compatibility)
910
+ for g in group_names:
911
+ if g in groups:
912
+ arr = np.asarray(groups[g], dtype=float)
913
+ mean_g = np.nanmean(arr)
914
+ std_g = np.nanstd(arr, ddof=1) # sample std to match pandas
915
+ if is_standardized_score:
916
+ x_thresh_g = float(z_threshold)
917
+ else:
918
+ # For prefix (divergence): high scores indicate divergence, so mean + z*std
919
+ x_thresh_g = float(mean_g + z_threshold * std_g)
920
+ vals = arr[~np.isnan(arr)]
921
+ prop_above = float(np.nanmean(vals >= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else np.nan
922
+ stats["per_group"][g] = {
923
+ "mean": float(mean_g),
924
+ "std": float(std_g),
925
+ "threshold_value": float(x_thresh_g),
926
+ "z_threshold": float(z_threshold),
927
+ "is_group_relative": True,
928
+ "prop_above": prop_above,
929
+ "num_above": int(np.sum(vals >= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else 0,
930
+ "n": int(vals.size)
931
+ }
932
+ elif threshold_method in {"topk", "top_proportion", "proportion", "rank"}:
933
+ # Rank-based proportion selection within each group: pick top p% (highest values for prefix divergence)
934
+ if not (0 < float(proportion_p) < 1):
935
+ raise ValueError(f"proportion_p must be in (0,1), got {proportion_p}")
936
+ top_proportion_p = proportion_p # Map to internal variable
937
+ for g in group_names:
938
+ if g in groups:
939
+ arr = np.asarray(groups[g], dtype=float)
940
+ finite_mask = np.isfinite(arr)
941
+ vals = arr[finite_mask]
942
+ n_valid = int(vals.size)
943
+ if n_valid == 0:
944
+ stats["per_group"][g] = {
945
+ "threshold_value": np.nan,
946
+ "k": 0,
947
+ "n": 0,
948
+ "prop_selected": np.nan,
949
+ "num_geq_threshold": 0
950
+ }
951
+ continue
952
+ k = int(np.floor(top_proportion_p * n_valid))
953
+ if k < int(topk_min_count):
954
+ k = int(topk_min_count)
955
+ if k > n_valid:
956
+ k = n_valid
957
+ # Sort descending (most divergent first for prefix)
958
+ order = np.argsort(vals, kind="mergesort")[::-1]
959
+ thresh_val = vals[order[k - 1]] if k >= 1 else np.nan
960
+ num_geq = int(np.sum(vals >= thresh_val)) if k >= 1 and np.isfinite(thresh_val) else 0
961
+ stats["per_group"][g] = {
962
+ "threshold_value": float(thresh_val) if np.isfinite(thresh_val) else np.nan,
963
+ "k": int(k),
964
+ "n": int(n_valid),
965
+ "prop_selected": (k / n_valid) if n_valid > 0 else np.nan,
966
+ "num_geq_threshold": num_geq
967
+ }
968
+ stats["threshold_method"] = "topk"
969
+ else:
970
+ raise ValueError(f"Unknown threshold_method: {threshold_method}")
971
+
972
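+     # Illustration of the three cutoffs on hypothetical numbers (not computed
+     # from real data): for a group with 200 finite scores,
+     #   - top_proportion with proportion_p=0.05 selects k = floor(0.05 * 200) = 10,
+     #     so the threshold is the 10th-largest score (ties are all counted);
+     #   - quantile with quantile_p=0.90 draws the line at np.nanquantile(arr, 0.90);
+     #   - zscore with z_threshold=1.5, mean 0.20 and sample std 0.10 puts the
+     #     cutoff at 0.20 + 1.5 * 0.10 = 0.35.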
+     # Create plot
+     plt.figure(figsize=figsize)
+ 
+     # Plot one density curve per group
+     for group_name in group_names:
+         if group_name in groups:
+             scores = groups[group_name]
+             color = color_map.get(group_name, "#1f77b4")
+             arr = np.asarray(scores, dtype=float)
+             vmin = np.nanmin(arr) if np.isfinite(arr).any() else None
+             vmax = np.nanmax(arr) if np.isfinite(arr).any() else None
+             kde_kwargs = {"label": group_name, "fill": True, "color": color, "linewidth": 2}
+             if kde_bw is not None:
+                 kde_kwargs["bw_adjust"] = kde_bw
+             if vmin is not None and vmax is not None and vmin < vmax:
+                 kde_kwargs["clip"] = (vmin, vmax)
+             sns.kdeplot(arr, **kde_kwargs)
+ 
+     # Add per-group threshold lines if requested (color-matched, no text labels)
+     if show_threshold:
+         for g in group_names:
+             if g in stats["per_group"]:
+                 color = color_map.get(g, "#1f77b4")
+                 if threshold_method == "quantile":
+                     # Draw one line per requested quantile
+                     thresholds_g = stats["per_group"][g]["threshold_values"]
+                     for _, xg in sorted(thresholds_g.items()):
+                         if np.isnan(xg):
+                             continue
+                         plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+                 elif threshold_method in {"zscore", "z"}:
+                     xg = stats["per_group"][g]["threshold_value"]
+                     plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+                 else:  # top_proportion
+                     xg = stats["per_group"][g]["threshold_value"]
+                     if np.isfinite(xg):
+                         plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+ 
+     # Formatting
+     if is_standardized_score:
+         plt.xlabel("Standardized Prefix Rarity Score", fontsize=13)
+     else:
+         plt.xlabel("Prefix Rarity Score", fontsize=13)
+     plt.ylabel("Density", fontsize=13)
+     if len(group_names) > 1:
+         plt.legend(title="Country")
+     sns.despine()
+     plt.tight_layout()
+ 
+     # Save and show
+     if save_as:
+         plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')
+     if show:
+         plt.show()
+ 
+     return stats
+ 
+ 
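+ # A minimal, self-contained usage sketch (the scores here are synthetic,
+ # purely for illustration; real inputs come from the upstream prefix-rarity
+ # computation):
+ #
+ #     import numpy as np
+ #     rng = np.random.default_rng(0)
+ #     demo = {"India": rng.normal(0.0, 1.0, 500), "US": rng.normal(0.4, 1.2, 500)}
+ #     out = plot_prefix_rarity_distribution(demo, proportion_p=0.05, show=False)
+ #     print(out["per_group"]["India"]["threshold_value"])  # top-5% cutoff for India
+ 
+ 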
+ def plot_individual_indicators_correlation(
+         df,
+         indicator_columns=None,
+         correlation_method='pearson',
+         group_column=None,
+         figsize=(10, 8),
+         cmap='RdBu_r',
+         center=0,
+         annot=True,
+         fmt='.2f',
+         save_as=None,
+         dpi=300,
+         show=True
+ ):
+     """
+     Plot a correlation heatmap of individual-level indicators.
+ 
+     Parameters
+     ----------
+     df : pandas.DataFrame
+         DataFrame containing individual-level indicators
+     indicator_columns : list, optional
+         Column names to include in the correlation analysis.
+         If None, indicator columns are detected automatically (diverged,
+         divergence_year, prefix_rarity_score, path_uniqueness, etc.)
+     correlation_method : str, default 'pearson'
+         Correlation method: 'pearson', 'spearman', or 'kendall'
+     group_column : str, optional
+         Column name for grouping (e.g., 'country'). If provided, a separate
+         heatmap is drawn for each group
+     figsize : tuple, default (10, 8)
+         Figure size as (width, height)
+     cmap : str, default 'RdBu_r'
+         Colormap for the heatmap. Options: 'RdBu_r', 'coolwarm', 'viridis', 'plasma'
+     center : float, default 0
+         Value at which to center the colormap
+     annot : bool, default True
+         Whether to annotate cells with correlation values
+     fmt : str, default '.2f'
+         Format string for annotations
+     save_as : str, optional
+         Path to save the figure (without extension)
+     dpi : int, default 300
+         Resolution for the saved figure
+     show : bool, default True
+         Whether to display the plot
+ 
+     Returns
+     -------
+     dict
+         Correlation matrix/matrices and summary statistics
+ 
+     Examples
+     --------
+     # Basic usage
+     >>> plot_individual_indicators_correlation(df)
+ 
+     # Custom indicators with grouping
+     >>> plot_individual_indicators_correlation(
+     ...     df,
+     ...     indicator_columns=['diverged', 'prefix_rarity_score', 'path_uniqueness'],
+     ...     group_column='country',
+     ...     correlation_method='spearman'
+     ... )
+ 
+     # Custom styling
+     >>> plot_individual_indicators_correlation(
+     ...     df,
+     ...     cmap='plasma',
+     ...     figsize=(12, 10),
+     ...     save_as="indicators_correlation_heatmap"
+     ... )
+     """
+     import matplotlib.pyplot as plt
+     import seaborn as sns
+     import pandas as pd
+     import numpy as np
+ 
+     # Auto-detect indicator columns if not provided
+     if indicator_columns is None:
+         # Common individual-level indicator names
+         potential_indicators = [
+             'diverged', 'first_divergence_year', 'divergence_year',
+             'prefix_rarity_score', 'path_uniqueness',
+             'rarity_score', 'uniqueness_score'
+         ]
+         indicator_columns = [col for col in df.columns if col in potential_indicators]
+ 
+         # Also include numeric columns whose names suggest an indicator
+         numeric_cols = df.select_dtypes(include=[np.number]).columns
+         for col in numeric_cols:
+             if col not in indicator_columns and any(
+                 keyword in col.lower() for keyword in
+                 ['score', 'index', 'count', 'factor', 'rate', 'ratio']
+             ):
+                 indicator_columns.append(col)
+ 
+     # Filter and clean the data
+     df_indicators = df[indicator_columns].copy()
+ 
+     # Coerce object columns to numeric where possible
+     for col in df_indicators.columns:
+         if df_indicators[col].dtype == 'object':
+             df_indicators[col] = pd.to_numeric(df_indicators[col], errors='coerce')
+ 
+     # Drop columns with more than 50% missing values
+     valid_cols = [
+         col for col in df_indicators.columns
+         if df_indicators[col].notna().sum() / len(df_indicators) > 0.5
+     ]
+     df_indicators = df_indicators[valid_cols]
+ 
+     # Drop rows with any missing values before computing correlations
+     df_clean = df_indicators.dropna()
+ 
+     if len(df_clean) == 0:
+         raise ValueError("No valid data remaining after cleaning. Check for missing values.")
+ 
+     # Calculate correlations
+     results = {}
+ 
+     if group_column is None or group_column not in df.columns:
+         # Single correlation matrix
+         corr_matrix = df_clean.corr(method=correlation_method)
+         results['overall'] = corr_matrix
+ 
+         # Create plot
+         plt.figure(figsize=figsize)
+ 
+         # Mask the upper triangle so each pair is shown only once
+         mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+ 
+         sns.heatmap(
+             corr_matrix,
+             mask=mask,
+             annot=annot,
+             fmt=fmt,
+             cmap=cmap,
+             center=center,
+             square=True,
+             cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"},
+             linewidths=0.5
+         )
+ 
+         plt.title(f"Individual-Level Indicators Correlation Heatmap\n({correlation_method.title()} Correlation)",
+                   fontsize=14, pad=20)
+         plt.xticks(rotation=45, ha='right')
+         plt.yticks(rotation=0)
+ 
+     else:
+         # One correlation matrix per group
+         groups = df[group_column].unique()
+         n_groups = len(groups)
+ 
+         # Choose the subplot grid and scale the figure accordingly
+         if n_groups <= 2:
+             nrows, ncols = 1, n_groups
+             figsize = (figsize[0] * n_groups, figsize[1])
+         else:
+             ncols = min(3, n_groups)
+             nrows = (n_groups + ncols - 1) // ncols
+             figsize = (figsize[0] * ncols, figsize[1] * nrows)
+ 
+         fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
+         # Normalize `axes` to a flat array regardless of the grid shape
+         axes = np.atleast_1d(axes).ravel()
+ 
+         for i, group in enumerate(groups):
+             # Clean each group the same way as the pooled data
+             group_data = (
+                 df[df[group_column] == group][valid_cols]
+                 .apply(pd.to_numeric, errors='coerce')
+                 .dropna()
+             )
+ 
+             if len(group_data) < 2:
+                 print(f"Warning: Group '{group}' has insufficient data for correlation")
+                 continue
+ 
+             corr_matrix = group_data.corr(method=correlation_method)
+             results[group] = corr_matrix
+ 
+             # Mask the upper triangle
+             mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+ 
+             sns.heatmap(
+                 corr_matrix,
+                 mask=mask,
+                 annot=annot,
+                 fmt=fmt,
+                 cmap=cmap,
+                 center=center,
+                 square=True,
+                 cbar=(i == 0),  # only show the colorbar on the first subplot
+                 cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"} if i == 0 else {},
+                 linewidths=0.5,
+                 ax=axes[i]
+             )
+ 
+             axes[i].set_title(f"{group}\n({len(group_data)} individuals)", fontsize=12)
+             axes[i].set_xticks(axes[i].get_xticks())
+             axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45, ha='right')
+             axes[i].set_yticks(axes[i].get_yticks())
+             axes[i].set_yticklabels(axes[i].get_yticklabels(), rotation=0)
+ 
+         # Hide unused subplots
+         for j in range(i + 1, len(axes)):
+             axes[j].set_visible(False)
+ 
+         plt.suptitle(f"Individual-Level Indicators Correlation by {group_column.title()}\n({correlation_method.title()} Correlation)",
+                      fontsize=16, y=0.98)
+ 
+     plt.tight_layout()
+ 
+     # Save and show
+     if save_as:
+         plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')
+     if show:
+         plt.show()
+ 
+     # Add summary statistics (the same branch condition as the plotting above,
+     # so a group_column missing from df cannot raise a KeyError here)
+     if group_column is None or group_column not in df.columns:
+         sample_size = len(df_clean)
+     else:
+         sample_size = {}
+         for g in df[group_column].unique():
+             g_clean = df[df[group_column] == g][valid_cols].apply(pd.to_numeric, errors='coerce').dropna()
+             sample_size[g] = len(g_clean)
+ 
+     results['summary'] = {
+         'method': correlation_method,
+         'n_indicators': len(valid_cols),
+         'indicators_included': valid_cols,
+         'sample_size': sample_size
+     }
+ 
+     return results
+ 
+ 
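+ # A minimal usage sketch (hypothetical column names, chosen to match the
+ # auto-detection list above; not taken from a real dataset):
+ #
+ #     import numpy as np, pandas as pd
+ #     rng = np.random.default_rng(1)
+ #     demo = pd.DataFrame({
+ #         "prefix_rarity_score": rng.normal(size=100),
+ #         "path_uniqueness": rng.integers(0, 10, size=100),
+ #         "diverged": rng.integers(0, 2, size=100),
+ #     })
+ #     res = plot_individual_indicators_correlation(demo, show=False)
+ #     print(res["summary"]["indicators_included"])
+ 
+ 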
+ def compute_path_uniqueness_by_group_prefix(sequences, group_labels):
+     """
+     Compute path uniqueness within each subgroup defined by group_labels,
+     using the same prefix-based logic as the divergence module.
+ 
+     :param sequences: List of sequences (all of equal length).
+     :param group_labels: List of group keys (same length as sequences), e.g., country, gender.
+     :return: List of path uniqueness scores (same order as the input).
+     """
+     from collections import defaultdict
+     import pandas as pd
+ 
+     T = len(sequences[0])
+     df = pd.DataFrame({
+         "sequence": sequences,
+         "group": group_labels
+     })
+ 
+     # Step 1: Precompute prefix frequency tables per group
+     group_prefix_freq = {}
+     for group, group_df in df.groupby("group"):
+         prefix_freq = [defaultdict(int) for _ in range(T)]
+         for seq in group_df["sequence"]:
+             prefix = []
+             for t in range(T):
+                 prefix.append(seq[t])
+                 prefix_freq[t][tuple(prefix)] += 1
+         group_prefix_freq[group] = prefix_freq
+ 
+     # Step 2: For each individual, count the time points at which their
+     # prefix is unique within their group
+     uniqueness_scores = []
+     for seq, group in zip(sequences, group_labels):
+         prefix_freq = group_prefix_freq[group]
+         prefix = []
+         count = 0
+         for t in range(T):
+             prefix.append(seq[t])
+             if prefix_freq[t][tuple(prefix)] == 1:
+                 count += 1
+         uniqueness_scores.append(count)
+ 
+     return uniqueness_scores
+ return uniqueness_scores