sequenzo-0.1.21-cp39-cp39-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sequenzo might be problematic.

Files changed (260)
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-39-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1638 @@
+"""
+@Author : Yuqi Liang 梁彧祺
+@File : individual_level_indicators.py
+@Time : 08/08/2025 15:30
+@Desc :
+This module provides methods for calculating individual-level convergence indicators
+in sequence data analysis. It includes tools to assess convergence, identify timing,
+measure suffix rarity, and evaluate path uniqueness.
+
+The convergence indicators capture whether, when, and to what extent a person's trajectory
+aligns with dominant population patterns over time.
+
+Key indicators:
+- Suffix Rarity Score: cumulative rarity of path suffixes (positive, higher = rarer)
+- Binary converged indicator: low rarity z-scores indicate convergence to typical patterns
+- First convergence year: timing when trajectory becomes more typical (low rarity)
+- Path uniqueness for extreme structural isolation
+"""
+from collections import defaultdict
+from typing import Optional, List
+import numpy as np
+import pandas as pd
+
+
+class IndividualConvergence:
+    def __init__(self, sequences):
+        # Handle case where sequences might already be an IndividualConvergence object
+        if isinstance(sequences, IndividualConvergence):
+            # Extract sequences from existing object
+            self.sequences = sequences.sequences
+        elif hasattr(sequences, 'sequences'):
+            # Handle case where input might be another object with a sequences attribute
+            self.sequences = sequences.sequences
+        else:
+            # Normal case: sequences is a list of sequences
+            self.sequences = sequences
+
+        # Validate input
+        if not self.sequences or len(self.sequences) == 0:
+            raise ValueError("sequences cannot be empty")
+        if not hasattr(self.sequences[0], '__len__') and not hasattr(self.sequences[0], '__iter__'):
+            raise ValueError("sequences must be a list of sequences (e.g., [[1,2,3], [2,3,1], ...])")
+
+        # Validate that all sequences have the same length, preventing silent errors from ragged input
+        L0 = len(self.sequences[0])
+        if any(len(s) != L0 for s in self.sequences):
+            raise ValueError("All sequences must have the same length")
+        self.T = L0
+
+        self.suffix_freq_by_year = self._build_suffix_frequencies()
+
+    def _build_suffix_frequencies(self):
+        """
+        Build suffix frequencies for each year t.
+        suffix[t] contains the frequency of suffixes from year t to the end for all individuals.
+        """
+        freq_by_year = [defaultdict(int) for _ in range(self.T)]
+        for seq in self.sequences:
+            for t in range(self.T):
+                suffix = tuple(seq[t:])  # suffix from year t to end
+                freq_by_year[t][suffix] += 1
+        return freq_by_year
+
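For orientation, a minimal sketch of what `_build_suffix_frequencies` produces (an editor's illustration, not part of the diff):

    seqs = [[1, 2, 2], [1, 2, 2], [1, 3, 2]]
    conv = IndividualConvergence(seqs)
    # conv.suffix_freq_by_year[0] counts whole sequences:      {(1, 2, 2): 2, (1, 3, 2): 1}
    # conv.suffix_freq_by_year[1] counts year-2-onward suffixes: {(2, 2): 2, (3, 2): 1}
    # conv.suffix_freq_by_year[2] counts final-year states:      {(2,): 3}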
+    # Divergence-related computations are intentionally omitted in this convergence-focused module.
+
+    def compute_converged(
+        self,
+        z_threshold=1.5,
+        min_t=1,
+        max_t=None,
+        window=1,
+        inclusive=False,
+        group_labels=None,
+        *,
+        method: str = "zscore",
+        proportion: Optional[float] = None,
+        quantile_p: Optional[float] = None,
+        min_count: int = 1,
+    ):
+        """
+        Compute binary convergence flags with multiple selection methods.
+
+        Definition (common intuition): lower suffix rarity implies more typical behavior.
+        We compute per-year rarity via suffix frequencies and then detect convergence using
+        one of the following methods:
+
+        Methods
+        -------
+        - "zscore" (window-based, default):
+          Uses per-year z-scores of rarity. A person is converged if there exists a window
+          of length `window` starting between years `[min_t, max_t]` where all z-scores are
+          below `-z_threshold` (use `inclusive=True` for `<=`). Zero-variance years remain
+          NaN and any window containing NaN is skipped.
+
+        - "top_proportion" (aka "topk"/"proportion"/"rank"):
+          Uses the aggregated standardized score from `compute_standardized_rarity_score`
+          (lower = more typical). Select the most typical `proportion` within each group if
+          `group_labels` is provided, otherwise globally. `min_count` ensures at least the
+          specified number per group.
+
+        - "quantile":
+          Uses a quantile threshold (`quantile_p`) on the aggregated standardized score,
+          within each group (or globally if no `group_labels`). Individuals at or below the
+          threshold are marked converged.
+
+        Parameters
+        ----------
+        z_threshold : float, default 1.5
+            zscore method only. Converged when z < -z_threshold (or <= if inclusive=True).
+        min_t, max_t : int
+            Search interval for the starting year (1-indexed). If max_t is None, uses T - window + 1.
+        window : int, default 1
+            Number of consecutive years required in the zscore method and used in standardized aggregation.
+        inclusive : bool, default False
+            zscore method only. If True, use <= comparisons.
+        group_labels : array-like or None
+            If provided, proportion/quantile selections are computed within each group.
+        method : str, default "zscore"
+            One of {"zscore", "top_proportion" (aliases: "topk", "proportion", "rank"), "quantile"}.
+        proportion : float or None
+            For top_proportion. Fraction in (0, 1) to select as converged. Defaults to 0.10 if None.
+        quantile_p : float or None
+            For quantile. Quantile in (0, 1) used as threshold. Defaults to 0.10 if None.
+        min_count : int, default 1
+            For top_proportion. Lower bound for the number selected per group.
+
+        Returns
+        -------
+        List[int]
+            0/1 indicator for each individual.
+        """
+        if max_t is None:
+            max_t = self.T - window + 1
+
+        N = len(self.sequences)
+        method_norm = (method or "zscore").lower()
+
+        # Branch: rank/quantile style selections using aggregated standardized scores
+        if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
+            p = proportion if proportion is not None else 0.10
+            scores = np.asarray(
+                self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float
+            )
+            if group_labels is None:
+                vals = scores
+                finite_mask = np.isfinite(vals)
+                n_valid = int(np.sum(finite_mask))
+                if n_valid == 0:
+                    return [0] * N
+                k = int(np.floor(p * n_valid))
+                if k < int(min_count):
+                    k = int(min_count)
+                if k > n_valid:
+                    k = n_valid
+                order = np.argsort(np.where(np.isfinite(vals), vals, np.inf), kind="mergesort")
+                flags = np.zeros(N, dtype=int)
+                if k >= 1:
+                    selected = order[:k]
+                    flags[selected] = 1
+                return flags.tolist()
+            else:
+                flags, _ = self.compute_converged_by_top_proportion(
+                    group_labels=group_labels,
+                    proportion=float(p),
+                    min_t=min_t,
+                    max_t=max_t,
+                    window=window,
+                    min_count=min_count,
+                )
+                return flags
+
+        if method_norm == "quantile":
+            q = quantile_p if quantile_p is not None else 0.10
+            scores = np.asarray(
+                self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float
+            )
+            flags = np.zeros(N, dtype=int)
+            if group_labels is None:
+                # Global quantile
+                valid = scores[np.isfinite(scores)]
+                if valid.size == 0:
+                    return flags.tolist()
+                try:
+                    xq = float(np.nanquantile(scores, q))
+                except Exception:
+                    xq = float(np.quantile(valid, q))
+                flags[np.where(scores <= xq)[0]] = 1
+                return flags.tolist()
+            else:
+                labels = np.asarray(group_labels)
+                for g in pd.unique(labels):
+                    idx = np.where(labels == g)[0]
+                    vals = scores[idx]
+                    valid = vals[np.isfinite(vals)]
+                    if valid.size == 0:
+                        continue
+                    try:
+                        xq = float(np.nanquantile(vals, q))
+                    except Exception:
+                        xq = float(np.quantile(valid, q))
+                    local = np.where(vals <= xq)[0]
+                    flags[idx[local]] = 1
+                return flags.tolist()
+
+        # Default branch: z-score window logic (supports group or global frequencies)
+        if group_labels is not None:
+            # Within-group convergence: use group-level frequencies and sample sizes
+            return self._compute_converged_by_group(z_threshold, min_t, max_t, window, inclusive, group_labels)
+
+        # Compute rarity using global frequencies
+        rarity_matrix = []
+        for seq in self.sequences:
+            score = []
+            for t in range(self.T):
+                suffix = tuple(seq[t:])
+                freq = self.suffix_freq_by_year[t][suffix] / N
+                score.append(-np.log(freq + 1e-10))
+            rarity_matrix.append(score)
+
+        rarity_df = pd.DataFrame(rarity_matrix)
+        # Column-wise z-standardization; keep NaN (zero-variance years) so that
+        # windows containing NaN are skipped during detection below
+        rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+        rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
+
+        flags = []
+        for i in range(N):
+            z = rarity_z.iloc[i]
+            converged = 0
+            for t in range(min_t - 1, max_t):  # min_t-1 for 0-indexed; max_t already accounts for window
+                # Skip windows containing NaN (e.g., zero-variance years)
+                vals = [z.iloc[t + k] for k in range(window)]
+                if not np.all(np.isfinite(vals)):
+                    continue
+                # Convergence = low rarity (more typical)
+                if inclusive:
+                    condition = all(v <= -z_threshold for v in vals)
+                else:
+                    condition = all(v < -z_threshold for v in vals)
+                if condition:
+                    converged = 1
+                    break
+            flags.append(converged)
+        return flags
+
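A minimal usage sketch of the three selection methods (an editor's illustration with assumed toy data, not part of the diff):

    seqs = [[1, 1, 2, 2], [1, 1, 2, 2], [1, 2, 2, 2], [1, 3, 3, 3]]
    conv = IndividualConvergence(seqs)
    flags_z = conv.compute_converged(z_threshold=1.5, window=2)                  # window-based z-score rule
    flags_p = conv.compute_converged(method="top_proportion", proportion=0.25)   # most typical 25%
    flags_q = conv.compute_converged(method="quantile", quantile_p=0.25)         # quantile threshold

Each call returns one 0/1 flag per individual; the three methods differ only in how the rarity threshold is chosen.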
+    def _compute_converged_by_group(self, z_threshold, min_t, max_t, window, inclusive, group_labels):
+        """
+        Compute within-group convergence: rarity is computed from group-level
+        frequencies and sample sizes.
+        """
+        from collections import defaultdict
+
+        # Per-group suffix frequency tables
+        group_suffix_freq = {}
+        group_sizes = {}
+
+        # First, collect the sequences belonging to each group
+        group_sequences = defaultdict(list)
+        for i, (seq, group) in enumerate(zip(self.sequences, group_labels)):
+            group_sequences[group].append((i, seq))
+
+        # Build a suffix frequency table for each group
+        for group, seq_list in group_sequences.items():
+            group_sizes[group] = len(seq_list)
+            freq_by_year = [defaultdict(int) for _ in range(self.T)]
+
+            for _, seq in seq_list:
+                for t in range(self.T):
+                    suffix = tuple(seq[t:])
+                    freq_by_year[t][suffix] += 1
+
+            group_suffix_freq[group] = freq_by_year
+
+        # Compute within-group rarity for each individual
+        all_flags = [0] * len(self.sequences)
+
+        for group, seq_list in group_sequences.items():
+            group_n = group_sizes[group]
+            group_freq = group_suffix_freq[group]
+
+            # Rarity matrix for this group
+            rarity_matrix = []
+            group_indices = []
+
+            for orig_idx, seq in seq_list:
+                group_indices.append(orig_idx)
+                score = []
+                for t in range(self.T):
+                    suffix = tuple(seq[t:])
+                    freq = group_freq[t][suffix] / group_n
+                    score.append(-np.log(freq + 1e-10))
+                rarity_matrix.append(score)
+
+            # Compute z-scores
+            rarity_df = pd.DataFrame(rarity_matrix)
+            rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+            rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
+
+            # Determine convergence
+            for i, orig_idx in enumerate(group_indices):
+                z = rarity_z.iloc[i]
+                converged = 0
+                for t in range(min_t - 1, max_t):
+                    vals = [z.iloc[t + k] for k in range(window)]
+                    if not np.all(np.isfinite(vals)):
+                        continue
+                    if inclusive:
+                        condition = all(v <= -z_threshold for v in vals)
+                    else:
+                        condition = all(v < -z_threshold for v in vals)
+                    if condition:
+                        converged = 1
+                        break
+
+                all_flags[orig_idx] = converged
+
+        return all_flags
+
+    # First-divergence timing is intentionally omitted in this convergence-focused module.
+
+    def compute_first_convergence_year(
+        self,
+        z_threshold=1.5,
+        min_t=1,
+        max_t=None,
+        window=1,
+        inclusive=False,
+        group_labels=None,
+        *,
+        method: str = "zscore",
+        proportion: Optional[float] = None,
+        quantile_p: Optional[float] = None,
+        min_count: int = 1,
+    ):
+        """
+        Compute the first convergence year per individual with multiple selection methods.
+
+        Methods
+        -------
+        - "zscore" (default):
+          Find the earliest starting year t in [min_t, max_t] such that all z-scores in the
+          length-`window` block are below `-z_threshold` (or <= if inclusive=True). Zero-variance
+          years are NaN; windows containing NaN are skipped.
+
+        - "top_proportion" (aka "topk"/"proportion"/"rank"):
+          Use aggregated standardized scores to pick the most typical `proportion` within each group
+          (or globally). For the selected individuals, return the earliest t where the per-window
+          max z-score is <= the selection threshold; others return None. `min_count` is respected.
+
+        - "quantile":
+          Use a per-group (or global) quantile threshold `quantile_p` on aggregated standardized scores;
+          individuals at or below the threshold return the earliest qualifying year; others return None.
+
+        Parameters
+        ----------
+        z_threshold, min_t, max_t, window, inclusive, group_labels
+            Same definitions as in `compute_converged` for the zscore method.
+        method : str, default "zscore"
+            One of {"zscore", "top_proportion" (aliases: "topk", "proportion", "rank"), "quantile"}.
+        proportion : float or None
+            For top_proportion. Fraction in (0, 1) to select as converged. Defaults to 0.10 if None.
+        quantile_p : float or None
+            For quantile. Quantile in (0, 1) used as threshold. Defaults to 0.10 if None.
+        min_count : int, default 1
+            For top_proportion. Lower bound for the number selected per group.
+
+        Returns
+        -------
+        List[Optional[int]]
+            First convergence years (1-indexed). None indicates no convergence.
+        """
+        if max_t is None:
+            max_t = self.T - window + 1
+
+        N = len(self.sequences)
+        method_norm = (method or "zscore").lower()
+
+        # Helper: standardized z matrix and per-t window maxima per individual
+        def _compute_window_max_list():
+            # Build rarity matrix and column-wise z (global standardization)
+            rarity_matrix = []
+            for seq in self.sequences:
+                score = []
+                for t in range(self.T):
+                    suffix = tuple(seq[t:])
+                    freq = self.suffix_freq_by_year[t][suffix] / N
+                    score.append(-np.log(freq + 1e-10))
+                rarity_matrix.append(score)
+            rarity_arr = np.asarray(rarity_matrix, dtype=float)
+            col_means = np.nanmean(rarity_arr, axis=0)
+            col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
+            with np.errstate(invalid='ignore', divide='ignore'):
+                rarity_z = (rarity_arr - col_means) / col_stds
+            rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
+            # Compute per-individual window maxima sequence over t
+            window_maxes = []  # one list of window maxima per individual
+            for i in range(N):
+                z_scores = rarity_z[i, :]
+                vals_per_t = []
+                for t0 in range(min_t - 1, max_t):
+                    vals = [z_scores[t0 + k] for k in range(window)]
+                    if not np.all(np.isfinite(vals)):
+                        vals_per_t.append(np.nan)
+                    else:
+                        vals_per_t.append(float(np.max(vals)))
+                window_maxes.append(vals_per_t)
+            return np.asarray(window_maxes, dtype=float)
+
+        # Branches for rank/quantile-style thresholds
+        if method_norm in {"top_proportion", "topk", "proportion", "rank", "quantile"}:
+            # Compute aggregated scores for thresholding
+            agg_scores = np.asarray(
+                self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float
+            )
+            per_t_window_max = _compute_window_max_list()
+
+            if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
+                p = proportion if proportion is not None else 0.10
+                if group_labels is None:
+                    vals = agg_scores
+                    finite_mask = np.isfinite(vals)
+                    n_valid = int(np.sum(finite_mask))
+                    if n_valid == 0:
+                        return [None] * N
+                    k = int(np.floor(p * n_valid))
+                    if k < int(min_count):
+                        k = int(min_count)
+                    if k > n_valid:
+                        k = n_valid
+                    order = np.argsort(np.where(np.isfinite(vals), vals, np.inf), kind="mergesort")
+                    selected_idx = set(order[:k].tolist()) if k >= 1 else set()
+                    years = []
+                    for i in range(N):
+                        if i not in selected_idx:
+                            years.append(None)
+                            continue
+                        wm = per_t_window_max[i]
+                        # threshold value is the kth value
+                        thresh_val = vals[order[k - 1]] if k >= 1 else np.nan
+                        if not np.isfinite(thresh_val):
+                            years.append(None)
+                            continue
+                        # earliest t where window_max <= threshold
+                        yr = None
+                        for t_idx, wv in enumerate(wm):
+                            if np.isfinite(wv) and wv <= float(thresh_val):
+                                yr = t_idx + 1  # 1-indexed
+                                break
+                        years.append(yr)
+                    return years
+                else:
+                    labels = np.asarray(group_labels)
+                    years = [None] * N
+                    for g in pd.unique(labels):
+                        idx = np.where(labels == g)[0]
+                        vals = agg_scores[idx]
+                        finite_mask = np.isfinite(vals)
+                        n_valid = int(np.sum(finite_mask))
+                        if n_valid == 0:
+                            continue
+                        k = int(np.floor(p * n_valid))
+                        if k < int(min_count):
+                            k = int(min_count)
+                        if k > n_valid:
+                            k = n_valid
+                        order_local = np.argsort(np.where(np.isfinite(vals), vals, np.inf), kind="mergesort")
+                        selected_local = set(order_local[:k].tolist()) if k >= 1 else set()
+                        thresh_val = vals[order_local[k - 1]] if k >= 1 else np.nan
+                        for j_local, i_global in enumerate(idx):
+                            if j_local not in selected_local or not np.isfinite(thresh_val):
+                                continue
+                            wm = per_t_window_max[i_global]
+                            for t_idx, wv in enumerate(wm):
+                                if np.isfinite(wv) and wv <= float(thresh_val):
+                                    years[i_global] = int(t_idx + 1)
+                                    break
+                    return years
+
+            # quantile branch
+            q = quantile_p if quantile_p is not None else 0.10
+            years = [None] * N
+            if group_labels is None:
+                valid = agg_scores[np.isfinite(agg_scores)]
+                if valid.size == 0:
+                    return years
+                try:
+                    xq = float(np.nanquantile(agg_scores, q))
+                except Exception:
+                    xq = float(np.quantile(valid, q))
+                for i in range(N):
+                    if not np.isfinite(agg_scores[i]) or agg_scores[i] > xq:
+                        continue
+                    wm = per_t_window_max[i]
+                    for t_idx, wv in enumerate(wm):
+                        if np.isfinite(wv) and wv <= xq:
+                            years[i] = int(t_idx + 1)
+                            break
+                return years
+            else:
+                labels = np.asarray(group_labels)
+                for g in pd.unique(labels):
+                    idx = np.where(labels == g)[0]
+                    vals = agg_scores[idx]
+                    valid = vals[np.isfinite(vals)]
+                    if valid.size == 0:
+                        continue
+                    try:
+                        xq = float(np.nanquantile(vals, q))
+                    except Exception:
+                        xq = float(np.quantile(valid, q))
+                    for j_local, i_global in enumerate(idx):
+                        if not np.isfinite(vals[j_local]) or vals[j_local] > xq:
+                            continue
+                        wm = per_t_window_max[i_global]
+                        for t_idx, wv in enumerate(wm):
+                            if np.isfinite(wv) and wv <= xq:
+                                years[i_global] = t_idx + 1
+                                break
+                return years
+
+        if group_labels is not None and method_norm == "zscore":
+            # Within-group convergence: use group-level frequencies and sample sizes
+            return self._compute_first_convergence_year_by_group(z_threshold, min_t, max_t, window, inclusive, group_labels)
+
+        # Compute rarity using global frequencies
+        rarity_matrix = []
+        for seq in self.sequences:
+            score = []
+            for t in range(self.T):
+                suffix = tuple(seq[t:])
+                freq = self.suffix_freq_by_year[t][suffix] / N
+                score.append(-np.log(freq + 1e-10))
+            rarity_matrix.append(score)
+
+        rarity_df = pd.DataFrame(rarity_matrix)
+        rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+        # Keep NaN so windows spanning zero-variance years are skipped
+        rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
+
+        years = []
+        for i in range(N):
+            z = rarity_z.iloc[i]
+            year = None
+            for t in range(min_t - 1, max_t):  # min_t-1 for 0-indexed; max_t already accounts for window
+                vals = [z.iloc[t + k] for k in range(window)]
+                if not np.all(np.isfinite(vals)):
+                    continue
+                # Convergence = low rarity (more typical)
+                if inclusive:
+                    condition = all(v <= -z_threshold for v in vals)
+                else:
+                    condition = all(v < -z_threshold for v in vals)
+                if condition:
+                    year = int(t + 1)  # convert to 1-indexed integer
+                    break
+            years.append(year)
+        return years
+
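Continuing the sketch above, the timing variant returns 1-indexed years or None (an editor's illustration, not part of the diff):

    years = conv.compute_first_convergence_year(z_threshold=1.0, window=1)
    # One entry per individual: the earliest year whose window of rarity
    # z-scores falls below -z_threshold, or None if no such window exists.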
+    def _compute_first_convergence_year_by_group(self, z_threshold, min_t, max_t, window, inclusive, group_labels):
+        """
+        Compute the first within-group convergence year: rarity is computed from
+        group-level frequencies and sample sizes.
+        """
+        from collections import defaultdict
+
+        # Group the sequences (reuses the logic of _compute_converged_by_group)
+        group_sequences = defaultdict(list)
+        for i, (seq, group) in enumerate(zip(self.sequences, group_labels)):
+            group_sequences[group].append((i, seq))
+
+        # Build a suffix frequency table for each group
+        group_suffix_freq = {}
+        group_sizes = {}
+        for group, seq_list in group_sequences.items():
+            group_sizes[group] = len(seq_list)
+            freq_by_year = [defaultdict(int) for _ in range(self.T)]
+
+            for _, seq in seq_list:
+                for t in range(self.T):
+                    suffix = tuple(seq[t:])
+                    freq_by_year[t][suffix] += 1
+
+            group_suffix_freq[group] = freq_by_year
+
+        # Compute the within-group convergence year for each individual
+        all_years = [None] * len(self.sequences)
+
+        for group, seq_list in group_sequences.items():
+            group_n = group_sizes[group]
+            group_freq = group_suffix_freq[group]
+
+            # Rarity matrix for this group
+            rarity_matrix = []
+            group_indices = []
+
+            for orig_idx, seq in seq_list:
+                group_indices.append(orig_idx)
+                score = []
+                for t in range(self.T):
+                    suffix = tuple(seq[t:])
+                    freq = group_freq[t][suffix] / group_n
+                    score.append(-np.log(freq + 1e-10))
+                rarity_matrix.append(score)
+
+            # Compute z-scores
+            rarity_df = pd.DataFrame(rarity_matrix)
+            rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+            rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
+
+            # Find the first convergence year
+            for i, orig_idx in enumerate(group_indices):
+                z = rarity_z.iloc[i]
+                year = None
+                for t in range(min_t - 1, max_t):
+                    vals = [z.iloc[t + k] for k in range(window)]
+                    if not np.all(np.isfinite(vals)):
+                        continue
+                    if inclusive:
+                        condition = all(v <= -z_threshold for v in vals)
+                    else:
+                        condition = all(v < -z_threshold for v in vals)
+                    if condition:
+                        year = int(t + 1)
+                        break
+
+                all_years[orig_idx] = year
+
+        return all_years
+
+    def compute_suffix_rarity_per_year(self, as_dataframe: bool = True, column_prefix: str = "t", zscore: bool = False):
+        """
+        Compute per-year suffix rarity scores for each individual.
+
+        Definition (mirror of prefix rarity):
+            rarity_{i,t} = -log( freq(suffix_{i,t}) / N ) >= 0
+
+        where suffix_{i,t} is the observed suffix from year t to the end for person i,
+        and N is the total number of individuals. Higher means rarer (less typical).
+
+        Parameters
+        ----------
+        as_dataframe : bool, default True
+            If True, returns a pandas DataFrame with columns f"{column_prefix}1"..f"{column_prefix}T".
+            If False, returns a NumPy array of shape (N, T).
+        column_prefix : str, default "t"
+            Column name prefix when returning a DataFrame.
+        zscore : bool, default False
+            If True, z-standardize the rarity scores column-wise (by year).
+
+        Returns
+        -------
+        pandas.DataFrame or np.ndarray
+            Per-year rarity scores (optionally z-scored).
+        """
+        N = len(self.sequences)
+        rarity_matrix = []
+
+        for seq in self.sequences:
+            score_list = []
+            for t in range(self.T):
+                suffix = tuple(seq[t:])
+                freq = self.suffix_freq_by_year[t][suffix] / N
+                score_list.append(-np.log(freq + 1e-10))
+            rarity_matrix.append(score_list)
+
+        rarity_arr = np.array(rarity_matrix, dtype=float)
+
+        if zscore:
+            col_means = np.nanmean(rarity_arr, axis=0)
+            col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)  # consistent with pandas DataFrame.std()
+            with np.errstate(invalid='ignore', divide='ignore'):
+                rarity_arr = (rarity_arr - col_means) / col_stds
+
+        if not as_dataframe:
+            return rarity_arr
+
+        columns = [f"{column_prefix}{t+1}" for t in range(self.T)]
+        return pd.DataFrame(rarity_arr, columns=columns)
+
+    def compute_suffix_rarity_score(self):
+        """
+        Compute the cumulative suffix rarity score for each individual:
+            rarity_score_i = sum_{t=1}^{T} -log( freq(suffix_{i,t}) / N )
+
+        Higher scores indicate rarer, less typical future paths from each year onward.
+        """
+        rarity_scores = []
+        N = len(self.sequences)
+
+        for seq in self.sequences:
+            score = 0.0
+            for t in range(self.T):
+                suffix = tuple(seq[t:])
+                freq = self.suffix_freq_by_year[t][suffix] / N
+                score += -np.log(freq + 1e-10)
+            rarity_scores.append(score)
+        return rarity_scores
+
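As a worked instance of the formula (an editor's illustration, not from the diff): with N = 4 individuals, a year whose suffix is shared by two of them has freq = 2/4 = 0.5 and contributes -ln(0.5) ≈ 0.693, while a unique suffix has freq = 0.25 and contributes -ln(0.25) ≈ 1.386; each individual's score is the sum of these contributions over t = 1..T (the implementation adds 1e-10 inside the log for numerical safety).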
+    def compute_standardized_rarity_score(self, min_t=1, max_t=None, window=1):
+        """
+        Compute standardized rarity scores for convergence classification and visualization
+        using true statistical z-scores.
+
+        This method computes the standardized rarity score used for individual-level
+        convergence classification:
+            standardized_score_i = min over t of ( max_{k=0..window-1} z_{i,t+k} )
+
+        where z_{i,t} are the year-wise true z-scores of suffix rarity computed column-wise
+        across individuals with sample standard deviation (ddof=1):
+            z_{i,t} = (x_{i,t} - mean_t) / std_t
+
+        The standardized scores can be used with a threshold (e.g., z <= -1.5) to classify
+        individuals as converged/not converged, and are particularly useful for visualization.
+
+        Note: For convergence (suffix tree), we look for LOW rarity (more typical patterns),
+        so lower z-scores indicate convergence. This is the opposite of prefix tree divergence.
+
+        Parameters
+        ----------
+        min_t : int, default 1
+            Minimum year (1-indexed) after which convergence is considered valid.
+        max_t : int, optional
+            Maximum year (1-indexed) before which convergence is considered valid.
+            If None, uses T - window + 1.
+        window : int, default 1
+            Number of consecutive low-z years required.
+
+        Returns
+        -------
+        List[float]
+            Standardized rarity scores for each individual. Values <= -z_threshold indicate convergence.
+
+        Notes
+        -----
+        The standardization uses sample standard deviation (ddof=1) for each year column,
+        which is consistent with pandas' default behavior for DataFrame.std().
+        This is essentially the z-score normalized version of suffix rarity scores.
+        For convergence detection, we look for the MINIMUM z-score (most typical behavior).
+        """
+        if max_t is None:
+            max_t = self.T - window + 1
+
+        N = len(self.sequences)
+
+        # Step 1: Calculate rarity matrix
+        rarity_matrix = []
+        for seq in self.sequences:
+            score = []
+            for t in range(self.T):
+                suffix = tuple(seq[t:])
+                freq = self.suffix_freq_by_year[t][suffix] / N
+                score.append(-np.log(freq + 1e-10))
+            rarity_matrix.append(score)
+
+        # Step 2: Column-wise true z-score standardization (by year, ddof=1)
+        rarity_arr = np.asarray(rarity_matrix, dtype=float)
+        col_means = np.nanmean(rarity_arr, axis=0)
+        col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
+        with np.errstate(invalid='ignore', divide='ignore'):
+            rarity_z = (rarity_arr - col_means) / col_stds
+        # Keep NaN for zero-variance years to allow window skipping downstream
+        rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
+
+        # Step 3: Compute the standardized rarity score for each individual
+        standardized_scores = []
+        for i in range(N):
+            z_scores = rarity_z[i, :]
+            candidate_values = []
+
+            # For each possible starting time t
+            for t in range(min_t - 1, max_t):  # min_t-1 for 0-indexed
+                vals = [z_scores[t + k] for k in range(window)]
+                # Skip windows containing NaN (e.g., zero-variance years)
+                if not np.all(np.isfinite(vals)):
+                    continue
+                # For convergence, take the maximum within the window (all values finite)
+                window_max = float(np.max(vals))
+                candidate_values.append(window_max)
+
+            # Take the minimum across all starting times (most convergent period)
+            if candidate_values:
+                standardized_score = float(np.min(candidate_values))
+            else:
+                standardized_score = np.nan
+
+            standardized_scores.append(standardized_score)
+
+        return standardized_scores
+
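A short sketch of how the aggregated score condenses the per-year z-scores (an editor's illustration, not part of the diff):

    scores = conv.compute_standardized_rarity_score(window=2)
    # For individual i: slide a length-2 window over the year-wise rarity
    # z-scores, take the max within each window, then the min across all
    # window positions. scores[i] <= -1.5 would mark i as converged under
    # the default z-threshold convention.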
+    def compute_converged_by_top_proportion(
+        self,
+        group_labels,
+        proportion: float = 0.10,
+        min_t: int = 1,
+        max_t: Optional[int] = None,
+        window: int = 1,
+        min_count: int = 1,
+    ):
+        """
+        Classify convergence by selecting the top proportion of most typical (smallest) standardized
+        scores WITHIN EACH GROUP (e.g., country). This ensures identical proportion thresholds across
+        groups, independent of distribution shape or discreteness.
+
+        Steps:
+        1) Compute the true-z standardized rarity aggregated score per individual using
+           `compute_standardized_rarity_score(min_t, max_t, window)`.
+        2) For each group g, sort scores ascending and select the first k = max(min_count, floor(p*n_g))
+           indices as convergers.
+
+        Parameters
+        ----------
+        group_labels : Sequence
+            Group label per individual (e.g., country). Length must equal the number of sequences.
+        proportion : float, default 0.10
+            Top p proportion to mark as converged within each group (0 < p < 1).
+        min_t : int, default 1
+            Minimum year considered in the aggregated score.
+        max_t : Optional[int], default None
+            Maximum starting year considered; if None, uses T - window + 1.
+        window : int, default 1
+            Number of consecutive years in the aggregated statistic.
+        min_count : int, default 1
+            Minimum number selected per group (useful for very small groups).
+
+        Returns
+        -------
+        tuple[List[int], dict]
+            (flags, info) where flags is a 0/1 list for convergence, and info is per-group metadata:
+            {group: {"k": int, "n": int, "threshold_value": float}}
+        """
+        if not (0 < float(proportion) < 1):
+            raise ValueError(f"proportion must be in (0,1), got {proportion}")
+
+        N = len(self.sequences)
+        if len(group_labels) != N:
+            raise ValueError("Length of group_labels must match number of sequences")
+
+        # 1) Compute aggregated standardized score (lower = more typical)
+        scores = np.asarray(self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float)
+
+        labels = np.asarray(group_labels)
+        flags = np.zeros(N, dtype=int)
+        info = {}
+
+        # Iterate groups deterministically by sorted group name for reproducibility
+        for g in sorted(pd.unique(labels)):
+            idx = np.where(labels == g)[0]
+            vals = scores[idx]
+
+            n_g = len(idx)
+            if n_g == 0:
+                info[g] = {"k": 0, "n": 0, "threshold_value": np.nan}
+                continue
+
+            # Determine k with lower bound min_count and upper bound n_g
+            k = int(np.floor(proportion * n_g))
+            if k < min_count:
+                k = min_count
+            if k > n_g:
+                k = n_g
+
+            # Treat NaN as worst (push to the end); still allow exact k selection
+            order_vals = np.where(np.isfinite(vals), vals, np.inf)
+            order = np.argsort(order_vals, kind="mergesort")  # stable for tie-breaking
+
+            if k >= 1:
+                selected_local = order[:k]
+                selected_global = idx[selected_local]
+                flags[selected_global] = 1
+                kth_val = vals[order[k - 1]]
+                kth_val = float(kth_val) if np.isfinite(kth_val) else np.nan
+            else:
+                selected_local = np.array([], dtype=int)
+                kth_val = np.nan
+
+            info[g] = {"k": int(k), "n": int(n_g), "threshold_value": kth_val}
+
+        return flags.tolist(), info
+
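A minimal sketch of the per-group selection (an editor's illustration with hypothetical labels, not part of the diff):

    labels = ["IN", "IN", "US", "US"]
    flags, info = conv.compute_converged_by_top_proportion(
        group_labels=labels, proportion=0.10, min_count=1
    )
    # With n_g = 2 per group, k = max(min_count, floor(0.10 * 2)) = 1, so the
    # single most typical individual in each group is flagged; info records
    # {"IN": {"k": 1, "n": 2, "threshold_value": ...}, "US": {...}}.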
+    def diagnose_convergence_calculation(self, z_threshold=1.5, max_t=None, window=1, inclusive=False, group_labels=None):
+        """
+        Diagnostic function to analyze the convergence year calculation and identify
+        years with insufficient variance (std ≈ 0) that cannot trigger convergence.
+
+        This is methodologically appropriate: when all individuals follow similar
+        trajectories in a given year, no convergence should be detected.
+
+        Returns
+        -------
+        dict
+            Diagnostic information including:
+            - years_with_zero_variance: list of years where std ≈ 0
+            - rarity_std_by_year: standard deviation of rarity scores per year
+            - n_individuals_with_convergence: count of individuals with any convergence
+            - convergence_year_distribution: value counts of convergence years
+        """
+        if max_t is None:
+            max_t = self.T - window + 1
+
+        N = len(self.sequences)
+        rarity_matrix = []
+
+        # Calculate rarity scores (same as in compute_first_convergence_year)
+        for seq in self.sequences:
+            score = []
+            for t in range(self.T):
+                suffix = tuple(seq[t:])
+                freq = self.suffix_freq_by_year[t][suffix] / N
+                score.append(-np.log(freq + 1e-10))
+            rarity_matrix.append(score)
+
+        rarity_df = pd.DataFrame(rarity_matrix)
+
+        # Calculate standard deviations by year
+        rarity_std_by_year = rarity_df.std(axis=0)
+        years_with_zero_variance = []
+
+        # Identify years with near-zero variance (threshold can be adjusted)
+        for t, std_val in enumerate(rarity_std_by_year):
+            if pd.isna(std_val) or std_val < 1e-10:
+                years_with_zero_variance.append(t + 1)  # 1-indexed
+
+        # Calculate z-scores
+        rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+
+        # Count individuals with convergence
+        convergence_years = self.compute_first_convergence_year(
+            z_threshold=z_threshold, min_t=1, max_t=max_t, window=window,
+            inclusive=inclusive, group_labels=group_labels
+        )
+        n_individuals_with_convergence = sum(1 for year in convergence_years if year is not None)
+
+        # Distribution of convergence years
+        convergence_year_counts = pd.Series(convergence_years).value_counts(dropna=False).sort_index()
+
+        return {
+            'years_with_zero_variance': years_with_zero_variance,
+            'rarity_std_by_year': rarity_std_by_year.tolist(),
+            'n_individuals_with_convergence': n_individuals_with_convergence,
+            'convergence_year_distribution': convergence_year_counts.to_dict(),
+            'total_individuals': N,
+            'parameters_used': {
+                'z_threshold': z_threshold,
+                'max_t': max_t,
+                'window': window,
+                'inclusive': inclusive,
+                'group_labels': group_labels is not None
+            }
+        }
+
+    def compute_path_uniqueness(self):
+        """
+        Count, for each individual, how many years t their suffix (from t to the end)
+        is unique in the population (frequency == 1). Uses suffix-based logic.
+        """
+        uniqueness_scores = []
+        for seq in self.sequences:
+            count = 0
+            for t in range(self.T):
+                suffix = tuple(seq[t:])
+                if self.suffix_freq_by_year[t][suffix] == 1:
+                    count += 1
+            uniqueness_scores.append(count)
+        return uniqueness_scores
+
+
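Putting the pieces together, a compact end-to-end sketch (an editor's illustration, not part of the diff; it assumes the file shown here is importable as sequenzo.suffix_tree.individual_level_indicators):

    from sequenzo.suffix_tree.individual_level_indicators import IndividualConvergence

    seqs = [[1, 1, 2, 2], [1, 1, 2, 2], [1, 2, 2, 2], [1, 3, 3, 3]]
    conv = IndividualConvergence(seqs)
    rarity = conv.compute_suffix_rarity_score()           # cumulative rarity per individual
    flags = conv.compute_converged(method="quantile", quantile_p=0.25)
    years = conv.compute_first_convergence_year()         # 1-indexed year or None
    unique = conv.compute_path_uniqueness()               # count of population-unique suffixes
    report = conv.diagnose_convergence_calculation(window=1)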
963
+ def plot_suffix_rarity_distribution(
964
+ data,
965
+ # === Core Parameters ===
966
+ group_names=None,
967
+ colors=None,
968
+ # === Threshold Settings ===
969
+ show_threshold=True,
970
+ threshold_method="top_proportion", # Changed default to top_proportion
971
+ proportion_p=0.07, # Simplified parameter name, default 7%
972
+ # === Plotting Options ===
973
+ figsize=(10, 6),
974
+ kde_bw=None,
975
+ # === Export Options ===
976
+ save_as=None,
977
+ dpi=300,
978
+ show=True,
979
+ # === Parameters for Different Methods ===
980
+ z_threshold=1.5,
981
+ is_standardized_score=False,
982
+ quantile_p=0.10
983
+ ):
984
+ """
985
+ Plot suffix rarity score distribution(s) with clean threshold lines.
986
+
987
+ Parameters
988
+ ----------
989
+ data : dict or array-like
990
+ Data to plot. If dict: {"group1": scores1, "group2": scores2}
991
+ If array-like: single group data
992
+ group_names : list, optional
993
+ Custom group names. Auto-detected from dict keys if not provided
994
+ colors : dict or list, optional
995
+ Colors for groups. If None, uses default palette
996
+
997
+ show_threshold : bool, default True
998
+ Whether to show threshold vertical lines
999
+ threshold_method : str, default "top_proportion"
1000
+ Threshold method:
1001
+ - "top_proportion": Select top proportion_p% most extreme values
1002
+ - "quantile": Use quantile_p percentile
1003
+ - "zscore": Use z-score threshold (for standardized data)
1004
+ proportion_p : float, default 0.05
1005
+ Proportion for top_proportion method (e.g., 0.05 = top 5%)
1006
+
1007
+ figsize : tuple, default (10, 6)
1008
+ Figure size (width, height)
1009
+ kde_bw : float, optional
1010
+ KDE bandwidth adjustment. If None, uses seaborn default
1011
+
1012
+ save_as : str, optional
1013
+ Save path (without extension)
1014
+ dpi : int, default 300
1015
+ Resolution for saved figure
1016
+ show : bool, default True
1017
+ Whether to display plot
1018
+
1019
+ Returns
1020
+ -------
1021
+ dict
1022
+ Statistics including threshold values per group
1023
+
1024
+ Examples
1025
+ --------
1026
+ # Basic usage - top 5% threshold (default)
1027
+ >>> plot_suffix_rarity_distribution({"India": india_scores, "US": us_scores})
1028
+
1029
+ # Custom threshold proportion
1030
+ >>> plot_suffix_rarity_distribution(
1031
+ ... data={"India": india_scores, "US": us_scores},
1032
+ ... proportion_p=0.03, # top 3%
1033
+ ... save_as="rarity_comparison"
1034
+ ... )
1035
+
1036
+ # Quantile-based threshold
1037
+ >>> plot_suffix_rarity_distribution(
1038
+ ... data={"India": india_scores, "US": us_scores},
1039
+ ... threshold_method="quantile",
1040
+ ... quantile_p=0.10, # 10th percentile
1041
+ ... )
1042
+
1043
+ # Clean plot without thresholds
1044
+ >>> plot_suffix_rarity_distribution(
1045
+ ... data,
1046
+ ... show_threshold=False,
1047
+ ... colors={"India": "#E8B88A", "US": "#A3BFD9"}
1048
+ ... )
1049
+ """
+     import matplotlib.pyplot as plt
+     import seaborn as sns
+     import numpy as np
+
+     # Process input data
+     if isinstance(data, dict):
+         # Multi-group case
+         groups = data
+         if group_names is None:
+             group_names = list(groups.keys())
+     else:
+         # Single-group case
+         if group_names is None:
+             group_names = ["Group"]
+         groups = {group_names[0]: data}
+
+     # Set up colors
+     if colors is None:
+         default_colors = ["#A3BFD9", "#E8B88A", "#C6A5CF", "#A6C1A9", "#F4A460", "#87CEEB"]
+         color_map = dict(zip(group_names, default_colors[:len(group_names)]))
+     elif isinstance(colors, dict):
+         color_map = colors
+     else:
+         color_map = dict(zip(group_names, colors))
+
+     # Normalize the method name; several aliases are accepted for backward compatibility
+     threshold_method = (threshold_method or "top_proportion").lower()
+     # Minimum number of observations the rank-based method must select
+     topk_min_count = 1
+
+     stats = {"per_group": {}, "threshold_method": threshold_method}
+
+     # Validate quantiles if needed
+     def _check_q(q: float):
+         if not (0 < float(q) < 1):
+             raise ValueError(f"quantile must be in (0,1), got {q}")
+
+     if threshold_method == "quantile":
+         _check_q(quantile_p)
+         quantiles_to_draw = [quantile_p]
+         # Per-group quantile(s)
+         for g in group_names:
+             if g in groups:
+                 arr = np.asarray(groups[g], dtype=float)
+                 # Compute requested quantiles with NaN handling
+                 valid = arr[~np.isnan(arr)]
+                 thresholds_g = {}
+                 if valid.size > 0:
+                     for q in quantiles_to_draw:
+                         try:
+                             xq = float(np.nanquantile(arr, q))
+                         except Exception:
+                             xq = float(np.quantile(valid, q))
+                         thresholds_g[f"p{int(round(q * 100)):02d}"] = xq
+                 else:
+                     for q in quantiles_to_draw:
+                         thresholds_g[f"p{int(round(q * 100)):02d}"] = np.nan
+                 # Primary threshold (for backward compatibility)
+                 primary_label = f"p{int(round(quantile_p * 100)):02d}"
+                 primary_value = thresholds_g.get(primary_label, np.nan)
+                 # Proportion of observations at or below the primary threshold
+                 prop_below = float(np.nanmean(valid <= primary_value)) if valid.size > 0 and not np.isnan(primary_value) else np.nan
+                 stats["per_group"][g] = {
+                     "threshold_values": thresholds_g,
+                     "is_group_relative": True,
+                     "threshold_value": primary_value,
+                     "primary_quantile": primary_label,
+                     "prop_below": prop_below
+                 }
+     elif threshold_method in {"zscore", "z"}:
+         # z-score method (backward compatibility)
+         for g in group_names:
+             if g in groups:
+                 arr = np.asarray(groups[g], dtype=float)
+                 mean_g = np.nanmean(arr)
+                 std_g = np.nanstd(arr, ddof=1)  # sample std, to match pandas
+                 if is_standardized_score:
+                     x_thresh_g = -float(z_threshold)
+                 else:
+                     x_thresh_g = float(mean_g - z_threshold * std_g)
+                 vals = arr[~np.isnan(arr)]
+                 prop_below = float(np.nanmean(vals <= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else np.nan
+                 stats["per_group"][g] = {
+                     "mean": float(mean_g),
+                     "std": float(std_g),
+                     "threshold_value": float(x_thresh_g),
+                     "z_threshold": float(z_threshold),
+                     "is_group_relative": True,
+                     "prop_below": prop_below,
+                     "num_below": int(np.sum(vals <= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else 0,
+                     "n": int(vals.size)
+                 }
+     elif threshold_method in {"topk", "top_proportion", "proportion", "rank"}:
+         # Rank-based selection within each group: pick the top p% (smallest values)
+         if not (0 < float(proportion_p) < 1):
+             raise ValueError(f"proportion_p must be in (0,1), got {proportion_p}")
+         for g in group_names:
+             if g in groups:
+                 arr = np.asarray(groups[g], dtype=float)
+                 vals = arr[np.isfinite(arr)]
+                 n_valid = int(vals.size)
+                 if n_valid == 0:
+                     stats["per_group"][g] = {
+                         "threshold_value": np.nan,
+                         "k": 0,
+                         "n": 0,
+                         "prop_selected": np.nan,
+                         "num_leq_threshold": 0
+                     }
+                     continue
+                 # k = floor(p * n), clipped to [topk_min_count, n_valid]
+                 k = int(np.floor(proportion_p * n_valid))
+                 k = max(k, int(topk_min_count))
+                 k = min(k, n_valid)
+                 # Sort ascending (most typical first); a stable sort keeps ties deterministic
+                 order = np.argsort(vals, kind="mergesort")
+                 thresh_val = vals[order[k - 1]]
+                 num_leq = int(np.sum(vals <= thresh_val)) if np.isfinite(thresh_val) else 0
+                 stats["per_group"][g] = {
+                     "threshold_value": float(thresh_val) if np.isfinite(thresh_val) else np.nan,
+                     "k": int(k),
+                     "n": int(n_valid),
+                     "prop_selected": k / n_valid,
+                     "num_leq_threshold": num_leq
+                 }
+         stats["threshold_method"] = "topk"
+     else:
+         raise ValueError(f"Unknown threshold_method: {threshold_method}")
+
+     # Create plot
+     plt.figure(figsize=figsize)
+
+     # Plot distributions
+     for group_name in group_names:
+         if group_name in groups:
+             arr = np.asarray(groups[group_name], dtype=float)
+             color = color_map.get(group_name, "#1f77b4")
+             vmin = np.nanmin(arr) if np.isfinite(arr).any() else None
+             vmax = np.nanmax(arr) if np.isfinite(arr).any() else None
+             kde_kwargs = {"label": group_name, "fill": True, "color": color, "linewidth": 2}
+             if kde_bw is not None:
+                 kde_kwargs["bw_adjust"] = kde_bw
+             if vmin is not None and vmax is not None and vmin < vmax:
+                 # Clip the KDE to the observed range to avoid spurious tails
+                 kde_kwargs["clip"] = (vmin, vmax)
+             sns.kdeplot(arr, **kde_kwargs)
+
+     # Add per-group threshold lines if requested (color-matched, no text labels)
+     if show_threshold:
+         for g in group_names:
+             if g in stats["per_group"]:
+                 color = color_map.get(g, "#1f77b4")
+                 if threshold_method == "quantile":
+                     # Draw one line per requested quantile
+                     for _, xg in sorted(stats["per_group"][g]["threshold_values"].items()):
+                         if np.isnan(xg):
+                             continue
+                         plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+                 else:  # zscore or top_proportion
+                     xg = stats["per_group"][g]["threshold_value"]
+                     if np.isfinite(xg):
+                         plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+
+     # Formatting
+     if is_standardized_score:
+         plt.xlabel("Standardized Suffix Rarity Score", fontsize=13)
+     else:
+         plt.xlabel("Suffix Rarity Score", fontsize=13)
+     plt.ylabel("Density", fontsize=13)
+     if len(group_names) > 1:
+         plt.legend(title="Group")
+     sns.despine()
+     plt.tight_layout()
+
+     # Save and show
+     if save_as:
+         plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')
+
+     if show:
+         plt.show()
+
+     return stats
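+
+
+ # Hedged usage sketch (illustrative, not part of the library): the returned
+ # `stats` dict exposes each group's cutoff, which can be used to flag the
+ # individuals in the lower tail that the dashed line marks. `india_scores` is
+ # assumed to be a 1D array of suffix rarity scores.
+ #
+ # >>> stats = plot_suffix_rarity_distribution({"India": india_scores}, show=False)
+ # >>> cutoff = stats["per_group"]["India"]["threshold_value"]
+ # >>> flagged = [s for s in india_scores if s <= cutoff]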
+
+
+ def plot_individual_indicators_correlation(
+     df,
+     indicator_columns=None,
+     correlation_method='pearson',
+     group_column=None,
+     figsize=(10, 8),
+     cmap='RdBu_r',
+     center=0,
+     annot=True,
+     fmt='.2f',
+     save_as=None,
+     dpi=300,
+     show=True
+ ):
+     """
+     Plot a correlation heatmap of individual-level indicators.
+
+     Parameters:
+     -----------
+     df : pandas.DataFrame
+         DataFrame containing individual-level indicators
+     indicator_columns : list, optional
+         List of column names to include in the correlation analysis.
+         If None, automatically detects indicator columns (converged,
+         first_convergence_year, suffix_rarity_score, path_uniqueness, etc.)
+     correlation_method : str, default='pearson'
+         Correlation method: 'pearson', 'spearman', 'kendall'
+     group_column : str, optional
+         Column name for grouping (e.g., 'country'). If provided, shows a
+         separate heatmap for each group
+     figsize : tuple, default=(10, 8)
+         Figure size (width, height)
+     cmap : str, default='RdBu_r'
+         Colormap for the heatmap. Options: 'RdBu_r', 'coolwarm', 'viridis', 'plasma'
+     center : float, default=0
+         Value at which to center the colormap
+     annot : bool, default=True
+         Whether to annotate cells with correlation values
+     fmt : str, default='.2f'
+         Format for annotations
+     save_as : str, optional
+         Path to save the figure (without extension)
+     dpi : int, default=300
+         DPI for saving
+     show : bool, default=True
+         Whether to display the plot
+
+     Returns:
+     --------
+     dict: Correlation matrix/matrices and summary statistics
+
+     Example:
+     --------
+     # Basic usage
+     >>> plot_individual_indicators_correlation(df)
+
+     # Custom indicators with grouping
+     >>> plot_individual_indicators_correlation(
+     ...     df,
+     ...     indicator_columns=['converged', 'suffix_rarity_score', 'path_uniqueness'],
+     ...     group_column='country',
+     ...     correlation_method='spearman'
+     ... )
+
+     # Custom styling
+     >>> plot_individual_indicators_correlation(
+     ...     df,
+     ...     cmap='plasma',
+     ...     figsize=(12, 10),
+     ...     save_as="indicators_correlation_heatmap"
+     ... )
+     """
+     import matplotlib.pyplot as plt
+     import seaborn as sns
+     import pandas as pd
+     import numpy as np
+
+     # Auto-detect indicator columns if not provided
+     if indicator_columns is None:
+         # Common individual-level indicator names (convergence-focused)
+         potential_indicators = [
+             'converged', 'first_convergence_year', 'convergence_year',
+             'suffix_rarity_score', 'path_uniqueness', 'rarity_score', 'uniqueness_score'
+         ]
+         indicator_columns = [col for col in df.columns if col in potential_indicators]
+
+         # Also include numeric columns that look like indicators
+         numeric_cols = df.select_dtypes(include=[np.number]).columns
+         for col in numeric_cols:
+             if col not in indicator_columns and any(
+                 keyword in col.lower() for keyword in
+                 ['score', 'index', 'count', 'factor', 'rate', 'ratio']
+             ):
+                 indicator_columns.append(col)
+
+     # Filter and clean data
+     df_indicators = df[indicator_columns].copy()
+
+     # Coerce object columns to numeric where possible
+     for col in df_indicators.columns:
+         if df_indicators[col].dtype == 'object':
+             df_indicators[col] = pd.to_numeric(df_indicators[col], errors='coerce')
+
+     # Drop columns with more than 50% missing values
+     valid_cols = []
+     for col in df_indicators.columns:
+         if df_indicators[col].notna().sum() / len(df_indicators) > 0.5:
+             valid_cols.append(col)
+
+     df_indicators = df_indicators[valid_cols]
+
+     # Drop rows with any missing values for the correlation calculation
+     df_clean = df_indicators.dropna()
+
+     if len(df_clean) == 0:
+         raise ValueError("No valid data remaining after cleaning. Check for missing values.")
+
+     # Calculate correlations
+     results = {}
+
+     if group_column is None or group_column not in df.columns:
+         # Single correlation matrix
+         corr_matrix = df_clean.corr(method=correlation_method)
+         results['overall'] = corr_matrix
+
+         # Create plot
+         plt.figure(figsize=figsize)
+
+         # Mask the upper triangle for a cleaner look
+         mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+
+         # Generate heatmap
+         sns.heatmap(
+             corr_matrix,
+             mask=mask,
+             annot=annot,
+             fmt=fmt,
+             cmap=cmap,
+             center=center,
+             square=True,
+             cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"},
+             linewidths=0.5
+         )
+
+         plt.title(f"Individual-Level Indicators Correlation Heatmap\n({correlation_method.title()} Correlation)",
+                   fontsize=14, pad=20)
+         plt.xticks(rotation=45, ha='right')
+         plt.yticks(rotation=0)
+
+     else:
+         # One correlation matrix per group
+         groups = df[group_column].unique()
+         n_groups = len(groups)
+
+         # Calculate subplot layout
+         if n_groups <= 2:
+             nrows, ncols = 1, n_groups
+             figsize = (figsize[0] * n_groups, figsize[1])
+         else:
+             ncols = min(3, n_groups)
+             nrows = (n_groups + ncols - 1) // ncols
+             figsize = (figsize[0] * ncols, figsize[1] * nrows)
+
+         fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
+         # Normalize to a flat array of Axes regardless of layout
+         axes = np.atleast_1d(axes).ravel()
+
+         for i, group in enumerate(groups):
+             # Use the cleaned column set so groups match the overall analysis
+             group_data = df[df[group_column] == group][valid_cols].dropna()
+
+             if len(group_data) < 2:
+                 print(f"Warning: Group '{group}' has insufficient data for correlation")
+                 continue
+
+             corr_matrix = group_data.corr(method=correlation_method)
+             results[group] = corr_matrix
+
+             # Mask the upper triangle
+             mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+
+             # Plot heatmap
+             sns.heatmap(
+                 corr_matrix,
+                 mask=mask,
+                 annot=annot,
+                 fmt=fmt,
+                 cmap=cmap,
+                 center=center,
+                 square=True,
+                 cbar=i == 0,  # Only show the colorbar for the first subplot
+                 cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"} if i == 0 else {},
+                 linewidths=0.5,
+                 ax=axes[i]
+             )
+
+             axes[i].set_title(f"{group}\n({len(group_data)} individuals)", fontsize=12)
+             axes[i].set_xticks(axes[i].get_xticks())
+             axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45, ha='right')
+             axes[i].set_yticks(axes[i].get_yticks())
+             axes[i].set_yticklabels(axes[i].get_yticklabels(), rotation=0)
+
+         # Hide unused subplots
+         for j in range(n_groups, len(axes)):
+             axes[j].set_visible(False)
+
+         plt.suptitle(f"Individual-Level Indicators Correlation by {group_column.title()}\n({correlation_method.title()} Correlation)",
+                      fontsize=16, y=0.98)
+
+     plt.tight_layout()
+
+     # Save and show
+     if save_as:
+         plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')
+
+     if show:
+         plt.show()
+
+     # Add summary statistics
+     if group_column is None or group_column not in df.columns:
+         sample_size = len(df_clean)
+     else:
+         sizes = {}
+         for g in df[group_column].unique():
+             g_clean = df[df[group_column] == g][valid_cols].apply(pd.to_numeric, errors='coerce').dropna()
+             sizes[g] = len(g_clean)
+         sample_size = sizes
+
+     results['summary'] = {
+         'method': correlation_method,
+         'n_indicators': len(valid_cols),
+         'indicators_included': valid_cols,
+         'sample_size': sample_size
+     }
+
+     return results
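+
+
+ # Hedged usage sketch (illustrative, not part of the library): inspecting the
+ # returned dict. Assumes `df` holds one row per individual with indicator
+ # columns as described above.
+ #
+ # >>> res = plot_individual_indicators_correlation(df, show=False)
+ # >>> res['summary']['indicators_included']   # columns that survived cleaning
+ # >>> res['overall']                          # pandas DataFrame of correlations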
+
+
+ def compute_quantile_thresholds_by_group(scores, group_labels, quantiles=None):
+     """
+     Compute per-group quantile thresholds for a 1D array of scores.
+
+     Parameters
+     ----------
+     scores : array-like of shape (N,)
+         Scores (e.g., standardized rarity) aligned with labels.
+     group_labels : array-like of shape (N,)
+         Group label per observation.
+     quantiles : Optional[List[float]]
+         Quantiles to compute (e.g., [0.10]). Defaults to [0.10].
+
+     Returns
+     -------
+     dict
+         {group: {"p10": value, ...}}
+     """
+     if quantiles is None:
+         quantiles = [0.10]
+     arr = np.asarray(scores, dtype=float)
+     labels = np.asarray(group_labels)
+     result = {}
+     for g in pd.unique(labels):
+         vals = arr[labels == g]
+         vals = vals[~np.isnan(vals)]
+         thresholds_g = {}
+         for q in quantiles:
+             key = f"p{int(round(q * 100)):02d}"
+             thresholds_g[key] = float(np.nanquantile(vals, q)) if vals.size > 0 else np.nan
+         result[g] = thresholds_g
+     return result
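+
+
+ # Hedged usage sketch (illustrative data, not from the package):
+ #
+ # >>> scores = [0.2, 0.5, 0.9, 1.4, 0.1, 0.7]
+ # >>> labels = ["A", "A", "A", "B", "B", "B"]
+ # >>> compute_quantile_thresholds_by_group(scores, labels, quantiles=[0.10, 0.50])
+ # {'A': {'p10': ..., 'p50': 0.5}, 'B': {'p10': ..., 'p50': 0.7}}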
+
+
+ def compute_quantile_thresholds_by_group_year(scores, group_labels, year_labels, quantiles=None, min_group_year_size=30):
+     """
+     Compute quantile thresholds by group x year for time-drifting distributions.
+
+     Parameters
+     ----------
+     scores : array-like of shape (N,)
+         Scores aligned with labels.
+     group_labels : array-like of shape (N,)
+         Group label per observation.
+     year_labels : array-like of shape (N,)
+         Year label per observation (int/str).
+     quantiles : Optional[List[float]]
+         Quantiles to compute (e.g., [0.10]). Defaults to [0.10].
+     min_group_year_size : int, default 30
+         Minimum sample size for a group-year cell; smaller cells get NaN thresholds.
+
+     Returns
+     -------
+     dict
+         {group: {year: {"p10": value, ...}}}
+     """
+     if quantiles is None:
+         quantiles = [0.10]
+     df = pd.DataFrame({
+         "score": np.asarray(scores, dtype=float),
+         "group": np.asarray(group_labels),
+         "year": np.asarray(year_labels)
+     })
+     result = {}
+     for g, gdf in df.groupby("group"):
+         result[g] = {}
+         for y, ydf in gdf.groupby("year"):
+             vals = ydf["score"].to_numpy()
+             vals = vals[~np.isnan(vals)]
+             thresholds_gy = {}
+             for q in quantiles:
+                 key = f"p{int(round(q * 100)):02d}"
+                 thresholds_gy[key] = float(np.nanquantile(vals, q)) if vals.size >= min_group_year_size else np.nan
+             result[g][y] = thresholds_gy
+     return result
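+
+
+ # Hedged usage sketch (illustrative): with the default min_group_year_size=30,
+ # small demo cells would all come back NaN, so lower it for toy data. The
+ # group/year keys below are assumptions, not values from the package.
+ #
+ # >>> thresholds = compute_quantile_thresholds_by_group_year(
+ # ...     scores, groups, years, quantiles=[0.10], min_group_year_size=5
+ # ... )
+ # >>> thresholds["India"][2010]["p10"]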
+
+
+ def compute_path_uniqueness_by_group_suffix(sequences, group_labels):
+     """
+     Compute path uniqueness within each subgroup defined by group_labels,
+     using the suffix-based approach. This is consistent with the convergence
+     module's suffix-based logic.
+
+     :param sequences: List of sequences.
+     :param group_labels: List of group keys (same length as sequences), e.g., country, gender.
+     :return: List of path uniqueness scores (same order as input).
+     """
+     from collections import defaultdict
+
+     T = len(sequences[0])
+     df = pd.DataFrame({
+         "sequence": sequences,
+         "group": group_labels
+     })
+
+     # Step 1: Precompute suffix frequency tables per group
+     group_suffix_freq = {}
+     for group, group_df in df.groupby("group"):
+         suffix_freq = [defaultdict(int) for _ in range(T)]
+         for seq in group_df["sequence"]:
+             for t in range(T):
+                 suffix = tuple(seq[t:])  # suffix from year t to the end
+                 suffix_freq[t][suffix] += 1
+         group_suffix_freq[group] = suffix_freq
+
+     # Step 2: Count, per individual, the years whose suffix is unique in its group
+     uniqueness_scores = []
+     for seq, group in zip(sequences, group_labels):
+         suffix_freq = group_suffix_freq[group]
+         count = 0
+         for t in range(T):
+             suffix = tuple(seq[t:])
+             if suffix_freq[t][suffix] == 1:
+                 count += 1
+         uniqueness_scores.append(count)
+
+     return uniqueness_scores
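+
+
+ # Hedged usage sketch (illustrative data): the two sequences in group "A"
+ # differ in their final state, so all of their suffixes are distinct within
+ # the group and each scores T = 3; the identical pair in "B" scores 0.
+ #
+ # >>> seqs = [(1, 2, 3), (1, 2, 4), (5, 6, 7), (5, 6, 7)]
+ # >>> labels = ["A", "A", "B", "B"]
+ # >>> compute_path_uniqueness_by_group_suffix(seqs, labels)
+ # [3, 3, 0, 0]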
1623
+
1624
+
1625
+ # Provide a default version for backward compatibility
1626
+ def compute_path_uniqueness_by_group(sequences, group_labels):
1627
+ """
1628
+ Compute path uniqueness within each subgroup defined by group_labels.
1629
+
1630
+ This is the default version using suffix-based approach (convergence logic).
1631
+ For explicit control, use compute_path_uniqueness_by_group_suffix() or
1632
+ compute_path_uniqueness_by_group_prefix() from the prefix_tree module.
1633
+
1634
+ :param sequences: List of sequences.
1635
+ :param group_labels: List of group keys (same length as sequences), e.g., country, gender.
1636
+ :return: List of path uniqueness scores (same order as input).
1637
+ """
1638
+ return compute_path_uniqueness_by_group_suffix(sequences, group_labels)
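+
+
+ # Hedged note (illustrative, not part of the library): a suffix that is unique
+ # in the whole population is necessarily unique within the individual's own
+ # group, so group-level scores are elementwise >= the population-level ones.
+ # `tree` is an instance of the class above; `labels` is an assumed group vector.
+ #
+ # >>> pop = tree.compute_path_uniqueness()                        # population-wide
+ # >>> grp = compute_path_uniqueness_by_group(tree.sequences, labels)
+ # >>> all(g >= p for g, p in zip(grp, pop))
+ # True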