sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
sequenzo/suffix_tree/individual_level_indicators.py
@@ -0,0 +1,1679 @@
"""
Individual-level indicators for position-based suffix trees (level = time index from end).

This module provides per-sequence (per-individual) convergence and rarity measures when
the unit of analysis is TIME INDEX FROM END: level t = suffix (states from time t to end).
Lower suffix rarity means a more typical (common) ending pattern; convergence = low rarity.

Usage (position-based: list of sequences, same length)
------------------------------------------------------
from sequenzo import IndividualConvergence, extract_sequences

# sequences: list of lists, all same length T (e.g. from build_suffix_tree(..., mode="position").sequences)
sequences = extract_sequences(df, time_cols, id_col, states)
ind = IndividualConvergence(sequences)

# Per-year suffix rarity: (N x T) matrix or DataFrame. rarity_{i,t} = -log(freq(suffix_{i,t})/N)
rarity_df = ind.compute_suffix_rarity_per_year(as_dataframe=True, zscore=False)

# One score per individual: sum over t of rarity, or standardized (min over windows of max z)
scores = ind.compute_suffix_rarity_score()
std_scores = ind.compute_standardized_rarity_score(min_t=1, window=1)

# Binary convergence (0/1) and first convergence year (1-indexed, or None)
converged = ind.compute_converged(method="zscore", z_threshold=1.5, min_t=1, window=1)
first_year = ind.compute_first_convergence_year(method="zscore", z_threshold=1.5, min_t=1)

# Methods: "zscore" (window of low z, i.e. z < -z_threshold), "top_proportion" (bottom p% most typical), "quantile" (below quantile)
# With group_labels, top_proportion/quantile are applied within each group.

# Path uniqueness: count of time steps (from end) at which the suffix is unique (freq == 1)
uniqueness = ind.compute_path_uniqueness()

Spell-based (level = spell index from end) is in spell_individual_level_indicators.SpellIndividualConvergence;
use build_spell_suffix_tree(seqdata) then SpellIndividualConvergence(tree).

@Author : Yuqi Liang 梁彧祺
@File   : individual_level_indicators.py
@Time   : 08/08/2025 15:30
"""
from collections import defaultdict
from typing import Optional, List
import numpy as np
import pandas as pd


class IndividualConvergence:
    """
    Individual-level convergence and suffix rarity for position-based suffix trees.

    Input: sequences — a list of sequences (list of lists), all of the same length T.
    Each sequence is the list of states at time 1, 2, ..., T. Level t corresponds
    to the suffix (states from time t to end). Rarity at (i, t) is
    -log(freq(suffix_{i,t})/N); lower rarity = more typical ending.

    Main methods:
    - compute_suffix_rarity_per_year: (N x T) rarity matrix or DataFrame.
    - compute_suffix_rarity_score: one aggregated rarity score per individual (sum over t).
    - compute_standardized_rarity_score: z-based score for classification (lower = more typical).
    - compute_converged: binary 0/1 per individual (method: zscore, top_proportion, quantile).
    - compute_first_convergence_year: first year (1-indexed) at which converged, or None.
    - compute_path_uniqueness: count of time steps (from end) with a unique suffix per individual.
    - diagnose_convergence_calculation: diagnostic dict (variance by year, count converged, etc.).

    Plotting: plot_suffix_rarity_distribution, plot_individual_indicators_correlation (in this module).
    """

    def __init__(self, sequences):
        # Handle the case where `sequences` is already an IndividualConvergence object
        if isinstance(sequences, IndividualConvergence):
            # Extract sequences from the existing object
            self.sequences = sequences.sequences
        elif hasattr(sequences, 'sequences'):
            # Handle the case where the input is another object with a `sequences` attribute
            self.sequences = sequences.sequences
        else:
            # Normal case: sequences is a list of sequences
            self.sequences = sequences

        # Validate input
        if not self.sequences or len(self.sequences) == 0:
            raise ValueError("sequences cannot be empty")
        if not hasattr(self.sequences[0], '__len__') and not hasattr(self.sequences[0], '__iter__'):
            raise ValueError("sequences must be a list of sequences (e.g., [[1,2,3], [2,3,1], ...])")

        # Verify that all sequences have the same length, to avoid silent errors on ragged input
        L0 = len(self.sequences[0])
        if any(len(s) != L0 for s in self.sequences):
            raise ValueError("All sequences must have the same length")
        self.T = L0

        self.suffix_freq_by_year = self._build_suffix_frequencies()

    def _build_suffix_frequencies(self):
        """
        Build suffix frequencies for each year t.
        freq_by_year[t] maps each suffix (states from year t to the end) to the number
        of individuals sharing it.
        """
        freq_by_year = [defaultdict(int) for _ in range(self.T)]
        for seq in self.sequences:
            for t in range(self.T):
                suffix = tuple(seq[t:])  # suffix from year t to the end
                freq_by_year[t][suffix] += 1
        return freq_by_year
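
    # Editor's illustration (hypothetical input, not part of the shipped module):
    # for sequences = [[1, 2], [1, 2], [3, 2]] (N=3, T=2), the table built above is
    #     freq_by_year[0] == {(1, 2): 2, (3, 2): 1}   # full sequences
    #     freq_by_year[1] == {(2,): 3}                # one-state suffixes
    # so rarity at the first year is -log(2/3) ≈ 0.41 for the first two individuals
    # and -log(1/3) ≈ 1.10 for the third.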

    # Divergence-related computations are intentionally omitted in this convergence-focused module.

    def compute_converged(
        self,
        z_threshold=1.5,
        min_t=1,
        max_t=None,
        window=1,
        inclusive=False,
        group_labels=None,
        *,
        method: str = "zscore",
        proportion: Optional[float] = None,
        quantile_p: Optional[float] = None,
        min_count: int = 1,
    ):
        """
        Compute binary convergence flags with multiple selection methods.

        Definition (common intuition): lower suffix rarity implies more typical behavior.
        We compute per-year rarity via suffix frequencies and then detect convergence using
        one of the following methods:

        Methods
        -------
        - "zscore" (window-based, default):
            Uses per-year z-scores of rarity. A person is converged if there exists a window
            of length `window` starting between years `[min_t, max_t]` where all z-scores are
            below `-z_threshold` (use `inclusive=True` for `<=`). Zero-variance years remain
            NaN and any window containing NaN is skipped.

        - "top_proportion" (aka "topk"/"proportion"/"rank"):
            Uses the aggregated standardized score from `compute_standardized_rarity_score`
            (lower = more typical). Selects the most typical `proportion` within each group if
            `group_labels` is provided, otherwise globally. `min_count` ensures at least the
            specified number per group.

        - "quantile":
            Uses a quantile threshold (`quantile_p`) on the aggregated standardized score,
            within each group (or globally if no `group_labels`). Individuals at or below the
            threshold are marked converged.

        Parameters
        ----------
        z_threshold : float, default 1.5
            zscore method only. Converged when z < -z_threshold (or <= if inclusive=True).
        min_t, max_t : int
            Search interval for the starting year (1-indexed). If max_t is None, uses T - window + 1.
        window : int, default 1
            Number of consecutive years required in the zscore method; also used in the standardized aggregation.
        inclusive : bool, default False
            zscore method only. If True, use <= comparisons.
        group_labels : array-like or None
            If provided, proportion/quantile selections are computed within each group.
        method : str, default "zscore"
            One of {"zscore", "top_proportion" (aliases: "topk", "proportion", "rank"), "quantile"}.
        proportion : float or None
            For top_proportion. Fraction in (0, 1) to select as converged. Defaults to 0.10 if None.
        quantile_p : float or None
            For quantile. Quantile in (0, 1) used as threshold. Defaults to 0.10 if None.
        min_count : int, default 1
            For top_proportion. Lower bound for the number selected per group.

        Returns
        -------
        List[int]
            0/1 indicator for each individual.
        """
        if max_t is None:
            max_t = self.T - window + 1

        N = len(self.sequences)
        method_norm = (method or "zscore").lower()

        # Branch: rank/quantile-style selections using aggregated standardized scores
        if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
            p = proportion if proportion is not None else 0.10
            scores = np.asarray(
                self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float
            )
            if group_labels is None:
                vals = scores
                finite_mask = np.isfinite(vals)
                n_valid = int(np.sum(finite_mask))
                if n_valid == 0:
                    return [0] * N
                k = int(np.floor(p * n_valid))
                if k < int(min_count):
                    k = int(min_count)
                if k > n_valid:
                    k = n_valid
                order = np.argsort(np.where(np.isfinite(vals), vals, np.inf), kind="mergesort")
                flags = np.zeros(N, dtype=int)
                if k >= 1:
                    selected = order[:k]
                    flags[selected] = 1
                return flags.tolist()
            else:
                flags, _ = self.compute_converged_by_top_proportion(
                    group_labels=group_labels,
                    proportion=float(p),
                    min_t=min_t,
                    max_t=max_t,
                    window=window,
                    min_count=min_count,
                )
                return flags

        if method_norm == "quantile":
            q = quantile_p if quantile_p is not None else 0.10
            scores = np.asarray(
                self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float
            )
            flags = np.zeros(N, dtype=int)
            if group_labels is None:
                # Global quantile
                valid = scores[np.isfinite(scores)]
                if valid.size == 0:
                    return flags.tolist()
                try:
                    xq = float(np.nanquantile(scores, q))
                except Exception:
                    xq = float(np.quantile(valid, q))
                flags[np.where(scores <= xq)[0]] = 1
                return flags.tolist()
            else:
                labels = np.asarray(group_labels)
                for g in pd.unique(labels):
                    idx = np.where(labels == g)[0]
                    vals = scores[idx]
                    valid = vals[np.isfinite(vals)]
                    if valid.size == 0:
                        continue
                    try:
                        xq = float(np.nanquantile(vals, q))
                    except Exception:
                        xq = float(np.quantile(valid, q))
                    local = np.where(vals <= xq)[0]
                    flags[idx[local]] = 1
                return flags.tolist()

        # Default branch: z-score window logic (supports group or global frequencies)
        if group_labels is not None:
            # Within-group convergence: use within-group frequencies and sample sizes
            return self._compute_converged_by_group(z_threshold, min_t, max_t, window, inclusive, group_labels)

        # Compute rarity using global frequencies
        rarity_matrix = []
        for seq in self.sequences:
            score = []
            for t in range(self.T):
                suffix = tuple(seq[t:])
                freq = self.suffix_freq_by_year[t][suffix] / N
                score.append(-np.log(freq + 1e-10))
            rarity_matrix.append(score)

        rarity_df = pd.DataFrame(rarity_matrix)
        # Column-wise z-standardization; keep NaN (zero-variance years) so that
        # windows containing NaN can be skipped during detection below
        rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
        rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)

        flags = []
        for i in range(N):
            z = rarity_z.iloc[i]
            converged = 0
            for t in range(min_t - 1, max_t):  # min_t - 1 for 0-indexing; max_t already accounts for the window
                # Skip windows containing NaN (e.g., zero-variance years)
                vals = [z.iloc[t + k] for k in range(window)]
                if not np.all(np.isfinite(vals)):
                    continue
                # Convergence = low rarity (more typical)
                if inclusive:
                    condition = all(v <= -z_threshold for v in vals)
                else:
                    condition = all(v < -z_threshold for v in vals)
                if condition:
                    converged = 1
                    break
            flags.append(converged)
        return flags
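
    # Editor's illustration (hypothetical data, results derived by hand from the code above):
    # with toy = [[1, 2, 2], [1, 2, 2], [1, 2, 2], [3, 3, 1]], every year splits 3-vs-1, so the
    # per-year z-scores are exactly -0.5 (majority) and +1.5 (minority) regardless of T.
    #
    #     ind = IndividualConvergence(toy)
    #     ind.compute_converged(method="zscore", z_threshold=1.5)          # [0, 0, 0, 0]: no z < -1.5
    #     ind.compute_converged(method="top_proportion", proportion=0.25)  # [1, 0, 0, 0]: rank-based
    #
    # In small or highly homogeneous samples the zscore method can flag nobody, while the
    # rank-based methods always select a fixed share; choose accordingly.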

    def _compute_converged_by_group(self, z_threshold, min_t, max_t, window, inclusive, group_labels):
        """
        Compute within-group convergence: rarity is computed from within-group
        frequencies and sample sizes.
        """
        from collections import defaultdict

        # Build per-group suffix frequency tables
        group_suffix_freq = {}
        group_sizes = {}

        # First collect the members of each group
        group_sequences = defaultdict(list)
        for i, (seq, group) in enumerate(zip(self.sequences, group_labels)):
            group_sequences[group].append((i, seq))

        # Build a suffix frequency table for each group
        for group, seq_list in group_sequences.items():
            group_sizes[group] = len(seq_list)
            freq_by_year = [defaultdict(int) for _ in range(self.T)]

            for _, seq in seq_list:
                for t in range(self.T):
                    suffix = tuple(seq[t:])
                    freq_by_year[t][suffix] += 1

            group_suffix_freq[group] = freq_by_year

        # Compute within-group rarity for each individual
        all_flags = [0] * len(self.sequences)

        for group, seq_list in group_sequences.items():
            group_n = group_sizes[group]
            group_freq = group_suffix_freq[group]

            # Rarity matrix for this group
            rarity_matrix = []
            group_indices = []

            for orig_idx, seq in seq_list:
                group_indices.append(orig_idx)
                score = []
                for t in range(self.T):
                    suffix = tuple(seq[t:])
                    freq = group_freq[t][suffix] / group_n
                    score.append(-np.log(freq + 1e-10))
                rarity_matrix.append(score)

            # Compute z-scores
            rarity_df = pd.DataFrame(rarity_matrix)
            rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
            rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)

            # Decide convergence
            for i, orig_idx in enumerate(group_indices):
                z = rarity_z.iloc[i]
                converged = 0
                for t in range(min_t - 1, max_t):
                    vals = [z.iloc[t + k] for k in range(window)]
                    if not np.all(np.isfinite(vals)):
                        continue
                    if inclusive:
                        condition = all(v <= -z_threshold for v in vals)
                    else:
                        condition = all(v < -z_threshold for v in vals)
                    if condition:
                        converged = 1
                        break

                all_flags[orig_idx] = converged

        return all_flags

    # First-divergence timing is intentionally omitted in this convergence-focused module.

    def compute_first_convergence_year(
        self,
        z_threshold=1.5,
        min_t=1,
        max_t=None,
        window=1,
        inclusive=False,
        group_labels=None,
        *,
        method: str = "zscore",
        proportion: Optional[float] = None,
        quantile_p: Optional[float] = None,
        min_count: int = 1,
    ):
        """
        Compute the first convergence year per individual with multiple selection methods.

        Methods
        -------
        - "zscore" (default):
            Find the earliest starting year t in [min_t, max_t] such that all z-scores in the
            length-`window` block are below `-z_threshold` (or <= if inclusive=True). Zero-variance
            years are NaN; windows containing NaN are skipped.

        - "top_proportion" (aka "topk"/"proportion"/"rank"):
            Use aggregated standardized scores to pick the most typical `proportion` within each group
            (or globally). For the selected individuals, return the earliest t where the per-window
            max z-score is <= the selection threshold; others return None. `min_count` is respected.

        - "quantile":
            Use a per-group (or global) quantile threshold `quantile_p` on aggregated standardized scores;
            individuals at or below the threshold return the earliest qualifying year; others return None.

        Parameters
        ----------
        z_threshold, min_t, max_t, window, inclusive, group_labels
            Same definitions as in `compute_converged` for the zscore method.
        method : str, default "zscore"
            One of {"zscore", "top_proportion" (aliases: "topk", "proportion", "rank"), "quantile"}.
        proportion : float or None
            For top_proportion. Fraction in (0, 1) to select as converged. Defaults to 0.10 if None.
        quantile_p : float or None
            For quantile. Quantile in (0, 1) used as threshold. Defaults to 0.10 if None.
        min_count : int, default 1
            For top_proportion. Lower bound for the number selected per group.

        Returns
        -------
        List[Optional[int]]
            First convergence years (1-indexed). None indicates no convergence.
        """
        if max_t is None:
            max_t = self.T - window + 1

        N = len(self.sequences)
        method_norm = (method or "zscore").lower()

        # Helper: standardized z matrix and per-t window maxima per individual
        def _compute_window_max_list():
            # Build the rarity matrix and column-wise z-scores (global standardization)
            rarity_matrix = []
            for seq in self.sequences:
                score = []
                for t in range(self.T):
                    suffix = tuple(seq[t:])
                    freq = self.suffix_freq_by_year[t][suffix] / N
                    score.append(-np.log(freq + 1e-10))
                rarity_matrix.append(score)
            rarity_arr = np.asarray(rarity_matrix, dtype=float)
            col_means = np.nanmean(rarity_arr, axis=0)
            col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
            with np.errstate(invalid='ignore', divide='ignore'):
                rarity_z = (rarity_arr - col_means) / col_stds
            rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
            # Compute each individual's sequence of window maxima over t
            window_maxes = []  # one list per individual
            for i in range(N):
                z_scores = rarity_z[i, :]
                vals_per_t = []
                for t0 in range(min_t - 1, max_t):
                    vals = [z_scores[t0 + k] for k in range(window)]
                    if not np.all(np.isfinite(vals)):
                        vals_per_t.append(np.nan)
                    else:
                        vals_per_t.append(float(np.max(vals)))
                window_maxes.append(vals_per_t)
            return np.asarray(window_maxes, dtype=float)

        # Branches for rank/quantile-style thresholds
        if method_norm in {"top_proportion", "topk", "proportion", "rank", "quantile"}:
            # Compute aggregated scores for thresholding
            agg_scores = np.asarray(
                self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float
            )
            per_t_window_max = _compute_window_max_list()

            if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
                p = proportion if proportion is not None else 0.10
                if group_labels is None:
                    vals = agg_scores
                    finite_mask = np.isfinite(vals)
                    n_valid = int(np.sum(finite_mask))
                    if n_valid == 0:
                        return [None] * N
                    k = int(np.floor(p * n_valid))
                    if k < int(min_count):
                        k = int(min_count)
                    if k > n_valid:
                        k = n_valid
                    order = np.argsort(np.where(np.isfinite(vals), vals, np.inf), kind="mergesort")
                    selected_idx = set(order[:k].tolist()) if k >= 1 else set()
                    years = []
                    for i in range(N):
                        if i not in selected_idx:
                            years.append(None)
                            continue
                        wm = per_t_window_max[i]
                        # The threshold value is the kth smallest score
                        thresh_val = vals[order[k - 1]] if k >= 1 else np.nan
                        if not np.isfinite(thresh_val):
                            years.append(None)
                            continue
                        # Earliest t where window_max <= threshold
                        yr = None
                        for t_idx, wv in enumerate(wm):
                            if np.isfinite(wv) and wv <= float(thresh_val):
                                yr = t_idx + 1  # 1-indexed
                                break
                        years.append(yr)
                    return years
                else:
                    labels = np.asarray(group_labels)
                    years = [None] * N
                    for g in pd.unique(labels):
                        idx = np.where(labels == g)[0]
                        vals = agg_scores[idx]
                        finite_mask = np.isfinite(vals)
                        n_valid = int(np.sum(finite_mask))
                        if n_valid == 0:
                            continue
                        k = int(np.floor(p * n_valid))
                        if k < int(min_count):
                            k = int(min_count)
                        if k > n_valid:
                            k = n_valid
                        order_local = np.argsort(np.where(np.isfinite(vals), vals, np.inf), kind="mergesort")
                        selected_local = set(order_local[:k].tolist()) if k >= 1 else set()
                        thresh_val = vals[order_local[k - 1]] if k >= 1 else np.nan
                        for j_local, i_global in enumerate(idx):
                            if j_local not in selected_local or not np.isfinite(thresh_val):
                                continue
                            wm = per_t_window_max[i_global]
                            for t_idx, wv in enumerate(wm):
                                if np.isfinite(wv) and wv <= float(thresh_val):
                                    years[i_global] = int(t_idx + 1)
                                    break
                    return years

            # quantile branch
            q = quantile_p if quantile_p is not None else 0.10
            years = [None] * N
            if group_labels is None:
                valid = agg_scores[np.isfinite(agg_scores)]
                if valid.size == 0:
                    return years
                try:
                    xq = float(np.nanquantile(agg_scores, q))
                except Exception:
                    xq = float(np.quantile(valid, q))
                for i in range(N):
                    if not np.isfinite(agg_scores[i]) or agg_scores[i] > xq:
                        continue
                    wm = per_t_window_max[i]
                    for t_idx, wv in enumerate(wm):
                        if np.isfinite(wv) and wv <= xq:
                            years[i] = int(t_idx + 1)
                            break
                return years
            else:
                labels = np.asarray(group_labels)
                for g in pd.unique(labels):
                    idx = np.where(labels == g)[0]
                    vals = agg_scores[idx]
                    valid = vals[np.isfinite(vals)]
                    if valid.size == 0:
                        continue
                    try:
                        xq = float(np.nanquantile(vals, q))
                    except Exception:
                        xq = float(np.quantile(valid, q))
                    for j_local, i_global in enumerate(idx):
                        if not np.isfinite(vals[j_local]) or vals[j_local] > xq:
                            continue
                        wm = per_t_window_max[i_global]
                        for t_idx, wv in enumerate(wm):
                            if np.isfinite(wv) and wv <= xq:
                                years[i_global] = t_idx + 1
                                break
                return years

        if group_labels is not None and method_norm == "zscore":
            # Within-group convergence: use within-group frequencies and sample sizes
            return self._compute_first_convergence_year_by_group(z_threshold, min_t, max_t, window, inclusive, group_labels)

        # Compute rarity using global frequencies
        rarity_matrix = []
        for seq in self.sequences:
            score = []
            for t in range(self.T):
                suffix = tuple(seq[t:])
                freq = self.suffix_freq_by_year[t][suffix] / N
                score.append(-np.log(freq + 1e-10))
            rarity_matrix.append(score)

        rarity_df = pd.DataFrame(rarity_matrix)
        rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
        # Keep NaN so that windows containing zero-variance years are skipped
        rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)

        years = []
        for i in range(N):
            z = rarity_z.iloc[i]
            year = None
            for t in range(min_t - 1, max_t):  # min_t - 1 for 0-indexing; max_t already accounts for the window
                vals = [z.iloc[t + k] for k in range(window)]
                if not np.all(np.isfinite(vals)):
                    continue
                # Convergence = low rarity (more typical)
                if inclusive:
                    condition = all(v <= -z_threshold for v in vals)
                else:
                    condition = all(v < -z_threshold for v in vals)
                if condition:
                    year = int(t + 1)  # convert to a 1-indexed integer
                    break
            years.append(year)
        return years
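
    # Editor's illustration (hypothetical data, derived by hand): continuing the toy example,
    #     ind.compute_first_convergence_year(method="top_proportion", proportion=0.25)
    # is expected to return [1, None, None, None]: the one selected individual already sits
    # at or below the selection threshold in year 1 (its window maxima are constant over t),
    # and unselected individuals always get None.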

    def _compute_first_convergence_year_by_group(self, z_threshold, min_t, max_t, window, inclusive, group_labels):
        """
        Compute the first within-group convergence year: rarity is computed from
        within-group frequencies and sample sizes.
        """
        from collections import defaultdict

        # Build per-group suffix frequency tables (reusing the logic of _compute_converged_by_group)
        group_sequences = defaultdict(list)
        for i, (seq, group) in enumerate(zip(self.sequences, group_labels)):
            group_sequences[group].append((i, seq))

        # Build a suffix frequency table for each group
        group_suffix_freq = {}
        group_sizes = {}
        for group, seq_list in group_sequences.items():
            group_sizes[group] = len(seq_list)
            freq_by_year = [defaultdict(int) for _ in range(self.T)]

            for _, seq in seq_list:
                for t in range(self.T):
                    suffix = tuple(seq[t:])
                    freq_by_year[t][suffix] += 1

            group_suffix_freq[group] = freq_by_year

        # Compute each individual's within-group convergence year
        all_years = [None] * len(self.sequences)

        for group, seq_list in group_sequences.items():
            group_n = group_sizes[group]
            group_freq = group_suffix_freq[group]

            # Rarity matrix for this group
            rarity_matrix = []
            group_indices = []

            for orig_idx, seq in seq_list:
                group_indices.append(orig_idx)
                score = []
                for t in range(self.T):
                    suffix = tuple(seq[t:])
                    freq = group_freq[t][suffix] / group_n
                    score.append(-np.log(freq + 1e-10))
                rarity_matrix.append(score)

            # Compute z-scores
            rarity_df = pd.DataFrame(rarity_matrix)
            rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
            rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)

            # Find the first convergence year
            for i, orig_idx in enumerate(group_indices):
                z = rarity_z.iloc[i]
                year = None
                for t in range(min_t - 1, max_t):
                    vals = [z.iloc[t + k] for k in range(window)]
                    if not np.all(np.isfinite(vals)):
                        continue
                    if inclusive:
                        condition = all(v <= -z_threshold for v in vals)
                    else:
                        condition = all(v < -z_threshold for v in vals)
                    if condition:
                        year = int(t + 1)
                        break

                all_years[orig_idx] = year

        return all_years

    def compute_suffix_rarity_per_year(self, as_dataframe: bool = True, column_prefix: str = "t", zscore: bool = False):
        """
        Compute per-year suffix rarity scores for each individual.

        Definition (mirror of prefix rarity):
            rarity_{i,t} = -log( freq(suffix_{i,t}) / N ) >= 0

        where suffix_{i,t} is the observed suffix from year t to the end for person i,
        and N is the total number of individuals. Higher means rarer (less typical).

        Parameters
        ----------
        as_dataframe : bool, default True
            If True, returns a pandas DataFrame with columns f"{column_prefix}1"..f"{column_prefix}T".
            If False, returns a NumPy array of shape (N, T).
        column_prefix : str, default "t"
            Column name prefix when returning a DataFrame.
        zscore : bool, default False
            If True, z-standardize the rarity scores column-wise (by year).

        Returns
        -------
        pandas.DataFrame or np.ndarray
            Per-year rarity scores (optionally z-scored).
        """
        N = len(self.sequences)
        rarity_matrix = []

        for seq in self.sequences:
            score_list = []
            for t in range(self.T):
                suffix = tuple(seq[t:])
                freq = self.suffix_freq_by_year[t][suffix] / N
                score_list.append(-np.log(freq + 1e-10))
            rarity_matrix.append(score_list)

        rarity_arr = np.array(rarity_matrix, dtype=float)

        if zscore:
            col_means = np.nanmean(rarity_arr, axis=0)
            col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)  # consistent with pandas DataFrame.std()
            with np.errstate(invalid='ignore', divide='ignore'):
                rarity_arr = (rarity_arr - col_means) / col_stds

        if not as_dataframe:
            return rarity_arr

        columns = [f"{column_prefix}{t+1}" for t in range(self.T)]
        return pd.DataFrame(rarity_arr, columns=columns)

    def compute_suffix_rarity_score(self):
        """
        Compute the cumulative suffix rarity score for each individual:
            rarity_score_i = sum_{t=1}^T -log( freq(suffix_{i,t}) / N )

        Higher scores indicate rarer, less typical future paths from each year onward.
        """
        rarity_scores = []
        N = len(self.sequences)

        for seq in self.sequences:
            score = 0.0
            for t in range(self.T):
                suffix = tuple(seq[t:])
                freq = self.suffix_freq_by_year[t][suffix] / N
                score += -np.log(freq + 1e-10)
            rarity_scores.append(score)
        return rarity_scores
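
    # Editor's worked example (hypothetical data): for sequences = [[1, 2], [1, 2], [3, 2]],
    # the last-year suffix (2,) is shared by all three individuals and contributes ≈ 0,
    # so only the full-sequence term matters:
    #     rarity_score_1 = -log(2/3) - log(3/3) ≈ 0.405
    #     rarity_score_3 = -log(1/3) - log(3/3) ≈ 1.099
    # i.e. compute_suffix_rarity_score() ≈ [0.405, 0.405, 1.099].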
736
+
737
+ def compute_standardized_rarity_score(self, min_t=1, max_t=None, window=1):
738
+ """
739
+ Compute standardized rarity scores for convergence classification and visualization
740
+ using true statistical z-scores.
741
+
742
+ This method computes standardized rarity scores used for individual-level
743
+ convergence classification:
744
+ standardized_score_i = min_t max_{k=0..window-1} z_{i,t+k}
745
+
746
+ Where z_{i,t} are the year-wise true z-scores of suffix rarity computed column-wise
747
+ across individuals with sample standard deviation (ddof=1):
748
+ z_{i,t} = (x_{i,t} - mean_t) / std_t
749
+
750
+ The standardized scores can be used with a threshold (e.g., z <= -1.5) to classify
751
+ individuals as converged/not converged, and are particularly useful for visualization.
752
+
753
+ Note: For convergence (suffix tree), we look for LOW rarity (more typical patterns),
754
+ so lower z-scores indicate convergence. This is opposite to prefix tree divergence.
755
+
756
+ Parameters:
757
+ -----------
758
+ min_t : int, default=1
759
+ Minimum year (1-indexed) after which convergence is considered valid.
760
+ max_t : int, optional
761
+ Maximum year (1-indexed) before which convergence is considered valid.
762
+ If None, uses T-window+1.
763
+ window : int, default=1
764
+ Number of consecutive low-z years required
765
+
766
+ Returns:
767
+ --------
768
+ List[float]
769
+ Standardized rarity scores for each individual. Values <= -z_threshold indicate convergence.
770
+
771
+ Notes:
772
+ ------
773
+ The standardization uses sample standard deviation (ddof=1) for each year column,
774
+ which is consistent with pandas' default behavior for DataFrame.std().
775
+ This is essentially the z-score normalized version of suffix rarity scores.
776
+ For convergence detection, we look for the MINIMUM z-score (most typical behavior).
777
+ """
778
+ if max_t is None:
779
+ max_t = self.T - window + 1
780
+
781
+ N = len(self.sequences)
782
+
783
+ # Step 1: Calculate rarity matrix
784
+ rarity_matrix = []
785
+ for seq in self.sequences:
786
+ score = []
787
+ for t in range(self.T):
788
+ suffix = tuple(seq[t:])
789
+ freq = self.suffix_freq_by_year[t][suffix] / N
790
+ score.append(-np.log(freq + 1e-10))
791
+ rarity_matrix.append(score)
792
+
793
+ # Step 2: Column-wise true z-score standardization (by year, ddof=1)
794
+ rarity_arr = np.asarray(rarity_matrix, dtype=float)
795
+ col_means = np.nanmean(rarity_arr, axis=0)
796
+ col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
797
+ with np.errstate(invalid='ignore', divide='ignore'):
798
+ rarity_z = (rarity_arr - col_means) / col_stds
799
+ # Keep NaN for zero-variance years to allow window skipping downstream
800
+ rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
801
+
802
+ # Step 3: Compute standardized rarity score for each individual
803
+ standardized_scores = []
804
+ for i in range(N):
805
+ z_scores = rarity_z[i, :]
806
+ candidate_values = []
807
+
808
+ # For each possible starting time t
809
+ for t in range(min_t - 1, max_t): # min_t-1 for 0-indexed
810
+ vals = [z_scores[t + k] for k in range(window)]
811
+ # Skip windows containing NaN (e.g., zero-variance years)
812
+ if not np.all(np.isfinite(vals)):
813
+ continue
814
+ # For convergence, take maximum within window (ensure all finite)
815
+ window_max = float(np.max(vals))
816
+ candidate_values.append(window_max)
817
+
818
+ # Take the minimum across all starting times (most convergent period)
819
+ if candidate_values:
820
+ standardized_score = float(np.min(candidate_values))
821
+ else:
822
+ standardized_score = np.nan
823
+
824
+ standardized_scores.append(standardized_score)
825
+
826
+ return standardized_scores
827
+
828
+    def compute_converged_by_top_proportion(
+        self,
+        group_labels,
+        proportion: float = 0.10,
+        min_t: int = 1,
+        max_t: Optional[int] = None,
+        window: int = 1,
+        min_count: int = 1,
+    ):
+        """
+        Classify convergence by selecting the top proportion of most typical (smallest)
+        standardized scores WITHIN EACH GROUP (e.g., country). This guarantees an identical
+        proportion threshold across groups, independent of distribution shape or discreteness.
+
+        Steps:
+        1) Compute the true-z standardized rarity score per individual using
+           `compute_standardized_rarity_score(min_t, max_t, window)`.
+        2) For each group g, sort scores ascending and select the first
+           k = max(min_count, floor(p * n_g)) individuals as convergers.
+
+        Parameters
+        ----------
+        group_labels : Sequence
+            Group label per individual (e.g., country). Length must equal the number of sequences.
+        proportion : float, default 0.10
+            Top proportion p to mark as converged within each group (0 < p < 1).
+        min_t : int, default 1
+            Minimum year considered in the aggregated score.
+        max_t : Optional[int], default None
+            Maximum starting year considered; if None, uses T - window + 1.
+        window : int, default 1
+            Number of consecutive years in the aggregated statistic.
+        min_count : int, default 1
+            Minimum number selected per group (useful for very small groups).
+
+        Returns
+        -------
+        tuple[List[int], dict]
+            (flags, info), where flags is a 0/1 list marking convergence and info holds
+            per-group metadata: {group: {"k": int, "n": int, "threshold_value": float}}.
+        """
+        if not (0 < float(proportion) < 1):
+            raise ValueError(f"proportion must be in (0,1), got {proportion}")
+
+        N = len(self.sequences)
+        if len(group_labels) != N:
+            raise ValueError("Length of group_labels must match number of sequences")
+
+        # 1) Compute the aggregated standardized score (lower = more typical)
+        scores = np.asarray(self.compute_standardized_rarity_score(min_t=min_t, max_t=max_t, window=window), dtype=float)
+
+        labels = np.asarray(group_labels)
+        flags = np.zeros(N, dtype=int)
+        info = {}
+
+        # Iterate groups deterministically by sorted group name for reproducibility
+        for g in sorted(pd.unique(labels)):
+            idx = np.where(labels == g)[0]
+            vals = scores[idx]
+
+            n_g = len(idx)
+            if n_g == 0:
+                info[g] = {"k": 0, "n": 0, "threshold_value": np.nan}
+                continue
+
+            # Determine k with lower bound min_count and upper bound n_g
+            k = int(np.floor(proportion * n_g))
+            if k < min_count:
+                k = min_count
+            if k > n_g:
+                k = n_g
+
+            # Treat NaN as worst (push to the end); still allow exact k selection
+            order_vals = np.where(np.isfinite(vals), vals, np.inf)
+            order = np.argsort(order_vals, kind="mergesort")  # stable for tie-breaking
+
+            if k >= 1:
+                selected_global = idx[order[:k]]
+                flags[selected_global] = 1
+                kth_val = vals[order[k - 1]]
+                kth_val = float(kth_val) if np.isfinite(kth_val) else np.nan
+            else:
+                kth_val = np.nan
+
+            info[g] = {"k": int(k), "n": int(n_g), "threshold_value": kth_val}
+
+        return flags.tolist(), info
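+
+    # Usage sketch (illustrative; `tree` and `countries` are placeholders,
+    # with `countries` aligned one-to-one with tree.sequences):
+    #
+    #   flags, info = tree.compute_converged_by_top_proportion(countries, proportion=0.10)
+    #   # flags[i] == 1 marks individual i among the ~10% most typical in its group;
+    #   # info[g]["threshold_value"] is the score of the last selected individual.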
+
+    def diagnose_convergence_calculation(self, z_threshold=1.5, max_t=None, window=1, inclusive=False, group_labels=None):
+        """
+        Diagnostic function to analyze the convergence-year calculation and identify
+        years with insufficient variance (std ≈ 0) that cannot trigger convergence.
+
+        This is methodologically appropriate: when all individuals follow similar
+        trajectories in a given year, no convergence should be detected.
+
+        Returns:
+        --------
+        dict: Diagnostic information including:
+            - years_with_zero_variance: List of years where std ≈ 0
+            - rarity_std_by_year: Standard deviation of rarity scores per year
+            - n_individuals_with_convergence: Count of individuals with any convergence
+            - convergence_year_distribution: Value counts of convergence years
+        """
+        if max_t is None:
+            max_t = self.T - window + 1
+
+        N = len(self.sequences)
+        rarity_matrix = []
+
+        # Calculate rarity scores (same as in compute_first_convergence_year)
+        for seq in self.sequences:
+            score = []
+            for t in range(self.T):
+                suffix = tuple(seq[t:])
+                freq = self.suffix_freq_by_year[t][suffix] / N
+                score.append(-np.log(freq + 1e-10))
+            rarity_matrix.append(score)
+
+        rarity_df = pd.DataFrame(rarity_matrix)
+
+        # Calculate standard deviations by year
+        rarity_std_by_year = rarity_df.std(axis=0)
+        years_with_zero_variance = []
+
+        # Identify years with near-zero variance (threshold can be adjusted)
+        for t, std_val in enumerate(rarity_std_by_year):
+            if pd.isna(std_val) or std_val < 1e-10:
+                years_with_zero_variance.append(t + 1)  # 1-indexed
+
+        # Count individuals with convergence
+        convergence_years = self.compute_first_convergence_year(
+            z_threshold=z_threshold, min_t=1, max_t=max_t, window=window,
+            inclusive=inclusive, group_labels=group_labels
+        )
+        n_individuals_with_convergence = sum(1 for year in convergence_years if year is not None)
+
+        # Distribution of convergence years
+        convergence_year_counts = pd.Series(convergence_years).value_counts(dropna=False).sort_index()
+
+        return {
+            'years_with_zero_variance': years_with_zero_variance,
+            'rarity_std_by_year': rarity_std_by_year.tolist(),
+            'n_individuals_with_convergence': n_individuals_with_convergence,
+            'convergence_year_distribution': convergence_year_counts.to_dict(),
+            'total_individuals': N,
+            'parameters_used': {
+                'z_threshold': z_threshold,
+                'max_t': max_t,
+                'window': window,
+                'inclusive': inclusive,
+                'group_labels': group_labels is not None
+            }
+        }
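+
+    # Usage sketch (illustrative; `tree` is a placeholder instance): check which
+    # years can never trigger convergence before interpreting convergence counts.
+    #
+    #   diag = tree.diagnose_convergence_calculation(z_threshold=1.5, window=3)
+    #   print(diag["years_with_zero_variance"])
+    #   print(diag["n_individuals_with_convergence"], "of", diag["total_individuals"])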
+
+    def compute_path_uniqueness(self):
+        """
+        Count, for each individual, how many years t their suffix (from t to the end)
+        is unique in the population (frequency == 1). Uses suffix-based logic.
+        """
+        uniqueness_scores = []
+        for seq in self.sequences:
+            count = 0
+            for t in range(self.T):
+                suffix = tuple(seq[t:])
+                if self.suffix_freq_by_year[t][suffix] == 1:
+                    count += 1
+            uniqueness_scores.append(count)
+        return uniqueness_scores
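+
+    # Usage sketch (illustrative; `tree` is a placeholder instance):
+    #
+    #   uniq = tree.compute_path_uniqueness()
+    #   # uniq[i] counts the years in which individual i's remaining path is
+    #   # shared with no one else (suffix frequency == 1).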
+
+
+def plot_suffix_rarity_distribution(
+    data,
+    # === Core Parameters ===
+    group_names=None,
+    colors=None,
+    # === Threshold Settings ===
+    show_threshold=True,
+    threshold_method="top_proportion",
+    proportion_p=0.07,
+    # === Plotting Options ===
+    figsize=(10, 6),
+    kde_bw=None,
+    # === Export Options ===
+    save_as=None,
+    dpi=300,
+    show=True,
+    # === Parameters for Different Methods ===
+    z_threshold=1.5,
+    is_standardized_score=False,
+    quantile_p=0.10
+):
+    """
+    Plot suffix rarity score distribution(s) with clean threshold lines.
+
+    Parameters
+    ----------
+    data : dict or array-like
+        Data to plot. If dict: {"group1": scores1, "group2": scores2}.
+        If array-like: single-group data.
+    group_names : list, optional
+        Custom group names. Auto-detected from dict keys if not provided.
+    colors : dict or list, optional
+        Colors for groups. If None, uses the default palette.
+
+    show_threshold : bool, default True
+        Whether to show threshold vertical lines.
+    threshold_method : str, default "top_proportion"
+        Threshold method:
+        - "top_proportion": select the top proportion_p most extreme (smallest) values
+        - "quantile": use the quantile_p percentile
+        - "zscore": use a z-score threshold (for standardized data)
+    proportion_p : float, default 0.07
+        Proportion for the top_proportion method (e.g., 0.07 = top 7%).
+
+    figsize : tuple, default (10, 6)
+        Figure size (width, height).
+    kde_bw : float, optional
+        KDE bandwidth adjustment. If None, uses the seaborn default.
+
+    save_as : str, optional
+        Save path (without extension).
+    dpi : int, default 300
+        Resolution for the saved figure.
+    show : bool, default True
+        Whether to display the plot.
+
+    z_threshold : float, default 1.5
+        Cutoff for the "zscore" method.
+    is_standardized_score : bool, default False
+        Whether the input scores are already standardized (z) scores; affects
+        the "zscore" threshold position and the x-axis label.
+    quantile_p : float, default 0.10
+        Percentile for the "quantile" method.
+
+    Returns
+    -------
+    dict
+        Statistics including threshold values per group.
+
+    Examples
+    --------
+    # Basic usage - top 7% threshold (default)
+    >>> plot_suffix_rarity_distribution({"India": india_scores, "US": us_scores})
+
+    # Custom threshold proportion
+    >>> plot_suffix_rarity_distribution(
+    ...     data={"India": india_scores, "US": us_scores},
+    ...     proportion_p=0.03,  # top 3%
+    ...     save_as="rarity_comparison"
+    ... )
+
+    # Quantile-based threshold
+    >>> plot_suffix_rarity_distribution(
+    ...     data={"India": india_scores, "US": us_scores},
+    ...     threshold_method="quantile",
+    ...     quantile_p=0.10,  # 10th percentile
+    ... )
+
+    # Clean plot without thresholds
+    >>> plot_suffix_rarity_distribution(
+    ...     data,
+    ...     show_threshold=False,
+    ...     colors={"India": "#E8B88A", "US": "#A3BFD9"}
+    ... )
+    """
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    import numpy as np
+
+    # Process input data
+    if isinstance(data, dict):
+        # Multi-group case
+        groups = data
+        if group_names is None:
+            group_names = list(groups.keys())
+    else:
+        # Single-group case
+        if group_names is None:
+            group_names = ["Group"]
+        groups = {group_names[0]: data}
+
+    # Set up colors
+    if colors is None:
+        default_colors = ["#A3BFD9", "#E8B88A", "#C6A5CF", "#A6C1A9", "#F4A460", "#87CEEB"]
+        color_map = dict(zip(group_names, default_colors[:len(group_names)]))
+    elif isinstance(colors, dict):
+        color_map = colors
+    else:
+        color_map = dict(zip(group_names, colors))
+
+    # Normalize method name and prepare stats
+    threshold_method = (threshold_method or "top_proportion").lower()
+    stats = {"per_group": {}, "threshold_method": threshold_method}
+
+    # Validate quantiles if needed
+    def _check_q(q: float):
+        if not (0 < float(q) < 1):
+            raise ValueError(f"quantile must be in (0,1), got {q}")
+
+    if threshold_method == "quantile":
+        _check_q(quantile_p)
+        quantiles_to_draw = [quantile_p]
+        # Per-group quantile(s)
+        for g in group_names:
+            if g in groups:
+                arr = np.asarray(groups[g], dtype=float)
+                # Compute requested quantiles with NaN handling
+                valid = arr[~np.isnan(arr)]
+                thresholds_g = {}
+                if valid.size > 0:
+                    for q in quantiles_to_draw:
+                        try:
+                            xq = float(np.nanquantile(arr, q))
+                        except Exception:
+                            xq = float(np.quantile(valid, q))
+                        thresholds_g[f"p{int(round(q*100)):02d}"] = xq
+                else:
+                    for q in quantiles_to_draw:
+                        thresholds_g[f"p{int(round(q*100)):02d}"] = np.nan
+                # Primary threshold (for backward compatibility)
+                primary_label = f"p{int(round(quantile_p*100)):02d}"
+                primary_value = thresholds_g.get(primary_label, np.nan)
+                # Proportion at or below the primary threshold
+                prop_below = float(np.nanmean(valid <= primary_value)) if valid.size > 0 and not np.isnan(primary_value) else np.nan
+                stats["per_group"][g] = {
+                    "threshold_values": thresholds_g,
+                    "is_group_relative": True,
+                    "threshold_value": primary_value,
+                    "primary_quantile": primary_label,
+                    "prop_below": prop_below
+                }
+    elif threshold_method in {"zscore", "z"}:
+        # z-score method (backward compatibility)
+        for g in group_names:
+            if g in groups:
+                arr = np.asarray(groups[g], dtype=float)
+                mean_g = np.nanmean(arr)
+                std_g = np.nanstd(arr, ddof=1)  # sample std to match pandas
+                if is_standardized_score:
+                    x_thresh_g = -float(z_threshold)
+                else:
+                    x_thresh_g = float(mean_g - z_threshold * std_g)
+                vals = arr[~np.isnan(arr)]
+                prop_below = float(np.nanmean(vals <= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else np.nan
+                stats["per_group"][g] = {
+                    "mean": float(mean_g),
+                    "std": float(std_g),
+                    "threshold_value": float(x_thresh_g),
+                    "z_threshold": float(z_threshold),
+                    "is_group_relative": True,
+                    "prop_below": prop_below,
+                    "num_below": int(np.sum(vals <= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else 0,
+                    "n": int(vals.size)
+                }
+    elif threshold_method in {"topk", "top_proportion", "proportion", "rank"}:
+        # Rank-based selection within each group: pick the top p% (smallest values)
+        if not (0 < float(proportion_p) < 1):
+            raise ValueError(f"proportion_p must be in (0,1), got {proportion_p}")
+        topk_min_count = 1  # always select at least one observation per group
+        for g in group_names:
+            if g in groups:
+                arr = np.asarray(groups[g], dtype=float)
+                finite_mask = np.isfinite(arr)
+                vals = arr[finite_mask]
+                n_valid = int(vals.size)
+                if n_valid == 0:
+                    stats["per_group"][g] = {
+                        "threshold_value": np.nan,
+                        "k": 0,
+                        "n": 0,
+                        "prop_selected": np.nan,
+                        "num_leq_threshold": 0
+                    }
+                    continue
+                k = int(np.floor(proportion_p * n_valid))
+                if k < topk_min_count:
+                    k = topk_min_count
+                if k > n_valid:
+                    k = n_valid
+                # Sort ascending (most typical first)
+                order = np.argsort(vals, kind="mergesort")
+                thresh_val = vals[order[k - 1]] if k >= 1 else np.nan
+                num_leq = int(np.sum(vals <= thresh_val)) if k >= 1 and np.isfinite(thresh_val) else 0
+                stats["per_group"][g] = {
+                    "threshold_value": float(thresh_val) if np.isfinite(thresh_val) else np.nan,
+                    "k": int(k),
+                    "n": int(n_valid),
+                    "prop_selected": (k / n_valid) if n_valid > 0 else np.nan,
+                    "num_leq_threshold": num_leq
+                }
+        stats["threshold_method"] = "topk"  # normalized label kept for backward compatibility
+    else:
+        raise ValueError(f"Unknown threshold_method: {threshold_method}")
+
+    # Create plot
+    plt.figure(figsize=figsize)
+
+    # Plot distributions
+    for group_name in group_names:
+        if group_name in groups:
+            scores = groups[group_name]
+            color = color_map.get(group_name, "#1f77b4")
+            arr = np.asarray(scores, dtype=float)
+            vmin = np.nanmin(arr) if np.isfinite(arr).any() else None
+            vmax = np.nanmax(arr) if np.isfinite(arr).any() else None
+            kde_kwargs = {"label": group_name, "fill": True, "color": color, "linewidth": 2}
+            if kde_bw is not None:
+                kde_kwargs["bw_adjust"] = kde_bw
+            if vmin is not None and vmax is not None and vmin < vmax:
+                kde_kwargs["clip"] = (vmin, vmax)
+            sns.kdeplot(arr, **kde_kwargs)
+
+    # Add per-group threshold lines if requested (color-matched, no text labels)
+    if show_threshold:
+        for g in group_names:
+            if g in stats["per_group"]:
+                color = color_map.get(g, "#1f77b4")
+                if threshold_method == "quantile":
+                    thresholds_g = stats["per_group"][g]["threshold_values"]
+                    # Draw one line per requested quantile
+                    for _q_lbl, xg in sorted(thresholds_g.items()):
+                        if np.isnan(xg):
+                            continue
+                        plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+                elif threshold_method in {"zscore", "z"}:
+                    xg = stats["per_group"][g]["threshold_value"]
+                    plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+                else:  # top_proportion
+                    xg = stats["per_group"][g]["threshold_value"]
+                    if np.isfinite(xg):
+                        plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+
+    # Formatting
+    if is_standardized_score:
+        plt.xlabel("Standardized Suffix Rarity Score", fontsize=13)
+    else:
+        plt.xlabel("Suffix Rarity Score", fontsize=13)
+    plt.ylabel("Density", fontsize=13)
+    if len(group_names) > 1:
+        plt.legend(title="Group")
+    sns.despine()
+    plt.tight_layout()
+
+    # Save and show
+    if save_as:
+        plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')
+
+    if show:
+        plt.show()
+
+    return stats
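+
+# Usage sketch (illustrative; `india_scores` and `us_scores` are placeholder arrays):
+#
+#   stats = plot_suffix_rarity_distribution({"India": india_scores, "US": us_scores})
+#   stats["per_group"]["India"]["threshold_value"]  # x-position of India's dashed line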
+
+
+def plot_individual_indicators_correlation(
+    df,
+    indicator_columns=None,
+    correlation_method='pearson',
+    group_column=None,
+    figsize=(10, 8),
+    cmap='RdBu_r',
+    center=0,
+    annot=True,
+    fmt='.2f',
+    save_as=None,
+    dpi=300,
+    show=True
+):
+    """
+    Plot a correlation heatmap of individual-level indicators with clean styling.
+
+    Parameters:
+    -----------
+    df : pandas.DataFrame
+        DataFrame containing individual-level indicators
+    indicator_columns : list, optional
+        List of column names to include in the correlation analysis.
+        If None, automatically detects indicator columns (converged, first_convergence_year,
+        suffix_rarity_score, path_uniqueness, etc.)
+    correlation_method : str, default='pearson'
+        Correlation method: 'pearson', 'spearman', 'kendall'
+    group_column : str, optional
+        Column name for grouping (e.g., 'country'). If provided, shows separate
+        heatmaps for each group
+    figsize : tuple, default=(10, 8)
+        Figure size (width, height)
+    cmap : str, default='RdBu_r'
+        Colormap for the heatmap. Options: 'RdBu_r', 'coolwarm', 'viridis', 'plasma'
+    center : float, default=0
+        Value to center the colormap at
+    annot : bool, default=True
+        Whether to annotate cells with correlation values
+    fmt : str, default='.2f'
+        Format for annotations
+    save_as : str, optional
+        Path to save the figure (without extension)
+    dpi : int, default=300
+        DPI for saving
+    show : bool, default=True
+        Whether to display the plot
+
+    Returns:
+    --------
+    dict: Correlation matrix/matrices and statistics
+
+    Example:
+    --------
+    # Basic usage
+    >>> plot_individual_indicators_correlation(df)
+
+    # Custom indicators with grouping
+    >>> plot_individual_indicators_correlation(
+    ...     df,
+    ...     indicator_columns=['converged', 'suffix_rarity_score', 'path_uniqueness'],
+    ...     group_column='country',
+    ...     correlation_method='spearman'
+    ... )
+
+    # Custom styling
+    >>> plot_individual_indicators_correlation(
+    ...     df,
+    ...     cmap='plasma',
+    ...     figsize=(12, 10),
+    ...     save_as="indicators_correlation_heatmap"
+    ... )
+    """
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    import pandas as pd
+    import numpy as np
+
+    # Auto-detect indicator columns if not provided
+    if indicator_columns is None:
+        # Common individual-level indicator patterns (convergence-focused)
+        potential_indicators = [
+            'converged', 'first_convergence_year', 'convergence_year',
+            'suffix_rarity_score', 'path_uniqueness', 'rarity_score', 'uniqueness_score'
+        ]
+        indicator_columns = [col for col in df.columns if col in potential_indicators]
+
+        # Also include numeric columns that might be indicators
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        for col in numeric_cols:
+            if col not in indicator_columns and any(
+                keyword in col.lower() for keyword in
+                ['score', 'index', 'count', 'factor', 'rate', 'ratio']
+            ):
+                indicator_columns.append(col)
+
+    # Filter and clean data
+    df_indicators = df[indicator_columns].copy()
+
+    # Handle missing values and convert data types
+    for col in df_indicators.columns:
+        if df_indicators[col].dtype == 'object':
+            # Try to convert to numeric
+            df_indicators[col] = pd.to_numeric(df_indicators[col], errors='coerce')
+
+    # Remove columns with too many missing values (>50%)
+    valid_cols = []
+    for col in df_indicators.columns:
+        if df_indicators[col].notna().sum() / len(df_indicators) > 0.5:
+            valid_cols.append(col)
+
+    df_indicators = df_indicators[valid_cols]
+
+    # Drop rows with any missing values for the correlation calculation
+    df_clean = df_indicators.dropna()
+
+    if len(df_clean) == 0:
+        raise ValueError("No valid data remaining after cleaning. Check for missing values.")
+
+    # Calculate correlations
+    results = {}
+
+    if group_column is None or group_column not in df.columns:
+        # Single correlation matrix
+        corr_matrix = df_clean.corr(method=correlation_method)
+        results['overall'] = corr_matrix
+
+        # Create plot
+        plt.figure(figsize=figsize)
+
+        # Mask the upper triangle for a cleaner look
+        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+
+        # Generate heatmap
+        sns.heatmap(
+            corr_matrix,
+            mask=mask,
+            annot=annot,
+            fmt=fmt,
+            cmap=cmap,
+            center=center,
+            square=True,
+            cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"},
+            linewidths=0.5
+        )
+
+        plt.title(f"Individual-Level Indicators Correlation Heatmap\n({correlation_method.title()} Correlation)",
+                  fontsize=14, pad=20)
+        plt.xticks(rotation=45, ha='right')
+        plt.yticks(rotation=0)
+
+    else:
+        # Group-based correlation matrices
+        groups = df[group_column].unique()
+        n_groups = len(groups)
+
+        # Calculate subplot layout
+        if n_groups <= 2:
+            nrows, ncols = 1, n_groups
+            figsize = (figsize[0] * n_groups, figsize[1])
+        else:
+            ncols = min(3, n_groups)
+            nrows = (n_groups + ncols - 1) // ncols
+            figsize = (figsize[0] * ncols, figsize[1] * nrows)
+
+        fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
+        axes = np.atleast_1d(axes).ravel()  # uniform 1D indexing for any layout
+
+        for i, group in enumerate(groups):
+            group_data = df[df[group_column] == group][indicator_columns].dropna()
+
+            if len(group_data) < 2:
+                print(f"Warning: Group '{group}' has insufficient data for correlation")
+                continue
+
+            corr_matrix = group_data.corr(method=correlation_method)
+            results[group] = corr_matrix
+
+            # Mask the upper triangle
+            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+
+            # Plot heatmap
+            sns.heatmap(
+                corr_matrix,
+                mask=mask,
+                annot=annot,
+                fmt=fmt,
+                cmap=cmap,
+                center=center,
+                square=True,
+                cbar=(i == 0),  # only show the colorbar for the first subplot
+                cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"} if i == 0 else {},
+                linewidths=0.5,
+                ax=axes[i]
+            )
+
+            axes[i].set_title(f"{group}\n({len(group_data)} individuals)", fontsize=12)
+            axes[i].set_xticks(axes[i].get_xticks())
+            axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45, ha='right')
+            axes[i].set_yticks(axes[i].get_yticks())
+            axes[i].set_yticklabels(axes[i].get_yticklabels(), rotation=0)
+
+        # Hide unused subplots
+        for j in range(i + 1, len(axes)):
+            axes[j].set_visible(False)
+
+        plt.suptitle(f"Individual-Level Indicators Correlation by {group_column.title()}\n({correlation_method.title()} Correlation)",
+                     fontsize=16, y=0.98)
+
+    plt.tight_layout()
+
+    # Save and show
+    if save_as:
+        plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')
+
+    if show:
+        plt.show()
+
+    # Add summary statistics (mirror the grouping condition used above)
+    if group_column is None or group_column not in df.columns:
+        sample_size = len(df_clean)
+    else:
+        sizes = {}
+        for g in df[group_column].unique():
+            g_clean = df[df[group_column] == g][indicator_columns].apply(pd.to_numeric, errors='coerce').dropna()
+            sizes[g] = len(g_clean)
+        sample_size = sizes
+
+    results['summary'] = {
+        'method': correlation_method,
+        'n_indicators': len(valid_cols),
+        'indicators_included': valid_cols,
+        'sample_size': sample_size
+    }
+
+    return results
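+
+# Usage sketch (illustrative; `df` is a placeholder DataFrame of individual-level
+# indicators):
+#
+#   results = plot_individual_indicators_correlation(df, group_column="country")
+#   results["summary"]["indicators_included"]  # columns that survived cleaning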
+
+
+def compute_quantile_thresholds_by_group(scores, group_labels, quantiles=None):
+    """
+    Compute per-group quantile thresholds for a 1D array of scores.
+
+    Parameters
+    ----------
+    scores : array-like of shape (N,)
+        Scores (e.g., standardized rarity) aligned with labels.
+    group_labels : array-like of shape (N,)
+        Group label per observation.
+    quantiles : Optional[List[float]]
+        Quantiles to compute (e.g., [0.10]). Defaults to [0.10].
+
+    Returns
+    -------
+    dict
+        {group: {"p10": value, ...}}
+    """
+    if quantiles is None:
+        quantiles = [0.10]
+    arr = np.asarray(scores, dtype=float)
+    labels = np.asarray(group_labels)
+    result = {}
+    for g in pd.unique(labels):
+        mask = labels == g
+        vals = arr[mask]
+        vals = vals[~np.isnan(vals)]
+        thresholds_g = {}
+        if vals.size > 0:
+            for q in quantiles:
+                thresholds_g[f"p{int(round(q*100)):02d}"] = float(np.nanquantile(vals, q))
+        else:
+            for q in quantiles:
+                thresholds_g[f"p{int(round(q*100)):02d}"] = np.nan
+        result[g] = thresholds_g
+    return result
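+
+# Usage sketch (illustrative; `scores` and `countries` are placeholder arrays of
+# equal length):
+#
+#   th = compute_quantile_thresholds_by_group(scores, countries, quantiles=[0.05, 0.10])
+#   th["India"]["p10"]  # India's 10th-percentile threshold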
+
+
+def compute_quantile_thresholds_by_group_year(scores, group_labels, year_labels, quantiles=None, min_group_year_size=30):
+    """
+    Compute quantile thresholds by group x year for time-drifting distributions.
+
+    Parameters
+    ----------
+    scores : array-like of shape (N,)
+        Scores aligned with labels.
+    group_labels : array-like of shape (N,)
+        Group label per observation.
+    year_labels : array-like of shape (N,)
+        Year label per observation (int/str).
+    quantiles : Optional[List[float]]
+        Quantiles to compute (e.g., [0.10]). Defaults to [0.10].
+    min_group_year_size : int, default 30
+        Minimum sample size to compute thresholds for a group-year cell. If fewer, returns NaN.
+
+    Returns
+    -------
+    dict
+        {group: {year: {"p10": value, ...}}}
+    """
+    if quantiles is None:
+        quantiles = [0.10]
+    arr = np.asarray(scores, dtype=float)
+    g_arr = np.asarray(group_labels)
+    y_arr = np.asarray(year_labels)
+    result = {}
+    df = pd.DataFrame({"score": arr, "group": g_arr, "year": y_arr})
+    for g, gdf in df.groupby("group"):
+        result[g] = {}
+        for y, ydf in gdf.groupby("year"):
+            vals = ydf["score"].astype(float).to_numpy()
+            vals = vals[~np.isnan(vals)]
+            thresholds_gy = {}
+            if vals.size >= min_group_year_size:
+                for q in quantiles:
+                    thresholds_gy[f"p{int(round(q*100)):02d}"] = float(np.nanquantile(vals, q))
+            else:
+                for q in quantiles:
+                    thresholds_gy[f"p{int(round(q*100)):02d}"] = np.nan
+            result[g][y] = thresholds_gy
+    return result
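+
+# Usage sketch (illustrative; `scores`, `countries`, and `years` are placeholder
+# arrays aligned by observation):
+#
+#   th = compute_quantile_thresholds_by_group_year(scores, countries, years)
+#   th["India"][2005]["p10"]  # NaN if the India-2005 cell has fewer than 30 obs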
1624
+
1625
+
1626
+def compute_path_uniqueness_by_group_suffix(sequences, group_labels):
+    """
+    Compute path uniqueness within each subgroup defined by group_labels, using the
+    suffix-based approach. This is consistent with the convergence module's suffix logic.
+
+    :param sequences: List of sequences.
+    :param group_labels: List of group keys (same length as sequences), e.g., country, gender.
+    :return: List of path uniqueness scores (same order as input).
+    """
+    from collections import defaultdict
+
+    T = len(sequences[0])
+    df = pd.DataFrame({
+        "sequence": sequences,
+        "group": group_labels
+    })
+
+    # Step 1: Precompute suffix frequency tables per group
+    group_suffix_freq = {}
+    for group, group_df in df.groupby("group"):
+        suffix_freq = [defaultdict(int) for _ in range(T)]
+        for seq in group_df["sequence"]:
+            for t in range(T):
+                suffix = tuple(seq[t:])  # suffix from year t to the end
+                suffix_freq[t][suffix] += 1
+        group_suffix_freq[group] = suffix_freq
+
+    # Step 2: Compute path uniqueness per individual using suffix logic
+    uniqueness_scores = []
+    for seq, group in zip(sequences, group_labels):
+        suffix_freq = group_suffix_freq[group]
+        count = 0
+        for t in range(T):
+            suffix = tuple(seq[t:])  # suffix from year t to the end
+            if suffix_freq[t][suffix] == 1:
+                count += 1
+        uniqueness_scores.append(count)
+
+    return uniqueness_scores
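+
+# Usage sketch (illustrative; `seqs` is a placeholder list of equal-length
+# sequences and `genders` a parallel list of labels):
+#
+#   uniq = compute_path_uniqueness_by_group_suffix(seqs, genders)
+#   # Uniqueness is judged only against individuals sharing the same label.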
+
+
+# Provide a default version for backward compatibility
+def compute_path_uniqueness_by_group(sequences, group_labels):
+    """
+    Compute path uniqueness within each subgroup defined by group_labels.
+
+    This default version uses the suffix-based approach (convergence logic).
+    For explicit control, use compute_path_uniqueness_by_group_suffix() or
+    compute_path_uniqueness_by_group_prefix() from the prefix_tree module.
+
+    :param sequences: List of sequences.
+    :param group_labels: List of group keys (same length as sequences), e.g., country, gender.
+    :return: List of path uniqueness scores (same order as input).
+    """
+    return compute_path_uniqueness_by_group_suffix(sequences, group_labels)