sequenzo-0.1.21-cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sequenzo might be problematic.

Files changed (260)
  1. sequenzo/__init__.py +240 -0
  2. sequenzo/big_data/__init__.py +12 -0
  3. sequenzo/big_data/clara/__init__.py +26 -0
  4. sequenzo/big_data/clara/clara.py +467 -0
  5. sequenzo/big_data/clara/utils/__init__.py +27 -0
  6. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  7. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  8. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-311-darwin.so +0 -0
  9. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  10. sequenzo/big_data/clara/visualization.py +88 -0
  11. sequenzo/clustering/KMedoids.py +196 -0
  12. sequenzo/clustering/__init__.py +30 -0
  13. sequenzo/clustering/clustering_c_code.cpython-311-darwin.so +0 -0
  14. sequenzo/clustering/hierarchical_clustering.py +1380 -0
  15. sequenzo/clustering/src/KMedoid.cpp +262 -0
  16. sequenzo/clustering/src/PAM.cpp +236 -0
  17. sequenzo/clustering/src/PAMonce.cpp +234 -0
  18. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  19. sequenzo/clustering/src/cluster_quality.h +128 -0
  20. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  21. sequenzo/clustering/src/module.cpp +228 -0
  22. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  23. sequenzo/clustering/utils/__init__.py +27 -0
  24. sequenzo/clustering/utils/disscenter.py +122 -0
  25. sequenzo/data_preprocessing/__init__.py +20 -0
  26. sequenzo/data_preprocessing/helpers.py +256 -0
  27. sequenzo/datasets/__init__.py +41 -0
  28. sequenzo/datasets/biofam.csv +2001 -0
  29. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  30. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  31. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  32. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  33. sequenzo/datasets/country_co2_emissions.csv +194 -0
  34. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  35. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  36. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  37. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  38. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  39. sequenzo/datasets/mvad.csv +713 -0
  40. sequenzo/datasets/pairfam_family.csv +1867 -0
  41. sequenzo/datasets/polyadic_samplec1.csv +61 -0
  42. sequenzo/datasets/polyadic_samplep1.csv +61 -0
  43. sequenzo/datasets/polyadic_seqc1.csv +61 -0
  44. sequenzo/datasets/polyadic_seqp1.csv +61 -0
  45. sequenzo/define_sequence_data.py +609 -0
  46. sequenzo/dissimilarity_measures/__init__.py +31 -0
  47. sequenzo/dissimilarity_measures/c_code.cpython-311-darwin.so +0 -0
  48. sequenzo/dissimilarity_measures/get_distance_matrix.py +702 -0
  49. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +241 -0
  50. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  51. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  52. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  53. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  54. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  55. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  56. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  57. sequenzo/dissimilarity_measures/src/module.cpp +34 -0
  58. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  59. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  61. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  63. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  65. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  66. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  210. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  211. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  212. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-311-darwin.so +0 -0
  213. sequenzo/dissimilarity_measures/utils/seqconc.cpython-311-darwin.so +0 -0
  214. sequenzo/dissimilarity_measures/utils/seqdss.cpython-311-darwin.so +0 -0
  215. sequenzo/dissimilarity_measures/utils/seqdur.cpython-311-darwin.so +0 -0
  216. sequenzo/dissimilarity_measures/utils/seqlength.cpython-311-darwin.so +0 -0
  217. sequenzo/multidomain/__init__.py +23 -0
  218. sequenzo/multidomain/association_between_domains.py +311 -0
  219. sequenzo/multidomain/cat.py +431 -0
  220. sequenzo/multidomain/combt.py +519 -0
  221. sequenzo/multidomain/dat.py +89 -0
  222. sequenzo/multidomain/idcd.py +139 -0
  223. sequenzo/multidomain/linked_polyad.py +292 -0
  224. sequenzo/openmp_setup.py +233 -0
  225. sequenzo/prefix_tree/__init__.py +43 -0
  226. sequenzo/prefix_tree/individual_level_indicators.py +1274 -0
  227. sequenzo/prefix_tree/system_level_indicators.py +465 -0
  228. sequenzo/prefix_tree/utils.py +54 -0
  229. sequenzo/sequence_characteristics/__init__.py +40 -0
  230. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  231. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  232. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  233. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  234. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  235. sequenzo/sequence_characteristics/turbulence.py +155 -0
  236. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  237. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  238. sequenzo/suffix_tree/__init__.py +48 -0
  239. sequenzo/suffix_tree/individual_level_indicators.py +1638 -0
  240. sequenzo/suffix_tree/system_level_indicators.py +456 -0
  241. sequenzo/suffix_tree/utils.py +56 -0
  242. sequenzo/visualization/__init__.py +29 -0
  243. sequenzo/visualization/plot_mean_time.py +194 -0
  244. sequenzo/visualization/plot_modal_state.py +276 -0
  245. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  246. sequenzo/visualization/plot_relative_frequency.py +404 -0
  247. sequenzo/visualization/plot_sequence_index.py +937 -0
  248. sequenzo/visualization/plot_single_medoid.py +153 -0
  249. sequenzo/visualization/plot_state_distribution.py +613 -0
  250. sequenzo/visualization/plot_transition_matrix.py +190 -0
  251. sequenzo/visualization/utils/__init__.py +23 -0
  252. sequenzo/visualization/utils/utils.py +310 -0
  253. sequenzo/with_event_history_analysis/__init__.py +35 -0
  254. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  255. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  256. sequenzo-0.1.21.dist-info/METADATA +308 -0
  257. sequenzo-0.1.21.dist-info/RECORD +254 -0
  258. sequenzo-0.1.21.dist-info/WHEEL +5 -0
  259. sequenzo-0.1.21.dist-info/licenses/LICENSE +28 -0
  260. sequenzo-0.1.21.dist-info/top_level.txt +1 -0
sequenzo/prefix_tree/individual_level_indicators.py
@@ -0,0 +1,1274 @@
+ """
+ @Author : Yuqi Liang 梁彧祺
+ @File : individual_level_indicators.py
+ @Time : 02/05/2025 11:07
+ @Desc :
+ This module provides methods for calculating individual-level indicators in sequence data analysis.
+ It includes tools to assess sequence divergence, identify divergence timing, measure prefix rarity,
+ and evaluate path uniqueness for individuals or groups.
+
+ These indicators help quantify how typical or unique an individual's sequence is within a population,
+ and can be used for both overall and subgroup analyses.
+ """
+ from collections import defaultdict, Counter
+ from typing import Optional
+ import numpy as np
+ import pandas as pd
+
+
+ class IndividualDivergence:
+     def __init__(self, sequences):
+         # Handle case where sequences might already be an IndividualDivergence object
+         if isinstance(sequences, IndividualDivergence):
+             # Extract sequences from existing object
+             self.sequences = sequences.sequences
+         elif hasattr(sequences, 'sequences'):
+             # Handle case where input might be another object with sequences attribute
+             self.sequences = sequences.sequences
+         else:
+             # Normal case: sequences is a list of sequences
+             self.sequences = sequences
+
+         # Validate input
+         if not self.sequences or len(self.sequences) == 0:
+             raise ValueError("sequences cannot be empty")
+         if not hasattr(self.sequences[0], '__len__') and not hasattr(self.sequences[0], '__iter__'):
+             raise ValueError("sequences must be a list of sequences (e.g., [[1,2,3], [2,3,1], ...])")
+
+         self.T = len(self.sequences[0])
+         self.prefix_freq_by_year = self._build_prefix_frequencies()
+
+     def _build_prefix_frequencies(self):
+         freq_by_year = [defaultdict(int) for _ in range(self.T)]
+         for seq in self.sequences:
+             prefix = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq_by_year[t][tuple(prefix)] += 1
+         return freq_by_year
+
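The prefix-frequency index built above maps, for each year t, every observed prefix (as a tuple) to the number of individuals sharing it. A tiny illustration of the resulting structure:

    # With sequences = [[1, 1], [1, 2], [1, 2]]:
    # prefix_freq_by_year[0] == {(1,): 3}
    # prefix_freq_by_year[1] == {(1, 1): 1, (1, 2): 2}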
+     def compute_diverged(
+         self,
+         z_threshold=1.5,
+         min_t=2,
+         window=1,
+         inclusive=False,
+         group_labels=None,
+         *,
+         method: str = "zscore",
+         proportion: Optional[float] = None,
+         quantile_p: Optional[float] = None,
+         min_count: int = 1,
+     ):
+         """
+         Compute binary divergence flags with multiple selection methods.
+
+         Definition (common intuition): higher prefix rarity implies more atypical behavior.
+         We compute per-year rarity via prefix frequencies and then detect divergence using
+         one of the following methods:
+
+         Methods
+         -------
+         - "zscore" (window-based, default):
+           Uses per-year z-scores of rarity. A person is diverged if there exists a window
+           of length `window` starting between years `[min_t, T - window + 1]` where all
+           z-scores are above `z_threshold` (use `inclusive=True` for `>=`). Zero-variance
+           years remain NaN and any window containing NaN is skipped.
+
+         - "top_proportion" (aka "topk"/"proportion"/"rank"):
+           Uses the aggregated standardized score from `compute_standardized_rarity_score`
+           (higher = more atypical). Selects the most atypical `proportion` within each group if
+           `group_labels` is provided, otherwise globally. `min_count` ensures at least the
+           specified number per group.
+
+         - "quantile":
+           Uses a quantile threshold (`quantile_p`) on the aggregated standardized score,
+           within each group (or globally if no `group_labels`). Individuals at or above the
+           threshold are marked diverged.
+
+         Parameters
+         ----------
+         z_threshold : float, default 1.5
+             zscore method only. Diverged when z > z_threshold (or >= if inclusive=True).
+         min_t : int, default 2
+             First starting year considered (1-indexed). The last starting year is
+             T - window + 1, computed internally.
+         window : int, default 1
+             Number of consecutive years required in the zscore method and used in
+             standardized aggregation.
+         inclusive : bool, default False
+             zscore method only. If True, use >= comparisons.
+         group_labels : array-like or None
+             If provided, proportion/quantile selections are computed within each group.
+         method : str, default "zscore"
+             One of {"zscore", "top_proportion" (aliases: "topk", "proportion", "rank"), "quantile"}.
+         proportion : float or None
+             For top_proportion. Fraction in (0, 1) to select as diverged. Defaults to 0.10 if None.
+         quantile_p : float or None
+             For quantile. Quantile in (0, 1) used as threshold. Defaults to 0.90 if None.
+         min_count : int, default 1
+             For top_proportion. Lower bound for the number selected per group.
+
+         Returns
+         -------
+         List[int]
+             0/1 indicator for each individual.
+         """
+         N = len(self.sequences)
+         method_norm = (method or "zscore").lower()
+         max_t = self.T - window + 1
+
+         # Branch: rank/quantile style selections using aggregated standardized scores
+         if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
+             p = proportion if proportion is not None else 0.10
+             scores = np.asarray(
+                 self.compute_standardized_rarity_score(min_t=min_t, window=window), dtype=float
+             )
+             if group_labels is None:
+                 vals = scores
+                 finite_mask = np.isfinite(vals)
+                 n_valid = int(np.sum(finite_mask))
+                 if n_valid == 0:
+                     return [0] * N
+                 k = int(np.floor(p * n_valid))
+                 if k < int(min_count):
+                     k = int(min_count)
+                 if k > n_valid:
+                     k = n_valid
+                 # For divergence: higher scores = more atypical, so take the largest k values
+                 order = np.argsort(np.where(np.isfinite(vals), vals, -np.inf), kind="mergesort")
+                 flags = np.zeros(N, dtype=int)
+                 if k >= 1:
+                     selected = order[-k:]  # Take the k largest (most divergent)
+                     flags[selected] = 1
+                 return flags.tolist()
+             else:
+                 # Group-wise selection - need to implement similar to suffix_tree
+                 labels = np.asarray(group_labels)
+                 flags = np.zeros(N, dtype=int)
+                 for g in pd.unique(labels):
+                     idx = np.where(labels == g)[0]
+                     vals = scores[idx]
+                     finite_mask = np.isfinite(vals)
+                     n_valid = int(np.sum(finite_mask))
+                     if n_valid == 0:
+                         continue
+                     k = int(np.floor(p * n_valid))
+                     if k < int(min_count):
+                         k = int(min_count)
+                     if k > n_valid:
+                         k = n_valid
+                     order_local = np.argsort(np.where(np.isfinite(vals), vals, -np.inf), kind="mergesort")
+                     if k >= 1:
+                         selected_local = order_local[-k:]  # Take k largest within group
+                         selected_global = idx[selected_local]
+                         flags[selected_global] = 1
+                 return flags.tolist()
+
+         if method_norm == "quantile":
+             q = quantile_p if quantile_p is not None else 0.90  # High quantile for divergence
+             scores = np.asarray(
+                 self.compute_standardized_rarity_score(min_t=min_t, window=window), dtype=float
+             )
+             flags = np.zeros(N, dtype=int)
+             if group_labels is None:
+                 # Global quantile
+                 valid = scores[np.isfinite(scores)]
+                 if valid.size == 0:
+                     return flags.tolist()
+                 try:
+                     xq = float(np.nanquantile(scores, q))
+                 except Exception:
+                     xq = float(np.quantile(valid, q))
+                 flags[np.where(scores >= xq)[0]] = 1
+                 return flags.tolist()
+             else:
+                 labels = np.asarray(group_labels)
+                 for g in pd.unique(labels):
+                     idx = np.where(labels == g)[0]
+                     vals = scores[idx]
+                     valid = vals[np.isfinite(vals)]
+                     if valid.size == 0:
+                         continue
+                     try:
+                         xq = float(np.nanquantile(vals, q))
+                     except Exception:
+                         xq = float(np.quantile(valid, q))
+                     local = np.where(vals >= xq)[0]
+                     flags[idx[local]] = 1
+                 return flags.tolist()
+
+         # Default branch: z-score window logic
+         rarity_matrix = []
+         for seq in self.sequences:
+             prefix = []
+             score = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score)
+
+         rarity_df = pd.DataFrame(rarity_matrix)
+         rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+         # Keep NaNs for zero-variance years and skip NaN windows
+         rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
+
+         flags = []
+         for i in range(N):
+             z = rarity_z.iloc[i]
+             diverged = 0
+             for t in range(min_t - 1, max_t):
+                 # Skip windows containing NaN (zero-variance years)
+                 vals = [z.iloc[t + k] for k in range(window)]
+                 if not np.all(np.isfinite(vals)):
+                     continue
+                 # Divergence = high rarity (more atypical)
+                 if inclusive:
+                     condition = all(v >= z_threshold for v in vals)
+                 else:
+                     condition = all(v > z_threshold for v in vals)
+                 if condition:
+                     diverged = 1
+                     break
+             flags.append(diverged)
+         return flags
+
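A minimal usage sketch for the three selection methods above, assuming the class is importable from this module path (the toy `seqs` data is made up for illustration):

    from sequenzo.prefix_tree.individual_level_indicators import IndividualDivergence

    seqs = [[1, 1, 2, 2], [1, 1, 2, 3], [1, 2, 3, 3], [1, 1, 2, 2]]  # 4 individuals, T = 4
    div = IndividualDivergence(seqs)

    flags_z = div.compute_diverged(z_threshold=1.5, window=1)                    # z-score windows
    flags_top = div.compute_diverged(method="top_proportion", proportion=0.25)   # top 25% most atypical
    flags_q = div.compute_diverged(method="quantile", quantile_p=0.75)           # at or above the 75th percentile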
+     def compute_first_divergence_year(
+         self,
+         z_threshold=1.5,
+         min_t=2,
+         window=1,
+         inclusive=False,
+         group_labels=None,
+         *,
+         method: str = "zscore",
+         proportion: Optional[float] = None,
+         quantile_p: Optional[float] = None,
+         min_count: int = 1,
+     ):
+         """
+         Compute the first divergence year per individual with multiple selection methods.
+
+         Methods
+         -------
+         - "zscore" (default):
+           Find the earliest starting year t in [min_t, T - window + 1] such that all z-scores in the
+           length-`window` block are above `z_threshold` (or >= if inclusive=True). Zero-variance
+           years are NaN; windows containing NaN are skipped.
+
+         - "top_proportion" (aka "topk"/"proportion"/"rank"):
+           Use aggregated standardized scores to pick the most atypical `proportion` within each group
+           (or globally). For the selected individuals, return the earliest t where the per-window
+           max z-score is >= the selection threshold; others return None. `min_count` is respected.
+
+         - "quantile":
+           Use a per-group (or global) quantile threshold `quantile_p` on aggregated standardized scores;
+           individuals at or above the threshold return the earliest qualifying year; others return None.
+
+         Parameters
+         ----------
+         z_threshold, min_t, window, inclusive, group_labels
+             Same definitions as in `compute_diverged` for the zscore method.
+         method : str, default "zscore"
+             One of {"zscore", "top_proportion" (aliases: "topk", "proportion", "rank"), "quantile"}.
+         proportion : float or None
+             For top_proportion. Fraction in (0, 1) to select as diverged. Defaults to 0.10 if None.
+         quantile_p : float or None
+             For quantile. Quantile in (0, 1) used as threshold. Defaults to 0.90 if None.
+         min_count : int, default 1
+             For top_proportion. Lower bound for the number selected per group.
+
+         Returns
+         -------
+         List[Optional[int]]
+             First divergence years (1-indexed). None indicates no divergence.
+         """
+         N = len(self.sequences)
+         method_norm = (method or "zscore").lower()
+         max_t = self.T - window + 1
+
+         # Helper: standardized z matrix and per-t window maxima per individual
+         def _compute_window_max_list():
+             # Build rarity matrix and column-wise z (global standardization)
+             rarity_matrix = []
+             for seq in self.sequences:
+                 prefix = []
+                 score = []
+                 for t in range(self.T):
+                     prefix.append(seq[t])
+                     freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                     score.append(-np.log(freq + 1e-10))
+                 rarity_matrix.append(score)
+             rarity_arr = np.asarray(rarity_matrix, dtype=float)
+             col_means = np.nanmean(rarity_arr, axis=0)
+             col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
+             with np.errstate(invalid='ignore', divide='ignore'):
+                 rarity_z = (rarity_arr - col_means) / col_stds
+             rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
+             # Compute per-individual window maxima sequence over t
+             window_maxes = []  # list of lists, one per individual
+             for i in range(N):
+                 z_scores = rarity_z[i, :]
+                 vals_per_t = []
+                 for t0 in range(min_t - 1, max_t):
+                     vals = [z_scores[t0 + k] for k in range(window)]
+                     if not np.all(np.isfinite(vals)):
+                         vals_per_t.append(np.nan)
+                     else:
+                         vals_per_t.append(float(np.max(vals)))
+                 window_maxes.append(vals_per_t)
+             return np.asarray(window_maxes, dtype=float)
+
+         # Branches for rank/quantile-style thresholds
+         if method_norm in {"top_proportion", "topk", "proportion", "rank", "quantile"}:
+             # Compute aggregated scores for thresholding
+             agg_scores = np.asarray(
+                 self.compute_standardized_rarity_score(min_t=min_t, window=window), dtype=float
+             )
+             per_t_window_max = _compute_window_max_list()
+
+             if method_norm in {"top_proportion", "topk", "proportion", "rank"}:
+                 p = proportion if proportion is not None else 0.10
+                 if group_labels is None:
+                     vals = agg_scores
+                     finite_mask = np.isfinite(vals)
+                     n_valid = int(np.sum(finite_mask))
+                     if n_valid == 0:
+                         return [None] * N
+                     k = int(np.floor(p * n_valid))
+                     if k < int(min_count):
+                         k = int(min_count)
+                     if k > n_valid:
+                         k = n_valid
+                     # For divergence: take highest scores (most atypical)
+                     order = np.argsort(np.where(np.isfinite(vals), vals, -np.inf), kind="mergesort")
+                     selected_idx = set(order[-k:].tolist()) if k >= 1 else set()
+                     years = []
+                     for i in range(N):
+                         if i not in selected_idx:
+                             years.append(None)
+                             continue
+                         wm = per_t_window_max[i]
+                         # threshold value is the k-th largest value
+                         thresh_val = vals[order[-k]] if k >= 1 else np.nan
+                         if not np.isfinite(thresh_val):
+                             years.append(None)
+                             continue
+                         # earliest t where window_max >= threshold
+                         yr = None
+                         for t_idx, wv in enumerate(wm):
+                             if np.isfinite(wv) and wv >= float(thresh_val):
+                                 yr = t_idx + min_t  # Convert back to 1-indexed
+                                 break
+                         years.append(yr)
+                     return years
+                 else:
+                     labels = np.asarray(group_labels)
+                     years = [None] * N
+                     for g in pd.unique(labels):
+                         idx = np.where(labels == g)[0]
+                         vals = agg_scores[idx]
+                         finite_mask = np.isfinite(vals)
+                         n_valid = int(np.sum(finite_mask))
+                         if n_valid == 0:
+                             continue
+                         k = int(np.floor(p * n_valid))
+                         if k < int(min_count):
+                             k = int(min_count)
+                         if k > n_valid:
+                             k = n_valid
+                         order_local = np.argsort(np.where(np.isfinite(vals), vals, -np.inf), kind="mergesort")
+                         selected_local = set(order_local[-k:].tolist()) if k >= 1 else set()
+                         thresh_val = vals[order_local[-k]] if k >= 1 else np.nan
+                         for j_local, i_global in enumerate(idx):
+                             if j_local not in selected_local or not np.isfinite(thresh_val):
+                                 continue
+                             wm = per_t_window_max[i_global]
+                             for t_idx, wv in enumerate(wm):
+                                 if np.isfinite(wv) and wv >= float(thresh_val):
+                                     years[i_global] = int(t_idx + min_t)
+                                     break
+                     return years
+
+             # quantile branch
+             q = quantile_p if quantile_p is not None else 0.90
+             years = [None] * N
+             if group_labels is None:
+                 valid = agg_scores[np.isfinite(agg_scores)]
+                 if valid.size == 0:
+                     return years
+                 try:
+                     xq = float(np.nanquantile(agg_scores, q))
+                 except Exception:
+                     xq = float(np.quantile(valid, q))
+                 for i in range(N):
+                     if not np.isfinite(agg_scores[i]) or agg_scores[i] < xq:
+                         continue
+                     wm = per_t_window_max[i]
+                     for t_idx, wv in enumerate(wm):
+                         if np.isfinite(wv) and wv >= xq:
+                             years[i] = int(t_idx + min_t)
+                             break
+                 return years
+             else:
+                 labels = np.asarray(group_labels)
+                 for g in pd.unique(labels):
+                     idx = np.where(labels == g)[0]
+                     vals = agg_scores[idx]
+                     valid = vals[np.isfinite(vals)]
+                     if valid.size == 0:
+                         continue
+                     try:
+                         xq = float(np.nanquantile(vals, q))
+                     except Exception:
+                         xq = float(np.quantile(valid, q))
+                     for j_local, i_global in enumerate(idx):
+                         if not np.isfinite(vals[j_local]) or vals[j_local] < xq:
+                             continue
+                         wm = per_t_window_max[i_global]
+                         for t_idx, wv in enumerate(wm):
+                             if np.isfinite(wv) and wv >= xq:
+                                 years[i_global] = t_idx + min_t
+                                 break
+                 return years
+
+         # Default branch: z-score window logic
+         rarity_matrix = []
+         for seq in self.sequences:
+             prefix = []
+             score = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score)
+
+         rarity_df = pd.DataFrame(rarity_matrix)
+         rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+         # Keep NaNs for zero-variance years and skip NaN windows
+         rarity_z = rarity_z.replace([np.inf, -np.inf], np.nan)
+
+         years = []
+         for i in range(N):
+             z = rarity_z.iloc[i]
+             year = None
+             for t in range(min_t - 1, max_t):
+                 # Skip windows containing NaN (zero-variance years)
+                 vals = [z.iloc[t + k] for k in range(window)]
+                 if not np.all(np.isfinite(vals)):
+                     continue
+                 # Divergence = high rarity (more atypical)
+                 if inclusive:
+                     condition = all(v >= z_threshold for v in vals)
+                 else:
+                     condition = all(v > z_threshold for v in vals)
+                 if condition:
+                     year = int(t + 1)  # Convert to 1-indexed integer
+                     break
+             years.append(year)
+         return years
+
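Continuing that sketch, the timing variant returns a 1-indexed year or None per individual:

    years = div.compute_first_divergence_year(z_threshold=1.5, min_t=2, window=1)
    # e.g. [None, 3, 2, None]: illustrative output, meaning individuals 2 and 3
    # first cross the threshold in years 3 and 2 respectively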
+     def compute_prefix_rarity_per_year(self, as_dataframe: bool = True, column_prefix: str = "t", zscore: bool = False):
+         """
+         Compute per-year prefix rarity scores for each individual.
+
+         For each individual i and year t (1..T), the rarity score is defined as:
+             rarity_{i,t} = -log( freq(prefix_{i,t}) / N )
+         where prefix_{i,t} is the sequence of observed states up to year t for individual i,
+         freq(prefix) counts how many individuals share that exact prefix up to year t,
+         and N is the total number of individuals.
+
+         Parameters
+         ----------
+         as_dataframe : bool, default True
+             If True, returns a pandas DataFrame with columns f"{column_prefix}1"..f"{column_prefix}T".
+             If False, returns a NumPy array of shape (N, T).
+         column_prefix : str, default "t"
+             Column name prefix when returning a DataFrame.
+         zscore : bool, default False
+             If True, z-standardize the rarity scores column-wise (by year) using
+             the sample standard deviation (ddof=1).
+
+         Returns
+         -------
+         pandas.DataFrame or np.ndarray
+             Per-year rarity scores (optionally z-scored).
+         """
+         N = len(self.sequences)
+         rarity_matrix = []
+
+         for seq in self.sequences:
+             prefix = []
+             score_list = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score_list.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score_list)
+
+         rarity_arr = np.array(rarity_matrix, dtype=float)
+
+         if zscore:
+             # Column-wise z-score; handle zero-std columns gracefully (leave as NaN)
+             col_means = np.nanmean(rarity_arr, axis=0)
+             col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)  # Use sample std for consistency with pandas
+             with np.errstate(invalid='ignore', divide='ignore'):
+                 rarity_arr = (rarity_arr - col_means) / col_stds
+
+         if not as_dataframe:
+             return rarity_arr
+
+         columns = [f"{column_prefix}{t+1}" for t in range(self.T)]
+         return pd.DataFrame(rarity_arr, columns=columns)
+
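The rarity formula is easy to verify by hand: with N = 4 individuals of whom 3 share the prefix (1, 1) at t = 2, that prefix scores -log(3/4) ≈ 0.288, while a prefix unique to one individual scores -log(1/4) ≈ 1.386. Under the same assumptions as the sketch above:

    rarity_df = div.compute_prefix_rarity_per_year()             # DataFrame with columns t1..t4
    rarity_z = div.compute_prefix_rarity_per_year(zscore=True)   # column-wise z-scores (ddof=1)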
+     def compute_prefix_rarity_score(self):
+         rarity_scores = []
+         N = len(self.sequences)
+
+         for seq in self.sequences:
+             prefix = []
+             score = 0.0
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score += -np.log(freq + 1e-10)  # small constant to avoid log(0)
+             rarity_scores.append(score)
+         return rarity_scores
+
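`compute_prefix_rarity_score` is the per-year rarity summed over t = 1..T, so the two methods should agree up to floating-point error; a quick consistency check under the same assumptions:

    import numpy as np

    total = np.asarray(div.compute_prefix_rarity_score())
    per_year = div.compute_prefix_rarity_per_year(as_dataframe=False)
    assert np.allclose(total, per_year.sum(axis=1))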
+     def diagnose_divergence_calculation(self, z_threshold=1.5, min_t=2, window=1):
+         """
+         Diagnostic function to analyze divergence year calculation and identify
+         years with insufficient variance (std ≈ 0) that cannot trigger divergence.
+
+         This is methodologically appropriate: when all individuals follow similar
+         trajectories in a given year, no divergence should be detected.
+
+         Returns:
+         --------
+         dict: Diagnostic information including:
+             - years_with_zero_variance: List of years where std ≈ 0
+             - rarity_std_by_year: Standard deviation of rarity scores per year
+             - n_individuals_with_divergence: Count of individuals with any divergence
+             - divergence_year_distribution: Value counts of divergence years
+         """
+         N = len(self.sequences)
+         rarity_matrix = []
+
+         # Calculate rarity scores (same as in compute_first_divergence_year)
+         for seq in self.sequences:
+             prefix = []
+             score = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score)
+
+         rarity_df = pd.DataFrame(rarity_matrix)
+
+         # Calculate standard deviations by year
+         rarity_std_by_year = rarity_df.std(axis=0)
+         years_with_zero_variance = []
+
+         # Identify years with near-zero variance (threshold can be adjusted)
+         for t, std_val in enumerate(rarity_std_by_year):
+             if pd.isna(std_val) or std_val < 1e-10:
+                 years_with_zero_variance.append(t + 1)  # 1-indexed
+
+         # Calculate z-scores
+         rarity_z = rarity_df.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
+
+         # Count individuals with divergence
+         divergence_years = self.compute_first_divergence_year(z_threshold, min_t, window)
+         n_individuals_with_divergence = sum(1 for year in divergence_years if year is not None)
+
+         # Distribution of divergence years
+         divergence_year_counts = pd.Series(divergence_years).value_counts(dropna=False).sort_index()
+
+         return {
+             'years_with_zero_variance': years_with_zero_variance,
+             'rarity_std_by_year': rarity_std_by_year.tolist(),
+             'n_individuals_with_divergence': n_individuals_with_divergence,
+             'divergence_year_distribution': divergence_year_counts.to_dict(),
+             'total_individuals': N,
+             'parameters_used': {
+                 'z_threshold': z_threshold,
+                 'min_t': min_t,
+                 'window': window
+             }
+         }
+
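A short sketch of reading the diagnostic output (keys as defined in the return dict above):

    diag = div.diagnose_divergence_calculation(z_threshold=1.5, min_t=2, window=1)
    print(diag['years_with_zero_variance'])   # years where std ≈ 0, so no divergence can trigger
    print(diag['n_individuals_with_divergence'], 'of', diag['total_individuals'], 'diverged')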
+     def compute_standardized_rarity_score(self, min_t=2, window=1):
+         """
+         Compute standardized rarity scores for divergence classification and visualization
+         using true statistical z-scores.
+
+         This method computes the standardized rarity score used for individual-level
+         divergence classification:
+             standardized_score_i = max_t min_{k=0..window-1} z_{i,t+k}
+
+         where z_{i,t} are the year-wise standardized prefix rarity scores using column-wise
+         standardization with the sample standard deviation (ddof=1, as computed by pandas).
+
+         The standardized scores can be used with a threshold (e.g., z >= 1.5) to classify
+         individuals as diverged/not diverged, and are particularly useful for visualization.
+
+         Parameters:
+         -----------
+         min_t : int, default=2
+             Minimum year (1-indexed) after which divergence is considered valid
+         window : int, default=1
+             Number of consecutive high-z years required
+
+         Returns:
+         --------
+         List[float]
+             Standardized rarity scores for each individual. Values >= z_threshold indicate divergence.
+
+         Notes:
+         ------
+         The standardization uses the sample standard deviation (ddof=1) for each year column,
+         which is consistent with pandas' default behavior for DataFrame.std().
+         This is essentially the z-score normalized version of the prefix rarity scores.
+         """
+         N = len(self.sequences)
+         # Step 1: Compute rarity matrix (same as in compute_diverged)
+         rarity_matrix = []
+
+         for seq in self.sequences:
+             prefix = []
+             score = []
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 freq = self.prefix_freq_by_year[t][tuple(prefix)] / N
+                 score.append(-np.log(freq + 1e-10))
+             rarity_matrix.append(score)
+
+         # Step 2: Column-wise true z-score standardization (by year, ddof=1)
+         rarity_arr = np.asarray(rarity_matrix, dtype=float)
+         col_means = np.nanmean(rarity_arr, axis=0)
+         col_stds = np.nanstd(rarity_arr, axis=0, ddof=1)
+         with np.errstate(invalid='ignore', divide='ignore'):
+             rarity_z = (rarity_arr - col_means) / col_stds
+         # Keep NaN for zero-variance years to allow window skipping downstream
+         rarity_z = np.where(np.isfinite(rarity_z), rarity_z, np.nan)
+
+         # Step 3: Compute the standardized rarity score for each individual
+         standardized_scores = []
+         for i in range(N):
+             z_scores = rarity_z[i, :]
+             candidate_values = []
+
+             # For each possible starting time t
+             for t in range(min_t - 1, self.T - window + 1):
+                 vals = [z_scores[t + k] for k in range(window)]
+                 # Skip windows containing NaN (e.g., zero-variance years)
+                 if not np.all(np.isfinite(vals)):
+                     continue
+                 # For divergence, take the minimum within the window (ensure all finite)
+                 window_min = float(np.min(vals))
+                 candidate_values.append(window_min)
+
+             # Take the maximum across all starting times
+             if candidate_values:
+                 standardized_score = np.nanmax(candidate_values)
+             else:
+                 standardized_score = np.nan
+
+             standardized_scores.append(standardized_score)
+
+         return standardized_scores
+
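To make the max-over-t of min-within-window aggregation concrete: for z-scores [0.2, 1.8, 1.6, 0.4] with min_t=2 and window=2, the candidate windows are (1.8, 1.6) with min 1.6 and (1.6, 0.4) with min 0.4, so the individual's score is max(1.6, 0.4) = 1.6, which is then compared against a threshold such as 1.5. In code, under the same assumptions as the earlier sketch:

    agg = div.compute_standardized_rarity_score(min_t=2, window=2)  # one float (or NaN) per individual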
+     def compute_path_uniqueness(self):
+         uniqueness_scores = []
+         for seq in self.sequences:
+             prefix = []
+             count = 0
+             for t in range(self.T):
+                 prefix.append(seq[t])
+                 if self.prefix_freq_by_year[t][tuple(prefix)] == 1:
+                     count += 1
+             uniqueness_scores.append(count)
+         return uniqueness_scores
+
+
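Pulling the class-level indicators into one table is a natural next step; a hedged end-to-end sketch (the column names mirror those mentioned in `plot_individual_indicators_correlation` below, but are not a fixed API):

    import pandas as pd

    indicators = pd.DataFrame({
        'diverged': div.compute_diverged(),
        'divergence_year': div.compute_first_divergence_year(),
        'prefix_rarity_score': div.compute_prefix_rarity_score(),
        'path_uniqueness': div.compute_path_uniqueness(),
    })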
694
+ def plot_prefix_rarity_distribution(
695
+ data,
696
+ # === Core Parameters ===
697
+ group_names=None,
698
+ colors=None,
699
+ # === Threshold Settings ===
700
+ show_threshold=True,
701
+ threshold_method="zscore", # Changed default to top_proportion
702
+ proportion_p=0.07, # Simplified parameter name, default 7%
703
+ # === Plotting Options ===
704
+ figsize=(10, 6),
705
+ kde_bw=None,
706
+ # === Export Options ===
707
+ save_as=None,
708
+ dpi=300,
709
+ show=True,
710
+ # === Parameters for Different Methods ===
711
+ z_threshold=1.5,
712
+ is_standardized_score=False,
713
+ quantile_p=0.90
714
+ ):
715
+ """
716
+ Plot prefix rarity score distribution(s) with clean threshold lines.
717
+
718
+ Parameters
719
+ ----------
720
+ data : dict or array-like
721
+ Data to plot. If dict: {"group1": scores1, "group2": scores2}
722
+ If array-like: single group data
723
+ group_names : list, optional
724
+ Custom group names. Auto-detected from dict keys if not provided
725
+ colors : dict or list, optional
726
+ Colors for groups. If None, uses default palette
727
+
728
+ show_threshold : bool, default True
729
+ Whether to show threshold vertical lines
730
+ threshold_method : str, default "top_proportion"
731
+ Threshold method:
732
+ - "top_proportion": Select top proportion_p% most extreme values
733
+ - "quantile": Use quantile_p percentile
734
+ - "zscore": Use z-score threshold (for standardized data)
735
+ proportion_p : float, default 0.05
736
+ Proportion for top_proportion method (e.g., 0.05 = top 5%)
737
+
738
+ figsize : tuple, default (10, 6)
739
+ Figure size (width, height)
740
+ kde_bw : float, optional
741
+ KDE bandwidth adjustment. If None, uses seaborn default
742
+
743
+ save_as : str, optional
744
+ Save path (without extension)
745
+ dpi : int, default 300
746
+ Resolution for saved figure
747
+ show : bool, default True
748
+ Whether to display plot
749
+
750
+ Returns
751
+ -------
752
+ dict
753
+ Statistics including threshold values per group
754
+
755
+ Examples
756
+ --------
757
+ # Basic usage - top 5% threshold (default)
758
+ >>> plot_prefix_rarity_distribution({"India": india_scores, "US": us_scores})
759
+
760
+ # Custom threshold proportion
761
+ >>> plot_prefix_rarity_distribution(
762
+ ... data={"India": india_scores, "US": us_scores},
763
+ ... proportion_p=0.03, # top 3%
764
+ ... save_as="rarity_comparison"
765
+ ... )
766
+
767
+ # Quantile-based threshold
768
+ >>> plot_prefix_rarity_distribution(
769
+ ... data={"India": india_scores, "US": us_scores},
770
+ ... threshold_method="quantile",
771
+ ... quantile_p=0.90, # 90th percentile
772
+ ... )
773
+
774
+ # Clean plot without thresholds
775
+ >>> plot_prefix_rarity_distribution(
776
+ ... data,
777
+ ... show_threshold=False,
778
+ ... colors={"India": "#E8B88A", "US": "#A3BFD9"}
779
+ ... )
780
+ """
781
+ import matplotlib.pyplot as plt
782
+ import seaborn as sns
783
+ import numpy as np
784
+
785
+ # Process input data
786
+ if isinstance(data, dict):
787
+ # Multi-group case
788
+ groups = data
789
+ if group_names is None:
790
+ group_names = list(groups.keys())
791
+ else:
792
+ # Single group case
793
+ if group_names is None:
794
+ group_names = ["Group"]
795
+ groups = {group_names[0]: data}
796
+
797
+ # Set up colors (simplified)
798
+ if colors is None:
799
+ default_colors = ["#A3BFD9", "#E8B88A", "#C6A5CF", "#A6C1A9", "#F4A460", "#87CEEB"]
800
+ color_map = dict(zip(group_names, default_colors[:len(group_names)]))
801
+ elif isinstance(colors, dict):
802
+ color_map = colors
803
+ else:
804
+ color_map = dict(zip(group_names, colors))
805
+
806
+ # Normalize method and prepare stats
807
+ threshold_method = (threshold_method or "top_proportion").lower()
808
+
809
+ # Handle legacy parameter mapping
810
+ if threshold_method in {"top_proportion", "topk", "proportion", "rank"}:
811
+ # Use the simplified proportion_p parameter
812
+ top_proportion_p = proportion_p
813
+ topk_min_count = 1
814
+ elif threshold_method == "quantile":
815
+ # Use quantile_p for quantile method
816
+ pass
817
+ elif threshold_method in {"zscore", "z"} and is_standardized_score:
818
+ # Auto-handle standardized scores
819
+ pass
820
+
821
+ stats = {"per_group": {}, "threshold_method": threshold_method}
822
+
823
+ # Validate quantiles if needed
824
+ def _check_q(q: float):
825
+ if not (0 < float(q) < 1):
826
+ raise ValueError(f"quantile must be in (0,1), got {q}")
827
+ quantiles_to_draw = None
828
+ if threshold_method == "quantile":
829
+ _check_q(quantile_p)
830
+ quantiles_to_draw = [quantile_p] # Simplified - no additional_quantiles
831
+ # Per-group quantile(s)
832
+ for g in group_names:
833
+ if g in groups:
834
+ arr = np.asarray(groups[g], dtype=float)
835
+ # Compute requested quantiles with NaN handling
836
+ valid = arr[~np.isnan(arr)]
837
+ thresholds_g = {}
838
+ if valid.size > 0:
839
+ for q in quantiles_to_draw:
840
+ try:
841
+ xq = float(np.nanquantile(arr, q))
842
+ except Exception:
843
+ xq = float(np.quantile(valid, q))
844
+ thresholds_g[f"p{int(round(q*100)):02d}"] = xq
845
+ else:
846
+ for q in quantiles_to_draw:
847
+ thresholds_g[f"p{int(round(q*100)):02d}"] = np.nan
848
+ # Primary threshold (for backward compatibility)
849
+ primary_label = f"p{int(round(quantile_p*100)):02d}"
850
+ primary_value = thresholds_g.get(primary_label, np.nan)
851
+ # Proportion below primary
852
+ vals = valid
853
+ prop_below = float(np.nanmean(vals <= primary_value)) if vals.size > 0 and not np.isnan(primary_value) else np.nan
854
+ stats["per_group"][g] = {
855
+ "threshold_values": thresholds_g,
856
+ "is_group_relative": True,
857
+ "threshold_value": primary_value,
858
+ "primary_quantile": primary_label,
859
+ "prop_below": prop_below
860
+ }
861
+ elif threshold_method in {"zscore", "z"}:
862
+ # z-score method (backward compatibility)
863
+ for g in group_names:
864
+ if g in groups:
865
+ arr = np.asarray(groups[g], dtype=float)
866
+ mean_g = np.nanmean(arr)
867
+ std_g = np.nanstd(arr, ddof=1) # sample std to match pandas
868
+ if is_standardized_score:
869
+ x_thresh_g = float(z_threshold)
870
+ else:
871
+ # For prefix (divergence): high scores indicate divergence, so mean + z*std
872
+ x_thresh_g = float(mean_g + z_threshold * std_g)
873
+ vals = arr[~np.isnan(arr)]
874
+ prop_above = float(np.nanmean(vals >= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else np.nan
875
+ stats["per_group"][g] = {
876
+ "mean": float(mean_g),
877
+ "std": float(std_g),
878
+ "threshold_value": float(x_thresh_g),
879
+ "z_threshold": float(z_threshold),
880
+ "is_group_relative": True,
881
+ "prop_above": prop_above,
882
+ "num_above": int(np.sum(vals >= x_thresh_g)) if vals.size > 0 and not np.isnan(x_thresh_g) else 0,
883
+ "n": int(vals.size)
884
+ }
885
+ elif threshold_method in {"topk", "top_proportion", "proportion", "rank"}:
886
+ # Rank-based proportion selection within each group: pick top p% (highest values for prefix divergence)
887
+ if not (0 < float(proportion_p) < 1):
888
+ raise ValueError(f"proportion_p must be in (0,1), got {proportion_p}")
889
+ top_proportion_p = proportion_p # Map to internal variable
890
+ for g in group_names:
891
+ if g in groups:
892
+ arr = np.asarray(groups[g], dtype=float)
893
+ finite_mask = np.isfinite(arr)
894
+ vals = arr[finite_mask]
895
+ n_valid = int(vals.size)
896
+ if n_valid == 0:
897
+ stats["per_group"][g] = {
898
+ "threshold_value": np.nan,
899
+ "k": 0,
900
+ "n": 0,
901
+ "prop_selected": np.nan,
902
+ "num_geq_threshold": 0
903
+ }
904
+ continue
905
+ k = int(np.floor(top_proportion_p * n_valid))
906
+ if k < int(topk_min_count):
907
+ k = int(topk_min_count)
908
+ if k > n_valid:
909
+ k = n_valid
910
+ # Sort descending (most divergent first for prefix)
911
+ order = np.argsort(vals, kind="mergesort")[::-1]
912
+ thresh_val = vals[order[k - 1]] if k >= 1 else np.nan
913
+ num_geq = int(np.sum(vals >= thresh_val)) if k >= 1 and np.isfinite(thresh_val) else 0
914
+ stats["per_group"][g] = {
915
+ "threshold_value": float(thresh_val) if np.isfinite(thresh_val) else np.nan,
916
+ "k": int(k),
917
+ "n": int(n_valid),
918
+ "prop_selected": (k / n_valid) if n_valid > 0 else np.nan,
919
+ "num_geq_threshold": num_geq
920
+ }
921
+ stats["threshold_method"] = "topk"
922
+ else:
923
+ raise ValueError(f"Unknown threshold_method: {threshold_method}")
+
+    # Create plot
+    plt.figure(figsize=figsize)
+
+    # Plot the per-group score distributions as KDEs
+    for group_name in group_names:
+        if group_name in groups:
+            scores = groups[group_name]
+            color = color_map.get(group_name, "#1f77b4")
+            arr = np.asarray(scores, dtype=float)
+            vmin = np.nanmin(arr) if np.isfinite(arr).any() else None
+            vmax = np.nanmax(arr) if np.isfinite(arr).any() else None
+            kde_kwargs = {"label": group_name, "fill": True, "color": color, "linewidth": 2}
+            if kde_bw is not None:
+                kde_kwargs["bw_adjust"] = kde_bw
+            if vmin is not None and vmax is not None and vmin < vmax:
+                # Clip the KDE support to the observed range so no density mass
+                # appears outside the data
+                kde_kwargs["clip"] = (vmin, vmax)
+            sns.kdeplot(arr, **kde_kwargs)
+
+    # Add per-group threshold lines if requested (color-matched, no text labels)
+    if show_threshold:
+        for g in group_names:
+            if g in stats["per_group"]:
+                color = color_map.get(g, "#1f77b4")
+                if threshold_method == "quantile":
+                    thresholds_g = stats["per_group"][g]["threshold_values"]
+                    # Draw one line per requested quantile
+                    for q_lbl, xg in sorted(thresholds_g.items()):
+                        if np.isnan(xg):
+                            continue
+                        plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+                elif threshold_method in {"zscore", "z"}:
+                    xg = stats["per_group"][g]["threshold_value"]
+                    if np.isfinite(xg):
+                        plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+                else:  # rank-based ("topk" and its aliases)
+                    xg = stats["per_group"][g]["threshold_value"]
+                    if np.isfinite(xg):
+                        plt.axvline(xg, color=color, linestyle="--", linewidth=1.6)
+
+    # Formatting
+    if is_standardized_score:
+        plt.xlabel("Standardized Prefix Rarity Score", fontsize=13)
+    else:
+        plt.xlabel("Prefix Rarity Score", fontsize=13)
+    plt.ylabel("Density", fontsize=13)
+    if len(group_names) > 1:
+        plt.legend(title="Group")
+    sns.despine()
+    plt.tight_layout()
+
+    # Save and show
+    if save_as:
+        plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')
+
+    if show:
+        plt.show()
+
+    return stats
+
+
+def plot_individual_indicators_correlation(
+    df,
+    indicator_columns=None,
+    correlation_method='pearson',
+    group_column=None,
+    figsize=(10, 8),
+    cmap='RdBu_r',
+    center=0,
+    annot=True,
+    fmt='.2f',
+    save_as=None,
+    dpi=300,
+    show=True
+):
+    """
+    Plot a correlation heatmap of individual-level indicators.
+
+    Parameters:
+    -----------
+    df : pandas.DataFrame
+        DataFrame containing individual-level indicators
+    indicator_columns : list, optional
+        List of column names to include in the correlation analysis.
+        If None, indicator columns are detected automatically (diverged,
+        divergence_year, prefix_rarity_score, path_uniqueness, etc.)
+    correlation_method : str, default='pearson'
+        Correlation method: 'pearson', 'spearman', or 'kendall'
+    group_column : str, optional
+        Column name for grouping (e.g., 'country'). If provided, a separate
+        heatmap is drawn for each group
+    figsize : tuple, default=(10, 8)
+        Figure size (width, height)
+    cmap : str, default='RdBu_r'
+        Colormap for the heatmap. Options: 'RdBu_r', 'coolwarm', 'viridis', 'plasma'
+    center : float, default=0
+        Value at which to center the colormap
+    annot : bool, default=True
+        Whether to annotate cells with correlation values
+    fmt : str, default='.2f'
+        Format string for the annotations
+    save_as : str, optional
+        Path to save the figure (without extension)
+    dpi : int, default=300
+        DPI for saving
+    show : bool, default=True
+        Whether to display the plot
+
+    Returns:
+    --------
+    dict: Correlation matrix/matrices and summary statistics
+
+    Example:
+    --------
+    # Basic usage
+    >>> plot_individual_indicators_correlation(df)
+
+    # Custom indicators with grouping
+    >>> plot_individual_indicators_correlation(
+    ...     df,
+    ...     indicator_columns=['diverged', 'prefix_rarity_score', 'path_uniqueness'],
+    ...     group_column='country',
+    ...     correlation_method='spearman'
+    ... )
+
+    # Custom styling
+    >>> plot_individual_indicators_correlation(
+    ...     df,
+    ...     cmap='plasma',
+    ...     figsize=(12, 10),
+    ...     save_as="indicators_correlation_heatmap"
+    ... )
+    """
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    import pandas as pd
+    import numpy as np
+
+    # Auto-detect indicator columns if not provided
+    if indicator_columns is None:
+        # Common individual-level indicator names
+        potential_indicators = [
+            'diverged', 'first_divergence_year', 'divergence_year',
+            'prefix_rarity_score', 'path_uniqueness',
+            'rarity_score', 'uniqueness_score'
+        ]
+        indicator_columns = [col for col in df.columns if col in potential_indicators]
+
+        # Also include numeric columns whose names suggest an indicator
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        for col in numeric_cols:
+            if col not in indicator_columns and any(
+                keyword in col.lower() for keyword in
+                ['score', 'index', 'count', 'factor', 'rate', 'ratio']
+            ):
+                indicator_columns.append(col)
+        if not indicator_columns:
+            raise ValueError("No indicator columns detected; pass indicator_columns explicitly.")
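+        # Illustrative sketch with hypothetical column names: for a frame with
+        # columns ['country', 'diverged', 'prefix_rarity_score', 'employment_rate'],
+        # the name list above picks 'diverged' and 'prefix_rarity_score', and the
+        # keyword scan then adds 'employment_rate' (numeric, contains 'rate').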
+
+    # Filter and clean data
+    df_indicators = df[indicator_columns].copy()
+
+    # Handle missing values and convert data types
+    for col in df_indicators.columns:
+        if df_indicators[col].dtype == 'object':
+            # Try to convert to numeric (non-convertible values become NaN)
+            df_indicators[col] = pd.to_numeric(df_indicators[col], errors='coerce')
+
+    # Remove columns with too many missing values (> 50%)
+    valid_cols = []
+    for col in df_indicators.columns:
+        if df_indicators[col].notna().sum() / len(df_indicators) > 0.5:
+            valid_cols.append(col)
+
+    df_indicators = df_indicators[valid_cols]
+
+    # Drop rows with any missing values for the correlation calculation
+    df_clean = df_indicators.dropna()
+
+    if len(df_clean) == 0:
+        raise ValueError("No valid data remaining after cleaning. Check for missing values.")
+
+    # Calculate correlations
+    results = {}
+
+    if group_column is None or group_column not in df.columns:
+        # Single correlation matrix
+        corr_matrix = df_clean.corr(method=correlation_method)
+        results['overall'] = corr_matrix
+
+        # Create plot
+        plt.figure(figsize=figsize)
+
+        # Create mask for the upper triangle (optional - makes the plot cleaner)
+        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
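+        # For a 3x3 matrix, np.triu(...) is [[T, T, T], [F, T, T], [F, F, T]]:
+        # masked (True) cells are hidden, so only the strictly lower triangle
+        # of the symmetric correlation matrix is drawn.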
+
+        # Generate heatmap
+        sns.heatmap(
+            corr_matrix,
+            mask=mask,
+            annot=annot,
+            fmt=fmt,
+            cmap=cmap,
+            center=center,
+            square=True,
+            cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"},
+            linewidths=0.5
+        )
+
+        plt.title(f"Individual-Level Indicators Correlation Heatmap\n({correlation_method.title()} Correlation)",
+                  fontsize=14, pad=20)
+        plt.xticks(rotation=45, ha='right')
+        plt.yticks(rotation=0)
+
+    else:
+        # Group-based correlation matrices
+        groups = df[group_column].unique()
+        n_groups = len(groups)
+
+        # Calculate subplot layout
+        if n_groups <= 2:
+            nrows, ncols = 1, n_groups
+            figsize = (figsize[0] * n_groups, figsize[1])
+        else:
+            ncols = min(3, n_groups)
+            nrows = (n_groups + ncols - 1) // ncols
+            figsize = (figsize[0] * ncols, figsize[1] * nrows)
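+        # e.g. n_groups = 5 gives ncols = 3 and nrows = (5 + 3 - 1) // 3 = 2,
+        # i.e. a 2x3 grid with one unused panel hidden later.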
+
+        fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
+        # Normalize axes to a flat array so axes[i] works for every layout
+        axes = np.atleast_1d(axes).flatten()
+
+        for i, group in enumerate(groups):
+            # Coerce to numeric and drop incomplete rows, mirroring the overall path
+            group_data = (df[df[group_column] == group][valid_cols]
+                          .apply(pd.to_numeric, errors='coerce')
+                          .dropna())
+
+            if len(group_data) < 2:
+                print(f"Warning: Group '{group}' has insufficient data for correlation")
+                continue
+
+            corr_matrix = group_data.corr(method=correlation_method)
+            results[group] = corr_matrix
+
+            # Create mask for upper triangle
+            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+
+            # Plot heatmap
+            sns.heatmap(
+                corr_matrix,
+                mask=mask,
+                annot=annot,
+                fmt=fmt,
+                cmap=cmap,
+                center=center,
+                square=True,
+                cbar=(i == 0),  # Only show the colorbar on the first subplot
+                cbar_kws={"shrink": .8, "label": f"{correlation_method.title()} Correlation"} if i == 0 else {},
+                linewidths=0.5,
+                ax=axes[i]
+            )
+
+            axes[i].set_title(f"{group}\n({len(group_data)} individuals)", fontsize=12)
+            axes[i].set_xticks(axes[i].get_xticks())
+            axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45, ha='right')
+            axes[i].set_yticks(axes[i].get_yticks())
+            axes[i].set_yticklabels(axes[i].get_yticklabels(), rotation=0)
+
+        # Hide unused subplots
+        for j in range(n_groups, len(axes)):
+            axes[j].set_visible(False)
+
+        plt.suptitle(f"Individual-Level Indicators Correlation by {group_column.title()}\n({correlation_method.title()} Correlation)",
+                     fontsize=16, y=0.98)
+
+        plt.tight_layout()
+
+    # Save and show
+    if save_as:
+        plt.savefig(f"{save_as}.png", dpi=dpi, bbox_inches='tight')
+
+    if show:
+        plt.show()
+
+    # Add summary statistics (the condition must match the plotting branch above,
+    # otherwise a group_column absent from df would raise a KeyError here)
+    if group_column is None or group_column not in df.columns:
+        sample_size = len(df_clean)
+    else:
+        sizes = {}
+        for g in df[group_column].unique():
+            g_clean = df[df[group_column] == g][valid_cols].apply(pd.to_numeric, errors='coerce').dropna()
+            sizes[g] = len(g_clean)
+        sample_size = sizes
+
+    results['summary'] = {
+        'method': correlation_method,
+        'n_indicators': len(valid_cols),
+        'indicators_included': valid_cols,
+        'sample_size': sample_size
+    }
+
+    return results
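+# Usage sketch (df is assumed to already hold the indicator columns):
+# >>> results = plot_individual_indicators_correlation(df, show=False)
+# >>> results['overall']    # correlation matrix when no group_column is given
+# >>> results['summary']    # method, indicators included, and sample size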
+
+
+def compute_path_uniqueness_by_group_prefix(sequences, group_labels):
+    """
+    Compute path uniqueness within each subgroup defined by group_labels,
+    using the same prefix-based logic as the divergence module.
+
+    All sequences must have the same length.
+
+    :param sequences: List of sequences (each a list of states of equal length).
+    :param group_labels: List of group keys (same length as sequences), e.g., country, gender.
+    :return: List of path uniqueness scores (same order as input).
+    """
+    from collections import defaultdict
+    import pandas as pd  # local import, matching the other helpers in this module
+
+    # All sequences are assumed to share the same length T
+    T = len(sequences[0])
+    df = pd.DataFrame({
+        "sequence": sequences,
+        "group": group_labels
+    })
+
+    # Step 1: Precompute prefix frequency tables per group
+    group_prefix_freq = {}
+    for group, group_df in df.groupby("group"):
+        prefix_freq = [defaultdict(int) for _ in range(T)]
+        for seq in group_df["sequence"]:
+            prefix = []
+            for t in range(T):
+                prefix.append(seq[t])
+                prefix_freq[t][tuple(prefix)] += 1
+        group_prefix_freq[group] = prefix_freq
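+    # Illustrative sketch: for a group containing ["A", "B"], ["A", "C"] and
+    # ["A", "B"], the tables are prefix_freq[0] = {("A",): 3} and
+    # prefix_freq[1] = {("A", "B"): 2, ("A", "C"): 1}.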
+
+    # Step 2: Compute path uniqueness per individual: the number of time points
+    # at which the individual's prefix is unique within its group
+    uniqueness_scores = []
+    for seq, group in zip(sequences, group_labels):
+        prefix_freq = group_prefix_freq[group]
+        prefix = []
+        count = 0
+        for t in range(T):
+            prefix.append(seq[t])
+            if prefix_freq[t][tuple(prefix)] == 1:
+                count += 1
+        uniqueness_scores.append(count)
+
+    return uniqueness_scores
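+# Usage sketch for the helper above (toy data):
+# >>> seqs = [["A", "B"], ["A", "C"], ["A", "B"]]
+# >>> compute_path_uniqueness_by_group_prefix(seqs, ["g1", "g1", "g1"])
+# [0, 1, 0]  # only the second sequence has a prefix, ("A", "C"), unique in its group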