sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,593 @@
1
+ """
2
+ @Author : Yuqi Liang 梁彧祺
3
+ @File : plot_characteristics.py
4
+ @Time : 2025/9/24 23:22
5
+ @Desc : Plot longitudinal characteristics of sequences with elegant visualization
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import matplotlib.pyplot as plt
11
+ import warnings
12
+ from matplotlib.font_manager import FontProperties
13
+
14
+ # Import the correct functions from sequence characteristics modules
15
+ from .simple_characteristics import get_number_of_transitions
16
+ from .within_sequence_entropy import get_within_sequence_entropy
17
+ from .turbulence import get_turbulence
18
+ from .complexity_index import get_complexity_index
19
+ from .overall_cross_sectional_entropy import get_cross_sectional_entropy
20
+
21
+ # Import visualization utilities
22
try:
    from ..visualization.utils.utils import set_up_time_labels_for_x_axis
except ImportError:
    # The shared helper could not be imported; define a local stand-in
    # under the same name so the rest of this module can keep calling it.
    def set_up_time_labels_for_x_axis(seqdata, ax, color="gray"):
        """
        Put a thinned-out set of time labels on the x-axis of ``ax``.

        Parameters
        ----------
        seqdata : object
            Anything exposing a ``cleaned_time`` sequence; its entries are
            used as the tick label values.
        ax : matplotlib Axes-like
            Target axes; only ``set_xticks`` and ``set_xticklabels`` are called.
        color : str, optional (default="gray")
            Color passed through to the tick labels.

        Notes
        -----
        Tick density depends on the number of time steps:
        up to 10 steps -> every step is labelled,
        11 to 20 steps -> every second step is labelled,
        more than 20   -> 10 evenly spaced positions (truncated to int).
        """
        labels = np.array(seqdata.cleaned_time)
        n_steps = len(labels)

        # Pick the tick positions, checking the densest case first.
        if n_steps > 20:
            tick_idx = np.linspace(0, n_steps - 1, num=10, dtype=int)
        elif n_steps > 10:
            tick_idx = np.arange(0, n_steps, step=2)
        else:
            tick_idx = np.arange(n_steps)

        ax.set_xticks(tick_idx)
        ax.set_xticklabels(
            labels[tick_idx],
            fontsize=10,
            rotation=0,
            ha="center",
            color=color,
        )
39
+
40
+ def plot_longitudinal_characteristics(seqdata,
41
+ pick_ids=None,
42
+ k=9,
43
+ selection='first',
44
+ order_by="complexity",
45
+ figsize=(8, 6),
46
+ fontsize=12,
47
+ title=None,
48
+ show_title=True,
49
+ xlabel="Normalized Values",
50
+ ylabel="Sequence ID",
51
+ save_as=None,
52
+ dpi=200,
53
+ custom_colors=None,
54
+ show_sequence_ids=False,
55
+ id_as_column=True):
56
+ """
57
+ Create a horizontal bar chart showing four key characteristics for selected sequences.
58
+
59
+ This function calculates and visualizes four important sequence characteristics:
60
+ - Transitions: How many times sequences change from one state to another
61
+ - Entropy: How diverse/varied the sequences are
62
+ - Turbulence: How chaotic or unpredictable the sequences are
63
+ - Complexity: How complex the overall pattern is
64
+
65
+ All values are normalized to 0-1 scale for easy comparison.
66
+
67
+ Parameters
68
+ ----------
69
+ seqdata : SequenceData
70
+ Your sequence data object containing the sequences to analyze.
71
+
72
+ pick_ids : list, optional (default=None)
73
+ Specific sequence IDs you want to plot. If provided, only these sequences
74
+ will be shown. If None, the function will automatically select sequences
75
+ based on the 'selection' and 'k' parameters.
76
+ Example: [1, 5, 10, 23] to show sequences with IDs 1, 5, 10, and 23
77
+
78
+ k : int, optional (default=9)
79
+ Number of sequences to display when pick_ids is None.
80
+ Warning: Using more than 15 may make the plot hard to read.
81
+
82
+ selection : str, optional (default='first')
83
+ How to select sequences when pick_ids is None:
84
+ - 'first': Show the k sequences with highest values for the order_by metric
85
+ - 'last': Show the k sequences with lowest values for the order_by metric
86
+
87
+ order_by : str, optional (default='complexity')
88
+ Which metric to use for sorting sequences when pick_ids is None:
89
+ - 'transitions': Sort by number of state changes
90
+ - 'entropy': Sort by sequence diversity
91
+ - 'turbulence': Sort by sequence unpredictability
92
+ - 'complexity': Sort by overall sequence complexity
93
+
94
+ figsize : tuple, optional (default=(8, 6))
95
+ Size of the plot as (width, height) in inches.
96
+ Example: (10, 8) for a larger plot, (6, 4) for a smaller one
97
+
98
+ fontsize : int, optional (default=12)
99
+ Base font size for labels, ticks, and legend. Title uses fontsize+2.
100
+
101
+ title : str, optional (default=None)
102
+ Title to display at the top of the plot. If None, no title is shown.
103
+ Example: "Sequence Characteristics Comparison"
104
+
105
+ show_title : bool, optional (default=True)
106
+ Whether to display the title. If False, no title will be shown regardless
107
+ of the title parameter value. This provides consistent control with other plots.
108
+
109
+ xlabel : str, optional (default="Normalized Values")
110
+ Label for the horizontal axis (x-axis).
111
+
112
+ ylabel : str, optional (default="Sequence ID")
113
+ Label for the vertical axis (y-axis).
114
+
115
+ save_as : str, optional (default=None)
116
+ File path to save the plot. If None, plot will only be displayed.
117
+ Supported formats: .png, .jpg, .jpeg, .pdf, .svg
118
+ If no extension provided, .png will be added automatically.
119
+
120
+ dpi : int, optional (default=200)
121
+ Resolution (dots per inch) for saved image. Higher values result in
122
+ better quality but larger file sizes.
123
+
124
+ custom_colors : dict or list, optional (default=None)
125
+ Colors used for the four bars. If dict, keys can include
126
+ {'Transitions', 'Entropy', 'Turbulence', 'Complexity'} to override defaults.
127
+ If list/tuple of length 4, it maps to the above order.
128
+
129
+ show_sequence_ids : bool, optional (default=False)
130
+ If True, y-axis shows actual sequence IDs (when available).
131
+ If False, shows 1..N index positions.
132
+
133
+ id_as_column : bool, optional (default=True)
134
+ If True, the returned DataFrame will include ID as a separate column on the same level as other columns.
135
+ If False, IDs will be used as the DataFrame index.
136
+
137
+ Returns
138
+ -------
139
+ pandas.DataFrame
140
+ A DataFrame containing the calculated metrics for all plotted sequences.
141
+ If id_as_column=True: Columns: ['ID', 'Transitions', 'Entropy', 'Turbulence', 'Complexity'] (all columns at same level)
142
+ If id_as_column=False: Columns: ['Transitions', 'Entropy', 'Turbulence', 'Complexity'], Index: The sequence IDs
143
+
144
+ Warnings
145
+ --------
146
+ - If you try to plot more than 15 sequences, you'll get a warning about
147
+ potential overplotting (too crowded to read clearly)
148
+ - All metric values are automatically normalized to 0-1 scale
149
+
150
+ Examples
151
+ --------
152
+ Basic usage - plot 9 most complex sequences:
153
+ >>> metrics = plot_longitudinal_characteristics(my_seqdata)
154
+
155
+ Plot specific sequences by ID:
156
+ >>> metrics = plot_longitudinal_characteristics(my_seqdata,
157
+ ... pick_ids=[1, 5, 10, 15])
158
+
159
+ Plot 5 sequences with highest number of transitions:
160
+ >>> metrics = plot_longitudinal_characteristics(my_seqdata,
161
+ ... k=5,
162
+ ... order_by='transitions')
163
+
164
+ Customize the plot appearance:
165
+ >>> metrics = plot_longitudinal_characteristics(my_seqdata,
166
+ ... k=6,
167
+ ... figsize=(12, 8),
168
+ ... title="My Sequence Analysis",
169
+ ... xlabel="Characteristic Scores",
170
+ ... ylabel="Person ID")
171
+
172
+ Plot without title:
173
+ >>> metrics = plot_longitudinal_characteristics(my_seqdata, show_title=False)
174
+
175
+ Save plot to file:
176
+ >>> metrics = plot_longitudinal_characteristics(my_seqdata,
177
+ ... save_as="sequence_characteristics.png",
178
+ ... dpi=300)
179
+
180
+ Save as PDF:
181
+ >>> metrics = plot_longitudinal_characteristics(my_seqdata,
182
+ ... save_as="characteristics_analysis.pdf")
183
+
184
+ Notes
185
+ -----
186
+ The four characteristics help you understand different aspects of your sequences:
187
+
188
+ - **Transitions**: Higher values mean sequences change states frequently
189
+ - **Entropy**: Higher values mean sequences have more diverse states
190
+ - **Turbulence**: Higher values mean sequences are more unpredictable
191
+ - **Complexity**: Higher values mean sequences have more complex patterns
192
+
193
+ All values range from 0 to 1, making them easy to compare across different
194
+ types of sequences and datasets.
195
+ """
196
+ # Calculate four metrics (all should be 0-1 normalized)
197
+ df_t = get_number_of_transitions(seqdata=seqdata, norm=True).iloc[:, 1] # Series
198
+ df_e = get_within_sequence_entropy(seqdata=seqdata, norm=True) # Series or single-column DataFrame
199
+ if isinstance(df_e, pd.DataFrame): df_e = df_e.iloc[:, 1]
200
+
201
+ df_tb = get_turbulence(seqdata=seqdata, norm=True, type=2, id_as_column=True) # Normalized turbulence
202
+ if isinstance(df_tb, pd.DataFrame): df_tb = df_tb.iloc[:, 1]
203
+
204
+ df_c = get_complexity_index(seqdata=seqdata) # Already 0-1 normalized
205
+ if isinstance(df_c, pd.DataFrame): df_c = df_c.iloc[:, 1]
206
+
207
+ # Create metrics DataFrame with actual sequence IDs as index
208
+ metrics = pd.DataFrame({
209
+ "Transitions": df_t,
210
+ "Entropy": df_e,
211
+ "Turbulence": df_tb,
212
+ "Complexity": df_c
213
+ })
214
+
215
+ # Set the index to actual sequence IDs if available
216
+ if hasattr(seqdata, 'ids') and seqdata.ids is not None:
217
+ metrics.index = seqdata.ids
218
+
219
+ # Check for overplotting and issue warning if needed
220
+ if pick_ids is not None:
221
+ num_sequences = len(pick_ids)
222
+ if num_sequences > 15:
223
+ warnings.warn(f"Plotting {num_sequences} sequences may cause overplotting issues. "
224
+ f"Consider reducing to 15 or fewer sequences for better visualization.",
225
+ UserWarning)
226
+ elif k > 15:
227
+ warnings.warn(f"Plotting {k} sequences may cause overplotting issues. "
228
+ f"Consider reducing to 15 or fewer sequences for better visualization.",
229
+ UserWarning)
230
+
231
+ # Select sequences to display
232
+ if pick_ids is not None:
233
+ # Custom ID selection
234
+ metrics = metrics.loc[pick_ids]
235
+ else:
236
+ # Sort by specified metric and select first/last k sequences
237
+ key = order_by.capitalize()
238
+ if key not in metrics.columns:
239
+ key = "Complexity"
240
+
241
+ metrics_sorted = metrics.sort_values(key, ascending=False)
242
+
243
+ if selection == 'first':
244
+ metrics = metrics_sorted.head(k)
245
+ elif selection == 'last':
246
+ metrics = metrics_sorted.tail(k)
247
+ else:
248
+ # Default to first k sequences
249
+ metrics = metrics_sorted.head(k)
250
+
251
+ # Create horizontal grouped bar chart
252
+ # Use the DataFrame index which now contains the actual sequence IDs
253
+ if show_sequence_ids:
254
+ labels = list(metrics.index)
255
+ else:
256
+ labels = list(range(1, len(metrics) + 1))
257
+ y = np.arange(len(metrics))
258
+ bar_h = 0.18
259
+
260
+ # Basic matplotlib styling
261
+ plt.figure(figsize=figsize)
262
+ ax = plt.gca()
263
+
264
+ # Add simple background grid
265
+ ax.grid(True, axis='x', alpha=0.3)
266
+ ax.set_axisbelow(True)
267
+
268
+ # Axis/text color theme
269
+ axis_gray = '#666666'
270
+
271
+ # Add title only if provided and show_title is True
272
+ if show_title and title is not None:
273
+ plt.title(title, fontsize=fontsize + 2, color=axis_gray)
274
+
275
+ # Color palette with optional overrides
276
+ default_colors = {
277
+ 'Transitions': '#74C9B4', # Soft green
278
+ 'Entropy': '#A6E3D0', # Light green
279
+ 'Turbulence': '#F9E79F', # Light yellow
280
+ 'Complexity': '#F6CDA3' # Light orange
281
+ }
282
+
283
+ if isinstance(custom_colors, dict):
284
+ colors = {**default_colors, **custom_colors}
285
+ elif isinstance(custom_colors, (list, tuple)) and len(custom_colors) == 4:
286
+ ordered_keys = ['Transitions', 'Entropy', 'Turbulence', 'Complexity']
287
+ colors = {k: v for k, v in zip(ordered_keys, custom_colors)}
288
+ else:
289
+ colors = default_colors
290
+
291
+ plt.barh(y + 0.30, metrics["Transitions"].values, height=bar_h,
292
+ label="Transitions", color=colors['Transitions'])
293
+ plt.barh(y + 0.10, metrics["Entropy"].values, height=bar_h,
294
+ label="Entropy", color=colors['Entropy'])
295
+ plt.barh(y - 0.10, metrics["Turbulence"].values, height=bar_h,
296
+ label="Turbulence", color=colors['Turbulence'])
297
+ plt.barh(y - 0.30, metrics["Complexity"].values, height=bar_h,
298
+ label="Complexity", color=colors['Complexity'])
299
+
300
+ # Use actual sequence IDs as y-tick labels
301
+ plt.yticks(y, labels)
302
+ plt.xlim(0, 1)
303
+
304
+ # Use custom labels with refined spacing
305
+ ax.set_xlabel(xlabel, labelpad=8, fontsize=fontsize, color=axis_gray)
306
+ # Slightly expand y-axis label letter spacing
307
+ ylabel_props = FontProperties(stretch='expanded')
308
+ ax.set_ylabel(ylabel, labelpad=6, fontproperties=ylabel_props, fontsize=fontsize, color=axis_gray)
309
+
310
+ # Simple legend
311
+ plt.legend(loc="lower right", fontsize=max(6, fontsize - 1))
312
+
313
+ # Style axes like index plot - clean and minimal
314
+ ax.spines['top'].set_visible(False)
315
+ ax.spines['right'].set_visible(False)
316
+ ax.spines['left'].set_linewidth(0.8)
317
+ ax.spines['bottom'].set_linewidth(0.8)
318
+ ax.spines['left'].set_color(axis_gray)
319
+ ax.spines['bottom'].set_color(axis_gray)
320
+
321
+ # Move spines slightly away from the plot area (but keep y-axis closer than before)
322
+ ax.spines['left'].set_position(('outward', 2))
323
+ ax.spines['bottom'].set_position(('outward', 4))
324
+
325
+ # Ticks styling and subtle padding
326
+ ax.tick_params(axis='x', which='major', colors=axis_gray, length=4, width=0.7, direction='out', pad=4, labelsize=max(6, fontsize - 1))
327
+ ax.tick_params(axis='y', which='major', colors=axis_gray, length=4, width=0.7, direction='out', pad=3, labelsize=max(6, fontsize - 1))
328
+
329
+ # Extend axes slightly beyond the data range for better spacing
330
+ ax.set_ylim(-0.5, len(metrics) - 0.5)
331
+ ax.set_xlim(-0.05, 1.05)
332
+
333
+ plt.tight_layout()
334
+
335
+ # Handle saving and display
336
+ if save_as:
337
+ if not any(save_as.endswith(ext) for ext in ['.png', '.jpg', '.jpeg', '.pdf', '.svg']):
338
+ save_as += '.png'
339
+ plt.savefig(save_as, dpi=dpi, bbox_inches='tight')
340
+
341
+ plt.show()
342
+ plt.close()
343
+
344
+ # Handle ID display options for returned DataFrame
345
+ if id_as_column:
346
+ # Add ID as a separate column and reset index to numeric
347
+ metrics_result = metrics.copy()
348
+ metrics_result['ID'] = metrics_result.index
349
+ metrics_result = metrics_result[['ID', 'Transitions', 'Entropy', 'Turbulence', 'Complexity']].reset_index(drop=True)
350
+ return metrics_result
351
+ else:
352
+ # Return with ID as index (traditional format)
353
+ metrics.index.name = 'ID'
354
+ return metrics
355
+
356
+
357
def _ensure_image_extension(save_as):
    """Return ``save_as`` as a path that ends in a supported image extension.

    The comparison is case-insensitive, so ``"figure.PNG"`` is returned
    unchanged while ``"figure"`` becomes ``"figure.png"``. ``os.PathLike``
    objects (e.g. ``pathlib.Path``) are converted with ``os.fspath`` first.
    """
    import os  # local import so the helper does not rely on the file's import block

    path = os.fspath(save_as)
    # str.endswith accepts a tuple of suffixes: one call instead of an any() loop.
    if not path.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf', '.svg')):
        path += '.png'
    return path


def _order_time_axis(freq, ent, valid_n):
    """Reorder ``freq`` columns, ``ent`` and ``valid_n`` along a sorted time axis.

    Time labels (the columns of ``freq``) are sorted by their integer value
    when every label converts with ``int``; otherwise they are sorted with the
    labels' natural ordering. The three objects are reordered together: if the
    labels cannot be sorted, or if any object cannot be indexed by the sorted
    labels, all three are returned exactly as they were passed in, so the
    results can never end up in different orders.

    Parameters
    ----------
    freq : DataFrame
        Frequencies with time points as columns.
    ent : Series
        Entropy values indexed by the same time labels.
    valid_n : Series or None
        Valid sample sizes indexed by the same time labels, if available.

    Returns
    -------
    tuple
        ``(freq, ent, valid_n)`` in the (possibly unchanged) time order.
    """
    import warnings  # local import so the helper is self-contained

    labels = list(freq.columns)
    try:
        ordered = sorted(labels, key=int)
    except (ValueError, TypeError):
        try:
            ordered = sorted(labels)
        except TypeError:
            # Labels of mixed, non-comparable types: keep the original order.
            return freq, ent, valid_n

    try:
        # Build every reordered object before replacing any of them, so a
        # failure part-way through cannot leave them misaligned.
        ordered_freq = freq[ordered]
        ordered_ent = ent.loc[ordered]
        ordered_n = valid_n.loc[ordered] if valid_n is not None else None
    except (KeyError, TypeError, ValueError) as exc:
        warnings.warn(
            f"Could not align cross-sectional results with the sorted time axis "
            f"({exc!r}); keeping the original time order.",
            UserWarning,
        )
        return freq, ent, valid_n

    return ordered_freq, ordered_ent, ordered_n


def plot_cross_sectional_characteristics(seqdata,
                                         figsize=(10, 6),
                                         fontsize=12,
                                         title="Cross-sectional entropy over time",
                                         show_title=True,
                                         xlabel="Time",
                                         ylabel="Entropy (0-1)",
                                         line_color="#74C9B4",
                                         save_as=None,
                                         dpi=200,
                                         return_data=False,
                                         custom_state_colors=None):
    """
    Visualize cross-sectional entropy across time points.

    This function shows how diverse the population is at each time point,
    providing a complementary view to longitudinal analysis which tracks
    individual sequences over time.

    The plot is a single line of entropy values, one marker per time point,
    on a y-axis fixed to the range 0-1.

    Parameters
    ----------
    seqdata : SequenceData
        Your sequence data object containing the sequences to analyze.

    figsize : tuple, optional (default=(10, 6))
        Size of the plot as (width, height) in inches.

    fontsize : int, optional (default=12)
        Base font size for axis labels. The title uses fontsize+1 and tick
        labels use max(6, fontsize-1).

    title : str, optional (default="Cross-sectional entropy over time")
        Title for the entropy plot. Ignored if show_title=False or if the
        title is empty/None.

    show_title : bool, optional (default=True)
        Whether to display the title. If False, no title will be shown regardless
        of the title parameter value.

    xlabel : str, optional (default="Time")
        Label for the x-axis.

    ylabel : str, optional (default="Entropy (0-1)")
        Label for the y-axis.

    line_color : str, optional (default="#74C9B4")
        Color for the entropy line. Can be any valid matplotlib color including
        hex colors like "#FF5733", named colors like "red", or RGB tuples.

    save_as : str or os.PathLike, optional (default=None)
        File path to save the plot. If None (or empty), the plot is only displayed.
        Supported formats: .png, .jpg, .jpeg, .pdf, .svg (matched
        case-insensitively). If the path has none of these extensions,
        .png is appended automatically.

    dpi : int, optional (default=200)
        Resolution (dots per inch) for the saved image.

    return_data : bool, optional (default=False)
        Whether to return the computed data. If False, only displays the plot.
        If True, returns a dictionary with frequencies, entropy, and valid states.

    custom_state_colors : dict, optional (default=None)
        Accepted for backward compatibility only. The entropy plot draws a
        single line colored by `line_color`, so this argument has no effect.

    Returns
    -------
    dict or None
        If return_data=True, returns a dictionary containing the computed data:
        - "Frequencies": DataFrame with states as rows and time points as columns
        - "Entropy": Series with entropy values for each time point
          (taken from the "per_time_entropy_norm" entry of the computed
          result when present, otherwise from its "Entropy" entry)
        - "ValidStates": valid sample sizes per time point, or None if the
          computed result does not provide them

        If return_data=False (default), returns None.

    Notes
    -----
    **Cross-sectional entropy** measures how diverse the population is at each
    time point. Values range from 0 to 1:
    - 0: Everyone is in the same state (no diversity)
    - 1: Population is equally distributed across all possible states (maximum diversity)

    Time points are sorted before plotting: numerically when every label can
    be converted to an integer, otherwise by the labels' natural ordering.
    If sorting is not possible the original order is kept.

    The plot uses index plot styling with clean borders. For state distribution
    visualization, use the dedicated `plot_state_distribution` function.

    Examples
    --------
    Basic usage (displays plot only, no data returned):
    >>> plot_cross_sectional_characteristics(my_seqdata)

    Custom title and size:
    >>> plot_cross_sectional_characteristics(my_seqdata,
    ...                                      figsize=(12, 6),
    ...                                      title="Population Diversity Over Time")

    Plot without title:
    >>> plot_cross_sectional_characteristics(my_seqdata, show_title=False)

    Custom labels and colors:
    >>> plot_cross_sectional_characteristics(my_seqdata,
    ...                                      xlabel="Years",
    ...                                      ylabel="Indicators",
    ...                                      line_color="#FF5733")

    Custom hex color:
    >>> plot_cross_sectional_characteristics(my_seqdata, line_color="#2E86AB")

    Save plot to file:
    >>> plot_cross_sectional_characteristics(my_seqdata,
    ...                                      save_as="entropy_plot.png",
    ...                                      dpi=300)

    Save with custom format:
    >>> plot_cross_sectional_characteristics(my_seqdata,
    ...                                      save_as="entropy_analysis.pdf")

    Get data when needed (only when explicitly requested):
    >>> result = plot_cross_sectional_characteristics(my_seqdata, return_data=True)
    >>> entropy_values = result['Entropy']      # Access entropy data
    >>> frequencies = result['Frequencies']     # State frequencies by time
    >>> valid_n = result['ValidStates']         # Sample sizes by time
    """
    # Get cross-sectional data using the existing function
    res = get_cross_sectional_entropy(seqdata, weighted=True, norm=True, return_format="dict")

    freq = res["Frequencies"]  # rows: states, cols: time points
    # Prefer the explicitly normalized entropy entry; fall back to "Entropy".
    ent = res.get("per_time_entropy_norm")
    if ent is None:
        ent = res["Entropy"]
    N = res.get("ValidStates", None)  # valid sample sizes per time point

    # Sort the time axis; all three objects are reordered together or not at all.
    freq, ent, N = _order_time_axis(freq, ent, N)

    # Color scheme consistent with existing plot style
    axis_gray = '#666666'

    # Create the entropy line plot
    fig = plt.figure(figsize=figsize)
    ax1 = fig.add_subplot(111)

    ax1.plot(ent.index, ent.values, marker='o', color=line_color, linewidth=2, markersize=4)
    ax1.set_ylim(0, 1)
    ax1.set_ylabel(ylabel, fontsize=fontsize, color=axis_gray)

    # Set title only if show_title is True and a non-empty title was given
    if show_title and title:
        ax1.set_title(title, fontsize=fontsize + 1, color=axis_gray)

    # Set up x-axis labels using the utility function.
    # NOTE(review): the helper is defined elsewhere; presumably it derives tick
    # labels from seqdata - confirm they line up with the sorted ent.index above.
    set_up_time_labels_for_x_axis(seqdata, ax1, color=axis_gray)

    # Style consistent with index plot design
    ax1.grid(True, axis='y', alpha=0.3)
    ax1.set_axisbelow(True)

    # Only show left and bottom spines, nudged slightly away from the plot area
    for hidden in ('top', 'right'):
        ax1.spines[hidden].set_visible(False)
    for shown in ('left', 'bottom'):
        ax1.spines[shown].set_color('gray')
        ax1.spines[shown].set_linewidth(0.7)
        ax1.spines[shown].set_position(('outward', 5))

    ax1.tick_params(axis='both', colors=axis_gray, labelsize=max(6, fontsize - 1), length=4, width=0.7)

    # Add x-axis label
    ax1.set_xlabel(xlabel, fontsize=fontsize, color=axis_gray)

    fig.tight_layout()

    # Handle saving and display; operate on this figure explicitly rather than
    # on whichever figure happens to be current.
    if save_as:
        fig.savefig(_ensure_image_extension(save_as), dpi=dpi, bbox_inches='tight')

    plt.show()
    plt.close(fig)

    # Only return data if explicitly requested
    if return_data:
        return {"Frequencies": freq, "Entropy": ent, "ValidStates": N}
    return None