sequenzo 0.1.17-cp39-cp39-macosx_10_9_universal2.whl → 0.1.18-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sequenzo might be problematic.

Files changed (86)
  1. sequenzo/__init__.py +25 -1
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +157 -157
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
  5. sequenzo/clustering/hierarchical_clustering.py +202 -8
  6. sequenzo/define_sequence_data.py +34 -2
  7. sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
  8. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  9. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  10. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  11. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  12. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  13. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  14. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  15. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  16. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  58. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +157 -157
  59. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
  60. sequenzo/dissimilarity_measures/utils/seqconc.c +157 -157
  61. sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
  62. sequenzo/dissimilarity_measures/utils/seqdss.c +157 -157
  63. sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
  64. sequenzo/dissimilarity_measures/utils/seqdur.c +157 -157
  65. sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
  66. sequenzo/dissimilarity_measures/utils/seqlength.c +157 -157
  67. sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
  68. sequenzo/sequence_characteristics/__init__.py +4 -0
  69. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  70. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  71. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  72. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  73. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  74. sequenzo/sequence_characteristics/turbulence.py +47 -67
  75. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  76. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  77. sequenzo/visualization/plot_sequence_index.py +58 -35
  78. sequenzo/visualization/plot_state_distribution.py +57 -36
  79. sequenzo/with_event_history_analysis/__init__.py +35 -0
  80. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  81. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  82. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
  83. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +86 -79
  84. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
  85. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
  86. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
sequenzo/sequence_characteristics/__init__.py

@@ -18,6 +18,8 @@ from .variance_of_spell_durations import get_spell_duration_variance
 
 from .turbulence import get_turbulence
 
+from .complexity_index import get_complexity_index
+
 from .plot_characteristics import plot_longitudinal_characteristics, plot_cross_sectional_characteristics
 
 __all__ = [
@@ -25,6 +27,8 @@ __all__ = [
     "get_subsequences_all_sequences",
     "get_number_of_transitions",
 
+    "get_complexity_index",
+
     "get_state_freq_and_entropy_per_seq",
     "get_within_sequence_entropy",
     "get_cross_sectional_entropy",
sequenzo/sequence_characteristics/complexity_index.py

@@ -18,72 +18,32 @@ from .within_sequence_entropy import get_within_sequence_entropy
 
 def get_complexity_index(seqdata, silent=True):
     if not isinstance(seqdata, SequenceData):
-        raise ValueError(" [!] data is NOT a sequence object, see SequenceData function to create one.")
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
 
     if not silent:
-        print(f" - computing complexity index for {seqdata.seqdata.shape[0]} sequences ...")
+        print(f"[>] Computing complexity index for {seqdata.seqdata.shape[0]} sequences ...")
 
-    trans = get_number_of_transitions(seqdata=seqdata, norm=True)
+    trans = get_number_of_transitions(seqdata=seqdata, norm=True).iloc[:, 1]
+    trans.index = seqdata.seqdata.index
 
     with open(os.devnull, 'w') as fnull:
         with redirect_stdout(fnull):
-            ient = get_within_sequence_entropy(seqdata=seqdata, norm=True)
+            ient = get_within_sequence_entropy(seqdata=seqdata, norm=True).iloc[:, 1]
+            ient.index = seqdata.seqdata.index
 
-    complxity = np.sqrt(trans.iloc[:, 0] * ient)
-    complxity.columns = ['ComplexityIndex']
+    complxity = np.sqrt(trans * ient)
+    complxity = pd.DataFrame(complxity, index=seqdata.seqdata.index, columns=['Complexity Index'])
+    complxity = complxity.reset_index().rename(columns={'index': 'ID'})
 
     return complxity
 
+if __name__ == '__main__':
+    from sequenzo import *
 
-if __name__ == "__main__":
-    # ===============================
-    # Sohee
-    # ===============================
-    df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
-    time_list = list(df.columns)[1:133]
-    states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-    # states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
-    labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
-    sequence_data = SequenceData(df, time=time_list, states=states, labels=labels, id_col="PID")
-    res = get_complexity_index(sequence_data)
-
-    # ===============================
-    # kass
-    # ===============================
-    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
-    # time_list = list(df.columns)[1:]
-    # states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
-    #           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
-    # sequence_data = SequenceData(df, time=time_list, states=states, id_col="COUNTRY")
-    # res = seqici(sequence_data)
-
-    # ===============================
-    # CO2
-    # ===============================
-    # df = pd.read_csv("D:/country_co2_emissions_missing.csv")
-    # _time = list(df.columns)[1:]
-    # states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
-    # sequence_data = SequenceData(df, time=_time, id_col="country", states=states)
-    # res = seqici(sequence_data)
+    df = load_dataset("country_co2_emissions")
+    _time = list(df.columns)[1:]
+    states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
+    sequence_data = SequenceData(df, time=_time, id_col="country", states=states)
 
-    # ===============================
-    # detailed
-    # ===============================
-    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
-    # _time = list(df.columns)[4:]
-    # states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
-    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
-    #                              time=_time, id_col="worker_id", states=states)
-    # res = seqici(sequence_data)
-
-    # ===============================
-    # broad
-    # ===============================
-    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
-    # _time = list(df.columns)[4:]
-    # states = ['Non-computing', 'Non-technical computing', 'Technical computing']
-    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
-    #                              time=_time, id_col="worker_id", states=states)
-    # res = seqici(sequence_data)
-
-    print(res)
+    res = get_complexity_index(sequence_data)
+    res
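The rewrite above composes two already-normalized ingredients, the normalized transition count and the normalized within-sequence entropy, as a geometric mean. A minimal, self-contained sketch of just that composite with made-up values (pandas/numpy only, no sequenzo required):

import numpy as np
import pandas as pd

# C = sqrt(transitions_norm * entropy_norm): the geometric mean of two
# [0, 1]-normalized components, so C itself also stays in [0, 1].
trans_norm = pd.Series([0.0, 0.4, 1.0], index=["seq1", "seq2", "seq3"])
ent_norm = pd.Series([0.0, 0.9, 1.0], index=["seq1", "seq2", "seq3"])

complexity = np.sqrt(trans_norm * ent_norm)
print(complexity)
# seq1    0.0   stable sequence: no transitions, no entropy
# seq2    0.6   geometric mean of 0.4 and 0.9 (sqrt(0.36))
# seq3    1.0   maximal on both components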
sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py

@@ -1,5 +1,5 @@
 """
-@Author  : 李欣怡
+@Author  : Yuqi Liang, Xinyi Li
 @File    : overall_cross_sectional_entropy.py
 @Time    : 2025/9/15 21:52
 @Desc    : States frequency by time unit
@@ -13,11 +13,29 @@ import pandas as pd
 from scipy.stats import entropy
 from sequenzo.define_sequence_data import SequenceData
 
-def get_cross_sectional_entropy(seqdata: SequenceData,
-                                weighted: bool = True,
-                                norm: bool = True):
+def get_cross_sectional_entropy(
+        seqdata: SequenceData,
+        weighted: bool = True,
+        norm: bool = True,
+        return_format: str = "tidy",  # "tidy" | "wide" | "dict"
+        include_effective_states: bool = True,
+        add_topk: int = 1,  # Mark top K dominant states at each time point
+        round_decimals: int = 6
+):
     """
-    Compute the states frequency by time unit.
+    Cross-sectional state distribution by time with entropy and readable outputs.
+
+    What you get in a tidy format:
+        time  state  freq   entropy  per_time_entropy_norm  N_valid  rank  is_top
+        1     A      0.645  0.380    0.380                  2346.27  1     True
+        ...
+
+    Additional metrics:
+    - per_time_entropy_norm: If norm=True, normalized by maximum entropy (|S|), range 0–1
+    - effective_states (H_effective): exp(H), equivalent "effective number of states"
+    - summary: Key interpretation points (entropy peaks/valleys, dominant state intervals, average entropy, etc.)
+
+    Existing parameters keep their semantics; return_format, add_topk, etc. are new and improve interpretability.
 
     Parameters
     ----------
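The docstring above introduces exp(H) as an "effective number of states". A standalone sketch of that quantity with illustrative distributions, using scipy's entropy (natural-log base, matching the implementation below):

import numpy as np
from scipy.stats import entropy

# A uniform distribution over k states gives exp(H) == k exactly,
# so exp(H) reads as "how many equiprobable states this looks like".
p_even = np.array([0.25, 0.25, 0.25, 0.25])
p_skewed = np.array([0.85, 0.05, 0.05, 0.05])

for name, p in [("even", p_even), ("skewed", p_skewed)]:
    H = entropy(p)  # natural-log entropy, scipy's default
    print(f"{name}: H = {H:.3f}, effective states = {np.exp(H):.2f}")
# even:   H = 1.386, effective states = 4.00
# skewed: H = 0.588, effective states = 1.80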
@@ -25,130 +43,178 @@ def get_cross_sectional_entropy(seqdata: SequenceData,
         A sequence object created by the SequenceData function.
     weighted : bool, default True
         If True, the frequencies are weighted by the number of non-missing values at each time unit.
-    with_missing : bool, default False
-        If True, missing values are included in the frequency computation.
     norm : bool, default True
-        If True, the frequencies are normalized to sum to 1 at each time unit.
+        If True, the entropy is normalized by maximum possible entropy.
+    return_format : str, default "tidy"
+        Return format: "tidy" for long-form table, "wide" for matrices, "dict" for original dict format.
+    include_effective_states : bool, default True
+        If True, calculate effective number of states (exp(entropy)).
+    add_topk : int, default 1
+        Mark top K dominant states at each time point.
+    round_decimals : int, default 6
+        Number of decimal places for rounding.
 
     Returns
     -------
-    pd.DataFrame
-        A dict, containing the frequency of each state at each time unit, validStates and Entropy.
+    pd.DataFrame or dict
+        Depending on return_format:
+        - "tidy": Long-form DataFrame with interpretable columns
+        - "wide": Dict with frequency matrix, entropy series, etc.
+        - "dict": Original dict format (backward compatible)
     """
 
     if not isinstance(seqdata, SequenceData):
-        raise ValueError(" [!] data is NOT a sequence object, see SequenceData function to create one.")
-
-    # Retrieve the states, shape and colors
-    states = seqdata.states.copy()
-    statl = range(1, len(states) + 1)
-    number_states = len(statl)
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData.")
 
-    number_seq = seqdata.seqdata.shape[1]
+    # Basic metadata
+    states_labels = list(seqdata.states)  # Human-readable state labels
+    S = len(states_labels)
+    T = seqdata.seqdata.shape[1]  # Number of time points
+    times = list(seqdata.seqdata.columns)
 
+    # Color attributes
     cpal = seqdata.custom_colors
 
-    # SequenceData already added missing values into states and colors internally
-    # if seqdata.ismissing:
-    #     statl.append(seqdata.missing_value)
-    #     col.append(seqdata.missing_color)
-
-    sd = pd.DataFrame(np.zeros((number_states, number_seq)), index=states, columns=seqdata.seqdata.columns)
-
     # Weights
-    weights = seqdata.weights if seqdata.weights is not None else np.ones(seqdata.seqdata.shape[0])
-
     # Also takes into account that in unweighted sequence objects created with
     # older TraMineR versions the weights attribute is a vector of 1
     # instead of NULL
-    if np.all(weights == 1):
+    w = seqdata.weights if seqdata.weights is not None else np.ones(seqdata.seqdata.shape[0])
+    if np.all(w == 1):
         weighted = False
 
-    for i in range(number_states):
-        for j in range(number_seq):
-            sd.iloc[i, j] = np.sum(weights[(seqdata.seqdata.iloc[:, j] == statl[i]).values])
+    # Your data is usually encoded with 1..S; if internally already labels, we can map here
+    # For compatibility: build a "value -> row index" lookup table
+    # Try to support both numeric encoding (1..S) and labels themselves
+    value_to_row = {v: i for i, v in enumerate(range(1, S+1))}
+    label_to_row = {lab: i for i, lab in enumerate(states_labels)}
+
+    # Frequency matrix (S x T)
+    freq_counts = np.zeros((S, T), dtype=float)
+
+    for j in range(T):
+        col = seqdata.seqdata.iloc[:, j]
+        for i in range(S):
+            # Try both encoding and label matching
+            mask_num = (col == (i+1))
+            mask_lab = (col == states_labels[i])
+            mask = mask_num | mask_lab
+            if weighted:
+                freq_counts[i, j] = w[mask].sum()
+            else:
+                freq_counts[i, j] = mask.sum()
+
+    N_valid = freq_counts.sum(axis=0)  # Valid weight/sample size per time point
+    with np.errstate(divide='ignore', invalid='ignore'):
+        P = np.divide(freq_counts, N_valid, where=(N_valid>0))  # Frequencies
+
+    # Entropy
+    H = np.array([entropy(P[:, j][P[:, j] > 0]) if N_valid[j] > 0 else 0.0 for j in range(T)])
 
-    N = sd.sum(axis=0)
-    sd = sd.div(N, axis=1)
-
-    E = sd.apply(lambda col: entropy(col[col > 0]), axis=0)
-
-    # Maximum entropy is the entropy of the alphabet
     if norm:
-        E_max = entropy(np.ones(number_states) / number_states)
-        E = E / E_max
-
-    res = {
-        "Frequencies": sd,
-        "ValidStates": N,
-        "Entropy": E
-    }
-
-    res_attrs = {
-        "nbseq": np.sum(weights),
-        "cpal": cpal,
-        "xtlab": list(seqdata.seqdata.columns),
-        "xtstep": getattr(seqdata, "xtstep", None),
-        "tick_last": getattr(seqdata, "tick_last", None),
-        "weighted": weighted,
-        "norm": norm
+        Hmax = entropy(np.ones(S) / S) if S > 0 else 1.0
+        H_norm = H / Hmax if Hmax > 0 else H
+    else:
+        H_norm = H
+
+    # Effective number of states (highly interpretable: equivalent "how many equiprobable states")
+    H_eff = np.exp(H) if include_effective_states else None
+
+    # Organize output: wide format
+    freq_df_wide = pd.DataFrame(P, index=states_labels, columns=times).round(round_decimals)
+    entropy_s = pd.Series(H_norm if norm else H, index=times, name=("per_time_entropy_norm" if norm else "Entropy")).round(round_decimals)
+    valid_s = pd.Series(N_valid, index=times, name="N_valid").round(round_decimals)
+    eff_s = (pd.Series(H_eff, index=times, name="Effective States").round(round_decimals)
+             if include_effective_states else None)
+
+    # Generate tidy table (interpretation-friendly)
+    tidy = (
+        freq_df_wide
+        .reset_index()
+        .melt(id_vars="index", var_name="time", value_name="freq")
+        .rename(columns={"index": "state"})
+        .sort_values(["time", "freq"], ascending=[True, False])
+    )
+    # Ranking + topK annotation
+    tidy["rank"] = tidy.groupby("time")["freq"].rank(method="first", ascending=False).astype(int)
+    if add_topk and add_topk > 0:
+        tidy["is_top"] = tidy["rank"] <= add_topk
+    else:
+        tidy["is_top"] = False
+
+    # Merge entropy/sample size/effective states
+    tidy = tidy.merge(entropy_s.reset_index().rename(columns={"index": "time"}), on="time", how="left")
+    tidy = tidy.merge(valid_s.reset_index().rename(columns={"index": "time"}), on="time", how="left")
+    if eff_s is not None:
+        tidy = tidy.merge(eff_s.reset_index().rename(columns={"index": "time"}), on="time", how="left")
+
+    # Friendly column order
+    cols = ["time", "state", "freq"]
+    if norm:
+        cols += ["per_time_entropy_norm"]
+    else:
+        cols += ["Entropy"]
+    cols += ["N_valid"]
+    if include_effective_states:
+        cols += ["Effective States"]
+    cols += ["rank", "is_top"]
+    tidy = tidy[cols]
+
+    # Summary: key statistics that can be explained in one sentence
+    summary = {
+        "states": states_labels,
+        "n_states": S,
+        "n_timepoints": T,
+        "avg_entropy_norm": float(tidy["per_time_entropy_norm"].mean()) if norm else None,
+        "avg_entropy": float((entropy_s if not norm else entropy_s * entropy(np.ones(S)/S)).mean()) if not norm else None,
+        "peak_entropy_time": tidy.loc[tidy["per_time_entropy_norm" if norm else "Entropy"].idxmax(), "time"] if T > 0 else None,
+        "lowest_entropy_time": tidy.loc[tidy["per_time_entropy_norm" if norm else "Entropy"].idxmin(), "time"] if T > 0 else None,
+        "dominant_stability_ratio": float(tidy.query("rank==1")["freq"].mean()),  # Average proportion of dominant state
+        "cpal": cpal
     }
 
-    res["__attrs__"] = res_attrs
-
-    return res
-
-if __name__ == "__main__":
-    # ===============================
-    # Sohee
-    # ===============================
-    df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
-    time_list = list(df.columns)[1:133]
-    states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-    # states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
-    labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
-    sequence_data = SequenceData(df, time=time_list, states=states, labels=labels, id_col="PID")
-    res = get_cross_sectional_entropy(sequence_data)
-
-    # ===============================
-    # kass
-    # ===============================
-    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
-    # time_list = list(df.columns)[1:]
-    # states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
-    #           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
-    # sequence_data = SequenceData(df, time=time_list, time_type="year", states=states, id_col="COUNTRY")
-    # res = seqstatd(sequence_data)
-
-    # ===============================
-    # CO2
-    # ===============================
-    # df = pd.read_csv("D:/country_co2_emissions_missing.csv")
-    # _time = list(df.columns)[1:]
-    # states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
-    # sequence_data = SequenceData(df, time_type="age", time=_time, id_col="country", states=states)
-    # res = seqstatd(sequence_data)
-
-    # ===============================
-    # detailed
-    # ===============================
-    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
-    # _time = list(df.columns)[4:]
-    # states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
-    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
-    #                              time_type="age", time=_time, id_col="worker_id", states=states)
-    # res = seqstatd(sequence_data)
-
-    # ===============================
-    # broad
-    # ===============================
-    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
-    # _time = list(df.columns)[4:]
-    # states = ['Non-computing', 'Non-technical computing', 'Technical computing']
-    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
-    #                              time_type="age", time=_time, id_col="worker_id", states=states)
-    # res = seqstatd(sequence_data)
-
-    print(res['Frequencies'])
-    print(res['ValidStates'])
-    print(res['Entropy'])
+    # Print descriptive statistics
+    print("\n" + "="*70)
+    print("Cross-Sectional Entropy Summary")
+    print("="*70)
+    print(f"[>] Number of states: {summary['n_states']}")
+    print(f"[>] Number of time points: {summary['n_timepoints']}")
+    print(f"[>] On average, the most common state accounts for {summary['dominant_stability_ratio']:.1%} of cases")
+    print(f"[>] Entropy is highest at time point {summary['peak_entropy_time']}")
+    print(f"[>] Entropy is lowest at time point {summary['lowest_entropy_time']}")
+    if norm:
+        print(f"[>] Average normalized entropy: {summary['avg_entropy_norm']:.3f} (range: 0 = fully concentrated, 1 = evenly distributed)")
+    print("="*70 + "\n")
+
+    # Compatible with different return formats
+    if return_format == "tidy":
+        tidy.attrs = {"summary": summary}
+        return tidy
+    elif return_format == "wide":
+        out = {
+            "Frequencies": freq_df_wide,
+            "N_valid": valid_s,
+            ("per_time_entropy_norm" if norm else "Entropy"): entropy_s
+        }
+        if eff_s is not None:
+            out["Effective States"] = eff_s
+        return out
+    else:  # "dict" -- try to be more readable too
+        res = {
+            "Frequencies": freq_df_wide,
+            "ValidStates": valid_s,
+            "Entropy": entropy_s if not norm else None,
+            "per_time_entropy_norm": entropy_s if norm else None,
+            "Effective States": eff_s,
+            "__attrs__": {
+                "nbseq": float(valid_s.iloc[0]) if len(valid_s)>0 else None,
+                "cpal": cpal,
+                "xtlab": times,
+                "xtstep": getattr(seqdata, "xtstep", None),
+                "tick_last": getattr(seqdata, "tick_last", None),
+                "weighted": weighted,
+                "norm": norm,
+                "summary": summary
+            }
+        }
+        return res
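A minimal usage sketch of the three return formats, reusing the load_dataset / SequenceData pattern from the __main__ example earlier in this diff (the summary dict rides along on the tidy frame via DataFrame.attrs, as implemented above):

from sequenzo import SequenceData, load_dataset
from sequenzo.sequence_characteristics import get_cross_sectional_entropy

df = load_dataset("country_co2_emissions")
states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
seq = SequenceData(df, time=list(df.columns)[1:], id_col="country", states=states)

tidy = get_cross_sectional_entropy(seq, return_format="tidy")    # long-form table
wide = get_cross_sectional_entropy(seq, return_format="wide")    # dict of matrices/series
legacy = get_cross_sectional_entropy(seq, return_format="dict")  # backward-compatible dict

print(tidy.attrs["summary"]["dominant_stability_ratio"])  # average share of the top state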
sequenzo/sequence_characteristics/plot_characteristics.py

@@ -51,7 +51,8 @@ def plot_longitudinal_characteristics(seqdata,
                                       save_as=None,
                                       dpi=200,
                                       custom_colors=None,
-                                      show_sequence_ids=False):
+                                      show_sequence_ids=False,
+                                      id_as_column=True):
     """
     Create a horizontal bar chart showing four key characteristics for selected sequences.
 
@@ -128,13 +129,17 @@ def plot_longitudinal_characteristics(seqdata,
     show_sequence_ids : bool, optional (default=False)
         If True, y-axis shows actual sequence IDs (when available).
        If False, shows 1..N index positions.
+
+    id_as_column : bool, optional (default=True)
+        If True, the returned DataFrame will include ID as a separate column on the same level as other columns.
+        If False, IDs will be used as the DataFrame index.
 
     Returns
     -------
     pandas.DataFrame
         A DataFrame containing the calculated metrics for all plotted sequences.
-        Columns: ['Transitions', 'Entropy', 'Turbulence', 'Complexity']
-        Index: The sequence IDs that were plotted
+        If id_as_column=True: Columns: ['ID', 'Transitions', 'Entropy', 'Turbulence', 'Complexity'] (all columns at same level)
+        If id_as_column=False: Columns: ['Transitions', 'Entropy', 'Turbulence', 'Complexity'], Index: The sequence IDs
 
     Warnings
     --------
@@ -189,15 +194,15 @@ def plot_longitudinal_characteristics(seqdata,
     types of sequences and datasets.
     """
     # Calculate four metrics (all should be 0-1 normalized)
-    df_t = get_number_of_transitions(seqdata=seqdata, norm=True).iloc[:, 0]  # Series
+    df_t = get_number_of_transitions(seqdata=seqdata, norm=True).iloc[:, 1]  # Series
     df_e = get_within_sequence_entropy(seqdata=seqdata, norm=True)  # Series or single-column DataFrame
-    if isinstance(df_e, pd.DataFrame): df_e = df_e.iloc[:, 0]
+    if isinstance(df_e, pd.DataFrame): df_e = df_e.iloc[:, 1]
 
-    df_tb = get_turbulence(seqdata=seqdata, norm=True, type=2)  # Normalized turbulence
-    if isinstance(df_tb, pd.DataFrame): df_tb = df_tb.iloc[:, 0]
+    df_tb = get_turbulence(seqdata=seqdata, norm=True, type=2, id_as_column=True)  # Normalized turbulence
+    if isinstance(df_tb, pd.DataFrame): df_tb = df_tb.iloc[:, 1]
 
     df_c = get_complexity_index(seqdata=seqdata)  # Already 0-1 normalized
-    if isinstance(df_c, pd.DataFrame): df_c = df_c.iloc[:, 0]
+    if isinstance(df_c, pd.DataFrame): df_c = df_c.iloc[:, 1]
 
     # Create metrics DataFrame with actual sequence IDs as index
     metrics = pd.DataFrame({
@@ -336,7 +341,17 @@ def plot_longitudinal_characteristics(seqdata,
         plt.show()
     plt.close()
 
-    return metrics  # Return the data used for plotting for inspection
+    # Handle ID display options for returned DataFrame
+    if id_as_column:
+        # Add ID as a separate column and reset index to numeric
+        metrics_result = metrics.copy()
+        metrics_result['ID'] = metrics_result.index
+        metrics_result = metrics_result[['ID', 'Transitions', 'Entropy', 'Turbulence', 'Complexity']].reset_index(drop=True)
+        return metrics_result
+    else:
+        # Return with ID as index (traditional format)
+        metrics.index.name = 'ID'
+        return metrics
 
 
 def plot_cross_sectional_characteristics(seqdata,
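The new return block above selects between two shapes. A toy sketch of what the id_as_column flag produces, with hypothetical values:

import pandas as pd

metrics = pd.DataFrame(
    {"Transitions": [0.20, 0.50], "Entropy": [0.30, 0.60],
     "Turbulence": [0.40, 0.70], "Complexity": [0.25, 0.55]},
    index=["P01", "P02"],
)

# id_as_column=True (default): IDs become a regular column, index reset to 0..N-1
flat = metrics.copy()
flat["ID"] = flat.index
flat = flat[["ID", "Transitions", "Entropy", "Turbulence", "Complexity"]].reset_index(drop=True)

# id_as_column=False: IDs stay on the index, which is simply named 'ID'
indexed = metrics.copy()
indexed.index.name = "ID"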
@@ -465,10 +480,14 @@ def plot_cross_sectional_characteristics(seqdata,
     >>> valid_n = result['ValidStates']  # Sample sizes by time
     """
     # Get cross-sectional data using the existing function
-    res = get_cross_sectional_entropy(seqdata, weighted=True, norm=True)
+    res = get_cross_sectional_entropy(seqdata, weighted=True, norm=True, return_format="dict")
 
     freq = res["Frequencies"]  # rows: states, cols: time points
-    ent = res["Entropy"]  # index: time points
+    # Get normalized or raw entropy (check which key exists)
+    if "per_time_entropy_norm" in res and res["per_time_entropy_norm"] is not None:
+        ent = res["per_time_entropy_norm"]
+    else:
+        ent = res["Entropy"]
     N = res.get("ValidStates", None)  # valid sample sizes per time point
 
     # Sort time axis if possible (handles both numeric and string time labels)
sequenzo/sequence_characteristics/simple_characteristics.py

@@ -306,5 +306,6 @@ def get_number_of_transitions(seqdata, norm=False, pwight=False) -> pd.DataFrame
     trans[seq_length<=1] = 0
 
     trans = pd.DataFrame(trans, index=seqdata.seqdata.index, columns=['Transitions'])
+    trans = trans.reset_index().rename(columns={'index': 'ID'})
 
     return trans
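This one-line addition is why callers elsewhere in this diff moved from .iloc[:, 0] to .iloc[:, 1]: the metric frame now leads with an 'ID' column, so the values sit in column position 1. A minimal sketch:

import pandas as pd

trans = pd.DataFrame({"Transitions": [3, 0, 5]}, index=["a", "b", "c"])
trans = trans.reset_index().rename(columns={"index": "ID"})

print(trans.columns.tolist())     # ['ID', 'Transitions']
print(trans.iloc[:, 1].tolist())  # [3, 0, 5] -> the metric values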
sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py

@@ -14,15 +14,19 @@ from sequenzo.define_sequence_data import SequenceData
 
 def get_state_freq_and_entropy_per_seq(seqdata, prop=False):
     if not isinstance(seqdata, SequenceData):
-        raise ValueError(" [!] data is NOT a sequence object, see SequenceData function to create one.")
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
+
+    if seqdata.labels is not None:
+        states = seqdata.labels
+    else:
+        states = seqdata.states
 
-    states = seqdata.states.copy()
     number_states = len(states)
     number_seq = seqdata.seqdata.shape[0]
 
     iseqtab = pd.DataFrame(np.zeros((number_seq, number_states)), index=seqdata.seqdata.index, columns=states)
 
-    print(f" - computing state distribution for {number_seq} sequences and {number_states} states ...")
+    print(f"[>] Computing state distribution for {number_seq} sequences and {number_states} states ...")
 
     for i, state in enumerate(states):
         iseqtab.iloc[:, i] = (seqdata.seqdata == (i+1)).sum(axis=1)
@@ -30,4 +34,6 @@ def get_state_freq_and_entropy_per_seq(seqdata, prop=False):
     if prop:
         iseqtab = iseqtab.div(iseqtab.sum(axis=1), axis=0)
 
+    iseqtab = iseqtab.reset_index().rename(columns={'index': 'ID'})
+
     return iseqtab
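A toy sketch of the per-sequence state table after these changes, with illustrative data: states are encoded 1..S in the sequence matrix, columns take the human-readable labels when available, and the ID lands in a regular column:

import pandas as pd

seq = pd.DataFrame([[1, 1, 2], [2, 2, 2]], index=["s1", "s2"])  # two toy sequences
states = ["Low", "High"]  # labels preferred over raw state codes when present

# Count occurrences of each encoded state (1..S) per row, one column per label
counts = pd.DataFrame({lab: (seq == i + 1).sum(axis=1) for i, lab in enumerate(states)})
counts = counts.reset_index().rename(columns={"index": "ID"})
print(counts)
#    ID  Low  High
# 0  s1    2     1
# 1  s2    0     3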