sequenzo-0.1.17-cp39-cp39-macosx_10_9_universal2.whl → sequenzo-0.1.18-cp39-cp39-macosx_10_9_universal2.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of sequenzo might be problematic.
- sequenzo/__init__.py +25 -1
- sequenzo/big_data/clara/clara.py +1 -1
- sequenzo/big_data/clara/utils/get_weighted_diss.c +157 -157
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +202 -8
- sequenzo/define_sequence_data.py +34 -2
- sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +157 -157
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.c +157 -157
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.c +157 -157
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.c +157 -157
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.c +157 -157
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
- sequenzo/sequence_characteristics/__init__.py +4 -0
- sequenzo/sequence_characteristics/complexity_index.py +17 -57
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
- sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
- sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
- sequenzo/sequence_characteristics/turbulence.py +47 -67
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
- sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
- sequenzo/visualization/plot_sequence_index.py +58 -35
- sequenzo/visualization/plot_state_distribution.py +57 -36
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +86 -79
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
sequenzo/sequence_characteristics/__init__.py

@@ -18,6 +18,8 @@ from .variance_of_spell_durations import get_spell_duration_variance
 
 from .turbulence import get_turbulence
 
+from .complexity_index import get_complexity_index
+
 from .plot_characteristics import plot_longitudinal_characteristics, plot_cross_sectional_characteristics
 
 __all__ = [
@@ -25,6 +27,8 @@ __all__ = [
     "get_subsequences_all_sequences",
     "get_number_of_transitions",
 
+    "get_complexity_index",
+
     "get_state_freq_and_entropy_per_seq",
     "get_within_sequence_entropy",
     "get_cross_sectional_entropy",
sequenzo/sequence_characteristics/complexity_index.py

@@ -18,72 +18,32 @@ from .within_sequence_entropy import get_within_sequence_entropy
 
 def get_complexity_index(seqdata, silent=True):
     if not isinstance(seqdata, SequenceData):
-        raise ValueError("
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
 
     if not silent:
-        print(f"
+        print(f"[>] Computing complexity index for {seqdata.seqdata.shape[0]} sequences ...")
 
-    trans = get_number_of_transitions(seqdata=seqdata, norm=True)
+    trans = get_number_of_transitions(seqdata=seqdata, norm=True).iloc[:, 1]
+    trans.index = seqdata.seqdata.index
 
     with open(os.devnull, 'w') as fnull:
         with redirect_stdout(fnull):
-            ient = get_within_sequence_entropy(seqdata=seqdata, norm=True)
+            ient = get_within_sequence_entropy(seqdata=seqdata, norm=True).iloc[:, 1]
+            ient.index = seqdata.seqdata.index
 
-    complxity = np.sqrt(trans
-    complxity.
+    complxity = np.sqrt(trans * ient)
+    complxity = pd.DataFrame(complxity, index=seqdata.seqdata.index, columns=['Complexity Index'])
+    complxity = complxity.reset_index().rename(columns={'index': 'ID'})
 
     return complxity
 
+if __name__ == '__main__':
+    from sequenzo import *
 
-
-
-
-
-df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
-time_list = list(df.columns)[1:133]
-states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-# states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
-labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
-sequence_data = SequenceData(df, time=time_list, states=states, labels=labels, id_col="PID")
-res = get_complexity_index(sequence_data)
-
-# ===============================
-# kass
-# ===============================
-# df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
-# time_list = list(df.columns)[1:]
-# states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
-#           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
-# sequence_data = SequenceData(df, time=time_list, states=states, id_col="COUNTRY")
-# res = seqici(sequence_data)
-
-# ===============================
-# CO2
-# ===============================
-# df = pd.read_csv("D:/country_co2_emissions_missing.csv")
-# _time = list(df.columns)[1:]
-# states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
-# sequence_data = SequenceData(df, time=_time, id_col="country", states=states)
-# res = seqici(sequence_data)
+    df = load_dataset("country_co2_emissions")
+    _time = list(df.columns)[1:]
+    states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
+    sequence_data = SequenceData(df, time=_time, id_col="country", states=states)
 
-
-
-# ===============================
-# df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
-# _time = list(df.columns)[4:]
-# states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
-# sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
-#                              time=_time, id_col="worker_id", states=states)
-# res = seqici(sequence_data)
-
-# ===============================
-# broad
-# ===============================
-# df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
-# _time = list(df.columns)[4:]
-# states = ['Non-computing', 'Non-technical computing', 'Technical computing']
-# sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
-#                              time=_time, id_col="worker_id", states=states)
-# res = seqici(sequence_data)
-
-print(res)
+    res = get_complexity_index(sequence_data)
+    res
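
A minimal usage sketch of the reworked function, based on the __main__ demo added above (load_dataset and SequenceData are the names that demo itself pulls in via "from sequenzo import *", and get_complexity_index is newly exported from the subpackage per the __init__.py hunks). Per the hunk, the index combines the normalized transition count and the normalized within-sequence entropy as sqrt(trans * ient):

    from sequenzo import SequenceData, load_dataset
    from sequenzo.sequence_characteristics import get_complexity_index

    df = load_dataset("country_co2_emissions")
    seq = SequenceData(df, time=list(df.columns)[1:], id_col="country",
                       states=['Very Low', 'Low', 'Middle', 'High', 'Very High'])

    res = get_complexity_index(seq)
    # 0.1.17 keyed the result by index; 0.1.18 returns IDs as a leading
    # column next to 'Complexity Index' (via reset_index + rename).
    print(res.head())
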
sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py

@@ -1,5 +1,5 @@
 """
-@Author : 
+@Author : Yuqi Liang, Xinyi Li
 @File : overall_cross_sectional_entropy.py
 @Time : 2025/9/15 21:52
 @Desc : States frequency by time unit
@@ -13,11 +13,29 @@ import pandas as pd
 from scipy.stats import entropy
 from sequenzo.define_sequence_data import SequenceData
 
-def get_cross_sectional_entropy(
-
-
+def get_cross_sectional_entropy(
+        seqdata: SequenceData,
+        weighted: bool = True,
+        norm: bool = True,
+        return_format: str = "tidy",  # "tidy" | "wide" | "dict"
+        include_effective_states: bool = True,
+        add_topk: int = 1,  # Mark top K dominant states at each time point
+        round_decimals: int = 6
+):
     """
-
+    Cross-sectional state distribution by time with entropy and readable outputs.
+
+    What you get in a tidy format:
+        time  state  freq   entropy  per_time_entropy_norm  N_valid  rank  is_top
+        1     A      0.645  0.380    0.380                  2346.27  1     True
+        ...
+
+    Additional metrics:
+    - per_time_entropy_norm: If norm=True, normalized by maximum entropy (|S|), range 0–1
+    - effective_states (H_effective): exp(H), equivalent "effective number of states"
+    - summary: Key interpretation points (entropy peaks/valleys, dominant state intervals, average entropy, etc.)
+
+    Parameters maintain your semantics, with new return_format, add_topk etc. for better interpretability.
 
     Parameters
     ----------
@@ -25,130 +43,178 @@ def get_cross_sectional_entropy(seqdata: SequenceData,
         A sequence object created by the SequenceData function.
     weighted : bool, default True
         If True, the frequencies are weighted by the number of non-missing values at each time unit.
-    with_missing : bool, default False
-        If True, missing values are included in the frequency computation.
     norm : bool, default True
-        If True, the
+        If True, the entropy is normalized by maximum possible entropy.
+    return_format : str, default "tidy"
+        Return format: "tidy" for long-form table, "wide" for matrices, "dict" for original dict format.
+    include_effective_states : bool, default True
+        If True, calculate effective number of states (exp(entropy)).
+    add_topk : int, default 1
+        Mark top K dominant states at each time point.
+    round_decimals : int, default 6
+        Number of decimal places for rounding.
 
     Returns
     -------
-    pd.DataFrame
-
+    pd.DataFrame or dict
+        Depending on return_format:
+        - "tidy": Long-form DataFrame with interpretable columns
+        - "wide": Dict with frequency matrix, entropy series, etc.
+        - "dict": Original dict format (backward compatible)
     """
 
     if not isinstance(seqdata, SequenceData):
-        raise ValueError("
-
-    # Retrieve the states, shape and colors
-    states = seqdata.states.copy()
-    statl = range(1, len(states) + 1)
-    number_states = len(statl)
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData.")
 
-
+    # Basic metadata
+    states_labels = list(seqdata.states)  # Human-readable state labels
+    S = len(states_labels)
+    T = seqdata.seqdata.shape[1]  # Number of time points
+    times = list(seqdata.seqdata.columns)
 
+    # Color attributes
     cpal = seqdata.custom_colors
 
-    # SequenceData already added missing values into states and colors internally
-    # if seqdata.ismissing:
-    #     statl.append(seqdata.missing_value)
-    #     col.append(seqdata.missing_color)
-
-    sd = pd.DataFrame(np.zeros((number_states, number_seq)), index=states, columns=seqdata.seqdata.columns)
-
     # Weights
-    weights = seqdata.weights if seqdata.weights is not None else np.ones(seqdata.seqdata.shape[0])
-
     # Also takes into account that in unweighted sequence objects created with
     # older TraMineR versions the weights attribute is a vector of 1
     # instead of NULL
-    if
+    w = seqdata.weights if seqdata.weights is not None else np.ones(seqdata.seqdata.shape[0])
+    if np.all(w == 1):
         weighted = False
 
-
-
-
+    # Your data is usually encoded with 1..S; if internally already labels, we can map here
+    # For compatibility: build a "value -> row index" lookup table
+    # Try to support both numeric encoding (1..S) and labels themselves
+    value_to_row = {v: i for i, v in enumerate(range(1, S+1))}
+    label_to_row = {lab: i for i, lab in enumerate(states_labels)}
+
+    # Frequency matrix (S x T)
+    freq_counts = np.zeros((S, T), dtype=float)
+
+    for j in range(T):
+        col = seqdata.seqdata.iloc[:, j]
+        for i in range(S):
+            # Try both encoding and label matching
+            mask_num = (col == (i+1))
+            mask_lab = (col == states_labels[i])
+            mask = mask_num | mask_lab
+            if weighted:
+                freq_counts[i, j] = w[mask].sum()
+            else:
+                freq_counts[i, j] = mask.sum()
+
+    N_valid = freq_counts.sum(axis=0)  # Valid weight/sample size per time point
+    with np.errstate(divide='ignore', invalid='ignore'):
+        P = np.divide(freq_counts, N_valid, where=(N_valid>0))  # Frequencies
+
+    # Entropy
+    H = np.array([entropy(P[:, j][P[:, j] > 0]) if N_valid[j] > 0 else 0.0 for j in range(T)])
 
-    N = sd.sum(axis=0)
-    sd = sd.div(N, axis=1)
-
-    E = sd.apply(lambda col: entropy(col[col > 0]), axis=0)
-
-    # Maximum entropy is the entropy of the alphabet
     if norm:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        Hmax = entropy(np.ones(S) / S) if S > 0 else 1.0
+        H_norm = H / Hmax if Hmax > 0 else H
+    else:
+        H_norm = H
+
+    # Effective number of states (highly interpretable: equivalent "how many equiprobable states")
+    H_eff = np.exp(H) if include_effective_states else None
+
+    # Organize output: wide format
+    freq_df_wide = pd.DataFrame(P, index=states_labels, columns=times).round(round_decimals)
+    entropy_s = pd.Series(H_norm if norm else H, index=times, name=("per_time_entropy_norm" if norm else "Entropy")).round(round_decimals)
+    valid_s = pd.Series(N_valid, index=times, name="N_valid").round(round_decimals)
+    eff_s = (pd.Series(H_eff, index=times, name="Effective States").round(round_decimals)
+             if include_effective_states else None)
+
+    # Generate tidy table (interpretation-friendly)
+    tidy = (
+        freq_df_wide
+        .reset_index()
+        .melt(id_vars="index", var_name="time", value_name="freq")
+        .rename(columns={"index": "state"})
+        .sort_values(["time", "freq"], ascending=[True, False])
+    )
+    # Ranking + topK annotation
+    tidy["rank"] = tidy.groupby("time")["freq"].rank(method="first", ascending=False).astype(int)
+    if add_topk and add_topk > 0:
+        tidy["is_top"] = tidy["rank"] <= add_topk
+    else:
+        tidy["is_top"] = False
+
+    # Merge entropy/sample size/effective states
+    tidy = tidy.merge(entropy_s.reset_index().rename(columns={"index": "time"}), on="time", how="left")
+    tidy = tidy.merge(valid_s.reset_index().rename(columns={"index": "time"}), on="time", how="left")
+    if eff_s is not None:
+        tidy = tidy.merge(eff_s.reset_index().rename(columns={"index": "time"}), on="time", how="left")
+
+    # Friendly column order
+    cols = ["time", "state", "freq"]
+    if norm:
+        cols += ["per_time_entropy_norm"]
+    else:
+        cols += ["Entropy"]
+    cols += ["N_valid"]
+    if include_effective_states:
+        cols += ["Effective States"]
+    cols += ["rank", "is_top"]
+    tidy = tidy[cols]
+
+    # Summary: key statistics that can be explained in one sentence
+    summary = {
+        "states": states_labels,
+        "n_states": S,
+        "n_timepoints": T,
+        "avg_entropy_norm": float(tidy["per_time_entropy_norm"].mean()) if norm else None,
+        "avg_entropy": float((entropy_s if not norm else entropy_s * entropy(np.ones(S)/S)).mean()) if not norm else None,
+        "peak_entropy_time": tidy.loc[tidy["per_time_entropy_norm" if norm else "Entropy"].idxmax(), "time"] if T > 0 else None,
+        "lowest_entropy_time": tidy.loc[tidy["per_time_entropy_norm" if norm else "Entropy"].idxmin(), "time"] if T > 0 else None,
+        "dominant_stability_ratio": float(tidy.query("rank==1")["freq"].mean()),  # Average proportion of dominant state
+        "cpal": cpal
     }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# ===============================
-# broad
-# ===============================
-# df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
-# _time = list(df.columns)[4:]
-# states = ['Non-computing', 'Non-technical computing', 'Technical computing']
-# sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
-#                              time_type="age", time=_time, id_col="worker_id", states=states)
-# res = seqstatd(sequence_data)
-
-print(res['Frequencies'])
-print(res['ValidStates'])
-print(res['Entropy'])
+    # Print descriptive statistics
+    print("\n" + "="*70)
+    print("Cross-Sectional Entropy Summary")
+    print("="*70)
+    print(f"[>] Number of states: {summary['n_states']}")
+    print(f"[>] Number of time points: {summary['n_timepoints']}")
+    print(f"[>] On average, the most common state accounts for {summary['dominant_stability_ratio']:.1%} of cases")
+    print(f"[>] Entropy is highest at time point {summary['peak_entropy_time']}")
+    print(f"[>] Entropy is lowest at time point {summary['lowest_entropy_time']}")
+    if norm:
+        print(f"[>] Average normalized entropy: {summary['avg_entropy_norm']:.3f} (range: 0 = fully concentrated, 1 = evenly distributed)")
+    print("="*70 + "\n")
+
+    # Compatible with different return formats
+    if return_format == "tidy":
+        tidy.attrs = {"summary": summary}
+        return tidy
+    elif return_format == "wide":
+        out = {
+            "Frequencies": freq_df_wide,
+            "N_valid": valid_s,
+            ("per_time_entropy_norm" if norm else "Entropy"): entropy_s
+        }
+        if eff_s is not None:
+            out["Effective States"] = eff_s
+        return out
+    else:  # "dict" -- try to be more readable too
+        res = {
+            "Frequencies": freq_df_wide,
+            "ValidStates": valid_s,
+            "Entropy": entropy_s if not norm else None,
+            "per_time_entropy_norm": entropy_s if norm else None,
+            "Effective States": eff_s,
+            "__attrs__": {
+                "nbseq": float(valid_s.iloc[0]) if len(valid_s)>0 else None,
+                "cpal": cpal,
+                "xtlab": times,
+                "xtstep": getattr(seqdata, "xtstep", None),
+                "tick_last": getattr(seqdata, "tick_last", None),
+                "weighted": weighted,
+                "norm": norm,
+                "summary": summary
+            }
+        }
+        return res
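
Taken together, the hunks above rewrite the function around a (states x time) weighted frequency matrix, Shannon entropy per time point (divided by the maximum entropy of |S| equiprobable states when norm=True, with exp(H) reported as "Effective States"), and three selectable output shapes. A hedged sketch of the new call patterns, reusing the seq object built in the earlier sketch; column names follow the tidy table constructed in the hunk:

    from sequenzo.sequence_characteristics import get_cross_sectional_entropy

    # Default long-form table: one row per (time, state) pair.
    tidy = get_cross_sectional_entropy(seq, return_format="tidy")
    # columns: time, state, freq, per_time_entropy_norm, N_valid,
    #          Effective States, rank, is_top
    summary = tidy.attrs["summary"]     # peak/lowest entropy time, dominant-state ratio, ...

    # Backward-compatible dict, as the plotting code now requests explicitly:
    res = get_cross_sectional_entropy(seq, return_format="dict")
    freq = res["Frequencies"]           # states x time frequency matrix
    ent = res["per_time_entropy_norm"]  # entropy Series (norm=True default)
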
sequenzo/sequence_characteristics/plot_characteristics.py

@@ -51,7 +51,8 @@ def plot_longitudinal_characteristics(seqdata,
                                       save_as=None,
                                       dpi=200,
                                       custom_colors=None,
-                                      show_sequence_ids=False):
+                                      show_sequence_ids=False,
+                                      id_as_column=True):
     """
     Create a horizontal bar chart showing four key characteristics for selected sequences.
 
@@ -128,13 +129,17 @@ def plot_longitudinal_characteristics(seqdata,
     show_sequence_ids : bool, optional (default=False)
         If True, y-axis shows actual sequence IDs (when available).
         If False, shows 1..N index positions.
+
+    id_as_column : bool, optional (default=True)
+        If True, the returned DataFrame will include ID as a separate column on the same level as other columns.
+        If False, IDs will be used as the DataFrame index.
 
     Returns
     -------
     pandas.DataFrame
         A DataFrame containing the calculated metrics for all plotted sequences.
-        Columns: ['Transitions', 'Entropy', 'Turbulence', 'Complexity']
-        Index: The sequence IDs
+        If id_as_column=True: Columns: ['ID', 'Transitions', 'Entropy', 'Turbulence', 'Complexity'] (all columns at same level)
+        If id_as_column=False: Columns: ['Transitions', 'Entropy', 'Turbulence', 'Complexity'], Index: The sequence IDs
 
     Warnings
     --------
@@ -189,15 +194,15 @@ def plot_longitudinal_characteristics(seqdata,
     types of sequences and datasets.
     """
     # Calculate four metrics (all should be 0-1 normalized)
-    df_t = get_number_of_transitions(seqdata=seqdata, norm=True).iloc[:,
+    df_t = get_number_of_transitions(seqdata=seqdata, norm=True).iloc[:, 1]  # Series
     df_e = get_within_sequence_entropy(seqdata=seqdata, norm=True)  # Series or single-column DataFrame
-    if isinstance(df_e, pd.DataFrame): df_e = df_e.iloc[:,
+    if isinstance(df_e, pd.DataFrame): df_e = df_e.iloc[:, 1]
 
-    df_tb = get_turbulence(seqdata=seqdata, norm=True, type=2)  # Normalized turbulence
-    if isinstance(df_tb, pd.DataFrame): df_tb = df_tb.iloc[:,
+    df_tb = get_turbulence(seqdata=seqdata, norm=True, type=2, id_as_column=True)  # Normalized turbulence
+    if isinstance(df_tb, pd.DataFrame): df_tb = df_tb.iloc[:, 1]
 
     df_c = get_complexity_index(seqdata=seqdata)  # Already 0-1 normalized
-    if isinstance(df_c, pd.DataFrame): df_c = df_c.iloc[:,
+    if isinstance(df_c, pd.DataFrame): df_c = df_c.iloc[:, 1]
 
     # Create metrics DataFrame with actual sequence IDs as index
     metrics = pd.DataFrame({
@@ -336,7 +341,17 @@
         plt.show()
     plt.close()
 
-
+    # Handle ID display options for returned DataFrame
+    if id_as_column:
+        # Add ID as a separate column and reset index to numeric
+        metrics_result = metrics.copy()
+        metrics_result['ID'] = metrics_result.index
+        metrics_result = metrics_result[['ID', 'Transitions', 'Entropy', 'Turbulence', 'Complexity']].reset_index(drop=True)
+        return metrics_result
+    else:
+        # Return with ID as index (traditional format)
+        metrics.index.name = 'ID'
+        return metrics
 
 
 def plot_cross_sectional_characteristics(seqdata,
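
A short sketch of the new id_as_column switch, again assuming the seq object from the sketches above; the two return shapes are the ones documented in the docstring hunk:

    from sequenzo.sequence_characteristics import plot_longitudinal_characteristics

    # New default: IDs as a regular column, numeric row index.
    metrics = plot_longitudinal_characteristics(seq)
    # columns: ['ID', 'Transitions', 'Entropy', 'Turbulence', 'Complexity']

    # Pre-0.1.18 shape: IDs as the index (now explicitly named 'ID').
    metrics_indexed = plot_longitudinal_characteristics(seq, id_as_column=False)
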
@@ -465,10 +480,14 @@ def plot_cross_sectional_characteristics(seqdata,
     >>> valid_n = result['ValidStates']  # Sample sizes by time
     """
     # Get cross-sectional data using the existing function
-    res = get_cross_sectional_entropy(seqdata, weighted=True, norm=True)
+    res = get_cross_sectional_entropy(seqdata, weighted=True, norm=True, return_format="dict")
 
     freq = res["Frequencies"]  # rows: states, cols: time points
-
+    # Get normalized or raw entropy (check which key exists)
+    if "per_time_entropy_norm" in res and res["per_time_entropy_norm"] is not None:
+        ent = res["per_time_entropy_norm"]
+    else:
+        ent = res["Entropy"]
     N = res.get("ValidStates", None)  # valid sample sizes per time point
 
     # Sort time axis if possible (handles both numeric and string time labels)
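
If you consume the dict format yourself, the same fallback the plotting code uses above is a safe way to pick up the entropy series whichever of the two keys is populated (a sketch mirroring the hunk, reusing seq from the earlier sketches):

    from sequenzo.sequence_characteristics import get_cross_sectional_entropy

    res = get_cross_sectional_entropy(seq, weighted=True, norm=True, return_format="dict")
    if res.get("per_time_entropy_norm") is not None:
        ent = res["per_time_entropy_norm"]   # normalized entropy (norm=True)
    else:
        ent = res["Entropy"]                 # raw entropy (norm=False)
    valid_n = res["ValidStates"]             # valid sample size per time point
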
sequenzo/sequence_characteristics/simple_characteristics.py

@@ -306,5 +306,6 @@ def get_number_of_transitions(seqdata, norm=False, pwight=False) -> pd.DataFrame
     trans[seq_length<=1] = 0
 
     trans = pd.DataFrame(trans, index=seqdata.seqdata.index, columns=['Transitions'])
+    trans = trans.reset_index().rename(columns={'index': 'ID'})
 
     return trans
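
Because of the added reset_index() line, get_number_of_transitions now returns IDs as a leading column instead of the index, which is why the callers in plot_characteristics.py above switched to .iloc[:, 1]. A sketch of the shape change, reusing seq from the earlier sketches:

    from sequenzo.sequence_characteristics import get_number_of_transitions

    trans = get_number_of_transitions(seqdata=seq, norm=True)
    # 0.1.18 shape: numeric row index, ID column first, then 'Transitions'
    transitions = trans['Transitions']   # same values the .iloc[:, 1] callers read
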
sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py

@@ -14,15 +14,19 @@ from sequenzo.define_sequence_data import SequenceData
 
 def get_state_freq_and_entropy_per_seq(seqdata, prop=False):
     if not isinstance(seqdata, SequenceData):
-        raise ValueError("
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
+
+    if seqdata.labels is not None:
+        states = seqdata.labels
+    else:
+        states = seqdata.states
 
-    states = seqdata.states.copy()
     number_states = len(states)
     number_seq = seqdata.seqdata.shape[0]
 
     iseqtab = pd.DataFrame(np.zeros((number_seq, number_states)), index=seqdata.seqdata.index, columns=states)
 
-    print(f"
+    print(f"[>] Computing state distribution for {number_seq} sequences and {number_states} states ...")
 
     for i, state in enumerate(states):
         iseqtab.iloc[:, i] = (seqdata.seqdata == (i+1)).sum(axis=1)
@@ -30,4 +34,6 @@ def get_state_freq_and_entropy_per_seq(seqdata, prop=False):
     if prop:
         iseqtab = iseqtab.div(iseqtab.sum(axis=1), axis=0)
 
+    iseqtab = iseqtab.reset_index().rename(columns={'index': 'ID'})
+
     return iseqtab