PyPI - sequenzo - Versions diffs - 0.1.24__cp311-cp311-macosx_10_9_x86_64.whl - Mend

sequenzo 0.1.24__cp311-cp311-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sequenzo might be problematic. Click here for more details.

Files changed (264) hide show

sequenzo/sequence_characteristics/simple_characteristics.py ADDED Viewed

@@ -0,0 +1,311 @@
+"""
+@Author  : 梁彧祺
+@File    : simple_characteristics.py
+@Time    : 22/09/2025 22:40
+@Desc    : Simple sequence characteristics functions
+"""
+import numpy as np
+import pandas as pd
+from typing import Union, List
+from sequenzo.define_sequence_data import SequenceData
+from sequenzo.dissimilarity_measures.utils.seqdss import seqdss
+from sequenzo.dissimilarity_measures.utils.seqlength import seqlength
+from sequenzo.dissimilarity_measures.utils.get_sm_trate_substitution_cost_matrix import get_sm_trate_substitution_cost_matrix
+def get_subsequences_in_single_sequence(x: np.ndarray, nbstat: int, statlist: List, void=None, nr=None, with_missing: bool = False) -> int:
+    """
+    Internal helper function to count distinct subsequences in a single sequence.
+    This is a low-level computational function that implements the dynamic programming
+    algorithm for counting subsequences. It's designed to be called by higher-level
+    functions like get_number_of_subsequences().
+    Args:
+        x (np.ndarray): Single sequence array (e.g., [1, 2, 1, 3])
+        nbstat (int): Number of distinct states/symbols
+        statlist (List): List of all possible states/symbols
+        void: Symbol representing void/empty elements (not used in current implementation)
+        nr: Symbol representing missing values
+        with_missing (bool): Whether to include missing values in the calculation
+    Returns:
+        int: Number of distinct subsequences in this one sequence
+    Note:
+        This is an internal function. Use get_number_of_subsequences() for analyzing
+        sequence datasets. The algorithm uses dynamic programming for efficiency.
+    """
+    # Initialize state tracking array
+    l = np.zeros(nbstat, dtype=int) - 1  # 必须是 -1（或其他负数）。避免 0-based 索引与 0 代表的无效值冲突
+    # Remove void elements if specified
+    if void is not None:
+        x = x[x != void]
+    # Remove missing values if not including them
+    if not with_missing and nr is not None:
+        x = x[x != nr]
+    slength = len(x)
+    # Empty sequence has one subsequence (the empty one)
+    if slength == 0:
+        return 1
+    # Dynamic programming array
+    N = np.zeros(slength + 1, dtype=object)  # Use object dtype to handle large integers
+    N[0] = 1
+    for i in range(1, slength + 1):
+        N[i] = 2 * N[i-1]
+        # Find the index of current state in statlist
+        current_state = x[i-1]
+        try:
+            cidx = statlist.index(current_state)
+        except ValueError:
+            # If state not in statlist, skip this iteration
+            continue
+        # Subtract previously counted subsequences ending with this state
+        if l[cidx] > -1:
+            N[i] = N[i] - N[l[cidx]]
+        # Update last position of this state
+        l[cidx] = i - 1
+    return N[slength]
+def get_subsequences_all_sequences(seqdata, dss: bool = True, with_missing: bool = False) -> pd.DataFrame:
+    """
+    Calculate the number of distinct subsequences for all sequences in the dataset.
+    This is the main function you'll use to analyze subsequence complexity across
+    multiple sequences. It processes your entire sequence dataset and returns a
+    summary table showing how many distinct subsequences exist in each sequence.
+    Args:
+        seqdata: SequenceData object or pandas DataFrame containing your sequence data
+        dss (bool): Whether to apply distinct state sequence preprocessing.
+                   If True, consecutive identical states are compressed (e.g., [1,1,2,2] -> [1,2])
+        with_missing (bool): Whether to include missing values in the calculation
+    Returns:
+        pd.DataFrame: Results table with one column 'Subseq.' showing the subsequence
+                     count for each sequence. Row names match your sequence identifiers.
+    Examples:
+        >>> # Analyze subsequence complexity in your sequence dataset
+        >>> result = get_number_of_subsequences(seq_data, dss=True, with_missing=False)
+        >>> print(result.head())
+                Subseq.
+        seq_1        15
+        seq_2        23
+        seq_3         8
+        >>> # Higher numbers = more complex sequences with more possible subsequences
+    Note:
+        This function works with SequenceData objects (recommended) or pandas DataFrames.
+        Use this to understand the complexity and diversity patterns in your sequences.
+    """
+    if isinstance(seqdata, np.ndarray):
+        seqdata = pd.DataFrame(seqdata)
+    # Check if input is a SequenceData object
+    if hasattr(seqdata, 'seqdata'):
+        # It is a SequenceData object
+        sequences = seqdata.seqdata
+        states = seqdata.states
+        state_mapping = seqdata.state_mapping
+        ids = sequences.index
+        # Handle missing values
+        nr_code = len(states) + 1 if hasattr(seqdata, 'ismissing') and seqdata.ismissing else None
+    elif isinstance(seqdata, pd.DataFrame):
+        # It's a DataFrame
+        sequences = seqdata
+        # Try to infer states from the data
+        unique_vals = set()
+        for col in sequences.columns:
+            unique_vals.update(sequences[col].dropna().unique())
+        states = sorted(list(unique_vals))
+        state_mapping = {state: i+1 for i, state in enumerate(states)}
+        ids = sequences.index
+        nr_code = None
+    else:
+        raise ValueError("seqdata must be a SequenceData object or pandas DataFrame")
+    # Apply DSS (Distinct State Sequences) if requested
+    if dss:
+        processed_sequences = sequences.copy()
+        for idx in processed_sequences.index:
+            row = processed_sequences.loc[idx].values
+            # Remove consecutive duplicates
+            if len(row) > 0:
+                new_row = [row[0]]
+                for i in range(1, len(row)):
+                    if row[i] != row[i-1]:
+                        new_row.append(row[i])
+                # Pad with NaN if sequence got shorter
+                while len(new_row) < len(row):
+                    new_row.append(np.nan)
+                processed_sequences.loc[idx] = new_row
+    else:
+        processed_sequences = sequences
+    # Get state list
+    if hasattr(seqdata, 'states'):
+        # Use numeric codes from SequenceData
+        statlist = list(range(1, len(states) + 1))
+        if with_missing and nr_code is not None:
+            statlist.append(nr_code)
+    else:
+        # Use original states
+        statlist = states
+    nbstat = len(statlist)
+    # Calculate subsequence count for each sequence
+    results = []
+    for idx in processed_sequences.index:
+        seq_values = processed_sequences.loc[idx].values
+        # Remove NaN values
+        seq_values = seq_values[~pd.isna(seq_values)]
+        if len(seq_values) == 0:
+            result = 1  # Empty sequence has 1 subsequence
+        else:
+            result = get_subsequences_in_single_sequence(
+                seq_values.astype(int),
+                nbstat,
+                statlist,
+                void=None,
+                nr=nr_code,
+                with_missing=with_missing
+            )
+        results.append(result)
+    # Create result DataFrame
+    result_df = pd.DataFrame(results, columns=['Subseq.'], index=ids)
+    return result_df
+def cut_prefix(row, x=0):
+    arr = row.to_numpy()
+    if np.issubdtype(arr.dtype, np.number):
+        pos_idx = np.where(arr < x)[0]
+        if len(pos_idx) > 0:
+            arr = arr[:pos_idx[0]]
+    return arr
+def seqsubsn(seqdata, DSS=True, with_missing=False) -> pd.DataFrame:
+    if isinstance(seqdata, np.ndarray):
+        sl = pd.unique(seqdata.ravel())
+        seqdata = pd.DataFrame(seqdata)
+        statelist = sl.tolist()
+    elif isinstance(seqdata, pd.DataFrame):
+        sl = pd.unique(seqdata.values.ravel())
+        statelist = sl.tolist()
+        pass
+    elif isinstance(seqdata, SequenceData):
+        sl = seqdata.states.copy()
+        seqdata = seqdata.seqdata
+        statelist = list(range(1, len(sl) + 1))
+    else:
+        raise ValueError("[!] seqdata must be a SequenceData object, see SequenceData function to create one.")
+    if DSS:
+        seqdata = seqdss(seqdata)
+        seqdata = pd.DataFrame(seqdata)
+    ns = len(sl)
+    result = seqdata.apply(lambda row: get_subsequences_in_single_sequence(
+        cut_prefix(row),
+        nbstat=ns,
+        statlist=statelist
+    ), axis=1)
+    result = pd.DataFrame(result, columns=['Subseq.'], index=seqdata.index)
+    return result
+def get_number_of_transitions(seqdata, norm=False, pwight=False) -> pd.DataFrame:
+    """
+    Calculate how many state changes occur in each sequence.
+    A transition happens whenever a sequence changes from one state to another.
+    The count is derived from the result of seqdss(): the length reported by
+    seqlength() for each row, minus one.
+    Args:
+        seqdata: Object exposing a ``seqdata`` attribute (a SequenceData object).
+        norm:    If True, the result is divided by the sequence length minus 1.
+                 Where the sequence length is 1 or less, the value is set to 0.
+        pwight:  If True, instead of counting, sum for each sequence the entries
+                 ``tr[state_from, state_to]`` of the matrix returned by
+                 get_sm_trate_substitution_cost_matrix() over its consecutive
+                 state pairs (intended to give higher weights to rare transitions).
+    Returns:
+        pd.DataFrame: A 'Transitions' column with one row per sequence. The index
+                     of ``seqdata.seqdata`` is moved into a column by reset_index();
+                     that column is renamed 'ID' when it comes out as 'index'.
+    Raises:
+        ValueError: If seqdata has no ``seqdata`` attribute.
+    Examples:
+        >>> # Count state changes in your sequences
+        >>> result = get_number_of_transitions(seq_data)
+        >>> # Example: sequence [1, 1, 2, 2, 1, 3] has 3 transitions:
+        >>> # 1->2 (position 3), 2->1 (position 5), 1->3 (position 6)
+    Note:
+        How missing values are treated depends on seqdss() and seqlength(), which
+        are defined elsewhere - presumably they are ignored; verify there.
+    """
+    # Check if input is a SequenceData object
+    if not hasattr(seqdata, 'seqdata'):
+        raise ValueError("[!] seqdata must be a SequenceData object, see SequenceData function to create one.")
+    dss = seqdss(seqdata)
+    dss_length = seqlength(dss)
+    number_seq = seqdata.seqdata.shape[0]
+    if pwight:
+        # Result: for each sequence, the accumulated sum of the tr entries looked
+        # up for its successive state pairs.
+        # NOTE(review): this branch indexes dss and dss_length with .iloc[i, j],
+        # i.e. it expects 2-D pandas objects, while the other branch applies
+        # any() to a comparison on dss_length - confirm what seqdss()/seqlength() return.
+        tr = get_sm_trate_substitution_cost_matrix(seqdata)
+        # Shift the state codes by one before using them as indices into tr
+        # (presumably tr has an extra leading row/column - TODO confirm).
+        dss = dss + 1
+        trans = np.zeros((number_seq, 1))
+        for i in range(number_seq):
+            if dss_length.iloc[i, 0] > 1:
+                for j in range(1, dss_length.iloc[i, 0]):
+                    state_from = dss.iloc[i, j-1]
+                    state_to = dss.iloc[i, j]
+                    trans[i, 0] += tr[state_from, state_to]
+    else:
+        # Result: the number of transitions of each sequence, matching the
+        # docstring example.
+        trans = dss_length - 1
+        # Rows of length 0 would otherwise yield -1.
+        # NOTE(review): any() iterates its argument; on a DataFrame that means
+        # the column labels rather than the values - confirm dss_length is 1-D.
+        if any(dss_length==0):
+            trans[dss_length==0] = 0
+    if norm:
+        seq_length = seqlength(seqdata)
+        # Division happens before the guard below, so rows of length 1 divide by
+        # zero first and are overwritten with 0 afterwards.
+        trans = trans / (seq_length-1)
+        if any(seq_length<=1):
+            trans[seq_length<=1] = 0
+    trans = pd.DataFrame(trans, index=seqdata.seqdata.index, columns=['Transitions'])
+    trans = trans.reset_index().rename(columns={'index': 'ID'})
+    return trans

sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""
+@Author  : 李欣怡
+@File    : state_frequencies_and_entropy_per_sequence.py
+@Time    : 2025/9/23 19:34
+@Desc    : State distribution for each individual
+        The corresponding TraMineR function is seqistatd (source file seqistatd.R),
+        with the source code available at: https://github.com/cran/TraMineR/blob/master/R/seqistatd.R
+"""
+import numpy as np
+import pandas as pd
+from sequenzo.define_sequence_data import SequenceData
+def get_state_freq_and_entropy_per_seq(seqdata, prop=False):
+    if not isinstance(seqdata, SequenceData):
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
+    if seqdata.labels is not None:
+        states = seqdata.labels
+    else:
+        states = seqdata.states
+    number_states = len(states)
+    number_seq = seqdata.seqdata.shape[0]
+    iseqtab = pd.DataFrame(np.zeros((number_seq, number_states)), index=seqdata.seqdata.index, columns=states)
+    print(f"[>] Computing state distribution for {number_seq} sequences and {number_states} states ...")
+    for i, state in enumerate(states):
+        iseqtab.iloc[:, i] = (seqdata.seqdata == (i+1)).sum(axis=1)
+    if prop:
+        iseqtab = iseqtab.div(iseqtab.sum(axis=1), axis=0)
+    iseqtab = iseqtab.reset_index().rename(columns={'index': 'ID'})
+    return iseqtab

sequenzo/sequence_characteristics/turbulence.py ADDED Viewed

@@ -0,0 +1,155 @@
+"""
+@Author  : Xinyi Li, Yuqi Liang
+@File    : turbulence.py
+@Time    : 2025/9/24 14:09
+@Desc    : Computes the sequence turbulence measure
+        The corresponding TraMineR function is seqST (source file seqST.R),
+        with the source code available at: https://github.com/cran/TraMineR/blob/master/R/seqST.R
+"""
+import os
+from contextlib import redirect_stdout
+import numpy as np
+import pandas as pd
+from sequenzo.define_sequence_data import SequenceData
+from sequenzo.dissimilarity_measures.utils.seqdss import seqdss
+from sequenzo.dissimilarity_measures.utils.seqlength import seqlength
+from .simple_characteristics import seqsubsn
+from .variance_of_spell_durations import get_spell_duration_variance
+def turb(x):
+    phi = x[0]
+    s2_tx = x[1]
+    s2max = x[2]
+    Tux = np.log2(phi * ((s2max + 1) / (s2_tx + 1)))
+    return Tux
+def get_turbulence(seqdata, norm=False, silent=True, type=1, id_as_column=True):
+    """
+    Computes the sequence turbulence measure
+    Parameters
+    ----------
+    seqdata : SequenceData
+        A sequence object created by the SequenceData function.
+    norm : bool, default True
+        If True, the frequencies are normalized to sum to 1 at each time unit.
+    silent : bool, default True
+        If True, suppresses the output messages.
+    type : int, default 1
+        Type of spell duration variance to be used. Can be either 1 or 2.
+    id_as_column : bool, default True
+        If True, the ID will be included as a separate column instead of as the index.
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with one column containing the turbulence measure for each sequence.
+        If id_as_column=True, also includes an ID column.
+    """
+    if not hasattr(seqdata, 'seqdata'):
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
+    if not silent:
+        print(f"  - extracting symbols and durations ...")
+    spells = seqdss(seqdata)
+    if not silent:
+        print(f"  - computing turbulence type {type} for {seqdata.seqdata.shape[0]} sequence(s) ...")
+    phi = seqsubsn(spells, DSS=False, with_missing=True)
+    if any(np.isnan(phi)):
+        # 使用有限的大数值，避免转换警告
+        # np.finfo(float).max 在NumPy 1.24+会触发"invalid value encountered in cast"警告
+        large_but_finite = 1e15  # 足够大但不会导致溢出警告
+        phi = np.where(np.isnan(phi), large_but_finite, phi)
+        print("[!] One or more missing values were found after calculating the number of distinct subsequences. They have been replaced with a large number of 1e15 to ensure the calculation continues.")
+    s2_tx = get_spell_duration_variance(seqdata=seqdata, type=type)
+    s2_tx_max = s2_tx['vmax']
+    s2_tx = s2_tx['result']
+    # Extract phi values and ensure 1D array
+    if hasattr(phi, 'iloc'):
+        phi_values = phi.iloc[:, 0].values
+    elif hasattr(phi, 'values'):
+        phi_values = phi.values
+    else:
+        phi_values = phi
+    # Ensure phi_values is 1D
+    phi_values = np.asarray(phi_values).flatten()
+    # Extract 1D arrays from s2_tx and s2_tx_max DataFrames
+    s2_tx_values = s2_tx.iloc[:, 1].values if hasattr(s2_tx, 'iloc') else np.asarray(s2_tx).flatten()
+    s2_tx_max_values = s2_tx_max.iloc[:, 1].values if hasattr(s2_tx_max, 'iloc') else np.asarray(s2_tx_max).flatten()
+    tmp = pd.DataFrame({'phi': phi_values, 's2_tx': s2_tx_values, 's2max': s2_tx_max_values})
+    Tx = tmp.apply(lambda row: turb([row['phi'], row['s2_tx'], row['s2max']]), axis=1).to_numpy()
+    if norm:
+        alph = seqdata.states.copy()
+        maxlength = max(seqlength(seqdata))
+        nrep = -(-maxlength // len(alph))  # Ceiling division
+        turb_seq = pd.DataFrame(np.array((alph * nrep)[:maxlength]).reshape(1, -1))
+        with open(os.devnull, 'w') as fnull:
+            with redirect_stdout(fnull):
+                # 为 states 创建对应的 labels，需要特别处理 np.nan 的情况
+                turb_labels = []
+                for i, state in enumerate(alph):
+                    if pd.isna(state):
+                        turb_labels.append("Missing")
+                    else:
+                        turb_labels.append(f"State_{i}")
+                turb_seq = SequenceData(turb_seq, time=list(range(turb_seq.shape[1])), states=alph, labels=turb_labels)
+        if len(alph) > 1:
+            turb_phi = seqsubsn(turb_seq, DSS=False, with_missing=True)
+        else:
+            turb_phi = 2
+        if hasattr(turb_phi, 'isna') and turb_phi.isna().any().any():
+            turb_phi = 1e15  # 使用有限大数值避免转换警告
+            print("[!] phi set as max float due to exceeding value when computing max turbulence.")
+        turb_s2 = get_spell_duration_variance(turb_seq, type=type)
+        turb_s2_max = turb_s2['vmax']
+        turb_s2 = turb_s2['result']
+        # Extract turb_phi values and ensure 1D
+        if hasattr(turb_phi, 'iloc'):
+            phi_value = turb_phi.iloc[:, 0].values
+        else:
+            phi_value = [turb_phi]
+        phi_value = np.asarray(phi_value).flatten()
+        # Extract 1D arrays from turb_s2 and turb_s2_max DataFrames
+        turb_s2_values = turb_s2.iloc[:, 1].values if hasattr(turb_s2, 'iloc') else np.asarray(turb_s2).flatten()
+        turb_s2_max_values = turb_s2_max.iloc[:, 1].values if hasattr(turb_s2_max, 'iloc') else np.asarray(turb_s2_max).flatten()
+        tmp = pd.DataFrame({'phi': phi_value, 's2_tx': turb_s2_values, 's2max': turb_s2_max_values})
+        maxT = tmp.apply(lambda row: turb([row['phi'], row['s2_tx'], row['s2max']]), axis=1).to_numpy()
+        Tx_zero = np.where(Tx < 1)[0]
+        Tx = (Tx - 1) / (maxT - 1)
+        if len(Tx_zero) > 0:
+            Tx[Tx_zero, :] = 0
+    Tx_df = pd.DataFrame(Tx, index=seqdata.seqdata.index, columns=['Turbulence'])
+    # Handle ID display options
+    if id_as_column:
+        # Add ID as a separate column and reset index to numeric
+        Tx_df['ID'] = Tx_df.index
+        Tx_df = Tx_df[['ID', 'Turbulence']].reset_index(drop=True)
+    else:
+        # Always set index name to 'ID' for clarity
+        Tx_df.index.name = 'ID'
+    return Tx_df

sequenzo/sequence_characteristics/variance_of_spell_durations.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""
+@Author  : Xinyi Li, Yuqi Liang
+@File    : variance_of_spell_durations.py
+@Time    : 2025/9/24 14:22
+@Desc    : Variance of spell durations of individual state sequences.
+        The corresponding function name in TraMineR is seqivardur,
+        with the source code available at: https://github.com/cran/TraMineR/blob/master/R/seqivardur.R
+"""
+import os
+from contextlib import redirect_stdout
+import numpy as np
+import pandas as pd
+from sequenzo.dissimilarity_measures.utils.seqdss import seqdss
+from sequenzo.dissimilarity_measures.utils.seqlength import seqlength
+from sequenzo.dissimilarity_measures.utils.seqdur import seqdur
+from .state_frequencies_and_entropy_per_sequence import get_state_freq_and_entropy_per_seq
+from .simple_characteristics import cut_prefix
+def get_spell_duration_variance(seqdata, type=1):
+    """
+    Compute the variance of spell durations for each sequence.
+    Args:
+        seqdata: Object exposing a ``seqdata`` attribute (a SequenceData object).
+        type: 1 - variance over the spell durations returned by seqdur().
+              2 - additionally counts every state with a zero count in the
+              sequence as one more spell of duration 0.
+    Returns:
+        dict: Three DataFrames, each produced by reset_index() (the index column
+              is renamed 'ID' when it comes out as 'index'):
+              'meand'  - column 'meand', the mean spell duration used,
+              'result' - column 'var_spell_dur', the variance,
+              'vmax'   - column 'var_max', the maximum variance used for scaling.
+    Raises:
+        ValueError: If seqdata has no ``seqdata`` attribute or type is not 1 or 2.
+    """
+    if not hasattr(seqdata, 'seqdata'):
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
+    if type not in [1, 2]:
+        raise ValueError("[!] type must be 1 or 2.")
+    # Anything the helpers print is discarded
+    with open(os.devnull, 'w') as fnull:
+        with redirect_stdout(fnull):
+            dss = seqdss(seqdata)
+            lgth = seqlength(seqdata)
+            dlgth = seqlength(dss)
+            sdist = get_state_freq_and_entropy_per_seq(seqdata)
+            # Per sequence: number of state columns whose count is 0
+            # (column 0 of sdist is the ID column and is skipped)
+            nnvisit = (sdist.iloc[:, 1:]==0).sum(axis=1)
+    def realvar(x):
+        # Population variance (divides by n, not n - 1).
+        # NOTE(review): raises ZeroDivisionError when x is empty.
+        n = len(x)
+        var_val = 1 / n * np.sum((x - np.mean(x)) ** 2)
+        return var_val
+    # One array of durations per sequence, cut at the first value below 1
+    # (presumably the padding used by seqdur - TODO confirm)
+    dur = pd.DataFrame(seqdur(seqdata)).apply(lambda row: cut_prefix(row, 1), axis=1)
+    if type == 1:
+        ret = dur.apply(realvar)
+        meand = dur.apply(np.nanmean)
+        var_max = (dlgth - 1) * (1 - meand) ** 2
+    elif type == 2:
+        # Mean duration with the non-visited states added to the spell count
+        meand = dur.apply(lambda arr: np.nansum(arr))
+        meand /= dlgth + nnvisit.to_numpy()
+        # Squared deviation of every observed duration from that mean
+        ddur = dur.to_frame("arr").join(meand.rename("m")).apply(
+                    lambda row: (np.array(row["arr"]) - row["m"]) ** 2, axis=1
+                )
+        # Formula implemented by the next four statements:
+        # ret = (np.nansum(ddur, axis=1) + nnvisit * (meand ** 2)) / (dlgth + nnvisit)
+        ddur = pd.DataFrame(ddur.tolist())
+        sum_sqdiff = np.nansum(ddur.to_numpy(), axis=1)
+        ret_values = (sum_sqdiff + nnvisit.to_numpy() * (meand.to_numpy() ** 2)) / (dlgth + nnvisit.to_numpy())
+        ret = pd.Series(ret_values, index=meand.index)
+        alph = seqdata.states.copy()
+        alph_size = len(alph)
+        # Number of non-visited states assumed for the maximum: all but one
+        # state when the sequence has a single spell, all but two otherwise
+        if alph_size < 2:
+            maxnnv = 0
+        else:
+            maxnnv = np.where(dlgth == 1, alph_size - 1, alph_size - 2)
+        meand_max = meand.to_numpy() * (dlgth + nnvisit.to_numpy()) / (dlgth + maxnnv)
+        var_max_values = ((dlgth-1) * (1-meand_max)**2 + (lgth - dlgth + 1 - meand_max)**2 + maxnnv * meand_max**2) / (dlgth + maxnnv)
+        var_max = pd.Series(var_max_values, index=meand.index)
+    # Re-label all three results with the sequence identifiers
+    meand.index = seqdata.seqdata.index
+    ret.index = seqdata.seqdata.index
+    var_max.index = seqdata.seqdata.index
+    meand = meand.to_frame("meand")
+    ret = ret.to_frame("var_spell_dur")
+    var_max = var_max.to_frame("var_max")
+    return {
+        "meand": meand.reset_index().rename(columns={"index": "ID"}),
+        "result": ret.reset_index().rename(columns={"index": "ID"}),
+        "vmax": var_max.reset_index().rename(columns={"index": "ID"}),
+    }

sequenzo/sequence_characteristics/within_sequence_entropy.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""
+@Author  : 李欣怡
+@File    : within_sequence_entropy.py
+@Time    : 2025/9/23 19:44
+@Desc    : Within Sequence Entropy
+        The corresponding TraMineR function is seqient (source file seqient.R),
+        with the source code available at: https://github.com/cran/TraMineR/blob/master/R/seqient.R
+"""
+import os
+from contextlib import redirect_stdout
+import numpy as np
+import pandas as pd
+from scipy.stats import entropy
+from sequenzo.define_sequence_data import SequenceData
+from .state_frequencies_and_entropy_per_sequence import get_state_freq_and_entropy_per_seq
+def get_within_sequence_entropy(seqdata, norm=True, base=np.e, silent=True):
+    if not isinstance(seqdata, SequenceData):
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
+    states = seqdata.states.copy()
+    if not silent:
+        print(f"  - computing within sequence entropy for {seqdata.seqdata.shape[0]} sequences and {len(states)} states ...")
+    with open(os.devnull, 'w') as fnull:
+        with redirect_stdout(fnull):
+            iseqtab = get_state_freq_and_entropy_per_seq(seqdata=seqdata)
+            iseqtab.index = seqdata.seqdata.index
+    ient = iseqtab.iloc[:, 1:].apply(lambda row: entropy(row, base=base), axis=1)
+    if norm:
+        maxent = np.log(len(states))
+        ient = ient / maxent
+    ient = pd.DataFrame(ient, index=seqdata.seqdata.index, columns=['Entropy'])
+    ient = ient.reset_index().rename(columns={'index': 'ID'})
+    return ient

sequenzo/suffix_tree/__init__.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : __init__.py
+@Time    : 08/08/2025 15:50
+@Desc    :
+    Suffix Tree Framework - exposes core indicators and utilities for sequence convergence analysis.
+"""
+from .system_level_indicators import (
+    build_suffix_tree,
+    compute_suffix_count,
+    compute_merging_factor,
+    compute_js_convergence,
+    plot_system_indicators,
+    plot_system_indicators_multiple_comparison,
+)
+from .individual_level_indicators import (
+    IndividualConvergence,
+    compute_path_uniqueness_by_group,
+    plot_suffix_rarity_distribution,
+)
+from .utils import (
+    extract_sequences,
+    get_state_space,
+    convert_to_suffix_tree_data
+)
+# Public API of the suffix_tree package: exactly the names imported above.
+__all__ = [
+    # System-level indicators
+    "build_suffix_tree",
+    "compute_suffix_count",
+    "compute_merging_factor",
+    "compute_js_convergence",
+    # System-level plotting
+    "plot_system_indicators",
+    "plot_system_indicators_multiple_comparison",
+    # Individual-level indicators and plotting
+    "IndividualConvergence",
+    "compute_path_uniqueness_by_group",
+    "plot_suffix_rarity_distribution",
+    # Utilities
+    "extract_sequences",
+    "get_state_space",
+    "convert_to_suffix_tree_data",
+]