sequenzo-0.1.17-cp39-cp39-win_amd64.whl → sequenzo-0.1.18-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sequenzo might be problematic.

Files changed (101)
  1. sequenzo/__init__.py +25 -1
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +156 -156
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cp39-win_amd64.pyd +0 -0
  5. sequenzo/clustering/clustering_c_code.cp39-win_amd64.pyd +0 -0
  6. sequenzo/clustering/hierarchical_clustering.py +202 -8
  7. sequenzo/define_sequence_data.py +34 -2
  8. sequenzo/dissimilarity_measures/c_code.cp39-win_amd64.pyd +0 -0
  9. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  10. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  11. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  12. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  13. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  14. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  15. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  16. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  58. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  59. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +6 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +54 -2
  61. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +8 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +11 -4
  63. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +18 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +8 -14
  65. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +216 -173
  66. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +6 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +1 -1
  68. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +7 -4
  69. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +6 -2
  70. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +32 -18
  71. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +21 -24
  72. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +69 -9
  73. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +156 -156
  74. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cp39-win_amd64.pyd +0 -0
  75. sequenzo/dissimilarity_measures/utils/seqconc.c +156 -156
  76. sequenzo/dissimilarity_measures/utils/seqconc.cp39-win_amd64.pyd +0 -0
  77. sequenzo/dissimilarity_measures/utils/seqdss.c +156 -156
  78. sequenzo/dissimilarity_measures/utils/seqdss.cp39-win_amd64.pyd +0 -0
  79. sequenzo/dissimilarity_measures/utils/seqdur.c +156 -156
  80. sequenzo/dissimilarity_measures/utils/seqdur.cp39-win_amd64.pyd +0 -0
  81. sequenzo/dissimilarity_measures/utils/seqlength.c +156 -156
  82. sequenzo/dissimilarity_measures/utils/seqlength.cp39-win_amd64.pyd +0 -0
  83. sequenzo/sequence_characteristics/__init__.py +4 -0
  84. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  85. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  86. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  87. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  88. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  89. sequenzo/sequence_characteristics/turbulence.py +47 -67
  90. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  91. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  92. sequenzo/visualization/plot_sequence_index.py +58 -35
  93. sequenzo/visualization/plot_state_distribution.py +57 -36
  94. sequenzo/with_event_history_analysis/__init__.py +35 -0
  95. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  96. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  97. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
  98. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +101 -94
  99. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
  100. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
  101. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
--- a/sequenzo/sequence_characteristics/turbulence.py
+++ b/sequenzo/sequence_characteristics/turbulence.py
@@ -1,6 +1,6 @@
 """
-@Author : 李欣怡
-@File : seqST.py
+@Author : Xinyi Li, Yuqi Liang
+@File : turbulence.py
 @Time : 2025/9/24 14:09
 @Desc : Computes the sequence turbulence measure
 
@@ -27,7 +27,7 @@ def turb(x):
     Tux = np.log2(phi * ((s2max + 1) / (s2_tx + 1)))
     return Tux
 
-def get_turbulence(seqdata, norm=False, silent=True, type=1):
+def get_turbulence(seqdata, norm=False, silent=True, type=1, id_as_column=True):
     """
     Computes the sequence turbulence measure
 
@@ -41,15 +41,18 @@ def get_turbulence(seqdata, norm=False, silent=True, type=1):
         If True, suppresses the output messages.
     type : int, default 1
         Type of spell duration variance to be used. Can be either 1 or 2.
+    id_as_column : bool, default True
+        If True, the ID will be included as a separate column instead of as the index.
 
     Returns
     -------
     pd.DataFrame
         A DataFrame with one column containing the turbulence measure for each sequence.
+        If id_as_column=True, also includes an ID column.
     """
 
     if not hasattr(seqdata, 'seqdata'):
-        raise ValueError(" [!] data is NOT a sequence object, see SequenceData function to create one.")
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
 
     if not silent:
         print(f" - extracting symbols and durations ...")
@@ -70,7 +73,22 @@ def get_turbulence(seqdata, norm=False, silent=True, type=1):
     s2_tx_max = s2_tx['vmax']
     s2_tx = s2_tx['result']
 
-    tmp = pd.DataFrame({'phi': phi.flatten(), 's2_tx': s2_tx, 's2max': s2_tx_max})
+    # Extract phi values and ensure 1D array
+    if hasattr(phi, 'iloc'):
+        phi_values = phi.iloc[:, 0].values
+    elif hasattr(phi, 'values'):
+        phi_values = phi.values
+    else:
+        phi_values = phi
+
+    # Ensure phi_values is 1D
+    phi_values = np.asarray(phi_values).flatten()
+
+    # Extract 1D arrays from s2_tx and s2_tx_max DataFrames
+    s2_tx_values = s2_tx.iloc[:, 1].values if hasattr(s2_tx, 'iloc') else np.asarray(s2_tx).flatten()
+    s2_tx_max_values = s2_tx_max.iloc[:, 1].values if hasattr(s2_tx_max, 'iloc') else np.asarray(s2_tx_max).flatten()
+
+    tmp = pd.DataFrame({'phi': phi_values, 's2_tx': s2_tx_values, 's2max': s2_tx_max_values})
     Tx = tmp.apply(lambda row: turb([row['phi'], row['s2_tx'], row['s2max']]), axis=1).to_numpy()
 
     if norm:
@@ -95,7 +113,7 @@
     else:
         turb_phi = 2
 
-    if turb_phi.isna().any().any():
+    if hasattr(turb_phi, 'isna') and turb_phi.isna().any().any():
        turb_phi = 1e15  # use a large finite value to avoid a conversion warning
        print("[!] phi set as max float due to exceeding value when computing max turbulence.")
 
@@ -103,7 +121,19 @@
     turb_s2_max = turb_s2['vmax']
     turb_s2 = turb_s2['result']
 
-    tmp = pd.DataFrame({'phi': turb_phi.iloc[:, 0], 's2_tx': turb_s2, 's2max': turb_s2_max})
+    # Extract turb_phi values and ensure 1D
+    if hasattr(turb_phi, 'iloc'):
+        phi_value = turb_phi.iloc[:, 0].values
+    else:
+        phi_value = [turb_phi]
+
+    phi_value = np.asarray(phi_value).flatten()
+
+    # Extract 1D arrays from turb_s2 and turb_s2_max DataFrames
+    turb_s2_values = turb_s2.iloc[:, 1].values if hasattr(turb_s2, 'iloc') else np.asarray(turb_s2).flatten()
+    turb_s2_max_values = turb_s2_max.iloc[:, 1].values if hasattr(turb_s2_max, 'iloc') else np.asarray(turb_s2_max).flatten()
+
+    tmp = pd.DataFrame({'phi': phi_value, 's2_tx': turb_s2_values, 's2max': turb_s2_max_values})
     maxT = tmp.apply(lambda row: turb([row['phi'], row['s2_tx'], row['s2max']]), axis=1).to_numpy()
 
     Tx_zero = np.where(Tx < 1)[0]
@@ -112,64 +142,14 @@
     Tx[Tx_zero, :] = 0
 
     Tx_df = pd.DataFrame(Tx, index=seqdata.seqdata.index, columns=['Turbulence'])
-    return Tx_df
-
-if __name__ == "__main__":
-
-    from sequenzo import *
 
-    # ===============================
-    # Sohee
-    # ===============================
-    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
-    # # df = pd.read_csv('/Users/lei/Documents/Sequenzo_all_folders/sequence_data_sources/sohee/sequence_data.csv')
-    # time_list = list(df.columns)[1:133]
-    # states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-    # # states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
-    # labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
-    # sequence_data = SequenceData(df, time=time_list, states=states, labels=labels, id_col="PID")
-    # res = get_turbulence(sequence_data)
-
-    # ===============================
-    # kass
-    # ===============================
-    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
-    # time_list = list(df.columns)[1:]
-    # states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
-    #           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
-    # sequence_data = SequenceData(df, time=time_list, states=states, id_col="COUNTRY")
-    # res = seqST(sequence_data)
-
-    # ===============================
-    # CO2
-    # ===============================
-    # df = pd.read_csv("D:/country_co2_emissions_missing.csv")
-    df = load_dataset('country_co2_emissions_local_deciles')
-    df.to_csv("D:/country_co2_emissions_local_deciles.csv", index=False)
-    _time = list(df.columns)[1:]
-    # states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
-    states = ['D1 (Very Low)', 'D10 (Very High)', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9']
-    sequence_data = SequenceData(df, time=_time, id_col="country", states=states)
-    res = get_turbulence(sequence_data, norm=True, type=2)
-
-    # ===============================
-    # detailed
-    # ===============================
-    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
-    # _time = list(df.columns)[4:]
-    # states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
-    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
-    #                              time=_time, id_col="worker_id", states=states)
-    # res = seqST(sequence_data, norm=False, type=2)
-
-    # ===============================
-    # broad
-    # ===============================
-    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
-    # _time = list(df.columns)[4:]
-    # states = ['Non-computing', 'Non-technical computing', 'Technical computing']
-    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
-    #                              time=_time, id_col="worker_id", states=states)
-    # res = seqST(sequence_data, norm=True, type=2)
-
-    print(res)
+    # Handle ID display options
+    if id_as_column:
+        # Add ID as a separate column and reset index to numeric
+        Tx_df['ID'] = Tx_df.index
+        Tx_df = Tx_df[['ID', 'Turbulence']].reset_index(drop=True)
+    else:
+        # Always set index name to 'ID' for clarity
+        Tx_df.index.name = 'ID'
+
+    return Tx_df
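The practical effect of the new `id_as_column` flag is easiest to see in a small usage sketch. Only the `get_turbulence` signature and the `SequenceData(...)` call pattern come from this diff; the toy data and top-level import path are assumptions for illustration:

```python
# Minimal sketch of the id_as_column flag added in 0.1.18 (assumes
# sequenzo >= 0.1.18; toy DataFrame and import path are hypothetical).
import pandas as pd
from sequenzo import SequenceData, get_turbulence  # import path assumed

df = pd.DataFrame({
    "id": ["p1", "p2", "p3"],
    "T1": ["A", "B", "A"],
    "T2": ["A", "B", "B"],
    "T3": ["B", "B", "A"],
})
seq = SequenceData(df, time=["T1", "T2", "T3"], states=["A", "B"], id_col="id")

# Default (id_as_column=True): IDs come back as a regular 'ID' column.
res = get_turbulence(seq, norm=True, type=2)   # columns: ['ID', 'Turbulence']

# Opt out: keep IDs on the index (now named 'ID'), closer to the 0.1.17 shape.
res_indexed = get_turbulence(seq, norm=True, type=2, id_as_column=False)
```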
--- a/sequenzo/sequence_characteristics/variance_of_spell_durations.py
+++ b/sequenzo/sequence_characteristics/variance_of_spell_durations.py
@@ -1,5 +1,5 @@
 """
-@Author : 李欣怡
+@Author : Xinyi Li, Yuqi Liang
 @File : variance_of_spell_durations.py
 @Time : 2025/9/24 14:22
 @Desc : Variance of spell durations of individual state sequences.
@@ -22,9 +22,9 @@ from .simple_characteristics import cut_prefix
 
 def get_spell_duration_variance(seqdata, type=1):
     if not hasattr(seqdata, 'seqdata'):
-        raise ValueError(" [!] data is NOT a sequence object, see SequenceData function to create one.")
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
     if type not in [1, 2]:
-        raise ValueError(" [!] type must be 1 or 2.")
+        raise ValueError("[!] type must be 1 or 2.")
 
     with open(os.devnull, 'w') as fnull:
         with redirect_stdout(fnull):
@@ -33,7 +33,7 @@ def get_spell_duration_variance(seqdata, type=1):
     lgth = seqlength(seqdata)
     dlgth = seqlength(dss)
     sdist = get_state_freq_and_entropy_per_seq(seqdata)
-    nnvisit = (sdist==0).sum(axis=1)
+    nnvisit = (sdist.iloc[:, 1:]==0).sum(axis=1)
 
     def realvar(x):
         n = len(x)
@@ -57,7 +57,8 @@
     # ret = (np.nansum(ddur, axis=1) + nnvisit * (meand ** 2)) / (dlgth + nnvisit)
     ddur = pd.DataFrame(ddur.tolist())
     sum_sqdiff = np.nansum(ddur.to_numpy(), axis=1)
-    ret = (sum_sqdiff + nnvisit.to_numpy() * (meand.to_numpy() ** 2)) / (dlgth + nnvisit.to_numpy())
+    ret_values = (sum_sqdiff + nnvisit.to_numpy() * (meand.to_numpy() ** 2)) / (dlgth + nnvisit.to_numpy())
+    ret = pd.Series(ret_values, index=meand.index)
 
     alph = seqdata.states.copy()
     alph_size = len(alph)
@@ -67,10 +68,19 @@
     maxnnv = np.where(dlgth == 1, alph_size - 1, alph_size - 2)
 
     meand_max = meand.to_numpy() * (dlgth + nnvisit.to_numpy()) / (dlgth + maxnnv)
-    var_max = ((dlgth-1) * (1-meand_max)**2 + (lgth - dlgth + 1 - meand_max)**2 + maxnnv * meand_max**2) / (dlgth + maxnnv)
+    var_max_values = ((dlgth-1) * (1-meand_max)**2 + (lgth - dlgth + 1 - meand_max)**2 + maxnnv * meand_max**2) / (dlgth + maxnnv)
+    var_max = pd.Series(var_max_values, index=meand.index)
+
+    meand.index = seqdata.seqdata.index
+    ret.index = seqdata.seqdata.index
+    var_max.index = seqdata.seqdata.index
+
+    meand = meand.to_frame("meand")
+    ret = ret.to_frame("var_spell_dur")
+    var_max = var_max.to_frame("var_max")
 
     return {
-        "meand": meand,
-        "result": ret,
-        "vmax": var_max
+        "meand": meand.reset_index().rename(columns={"index": "ID"}),
+        "result": ret.reset_index().rename(columns={"index": "ID"}),
+        "vmax": var_max.reset_index().rename(columns={"index": "ID"}),
     }
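Because `get_spell_duration_variance` now returns DataFrames with an explicit ID column instead of bare Series, downstream code has to select the value column. A hedged consumption sketch (module import path is assumed from the file path above; the key names and the `var_spell_dur` column come from the diff; `seq` reuses the toy object from the earlier sketch):

```python
# Import path assumed from the changed file's location in the wheel.
from sequenzo.sequence_characteristics.variance_of_spell_durations import (
    get_spell_duration_variance,
)

var = get_spell_duration_variance(seq, type=2)  # `seq` from the sketch above
# 0.1.18: var["result"] is a DataFrame with columns like ['ID', 'var_spell_dur'].
result_values = var["result"]["var_spell_dur"].to_numpy()
# The 0.1.17 equivalent was simply var["result"].to_numpy(), since the
# entries were plain Series without an ID column.
```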
--- a/sequenzo/sequence_characteristics/within_sequence_entropy.py
+++ b/sequenzo/sequence_characteristics/within_sequence_entropy.py
@@ -19,7 +19,7 @@ from .state_frequencies_and_entropy_per_sequence import get_state_freq_and_entropy_per_seq
 
 def get_within_sequence_entropy(seqdata, norm=True, base=np.e, silent=True):
     if not isinstance(seqdata, SequenceData):
-        raise ValueError(" [!] data is NOT a sequence object, see SequenceData function to create one.")
+        raise ValueError("[!] data is NOT a sequence object, see SequenceData function to create one.")
 
     states = seqdata.states.copy()
 
@@ -29,68 +29,15 @@ def get_within_sequence_entropy(seqdata, norm=True, base=np.e, silent=True):
     with open(os.devnull, 'w') as fnull:
         with redirect_stdout(fnull):
             iseqtab = get_state_freq_and_entropy_per_seq(seqdata=seqdata)
+            iseqtab.index = seqdata.seqdata.index
 
-    ient = iseqtab.apply(lambda row: entropy(row, base=base), axis=1)
+    ient = iseqtab.iloc[:, 1:].apply(lambda row: entropy(row, base=base), axis=1)
 
     if norm:
         maxent = np.log(len(states))
         ient = ient / maxent
 
-    ient.columns = ['Entropy']
-    ient.index = seqdata.seqdata.index
+    ient = pd.DataFrame(ient, index=seqdata.seqdata.index, columns=['Entropy'])
+    ient = ient.reset_index().rename(columns={'index': 'ID'})
 
     return ient
-
-
-if __name__ == "__main__":
-    # ===============================
-    # Sohee
-    # ===============================
-    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
-    # time_list = list(df.columns)[1:133]
-    # states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-    # # states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
-    # labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
-    # sequence_data = SequenceData(df, time=time_list, states=states, labels=labels, id_col="PID")
-    # res = seqient(sequence_data)
-
-    # ===============================
-    # kass
-    # ===============================
-    # df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
-    # time_list = list(df.columns)[1:]
-    # states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
-    #           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
-    # sequence_data = SequenceData(df, time=time_list, states=states, id_col="COUNTRY")
-    # res = seqient(sequence_data)
-
-    # ===============================
-    # CO2
-    # ===============================
-    # df = pd.read_csv("D:/country_co2_emissions_missing.csv")
-    # _time = list(df.columns)[1:]
-    # states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
-    # sequence_data = SequenceData(df, time=_time, id_col="country", states=states)
-    # res = seqient(sequence_data)
-
-    # ===============================
-    # detailed
-    # ===============================
-    # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
-    # _time = list(df.columns)[4:]
-    # states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
-    # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
-    #                              time=_time, id_col="worker_id", states=states)
-    # res = seqient(sequence_data)
-
-    # ===============================
-    # broad
-    # ===============================
-    df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
-    _time = list(df.columns)[4:]
-    states = ['Non-computing', 'Non-technical computing', 'Technical computing']
-    sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
-                                 time=_time, id_col="worker_id", states=states)
-    res = get_within_sequence_entropy(sequence_data)
-
-    print(res)
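The entropy function follows the same return-shape convention as the other sequence characteristics in this release. A short consumption sketch (import path assumed; `seq` and `res` reuse the toy objects from the first sketch):

```python
from sequenzo import get_within_sequence_entropy  # import path assumed

ent = get_within_sequence_entropy(seq, norm=True)  # `seq` from the first sketch
# 0.1.18 returns columns ['ID', 'Entropy'], so joins go through the ID column:
merged = ent.merge(res, on="ID")  # e.g., combine with the turbulence table
```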
--- a/sequenzo/visualization/plot_sequence_index.py
+++ b/sequenzo/visualization/plot_sequence_index.py
@@ -289,10 +289,12 @@ def sort_sequences_by_method(seqdata, method="unsorted", mask=None, distance_mat
 
 
 def plot_sequence_index(seqdata: SequenceData,
-                        show_by_category=None,
-                        category_labels=None,
-                        id_group_df=None,
-                        categories=None,
+                        # Grouping parameters
+                        group_by_column=None,
+                        group_dataframe=None,
+                        group_column_name=None,
+                        group_labels=None,
+                        # Other parameters
                         sort_by="lexicographic",
                         sort_by_weight=False,
                         weights="auto",
@@ -320,13 +322,34 @@
     This function creates index plots that visualize sequences as horizontal lines,
     with different sorting options matching R's TraMineR functionality.
 
+    **Two API modes for grouping:**
+
+    1. **Simplified API** (when grouping info is already in the data):
+       ```python
+       plot_sequence_index(seqdata, group_by_column="Cluster", group_labels=cluster_labels)
+       ```
+
+    2. **Complete API** (when grouping info is in a separate dataframe):
+       ```python
+       plot_sequence_index(seqdata, group_dataframe=membership_df,
+                           group_column_name="Cluster", group_labels=cluster_labels)
+       ```
+
     :param seqdata: SequenceData object containing sequence information
-    :param show_by_category: (str, optional) Simple way to create grouped plots.
-        Specify the column name from the original data (e.g., "sex", "education").
-        This will automatically create separate plots for each category.
-    :param category_labels: (dict, optional) Custom labels for category values.
-        Example: {0: "Female", 1: "Male"} or {"low": "Low Education", "high": "High Education"}.
-        If not provided, will use original values or auto-generate readable labels.
+
+    **New API parameters (recommended):**
+    :param group_by_column: (str, optional) Column name from seqdata.data to group by.
+        Use this when grouping information is already in your data.
+        Example: "Cluster", "sex", "education"
+    :param group_dataframe: (pd.DataFrame, optional) Separate dataframe containing grouping information.
+        Use this when grouping info is in a separate table (e.g., clustering results).
+        Must contain ID column and grouping column.
+    :param group_column_name: (str, optional) Name of the grouping column in group_dataframe.
+        Required when using group_dataframe.
+    :param group_labels: (dict, optional) Custom labels for group values.
+        Example: {1: "Late Family Formation", 2: "Early Partnership"}
+        Maps original values to display labels.
+
     :param sort_by: Sorting method for sequences within groups:
         - 'unsorted' or 'none': Keep original order (R TraMineR default)
         - 'lexicographic': Sort sequences lexicographically
@@ -392,45 +415,45 @@
 
     actual_figsize = style_sizes[plot_style]
 
-    # Handle the new simplified API: show_by_category
-    if show_by_category is not None:
+    # Handle the simplified API: group_by_column
+    if group_by_column is not None:
         # Validate that the column exists in the original data
-        if show_by_category not in seqdata.data.columns:
+        if group_by_column not in seqdata.data.columns:
             available_cols = [col for col in seqdata.data.columns if col not in seqdata.time and col != seqdata.id_col]
             raise ValueError(
-                f"Column '{show_by_category}' not found in the data. "
+                f"Column '{group_by_column}' not found in the data. "
                 f"Available columns for grouping: {available_cols}"
             )
 
-        # Automatically create id_group_df and categories from the simplified API
-        id_group_df = seqdata.data[[seqdata.id_col, show_by_category]].copy()
-        id_group_df.columns = ['Entity ID', 'Category']
-        categories = 'Category'
+        # Automatically create group_dataframe and group_column_name from the simplified API
+        group_dataframe = seqdata.data[[seqdata.id_col, group_by_column]].copy()
+        group_dataframe.columns = ['Entity ID', 'Category']
+        group_column_name = 'Category'
 
-        # Handle category labels - flexible and user-controllable
-        unique_values = seqdata.data[show_by_category].unique()
+        # Handle group labels - flexible and user-controllable
+        unique_values = seqdata.data[group_by_column].unique()
 
-        if category_labels is not None:
+        if group_labels is not None:
             # User provided custom labels - use them
-            missing_keys = set(unique_values) - set(category_labels.keys())
+            missing_keys = set(unique_values) - set(group_labels.keys())
             if missing_keys:
                 raise ValueError(
-                    f"category_labels missing mappings for values: {missing_keys}. "
-                    f"Please provide labels for all unique values in '{show_by_category}': {sorted(unique_values)}"
+                    f"group_labels missing mappings for values: {missing_keys}. "
+                    f"Please provide labels for all unique values in '{group_by_column}': {sorted(unique_values)}"
                 )
-            id_group_df['Category'] = id_group_df['Category'].map(category_labels)
+            group_dataframe['Category'] = group_dataframe['Category'].map(group_labels)
         else:
             # No custom labels provided - use smart defaults
             if all(isinstance(v, (int, float, np.integer, np.floating)) and not pd.isna(v) for v in unique_values):
-                # Numeric values - keep as is (user can provide category_labels if they want custom names)
+                # Numeric values - keep as is (user can provide group_labels if they want custom names)
                 pass
             # For string/categorical values, keep original values
             # This handles cases where users already have meaningful labels like "Male"/"Female"
 
-        print(f"[>] Creating grouped plots by '{show_by_category}' with {len(unique_values)} categories")
+        print(f"[>] Creating grouped plots by '{group_by_column}' with {len(unique_values)} categories")
 
     # If no grouping information, create a single plot
-    if id_group_df is None or categories is None:
+    if group_dataframe is None or group_column_name is None:
         return _sequence_index_plot_single(seqdata, sort_by, sort_by_weight, weights, actual_figsize, plot_style, title, xlabel, ylabel, save_as, dpi, fontsize, include_legend, sequence_selection, n_sequences, show_sequence_ids)
 
     # Process weights
@@ -443,21 +466,21 @@
             raise ValueError("Length of weights must equal number of sequences.")
 
     # Ensure ID columns match (convert if needed)
-    id_col_name = "Entity ID" if "Entity ID" in id_group_df.columns else id_group_df.columns[0]
+    id_col_name = "Entity ID" if "Entity ID" in group_dataframe.columns else group_dataframe.columns[0]
 
     # Get unique groups and sort them based on user preference
     if group_order:
         # Use manually specified order, filter out non-existing groups
-        groups = [g for g in group_order if g in id_group_df[categories].unique()]
-        missing_groups = [g for g in id_group_df[categories].unique() if g not in group_order]
+        groups = [g for g in group_order if g in group_dataframe[group_column_name].unique()]
+        missing_groups = [g for g in group_dataframe[group_column_name].unique() if g not in group_order]
         if missing_groups:
             print(f"[Warning] Groups not in group_order will be excluded: {missing_groups}")
     elif sort_groups == 'numeric' or sort_groups == 'auto':
-        groups = smart_sort_groups(id_group_df[categories].unique())
+        groups = smart_sort_groups(group_dataframe[group_column_name].unique())
     elif sort_groups == 'alpha':
-        groups = sorted(id_group_df[categories].unique())
+        groups = sorted(group_dataframe[group_column_name].unique())
     elif sort_groups == 'none':
-        groups = list(id_group_df[categories].unique())
+        groups = list(group_dataframe[group_column_name].unique())
     else:
         raise ValueError(f"Invalid sort_groups value: {sort_groups}. Use 'auto', 'numeric', 'alpha', or 'none'.")
 
@@ -477,7 +500,7 @@
     # Create a plot for each group
     for i, group in enumerate(groups):
         # Get IDs for this group
-        group_ids = id_group_df[id_group_df[categories] == group][id_col_name].values
+        group_ids = group_dataframe[group_dataframe[group_column_name] == group][id_col_name].values
 
         # Match IDs with sequence data
         mask = np.isin(seqdata.ids, group_ids)
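For the plotting functions, migration is a pure parameter rename. A self-contained sketch (old and new parameter names are taken from the diff; the import path and the toy data with a `sex` column are assumptions):

```python
# Migration sketch for the grouping-parameter renames in 0.1.18.
import pandas as pd
from sequenzo import SequenceData
from sequenzo.visualization import plot_sequence_index  # import path assumed

df = pd.DataFrame({
    "id": ["p1", "p2", "p3", "p4"],
    "sex": [0, 1, 0, 1],
    "T1": ["A", "B", "A", "B"],
    "T2": ["A", "B", "B", "A"],
})
seq = SequenceData(df, time=["T1", "T2"], states=["A", "B"], id_col="id")

# 0.1.17:
#   plot_sequence_index(seq, show_by_category="sex",
#                       category_labels={0: "Female", 1: "Male"})
# 0.1.18:
plot_sequence_index(seq, group_by_column="sex",
                    group_labels={0: "Female", 1: "Male"})
```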
--- a/sequenzo/visualization/plot_state_distribution.py
+++ b/sequenzo/visualization/plot_state_distribution.py
@@ -41,10 +41,12 @@ def smart_sort_groups(groups):
 
 
 def plot_state_distribution(seqdata: SequenceData,
-                            show_by_category=None,
-                            category_labels=None,
-                            id_group_df=None,
-                            categories=None,
+                            # Grouping parameters
+                            group_by_column=None,
+                            group_dataframe=None,
+                            group_column_name=None,
+                            group_labels=None,
+                            # Other parameters
                             weights="auto",
                             figsize=(12, 7),
                             plot_style="standard",
@@ -67,13 +69,33 @@ def plot_state_distribution(seqdata: SequenceData,
     Creates state distribution plots for different groups, showing how state
     prevalence changes over time within each group.
 
+    **Two API modes for grouping:**
+
+    1. **Simplified API** (when grouping info is already in the data):
+       ```python
+       plot_state_distribution(seqdata, group_by_column="Cluster", group_labels=cluster_labels)
+       ```
+
+    2. **Complete API** (when grouping info is in a separate dataframe):
+       ```python
+       plot_state_distribution(seqdata, group_dataframe=membership_df,
+                               group_column_name="Cluster", group_labels=cluster_labels)
+       ```
+
     :param seqdata: (SequenceData) A SequenceData object containing sequences
-    :param show_by_category: (str, optional) Simple way to create grouped plots.
-        Specify the column name from the original data (e.g., "sex", "education").
-        This will automatically create separate plots for each category.
-    :param category_labels: (dict, optional) Custom labels for category values.
-        Example: {0: "Female", 1: "Male"} or {"low": "Low Education", "high": "High Education"}.
-        If not provided, will use original values or auto-generate readable labels.
+
+    **Grouping parameters:**
+    :param group_by_column: (str, optional) Column name from seqdata.data to group by.
+        Use this when grouping information is already in your data.
+        Example: "Cluster", "sex", "education"
+    :param group_dataframe: (pd.DataFrame, optional) Separate dataframe containing grouping information.
+        Use this when grouping info is in a separate table (e.g., clustering results).
+        Must contain ID column and grouping column.
+    :param group_column_name: (str, optional) Name of the grouping column in group_dataframe.
+        Required when using group_dataframe.
+    :param group_labels: (dict, optional) Custom labels for group values.
+        Example: {1: "Late Family Formation", 2: "Early Partnership"}
+        Maps original values to display labels.
     :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
     :param figsize: (tuple) Size of the figure (only used when plot_style="custom")
     :param plot_style: Plot aspect style:
@@ -122,46 +144,45 @@ def plot_state_distribution(seqdata: SequenceData,
 
     actual_figsize = style_sizes[plot_style]
 
-    # Handle the new simplified API: show_by_category
-    if show_by_category is not None:
-
+    # Handle the simplified API: group_by_column
+    if group_by_column is not None:
         # Validate that the column exists in the original data
-        if show_by_category not in seqdata.data.columns:
+        if group_by_column not in seqdata.data.columns:
             available_cols = [col for col in seqdata.data.columns if col not in seqdata.time and col != seqdata.id_col]
             raise ValueError(
-                f"Column '{show_by_category}' not found in the data. "
+                f"Column '{group_by_column}' not found in the data. "
                 f"Available columns for grouping: {available_cols}"
            )
 
-        # Automatically create id_group_df and categories from the simplified API
-        id_group_df = seqdata.data[[seqdata.id_col, show_by_category]].copy()
-        id_group_df.columns = ['Entity ID', 'Category']
-        categories = 'Category'
+        # Automatically create group_dataframe and group_column_name from the simplified API
+        group_dataframe = seqdata.data[[seqdata.id_col, group_by_column]].copy()
+        group_dataframe.columns = ['Entity ID', 'Category']
+        group_column_name = 'Category'
 
-        # Handle category labels - flexible and user-controllable
-        unique_values = seqdata.data[show_by_category].unique()
+        # Handle group labels - flexible and user-controllable
+        unique_values = seqdata.data[group_by_column].unique()
 
-        if category_labels is not None:
+        if group_labels is not None:
             # User provided custom labels - use them
-            missing_keys = set(unique_values) - set(category_labels.keys())
+            missing_keys = set(unique_values) - set(group_labels.keys())
             if missing_keys:
                 raise ValueError(
-                    f"category_labels missing mappings for values: {missing_keys}. "
-                    f"Please provide labels for all unique values in '{show_by_category}': {sorted(unique_values)}"
+                    f"group_labels missing mappings for values: {missing_keys}. "
+                    f"Please provide labels for all unique values in '{group_by_column}': {sorted(unique_values)}"
                 )
-            id_group_df['Category'] = id_group_df['Category'].map(category_labels)
+            group_dataframe['Category'] = group_dataframe['Category'].map(group_labels)
         else:
             # No custom labels provided - use smart defaults
             if all(isinstance(v, (int, float, np.integer, np.floating)) and not pd.isna(v) for v in unique_values):
-                # Numeric values - keep as is (user can provide category_labels if they want custom names)
+                # Numeric values - keep as is (user can provide group_labels if they want custom names)
                 pass
             # For string/categorical values, keep original values
             # This handles cases where users already have meaningful labels like "Male"/"Female"
 
-        print(f"[>] Creating grouped plots by '{show_by_category}' with {len(unique_values)} categories")
+        print(f"[>] Creating grouped plots by '{group_by_column}' with {len(unique_values)} categories")
 
     # If no grouping information, create a single plot
-    if id_group_df is None or categories is None:
+    if group_dataframe is None or group_column_name is None:
         return _plot_state_distribution_single(
             seqdata=seqdata, weights=weights, figsize=actual_figsize,
             plot_style=plot_style, title=title, xlabel=xlabel, ylabel=ylabel,
@@ -179,21 +200,21 @@ def plot_state_distribution(seqdata: SequenceData,
         raise ValueError("Length of weights must equal number of sequences.")
 
     # Ensure ID columns match (convert if needed)
-    id_col_name = "Entity ID" if "Entity ID" in id_group_df.columns else id_group_df.columns[0]
+    id_col_name = "Entity ID" if "Entity ID" in group_dataframe.columns else group_dataframe.columns[0]
 
     # Get unique groups and sort them based on user preference
     if group_order:
         # Use manually specified order, filter out non-existing groups
-        groups = [g for g in group_order if g in id_group_df[categories].unique()]
-        missing_groups = [g for g in id_group_df[categories].unique() if g not in group_order]
+        groups = [g for g in group_order if g in group_dataframe[group_column_name].unique()]
+        missing_groups = [g for g in group_dataframe[group_column_name].unique() if g not in group_order]
         if missing_groups:
             print(f"[Warning] Groups not in group_order will be excluded: {missing_groups}")
     elif sort_groups == 'numeric' or sort_groups == 'auto':
-        groups = smart_sort_groups(id_group_df[categories].unique())
+        groups = smart_sort_groups(group_dataframe[group_column_name].unique())
     elif sort_groups == 'alpha':
-        groups = sorted(id_group_df[categories].unique())
+        groups = sorted(group_dataframe[group_column_name].unique())
     elif sort_groups == 'none':
-        groups = list(id_group_df[categories].unique())
+        groups = list(group_dataframe[group_column_name].unique())
     else:
         raise ValueError(f"Invalid sort_groups value: {sort_groups}. Use 'auto', 'numeric', 'alpha', or 'none'.")
 
@@ -216,7 +237,7 @@ def plot_state_distribution(seqdata: SequenceData,
     # Process each group
     for i, group in enumerate(groups):
         # Get IDs for this group
-        group_ids = id_group_df[id_group_df[categories] == group][id_col_name].values
+        group_ids = group_dataframe[group_dataframe[group_column_name] == group][id_col_name].values
 
         # Match IDs with sequence data
         mask = np.isin(seqdata.ids, group_ids)
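The same renames apply to plot_state_distribution, including the "complete API" mode where group membership lives in a separate table (e.g., clustering output). A sketch under the same assumptions, with a hypothetical membership table; the parameter names, the "Entity ID" column convention, and the example labels come from the diff:

```python
import pandas as pd
from sequenzo.visualization import plot_state_distribution  # import path assumed

# Hypothetical clustering output: one row per sequence ID.
membership_df = pd.DataFrame({
    "Entity ID": ["p1", "p2", "p3", "p4"],
    "Cluster": [1, 1, 2, 2],
})

plot_state_distribution(
    seq,                                   # SequenceData from the sketch above
    group_dataframe=membership_df,
    group_column_name="Cluster",
    group_labels={1: "Late Family Formation", 2: "Early Partnership"},
)
```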