PyPI - sequenzo - Versions diffs - 0.1.21__cp310-cp310-macosx_11_0_arm64.whl - Mend

sequenzo 0.1.21__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sequenzo might be problematic. Click here for more details.

Files changed (260) hide show

sequenzo/visualization/plot_mean_time.py ADDED Viewed

@@ -0,0 +1,194 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : plot_mean_time.py
+@Time    : 14/02/2025 10:12
+@Desc    :
+    Implementation of Mean Time Plot for social sequence analysis,
+    closely following ggseqplot's `ggseqmtplot` function,
+    and TraMineR's `plot.stslist.meant.Rd` for mean time calculation.
+"""
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from typing import Optional
+from sequenzo.define_sequence_data import SequenceData
+from sequenzo.visualization.utils import (
+    save_and_show_results,
+    show_plot_title
+)
+def _compute_mean_time(seqdata: SequenceData, weights="auto") -> pd.DataFrame:
+    """
+    Compute mean total time spent in each state across all sequences.
+    Optimized version using pandas operations.
+    :param seqdata: SequenceData object containing sequence information
+    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
+    :return: DataFrame with mean time spent and standard error for each state
+    """
+    # Process weights
+    if isinstance(weights, str) and weights == "auto":
+        weights = getattr(seqdata, "weights", None)
+    if weights is not None:
+        weights = np.asarray(weights, dtype=float).reshape(-1)
+        if len(weights) != len(seqdata.values):
+            raise ValueError("Length of weights must equal number of sequences.")
+    # Get data and preprocess
+    seq_df = seqdata.to_dataframe()
+    inv = {v: k for k, v in seqdata.state_mapping.items()}
+    states = list(range(1, len(seqdata.states) + 1))  # Use numerical state indices
+    n = len(seq_df)
+    # Get weights
+    if weights is None:
+        w = np.ones(n)
+    else:
+        w = np.asarray(weights, dtype=float)
+    # Broadcast weights to each time point
+    W = np.repeat(w[:, None], seq_df.shape[1], axis=1)
+    # Convert to long format with weights
+    df_long = seq_df.melt(value_name='state_idx')
+    # Replicate weights for each time point
+    W_long = pd.DataFrame(W, columns=seq_df.columns).melt(value_name='w')['w'].to_numpy()
+    df_long['w'] = W_long
+    # Keep state_idx as numerical for consistent grouping
+    # Calculate mean time spent in each state per sequence
+    # For each sequence, count time spent in each state, then take weighted average
+    seq_state_times = {}
+    for s in states:
+        # Count occurrences of state s in each sequence
+        state_counts = (seq_df == s).sum(axis=1)  # Sum across time for each sequence
+        # Calculate weighted mean across sequences
+        seq_state_times[s] = np.average(state_counts, weights=w) if len(state_counts) > 0 else 0.0
+    mean_times = seq_state_times
+    # Calculate standard errors for mean time
+    se = {}
+    n_sequences = len(seq_df)
+    for s in states:
+        if n_sequences > 1:
+            # Count occurrences of state s in each sequence
+            state_counts = (seq_df == s).sum(axis=1)
+            # Calculate weighted standard error
+            weighted_mean = seq_state_times[s]
+            weighted_var = np.average((state_counts - weighted_mean) ** 2, weights=w)
+            # Standard error of the weighted mean
+            se[s] = np.sqrt(weighted_var / n_sequences) if weighted_var >= 0 else 0.0
+        else:
+            se[s] = 0.0
+    # Create result DataFrame
+    mean_time_df = pd.DataFrame({
+        'State': [inv[s] for s in states],
+        'MeanTime': [mean_times[s] for s in states],
+        'StandardError': [se[s] for s in states]
+    })
+    mean_time_df.sort_values(by='MeanTime', ascending=True, inplace=True)
+    return mean_time_df
+def plot_mean_time(seqdata: SequenceData,
+                   weights="auto",
+                   show_error_bar: bool = True,
+                   title=None,
+                   x_label="Mean Time",
+                   y_label="State",
+                   fontsize: int = 12,
+                   save_as: Optional[str] = None,
+                   dpi: int = 200) -> None:
+    """
+    Plot Mean Time Plot for sequence data with clean white background.
+    :param seqdata: SequenceData object containing sequence information
+    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
+    :param show_error_bar: Boolean flag to show or hide error bars
+    :param title: Optional title for the plot
+    :param x_label: Label for the x-axis
+    :param y_label: Label for the y-axis
+    :param save_as: Optional file path to save the plot
+    :param dpi: Resolution of the saved plot
+    """
+    # Use default style as base
+    plt.style.use('default')
+    # Compute all required data at once
+    mean_time_df = _compute_mean_time(seqdata, weights)
+    # Create figure and preallocate memory
+    fig = plt.figure(figsize=(12, 7))
+    # Create main plot
+    ax = plt.subplot(111)
+    # Get color mapping - use original colors without enhancement
+    cmap = seqdata.get_colormap()
+    colors = [cmap.colors[i] for i in range(len(seqdata.states))]
+    # Assign colors to states (without enhancing saturation)
+    mean_time_df['Color'] = pd.Categorical(mean_time_df['State']).codes
+    mean_time_df['Color'] = mean_time_df['Color'].map(lambda x: colors[x])
+    # Create custom barplot
+    for i, (_, row) in enumerate(mean_time_df.iterrows()):
+        ax.barh(y=i, width=row['MeanTime'], height=0.7,
+                color=row['Color'], edgecolor='white', linewidth=0.5)
+    # Set y-axis ticks and labels
+    ax.set_yticks(range(len(mean_time_df)))
+    ax.set_yticklabels(mean_time_df['State'], fontsize=fontsize-2)
+    # Add error bars if needed
+    if show_error_bar:
+        ax.errorbar(
+            x=mean_time_df["MeanTime"],
+            y=range(len(mean_time_df)),
+            xerr=mean_time_df["StandardError"],
+            fmt='none',
+            ecolor='black',
+            capsize=3,
+            capthick=1,
+            elinewidth=1.5
+        )
+    # Set plot properties
+    if title:
+        show_plot_title(ax, title, show=True, fontsize=fontsize+2, fontweight='bold', pad=20)
+    ax.set_xlabel(x_label, fontsize=fontsize)
+    ax.set_ylabel(y_label, fontsize=fontsize, labelpad=15)
+    # Clean white background with light grid
+    ax.set_facecolor('white')
+    ax.grid(axis='x', color='#E0E0E0', linestyle='-', linewidth=0.5)
+    ax.set_axisbelow(True)  # Place grid lines behind the bars
+    # Customize borders
+    for spine in ax.spines.values():
+        spine.set_color('#CCCCCC')  # Light gray border
+        spine.set_linewidth(0.5)
+    # Adjust layout(1/2)
+    plt.subplots_adjust(left=0.3)
+    # Add a note about normalization
+    relative_threshold = 0.01
+    max_val = mean_time_df['MeanTime'].max()
+    too_many_small = np.sum(mean_time_df['MeanTime'] < relative_threshold * max_val) >= 1
+    if too_many_small:
+        norm_note = f"Note: Some bars may appear as zero, but actually have small non-zero values."
+        plt.figtext(0.5, -0.02, norm_note, ha='center', fontsize=fontsize-2, style='italic')
+    # Adjust layout(2/2)
+    plt.tight_layout()
+    save_and_show_results(save_as, dpi=200)

sequenzo/visualization/plot_modal_state.py ADDED Viewed

@@ -0,0 +1,276 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : plot_modal_state.py
+@Time    : 01/03/2025 13:45
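+@Desc    :
+    Modal state plot: for each time point, shows the most frequent state
+    and its relative frequency, optionally split by a grouping variable.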
+"""
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from typing import Optional, Union, List
+from sequenzo import SequenceData
+from sequenzo.visualization.utils import (
+    set_up_time_labels_for_x_axis,
+    create_standalone_legend,
+    save_figure_to_buffer,
+    combine_plot_with_legend,
+    save_and_show_results,
+    show_plot_title
+)
+from PIL import Image
+def plot_modal_state(seqdata: SequenceData,
+                     group_by: Optional[Union[str, pd.Series, np.ndarray]] = None,
+                     group_labels: Optional[List[str]] = None,
+                     weights="auto",
+                     xlabel: str = "Time",
+                     ylabel: str = "Rel. Freq.",
+                     fig_width: int = 12,
+                     fig_height: Optional[int] = None,
+                     show_counts: bool = True,
+                     show_group_titles: bool = True,
+                     fontsize: int = 12,
+                     save_as: Optional[str] = None,
+                     dpi: int = 200) -> None:
+    """
+    Creates a modal state frequency plot showing the most frequent state at each position
+    and its relative frequency, with optional grouping by a categorical variable.
+    :param seqdata: SequenceData object containing sequence information
+    :param group_by: Column name or array with grouping variable
+    :param group_labels: Optional custom labels for groups
+    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
+    :param xlabel: Label for the x-axis
+    :param ylabel: Label for the y-axis
+    :param fig_width: Width of the figure
+    :param fig_height: Height of the figure (auto-calculated based on groups if None)
+    :param show_counts: Whether to show the count of sequences in each group title
+    :param save_as: Optional file path to save the plot
+    :param dpi: Resolution of the saved plot
+    :return: None
+    """
+    # Process weights
+    if isinstance(weights, str) and weights == "auto":
+        weights = getattr(seqdata, "weights", None)
+    if weights is not None:
+        weights = np.asarray(weights, dtype=float).reshape(-1)
+        if len(weights) != len(seqdata.values):
+            raise ValueError("Length of weights must equal number of sequences.")
+    # Get sequence data as a DataFrame
+    seq_df = seqdata.to_dataframe()
+    # Ensure seq_df has the same index as the original data
+    # This is crucial to align the grouping variable with sequence data
+    seq_df.index = seqdata.data.index
+    # Get weights for all sequences
+    if weights is None:
+        w_all = np.ones(len(seq_df))
+    else:
+        w_all = np.asarray(weights)
+    # Create state mapping from numerical values back to state names
+    inv_state_mapping = {v: k for k, v in seqdata.state_mapping.items()}
+    # Process grouping variable
+    if group_by is None:
+        # If no grouping, create a single group with all sequences
+        groups = pd.Series(["All Sequences"] * len(seq_df), index=seq_df.index)
+        if group_labels is None:
+            group_labels = ["All Sequences"]
+    elif isinstance(group_by, str):
+        # If grouping by column name from original data
+        if group_by not in seqdata.data.columns:
+            raise ValueError(f"Column '{group_by}' not found in sequence data")
+        groups = seqdata.data[group_by]
+        if group_labels is None:
+            group_labels = sorted(groups.unique())
+    else:
+        # If grouping by external array or Series
+        if len(group_by) != len(seq_df):
+            raise ValueError("Length of group_by must match number of sequences")
+        groups = pd.Series(group_by)
+        if group_labels is None:
+            group_labels = sorted(set(groups))
+    # Prepare plotting
+    n_groups = len(group_labels)
+    n_time_points = len(seq_df.columns)
+    if fig_height is None:
+        # Auto-calculate height based on number of groups
+        fig_height = max(4, 3 * n_groups)
+    # TODO: Title is not very pretty here so I decided to remove it.
+    # But here I keep 1 to keep the space big enough for the distance
+    # between the second subplot and the upper first subplot
+    title_height = 1
+    adjusted_fig_height = fig_height + title_height
+    # Create main figure with additional space for title
+    main_fig = plt.figure(figsize=(fig_width, adjusted_fig_height))
+    # No title, use whole figure for plots
+    plot_gs = main_fig.add_gridspec(nrows=n_groups, height_ratios=[1] * n_groups, hspace=0.3)
+    # Create axes for each group
+    axes = []
+    for i in range(n_groups):
+        axes.append(main_fig.add_subplot(plot_gs[i]))
+    # Make sure all axes share x and y scales
+    for ax in axes[1:]:
+        ax.sharex(axes[0])
+        ax.sharey(axes[0])
+    # Get colors for states
+    colors = seqdata.color_map_by_label
+    # Process each group
+    for i, group in enumerate(group_labels):
+        ax = axes[i]
+        # Get indices for this group
+        group_indices = groups == group
+        group_count = group_indices.sum()
+        # Skip if no sequences in this group
+        if group_count == 0:
+            continue
+        # Subset data for this group and get corresponding weights
+        group_data = seq_df[group_indices]
+        w = w_all[group_indices.to_numpy()]
+        # Calculate modal states and their frequencies for each time point
+        modal_states = []
+        modal_freqs = []
+        for col in group_data.columns:
+            states_idx = group_data[col].to_numpy()
+            # Calculate weighted counts for each state
+            weighted_sum = {}
+            # Use numerical state indices (1, 2, 3, ...) instead of state labels
+            for s_num in range(1, len(seqdata.states) + 1):  # s_num is the integer encoding
+                weighted_sum[s_num] = float(w[states_idx == s_num].sum())
+            totw = float(w.sum())
+            if totw > 0:
+                # Find the state with maximum weighted count
+                modal_s = max(weighted_sum, key=weighted_sum.get)
+                modal_state = inv_state_mapping[modal_s]
+                modal_freq = weighted_sum[modal_s] / totw
+            else:
+                modal_state, modal_freq = None, 0.0
+            modal_states.append(modal_state)
+            modal_freqs.append(modal_freq)
+        # Equal width for all bars
+        x = np.arange(n_time_points)
+        bar_width = 0.8  # Fixed width for all bars
+        # Create bars with consistent width
+        for j, (state, freq) in enumerate(zip(modal_states, modal_freqs)):
+            if state is not None:
+                # state is already a label from inv_state_mapping
+                ax.bar(x[j], freq, width=bar_width, color=colors[state],
+                       edgecolor='white', linewidth=0.5)
+        # Set group title with count if requested
+        if show_group_titles:
+            if show_counts:
+                if weights is not None and not np.allclose(weights, 1.0):
+                    sum_w = float(w.sum())
+                    title_text = f"{group} (n={group_count}, total weight={sum_w:.1f})"
+                else:
+                    title_text = f"{group} (n={group_count})"
+            else:
+                title_text = group
+            show_plot_title(ax, title_text, show=True, fontsize=fontsize, pad=15)
+        # Set y-axis limits and ticks
+        ax.set_ylim(0, 1.0)
+        ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
+        # Set grid and spines
+        ax.grid(axis='y', color='#E0E0E0', linestyle='-', linewidth=0.5)
+        ax.set_axisbelow(True)
+        # Clean up borders
+        for spine in ax.spines.values():
+            spine.set_color('#CCCCCC')
+            spine.set_linewidth(0.5)
+        # Add y-label only for the middle subplot
+        if i == n_groups // 2:
+            ax.set_ylabel(ylabel, fontsize=fontsize)
+    # Set up X-axis (time) labels on the bottom subplot
+    set_up_time_labels_for_x_axis(seqdata, axes[-1])
+    axes[-1].set_xlabel(xlabel, fontsize=fontsize, labelpad=10)
+    # Save main figure to memory
+    main_buffer = save_figure_to_buffer(main_fig, dpi=dpi)
+    # Create a legend
+    # Create standalone legend
+    legend_buffer = create_standalone_legend(
+        colors=colors,
+        labels=seqdata.labels,
+        ncol=min(5, len(seqdata.states)),
+        figsize=(fig_width, 1),
+        fontsize=fontsize-2,
+        dpi=dpi
+    )
+    if save_as and not save_as.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
+        save_as = save_as + '.png'
+    # Combine main plot with legend
+    combined_img = combine_plot_with_legend(
+        main_buffer,
+        legend_buffer,
+        output_path=save_as,
+        dpi=dpi,
+        padding=20  # Increased padding between plot and legend
+    )
+    # Display combined image
+    plt.figure(figsize=(fig_width, adjusted_fig_height + 1))
+    plt.imshow(combined_img)
+    plt.axis('off')
+    plt.show()
+    plt.close()
+if __name__ == '__main__':
+    # Import necessary libraries
+    from sequenzo import *  # Social sequence analysis
+    import pandas as pd  # Data manipulation
+    # List all the available datasets in Sequenzo
+    print('Available datasets in Sequenzo: ', list_datasets())
+    # Load the data that we would like to explore in this tutorial
+    # `df` is the short for `dataframe`, which is a common variable name for a dataset
+    df = load_dataset('country_co2_emissions')
+    # Create a SequenceData object from the dataset
+    # Define the time-span variable
+    time = list(df.columns)[1:]
+    states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
+    sequence_data = SequenceData(df, time=time, time_type="year", id_col="country", states=states)
+    plot_modal_state(sequence_data)

sequenzo/visualization/plot_most_frequent_sequences.py ADDED Viewed

@@ -0,0 +1,147 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : plot_most_frequent_sequences.py
+@Time    : 12/02/2025 10:40
+@Desc    :
+    Generate sequence frequency plots.
+    This script plots the 10 most frequent sequences,
+    similar to `seqfplot` in R's TraMineR package.
+"""
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from sequenzo.define_sequence_data import SequenceData
+from sequenzo.visualization.utils import (
+    set_up_time_labels_for_x_axis,
+    save_and_show_results,
+    show_plot_title
+)
+def plot_most_frequent_sequences(seqdata: SequenceData, top_n: int = 10, weights="auto", title=None, fontsize=12, save_as=None, dpi=200, show_title: bool = True):
+    """
+    Generate a sequence frequency plot, similar to R's seqfplot.
+    :param seqdata: (SequenceData) A SequenceData object containing sequences.
+    :param top_n: (int) Number of most frequent sequences to display.
+    :param weights: (np.ndarray or "auto") Weights for sequences. If "auto", uses seqdata.weights if available
+    :param title: (str, optional) Title for the plot. If None, no title will be displayed.
+    :param fontsize: (int) Base font size for text elements
+    :param save_as: (str, optional) Path to save the plot.
+    :param dpi: (int) Resolution of the saved plot.
+    """
+    sequences = seqdata.values.tolist()
+    # Process weights
+    if isinstance(weights, str) and weights == "auto":
+        weights = getattr(seqdata, "weights", None)
+    if weights is not None:
+        weights = np.asarray(weights, dtype=float).reshape(-1)
+        if len(weights) != len(seqdata.values):
+            raise ValueError("Length of weights must equal number of sequences.")
+    if weights is None:
+        weights = np.ones(len(sequences))
+    # Weighted counting of sequences
+    agg = {}
+    for seq, w in zip(sequences, weights):
+        key = tuple(seq)
+        agg[key] = agg.get(key, 0.0) + float(w)
+    # Select Top-N by weighted frequency
+    items = sorted(agg.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
+    df = pd.DataFrame(items, columns=['sequence', 'wcount'])
+    totw = float(np.sum(weights))
+    df['freq'] = df['wcount'] / (totw if totw > 0 else 1.0) * 100.0
+    # **Ensure colors match seqdef**
+    # Use numeric color map directly to avoid label/state-name mismatches
+    inv_state_mapping = {v: k for k, v in seqdata.state_mapping.items()}  # Reverse mapping kept if needed elsewhere
+    # **Plot settings**
+    fig, ax = plt.subplots(figsize=(10, 6))
+    # **Adjust y_positions calculation to ensure sequences fill the entire y-axis**
+    y_positions = df['freq'].cumsum() - df['freq'] / 2  # Center the bars
+    for i, (seq, freq) in enumerate(zip(df['sequence'], df['freq'])):
+        left = 0  # Starting x position
+        for t, state_idx in enumerate(seq):
+            # Use numeric-coded color map; if unknown, fall back to gray
+            color = seqdata.color_map.get(int(state_idx), "gray")
+            width = 1  # Width of each time slice
+            ax.barh(y=y_positions[i], width=width * 1.01, left=left - 0.005,
+                    height=freq, color=color, linewidth=0,
+                    antialiased=False)
+            left += width  # Move to the next time slice
+    # **Formatting**
+    ax.set_xlabel("Time", fontsize=fontsize)
+    # Check if we have effective weights (not all 1.0) and they were provided by user
+    original_weights = getattr(seqdata, "weights", None)
+    if original_weights is not None and not np.allclose(original_weights, 1.0):
+        # Show both count and weighted total if weights are used
+        ax.set_ylabel("Cumulative Frequency (%)\nN={:,}, total weight={:.1f}".format(len(sequences), totw), fontsize=fontsize)
+    else:
+        ax.set_ylabel("Cumulative Frequency (%)\nN={:,}".format(len(sequences)), fontsize=fontsize)
+    if show_title and title is not None:
+        show_plot_title(ax, title, show=True, fontsize=fontsize+2, pad=20)
+    # **Optimize X-axis ticks: align to the center of each bar**
+    set_up_time_labels_for_x_axis(seqdata, ax)
+    # **Set Y-axis ticks and labels**
+    sum_freq_top_10 = df['freq'].sum()  # Cumulative frequency of top 10 sequences
+    max_freq = df['freq'].max()  # Frequency of the top 1 sequence
+    # Set Y-axis ticks: 0%, top1 frequency, top10 cumulative frequency
+    y_ticks = [0, max_freq, sum_freq_top_10]
+    ax.set_yticks(y_ticks)
+    ax.set_yticklabels([f"{ytick:.1f}%" for ytick in y_ticks], fontsize=fontsize-2)
+    # **Set Y-axis range to ensure the highest tick is the top10 cumulative frequency**
+    # Force Y-axis range to be from 0 to sum_freq_top_10
+    ax.set_ylim(0, sum_freq_top_10)
+    # **Annotate the frequency percentage on the left side of the highest frequency sequence**
+    ax.annotate(f"{max_freq:.1f}%", xy=(-0.5, y_positions.iloc[0]),
+                xycoords="data", fontsize=fontsize, color="black", ha="left", va="center")
+    # **Annotate 0% at the bottom of the Y-axis**
+    ax.annotate("0%", xy=(-0.5, 0), xycoords="data", fontsize=fontsize, color="black", ha="left", va="center")
+    # **Clean up axis aesthetics like plot_state_distribution**
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_visible(True)  # Keep the left border like state_distribution
+    ax.spines['bottom'].set_visible(True)  # Show bottom border to connect with left
+    # Style the left spine to match plot_state_distribution
+    ax.spines['left'].set_color('gray')
+    ax.spines['left'].set_linewidth(0.7)
+    ax.spines['bottom'].set_color('gray')
+    ax.spines['bottom'].set_linewidth(0.7)
+    # Style the tick parameters
+    ax.tick_params(axis='y', colors='gray', length=4, width=0.7)
+    ax.tick_params(axis='x', colors='gray', length=4, width=0.7)
+    # Extend the left spine slightly beyond the plot area
+    ax.spines['left'].set_bounds(0, sum_freq_top_10)
+    ax.spines['left'].set_position(('outward', 5))  # Move spine 5 points to the left
+    # Align bottom spine with the left spine position
+    ax.spines['bottom'].set_position(('outward', 5))  # Move bottom spine to align with left
+    # Use legend from SequenceData
+    ax.legend(*seqdata.get_legend(), bbox_to_anchor=(1.05, 1), loc='upper left')
+    save_and_show_results(save_as, dpi=200)