PyPI - sequenzo - Versions diffs - 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl - Mend

sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

sequenzo/seqhmm/build_nhmm.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : build_nhmm.py
+@Time    : 2025-11-22 19:30
+@Desc    : Build Non-homogeneous HMM models
+This module provides the build_nhmm function, which creates Non-homogeneous HMM
+model objects similar to seqHMM's build_nhmm() function in R.
+Note: This is a simplified implementation. A full implementation would require
+more sophisticated handling of formulas and data structures.
+"""
+import numpy as np
+import pandas as pd
+from typing import Optional, List, Union
+from sequenzo.define_sequence_data import SequenceData
+from .nhmm import NHMM
+from .formulas import Formula
+def build_nhmm(
+    observations: SequenceData,
+    n_states: int,
+    X: Optional[np.ndarray] = None,
+    emission_formula: Optional[Union[str, Formula]] = None,
+    initial_formula: Optional[Union[str, Formula]] = None,
+    transition_formula: Optional[Union[str, Formula]] = None,
+    data: Optional[pd.DataFrame] = None,
+    id_var: Optional[str] = None,
+    time_var: Optional[str] = None,
+    eta_pi: Optional[np.ndarray] = None,
+    eta_A: Optional[np.ndarray] = None,
+    eta_B: Optional[np.ndarray] = None,
+    state_names: Optional[List[str]] = None,
+    random_state: Optional[int] = None
+) -> NHMM:
+    """
+    Build a Non-homogeneous Hidden Markov Model object.
+    A Non-homogeneous HMM allows transition and emission probabilities to vary
+    over time or with covariates. This function only assembles the model
+    structure; it does not fit it (use fit_nhmm() for that).
+    It is similar to seqHMM's build_nhmm() function in R. Covariates can be
+    given either as a ready-made matrix X or through a formula plus a
+    long-format DataFrame.
+    When X is given it is passed to NHMM unchanged and the formula arguments,
+    data, id_var and time_var are ignored. When X is None a single covariate
+    matrix is built from the first non-empty formula among emission_formula,
+    initial_formula and transition_formula (checked in that order); the
+    remaining formulas are not used.
+    Args:
+        observations: SequenceData object containing the sequences to model
+        n_states: Number of hidden states
+        X: Optional covariate matrix of shape (n_sequences, n_timepoints, n_covariates).
+           If None, it is created from a formula.
+        emission_formula: Optional formula (string such as "~ x1 + x2", or Formula object)
+        initial_formula: Optional formula, used only if emission_formula is empty
+        transition_formula: Optional formula, used only if both others are empty
+        data: DataFrame containing the covariates (required when X is None)
+        id_var: Column name for sequence IDs in data (required when X is None)
+        time_var: Column name for the time variable in data (required when X is None)
+        eta_pi: Optional coefficients for initial probabilities (n_covariates x n_states)
+        eta_A: Optional coefficients for transition probabilities (n_covariates x n_states x n_states)
+        eta_B: Optional coefficients for emission probabilities (n_covariates x n_states x n_symbols)
+        state_names: Optional names for hidden states
+        random_state: Random seed for initialization
+        (eta_pi, eta_A, eta_B, state_names and random_state are forwarded to
+        NHMM as-is; their shapes are not validated here.)
+    Returns:
+        NHMM: A Non-homogeneous HMM model object (not yet fitted)
+    Raises:
+        ValueError: If X is None and any of data, id_var, time_var is missing,
+            or if X is None and no formula is provided.
+    Examples:
+        >>> from sequenzo import SequenceData, load_dataset
+        >>> from sequenzo.seqhmm import build_nhmm
+        >>> import numpy as np
+        >>>
+        >>> df = load_dataset('mvad')
+        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
+        >>>
+        >>> # Method 1: Direct covariate matrix
+        >>> n_sequences = len(seq.sequences)
+        >>> n_timepoints = max(len(s) for s in seq.sequences)
+        >>> X = np.zeros((n_sequences, n_timepoints, 1))
+        >>> for i in range(n_sequences):
+        ...     for t in range(len(seq.sequences[i])):
+        ...         X[i, t, 0] = t  # Time covariate
+        >>> nhmm = build_nhmm(seq, n_states=4, X=X, random_state=42)
+        >>>
+        >>> # Method 2: Formula-based (covariate_df is a long-format DataFrame
+        >>> # with one row per sequence-time combination)
+        >>> nhmm = build_nhmm(
+        ...     seq, n_states=4,
+        ...     emission_formula="~ time + age",
+        ...     data=covariate_df,
+        ...     id_var='id',
+        ...     time_var='time',
+        ...     random_state=42
+        ... )
+    """
+    # Create covariate matrix from formulas if X is not provided
+    if X is None:
+        if data is None or id_var is None or time_var is None:
+            raise ValueError(
+                "If X is not provided, must provide data, id_var, and time_var for formula-based specification."
+            )
+        # Only one covariate matrix is built: take the first non-empty formula
+        formula = emission_formula or initial_formula or transition_formula
+        if formula is None:
+            raise ValueError("Must provide either X or at least one formula (emission_formula, initial_formula, or transition_formula).")
+        # The module header imports only Formula from .formulas, so the helper
+        # is imported here; without it this branch fails with a NameError.
+        from .formulas import create_model_matrix
+        # The matrix is sized to the longest observed sequence
+        n_sequences = len(observations.sequences)
+        n_timepoints = max(len(seq) for seq in observations.sequences)
+        X = create_model_matrix(formula, data, id_var, time_var, n_sequences, n_timepoints)
+    # Create and return NHMM object
+    return NHMM(
+        observations=observations,
+        n_states=n_states,
+        X=X,
+        eta_pi=eta_pi,
+        eta_A=eta_A,
+        eta_B=eta_B,
+        state_names=state_names,
+        random_state=random_state
+    )

sequenzo/seqhmm/fit_mhmm.py ADDED Viewed

@@ -0,0 +1,62 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : fit_mhmm.py
+@Time    : 2025-11-21 13:37
+@Desc    : Fit Mixture HMM models using EM algorithm
+This module provides the fit_mhmm function, which estimates Mixture HMM parameters
+using the EM algorithm, similar to seqHMM's fit_model() function for mhmm objects.
+"""
+from typing import Optional
+from .mhmm import MHMM
+def fit_mhmm(
+    model: MHMM,
+    n_iter: int = 100,
+    tol: float = 1e-2,
+    verbose: bool = False
+) -> MHMM:
+    """
+    Estimate the parameters of a Mixture HMM with the EM algorithm.
+    This is a thin functional wrapper: all work is delegated to
+    ``model.fit``, and the same model object is handed back so calls can be
+    written in the seqHMM style (``mhmm = fit_mhmm(mhmm)``). It mirrors
+    seqHMM's fit_model() for mhmm objects in R.
+    The EM algorithm alternates between:
+    1. E-step: compute responsibilities (posterior cluster probabilities)
+    2. M-step: update cluster probabilities and the HMM parameters of each cluster
+    Args:
+        model: MHMM model object created by build_mhmm()
+        n_iter: Maximum number of EM iterations (default 100)
+        tol: Convergence tolerance; EM stops once the log-likelihood gain
+             falls below this value (default 1e-2)
+        verbose: Print progress information while fitting (default False)
+    Returns:
+        MHMM: The model that was passed in, modified in place by the fit
+    Examples:
+        >>> from sequenzo import SequenceData, load_dataset
+        >>> from sequenzo.seqhmm import build_mhmm, fit_mhmm
+        >>>
+        >>> df = load_dataset('mvad')
+        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
+        >>>
+        >>> mhmm = build_mhmm(seq, n_clusters=3, n_states=4, random_state=42)
+        >>> mhmm = fit_mhmm(mhmm, n_iter=100, tol=1e-2, verbose=True)
+        >>>
+        >>> print(f"Log-likelihood: {mhmm.log_likelihood:.2f}")
+        >>> print(f"Iterations: {mhmm.n_iter}")
+        >>> print(f"Converged: {mhmm.converged}")
+        >>> print(f"Cluster probabilities: {mhmm.cluster_probs}")
+    """
+    # Collect the EM settings once and forward them by keyword
+    em_settings = dict(n_iter=n_iter, tol=tol, verbose=verbose)
+    model.fit(**em_settings)
+    return model

sequenzo/seqhmm/fit_model.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : fit_model.py
+@Time    : 2025-11-22 22:57
+@Desc    : Fit HMM models using EM algorithm
+This module provides the fit_model function, which estimates HMM parameters
+using the EM algorithm, similar to seqHMM's fit_model() function in R.
+"""
+from typing import Optional, Dict, Any
+from .hmm import HMM
+def fit_model(
+    model: HMM,
+    n_iter: int = 100,
+    tol: float = 1e-2,
+    verbose: bool = False
+) -> HMM:
+    """
+    Estimate the parameters of an HMM with the EM algorithm.
+    The initial, transition and emission probabilities are estimated by
+    calling ``model.fit``; this function only forwards its arguments and
+    returns the same object. It corresponds to seqHMM's fit_model() in R,
+    restricted to the EM step (no global or local optimization).
+    Args:
+        model: HMM model object created by build_hmm()
+        n_iter: Maximum number of EM iterations (default 100)
+        tol: Convergence tolerance; EM stops once the log-likelihood gain
+             falls below this value (default 1e-2)
+        verbose: Print progress information while fitting (default False)
+    Returns:
+        HMM: The model that was passed in, modified in place by the fit
+    Examples:
+        >>> from sequenzo import SequenceData, load_dataset
+        >>> from sequenzo.seqhmm import build_hmm, fit_model
+        >>>
+        >>> df = load_dataset('mvad')
+        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
+        >>>
+        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
+        >>> hmm = fit_model(hmm, n_iter=100, tol=1e-2, verbose=True)
+        >>>
+        >>> print(f"Log-likelihood: {hmm.log_likelihood:.2f}")
+        >>> print(f"Iterations: {hmm.n_iter}")
+        >>> print(f"Converged: {hmm.converged}")
+    """
+    # The fit mutates the model; the same reference is returned to the caller
+    fitted = model
+    fitted.fit(
+        n_iter=n_iter,
+        tol=tol,
+        verbose=verbose,
+    )
+    return fitted

sequenzo/seqhmm/fit_nhmm.py ADDED Viewed

@@ -0,0 +1,76 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : fit_nhmm.py
+@Time    : 2025-11-23 13:38
+@Desc    : Fit Non-homogeneous HMM models
+This module provides the fit_nhmm function, which estimates NHMM parameters
+using numerical optimization, similar to seqHMM's fit_nhmm() function in R.
+Note: This is a simplified implementation. A full implementation would use
+the forward-backward algorithm and proper gradient computation.
+"""
+from typing import Optional
+from .nhmm import NHMM
+def fit_nhmm(
+    model: NHMM,
+    n_iter: int = 100,
+    tol: float = 1e-4,
+    verbose: bool = False
+) -> NHMM:
+    """
+    Estimate the coefficients of a Non-homogeneous HMM.
+    The coefficients (eta_pi, eta_A, eta_B) describe how covariates drive the
+    initial, transition and emission probabilities. The estimation itself is
+    done by ``model.fit``; this wrapper forwards its arguments and returns the
+    same object, mirroring seqHMM's fit_nhmm() in R.
+    Note: the underlying implementation is simplified. A full implementation would:
+    1. Use the forward-backward algorithm to compute the exact log-likelihood
+    2. Compute analytical gradients
+    3. Use more sophisticated optimization methods
+    Args:
+        model: NHMM model object created by build_nhmm()
+        n_iter: Maximum number of optimization iterations (default 100)
+        tol: Convergence tolerance (default 1e-4)
+        verbose: Print progress information while fitting (default False)
+    Returns:
+        NHMM: The model that was passed in, modified in place by the fit
+    Examples:
+        >>> from sequenzo import SequenceData, load_dataset
+        >>> from sequenzo.seqhmm import build_nhmm, fit_nhmm
+        >>> import numpy as np
+        >>>
+        >>> df = load_dataset('mvad')
+        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
+        >>>
+        >>> # One time-index covariate per sequence position
+        >>> n_sequences = len(seq.sequences)
+        >>> n_timepoints = max(len(s) for s in seq.sequences)
+        >>> X = np.zeros((n_sequences, n_timepoints, 1))
+        >>> for i in range(n_sequences):
+        ...     for t in range(len(seq.sequences[i])):
+        ...         X[i, t, 0] = t
+        >>>
+        >>> nhmm = build_nhmm(seq, n_states=4, X=X, random_state=42)
+        >>> nhmm = fit_nhmm(nhmm, n_iter=100, tol=1e-4, verbose=True)
+        >>>
+        >>> print(f"Log-likelihood: {nhmm.log_likelihood:.2f}")
+        >>> print(f"Iterations: {nhmm.n_iter}")
+        >>> print(f"Converged: {nhmm.converged}")
+    """
+    # Forward the optimizer settings by keyword, then hand the model back
+    optimizer_settings = {"n_iter": n_iter, "tol": tol, "verbose": verbose}
+    model.fit(**optimizer_settings)
+    return model

sequenzo/seqhmm/formulas.py ADDED Viewed

@@ -0,0 +1,289 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : formulas.py
+@Time    : 2025-10-18 16:23
+@Desc    : Formula-based covariate specification for NHMM
+This module provides a formula interface for specifying covariates in NHMM,
+similar to seqHMM's formula interface in R. Users can specify covariates
+using a string formula like "~ x1 + x2" instead of manually creating
+covariate matrices.
+Note: This is a simplified implementation. A full implementation would
+support more complex formulas (interactions, transformations, etc.).
+"""
+import numpy as np
+import pandas as pd
+from typing import Optional, List, Union, Dict
+from sequenzo.define_sequence_data import SequenceData
+class Formula:
+    """
+    Formula object for specifying covariates.
+    This class represents the right-hand side of a formula like "~ x1 + x2"
+    and can be used to create model matrices from data. The formula is split
+    on "+" into plain column names; interactions, transformations and other
+    operators are not interpreted.
+    Examples:
+        >>> formula = Formula("~ age + gender")
+        >>> formula.terms
+        ['age', 'gender']
+        >>> X = formula.create_matrix(data, id_var='id', time_var='time',
+        ...                           n_sequences=2, n_timepoints=3)
+    """
+    def __init__(self, formula: str):
+        """
+        Initialize a formula object.
+        Args:
+            formula: Formula string, e.g., "~ x1 + x2" or "x1 + x2"
+                    (tilde is optional)
+        """
+        # Remove leading/trailing whitespace
+        formula = formula.strip()
+        # Remove tilde if present
+        if formula.startswith('~'):
+            formula = formula[1:].strip()
+        # Formula text without the leading tilde
+        self.formula = formula
+        # Covariate column names, in the order they were written
+        self.terms = self._parse_formula(formula)
+    def _parse_formula(self, formula: str) -> List[str]:
+        """
+        Parse formula string into terms.
+        Args:
+            formula: Formula string (leading tilde already removed)
+        Returns:
+            List of variable names; empty for an empty formula
+        """
+        # Split by + and drop pieces that are empty after trimming
+        stripped = (term.strip() for term in formula.split('+'))
+        return [term for term in stripped if term]
+    def create_matrix(
+        self,
+        data: pd.DataFrame,
+        id_var: str,
+        time_var: str,
+        n_sequences: int,
+        n_timepoints: int
+    ) -> np.ndarray:
+        """
+        Create covariate matrix from formula and data.
+        This function creates a covariate matrix X of shape
+        (n_sequences, n_timepoints, n_covariates) from a DataFrame
+        and formula specification. Column 0 is always an intercept of ones;
+        the formula terms follow in order.
+        If both id_var and time_var are columns of data, data is treated as
+        long format (one row per sequence-time combination) and each term is
+        pivoted to an id x time table. Only the first n_sequences rows and
+        first n_timepoints columns of that table are used; cells of X not
+        covered by the table stay 0, and combinations missing from data are
+        filled with whatever the pivot yields (NaN for numeric columns).
+        NOTE: rows of the pivot table are matched to sequences by position,
+        not by id, so the pivot's id order must match the order of the
+        sequences.
+        Otherwise the term column is assumed to already be in sequence-major
+        order and is reshaped directly.
+        Args:
+            data: DataFrame containing covariates
+            id_var: Column name for sequence IDs
+            time_var: Column name for time variable
+            n_sequences: Number of sequences
+            n_timepoints: Number of time points
+        Returns:
+            numpy array: Covariate matrix (n_sequences, n_timepoints, n_covariates)
+        Raises:
+            ValueError: If a term is not a column of data, if the reshape
+                fallback is used and the column length is not
+                n_sequences * n_timepoints, or (raised by pandas) if data
+                holds duplicate id/time combinations.
+        """
+        if not self.terms:
+            # No covariates: return matrix of ones (intercept only)
+            return np.ones((n_sequences, n_timepoints, 1))
+        # Initialize covariate matrix; +1 for the intercept in column 0
+        X = np.zeros((n_sequences, n_timepoints, len(self.terms) + 1))
+        X[:, :, 0] = 1.0
+        # Whether data can be pivoted does not depend on the term, so decide once
+        long_format = id_var in data.columns and time_var in data.columns
+        for col_idx, term in enumerate(self.terms, start=1):
+            if term not in data.columns:
+                raise ValueError(f"Variable '{term}' not found in data columns: {list(data.columns)}")
+            if long_format:
+                # Pivot to wide format (ids as rows, time points as columns)
+                pivot_df = data.pivot(index=id_var, columns=time_var, values=term)
+                # Copy the overlapping corner in one assignment instead of
+                # looking up every cell with .loc
+                block = pivot_df.to_numpy()[:n_sequences, :n_timepoints]
+                n_rows, n_cols = block.shape
+                X[:n_rows, :n_cols, col_idx] = block
+            else:
+                # Assume data is already in sequence-time order
+                # Reshape assuming row-major order (sequence by sequence)
+                covar_values = data[term].to_numpy()
+                if len(covar_values) != n_sequences * n_timepoints:
+                    raise ValueError(
+                        f"Data length ({len(covar_values)}) doesn't match "
+                        f"n_sequences * n_timepoints ({n_sequences * n_timepoints})"
+                    )
+                X[:, :, col_idx] = covar_values.reshape(n_sequences, n_timepoints)
+        return X
+def create_model_matrix(
+    formula: Union[str, Formula],
+    data: pd.DataFrame,
+    id_var: str,
+    time_var: str,
+    n_sequences: int,
+    n_timepoints: int
+) -> np.ndarray:
+    """
+    Build a time-varying covariate matrix from a formula and a DataFrame.
+    Convenience wrapper around Formula.create_matrix(): a formula given as a
+    string is first turned into a Formula object, anything else is used as
+    is. Comparable to seqHMM's model_matrix() function.
+    Args:
+        formula: Formula string (e.g., "~ x1 + x2") or Formula object
+        data: DataFrame containing covariates
+        id_var: Column name for sequence IDs
+        time_var: Column name for time variable
+        n_sequences: Number of sequences
+        n_timepoints: Number of time points
+    Returns:
+        numpy array: Covariate matrix (n_sequences, n_timepoints, n_covariates)
+    Examples:
+        >>> import pandas as pd
+        >>> from sequenzo.seqhmm.formulas import create_model_matrix
+        >>>
+        >>> # Long-format data: one row per sequence-time combination
+        >>> data = pd.DataFrame({
+        ...     'id': [1, 1, 1, 2, 2, 2],
+        ...     'time': [1, 2, 3, 1, 2, 3],
+        ...     'age': [20, 21, 22, 25, 26, 27],
+        ...     'gender': [0, 0, 0, 1, 1, 1]
+        ... })
+        >>>
+        >>> X = create_model_matrix("~ age + gender", data, 'id', 'time', n_sequences=2, n_timepoints=3)
+        >>> print(X.shape)  # (2, 3, 3) - 2 sequences, 3 timepoints, intercept + age + gender
+    """
+    # Accept either a ready Formula or its string form
+    parsed = Formula(formula) if isinstance(formula, str) else formula
+    matrix = parsed.create_matrix(data, id_var, time_var, n_sequences, n_timepoints)
+    return matrix
+def create_model_matrix_time_constant(
+    formula: Union[str, Formula, None],
+    data: Optional[pd.DataFrame],
+    n_sequences: int
+) -> np.ndarray:
+    """
+    Create model matrix for time-constant covariates (one value per sequence).
+    This function creates a model matrix for time-constant covariates used in
+    MHMM simulation. The covariates are constant across time points for each sequence,
+    so the output matrix has shape (n_sequences, n_covariates) where n_covariates
+    includes an intercept column.
+    This is similar to R's model.matrix() function but for time-constant covariates.
+    Columns with categorical, object or string dtype are expanded into dummy
+    (0/1) columns with the first level dropped as the reference; all other
+    columns are copied as they are.
+    Args:
+        formula: Formula string (e.g., "~ covariate_1 + covariate_2") or Formula object.
+                If None, returns a matrix with only intercept (column of ones).
+        data: DataFrame containing covariates. Must have n_sequences rows.
+               Each row corresponds to one sequence.
+        n_sequences: Number of sequences to simulate
+    Returns:
+        numpy array: Model matrix of shape (n_sequences, n_covariates)
+                    First column is always intercept (ones)
+                    Subsequent columns are the covariates specified in formula
+    Raises:
+        ValueError: If a formula is given without data, if data does not have
+            n_sequences rows, or if a formula term is not a column of data.
+    Examples:
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> from sequenzo.seqhmm.formulas import create_model_matrix_time_constant
+        >>>
+        >>> # Create covariate data (one row per sequence)
+        >>> data = pd.DataFrame({
+        ...     'covariate_1': np.random.rand(10),
+        ...     'covariate_2': np.random.choice(['A', 'B'], size=10)
+        ... })
+        >>>
+        >>> # Create model matrix with formula
+        >>> X = create_model_matrix_time_constant("~ covariate_1 + covariate_2", data, n_sequences=10)
+        >>> print(X.shape)  # (10, n_covariates) where n_covariates includes intercept and dummies
+    """
+    # If no formula is provided, return intercept-only matrix
+    if formula is None:
+        return np.ones((n_sequences, 1))
+    # Parse formula
+    if isinstance(formula, str):
+        formula = Formula(formula)
+    # Validate data
+    if data is None:
+        raise ValueError("If formula is provided, data must also be provided")
+    if len(data) != n_sequences:
+        raise ValueError(
+            f"Number of rows in data ({len(data)}) must equal n_sequences ({n_sequences})"
+        )
+    # The matrix is assembled column by column, starting with the intercept
+    columns_list = [np.ones(n_sequences)]
+    for term in formula.terms:
+        if term not in data.columns:
+            raise ValueError(
+                f"Variable '{term}' not found in data columns: {list(data.columns)}"
+            )
+        column = data[term]
+        # isinstance() replaces the deprecated is_categorical_dtype(); the
+        # string check also catches pandas' dedicated string dtype, which
+        # is_object_dtype() does not report as object.
+        is_categorical = (
+            isinstance(column.dtype, pd.CategoricalDtype)
+            or pd.api.types.is_object_dtype(column)
+            or pd.api.types.is_string_dtype(column)
+        )
+        if is_categorical:
+            # Dummy-code the levels, dropping the first one as the reference;
+            # float dtype keeps the stacked matrix numeric
+            dummies = pd.get_dummies(column, prefix=term, drop_first=True, dtype=float)
+            columns_list.extend(dummies.to_numpy().T)
+        else:
+            # Numeric variable: add as is
+            columns_list.append(column.to_numpy())
+    # Stack all columns into a matrix
+    return np.column_stack(columns_list)