PyPI - sequenzo - Versions diffs - 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl - Mend

sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

sequenzo/seqhmm/nhmm.py ADDED Viewed

@@ -0,0 +1,270 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : nhmm.py
+@Time    : 2025-11-23 13:39
+@Desc    : Non-homogeneous Hidden Markov Model (NHMM) for Sequenzo
+A Non-homogeneous HMM allows transition and emission probabilities to vary
+over time or with covariates. This is useful when the underlying process
+changes over time or depends on external factors.
+This is similar to seqHMM's nhmm class in R.
+"""
+import numpy as np
+import pandas as pd
+from typing import Optional, List, Dict, Union, Tuple
+from scipy.optimize import minimize
+from sequenzo.define_sequence_data import SequenceData
+from .utils import sequence_data_to_hmmlearn_format
+from .nhmm_utils import (
+    compute_transition_probs_with_covariates,
+    compute_emission_probs_with_covariates,
+    compute_initial_probs_with_covariates
+)
+class NHMM:
+    """
+    Non-homogeneous Hidden Markov Model for sequence analysis.
+    In a Non-homogeneous HMM, transition and emission probabilities can vary
+    over time or with covariates. The model is parameterised by coefficient
+    arrays (eta_pi, eta_A, eta_B); the nhmm_utils helpers combine them with the
+    covariate array X to obtain probabilities.
+    Attributes:
+        observations: SequenceData object containing the observed sequences
+        n_states: Number of hidden states
+        n_symbols: Number of observed symbols (len(alphabet))
+        alphabet: Observed state symbols, taken from observations.alphabet
+        state_names: Names for hidden states ("State 1", "State 2", ... by default)
+        X: Covariate matrix (n_sequences x n_timepoints x n_covariates)
+        n_covariates: Number of covariates (X.shape[2])
+        n_sequences: Number of sequences in observations
+        sequence_lengths: Array holding the length of every sequence
+        length_of_sequences: Length of the longest sequence
+        # Model parameters (coefficients)
+        eta_pi: Coefficients for initial probabilities (n_covariates x n_states)
+        eta_A: Coefficients for transition probabilities (n_covariates x n_states x n_states)
+        eta_B: Coefficients for emission probabilities (n_covariates x n_states x n_symbols)
+        # Fitting results (all None until fit() has been called)
+        log_likelihood: Log-likelihood of the fitted model
+        n_iter: Number of optimization iterations
+        converged: Whether optimization converged
+    """
+    def __init__(
+        self,
+        observations: SequenceData,
+        n_states: int,
+        X: np.ndarray,
+        eta_pi: Optional[np.ndarray] = None,
+        eta_A: Optional[np.ndarray] = None,
+        eta_B: Optional[np.ndarray] = None,
+        state_names: Optional[List[str]] = None,
+        random_state: Optional[int] = None
+    ):
+        """
+        Initialize a Non-homogeneous HMM model.
+        Args:
+            observations: SequenceData object containing the sequences
+            n_states: Number of hidden states
+            X: Covariate matrix of shape (n_sequences, n_timepoints, n_covariates)
+               where X[i, t, c] is the value of covariate c at time t for sequence i
+            eta_pi: Optional coefficients for initial probabilities (n_covariates x n_states)
+            eta_A: Optional coefficients for transition probabilities (n_covariates x n_states x n_states)
+            eta_B: Optional coefficients for emission probabilities (n_covariates x n_states x n_symbols)
+            state_names: Optional names for hidden states
+            random_state: Random seed for initialization
+        Raises:
+            ValueError: If X is not 3-dimensional, if its first dimension does
+                not equal the number of sequences, or if a supplied coefficient
+                array does not have the expected shape.
+        """
+        self.observations = observations
+        self.alphabet = observations.alphabet
+        self.n_symbols = len(self.alphabet)
+        self.n_states = n_states
+        self.n_sequences = len(observations.sequences)
+        # Validate and store covariates
+        if X.ndim != 3:
+            raise ValueError("X must be 3-dimensional: (n_sequences, n_timepoints, n_covariates)")
+        self.X = X
+        self.n_covariates = X.shape[2]
+        # Get sequence lengths
+        self.sequence_lengths = np.array([len(seq) for seq in observations.sequences])
+        self.length_of_sequences = int(self.sequence_lengths.max())
+        # Validate X dimensions match sequences
+        if X.shape[0] != self.n_sequences:
+            raise ValueError(
+                f"X first dimension ({X.shape[0]}) must equal n_sequences ({self.n_sequences})"
+            )
+        # Set names
+        self.state_names = state_names or [f"State {i+1}" for i in range(n_states)]
+        # Initialize coefficients if not provided. The draws happen in the fixed
+        # order eta_pi, eta_A, eta_B so a given random_state is reproducible.
+        rng = np.random.RandomState(random_state)
+        if eta_pi is None:
+            # Initialize with small random values
+            self.eta_pi = rng.randn(self.n_covariates, n_states) * 0.1
+        else:
+            if eta_pi.shape != (self.n_covariates, n_states):
+                raise ValueError(
+                    f"eta_pi shape ({eta_pi.shape}) must be ({self.n_covariates}, {n_states})"
+                )
+            self.eta_pi = eta_pi
+        if eta_A is None:
+            # Initialize with small random values
+            self.eta_A = rng.randn(self.n_covariates, n_states, n_states) * 0.1
+        else:
+            if eta_A.shape != (self.n_covariates, n_states, n_states):
+                raise ValueError(
+                    f"eta_A shape ({eta_A.shape}) must be ({self.n_covariates}, {n_states}, {n_states})"
+                )
+            self.eta_A = eta_A
+        if eta_B is None:
+            # Initialize with small random values
+            self.eta_B = rng.randn(self.n_covariates, n_states, self.n_symbols) * 0.1
+        else:
+            if eta_B.shape != (self.n_covariates, n_states, self.n_symbols):
+                raise ValueError(
+                    f"eta_B shape ({eta_B.shape}) must be ({self.n_covariates}, {n_states}, {self.n_symbols})"
+                )
+            self.eta_B = eta_B
+        # Fitting results
+        self.log_likelihood = None
+        self.n_iter = None
+        self.converged = None
+    def _compute_probs(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Compute probabilities from the current coefficients and covariates.
+        Returns:
+            tuple: (initial_probs, transition_probs, emission_probs)
+        """
+        # Compute initial probabilities
+        initial_probs = compute_initial_probs_with_covariates(
+            self.eta_pi, self.X, self.n_states
+        )
+        # Compute transition probabilities
+        transition_probs = compute_transition_probs_with_covariates(
+            self.eta_A, self.X, self.n_states
+        )
+        # Compute emission probabilities
+        emission_probs = compute_emission_probs_with_covariates(
+            self.eta_B, self.X, self.n_states, self.n_symbols
+        )
+        return initial_probs, transition_probs, emission_probs
+    def _unpack_params(self, params: np.ndarray) -> None:
+        """
+        Store a flat parameter vector as eta_pi, eta_A and eta_B.
+        The layout mirrors the concatenation built in fit(): eta_pi first,
+        then eta_A, then eta_B, each flattened in C order.
+        Args:
+            params: Flattened parameter vector
+        """
+        n_pi = self.n_covariates * self.n_states
+        n_A = self.n_covariates * self.n_states * self.n_states
+        # Copy each slice so the stored coefficients never alias an array
+        # owned by the caller (e.g. the optimizer's working vector).
+        self.eta_pi = params[:n_pi].reshape(self.n_covariates, self.n_states).copy()
+        self.eta_A = params[n_pi:n_pi+n_A].reshape(
+            self.n_covariates, self.n_states, self.n_states
+        ).copy()
+        self.eta_B = params[n_pi+n_A:].reshape(
+            self.n_covariates, self.n_states, self.n_symbols
+        ).copy()
+    def _log_likelihood(self, params: np.ndarray) -> float:
+        """
+        Compute negative log-likelihood (for minimization).
+        Side effect: the coefficients stored on the model are overwritten with
+        the values in params before the likelihood is evaluated through
+        forward_backward_nhmm.log_likelihood_nhmm.
+        Args:
+            params: Flattened parameter vector
+        Returns:
+            float: Negative log-likelihood
+        """
+        self._unpack_params(params)
+        # Compute log-likelihood using forward-backward algorithm
+        from .forward_backward_nhmm import log_likelihood_nhmm
+        log_lik = log_likelihood_nhmm(self)
+        return -log_lik  # Return negative for minimization
+    def fit(
+        self,
+        n_iter: int = 100,
+        tol: float = 1e-4,
+        verbose: bool = False
+    ) -> 'NHMM':
+        """
+        Fit the NHMM model using numerical optimization (L-BFGS-B).
+        The negative log-likelihood is minimized over the flattened
+        coefficients. Analytical gradients from gradients_nhmm are used when
+        that module can be imported; otherwise scipy approximates the gradient
+        numerically. After optimization the coefficients are set to the
+        optimizer's solution and the log-likelihood is evaluated at that point.
+        Args:
+            n_iter: Maximum number of optimization iterations
+            tol: Convergence tolerance (passed to scipy as 'ftol')
+            verbose: Whether to print progress
+        Returns:
+            self: Returns self for method chaining
+        """
+        # Flatten parameters
+        params = np.concatenate([
+            self.eta_pi.flatten(),
+            self.eta_A.flatten(),
+            self.eta_B.flatten()
+        ])
+        options = {'maxiter': n_iter, 'ftol': tol, 'disp': verbose}
+        # Only the optional import is guarded: an ImportError raised while the
+        # optimizer is running must not be mistaken for "no gradients available".
+        try:
+            from .gradients_nhmm import compute_gradient_nhmm
+        except ImportError:
+            compute_gradient_nhmm = None
+        if compute_gradient_nhmm is not None:
+            def objective_with_grad(params):
+                """Objective function with gradient."""
+                neg_log_lik = self._log_likelihood(params)
+                grad = -compute_gradient_nhmm(self)  # Negative because we minimize
+                return neg_log_lik, grad
+            # Use L-BFGS-B with analytical gradients
+            result = minimize(
+                objective_with_grad,
+                params,
+                method='L-BFGS-B',
+                jac=True,  # Indicate that gradient is provided
+                options=options
+            )
+        else:
+            # Fall back to numerical gradients if analytical not available
+            result = minimize(
+                self._log_likelihood,
+                params,
+                method='L-BFGS-B',
+                options=options
+            )
+        # The objective mutates the model on every evaluation, and the last
+        # point evaluated (line-search trial, finite-difference probe) is not
+        # necessarily the solution. Load the solution explicitly.
+        self._unpack_params(np.asarray(result.x))
+        # Store results
+        self.n_iter = result.nit
+        self.converged = result.success
+        # Evaluate the log-likelihood at the fitted coefficients
+        from .forward_backward_nhmm import log_likelihood_nhmm
+        self.log_likelihood = log_likelihood_nhmm(self)
+        if verbose:
+            print(f"Optimization {'converged' if result.success else 'did not converge'}")
+            print(f"Log-likelihood: {self.log_likelihood:.4f}")
+            print(f"Iterations: {self.n_iter}")
+        return self
+    def __repr__(self) -> str:
+        """String representation of the NHMM."""
+        status = "fitted" if self.log_likelihood is not None else "unfitted"
+        return (f"NHMM(n_states={self.n_states}, n_symbols={self.n_symbols}, "
+                f"n_covariates={self.n_covariates}, n_sequences={self.n_sequences}, "
+                f"status='{status}')")

sequenzo/seqhmm/nhmm_utils.py ADDED Viewed

@@ -0,0 +1,191 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : nhmm_utils.py
+@Time    : 2025-11-23 10:20
+@Desc    : Utility functions for Non-homogeneous HMM
+This module provides utility functions for NHMM, including Softmax parameterization
+and gradient computation.
+"""
+import numpy as np
+from typing import Optional, Tuple
+def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
+    """
+    Numerically stable softmax along one axis.
+    Maps real-valued scores to a probability distribution:
+    softmax(x_i) = exp(x_i) / sum_j exp(x_j)
+    The maximum along the axis is subtracted before exponentiating. This
+    leaves the result unchanged mathematically but keeps exp() from overflowing:
+    softmax(x_i) = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
+    Args:
+        x: Input array
+        axis: Axis along which to compute softmax
+    Returns:
+        numpy array: Softmax probabilities (sums to 1 along specified axis)
+    """
+    # Largest score along the axis, kept broadcastable against x
+    peak = np.max(x, axis=axis, keepdims=True)
+    weights = np.exp(x - peak)
+    # Normalising constant along the same axis
+    total = np.sum(weights, axis=axis, keepdims=True)
+    return weights / total
+def eta_to_gamma(eta: np.ndarray, n_categories: int) -> np.ndarray:
+    """
+    Turn linear predictors (eta) into probabilities (gamma) via softmax.
+    The input is viewed as rows of n_categories values, softmax is applied to
+    every row, and the result is returned in the input's original shape, so
+    each group of n_categories consecutive values sums to 1.
+    Args:
+        eta: Linear predictor array of shape (..., n_categories)
+        n_categories: Number of categories (e.g., number of states)
+    Returns:
+        numpy array: Probabilities of shape (..., n_categories), sums to 1 along last axis
+    """
+    # One row per probability vector
+    rows = np.reshape(eta, (-1, n_categories))
+    row_probs = softmax(rows, axis=1)
+    # Restore the caller's layout
+    return np.reshape(row_probs, eta.shape)
+def compute_transition_probs_with_covariates(
+    eta_A: np.ndarray,
+    X: np.ndarray,
+    n_states: int
+) -> np.ndarray:
+    """
+    Compute transition probabilities from covariates using Softmax.
+    For every sequence s and time point t the linear predictor is
+    eta[s, t, i, j] = sum over covariates c of X[s, t, c] * eta_A[c, i, j]
+    and each row eta[s, t, i, :] is passed through softmax, so the
+    probabilities of leaving state i sum to 1.
+    Args:
+        eta_A: Coefficient matrix of shape (n_covariates, n_states, n_states)
+               where eta_A[c, i, j] is the coefficient for covariate c,
+               transition from state i to state j
+        X: Covariate matrix of shape (n_sequences, n_timepoints, n_covariates)
+        n_states: Number of hidden states
+    Returns:
+        numpy array: Transition probabilities of shape (n_sequences, n_timepoints, n_states, n_states)
+    Raises:
+        ValueError: If X is not 3-dimensional or eta_A does not have shape
+            (n_covariates, n_states, n_states).
+    """
+    _, _, n_covariates = X.shape
+    expected_shape = (n_covariates, n_states, n_states)
+    # Fail loudly on a mismatch instead of silently broadcasting or truncating
+    if eta_A.shape != expected_shape:
+        raise ValueError(f"eta_A shape {eta_A.shape} must be {expected_shape}")
+    # Contract the covariate axis for all sequences and time points at once
+    eta = np.tensordot(X, eta_A, axes=([2], [0]))
+    # Convert to probabilities using softmax (row-wise: over destination states)
+    return softmax(eta, axis=-1)
+def compute_emission_probs_with_covariates(
+    eta_B: np.ndarray,
+    X: np.ndarray,
+    n_states: int,
+    n_symbols: int
+) -> np.ndarray:
+    """
+    Compute emission probabilities from covariates using Softmax.
+    For every sequence s and time point t the linear predictor is
+    eta[s, t, i, k] = sum over covariates c of X[s, t, c] * eta_B[c, i, k]
+    and each row eta[s, t, i, :] is passed through softmax, so the
+    probabilities of the symbols emitted by state i sum to 1.
+    Args:
+        eta_B: Coefficient matrix of shape (n_covariates, n_states, n_symbols)
+        X: Covariate matrix of shape (n_sequences, n_timepoints, n_covariates)
+        n_states: Number of hidden states
+        n_symbols: Number of observed symbols
+    Returns:
+        numpy array: Emission probabilities of shape (n_sequences, n_timepoints, n_states, n_symbols)
+    Raises:
+        ValueError: If X is not 3-dimensional or eta_B does not have shape
+            (n_covariates, n_states, n_symbols).
+    """
+    _, _, n_covariates = X.shape
+    expected_shape = (n_covariates, n_states, n_symbols)
+    # Fail loudly on a mismatch instead of silently broadcasting or truncating
+    if eta_B.shape != expected_shape:
+        raise ValueError(f"eta_B shape {eta_B.shape} must be {expected_shape}")
+    # Contract the covariate axis for all sequences and time points at once
+    eta = np.tensordot(X, eta_B, axes=([2], [0]))
+    # Convert to probabilities using softmax (row-wise: over symbols)
+    return softmax(eta, axis=-1)
+def compute_initial_probs_with_covariates(
+    eta_pi: np.ndarray,
+    X: np.ndarray,
+    n_states: int
+) -> np.ndarray:
+    """
+    Compute initial state probabilities from covariates using Softmax.
+    Only the covariates at the first time point are used: for sequence s
+    eta[s, i] = sum over covariates c of X[s, 0, c] * eta_pi[c, i]
+    and each row eta[s, :] is passed through softmax.
+    Args:
+        eta_pi: Coefficient matrix of shape (n_covariates, n_states)
+        X: Covariate matrix of shape (n_sequences, n_timepoints, n_covariates);
+           time points after the first are ignored
+        n_states: Number of hidden states
+    Returns:
+        numpy array: Initial probabilities of shape (n_sequences, n_states)
+    Raises:
+        ValueError: If eta_pi does not have shape (n_covariates, n_states).
+    """
+    n_sequences = X.shape[0]
+    if n_sequences == 0:
+        # Nothing to compute; keep the documented output shape
+        return np.zeros((0, n_states))
+    # Get covariates for initial time point, shape (n_sequences, n_covariates)
+    x_0 = X[:, 0, :]
+    expected_shape = (x_0.shape[1], n_states)
+    # Fail loudly on a mismatch instead of silently broadcasting or truncating
+    if eta_pi.shape != expected_shape:
+        raise ValueError(f"eta_pi shape {eta_pi.shape} must be {expected_shape}")
+    # Linear predictor for all sequences at once, then softmax per sequence
+    return softmax(x_0 @ eta_pi, axis=-1)

sequenzo/seqhmm/predict.py ADDED Viewed

@@ -0,0 +1,137 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : predict.py
+@Time    : 2025-11-13 17:05
+@Desc    : Prediction and inference functions for HMM models
+This module provides functions for predicting hidden states and computing
+posterior probabilities, similar to seqHMM's predict() and posterior_probs()
+functions in R.
+"""
+import numpy as np
+import pandas as pd
+from typing import Optional, List
+from sequenzo.define_sequence_data import SequenceData
+from .hmm import HMM
+from .utils import sequence_data_to_hmmlearn_format
+def predict(
+    model: HMM,
+    newdata: Optional[SequenceData] = None
+) -> np.ndarray:
+    """
+    Return the hidden-state sequence predicted by a fitted model.
+    This is a thin wrapper: it checks that the model has been fitted
+    (model.log_likelihood is set) and then delegates to model.predict(newdata).
+    It is the counterpart of seqHMM's predict() function in R.
+    Args:
+        model: Fitted HMM model object
+        newdata: Optional SequenceData to predict. Passed through unchanged to
+                 model.predict(); None is intended to mean "use the data the
+                 model was fitted on".
+    Returns:
+        numpy array: Whatever model.predict() returns; intended to be the
+                    predicted hidden states for every time point, with all
+                    sequences concatenated.
+    Raises:
+        ValueError: If the model has not been fitted yet.
+    Examples:
+        >>> from sequenzo import SequenceData, load_dataset
+        >>> from sequenzo.seqhmm import build_hmm, fit_model, predict
+        >>>
+        >>> # Load and prepare data
+        >>> df = load_dataset('mvad')
+        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
+        >>>
+        >>> # Build and fit model
+        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
+        >>> hmm = fit_model(hmm)
+        >>>
+        >>> # Predict hidden states
+        >>> predicted_states = predict(hmm)
+        >>> print(f"Predicted {len(predicted_states)} hidden states")
+    """
+    is_fitted = model.log_likelihood is not None
+    if not is_fitted:
+        raise ValueError("Model must be fitted before prediction. Use fit_model() first.")
+    hidden_states = model.predict(newdata)
+    return hidden_states
+def posterior_probs(
+    model: HMM,
+    newdata: Optional[SequenceData] = None
+) -> pd.DataFrame:
+    """
+    Compute posterior probabilities of hidden states in long format.
+    The probabilities themselves come from model.predict_proba(); this function
+    reshapes them into one row per (sequence, time point, hidden state).
+    It is the counterpart of seqHMM's posterior_probs() function in R.
+    Args:
+        model: Fitted HMM model object
+        newdata: Optional SequenceData. If None, uses model.observations.
+    Returns:
+        pandas DataFrame: Posterior probabilities with columns:
+            - id: Sequence identifier (0-based position of the sequence)
+            - time: Time point within the sequence (1-based)
+            - state: Hidden state index (0-based)
+            - probability: Posterior probability of being in this state at this time
+        Rows are ordered by id, then time, then state.
+    Raises:
+        ValueError: If the model has not been fitted yet.
+    Examples:
+        >>> from sequenzo import SequenceData, load_dataset
+        >>> from sequenzo.seqhmm import build_hmm, fit_model, posterior_probs
+        >>>
+        >>> # Load and prepare data
+        >>> df = load_dataset('mvad')
+        >>> seq = SequenceData(df, time=range(15, 86), states=['EM', 'FE', 'HE', 'JL', 'SC', 'TR'])
+        >>>
+        >>> # Build and fit model
+        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
+        >>> hmm = fit_model(hmm)
+        >>>
+        >>> # Get posterior probabilities
+        >>> posteriors = posterior_probs(hmm)
+        >>> print(posteriors.head())
+        >>>
+        >>> # Find most probable state at each time point
+        >>> # (idxmax() returns row labels, so look the rows up with .loc)
+        >>> best_rows = posteriors.groupby(['id', 'time'])['probability'].idxmax()
+        >>> most_probable = posteriors.loc[best_rows, ['id', 'time', 'state']]
+    """
+    if model.log_likelihood is None:
+        raise ValueError("Model must be fitted before computing posterior probabilities. Use fit_model() first.")
+    # Get sequences to use
+    sequences = newdata if newdata is not None else model.observations
+    # Get posterior probabilities from model (one row per concatenated time point)
+    proba = np.asarray(model.predict_proba(sequences))
+    # Only the per-sequence lengths are needed to label the rows
+    _, lengths = sequence_data_to_hmmlearn_format(sequences)
+    lengths = np.asarray(lengths, dtype=int)
+    n_states = model.n_states
+    n_timepoints = int(lengths.sum())
+    # Sequence id of every concatenated time point
+    seq_ids = np.repeat(np.arange(len(lengths)), lengths)
+    # Offset of each sequence's first time point in the concatenated array
+    starts = np.cumsum(lengths) - lengths
+    # 1-indexed time within the sequence, for consistency with R
+    times = np.arange(n_timepoints) - np.repeat(starts, lengths) + 1
+    # Build whole columns instead of appending one dict per row
+    return pd.DataFrame({
+        'id': np.repeat(seq_ids, n_states),
+        'time': np.repeat(times, n_states),
+        'state': np.tile(np.arange(n_states), n_timepoints),
+        'probability': proba[:n_timepoints, :n_states].reshape(-1),
+    })