PyPI - sequenzo - Versions diffs - 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl - Mend

sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

sequenzo/seqhmm/hmm.py ADDED Viewed

@@ -0,0 +1,291 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : hmm.py
+@Time    : 2025-11-13 16:20
+@Desc    : Base HMM class for Sequenzo
+This module provides the HMM class that wraps hmmlearn's CategoricalHMM
+and adapts it for use with Sequenzo's SequenceData format.
+"""
+import numpy as np
+import pandas as pd
+from typing import Optional, List, Dict, Union
+from hmmlearn.hmm import CategoricalHMM
+from sequenzo.define_sequence_data import SequenceData
+from .utils import (
+    sequence_data_to_hmmlearn_format,
+    int_to_state_mapping,
+    state_to_int_mapping
+)
+from .multichannel_utils import prepare_multichannel_data
+class HMM:
+    """
+    Hidden Markov Model for sequence analysis.
+    This class wraps hmmlearn's CategoricalHMM and provides a Sequenzo-friendly
+    interface that works with SequenceData objects.
+    Attributes:
+        observations: SequenceData object containing the observed sequences
+        n_states: Number of hidden states
+        n_symbols: Number of observed symbols (alphabet size)
+        alphabet: List of observed state symbols
+        state_names: Optional names for hidden states
+        channel_names: Optional names for channels (for multichannel data)
+        length_of_sequences: Maximum sequence length
+        sequence_lengths: Array of individual sequence lengths
+        n_sequences: Number of sequences
+        n_channels: Number of channels (currently 1 for single-channel)
+        # Model parameters (after fitting)
+        initial_probs: Initial state probabilities
+        transition_probs: Transition probability matrix
+        emission_probs: Emission probability matrix
+        # hmmlearn model
+        _hmm_model: Internal hmmlearn CategoricalHMM model
+    """
+    def __init__(
+        self,
+        observations: Union[SequenceData, List[SequenceData]],
+        n_states: int,
+        initial_probs: Optional[np.ndarray] = None,
+        transition_probs: Optional[np.ndarray] = None,
+        emission_probs: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
+        state_names: Optional[List[str]] = None,
+        channel_names: Optional[List[str]] = None,
+        random_state: Optional[int] = None
+    ):
+        """
+        Initialize an HMM model.
+        Args:
+            observations: SequenceData object or list of SequenceData objects (for multichannel)
+            n_states: Number of hidden states
+            initial_probs: Optional initial state probabilities (n_states,)
+            transition_probs: Optional transition matrix (n_states x n_states)
+            emission_probs: Optional emission matrix (n_states x n_symbols) or
+                          list of matrices (one per channel for multichannel)
+            state_names: Optional names for hidden states
+            channel_names: Optional names for channels
+            random_state: Random seed for initialization
+        """
+        # Handle multichannel data
+        channels, channel_names_list, alphabets = prepare_multichannel_data(observations)
+        self.channels = channels
+        self.n_channels = len(channels)
+        # For single channel, store as observations for backward compatibility
+        if self.n_channels == 1:
+            self.observations = channels[0]
+            self.alphabet = alphabets[0]
+        else:
+            # For multichannel, store first channel as primary (for compatibility)
+            self.observations = channels[0]
+            self.alphabet = alphabets[0]
+        self.alphabets = alphabets
+        self.n_symbols = [len(alph) for alph in alphabets]
+        # For single channel, use single n_symbols
+        if self.n_channels == 1:
+            self.n_symbols = self.n_symbols[0]
+        self.n_states = n_states
+        # Store metadata
+        self.state_names = state_names or [f"State {i+1}" for i in range(n_states)]
+        self.channel_names = channel_names or channel_names_list
+        # Get sequence information (use first channel for sequence info)
+        self.sequence_lengths = np.array([len(seq) for seq in channels[0].sequences])
+        self.length_of_sequences = int(self.sequence_lengths.max())
+        self.n_sequences = len(channels[0].sequences)
+        # Create mappings
+        self._int_to_state = int_to_state_mapping(self.alphabet)
+        self._state_to_int = state_to_int_mapping(self.alphabet)
+        # Initialize hmmlearn model (only for single channel)
+        # For multichannel, we'll need custom implementation
+        if self.n_channels == 1:
+            self._hmm_model = CategoricalHMM(
+                n_components=n_states,
+                n_features=self.n_symbols,
+                random_state=random_state,
+                n_iter=100,  # Default max iterations
+                tol=1e-2,    # Default tolerance
+                verbose=False
+            )
+            # Set initial parameters if provided
+            # When custom parameters are provided, we need to remove the corresponding
+            # letters from init_params to prevent hmmlearn from re-initializing them
+            # 's' = startprob, 't' = transmat, 'e' = emissionprob
+            if initial_probs is not None:
+                self._hmm_model.startprob_ = initial_probs
+                # Remove 's' from init_params so startprob won't be re-initialized during fit
+                self._hmm_model.init_params = self._hmm_model.init_params.replace('s', '')
+            if transition_probs is not None:
+                self._hmm_model.transmat_ = transition_probs
+                # Remove 't' from init_params so transmat won't be re-initialized during fit
+                self._hmm_model.init_params = self._hmm_model.init_params.replace('t', '')
+            if emission_probs is not None:
+                self._hmm_model.emissionprob_ = emission_probs
+                # Remove 'e' from init_params so emissionprob won't be re-initialized during fit
+                self._hmm_model.init_params = self._hmm_model.init_params.replace('e', '')
+        else:
+            # Multichannel: hmmlearn doesn't support this directly
+            # We'll implement custom fitting
+            self._hmm_model = None
+            if emission_probs is not None and isinstance(emission_probs, list):
+                if len(emission_probs) != self.n_channels:
+                    raise ValueError(
+                        f"emission_probs list length ({len(emission_probs)}) must equal n_channels ({self.n_channels})"
+                    )
+        # Store parameters (will be updated after fitting)
+        self.initial_probs = initial_probs
+        self.transition_probs = transition_probs
+        self.emission_probs = emission_probs
+        # Fitting results
+        self.log_likelihood = None
+        self.n_iter = None
+        self.converged = None
+    def fit(
+        self,
+        n_iter: int = 100,
+        tol: float = 1e-2,
+        verbose: bool = False
+    ) -> 'HMM':
+        """
+        Fit the HMM model to the observations using EM algorithm.
+        For single-channel data, uses hmmlearn's EM algorithm.
+        For multichannel data, uses custom multichannel EM algorithm.
+        Args:
+            n_iter: Maximum number of EM iterations
+            tol: Convergence tolerance
+            verbose: Whether to print progress
+        Returns:
+            self: Returns self for method chaining
+        """
+        if self.n_channels == 1:
+            # Single channel: use hmmlearn
+            X, lengths = sequence_data_to_hmmlearn_format(self.observations)
+            # Ensure init_params is correctly set before fitting
+            # Remove letters from init_params if we have custom parameters
+            if self.initial_probs is not None:
+                self._hmm_model.startprob_ = self.initial_probs.copy()
+                # Remove 's' from init_params to prevent re-initialization
+                if 's' in self._hmm_model.init_params:
+                    self._hmm_model.init_params = self._hmm_model.init_params.replace('s', '')
+            if self.transition_probs is not None:
+                self._hmm_model.transmat_ = self.transition_probs.copy()
+                # Remove 't' from init_params to prevent re-initialization
+                if 't' in self._hmm_model.init_params:
+                    self._hmm_model.init_params = self._hmm_model.init_params.replace('t', '')
+            if self.emission_probs is not None:
+                self._hmm_model.emissionprob_ = self.emission_probs.copy()
+                # Remove 'e' from init_params to prevent re-initialization
+                if 'e' in self._hmm_model.init_params:
+                    self._hmm_model.init_params = self._hmm_model.init_params.replace('e', '')
+            # Update hmmlearn model parameters
+            self._hmm_model.n_iter = n_iter
+            self._hmm_model.tol = tol
+            self._hmm_model.verbose = verbose
+            # Fit the model, suppressing warnings about init_params
+            import warnings
+            with warnings.catch_warnings():
+                warnings.filterwarnings('ignore', message='.*init_params.*')
+                warnings.filterwarnings('ignore', message='.*overwritten during initialization.*')
+                self._hmm_model.fit(X, lengths)
+            # Extract fitted parameters
+            self.initial_probs = self._hmm_model.startprob_.copy()
+            self.transition_probs = self._hmm_model.transmat_.copy()
+            self.emission_probs = self._hmm_model.emissionprob_.copy()
+            # Store fitting results
+            self.log_likelihood = self._hmm_model.score(X, lengths)
+            self.n_iter = self._hmm_model.monitor_.iter
+            self.converged = self._hmm_model.monitor_.converged
+        else:
+            # Multichannel: use custom EM algorithm
+            from .multichannel_em import fit_multichannel_hmm
+            fit_multichannel_hmm(self, n_iter=n_iter, tol=tol, verbose=verbose)
+        return self
+    def predict(self, sequences: Optional[SequenceData] = None) -> np.ndarray:
+        """
+        Predict the most likely hidden state sequence using Viterbi algorithm.
+        Args:
+            sequences: Optional SequenceData to predict (uses self.observations if None)
+        Returns:
+            numpy array: Predicted hidden states for each sequence
+        """
+        if sequences is None:
+            sequences = self.observations
+        X, lengths = sequence_data_to_hmmlearn_format(sequences)
+        states = self._hmm_model.predict(X, lengths)
+        return states
+    def predict_proba(self, sequences: Optional[SequenceData] = None) -> np.ndarray:
+        """
+        Compute posterior probabilities of hidden states.
+        Args:
+            sequences: Optional SequenceData (uses self.observations if None)
+        Returns:
+            numpy array: Posterior probabilities for each time point
+        """
+        if sequences is None:
+            sequences = self.observations
+        X, lengths = sequence_data_to_hmmlearn_format(sequences)
+        posteriors = self._hmm_model.predict_proba(X, lengths)
+        return posteriors
+    def score(self, sequences: Optional[SequenceData] = None) -> float:
+        """
+        Compute the log-likelihood of sequences under the model.
+        Args:
+            sequences: Optional SequenceData (uses self.observations if None)
+        Returns:
+            float: Log-likelihood
+        """
+        if sequences is None:
+            sequences = self.observations
+        X, lengths = sequence_data_to_hmmlearn_format(sequences)
+        return self._hmm_model.score(X, lengths)
+    def __repr__(self) -> str:
+        """String representation of the HMM."""
+        status = "fitted" if self.log_likelihood is not None else "unfitted"
+        return (f"HMM(n_states={self.n_states}, n_symbols={self.n_symbols}, "
+                f"n_sequences={self.n_sequences}, status='{status}')")

sequenzo/seqhmm/mhmm.py ADDED Viewed

@@ -0,0 +1,314 @@
+"""
+@Author  : Yuqi Liang 梁彧祺
+@File    : mhmm.py
+@Time    : 2025-11-22 08:47
+@Desc    : Mixture Hidden Markov Model (MHMM) for Sequenzo
+A Mixture HMM consists of multiple HMM submodels, where each submodel represents
+a cluster or type. The model assigns each sequence to one of these clusters with
+certain probabilities.
+This is similar to seqHMM's mhmm class in R.
+"""
+import numpy as np
+import pandas as pd
+from typing import Optional, List, Dict, Union
+from sequenzo.define_sequence_data import SequenceData
+from .hmm import HMM
+from .utils import (
+    sequence_data_to_hmmlearn_format,
+    create_initial_probs,
+    create_transition_probs,
+    create_emission_probs
+)
+class MHMM:
+    """
+    Mixture Hidden Markov Model for sequence analysis.
+    A Mixture HMM consists of multiple HMM submodels (clusters). Each sequence
+    belongs to one of these clusters with certain probabilities. The model
+    estimates both the cluster membership probabilities and the parameters
+    of each HMM submodel.
+    Attributes:
+        observations: SequenceData object containing the observed sequences
+        n_clusters: Number of clusters (submodels)
+        clusters: List of HMM objects, one for each cluster
+        cluster_probs: Mixture probabilities (probability of each cluster)
+        coefficients: Optional regression coefficients for covariates
+        X: Optional covariate matrix
+        cluster_names: Optional names for clusters
+        state_names: Optional names for hidden states (per cluster)
+        channel_names: Optional names for channels
+        # Model parameters (after fitting)
+        log_likelihood: Log-likelihood of the fitted model
+        n_iter: Number of EM iterations performed
+        converged: Whether the EM algorithm converged
+    """
+    def __init__(
+        self,
+        observations: SequenceData,
+        n_clusters: int,
+        n_states: Union[int, List[int]],
+        clusters: Optional[List[HMM]] = None,
+        cluster_probs: Optional[np.ndarray] = None,
+        coefficients: Optional[np.ndarray] = None,
+        X: Optional[np.ndarray] = None,
+        cluster_names: Optional[List[str]] = None,
+        state_names: Optional[List[List[str]]] = None,
+        channel_names: Optional[List[str]] = None,
+        random_state: Optional[int] = None
+    ):
+        """
+        Initialize a Mixture HMM model.
+        Args:
+            observations: SequenceData object containing the sequences
+            n_clusters: Number of clusters (submodels)
+            n_states: Number of hidden states per cluster. Can be:
+                     - int: Same number of states for all clusters
+                     - List[int]: Different number of states for each cluster
+            clusters: Optional list of pre-built HMM objects for each cluster
+            cluster_probs: Optional initial cluster probabilities (n_clusters,)
+            coefficients: Optional regression coefficients for covariates
+            X: Optional covariate matrix (n_sequences x n_covariates)
+            cluster_names: Optional names for clusters
+            state_names: Optional names for hidden states (list of lists)
+            channel_names: Optional names for channels
+            random_state: Random seed for initialization
+        """
+        self.observations = observations
+        self.n_clusters = n_clusters
+        self.alphabet = observations.alphabet
+        self.n_symbols = len(self.alphabet)
+        self.n_sequences = len(observations.sequences)
+        # Handle n_states: convert to list if int
+        if isinstance(n_states, int):
+            n_states = [n_states] * n_clusters
+        self.n_states = n_states
+        # Validate n_states length
+        if len(n_states) != n_clusters:
+            raise ValueError(
+                f"n_states length ({len(n_states)}) must equal n_clusters ({n_clusters})"
+            )
+        # Set names
+        self.cluster_names = cluster_names or [f"Cluster {i+1}" for i in range(n_clusters)]
+        self.channel_names = channel_names or ["Channel 1"]
+        self.n_channels = len(self.channel_names)
+        # Initialize clusters (HMM submodels)
+        if clusters is None:
+            self.clusters = []
+            for k in range(n_clusters):
+                # Get state names for this cluster
+                cluster_state_names = None
+                if state_names is not None:
+                    cluster_state_names = state_names[k] if k < len(state_names) else None
+                # Create HMM for this cluster
+                hmm = HMM(
+                    observations=observations,
+                    n_states=n_states[k],
+                    state_names=cluster_state_names,
+                    channel_names=channel_names,
+                    random_state=random_state
+                )
+                self.clusters.append(hmm)
+        else:
+            if len(clusters) != n_clusters:
+                raise ValueError(
+                    f"Number of clusters ({len(clusters)}) must equal n_clusters ({n_clusters})"
+                )
+            self.clusters = clusters
+        # Initialize cluster probabilities
+        if cluster_probs is None:
+            self.cluster_probs = np.ones(n_clusters) / n_clusters  # Uniform
+        else:
+            if len(cluster_probs) != n_clusters:
+                raise ValueError(
+                    f"cluster_probs length ({len(cluster_probs)}) must equal n_clusters ({n_clusters})"
+                )
+            if not np.isclose(np.sum(cluster_probs), 1.0):
+                raise ValueError("cluster_probs must sum to 1.0")
+            self.cluster_probs = np.array(cluster_probs)
+        # Covariates (for future extension)
+        self.coefficients = coefficients
+        self.X = X
+        self.n_covariates = X.shape[1] if X is not None else 0
+        # Fitting results
+        self.log_likelihood = None
+        self.n_iter = None
+        self.converged = None
+        # Store responsibilities (posterior cluster probabilities) after fitting
+        self.responsibilities = None
+    def fit(
+        self,
+        n_iter: int = 100,
+        tol: float = 1e-2,
+        verbose: bool = False
+    ) -> 'MHMM':
+        """
+        Fit the Mixture HMM model using EM algorithm.
+        The EM algorithm alternates between:
+        1. E-step: Compute responsibilities (posterior cluster probabilities)
+        2. M-step: Update cluster probabilities and HMM parameters
+        Args:
+            n_iter: Maximum number of EM iterations
+            tol: Convergence tolerance
+            verbose: Whether to print progress
+        Returns:
+            self: Returns self for method chaining
+        """
+        # Convert SequenceData to hmmlearn format
+        X, lengths = sequence_data_to_hmmlearn_format(self.observations)
+        n_sequences = len(lengths)
+        # Initialize log-likelihood
+        prev_log_likelihood = -np.inf
+        # EM algorithm
+        for iteration in range(n_iter):
+            # E-step: Compute responsibilities
+            # Responsibility = P(cluster | sequence) = P(sequence | cluster) * P(cluster) / P(sequence)
+            # Compute log-likelihood for each sequence under each cluster
+            log_likelihoods = np.zeros((n_sequences, self.n_clusters))
+            for k in range(self.n_clusters):
+                # Fit this cluster's HMM if not already fitted
+                # Suppress warnings about init_params during fitting
+                import warnings
+                with warnings.catch_warnings():
+                    warnings.filterwarnings('ignore', message='.*init_params.*')
+                    if self.clusters[k].log_likelihood is None:
+                        self.clusters[k].fit(n_iter=10, tol=tol, verbose=False)
+                # Compute log-likelihood for each sequence
+                for seq_idx in range(n_sequences):
+                    # Get sequence indices
+                    start_idx = lengths[:seq_idx].sum()
+                    end_idx = start_idx + lengths[seq_idx]
+                    seq_X = X[start_idx:end_idx]
+                    seq_lengths = np.array([lengths[seq_idx]])
+                    # Compute log-likelihood
+                    log_likelihoods[seq_idx, k] = self.clusters[k]._hmm_model.score(seq_X, seq_lengths)
+            # Add log of cluster probabilities
+            log_probs = np.log(self.cluster_probs + 1e-10)  # Add small epsilon to avoid log(0)
+            log_likelihoods += log_probs[np.newaxis, :]
+            # Compute responsibilities using log-sum-exp trick for numerical stability
+            # responsibility = exp(log_likelihood - log_sum_exp(log_likelihoods))
+            max_log_lik = np.max(log_likelihoods, axis=1, keepdims=True)
+            exp_log_lik = np.exp(log_likelihoods - max_log_lik)
+            responsibilities = exp_log_lik / np.sum(exp_log_lik, axis=1, keepdims=True)
+            self.responsibilities = responsibilities
+            # M-step: Update cluster probabilities
+            self.cluster_probs = np.mean(responsibilities, axis=0)
+            # M-step: Update each cluster's HMM parameters
+            # We use weighted fitting: each sequence contributes to each cluster
+            # proportionally to its responsibility
+            for k in range(self.n_clusters):
+                # For simplicity, we fit using all sequences but this could be
+                # optimized to use only sequences with high responsibility
+                # For now, we refit each cluster's HMM
+                # Suppress warnings about init_params during fitting
+                import warnings
+                with warnings.catch_warnings():
+                    warnings.filterwarnings('ignore', message='.*init_params.*')
+                    self.clusters[k].fit(n_iter=10, tol=tol, verbose=False)
+            # Compute overall log-likelihood
+            # log P(data) = sum over sequences of log(sum over clusters of P(seq | cluster) * P(cluster))
+            log_likelihood = np.sum(
+                np.log(np.sum(np.exp(log_likelihoods), axis=1) + 1e-10)
+            )
+            if verbose:
+                print(f"Iteration {iteration + 1}: log-likelihood = {log_likelihood:.4f}")
+            # Check convergence
+            if iteration > 0:
+                change = log_likelihood - prev_log_likelihood
+                if abs(change) < tol:
+                    self.converged = True
+                    if verbose:
+                        print(f"Converged at iteration {iteration + 1}")
+                    break
+            prev_log_likelihood = log_likelihood
+        self.log_likelihood = prev_log_likelihood
+        self.n_iter = iteration + 1
+        if not self.converged:
+            self.converged = False
+            if verbose:
+                print(f"Did not converge after {n_iter} iterations")
+        return self
+    def predict_cluster(self, sequences: Optional[SequenceData] = None) -> np.ndarray:
+        """
+        Predict the most likely cluster for each sequence.
+        Args:
+            sequences: Optional SequenceData (uses self.observations if None)
+        Returns:
+            numpy array: Predicted cluster index for each sequence
+        """
+        if self.responsibilities is None:
+            raise ValueError("Model must be fitted before prediction. Use fit() first.")
+        if sequences is None:
+            return np.argmax(self.responsibilities, axis=1)
+        else:
+            # Compute responsibilities for new sequences
+            X, lengths = sequence_data_to_hmmlearn_format(sequences)
+            n_sequences = len(lengths)
+            log_likelihoods = np.zeros((n_sequences, self.n_clusters))
+            for k in range(self.n_clusters):
+                for seq_idx in range(n_sequences):
+                    start_idx = lengths[:seq_idx].sum()
+                    end_idx = start_idx + lengths[seq_idx]
+                    seq_X = X[start_idx:end_idx]
+                    seq_lengths = np.array([lengths[seq_idx]])
+                    log_likelihoods[seq_idx, k] = self.clusters[k]._hmm_model.score(seq_X, seq_lengths)
+            log_probs = np.log(self.cluster_probs + 1e-10)
+            log_likelihoods += log_probs[np.newaxis, :]
+            max_log_lik = np.max(log_likelihoods, axis=1, keepdims=True)
+            exp_log_lik = np.exp(log_likelihoods - max_log_lik)
+            responsibilities = exp_log_lik / np.sum(exp_log_lik, axis=1, keepdims=True)
+            return np.argmax(responsibilities, axis=1)
+    def __repr__(self) -> str:
+        """String representation of the MHMM."""
+        status = "fitted" if self.log_likelihood is not None else "unfitted"
+        return (f"MHMM(n_clusters={self.n_clusters}, n_states={self.n_states}, "
+                f"n_sequences={self.n_sequences}, status='{status}')")