sequenzo 0.1.31__cp310-cp310-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
- sequenzo/__init__.py +349 -0
- sequenzo/big_data/__init__.py +12 -0
- sequenzo/big_data/clara/__init__.py +26 -0
- sequenzo/big_data/clara/clara.py +476 -0
- sequenzo/big_data/clara/utils/__init__.py +27 -0
- sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
- sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
- sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
- sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
- sequenzo/big_data/clara/visualization.py +88 -0
- sequenzo/clustering/KMedoids.py +178 -0
- sequenzo/clustering/__init__.py +30 -0
- sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
- sequenzo/clustering/hierarchical_clustering.py +1256 -0
- sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
- sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
- sequenzo/clustering/src/KMedoid.cpp +263 -0
- sequenzo/clustering/src/PAM.cpp +237 -0
- sequenzo/clustering/src/PAMonce.cpp +265 -0
- sequenzo/clustering/src/cluster_quality.cpp +496 -0
- sequenzo/clustering/src/cluster_quality.h +128 -0
- sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
- sequenzo/clustering/src/module.cpp +228 -0
- sequenzo/clustering/src/weightedinertia.cpp +111 -0
- sequenzo/clustering/utils/__init__.py +27 -0
- sequenzo/clustering/utils/disscenter.py +122 -0
- sequenzo/data_preprocessing/__init__.py +22 -0
- sequenzo/data_preprocessing/helpers.py +303 -0
- sequenzo/datasets/__init__.py +41 -0
- sequenzo/datasets/biofam.csv +2001 -0
- sequenzo/datasets/biofam_child_domain.csv +2001 -0
- sequenzo/datasets/biofam_left_domain.csv +2001 -0
- sequenzo/datasets/biofam_married_domain.csv +2001 -0
- sequenzo/datasets/chinese_colonial_territories.csv +12 -0
- sequenzo/datasets/country_co2_emissions.csv +194 -0
- sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
- sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
- sequenzo/datasets/country_gdp_per_capita.csv +194 -0
- sequenzo/datasets/dyadic_children.csv +61 -0
- sequenzo/datasets/dyadic_parents.csv +61 -0
- sequenzo/datasets/mvad.csv +713 -0
- sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
- sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
- sequenzo/datasets/political_science_aid_shock.csv +166 -0
- sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
- sequenzo/define_sequence_data.py +1400 -0
- sequenzo/dissimilarity_measures/__init__.py +31 -0
- sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
- sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
- sequenzo/dissimilarity_measures/src/__init__.py +0 -0
- sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/module.cpp +40 -0
- sequenzo/dissimilarity_measures/src/setup.py +30 -0
- sequenzo/dissimilarity_measures/src/utils.h +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
- sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
- sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
- sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
- sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
- sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
- sequenzo/multidomain/__init__.py +23 -0
- sequenzo/multidomain/association_between_domains.py +311 -0
- sequenzo/multidomain/cat.py +597 -0
- sequenzo/multidomain/combt.py +519 -0
- sequenzo/multidomain/dat.py +81 -0
- sequenzo/multidomain/idcd.py +139 -0
- sequenzo/multidomain/linked_polyad.py +292 -0
- sequenzo/openmp_setup.py +233 -0
- sequenzo/prefix_tree/__init__.py +62 -0
- sequenzo/prefix_tree/hub.py +114 -0
- sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
- sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
- sequenzo/prefix_tree/spell_level_indicators.py +297 -0
- sequenzo/prefix_tree/system_level_indicators.py +544 -0
- sequenzo/prefix_tree/utils.py +54 -0
- sequenzo/seqhmm/__init__.py +95 -0
- sequenzo/seqhmm/advanced_optimization.py +305 -0
- sequenzo/seqhmm/bootstrap.py +411 -0
- sequenzo/seqhmm/build_hmm.py +142 -0
- sequenzo/seqhmm/build_mhmm.py +136 -0
- sequenzo/seqhmm/build_nhmm.py +121 -0
- sequenzo/seqhmm/fit_mhmm.py +62 -0
- sequenzo/seqhmm/fit_model.py +61 -0
- sequenzo/seqhmm/fit_nhmm.py +76 -0
- sequenzo/seqhmm/formulas.py +289 -0
- sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
- sequenzo/seqhmm/gradients_nhmm.py +306 -0
- sequenzo/seqhmm/hmm.py +291 -0
- sequenzo/seqhmm/mhmm.py +314 -0
- sequenzo/seqhmm/model_comparison.py +238 -0
- sequenzo/seqhmm/multichannel_em.py +282 -0
- sequenzo/seqhmm/multichannel_utils.py +138 -0
- sequenzo/seqhmm/nhmm.py +270 -0
- sequenzo/seqhmm/nhmm_utils.py +191 -0
- sequenzo/seqhmm/predict.py +137 -0
- sequenzo/seqhmm/predict_mhmm.py +142 -0
- sequenzo/seqhmm/simulate.py +878 -0
- sequenzo/seqhmm/utils.py +218 -0
- sequenzo/seqhmm/visualization.py +910 -0
- sequenzo/sequence_characteristics/__init__.py +40 -0
- sequenzo/sequence_characteristics/complexity_index.py +49 -0
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
- sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
- sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
- sequenzo/sequence_characteristics/turbulence.py +155 -0
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
- sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
- sequenzo/suffix_tree/__init__.py +66 -0
- sequenzo/suffix_tree/hub.py +114 -0
- sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
- sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
- sequenzo/suffix_tree/spell_level_indicators.py +248 -0
- sequenzo/suffix_tree/system_level_indicators.py +535 -0
- sequenzo/suffix_tree/utils.py +56 -0
- sequenzo/version_check.py +283 -0
- sequenzo/visualization/__init__.py +29 -0
- sequenzo/visualization/plot_mean_time.py +222 -0
- sequenzo/visualization/plot_modal_state.py +276 -0
- sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
- sequenzo/visualization/plot_relative_frequency.py +405 -0
- sequenzo/visualization/plot_sequence_index.py +1175 -0
- sequenzo/visualization/plot_single_medoid.py +153 -0
- sequenzo/visualization/plot_state_distribution.py +651 -0
- sequenzo/visualization/plot_transition_matrix.py +190 -0
- sequenzo/visualization/utils/__init__.py +23 -0
- sequenzo/visualization/utils/utils.py +310 -0
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- sequenzo-0.1.31.dist-info/METADATA +286 -0
- sequenzo-0.1.31.dist-info/RECORD +299 -0
- sequenzo-0.1.31.dist-info/WHEEL +5 -0
- sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
- sequenzo-0.1.31.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : Yuqi Liang 梁彧祺
|
|
3
|
+
@File : model_comparison.py
|
|
4
|
+
@Time : 2025-10-08 14:32
|
|
5
|
+
@Desc : Model comparison functions (AIC, BIC) for HMM models
|
|
6
|
+
|
|
7
|
+
This module provides functions for computing AIC and BIC to compare different
|
|
8
|
+
HMM models, similar to seqHMM's logLik() and summary() functions in R.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from typing import Optional
|
|
13
|
+
from .hmm import HMM
|
|
14
|
+
from .mhmm import MHMM
|
|
15
|
+
from .nhmm import NHMM
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def compute_n_parameters(model) -> int:
    """
    Count the free (estimable) parameters of a model.

    The count feeds the penalty terms of AIC and BIC. Probability rows
    that must sum to one contribute one fewer free parameter per row.

    Args:
        model: HMM, MHMM, or NHMM model object

    Returns:
        int: Number of free parameters

    Raises:
        ValueError: If the model type is not recognized.
    """
    if isinstance(model, HMM):
        # Basic HMM: the initial vector loses one degree of freedom, and
        # every row of the transition and emission matrices loses one as
        # well (each row sums to 1).
        s = model.n_states
        return (s - 1) + s * (s - 1) + s * (model.n_symbols - 1)

    elif isinstance(model, MHMM):
        # Mixture HMM: mixing weights (n_clusters - 1 free) plus the
        # per-cluster HMM parameters, plus optional covariate coefficients.
        per_cluster = sum(
            (c.n_states - 1)
            + c.n_states * (c.n_states - 1)
            + c.n_states * (c.n_symbols - 1)
            for c in (model.clusters[k] for k in range(model.n_clusters))
        )
        coefs = 0
        if model.coefficients is not None:
            # The first column is fixed at zero (reference cluster), so it
            # carries no free parameters.
            coefs = model.coefficients.size - model.n_clusters
        return (model.n_clusters - 1) + per_cluster + coefs

    elif isinstance(model, NHMM):
        # Non-homogeneous HMM: regression coefficients for the initial,
        # transition and emission models. The softmax parameterization
        # absorbs the sum-to-one constraints, so nothing is subtracted.
        s = model.n_states
        c = model.n_covariates
        return c * s + c * s * s + c * s * model.n_symbols

    else:
        raise ValueError(f"Unknown model type: {type(model)}")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def compute_n_observations(model) -> int:
    """
    Count the observations used to fit a model.

    A fully observed time point of a single sequence counts as one
    observation. For multichannel models each value observed in a single
    channel therefore amounts to 1/n_channels of an observation.

    Args:
        model: HMM, MHMM, or NHMM model object

    Returns:
        int: Number of observations

    Raises:
        ValueError: If the model type is not recognized.
    """
    if not isinstance(model, (HMM, MHMM, NHMM)):
        raise ValueError(f"Unknown model type: {type(model)}")

    # Single-channel models may not define n_channels; default to 1.
    channels = getattr(model, 'n_channels', 1)
    return int(sum(model.sequence_lengths) / channels)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def aic(model, log_likelihood: Optional[float] = None) -> float:
    """
    Akaike Information Criterion (AIC) for a fitted model.

    AIC = -2 * log-likelihood + 2 * n_parameters. Lower values indicate
    a better trade-off between fit and complexity. Mirrors seqHMM's
    stats::AIC(logLik(model)) in R.

    Args:
        model: Fitted HMM, MHMM, or NHMM model object
        log_likelihood: Optional log-likelihood value. If None, uses
            model.log_likelihood

    Returns:
        float: AIC value

    Raises:
        ValueError: If no log-likelihood is available (model not fitted).

    Examples:
        >>> from sequenzo.seqhmm import build_hmm, fit_model, aic
        >>>
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm = fit_model(hmm)
        >>> aic_value = aic(hmm)
        >>> print(f"AIC: {aic_value:.2f}")
    """
    # Prefer an explicitly supplied log-likelihood; otherwise fall back
    # to the value stored on the fitted model.
    ll = log_likelihood if log_likelihood is not None else model.log_likelihood
    if ll is None:
        raise ValueError("Model must be fitted before computing AIC. Use fit_model() first.")

    return 2 * compute_n_parameters(model) - 2 * ll
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def bic(model, log_likelihood: Optional[float] = None) -> float:
    """
    Bayesian Information Criterion (BIC) for a fitted model.

    BIC = -2 * log-likelihood + log(n_observations) * n_parameters.
    Lower values indicate better models; BIC penalizes complexity more
    heavily than AIC, especially on large datasets. Mirrors seqHMM's
    stats::BIC(logLik(model)) in R.

    Args:
        model: Fitted HMM, MHMM, or NHMM model object
        log_likelihood: Optional log-likelihood value. If None, uses
            model.log_likelihood

    Returns:
        float: BIC value

    Raises:
        ValueError: If no log-likelihood is available (model not fitted).

    Examples:
        >>> from sequenzo.seqhmm import build_hmm, fit_model, bic
        >>>
        >>> hmm = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm = fit_model(hmm)
        >>> bic_value = bic(hmm)
        >>> print(f"BIC: {bic_value:.2f}")
    """
    # Prefer an explicitly supplied log-likelihood; otherwise fall back
    # to the value stored on the fitted model.
    ll = log_likelihood if log_likelihood is not None else model.log_likelihood
    if ll is None:
        raise ValueError("Model must be fitted before computing BIC. Use fit_model() first.")

    penalty = np.log(compute_n_observations(model)) * compute_n_parameters(model)
    return penalty - 2 * ll
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def compare_models(models: list, criterion: str = 'BIC') -> dict:
    """
    Compare multiple fitted models using AIC or BIC.

    Computes the chosen criterion for every model and returns a table
    sorted best-first (lower criterion value is better), similar to
    comparing models in seqHMM.

    Args:
        models: List of fitted model objects (HMM, MHMM, or NHMM)
        criterion: Criterion to use ('AIC' or 'BIC'). Default is 'BIC'.

    Returns:
        dict: Dictionary with model names, log-likelihood, n_parameters,
            and criterion values, sorted by the criterion.

    Raises:
        ValueError: If the criterion is unknown or any model is unfitted.

    Examples:
        >>> from sequenzo.seqhmm import build_hmm, fit_model, compare_models
        >>>
        >>> # Fit models with different numbers of states
        >>> hmm3 = build_hmm(seq, n_states=3, random_state=42)
        >>> hmm4 = build_hmm(seq, n_states=4, random_state=42)
        >>> hmm5 = build_hmm(seq, n_states=5, random_state=42)
        >>>
        >>> hmm3 = fit_model(hmm3)
        >>> hmm4 = fit_model(hmm4)
        >>> hmm5 = fit_model(hmm5)
        >>>
        >>> # Compare models
        >>> comparison = compare_models([hmm3, hmm4, hmm5], criterion='BIC')
        >>> print(comparison)
    """
    if criterion not in ('AIC', 'BIC'):
        raise ValueError("criterion must be 'AIC' or 'BIC'")

    # Resolve the scoring function once instead of branching per model.
    score = aic if criterion == 'AIC' else bic

    rows = []
    for idx, candidate in enumerate(models):
        if candidate.log_likelihood is None:
            raise ValueError(f"Model {idx} must be fitted before comparison.")

        rows.append({
            'model': f"Model {idx + 1}",
            'log_likelihood': candidate.log_likelihood,
            'n_parameters': compute_n_parameters(candidate),
            'n_observations': compute_n_observations(candidate),
            criterion: score(candidate),
        })

    # Lower criterion values indicate better models, so sort ascending.
    rows.sort(key=lambda row: row[criterion])

    return {
        'criterion': criterion,
        'models': rows,
        'best_model': rows[0]['model'],
    }
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : Yuqi Liang 梁彧祺
|
|
3
|
+
@File : multichannel_em.py
|
|
4
|
+
@Time : 2025-11-08 13:52
|
|
5
|
+
@Desc : EM algorithm for multichannel HMM
|
|
6
|
+
|
|
7
|
+
This module provides the EM algorithm implementation for multichannel HMM,
|
|
8
|
+
where each sequence has multiple parallel channels (e.g., marriage, children, residence).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from typing import List
|
|
13
|
+
from .hmm import HMM
|
|
14
|
+
from .multichannel_utils import multichannel_to_hmmlearn_format, compute_multichannel_emission_prob
|
|
15
|
+
from .utils import sequence_data_to_hmmlearn_format, state_to_int_mapping
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def fit_multichannel_hmm(
    model: HMM,
    n_iter: int = 100,
    tol: float = 1e-2,
    verbose: bool = False
) -> HMM:
    """
    Fit a multichannel HMM with the EM (Baum-Welch) algorithm.

    For multichannel data the emission probability of a time point is the
    product of the per-channel emission probabilities, i.e. channels are
    assumed conditionally independent given the hidden state. This mirrors
    seqHMM's multichannel HMM fitting in R.

    Args:
        model: HMM model object carrying the multichannel data. Missing
            parameters (initial/transition/emission probabilities) are
            initialized here.
        n_iter: Maximum number of EM iterations.
        tol: Convergence tolerance on the absolute change in log-likelihood.
        verbose: Whether to print progress messages.

    Returns:
        HMM: The same model object with fitted parameters and
        `log_likelihood`, `n_iter`, `converged` set.
    """
    n_channels = model.n_channels
    n_states = model.n_states
    channels = model.channels

    # Sequence lengths are identical across channels by construction.
    lengths = model.sequence_lengths
    n_sequences = len(lengths)

    # ---- Initialize any parameters the caller did not provide ----
    if model.initial_probs is None:
        model.initial_probs = np.ones(n_states) / n_states

    if model.transition_probs is None:
        model.transition_probs = np.ones((n_states, n_states)) / n_states

    if model.emission_probs is None or not isinstance(model.emission_probs, list):
        # One random row-stochastic emission matrix per channel.
        model.emission_probs = []
        for ch in range(n_channels):
            n_symbols_ch = model.n_symbols[ch]
            emission_ch = np.random.rand(n_states, n_symbols_ch)
            emission_ch = emission_ch / emission_ch.sum(axis=1, keepdims=True)
            model.emission_probs.append(emission_ch)

    # ---- Encode every channel's observations as integer symbol indices ----
    X_list = []
    state_to_int_list = []
    for ch in range(n_channels):
        X_ch, _ = sequence_data_to_hmmlearn_format(channels[ch])
        X_list.append(X_ch)
        state_to_int_list.append(state_to_int_mapping(channels[ch].alphabet))

    def _joint_emission(obs_list, t):
        """Per-state product of channel emission probabilities at time t."""
        emission_vec = np.ones(n_states)
        for ch in range(n_channels):
            emission_vec = emission_vec * model.emission_probs[ch][:, obs_list[ch][t]]
        return emission_vec

    # ---- EM iterations ----
    prev_log_likelihood = -np.inf
    total_log_lik = -np.inf
    # Reset up front so a model refitted after a converged run does not keep
    # a stale True (also removes the old dead "if not converged" branch).
    model.converged = False
    iteration = -1  # keeps model.n_iter well-defined even when n_iter == 0

    for iteration in range(n_iter):
        # E-step: scaled forward/backward passes. Posteriors are renormalized
        # per time step, so the arbitrary per-column scale factors cancel.
        log_alpha = {}  # seq_idx -> (n_states, T) log of scaled forward probs
        log_beta = {}   # seq_idx -> (n_states, T) log of scaled backward probs

        total_log_lik = 0.0

        # Forward pass (per-time-step scaling prevents underflow).
        for seq_idx in range(n_sequences):
            seq_length = lengths[seq_idx]
            start_idx = lengths[:seq_idx].sum()
            end_idx = start_idx + seq_length

            obs_list = [X_ch[start_idx:end_idx, 0] for X_ch in X_list]

            alpha = np.zeros((n_states, seq_length))

            # alpha[:, 0] = pi * prod_ch B_ch[:, obs_ch[0]]
            alpha[:, 0] = model.initial_probs * _joint_emission(obs_list, 0)
            scale = alpha[:, 0].sum()
            alpha[:, 0] /= scale
            log_scale = np.log(scale)

            for t in range(1, seq_length):
                # alpha[:, t] = (alpha[:, t-1] @ A) * joint emission at t
                alpha[:, t] = (alpha[:, t - 1] @ model.transition_probs) * _joint_emission(obs_list, t)
                scale = alpha[:, t].sum()
                alpha[:, t] /= scale
                log_scale += np.log(scale)

            log_alpha[seq_idx] = np.log(alpha + 1e-10)
            # The accumulated log scales equal the sequence log-likelihood.
            total_log_lik += log_scale

        # Backward pass (columns normalized; constants cancel in posteriors).
        for seq_idx in range(n_sequences):
            seq_length = lengths[seq_idx]
            start_idx = lengths[:seq_idx].sum()
            end_idx = start_idx + seq_length

            obs_list = [X_ch[start_idx:end_idx, 0] for X_ch in X_list]

            beta = np.ones((n_states, seq_length))

            for t in range(seq_length - 2, -1, -1):
                # beta[:, t] = A @ (joint emission at t+1 * beta[:, t+1])
                beta[:, t] = model.transition_probs @ (_joint_emission(obs_list, t + 1) * beta[:, t + 1])
                beta[:, t] /= beta[:, t].sum()

            log_beta[seq_idx] = np.log(beta + 1e-10)

        # M-step: initial state probabilities from the t = 0 posteriors.
        gamma_0 = np.zeros(n_states)
        for seq_idx in range(n_sequences):
            log_g = log_alpha[seq_idx][:, 0] + log_beta[seq_idx][:, 0]
            gamma_0 += np.exp(log_g - np.log(np.sum(np.exp(log_g))))
        model.initial_probs = gamma_0 / n_sequences

        # M-step: transition probabilities.
        xi_sum = np.zeros((n_states, n_states))
        gamma_sum = np.zeros(n_states)

        for seq_idx in range(n_sequences):
            seq_length = lengths[seq_idx]
            start_idx = lengths[:seq_idx].sum()
            end_idx = start_idx + seq_length

            obs_list = [X_ch[start_idx:end_idx, 0] for X_ch in X_list]

            for t in range(seq_length - 1):
                # gamma: posterior of each state at time t. Only t = 0..T-2
                # enter the transition denominator (fix: the old code also
                # added the final time point, so rows of the updated A did
                # not sum to one).
                log_gamma = log_alpha[seq_idx][:, t] + log_beta[seq_idx][:, t]
                log_gamma -= np.log(np.sum(np.exp(log_gamma)))
                gamma_sum += np.exp(log_gamma)

                # xi: joint posterior of state i at t and state j at t+1.
                emission_next = _joint_emission(obs_list, t + 1)
                log_xi_mat = (
                    log_alpha[seq_idx][:, t][:, None] +
                    np.log(model.transition_probs + 1e-10) +
                    np.log(emission_next + 1e-10)[None, :] +
                    log_beta[seq_idx][:, t + 1][None, :]
                )
                # One shared normalizer per time step (fix: the old code
                # recomputed this constant inside an (i, j) double loop,
                # O(S^4) per step instead of O(S^2)).
                log_norm = np.log(np.sum(np.exp(log_xi_mat)))
                xi_sum += np.exp(log_xi_mat - log_norm)

        for i in range(n_states):
            if gamma_sum[i] > 0:
                model.transition_probs[i, :] = xi_sum[i, :] / gamma_sum[i]
            else:
                # No posterior mass in state i: fall back to a uniform row.
                model.transition_probs[i, :] = 1.0 / n_states

        # M-step: emission probabilities, one matrix per channel.
        for ch in range(n_channels):
            n_symbols_ch = model.n_symbols[ch]
            emission_ch = np.zeros((n_states, n_symbols_ch))
            gamma_sum_ch = np.zeros(n_states)

            for seq_idx in range(n_sequences):
                seq_length = lengths[seq_idx]
                start_idx = lengths[:seq_idx].sum()
                end_idx = start_idx + seq_length

                obs_ch = X_list[ch][start_idx:end_idx, 0]

                for t in range(seq_length):
                    log_gamma = log_alpha[seq_idx][:, t] + log_beta[seq_idx][:, t]
                    log_gamma -= np.log(np.sum(np.exp(log_gamma)))
                    gamma = np.exp(log_gamma)

                    # Soft counts of symbol obs_ch[t] emitted from each state.
                    emission_ch[:, obs_ch[t]] += gamma
                    gamma_sum_ch += gamma

            for i in range(n_states):
                if gamma_sum_ch[i] > 0:
                    model.emission_probs[ch][i, :] = emission_ch[i, :] / gamma_sum_ch[i]
                else:
                    model.emission_probs[ch][i, :] = 1.0 / n_symbols_ch

        # Convergence check on the change in total log-likelihood.
        if iteration > 0 and abs(total_log_lik - prev_log_likelihood) < tol:
            model.converged = True
            if verbose:
                print(f"Converged at iteration {iteration + 1}")
            break

        prev_log_likelihood = total_log_lik

        if verbose and (iteration + 1) % 10 == 0:
            print(f"Iteration {iteration + 1}: log-likelihood = {total_log_lik:.4f}")

    # Fix: report the log-likelihood of the LAST E-step, also when the loop
    # exits via the convergence break (the old code stored the previous
    # iteration's value in that case).
    model.log_likelihood = total_log_lik
    model.n_iter = iteration + 1

    if not model.converged and verbose:
        print(f"Did not converge after {n_iter} iterations")

    return model
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author : Yuqi Liang 梁彧祺
|
|
3
|
+
@File : multichannel_utils.py
|
|
4
|
+
@Time : 2025-11-05 11:26
|
|
5
|
+
@Desc : Utility functions for multichannel HMM support
|
|
6
|
+
|
|
7
|
+
This module provides helper functions for handling multichannel sequence data,
|
|
8
|
+
where each subject has multiple parallel sequences (channels).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from typing import List, Union, Tuple
|
|
13
|
+
from sequenzo.define_sequence_data import SequenceData
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def prepare_multichannel_data(
    observations: Union[SequenceData, List[SequenceData]]
) -> Tuple[List[SequenceData], List[str], List[List[str]]]:
    """
    Prepare multichannel data for HMM.

    This function handles both single-channel (SequenceData) and
    multichannel (List[SequenceData]) inputs.

    Args:
        observations: Either a single SequenceData or a list of SequenceData objects

    Returns:
        tuple: (channels, channel_names, alphabets) where:
            - channels: List of SequenceData objects (one per channel)
            - channel_names: List of channel names
            - alphabets: List of alphabets (one per channel)

    Raises:
        ValueError: If the list is empty, an element is not a SequenceData,
            channels disagree on the number of sequences, or the input is of
            an unsupported type.
    """
    if isinstance(observations, SequenceData):
        # Single channel: wrap in one-element lists.
        return [observations], ["Channel 1"], [observations.alphabet]

    elif isinstance(observations, list):
        # Multichannel
        if len(observations) == 0:
            raise ValueError("observations list cannot be empty")

        # Validate element types FIRST, so a wrong element type raises the
        # intended ValueError instead of an AttributeError when .sequences
        # is accessed below (the old code read observations[0].sequences
        # before any type check).
        for i, obs in enumerate(observations):
            if not isinstance(obs, SequenceData):
                raise ValueError(f"observations[{i}] must be a SequenceData object")

        # All channels must describe the same set of subjects.
        n_sequences = len(observations[0].sequences)
        for i, obs in enumerate(observations):
            if len(obs.sequences) != n_sequences:
                raise ValueError(
                    f"All channels must have the same number of sequences. "
                    f"Channel 0 has {n_sequences}, channel {i} has {len(obs.sequences)}"
                )

        # Get channel names and alphabets
        channel_names = [f"Channel {i+1}" for i in range(len(observations))]
        alphabets = [obs.alphabet for obs in observations]

        return observations, channel_names, alphabets

    else:
        raise ValueError(
            f"observations must be SequenceData or List[SequenceData], got {type(observations)}"
        )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def multichannel_to_hmmlearn_format(
    channels: List[SequenceData]
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Convert multichannel SequenceData to format for hmmlearn.

    hmmlearn's CategoricalHMM has no native multichannel support, so this
    helper only encodes each channel separately and checks that they are
    mutually consistent; the multichannel combination itself is handled by
    the custom EM implementation.

    Args:
        channels: List of SequenceData objects (one per channel)

    Returns:
        tuple: (X_list, lengths) where:
            - X_list: List of observation arrays (one per channel)
            - lengths: Array of sequence lengths (same for all channels)

    Raises:
        ValueError: If the channels disagree on sequence lengths.
    """
    from .utils import sequence_data_to_hmmlearn_format

    encoded = []
    length_arrays = []

    # Encode each channel independently into hmmlearn's (X, lengths) form.
    for channel in channels:
        X_ch, lens_ch = sequence_data_to_hmmlearn_format(channel)
        encoded.append(X_ch)
        length_arrays.append(lens_ch)

    # Every channel must report identical per-sequence lengths; compare all
    # of them against the first channel.
    reference = length_arrays[0]
    for idx in range(1, len(length_arrays)):
        if not np.array_equal(reference, length_arrays[idx]):
            raise ValueError(
                f"All channels must have the same sequence lengths. "
                f"Channel 0 and channel {idx} differ."
            )

    return encoded, reference
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def compute_multichannel_emission_prob(
    emission_probs: List[np.ndarray],
    observations: List[np.ndarray],
    n_states: int
) -> np.ndarray:
    """
    Compute emission probability for multichannel observations.

    For multichannel HMM, the emission probability is the product of
    emission probabilities across all channels (assuming independence):

        P(obs | state) = product over channels: P(obs_channel | state)

    Args:
        emission_probs: List of emission probability matrices, one per channel,
            each of shape (n_states, n_symbols_channel)
        observations: List of observed symbols (one per channel) at current time
        n_states: Number of hidden states

    Returns:
        np.ndarray: Emission probabilities of shape (n_states,) for the
        current observations. (Fix: the old signature was annotated
        ``-> float`` although an array was returned, as the docstring said.)
    """
    n_channels = len(emission_probs)
    emission = np.ones(n_states)

    # Multiply probabilities across channels
    for ch in range(n_channels):
        emission *= emission_probs[ch][:, observations[ch]]

    return emission
|