pg-sui 1.6.16a3-py3-none-any.whl → 1.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
- pgsui/__init__.py +0 -8
- pgsui/_version.py +2 -2
- pgsui/cli.py +577 -125
- pgsui/data_processing/config.py +1 -2
- pgsui/data_processing/containers.py +203 -530
- pgsui/data_processing/transformers.py +44 -20
- pgsui/impute/deterministic/imputers/mode.py +475 -182
- pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
- pgsui/impute/supervised/imputers/random_forest.py +3 -2
- pgsui/impute/unsupervised/base.py +1269 -534
- pgsui/impute/unsupervised/callbacks.py +28 -33
- pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
- pgsui/impute/unsupervised/imputers/vae.py +931 -787
- pgsui/impute/unsupervised/loss_functions.py +156 -202
- pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
- pgsui/impute/unsupervised/models/vae_model.py +40 -221
- pgsui/impute/unsupervised/nn_scorers.py +53 -13
- pgsui/utils/classification_viz.py +240 -97
- pgsui/utils/misc.py +201 -3
- pgsui/utils/plotting.py +73 -58
- pgsui/utils/pretty_metrics.py +2 -6
- pgsui/utils/scorers.py +39 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
- pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
- pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
- pgsui/impute/unsupervised/models/ubp_model.py +0 -200
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0
pgsui/impute/unsupervised/models/vae_model.py

@@ -1,46 +1,27 @@
-from
+from __future__ import annotations
 
-import
+import copy
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from
-
+from typing import List, Literal, Optional, Tuple, Union
+import numpy as np
 
-from
+from snpio.utils.logging import LoggerManager
 from pgsui.utils.logging_utils import configure_logger
 
 
 class Sampling(nn.Module):
-    """A layer that samples from a latent distribution using the reparameterization trick.
-
-    This layer is a core component of a Variational Autoencoder (VAE). It takes the mean and log-variance of a latent distribution as input and generates a sample from that distribution. By using the reparameterization trick ($z = \mu + \sigma \cdot \epsilon$), it allows gradients to be backpropagated through the random sampling process, making the VAE trainable.
-    """
+    """A layer that samples from a latent distribution using the reparameterization trick."""
 
     def forward(self, z_mean: torch.Tensor, z_log_var: torch.Tensor) -> torch.Tensor:
-
-
-        Args:
-            z_mean (torch.Tensor): The mean of the latent normal distribution.
-            z_log_var (torch.Tensor): The log of the variance of the latent normal distribution.
-
-        Returns:
-            torch.Tensor: A sampled vector from the latent space.
-        """
-        z_sigma = torch.exp(0.5 * z_log_var)  # Precompute outside
-
-        # Ensure on GPU
-        # rand_like takes random values from a normal distribution
-        # of the same shape as z_mean.
+        z_sigma = torch.exp(0.5 * z_log_var)
         epsilon = torch.randn_like(z_mean, device=z_mean.device)
         return z_mean + z_sigma * epsilon
 
 
 class Encoder(nn.Module):
-    """The Encoder module of a Variational Autoencoder (VAE).
-
-    This module defines the encoder network, which takes high-dimensional input data and maps it to the parameters of a lower-dimensional latent distribution. The architecture consists of a series of fully-connected hidden layers that process the flattened input. The network culminates in two separate linear layers that output the mean (`z_mean`) and log-variance (`z_log_var`) of the approximate posterior distribution, $q(z|x)$.
-    """
+    """The Encoder module of a Variational Autoencoder (VAE)."""
 
     def __init__(
         self,
@@ -51,33 +32,17 @@ class Encoder(nn.Module):
         dropout_rate: float,
         activation: torch.nn.Module,
     ):
-
-
-        Args:
-            n_features (int): The number of features in the input data (e.g., SNPs).
-            num_classes (int): Number of genotype states per locus (2 for haploid, 3 for diploid in practice).
-            latent_dim (int): The dimensionality of the latent space.
-            hidden_layer_sizes (List[int]): A list of integers specifying the size of each hidden layer.
-            dropout_rate (float): The dropout rate for regularization in the hidden layers.
-            activation (torch.nn.Module): An instantiated activation function module (e.g., `nn.ReLU()`) for the hidden layers.
-        """
-        super(Encoder, self).__init__()
+        super().__init__()
         self.flatten = nn.Flatten()
-        self.activation = (
-            getattr(F, activation) if isinstance(activation, str) else activation
-        )
 
         layers = []
-        # The input dimension accounts for channels
         input_dim = n_features * num_classes
+
         for hidden_size in hidden_layer_sizes:
             layers.append(nn.Linear(input_dim, hidden_size))
-
-            # BatchNorm can lead to faster convergence.
             layers.append(nn.BatchNorm1d(hidden_size))
-
+            layers.append(copy.deepcopy(activation))
             layers.append(nn.Dropout(dropout_rate))
-            layers.append(activation)
             input_dim = hidden_size
 
         self.hidden_layers = nn.Sequential(*layers)
@@ -88,14 +53,6 @@ class Encoder(nn.Module):
     def forward(
         self, x: torch.Tensor
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Performs the forward pass through the encoder.
-
-        Args:
-            x (torch.Tensor): The input data tensor of shape `(batch_size, n_features, num_classes)`.
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple containing the latent mean (`z_mean`), latent log-variance (`z_log_var`), and a sample from the latent distribution (`z`).
-        """
         x = self.flatten(x)
         x = self.hidden_layers(x)
         z_mean = self.dense_z_mean(x)
@@ -105,10 +62,7 @@ class Encoder(nn.Module):
 
 
 class Decoder(nn.Module):
-    """The Decoder module of a Variational Autoencoder (VAE).
-
-    This module defines the decoder network, which takes a sample from the low-dimensional latent space and maps it back to the high-dimensional data space. It aims to reconstruct the original input data. The architecture consists of a series of fully-connected hidden layers followed by a final linear layer that produces the reconstructed data, which is then reshaped to match the original input's dimensions.
-    """
+    """The Decoder module of a Variational Autoencoder (VAE)."""
 
     def __init__(
         self,
@@ -119,65 +73,30 @@ class Decoder(nn.Module):
         dropout_rate: float,
         activation: torch.nn.Module,
     ) -> None:
-
-
-        Args:
-            n_features (int): The number of features in the output data (e.g., SNPs).
-            num_classes (int): Number of genotype states per locus (typically 2 or 3).
-            latent_dim (int): The dimensionality of the input latent space.
-            hidden_layer_sizes (List[int]): A list of integers specifying the size of each hidden layer.
-            dropout_rate (float): The dropout rate for regularization in the hidden layers.
-            activation (torch.nn.Module): An instantiated activation function module (e.g., `nn.ReLU()`) for the hidden layers.
-        """
-        super(Decoder, self).__init__()
+        super().__init__()
 
         layers = []
         input_dim = latent_dim
+
         for hidden_size in hidden_layer_sizes:
             layers.append(nn.Linear(input_dim, hidden_size))
-
-            # BatchNorm can lead to faster convergence.
             layers.append(nn.BatchNorm1d(hidden_size))
-
+            layers.append(copy.deepcopy(activation))
             layers.append(nn.Dropout(dropout_rate))
-            layers.append(activation)
             input_dim = hidden_size
 
         self.hidden_layers = nn.Sequential(*layers)
-        # UPDATED: Output dimension must account for channels
         output_dim = n_features * num_classes
         self.dense_output = nn.Linear(input_dim, output_dim)
-        # UPDATED: Reshape must account for channels
         self.reshape = (n_features, num_classes)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Performs the forward pass through the decoder.
-
-        Args:
-            x (torch.Tensor): The input latent tensor of shape `(batch_size, latent_dim)`.
-
-        Returns:
-            torch.Tensor: The reconstructed output data of shape `(batch_size, n_features, num_classes)`.
-        """
         x = self.hidden_layers(x)
         x = self.dense_output(x)
         return x.view(-1, *self.reshape)
 
 
 class VAEModel(nn.Module):
-    """A Variational Autoencoder (VAE) model for imputation.
-
-    This class combines an `Encoder` and a `Decoder` to form a VAE, a generative model for learning complex data distributions. It is designed for imputing missing values in categorical data, such as genomic SNPs. The model is trained by maximizing the Evidence Lower Bound (ELBO), which is a lower bound on the log-likelihood of the data.
-
-    **Objective Function (ELBO):**
-    The VAE loss function is derived from the ELBO and consists of two main components: a reconstruction term and a regularization term.
-    $$
-    \\mathcal{L}(\\theta, \\phi; x) = \\underbrace{\\mathbb{E}_{q_{\\phi}(z|x)}[\\log p_{\\theta}(x|z)]}_{\\text{Reconstruction Loss}} - \\underbrace{D_{KL}(q_{\\phi}(z|x) || p(z))}_{\\text{KL Divergence}}
-    $$
-    - The **Reconstruction Loss** encourages the decoder to accurately reconstruct the input data from its latent representation. This implementation uses a `MaskedFocalLoss`.
-    - The **KL Divergence** acts as a regularizer, forcing the approximate posterior distribution $q_{\\phi}(z|x)$ learned by the encoder to be close to a prior distribution $p(z)$ (typically a standard normal distribution).
-    """
-
     def __init__(
         self,
         n_features: int,
@@ -188,33 +107,18 @@ class VAEModel(nn.Module):
         latent_dim: int = 2,
         dropout_rate: float = 0.2,
         activation: Literal["relu", "elu", "selu", "leaky_relu"] = "relu",
-
-        beta: float = 1.0,
+        kl_beta: float = 1.0,
         device: Literal["cpu", "gpu", "mps"] = "cpu",
         verbose: bool = False,
         debug: bool = False,
     ):
-        """
-
-
-
-
-
-
-            latent_dim (int): The dimensionality of the latent space. Defaults to 2.
-            dropout_rate (float): The dropout rate for regularization in the hidden layers. Defaults to 0.2.
-            activation (str): The name of the activation function to use in hidden layers. Defaults to "relu".
-            gamma (float): The focusing parameter for the focal loss component. Defaults to 2.0.
-            beta (float): A weighting factor for the KL divergence term in the total loss ($\beta$-VAE). Defaults to 1.0.
-            device (Literal["cpu", "gpu", "mps"]): The device to run the model on.
-            verbose (bool): If True, enables detailed logging. Defaults to False.
-            debug (bool): If True, enables debug mode. Defaults to False.
-        """
-        super(VAEModel, self).__init__()
-        self.num_classes = num_classes
-        self.gamma = gamma
-        self.beta = beta
-        self.device = device
+        """Variational Autoencoder (VAE) model for unsupervised imputation."""
+        super().__init__()
+        self.n_features = int(n_features)
+        self.num_classes = int(num_classes)
+        self.latent_dim = int(latent_dim)
+        self.kl_beta = float(kl_beta)
+        self.torch_device = device
 
         logman = LoggerManager(
             name=__name__, prefix=prefix, verbose=verbose, debug=debug
@@ -224,23 +128,20 @@ class VAEModel(nn.Module):
         )
 
         act = self._resolve_activation(activation)
-
-
-
-
-
+        hls = (
+            hidden_layer_sizes.tolist()
+            if isinstance(hidden_layer_sizes, np.ndarray)
+            else hidden_layer_sizes
+        )
 
         self.encoder = Encoder(
-            n_features, self.num_classes, latent_dim, hls, dropout_rate, act
+            self.n_features, self.num_classes, self.latent_dim, hls, dropout_rate, act
         )
-
-        decoder_layer_sizes = list(reversed(hls))
-
         self.decoder = Decoder(
-            n_features,
+            self.n_features,
             self.num_classes,
-            latent_dim,
-
+            self.latent_dim,
+            list(reversed(hls)),
             dropout_rate,
             act,
         )
@@ -248,102 +149,20 @@ class VAEModel(nn.Module):
     def forward(
         self, x: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Performs the forward pass through the full VAE model.
-
-        Args:
-            x (torch.Tensor): The input data tensor of shape `(batch_size, n_features, num_classes)`.
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: A tuple containing the reconstructed output, the latent mean (`z_mean`), and the latent log-variance (`z_log_var`).
-        """
         z_mean, z_log_var, z = self.encoder(x)
         reconstruction = self.decoder(z)
         return reconstruction, z_mean, z_log_var
 
-    def
-
-
-
-        mask: torch.Tensor | None = None,
-        class_weights: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        """Computes the VAE loss function (negative ELBO).
-
-        The loss is the sum of a reconstruction term and a regularizing KL divergence term. The reconstruction loss is calculated using a masked focal loss, and the KL divergence measures the difference between the learned latent distribution and a standard normal prior.
-
-        Args:
-            outputs (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): The tuple of (reconstruction, z_mean, z_log_var) from the model's forward pass.
-            y (torch.Tensor): The target data tensor, expected to be one-hot encoded. This is converted to class indices internally for the loss function.
-            mask (torch.Tensor | None): A boolean mask to exclude missing values from the reconstruction loss.
-            class_weights (torch.Tensor | None): Weights to apply to each class in the reconstruction loss to handle imbalance.
-
-        Returns:
-            torch.Tensor: The computed scalar loss value.
-        """
-        reconstruction, z_mean, z_log_var = outputs
-
-        # 1. KL Divergence Calculation
-        prior = Normal(torch.zeros_like(z_mean), torch.ones_like(z_log_var))
-        posterior = Normal(z_mean, torch.exp(0.5 * z_log_var))
-        kl_loss = (
-            torch.distributions.kl.kl_divergence(posterior, prior).sum(dim=1).mean()
-        )
-
-        if class_weights is None:
-            class_weights = torch.ones(self.num_classes, device=y.device)
-
-        # 2. Reconstruction Loss Calculation
-        # Reverting to the robust method of flattening tensors and using the
-        # custom loss function.
-        n_classes = reconstruction.shape[-1]
-        logits_flat = reconstruction.reshape(-1, n_classes)
-
-        # Convert one-hot `y` to class indices for the loss function.
-        targets_flat = torch.argmax(y, dim=-1).reshape(-1)
-
-        if mask is None:
-            # If no mask is provided, all targets are considered valid.
-            mask_flat = torch.ones_like(targets_flat, dtype=torch.bool)
-        else:
-            # The mask needs to be reshaped to match the flattened targets.
-            mask_flat = mask.reshape(-1)
-
-        # Logits, class-index targets, and the valid mask.
-        criterion = MaskedFocalLoss(alpha=class_weights, gamma=self.gamma)
-
-        reconstruction_loss = criterion(
-            logits_flat.to(self.device),
-            targets_flat.to(self.device),
-            valid_mask=mask_flat.to(self.device),
-        )
-
-        return reconstruction_loss + self.beta * kl_loss
-
-    def _resolve_activation(
-        self, activation: Literal["relu", "elu", "leaky_relu", "selu"]
-    ) -> torch.nn.Module:
-        """Resolves an activation function module from a string name.
-
-        Args:
-            activation (Literal["relu", "elu", "leaky_relu", "selu"]): The name of the activation function.
-
-        Returns:
-            torch.nn.Module: The corresponding instantiated PyTorch activation function module.
-
-        Raises:
-            ValueError: If the provided activation name is not supported.
-        """
-        if isinstance(activation, str):
-            a = activation.lower()
+    def _resolve_activation(self, activation: Union[str, nn.Module]) -> nn.Module:
+        if isinstance(activation, nn.Module):
+            return activation
+        a = activation.lower()
         if a == "relu":
             return nn.ReLU()
-
+        if a == "elu":
             return nn.ELU()
-
+        if a in {"leaky_relu", "leakyrelu"}:
             return nn.LeakyReLU()
-
+        if a == "selu":
             return nn.SELU()
-
-        msg = f"Activation {activation} not supported."
-        self.logger.error(msg)
-        raise ValueError(msg)
+        raise ValueError(f"Activation {activation} not supported.")
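Both the Encoder and Decoder in the new version append `copy.deepcopy(activation)` to each hidden block instead of reusing the single `activation` instance that was appended before. A minimal standalone sketch (illustrative only, not pg-sui code; `nn.PReLU` and the layer sizes are arbitrary choices for the demo) of why per-layer copies can matter: stateless activations behave identically either way, but a parameterized activation shared across layers would also share its learnable parameters.

```python
# Illustrative sketch, not pg-sui code: the effect of copy.deepcopy(activation)
# per hidden layer versus reusing one module instance.
import copy

import torch.nn as nn

activation = nn.PReLU()  # hypothetical parameterized activation chosen for the demo
hidden_sizes = [32, 16, 8]

shared, copied = [], []
in_dim = 64
for h in hidden_sizes:
    shared += [nn.Linear(in_dim, h), activation]                 # same object reused
    copied += [nn.Linear(in_dim, h), copy.deepcopy(activation)]  # fresh copy per layer
    in_dim = h

# The shared variant registers only one PReLU slope; the copied variant gets one per layer.
n_shared = sum(p.numel() for p in nn.Sequential(*shared).parameters())
n_copied = sum(p.numel() for p in nn.Sequential(*copied).parameters())
print(n_copied - n_shared)  # 2 extra learnable slopes with three hidden layers
```

With the stateless activations the model actually exposes ("relu", "elu", "selu", "leaky_relu"), the copy is a safety measure rather than a behavioral change.

pgsui/impute/unsupervised/nn_scorers.py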
@@ -6,6 +6,8 @@ from sklearn.metrics import (
     accuracy_score,
     average_precision_score,
     f1_score,
+    jaccard_score,
+    matthews_corrcoef,
     precision_score,
     recall_score,
     roc_auc_score,
@@ -106,18 +108,23 @@ class Scorer:
             recall_score(y_true, y_pred, average=self.average, zero_division=0)
         )
 
-    def roc_auc(self,
+    def roc_auc(self, y_true_ohe: np.ndarray, y_pred_proba: np.ndarray) -> float:
         """Compute the ROC AUC score.
 
         Args:
-
+            y_true_ohe (np.ndarray): One-hot encoded ground truth (correct) target values.
             y_pred_proba (np.ndarray): Predicted probabilities.
 
         Returns:
             float: The ROC AUC score.
         """
-        if
-
+        if np.all(np.count_nonzero(y_true_ohe[..., 1]) == 0) or np.all(
+            np.count_nonzero(y_true_ohe[..., 2]) == 0
+        ):
+            # ROC AUC is not defined in that case
+            msg = "No positive samples in y_true; ROC AUC score is undefined. Setting to 0.5 (random classification chance)."
+            self.logger.warning(msg)
+            return 0.5  # Return a neutral score
 
         if y_pred_proba.shape[-1] == 2:
             # Binary classification case
@@ -125,14 +132,11 @@ class Scorer:
             # Otherwise it throws an error.
             y_pred_proba = y_pred_proba[:, 1]
 
-
-
-
-                y_true, y_pred_proba, average=self.average, multi_class="ovr"
-            )
+        return float(
+            roc_auc_score(
+                y_true_ohe, y_pred_proba, average=self.average, multi_class="ovr"
             )
-
-        return float(roc_auc_score(y_true, y_pred_proba, average=self.average))
+        )
 
     # This method now correctly expects one-hot encoded true labels
     def average_precision(
@@ -160,6 +164,34 @@ class Scorer:
             average_precision_score(y_true_ohe, y_pred_proba, average=self.average)
         )
 
+    def jaccard(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        """Compute the Jaccard score.
+
+        Args:
+            y_true (np.ndarray): Ground truth (correct) target values.
+            y_pred (np.ndarray): Estimated target values.
+
+        Returns:
+            float: The Jaccard score.
+        """
+        return float(
+            jaccard_score(y_true, y_pred, average=self.average, zero_division=0)
+        )
+
+    def mcc(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        """Compute the Matthews correlation coefficient (MCC).
+
+        MCC is a balanced measure that can be used even if the classes are of very different sizes. It returns a value between -1 and +1, where +1 indicates a perfect prediction, 0 indicates no better than random prediction, and -1 indicates total disagreement between prediction and observation.
+
+        Args:
+            y_true (np.ndarray): Ground truth (correct) target values.
+            y_pred (np.ndarray): Estimated target values.
+
+        Returns:
+            float: The Matthews correlation coefficient.
+        """
+        return float(matthews_corrcoef(y_true, y_pred))
+
     def pr_macro(self, y_true_ohe: np.ndarray, y_pred_proba: np.ndarray) -> float:
         """Compute the macro-average precision score.
 
@@ -196,6 +228,8 @@ class Scorer:
             "f1",
             "precision",
             "recall",
+            "mcc",
+            "jaccard",
         ] = "pr_macro",
     ) -> Dict[str, float]:
         """Evaluate the model using various metrics.
@@ -218,7 +252,7 @@ class Scorer:
                 np.asarray(y_true_ohe), np.asarray(y_pred_proba)
             ),
             "roc_auc": lambda: self.roc_auc(
-                np.asarray(
+                np.asarray(y_true_ohe), np.asarray(y_pred_proba)
             ),
             "average_precision": lambda: self.average_precision(
                 np.asarray(y_true_ohe), np.asarray(y_pred_proba)
@@ -231,6 +265,8 @@ class Scorer:
                 np.asarray(y_true), np.asarray(y_pred)
             ),
             "recall": lambda: self.recall(np.asarray(y_true), np.asarray(y_pred)),
+            "mcc": lambda: self.mcc(np.asarray(y_true), np.asarray(y_pred)),
+            "jaccard": lambda: self.jaccard(np.asarray(y_true), np.asarray(y_pred)),
         }
         if tune_metric not in metric_calculators:
             msg = f"Invalid tune_metric provided: '{tune_metric}'."
@@ -244,12 +280,16 @@ class Scorer:
             "f1": self.f1(np.asarray(y_true), np.asarray(y_pred)),
             "precision": self.precision(np.asarray(y_true), np.asarray(y_pred)),
             "recall": self.recall(np.asarray(y_true), np.asarray(y_pred)),
-            "roc_auc": self.roc_auc(
+            "roc_auc": self.roc_auc(
+                np.asarray(y_true_ohe), np.asarray(y_pred_proba)
+            ),
             "average_precision": self.average_precision(
                 np.asarray(y_true_ohe), np.asarray(y_pred_proba)
             ),
             "pr_macro": self.pr_macro(
                 np.asarray(y_true_ohe), np.asarray(y_pred_proba)
             ),
+            "mcc": self.mcc(np.asarray(y_true), np.asarray(y_pred)),
+            "jaccard": self.jaccard(np.asarray(y_true), np.asarray(y_pred)),
         }
         return {k: float(v) for k, v in metrics.items()}