pg-sui 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
  2. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
  3. pgsui/__init__.py +0 -8
  4. pgsui/_version.py +2 -2
  5. pgsui/cli.py +577 -125
  6. pgsui/data_processing/config.py +1 -2
  7. pgsui/data_processing/containers.py +203 -530
  8. pgsui/data_processing/transformers.py +44 -20
  9. pgsui/impute/deterministic/imputers/mode.py +475 -182
  10. pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
  11. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
  12. pgsui/impute/supervised/imputers/random_forest.py +3 -2
  13. pgsui/impute/unsupervised/base.py +1269 -534
  14. pgsui/impute/unsupervised/callbacks.py +28 -33
  15. pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
  16. pgsui/impute/unsupervised/imputers/vae.py +931 -787
  17. pgsui/impute/unsupervised/loss_functions.py +156 -202
  18. pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
  19. pgsui/impute/unsupervised/models/vae_model.py +40 -221
  20. pgsui/impute/unsupervised/nn_scorers.py +53 -13
  21. pgsui/utils/classification_viz.py +240 -97
  22. pgsui/utils/misc.py +201 -3
  23. pgsui/utils/plotting.py +73 -58
  24. pgsui/utils/pretty_metrics.py +2 -6
  25. pgsui/utils/scorers.py +39 -0
  26. pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
  27. pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
  28. pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
  29. pgsui/impute/unsupervised/models/ubp_model.py +0 -200
  30. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
  31. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
  32. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
  33. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0
pgsui/impute/unsupervised/models/nlpca_model.py
@@ -1,206 +0,0 @@
- from typing import List, Literal
-
- import numpy as np
- import torch
- import torch.nn as nn
- from snpio.utils.logging import LoggerManager
-
- from pgsui.impute.unsupervised.loss_functions import MaskedFocalLoss
- from pgsui.utils.logging_utils import configure_logger
-
-
- class NLPCAModel(nn.Module):
-     r"""A non-linear Principal Component Analysis (NLPCA) decoder for genotypes.
-
-     This module maps a low-dimensional latent vector to logits over genotype states
-     (two classes for haploids or three for diploids) at every locus. It is a fully
-     connected network with optional batch normalization and dropout layers and is
-     used as the decoder inside the NLPCA imputer.
-
-     **Model Architecture**
-
-     Let :math:`z \in \mathbb{R}^{d_{\text{latent}}}` be the latent vector. For a
-     network with :math:`L` hidden layers, the transformations are
-
-     .. math::
-
-         h_1 = f(W_1 z + b_1)
-
-     .. math::
-
-         h_2 = f(W_2 h_1 + b_2)
-
-     .. math::
-
-         \vdots
-
-     .. math::
-
-         h_L = f(W_L h_{L-1} + b_L)
-
-     The final layer produces logits of shape ``(batch_size, n_features, num_classes)``
-     by reshaping a linear projection back to the (loci, genotype-state) grid.
-
-     **Loss Function**
-
-     Training minimizes ``MaskedFocalLoss``, which extends cross-entropy with class
-     weighting, focal re-weighting, and masking so that only observed genotypes
-     contribute to the objective.
-     """
-
-     def __init__(
-         self,
-         n_features: int,
-         prefix: str,
-         *,
-         num_classes: int = 4,
-         hidden_layer_sizes: List[int] | np.ndarray = [128, 64],
-         latent_dim: int = 2,
-         dropout_rate: float = 0.2,
-         activation: Literal["relu", "elu", "selu", "leaky_relu"] = "relu",
-         gamma: float = 2.0,
-         device: Literal["gpu", "cpu", "mps"] = "cpu",
-         verbose: bool = False,
-         debug: bool = False,
-     ):
-         """Initializes the NLPCAModel.
-
-         Args:
-             n_features (int): The number of features (SNPs) in the input data.
-             prefix (str): A prefix used for logging.
-             num_classes (int): Number of genotype states per locus (2 for haploid, 3 for diploid in practice). Defaults to 4 for backward compatibility.
-             hidden_layer_sizes (list[int] | np.ndarray): A list of integers specifying the number of units in each hidden layer. Defaults to [128, 64].
-             latent_dim (int): The dimensionality of the latent space (the size of the bottleneck layer). Defaults to 2.
-             dropout_rate (float): The dropout rate applied to each hidden layer for regularization. Defaults to 0.2.
-             activation (Literal["relu", "elu", "selu", "leaky_relu"]): The non-linear activation function to use in hidden layers. Defaults to 'relu'.
-             gamma (float): The focusing parameter for the focal loss function, which down-weights well-classified examples. Defaults to 2.0.
-             device (Literal["gpu", "cpu", "mps"]): The PyTorch device to run the model on. Defaults to 'cpu'.
-             verbose (bool): If True, enables detailed logging. Defaults to False.
-             debug (bool): If True, enables debug mode. Defaults to False.
-         """
-         super(NLPCAModel, self).__init__()
-
-         logman = LoggerManager(
-             name=__name__, prefix=prefix, verbose=verbose, debug=debug
-         )
-         self.logger = configure_logger(
-             logman.get_logger(), verbose=verbose, debug=debug
-         )
-
-         self.n_features = n_features
-         self.num_classes = num_classes
-         self.latent_dim = latent_dim
-         self.gamma = gamma
-         self.device = device
-
-         if isinstance(hidden_layer_sizes, np.ndarray):
-             hidden_layer_sizes = hidden_layer_sizes.tolist()
-
-         layers = []
-         input_dim = latent_dim
-         for size in hidden_layer_sizes:
-             layers.append(nn.Linear(input_dim, size))
-             layers.append(nn.BatchNorm1d(size))
-             layers.append(nn.Dropout(dropout_rate))
-             layers.append(self._resolve_activation(activation))
-             input_dim = size
-
-         # Final layer output size is now n_features * num_classes
-         final_output_size = self.n_features * self.num_classes
-         layers.append(nn.Linear(hidden_layer_sizes[-1], final_output_size))
-
-         self.phase23_decoder = nn.Sequential(*layers)
-
-         # Reshape tuple reflects the output structure
-         self.reshape = (self.n_features, self.num_classes)
-
-     def _resolve_activation(
-         self, activation: Literal["relu", "elu", "selu", "leaky_relu"]
-     ) -> nn.Module:
-         """Resolves an activation function from a string name.
-
-         This method acts as a factory, returning the correct PyTorch activation function module based on the provided name.
-
-         Args:
-             activation (Literal["relu", "elu", "selu", "leaky_relu"]): The name of the activation function.
-
-         Returns:
-             nn.Module: The corresponding PyTorch activation function module.
-
-         Raises:
-             ValueError: If the provided activation name is not supported.
-         """
-         act: str = activation.lower()
-
-         if act == "relu":
-             return nn.ReLU()
-         elif act == "elu":
-             return nn.ELU()
-         elif act == "leaky_relu":
-             return nn.LeakyReLU()
-         elif act == "selu":
-             return nn.SELU()
-         else:
-             msg = f"Activation function {act} not supported."
-             self.logger.error(msg)
-             raise ValueError(msg)
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         """Performs the forward pass of the model.
-
-         The input tensor is passed through the decoder network to produce logits,
-         which are reshaped to align with the locus-by-class grid used by the loss.
-
-         Args:
-             x (torch.Tensor): The input tensor, which should represent the latent space vector.
-
-         Returns:
-             torch.Tensor: The reconstructed output tensor of shape `(batch_size, n_features, num_classes)`.
-         """
-         x = self.phase23_decoder(x)
-
-         # Reshape to (batch, features, num_classes)
-         return x.view(-1, *self.reshape)
-
-     def compute_loss(
-         self,
-         y: torch.Tensor,
-         outputs: torch.Tensor,
-         mask: torch.Tensor | None = None,
-         class_weights: torch.Tensor | None = None,
-         gamma: float = 2.0,
-     ) -> torch.Tensor:
-         """Computes the masked focal loss between model outputs and ground truth.
-
-         This method calculates the loss value, handling class imbalance with weights and ignoring masked (missing) values.
-
-         Args:
-             y (torch.Tensor): Integer ground-truth genotypes of shape `(batch_size, n_features)`.
-             outputs (torch.Tensor): Logits of shape `(batch_size, n_features, num_classes)`.
-             mask (torch.Tensor | None): An optional boolean mask indicating which elements should be included in the loss calculation. Defaults to None.
-             class_weights (torch.Tensor | None): An optional tensor of weights for each class to address imbalance. Defaults to None.
-             gamma (float): The focusing parameter for the focal loss. Defaults to 2.0.
-
-         Returns:
-             torch.Tensor: The computed scalar loss value.
-         """
-         if class_weights is None:
-             class_weights = torch.ones(self.num_classes, device=outputs.device)
-
-         if mask is None:
-             mask = torch.ones_like(y, dtype=torch.bool)
-
-         # Explicitly flatten all tensors to the (N, C) and (N,) format.
-         # This creates a clear contract with the new MaskedFocalLoss function.
-         n_classes = outputs.shape[-1]
-         logits_flat = outputs.reshape(-1, n_classes)
-         targets_flat = y.reshape(-1)
-         mask_flat = mask.reshape(-1)
-
-         criterion = MaskedFocalLoss(gamma=gamma, alpha=class_weights)
-
-         return criterion(
-             logits_flat.to(self.device),
-             targets_flat.to(self.device),
-             valid_mask=mask_flat.to(self.device),
-         )
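
Both removed models hand their logits to the package's MaskedFocalLoss after flattening the (batch_size, n_features, num_classes) grid into (N, C) logits, an (N,) target vector, and an (N,) validity mask, as documented in compute_loss above. The sketch below (not part of the diff) illustrates only that flattening contract; it substitutes a plain masked cross-entropy for MaskedFocalLoss, whose actual focal re-weighting lives in pgsui/impute/unsupervised/loss_functions.py.

import torch
import torch.nn.functional as F

batch_size, n_features, num_classes = 8, 100, 3
logits = torch.randn(batch_size, n_features, num_classes)          # decoder output
targets = torch.randint(0, num_classes, (batch_size, n_features))  # integer genotypes
mask = torch.rand(batch_size, n_features) > 0.2                    # True where the genotype is observed

# Flatten to the (N, C) / (N,) contract used by compute_loss.
logits_flat = logits.reshape(-1, num_classes)
targets_flat = targets.reshape(-1)
mask_flat = mask.reshape(-1)

# Stand-in for MaskedFocalLoss: only observed (unmasked) entries contribute.
loss = F.cross_entropy(logits_flat[mask_flat], targets_flat[mask_flat])
print(loss.item())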
pgsui/impute/unsupervised/models/ubp_model.py
@@ -1,200 +0,0 @@
- from typing import Callable, List, Literal
-
- import numpy as np
- import torch
- import torch.nn as nn
- from snpio.utils.logging import LoggerManager
-
- from pgsui.impute.unsupervised.loss_functions import MaskedFocalLoss
- from pgsui.utils.logging_utils import configure_logger
-
-
- class UBPModel(nn.Module):
-     """An Unsupervised Backpropagation (UBP) decoder for genotype logits.
-
-     The model reconstructs locus-level genotype probabilities (two states for haploid data or three for diploid data) from a latent vector. It exposes two decoding branches so the training schedule can follow the UBP recipe:
-
-     1. **Phase 1 decoder** - a shallow linear layer that co-trains with latent codes.
-     2. **Phase 2/3 decoder** - a deeper MLP with batch normalization and dropout that is first trained in isolation and later fine-tuned jointly with the latents.
-
-     Both paths ultimately reshape their logits to ``(batch_size, n_features, num_classes)`` and training uses ``MaskedFocalLoss`` to focus on hard examples while masking missing entries.
-     """
-
-     def __init__(
-         self,
-         n_features: int,
-         prefix: str,
-         *,
-         num_classes: int = 3,
-         hidden_layer_sizes: List[int] | np.ndarray = [128, 64],
-         latent_dim: int = 2,
-         dropout_rate: float = 0.2,
-         activation: Literal["relu", "elu", "selu", "leaky_relu"] = "relu",
-         gamma: float = 2.0,
-         device: Literal["cpu", "gpu", "mps"] = "cpu",
-         verbose: bool = False,
-         debug: bool = False,
-     ):
-         """Initializes the UBPModel.
-
-         Args:
-             n_features (int): The number of features (SNPs) in the input data.
-             prefix (str): A prefix used for logging.
-             num_classes (int): Number of genotype states per locus (typically 2 or 3). Defaults to 3.
-             hidden_layer_sizes (list[int] | np.ndarray): A list of integers specifying the size of each hidden layer in the deep (Phase 2/3) decoder. Defaults to [128, 64].
-             latent_dim (int): The dimensionality of the input latent space. Defaults to 2.
-             dropout_rate (float): The dropout rate for regularization in the deep decoder. Defaults to 0.2.
-             activation (str): The non-linear activation function to use in the deep decoder's hidden layers. Defaults to 'relu'.
-             gamma (float): The focusing parameter for the focal loss function. Defaults to 2.0.
-             device (Literal["cpu", "gpu", "mps"]): The PyTorch device to run the model on. Defaults to 'cpu'.
-             verbose (bool): If True, enables detailed logging. Defaults to False.
-             debug (bool): If True, enables debug mode. Defaults to False.
-         """
-         super(UBPModel, self).__init__()
-
-         logman = LoggerManager(
-             name=__name__, prefix=prefix, verbose=verbose, debug=debug
-         )
-         self.logger = configure_logger(
-             logman.get_logger(), verbose=verbose, debug=debug
-         )
-
-         self.n_features = n_features
-         self.num_classes = num_classes
-         self.latent_dim = latent_dim
-         self.gamma = gamma
-         self.device = device
-
-         if isinstance(hidden_layer_sizes, np.ndarray):
-             hidden_layer_sizes = hidden_layer_sizes.tolist()
-
-         # Final layer output size is now n_features * num_classes
-         final_output_size = n_features * num_classes
-
-         # Phase 1 decoder: Simple linear model
-         self.phase1_decoder = nn.Sequential(
-             nn.Linear(latent_dim, final_output_size, device=device),
-         )
-
-         # Phase 2 & 3 uses the Convolutional Decoder
-         act_factory = self._resolve_activation_factory(activation)
-
-         if hidden_layer_sizes[0] > hidden_layer_sizes[-1]:
-             hidden_layer_sizes = list(reversed(hidden_layer_sizes))
-
-         # Phase 2 & 3: Flexible deeper network
-         layers = []
-         input_dim = latent_dim
-         for size in hidden_layer_sizes:
-             layers.append(nn.Linear(input_dim, size))
-             layers.append(nn.BatchNorm1d(size))
-             layers.append(nn.Dropout(dropout_rate))
-             layers.append(act_factory())
-             input_dim = size
-
-         layers.append(nn.Linear(hidden_layer_sizes[-1], final_output_size))
-
-         self.phase23_decoder = nn.Sequential(*layers)
-         self.reshape = (self.n_features, self.num_classes)
-
-     def _resolve_activation_factory(
-         self, activation: Literal["relu", "elu", "selu", "leaky_relu"]
-     ) -> Callable[[], nn.Module]:
-         """Resolves an activation function factory from a string name.
-
-         This method acts as a factory, returning a callable (lambda function) that produces the desired PyTorch activation function module when called.
-
-         Args:
-             activation (Literal["relu", "elu", "selu", "leaky_relu"]): The name of the activation function.
-
-         Returns:
-             Callable[[], nn.Module]: A factory function that, when called, returns an instance of the specified activation layer.
-
-         Raises:
-             ValueError: If the provided activation name is not supported.
-         """
-         a = activation.lower()
-         if a == "relu":
-             return lambda: nn.ReLU()
-         if a == "elu":
-             return lambda: nn.ELU()
-         if a == "leaky_relu":
-             return lambda: nn.LeakyReLU()
-         if a == "selu":
-             return lambda: nn.SELU()
-
-         msg = f"Activation function {activation} not supported."
-         self.logger.error(msg)
-         raise ValueError(msg)
-
-     def forward(self, x: torch.Tensor, phase: int = 1) -> torch.Tensor:
-         """Performs the forward pass through the UBP model.
-
-         This method routes the input tensor through the appropriate decoder based on
-         the specified training ``phase`` and reshapes the logits to the
-         `(batch_size, n_features, num_classes)` grid expected by the loss.
-
-         Args:
-             x (torch.Tensor): The input latent tensor of shape `(batch_size, latent_dim)`.
-             phase (int): The training phase (1, 2, or 3), which determines which decoder path to use.
-
-         Returns:
-             torch.Tensor: Logits shaped as `(batch_size, n_features, num_classes)`.
-
-         Raises:
-             ValueError: If an invalid phase is provided.
-         """
-         if phase == 1:
-             # Linear decoder for phase 1
-             x = self.phase1_decoder(x)
-             return x.view(-1, *self.reshape)
-         elif phase in {2, 3}:
-             x = self.phase23_decoder(x)
-             return x.view(-1, *self.reshape)
-         else:
-             msg = f"Invalid phase: {phase}. Expected 1, 2, or 3."
-             self.logger.error(msg)
-             raise ValueError(msg)
-
-     def compute_loss(
-         self,
-         y: torch.Tensor,
-         outputs: torch.Tensor,
-         mask: torch.Tensor | None = None,
-         class_weights: torch.Tensor | None = None,
-         gamma: float = 2.0,
-     ) -> torch.Tensor:
-         """Computes the masked focal loss between model outputs and ground truth.
-
-         This method calculates the loss value, handling class imbalance with weights and ignoring masked (missing) values in the ground truth tensor.
-
-         Args:
-             y (torch.Tensor): Integer ground-truth genotypes of shape `(batch_size, n_features)`.
-             outputs (torch.Tensor): Logits of shape `(batch_size, n_features, num_classes)`.
-             mask (torch.Tensor | None): An optional boolean mask indicating which elements should be included in the loss calculation.
-             class_weights (torch.Tensor | None): An optional tensor of weights for each class to address imbalance.
-             gamma (float): The focusing parameter for the focal loss.
-
-         Returns:
-             torch.Tensor: The computed scalar loss value.
-         """
-         if class_weights is None:
-             class_weights = torch.ones(self.num_classes, device=outputs.device)
-
-         if mask is None:
-             mask = torch.ones_like(y, dtype=torch.bool)
-
-         # Explicitly flatten all tensors to the (N, C) and (N,) format.
-         # This creates a clear contract with the new MaskedFocalLoss function.
-         n_classes = outputs.shape[-1]
-         logits_flat = outputs.reshape(-1, n_classes)
-         targets_flat = y.reshape(-1)
-         mask_flat = mask.reshape(-1)
-
-         criterion = MaskedFocalLoss(gamma=gamma, alpha=class_weights)
-
-         return criterion(
-             logits_flat.to(self.device),
-             targets_flat.to(self.device),
-             valid_mask=mask_flat.to(self.device),
-         )
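
The removed UBPModel docstring describes a two-branch decoder selected by a training-phase argument. The following self-contained sketch is not taken from the package; layer sizes and names are illustrative only. It mirrors that routing with plain PyTorch modules to show how forward(x, phase=...) would pick the shallow Phase 1 branch or the deeper Phase 2/3 MLP and reshape logits to (batch_size, n_features, num_classes).

import torch
import torch.nn as nn

latent_dim, n_features, num_classes = 2, 50, 3
out_size = n_features * num_classes

# Phase 1: shallow linear branch co-trained with the latent codes.
phase1_decoder = nn.Linear(latent_dim, out_size)

# Phases 2/3: deeper MLP branch with batch norm and dropout (illustrative sizes).
phase23_decoder = nn.Sequential(
    nn.Linear(latent_dim, 64), nn.BatchNorm1d(64), nn.Dropout(0.2), nn.ReLU(),
    nn.Linear(64, 128), nn.BatchNorm1d(128), nn.Dropout(0.2), nn.ReLU(),
    nn.Linear(128, out_size),
)

def decode(z: torch.Tensor, phase: int) -> torch.Tensor:
    """Route the latent batch through the branch for the current UBP phase."""
    if phase == 1:
        logits = phase1_decoder(z)
    elif phase in {2, 3}:
        logits = phase23_decoder(z)
    else:
        raise ValueError(f"Invalid phase: {phase}. Expected 1, 2, or 3.")
    return logits.view(-1, n_features, num_classes)

z = torch.randn(16, latent_dim)
for phase in (1, 2, 3):
    print(phase, decode(z, phase).shape)  # torch.Size([16, 50, 3]) in every phase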