pg-sui 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
  2. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
  3. pgsui/__init__.py +0 -8
  4. pgsui/_version.py +2 -2
  5. pgsui/cli.py +577 -125
  6. pgsui/data_processing/config.py +1 -2
  7. pgsui/data_processing/containers.py +203 -530
  8. pgsui/data_processing/transformers.py +44 -20
  9. pgsui/impute/deterministic/imputers/mode.py +475 -182
  10. pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
  11. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
  12. pgsui/impute/supervised/imputers/random_forest.py +3 -2
  13. pgsui/impute/unsupervised/base.py +1269 -534
  14. pgsui/impute/unsupervised/callbacks.py +28 -33
  15. pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
  16. pgsui/impute/unsupervised/imputers/vae.py +931 -787
  17. pgsui/impute/unsupervised/loss_functions.py +156 -202
  18. pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
  19. pgsui/impute/unsupervised/models/vae_model.py +40 -221
  20. pgsui/impute/unsupervised/nn_scorers.py +53 -13
  21. pgsui/utils/classification_viz.py +240 -97
  22. pgsui/utils/misc.py +201 -3
  23. pgsui/utils/plotting.py +73 -58
  24. pgsui/utils/pretty_metrics.py +2 -6
  25. pgsui/utils/scorers.py +39 -0
  26. pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
  27. pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
  28. pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
  29. pgsui/impute/unsupervised/models/ubp_model.py +0 -200
  30. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
  31. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
  32. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
  33. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0
pgsui/impute/unsupervised/models/nlpca_model.py
@@ -1,206 +0,0 @@
- from typing import List, Literal
-
- import numpy as np
- import torch
- import torch.nn as nn
- from snpio.utils.logging import LoggerManager
-
- from pgsui.impute.unsupervised.loss_functions import MaskedFocalLoss
- from pgsui.utils.logging_utils import configure_logger
-
-
- class NLPCAModel(nn.Module):
-     r"""A non-linear Principal Component Analysis (NLPCA) decoder for genotypes.
-
-     This module maps a low-dimensional latent vector to logits over genotype states
-     (two classes for haploids or three for diploids) at every locus. It is a fully
-     connected network with optional batch normalization and dropout layers and is
-     used as the decoder inside the NLPCA imputer.
-
-     **Model Architecture**
-
-     Let :math:`z \in \mathbb{R}^{d_{\text{latent}}}` be the latent vector. For a
-     network with :math:`L` hidden layers, the transformations are
-
-     .. math::
-
-         h_1 = f(W_1 z + b_1)
-
-     .. math::
-
-         h_2 = f(W_2 h_1 + b_2)
-
-     .. math::
-
-         \vdots
-
-     .. math::
-
-         h_L = f(W_L h_{L-1} + b_L)
-
-     The final layer produces logits of shape ``(batch_size, n_features, num_classes)``
-     by reshaping a linear projection back to the (loci, genotype-state) grid.
-
-     **Loss Function**
-
-     Training minimizes ``MaskedFocalLoss``, which extends cross-entropy with class
-     weighting, focal re-weighting, and masking so that only observed genotypes
-     contribute to the objective.
-     """
-
-     def __init__(
-         self,
-         n_features: int,
-         prefix: str,
-         *,
-         num_classes: int = 4,
-         hidden_layer_sizes: List[int] | np.ndarray = [128, 64],
-         latent_dim: int = 2,
-         dropout_rate: float = 0.2,
-         activation: Literal["relu", "elu", "selu", "leaky_relu"] = "relu",
-         gamma: float = 2.0,
-         device: Literal["gpu", "cpu", "mps"] = "cpu",
-         verbose: bool = False,
-         debug: bool = False,
-     ):
-         """Initializes the NLPCAModel.
-
-         Args:
-             n_features (int): The number of features (SNPs) in the input data.
-             prefix (str): A prefix used for logging.
-             num_classes (int): Number of genotype states per locus (2 for haploid, 3 for diploid in practice). Defaults to 4 for backward compatibility.
-             hidden_layer_sizes (list[int] | np.ndarray): A list of integers specifying the number of units in each hidden layer. Defaults to [128, 64].
-             latent_dim (int): The dimensionality of the latent space (the size of the bottleneck layer). Defaults to 2.
-             dropout_rate (float): The dropout rate applied to each hidden layer for regularization. Defaults to 0.2.
-             activation (Literal["relu", "elu", "selu", "leaky_relu"]): The non-linear activation function to use in hidden layers. Defaults to 'relu'.
-             gamma (float): The focusing parameter for the focal loss function, which down-weights well-classified examples. Defaults to 2.0.
-             device (Literal["gpu", "cpu", "mps"]): The PyTorch device to run the model on. Defaults to 'cpu'.
-             verbose (bool): If True, enables detailed logging. Defaults to False.
-             debug (bool): If True, enables debug mode. Defaults to False.
-         """
-         super(NLPCAModel, self).__init__()
-
-         logman = LoggerManager(
-             name=__name__, prefix=prefix, verbose=verbose, debug=debug
-         )
-         self.logger = configure_logger(
-             logman.get_logger(), verbose=verbose, debug=debug
-         )
-
-         self.n_features = n_features
-         self.num_classes = num_classes
-         self.latent_dim = latent_dim
-         self.gamma = gamma
-         self.device = device
-
-         if isinstance(hidden_layer_sizes, np.ndarray):
-             hidden_layer_sizes = hidden_layer_sizes.tolist()
-
-         layers = []
-         input_dim = latent_dim
-         for size in hidden_layer_sizes:
-             layers.append(nn.Linear(input_dim, size))
-             layers.append(nn.BatchNorm1d(size))
-             layers.append(nn.Dropout(dropout_rate))
-             layers.append(self._resolve_activation(activation))
-             input_dim = size
-
-         # Final layer output size is now n_features * num_classes
-         final_output_size = self.n_features * self.num_classes
-         layers.append(nn.Linear(hidden_layer_sizes[-1], final_output_size))
-
-         self.phase23_decoder = nn.Sequential(*layers)
-
-         # Reshape tuple reflects the output structure
-         self.reshape = (self.n_features, self.num_classes)
-
-     def _resolve_activation(
-         self, activation: Literal["relu", "elu", "selu", "leaky_relu"]
-     ) -> nn.Module:
-         """Resolves an activation function from a string name.
-
-         This method acts as a factory, returning the correct PyTorch activation function module based on the provided name.
-
-         Args:
-             activation (Literal["relu", "elu", "selu", "leaky_relu"]): The name of the activation function.
-
-         Returns:
-             nn.Module: The corresponding PyTorch activation function module.
-
-         Raises:
-             ValueError: If the provided activation name is not supported.
-         """
-         act: str = activation.lower()
-
-         if act == "relu":
-             return nn.ReLU()
-         elif act == "elu":
-             return nn.ELU()
-         elif act == "leaky_relu":
-             return nn.LeakyReLU()
-         elif act == "selu":
-             return nn.SELU()
-         else:
-             msg = f"Activation function {act} not supported."
-             self.logger.error(msg)
-             raise ValueError(msg)
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         """Performs the forward pass of the model.
-
-         The input tensor is passed through the decoder network to produce logits,
-         which are reshaped to align with the locus-by-class grid used by the loss.
-
-         Args:
-             x (torch.Tensor): The input tensor, which should represent the latent space vector.
-
-         Returns:
-             torch.Tensor: The reconstructed output tensor of shape `(batch_size, n_features, num_classes)`.
-         """
-         x = self.phase23_decoder(x)
-
-         # Reshape to (batch, features, num_classes)
-         return x.view(-1, *self.reshape)
-
-     def compute_loss(
-         self,
-         y: torch.Tensor,
-         outputs: torch.Tensor,
-         mask: torch.Tensor | None = None,
-         class_weights: torch.Tensor | None = None,
-         gamma: float = 2.0,
-     ) -> torch.Tensor:
-         """Computes the masked focal loss between model outputs and ground truth.
-
-         This method calculates the loss value, handling class imbalance with weights and ignoring masked (missing) values.
-
-         Args:
-             y (torch.Tensor): Integer ground-truth genotypes of shape `(batch_size, n_features)`.
-             outputs (torch.Tensor): Logits of shape `(batch_size, n_features, num_classes)`.
-             mask (torch.Tensor | None): An optional boolean mask indicating which elements should be included in the loss calculation. Defaults to None.
-             class_weights (torch.Tensor | None): An optional tensor of weights for each class to address imbalance. Defaults to None.
-             gamma (float): The focusing parameter for the focal loss. Defaults to 2.0.
-
-         Returns:
-             torch.Tensor: The computed scalar loss value.
-         """
-         if class_weights is None:
-             class_weights = torch.ones(self.num_classes, device=outputs.device)
-
-         if mask is None:
-             mask = torch.ones_like(y, dtype=torch.bool)
-
-         # Explicitly flatten all tensors to the (N, C) and (N,) format.
-         # This creates a clear contract with the new MaskedFocalLoss function.
-         n_classes = outputs.shape[-1]
-         logits_flat = outputs.reshape(-1, n_classes)
-         targets_flat = y.reshape(-1)
-         mask_flat = mask.reshape(-1)
-
-         criterion = MaskedFocalLoss(gamma=gamma, alpha=class_weights)
-
-         return criterion(
-             logits_flat.to(self.device),
-             targets_flat.to(self.device),
-             valid_mask=mask_flat.to(self.device),
-         )
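
Both removed models hand their logits to the package's MaskedFocalLoss after flattening the (batch_size, n_features, num_classes) grid into (N, C) logits, an (N,) target vector, and an (N,) validity mask, as documented in compute_loss above. The sketch below (not part of the diff) illustrates only that flattening contract; it substitutes a plain masked cross-entropy for MaskedFocalLoss, whose actual focal re-weighting lives in pgsui/impute/unsupervised/loss_functions.py.

import torch
import torch.nn.functional as F

batch_size, n_features, num_classes = 8, 100, 3
logits = torch.randn(batch_size, n_features, num_classes)          # decoder output
targets = torch.randint(0, num_classes, (batch_size, n_features))  # integer genotypes
mask = torch.rand(batch_size, n_features) > 0.2                    # True where the genotype is observed

# Flatten to the (N, C) / (N,) contract used by compute_loss.
logits_flat = logits.reshape(-1, num_classes)
targets_flat = targets.reshape(-1)
mask_flat = mask.reshape(-1)

# Stand-in for MaskedFocalLoss: only observed (unmasked) entries contribute.
loss = F.cross_entropy(logits_flat[mask_flat], targets_flat[mask_flat])
print(loss.item())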
pgsui/impute/unsupervised/models/ubp_model.py
@@ -1,200 +0,0 @@
- from typing import Callable, List, Literal
-
- import numpy as np
- import torch
- import torch.nn as nn
- from snpio.utils.logging import LoggerManager
-
- from pgsui.impute.unsupervised.loss_functions import MaskedFocalLoss
- from pgsui.utils.logging_utils import configure_logger
-
-
- class UBPModel(nn.Module):
-     """An Unsupervised Backpropagation (UBP) decoder for genotype logits.
-
-     The model reconstructs locus-level genotype probabilities (two states for haploid data or three for diploid data) from a latent vector. It exposes two decoding branches so the training schedule can follow the UBP recipe:
-
-     1. **Phase 1 decoder** - a shallow linear layer that co-trains with latent codes.
-     2. **Phase 2/3 decoder** - a deeper MLP with batch normalization and dropout that is first trained in isolation and later fine-tuned jointly with the latents.
-
-     Both paths ultimately reshape their logits to ``(batch_size, n_features, num_classes)`` and training uses ``MaskedFocalLoss`` to focus on hard examples while masking missing entries.
-     """
-
-     def __init__(
-         self,
-         n_features: int,
-         prefix: str,
-         *,
-         num_classes: int = 3,
-         hidden_layer_sizes: List[int] | np.ndarray = [128, 64],
-         latent_dim: int = 2,
-         dropout_rate: float = 0.2,
-         activation: Literal["relu", "elu", "selu", "leaky_relu"] = "relu",
-         gamma: float = 2.0,
-         device: Literal["cpu", "gpu", "mps"] = "cpu",
-         verbose: bool = False,
-         debug: bool = False,
-     ):
-         """Initializes the UBPModel.
-
-         Args:
-             n_features (int): The number of features (SNPs) in the input data.
-             prefix (str): A prefix used for logging.
-             num_classes (int): Number of genotype states per locus (typically 2 or 3). Defaults to 3.
-             hidden_layer_sizes (list[int] | np.ndarray): A list of integers specifying the size of each hidden layer in the deep (Phase 2/3) decoder. Defaults to [128, 64].
-             latent_dim (int): The dimensionality of the input latent space. Defaults to 2.
-             dropout_rate (float): The dropout rate for regularization in the deep decoder. Defaults to 0.2.
-             activation (str): The non-linear activation function to use in the deep decoder's hidden layers. Defaults to 'relu'.
-             gamma (float): The focusing parameter for the focal loss function. Defaults to 2.0.
-             device (Literal["cpu", "gpu", "mps"]): The PyTorch device to run the model on. Defaults to 'cpu'.
-             verbose (bool): If True, enables detailed logging. Defaults to False.
-             debug (bool): If True, enables debug mode. Defaults to False.
-         """
-         super(UBPModel, self).__init__()
-
-         logman = LoggerManager(
-             name=__name__, prefix=prefix, verbose=verbose, debug=debug
-         )
-         self.logger = configure_logger(
-             logman.get_logger(), verbose=verbose, debug=debug
-         )
-
-         self.n_features = n_features
-         self.num_classes = num_classes
-         self.latent_dim = latent_dim
-         self.gamma = gamma
-         self.device = device
-
-         if isinstance(hidden_layer_sizes, np.ndarray):
-             hidden_layer_sizes = hidden_layer_sizes.tolist()
-
-         # Final layer output size is now n_features * num_classes
-         final_output_size = n_features * num_classes
-
-         # Phase 1 decoder: Simple linear model
-         self.phase1_decoder = nn.Sequential(
-             nn.Linear(latent_dim, final_output_size, device=device),
-         )
-
-         # Phase 2 & 3 uses the Convolutional Decoder
-         act_factory = self._resolve_activation_factory(activation)
-
-         if hidden_layer_sizes[0] > hidden_layer_sizes[-1]:
-             hidden_layer_sizes = list(reversed(hidden_layer_sizes))
-
-         # Phase 2 & 3: Flexible deeper network
-         layers = []
-         input_dim = latent_dim
-         for size in hidden_layer_sizes:
-             layers.append(nn.Linear(input_dim, size))
-             layers.append(nn.BatchNorm1d(size))
-             layers.append(nn.Dropout(dropout_rate))
-             layers.append(act_factory())
-             input_dim = size
-
-         layers.append(nn.Linear(hidden_layer_sizes[-1], final_output_size))
-
-         self.phase23_decoder = nn.Sequential(*layers)
-         self.reshape = (self.n_features, self.num_classes)
-
-     def _resolve_activation_factory(
-         self, activation: Literal["relu", "elu", "selu", "leaky_relu"]
-     ) -> Callable[[], nn.Module]:
-         """Resolves an activation function factory from a string name.
-
-         This method acts as a factory, returning a callable (lambda function) that produces the desired PyTorch activation function module when called.
-
-         Args:
-             activation (Literal["relu", "elu", "selu", "leaky_relu"]): The name of the activation function.
-
-         Returns:
-             Callable[[], nn.Module]: A factory function that, when called, returns an instance of the specified activation layer.
-
-         Raises:
-             ValueError: If the provided activation name is not supported.
-         """
-         a = activation.lower()
-         if a == "relu":
-             return lambda: nn.ReLU()
-         if a == "elu":
-             return lambda: nn.ELU()
-         if a == "leaky_relu":
-             return lambda: nn.LeakyReLU()
-         if a == "selu":
-             return lambda: nn.SELU()
-
-         msg = f"Activation function {activation} not supported."
-         self.logger.error(msg)
-         raise ValueError(msg)
-
-     def forward(self, x: torch.Tensor, phase: int = 1) -> torch.Tensor:
-         """Performs the forward pass through the UBP model.
-
-         This method routes the input tensor through the appropriate decoder based on
-         the specified training ``phase`` and reshapes the logits to the
-         `(batch_size, n_features, num_classes)` grid expected by the loss.
-
-         Args:
-             x (torch.Tensor): The input latent tensor of shape `(batch_size, latent_dim)`.
-             phase (int): The training phase (1, 2, or 3), which determines which decoder path to use.
-
-         Returns:
-             torch.Tensor: Logits shaped as `(batch_size, n_features, num_classes)`.
-
-         Raises:
-             ValueError: If an invalid phase is provided.
-         """
-         if phase == 1:
-             # Linear decoder for phase 1
-             x = self.phase1_decoder(x)
-             return x.view(-1, *self.reshape)
-         elif phase in {2, 3}:
-             x = self.phase23_decoder(x)
-             return x.view(-1, *self.reshape)
-         else:
-             msg = f"Invalid phase: {phase}. Expected 1, 2, or 3."
-             self.logger.error(msg)
-             raise ValueError(msg)
-
-     def compute_loss(
-         self,
-         y: torch.Tensor,
-         outputs: torch.Tensor,
-         mask: torch.Tensor | None = None,
-         class_weights: torch.Tensor | None = None,
-         gamma: float = 2.0,
-     ) -> torch.Tensor:
-         """Computes the masked focal loss between model outputs and ground truth.
-
-         This method calculates the loss value, handling class imbalance with weights and ignoring masked (missing) values in the ground truth tensor.
-
-         Args:
-             y (torch.Tensor): Integer ground-truth genotypes of shape `(batch_size, n_features)`.
-             outputs (torch.Tensor): Logits of shape `(batch_size, n_features, num_classes)`.
-             mask (torch.Tensor | None): An optional boolean mask indicating which elements should be included in the loss calculation.
-             class_weights (torch.Tensor | None): An optional tensor of weights for each class to address imbalance.
-             gamma (float): The focusing parameter for the focal loss.
-
-         Returns:
-             torch.Tensor: The computed scalar loss value.
-         """
-         if class_weights is None:
-             class_weights = torch.ones(self.num_classes, device=outputs.device)
-
-         if mask is None:
-             mask = torch.ones_like(y, dtype=torch.bool)
-
-         # Explicitly flatten all tensors to the (N, C) and (N,) format.
-         # This creates a clear contract with the new MaskedFocalLoss function.
-         n_classes = outputs.shape[-1]
-         logits_flat = outputs.reshape(-1, n_classes)
-         targets_flat = y.reshape(-1)
-         mask_flat = mask.reshape(-1)
-
-         criterion = MaskedFocalLoss(gamma=gamma, alpha=class_weights)
-
-         return criterion(
-             logits_flat.to(self.device),
-             targets_flat.to(self.device),
-             valid_mask=mask_flat.to(self.device),
-         )
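
The removed UBPModel docstring describes a two-branch decoder selected by a training-phase argument. The following self-contained sketch is not taken from the package; layer sizes and names are illustrative only. It mirrors that routing with plain PyTorch modules to show how forward(x, phase=...) would pick the shallow Phase 1 branch or the deeper Phase 2/3 MLP and reshape logits to (batch_size, n_features, num_classes).

import torch
import torch.nn as nn

latent_dim, n_features, num_classes = 2, 50, 3
out_size = n_features * num_classes

# Phase 1: shallow linear branch co-trained with the latent codes.
phase1_decoder = nn.Linear(latent_dim, out_size)

# Phases 2/3: deeper MLP branch with batch norm and dropout (illustrative sizes).
phase23_decoder = nn.Sequential(
    nn.Linear(latent_dim, 64), nn.BatchNorm1d(64), nn.Dropout(0.2), nn.ReLU(),
    nn.Linear(64, 128), nn.BatchNorm1d(128), nn.Dropout(0.2), nn.ReLU(),
    nn.Linear(128, out_size),
)

def decode(z: torch.Tensor, phase: int) -> torch.Tensor:
    """Route the latent batch through the branch for the current UBP phase."""
    if phase == 1:
        logits = phase1_decoder(z)
    elif phase in {2, 3}:
        logits = phase23_decoder(z)
    else:
        raise ValueError(f"Invalid phase: {phase}. Expected 1, 2, or 3.")
    return logits.view(-1, n_features, num_classes)

z = torch.randn(16, latent_dim)
for phase in (1, 2, 3):
    print(phase, decode(z, phase).shape)  # torch.Size([16, 50, 3]) in every phase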