pg-sui 1.6.14.dev9__py3-none-any.whl → 1.6.16a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pg_sui-1.6.16a3.dist-info/METADATA +292 -0
- {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.6.16a3.dist-info}/RECORD +14 -14
- pgsui/_version.py +2 -2
- pgsui/cli.py +14 -1
- pgsui/data_processing/containers.py +116 -104
- pgsui/impute/unsupervised/base.py +4 -1
- pgsui/impute/unsupervised/imputers/autoencoder.py +111 -35
- pgsui/impute/unsupervised/imputers/nlpca.py +239 -127
- pgsui/impute/unsupervised/imputers/ubp.py +135 -50
- pgsui/impute/unsupervised/imputers/vae.py +134 -46
- pg_sui-1.6.14.dev9.dist-info/METADATA +0 -344
- {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.6.16a3.dist-info}/WHEEL +0 -0
- {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.6.16a3.dist-info}/entry_points.txt +0 -0
- {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.6.16a3.dist-info}/licenses/LICENSE +0 -0
- {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.6.16a3.dist-info}/top_level.txt +0 -0
pgsui/impute/unsupervised/imputers/vae.py

```diff
@@ -7,6 +7,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import optuna
 import torch
+import torch.nn.functional as F
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import train_test_split
 from snpio.analysis.genotype_encoder import GenotypeEncoder
```
```diff
@@ -152,6 +153,7 @@ class ImputeVAE(BaseNNImputer):
         self.verbose = self.cfg.io.verbose
         self.debug = self.cfg.io.debug
         self.rng = np.random.default_rng(self.seed)
+        self.pos_weights_: torch.Tensor | None = None
 
         # Simulated-missing controls (config defaults + ctor overrides)
         sim_cfg = getattr(self.cfg, "sim", None)
```
```diff
@@ -300,9 +302,10 @@ class ImputeVAE(BaseNNImputer):
         )
         self.ploidy = 1 if self.is_haploid else 2
         self.num_classes_ = 2 if self.is_haploid else 3
+        self.output_classes_ = 2
         self.logger.info(
             f"Data is {'haploid' if self.is_haploid else 'diploid'}; "
-            f"using {self.num_classes_} classes."
+            f"using {self.num_classes_} classes for scoring and {self.output_classes_} output channels."
         )
 
         if self.is_haploid:
```
```diff
@@ -314,7 +317,7 @@ class ImputeVAE(BaseNNImputer):
         # Model params (decoder outputs L*K logits)
         self.model_params = {
             "n_features": self.num_features_,
-            "num_classes": self.num_classes_,
+            "num_classes": self.output_classes_,
             "latent_dim": self.latent_dim,
             "dropout_rate": self.dropout_rate,
             "activation": self.activation,
```
```diff
@@ -352,6 +355,10 @@ class ImputeVAE(BaseNNImputer):
         self.class_weights_ = self._normalize_class_weights(
             self._class_weights_from_zygosity(self.X_train_)
         )
+        if not self.is_haploid:
+            self.pos_weights_ = self._compute_pos_weights(self.X_train_)
+        else:
+            self.pos_weights_ = None
 
         # DataLoader
         train_loader = self._get_data_loader(self.X_train_)
```
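As context for the new `pos_weights_` wiring: a minimal standalone sketch (toy 0/1/2/-1 genotype matrix, not from the package) of the neg/pos ratio that `_compute_pos_weights` later feeds into `binary_cross_entropy_with_logits` via `pos_weight`:

```python
import numpy as np

# Toy 012-encoded genotypes: rows = samples, cols = loci; -1 = missing.
X = np.array([[0, 1, 2, -1],
              [0, 0, 1,  2],
              [1, 2, 2,  0]])

# Mirrors _compute_pos_weights: the REF channel is "on" for genotypes 0/1,
# the ALT channel for 1/2; each channel's weight is its neg/pos count ratio.
ref_pos = np.count_nonzero((X == 0) | (X == 1))  # 7
alt_pos = np.count_nonzero((X == 2) | (X == 1))  # 7
total_valid = np.count_nonzero(X != -1)          # 11
pos = np.maximum(np.array([ref_pos, alt_pos], dtype=np.float32), 1.0)
neg = np.maximum(total_valid - pos, 1.0)
print(neg / pos)  # [0.571 0.571]; rarer "on" channels would get weights > 1
```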
```diff
@@ -370,7 +377,7 @@ class ImputeVAE(BaseNNImputer):
             X_val=self.X_val_,
             params=self.best_params_,
             prune_metric=self.tune_metric,
-            prune_warmup_epochs=
+            prune_warmup_epochs=10,
             eval_interval=1,
             eval_requires_latents=False,  # no latent refinement for eval
             eval_latent_steps=0,
```
```diff
@@ -480,7 +487,7 @@ class ImputeVAE(BaseNNImputer):
         X_val: np.ndarray | None = None,
         params: dict | None = None,
         prune_metric: str | None = None,  # "f1" | "accuracy" | "pr_macro"
-        prune_warmup_epochs: int =
+        prune_warmup_epochs: int = 10,
         eval_interval: int = 1,
         eval_requires_latents: bool = False,  # VAE: no latent eval refinement
         eval_latent_steps: int = 0,
```
```diff
@@ -562,7 +569,7 @@ class ImputeVAE(BaseNNImputer):
         X_val: np.ndarray | None = None,
         params: dict | None = None,
         prune_metric: str | None = None,
-        prune_warmup_epochs: int =
+        prune_warmup_epochs: int = 10,
         eval_interval: int = 1,
         eval_requires_latents: bool = False,
         eval_latent_steps: int = 0,
```
```diff
@@ -755,14 +762,14 @@ class ImputeVAE(BaseNNImputer):
             for _, y_batch in loader:
                 optimizer.zero_grad(set_to_none=True)
 
-                # targets: (B, L) int in {0,1,2,-1}
                 y_int = y_batch.to(self.device, non_blocking=True).long()
 
-
-
+                if self.is_haploid:
+                    x_in = self._one_hot_encode_012(y_int)  # (B, L, 2)
+                else:
+                    x_in = self._encode_multilabel_inputs(y_int)  # (B, L, 2)
 
-
-                out = model(x_ohe)
+                out = model(x_in)
                 if isinstance(out, (list, tuple)):
                     recon_logits, mu, logvar = out[0], out[1], out[2]
                 else:
```
```diff
@@ -780,15 +787,30 @@ class ImputeVAE(BaseNNImputer):
                 beta = float(getattr(model, "beta", getattr(self, "kl_beta_final", 0.0)))
                 gamma = max(0.0, min(gamma, 10.0))
 
-
-
-
-
-
-
-
-
-
+                if self.is_haploid:
+                    loss = compute_vae_loss(
+                        recon_logits=recon_logits,
+                        targets=y_int,
+                        mu=mu,
+                        logvar=logvar,
+                        class_weights=class_weights,
+                        gamma=gamma,
+                        beta=beta,
+                    )
+                else:
+                    targets = self._multi_hot_targets(y_int)
+                    pos_w = getattr(self, "pos_weights_", None)
+                    bce = F.binary_cross_entropy_with_logits(
+                        recon_logits, targets, pos_weight=pos_w, reduction="none"
+                    )
+                    mask = (y_int != -1).unsqueeze(-1).float()
+                    recon_loss = (bce * mask).sum() / mask.sum().clamp_min(1e-8)
+                    kl = (
+                        -0.5
+                        * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
+                        / (y_int.shape[0] + 1e-8)
+                    )
+                    loss = recon_loss + beta * kl
 
                 if l1_penalty > 0:
                     l1 = torch.zeros((), device=self.device)
```
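A minimal sketch of the new diploid loss branch on toy tensors (shapes and the inlined target logic here are assumptions mirroring the diff, not package code): multi-hot targets, missing sites masked out of the BCE, plus the batch-averaged KL term scaled by beta:

```python
import torch
import torch.nn.functional as F

B, L = 2, 3
y_int = torch.tensor([[0, 1, -1], [2, 2, 0]])  # 012 genotypes, -1 = missing

# Multi-hot targets as in _multi_hot_targets: REF on for 0/1, ALT on for 1/2.
targets = torch.zeros(B, L, 2)
valid = y_int != -1
targets[valid & (y_int != 2), 0] = 1.0
targets[valid & (y_int != 0), 1] = 1.0

recon_logits = torch.randn(B, L, 2, requires_grad=True)
mu, logvar = torch.randn(B, 4), torch.randn(B, 4)
beta = 1.0

bce = F.binary_cross_entropy_with_logits(recon_logits, targets, reduction="none")
mask = valid.unsqueeze(-1).float()  # drop missing sites from the reconstruction loss
recon_loss = (bce * mask).sum() / mask.sum().clamp_min(1e-8)
kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / (B + 1e-8)
loss = recon_loss + beta * kl
loss.backward()
```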
```diff
@@ -845,11 +867,25 @@ class ImputeVAE(BaseNNImputer):
         with torch.no_grad():
             X_tensor = torch.from_numpy(X) if isinstance(X, np.ndarray) else X
             X_tensor = X_tensor.to(self.device).long()
-
-
-
-
-
+            if self.is_haploid:
+                x_ohe = self._one_hot_encode_012(X_tensor)
+                outputs = model(x_ohe)
+                logits = outputs[0].view(-1, self.num_features_, self.output_classes_)
+                probas = torch.softmax(logits, dim=-1)
+                labels = torch.argmax(probas, dim=-1)
+            else:
+                x_in = self._encode_multilabel_inputs(X_tensor)
+                outputs = model(x_in)
+                logits = outputs[0].view(-1, self.num_features_, self.output_classes_)
+                probas2 = torch.sigmoid(logits)
+                p_ref = probas2[..., 0]
+                p_alt = probas2[..., 1]
+                p_het = p_ref * p_alt
+                p_ref_only = p_ref * (1 - p_alt)
+                p_alt_only = p_alt * (1 - p_ref)
+                probas = torch.stack([p_ref_only, p_het, p_alt_only], dim=-1)
+                probas = probas / probas.sum(dim=-1, keepdim=True).clamp_min(1e-8)
+                labels = torch.argmax(probas, dim=-1)
 
             if return_proba:
                 return labels.cpu().numpy(), probas.cpu().numpy()
```
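The diploid prediction path composes three genotype probabilities from the two channel sigmoids and renormalizes them; a worked example with assumed channel values p_ref = 0.9, p_alt = 0.6:

```python
import torch

p_ref, p_alt = torch.tensor(0.9), torch.tensor(0.6)

p_het      = p_ref * p_alt        # 0.54: both alleles present (heterozygote)
p_ref_only = p_ref * (1 - p_alt)  # 0.36: homozygous REF
p_alt_only = p_alt * (1 - p_ref)  # 0.06: homozygous ALT
probas = torch.stack([p_ref_only, p_het, p_alt_only])
# Renormalize; the leftover (1-p_ref)*(1-p_alt) = 0.04 "neither allele" mass is dropped.
probas = probas / probas.sum().clamp_min(1e-8)
print(probas)  # tensor([0.3750, 0.5625, 0.0625]) -> argmax = 1 (heterozygote)
```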
```diff
@@ -1047,12 +1083,21 @@ class ImputeVAE(BaseNNImputer):
         try:
             params = self._sample_hyperparameters(trial)
 
-
-
+            # Use tune subsets when available (tune_fast)
+            X_train = getattr(self, "_tune_X_train", None)
+            X_val = getattr(self, "_tune_X_test", None)
+            if X_train is None or X_val is None:
+                X_train = getattr(self, "X_train_", self.ground_truth_[self.train_idx_])
+                X_val = getattr(self, "X_val_", self.ground_truth_[self.test_idx_])
 
             class_weights = self._normalize_class_weights(
                 self._class_weights_from_zygosity(X_train)
             )
+            # Pos weights for diploid multilabel BCE during tuning
+            if not self.is_haploid:
+                self.pos_weights_ = self._compute_pos_weights(X_train)
+            else:
+                self.pos_weights_ = None
             train_loader = self._get_data_loader(X_train)
 
             model = self.build_model(self.Model, params["model_params"])
```
```diff
@@ -1073,7 +1118,7 @@ class ImputeVAE(BaseNNImputer):
                 X_val=X_val,
                 params=params,
                 prune_metric=self.tune_metric,
-                prune_warmup_epochs=
+                prune_warmup_epochs=10,
                 eval_interval=self.tune_eval_interval,
                 eval_requires_latents=False,
                 eval_latent_steps=0,
```
```diff
@@ -1116,27 +1161,32 @@ class ImputeVAE(BaseNNImputer):
             Dict[str, int | float | str]: Sampled hyperparameters.
         """
         params = {
-            "latent_dim": trial.suggest_int("latent_dim",
-            "lr": trial.suggest_float("learning_rate",
-            "dropout_rate": trial.suggest_float("dropout_rate", 0.0, 0.
-            "num_hidden_layers": trial.suggest_int("num_hidden_layers", 1,
+            "latent_dim": trial.suggest_int("latent_dim", 4, 16, step=2),
+            "lr": trial.suggest_float("learning_rate", 3e-4, 1e-3, log=True),
+            "dropout_rate": trial.suggest_float("dropout_rate", 0.0, 0.30, step=0.05),
+            "num_hidden_layers": trial.suggest_int("num_hidden_layers", 1, 6),
             "activation": trial.suggest_categorical(
-                "activation", ["relu", "elu", "selu"]
+                "activation", ["relu", "elu", "selu", "leaky_relu"]
             ),
-            "l1_penalty": trial.suggest_float("l1_penalty", 1e-
+            "l1_penalty": trial.suggest_float("l1_penalty", 1e-6, 1e-3, log=True),
             "layer_scaling_factor": trial.suggest_float(
-                "layer_scaling_factor", 2.0,
+                "layer_scaling_factor", 2.0, 4.0, step=0.5
             ),
             "layer_schedule": trial.suggest_categorical(
-                "layer_schedule", ["pyramid", "
+                "layer_schedule", ["pyramid", "linear"]
             ),
             # VAE-specific β (final value after anneal)
-            "beta": trial.suggest_float("beta", 0.
+            "beta": trial.suggest_float("beta", 0.5, 2.0, step=0.5),
             # focal gamma (if used in VAE recon CE)
-            "gamma": trial.suggest_float("gamma", 0.0, 5
+            "gamma": trial.suggest_float("gamma", 0.5, 3.0, step=0.5),
         }
 
-
+        use_n_features = (
+            self._tune_num_features
+            if (self.tune and self.tune_fast and hasattr(self, "_tune_num_features"))
+            else self.num_features_
+        )
+        input_dim = use_n_features * self.output_classes_
         hidden_layer_sizes = self._compute_hidden_layer_sizes(
             n_inputs=input_dim,
             n_outputs=input_dim,
```
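For orientation, a minimal Optuna sketch using the same `suggest_*` calls as the updated search space above; the objective body is a stand-in (the real objective trains the VAE and returns `tune_metric`):

```python
import optuna

def objective(trial: optuna.Trial) -> float:
    latent_dim = trial.suggest_int("latent_dim", 4, 16, step=2)
    lr = trial.suggest_float("learning_rate", 3e-4, 1e-3, log=True)
    dropout = trial.suggest_float("dropout_rate", 0.0, 0.30, step=0.05)
    beta = trial.suggest_float("beta", 0.5, 2.0, step=0.5)
    gamma = trial.suggest_float("gamma", 0.5, 3.0, step=0.5)
    # Stand-in score so the sketch runs end to end.
    return lr * latent_dim - dropout + beta - gamma

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
print(study.best_params)
```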
```diff
@@ -1150,8 +1200,8 @@ class ImputeVAE(BaseNNImputer):
         hidden_only = [hidden_layer_sizes[0]] + hidden_layer_sizes[1:-1]
 
         params["model_params"] = {
-            "n_features": self.num_features_,
-            "num_classes": self.num_classes_,
+            "n_features": use_n_features,
+            "num_classes": self.output_classes_,
             "latent_dim": params["latent_dim"],
             "dropout_rate": params["dropout_rate"],
             "hidden_layer_sizes": hidden_only,
```
```diff
@@ -1182,8 +1232,8 @@ class ImputeVAE(BaseNNImputer):
         self.gamma = best_params.get("gamma", self.gamma)
 
         hidden_layer_sizes = self._compute_hidden_layer_sizes(
-            n_inputs=self.num_features_ * self.num_classes_,
-            n_outputs=self.num_features_ * self.num_classes_,
+            n_inputs=self.num_features_ * self.output_classes_,
+            n_outputs=self.num_features_ * self.output_classes_,
             n_samples=len(self.train_idx_),
             n_hidden=best_params["num_hidden_layers"],
             alpha=best_params["layer_scaling_factor"],
```
```diff
@@ -1197,7 +1247,7 @@ class ImputeVAE(BaseNNImputer):
             "hidden_layer_sizes": hidden_only,
             "dropout_rate": self.dropout_rate,
             "activation": self.activation,
-            "num_classes": self.num_classes_,
+            "num_classes": self.output_classes_,
             "beta": self.kl_beta_final,
             "gamma": self.gamma,
         }
```
```diff
@@ -1209,8 +1259,8 @@ class ImputeVAE(BaseNNImputer):
             Dict[str, int | float | str | list]: VAE model parameters.
         """
         hidden_layer_sizes = self._compute_hidden_layer_sizes(
-            n_inputs=self.num_features_ * self.num_classes_,
-            n_outputs=self.num_features_ * self.num_classes_,
+            n_inputs=self.num_features_ * self.output_classes_,
+            n_outputs=self.num_features_ * self.output_classes_,
             n_samples=len(self.ground_truth_),
             n_hidden=self.num_hidden_layers,
             alpha=self.layer_scaling_factor,
```
```diff
@@ -1222,7 +1272,45 @@ class ImputeVAE(BaseNNImputer):
             "hidden_layer_sizes": hidden_layer_sizes,
             "dropout_rate": self.dropout_rate,
             "activation": self.activation,
-            "num_classes": self.num_classes_,
+            "num_classes": self.output_classes_,
             "beta": self.kl_beta_final,
             "gamma": self.gamma,
         }
+
+    def _encode_multilabel_inputs(self, y: torch.Tensor) -> torch.Tensor:
+        """Two-channel multi-hot for diploid: REF-only, ALT-only; HET sets both."""
+        if self.is_haploid:
+            return self._one_hot_encode_012(y)
+        y = y.to(self.device)
+        shape = y.shape + (2,)
+        out = torch.zeros(shape, device=self.device, dtype=torch.float32)
+        valid = y != -1
+        ref_mask = valid & (y != 2)
+        alt_mask = valid & (y != 0)
+        out[ref_mask, 0] = 1.0
+        out[alt_mask, 1] = 1.0
+        return out
+
+    def _multi_hot_targets(self, y: torch.Tensor) -> torch.Tensor:
+        """Targets aligned with _encode_multilabel_inputs for diploid training."""
+        if self.is_haploid:
+            raise RuntimeError("_multi_hot_targets called for haploid data.")
+        y = y.to(self.device)
+        out = torch.zeros(y.shape + (2,), device=self.device, dtype=torch.float32)
+        valid = y != -1
+        ref_mask = valid & (y != 2)
+        alt_mask = valid & (y != 0)
+        out[ref_mask, 0] = 1.0
+        out[alt_mask, 1] = 1.0
+        return out
+
+    def _compute_pos_weights(self, X: np.ndarray) -> torch.Tensor:
+        """Balance REF/ALT channels for multilabel BCE."""
+        ref_pos = np.count_nonzero((X == 0) | (X == 1))
+        alt_pos = np.count_nonzero((X == 2) | (X == 1))
+        total_valid = np.count_nonzero(X != -1)
+        pos_counts = np.array([ref_pos, alt_pos], dtype=np.float32)
+        neg_counts = np.maximum(total_valid - pos_counts, 1.0)
+        pos_counts = np.maximum(pos_counts, 1.0)
+        weights = neg_counts / pos_counts
+        return torch.tensor(weights, device=self.device, dtype=torch.float32)
```
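The helpers' encoding rule, shown standalone (a sketch of the same masks as `_encode_multilabel_inputs`, outside the class): 0 → [1, 0], 1 → [1, 1], 2 → [0, 1], and missing (-1) → [0, 0]:

```python
import torch

y = torch.tensor([[0, 1, 2, -1]])  # 012 genotypes, -1 = missing
out = torch.zeros(y.shape + (2,))
valid = y != -1
out[valid & (y != 2), 0] = 1.0     # REF channel: genotypes 0 and 1
out[valid & (y != 0), 1] = 1.0     # ALT channel: genotypes 1 and 2
print(out[0])
# tensor([[1., 0.],    # 0 (hom-REF)
#         [1., 1.],    # 1 (het): both channels set
#         [0., 1.],    # 2 (hom-ALT)
#         [0., 0.]])   # -1 (missing): neither
```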
pg_sui-1.6.14.dev9.dist-info/METADATA (removed; superseded by pg_sui-1.6.16a3.dist-info/METADATA)

```diff
@@ -1,344 +0,0 @@
-Metadata-Version: 2.4
-Name: pg-sui
-Version: 1.6.14.dev9
-Summary: Python machine and deep learning API to impute missing genotypes
-Author-email: "Drs. Bradley T. Martin and Tyler K. Chafin" <evobio721@gmail.com>
-Maintainer-email: "Dr. Bradley T. Martin" <evobio721@gmail.com>
-License: GNU General Public License v3 (GPLv3)
-Project-URL: Homepage, https://github.com/btmartin721/PG-SUI
-Project-URL: Documentation, https://pg-sui.readthedocs.io/en/latest/
-Project-URL: Source, https://github.com/btmartin721/PG-SUI.git
-Project-URL: BugTracker, https://github.com/btmartin721/PG-SUI/issues
-Keywords: impute,imputation,AI,deep learning,machine learning,neural network,vae,autoencoder,ubp,nlpca,population genetics,unsupervised,supervised,bioinformatics,snp,genomics,genotype,missing data,data analysis,data science,statistics,data visualization,python
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Development Status :: 4 - Beta
-Classifier: Environment :: Console
-Classifier: Intended Audience :: Science/Research
-Classifier: Intended Audience :: Developers
-Classifier: Intended Audience :: Education
-Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
-Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Classifier: Topic :: Scientific/Engineering :: Information Analysis
-Classifier: Topic :: Scientific/Engineering :: Visualization
-Classifier: Operating System :: MacOS
-Classifier: Operating System :: MacOS :: MacOS X
-Classifier: Operating System :: Unix
-Classifier: Operating System :: POSIX
-Classifier: Natural Language :: English
-Requires-Python: >=3.11
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: matplotlib
-Requires-Dist: numpy>=2.1
-Requires-Dist: pandas>=2.2.2
-Requires-Dist: scikit-learn>=1.4
-Requires-Dist: scipy
-Requires-Dist: seaborn
-Requires-Dist: torch
-Requires-Dist: tqdm
-Requires-Dist: toytree
-Requires-Dist: optuna
-Requires-Dist: rich
-Requires-Dist: rich[jupyter]
-Requires-Dist: snpio
-Provides-Extra: intel
-Requires-Dist: scikit-learn-intelex; extra == "intel"
-Provides-Extra: docs
-Requires-Dist: sphinx; extra == "docs"
-Requires-Dist: sphinx-rtd-theme; extra == "docs"
-Requires-Dist: sphinx_autodoc_typehints; extra == "docs"
-Requires-Dist: sphinxcontrib-napoleon; extra == "docs"
-Requires-Dist: sphinxcontrib-programoutput; extra == "docs"
-Provides-Extra: dev
-Requires-Dist: twine; extra == "dev"
-Requires-Dist: wheel; extra == "dev"
-Requires-Dist: pytest; extra == "dev"
-Requires-Dist: sphinx; extra == "dev"
-Requires-Dist: sphinx-rtd-theme; extra == "dev"
-Requires-Dist: sphinx-autodoc-typehints; extra == "dev"
-Requires-Dist: sphinxcontrib-napoleon; extra == "dev"
-Requires-Dist: sphinxcontrib-programoutput; extra == "dev"
-Requires-Dist: requests; extra == "dev"
-Provides-Extra: optional
-Requires-Dist: PyObjC; extra == "optional"
-Provides-Extra: gui
-Requires-Dist: fastapi>=0.110; extra == "gui"
-Requires-Dist: uvicorn[standard]>=0.23; extra == "gui"
-Dynamic: license-file
-
-
-<img src="https://github.com/btmartin721/PG-SUI/blob/master/img/pgsui-logo-faded.png" alt="PG-SUI Logo" width="50%" height="50%">
-
-
-# PG-SUI
-
-Population Genomic Supervised and Unsupervised Imputation.
-
-## About PG-SUI
-
-PG-SUI is a Python 3 API that uses machine learning to impute missing values from population genomic SNP data. Several supervised and unsupervised machine learning algorithms are available to impute missing data, as well as several useful non-machine-learning imputers.
-
-Below is some general information and a basic tutorial. For more detailed information, see our [API Documentation](https://pg-sui.readthedocs.io/en/latest/).
-
-### Supervised Imputation Methods
-
-Supervised methods use scikit-learn's IterativeImputer, which is based on the MICE (Multivariate Imputation by Chained Equations) algorithm ([1](#1)): it iterates over each SNP site (i.e., feature), using the N nearest-neighbor features to inform the imputation. The number of nearest features is user-adjustable. IterativeImputer currently works with any of the following scikit-learn classifiers:
-
-+ K-Nearest Neighbors
-+ Random Forest
-+ XGBoost
-
-See the scikit-learn documentation (https://scikit-learn.org) for more information on IterativeImputer and each of the classifiers.
-
-### Unsupervised Imputation Methods
-
-Unsupervised imputers include four custom neural network models:
-
-+ Variational Autoencoder (VAE) ([2](#2))
-+ Standard Autoencoder (SAE) ([3](#3))
-+ Non-linear Principal Component Analysis (NLPCA) ([4](#4))
-+ Unsupervised Backpropagation (UBP) ([5](#5))
-
-VAE models train themselves to reconstruct their input (i.e., the genotypes). To use a VAE for imputation, the missing values are masked and the model is trained to reconstruct only the known values. Once trained, the model predicts the missing values.
-
-SAE is a standard autoencoder that is likewise trained to reproduce its input. As with the VAE, missing values are masked and the model is trained only on known values. Predictions are then made for the missing values.
-
-NLPCA initializes random, reduced-dimensional input, then trains itself by using the known values (i.e., genotypes) as targets and refining the random input until it accurately predicts the genotype output. The trained model can then predict the missing values.
-
-UBP is an extension of NLPCA that runs over three phases. Phase 1 refines the randomly generated, reduced-dimensional input in a single-layer perceptron to obtain good initial input values. Phase 2 feeds the refined reduced-dimensional input from Phase 1 into a multi-layer perceptron (MLP), but refines only the network weights. Phase 3 uses an MLP to refine both the weights and the reduced-dimensional input. Once the model is trained, it can be used to predict the missing values.
-
-### Non-Machine Learning Methods
-
-We also include several non-machine learning options for imputing missing data, including:
-
-+ Per-population mode per SNP site
-+ Global mode per SNP site
-+ Using a phylogeny as input to inform the imputation
-+ Matrix factorization
-
-These four "simple" imputation methods can be used as standalone imputers, as the initial imputation strategy for IterativeImputer (at least one method must be chosen), and to validate the accuracy of both IterativeImputer and the neural network models.
-
-## Installing PG-SUI
-
-The easiest way to install PG-SUI is with pip:
-
-```
-pip install pg-sui
-```
-
-If you have an Intel CPU and want to use the scikit-learn-intelex package to speed up scikit-learn computations, you can do:
-
-```
-pip install pg-sui[intel]
-```
-
-### Optional GUI (Electron)
-
-PG-SUI ships an Electron GUI wrapper around the Python CLI.
-
-1. Install the Python-side extras (FastAPI/uvicorn helper) if you want to serve from Python:
-   `pip install pg-sui[gui]`
-2. Install Node.js (https://nodejs.org) and fetch the app dependencies once:
-   `pgsui-gui-setup`
-3. Launch the GUI:
-   `pgsui-gui`
-
-The GUI shells out to the same CLI underneath, so presets/overrides and YAML configs behave identically.
-
-## Manual Installation
-
-### Dependencies
-
-+ python >= 3.11
-+ pandas
-+ numpy
-+ scipy
-+ matplotlib
-+ seaborn
-+ plotly
-+ kaleido
-+ tqdm
-+ toytree
-+ scikit-learn
-+ xgboost
-+ snpio
-+ optuna
-
-#### Installation troubleshooting
-
-##### "use_2to3 is invalid" error
-
-Users running setuptools v58 may encounter this error during the last step of installation, using pip to install sklearn-genetic-opt:
-
-```
-ERROR: Command errored out with exit status 1:
-command: /Users/tyler/miniforge3/envs/pg-sui/bin/python3.8 -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/private/var/folders/6x/t6g4kn711z5cxmc2_tvq0mlw0000gn/T/pip-install-6y5g_mhs/deap_1d32f65d60a44056bd7031f3aad44571/setup.py'"'"'; __file__='"'"'/private/var/folders/6x/t6g4kn711z5cxmc2_tvq0mlw0000gn/T/pip-install-6y5g_mhs/deap_1d32f65d60a44056bd7031f3aad44571/setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base /private/var/folders/6x/t6g4kn711z5cxmc2_tvq0mlw0000gn/T/pip-pip-egg-info-7hg3hcq2
-cwd: /private/var/folders/6x/t6g4kn711z5cxmc2_tvq0mlw0000gn/T/pip-install-6y5g_mhs/deap_1d32f65d60a44056bd7031f3aad44571/
-Complete output (1 lines):
-error in deap setup command: use_2to3 is invalid.
-```
-
-This occurs during the installation of DEAP, one of the dependencies of sklearn-genetic-opt. As a workaround, first downgrade setuptools, then proceed with the installation as normal:
-```
-pip install setuptools==57
-pip install sklearn-genetic-opt[all]
-
-```
-
-##### Mac ARM architecture
-
-PG-SUI has been tested on the new Mac M1 chips and is working fine, but some changes to the installation process were necessary as of 9-December-21. Installation was successful using the following:
-
-```
-### Install Miniforge3 instead of Miniconda3
-### Download: https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
-bash ~/Downloads/Miniforge3-MacOSX-arm64.sh
-
-# Close and re-open terminal #
-
-# Create and activate conda environment
-conda create -n pg-sui python
-
-# Activate environment
-conda activate pg-sui
-
-# Install packages
-conda install -c conda-forge matplotlib seaborn jupyterlab scikit-learn tqdm pandas numpy scipy xgboost lightgbm tensorflow keras sklearn-genetic-opt toytree
-conda install -c bioconda pyvolve
-
-# Downgrade setuptools (may or may not be necessary)
-pip install setuptools==57
-
-# Install sklearn-genetic-opt and mlflow
-pip install sklearn-genetic-opt mlflow
-
-```
-
-Any other problems we encounter while testing on the Mac ARM architecture will be documented here. Note that the step installing scikit-learn-intelex was skipped: PG-SUI automatically detects the CPU architecture you are running and forgoes importing this package (which will only work on Intel processors).
-
-## Input Data
-
-You can read your input files as a GenotypeData object from the [SNPio](https://snpio.readthedocs.io/en/latest/) package:
-
-```
-
-# Import snpio. Automatically installed with pgsui when using pip.
-from snpio import GenotypeData
-
-# Read in PHYLIP, VCF, or STRUCTURE-formatted alignments.
-data = GenotypeData(
-    filename="example_data/phylip_files/phylogen_nomx.u.snps.phy",
-    popmapfile="example_data/popmaps/phylogen_nomx.popmap",
-    force_popmap=True,
-    filetype="auto",
-    qmatrix_iqtree="example_data/trees/test.qmat",
-    siterates_iqtree="example_data/trees/test.rate",
-    guidetree="example_data/trees/test.tre",
-    include_pops=["EA", "TT", "GU"],  # Only include these populations. There's also an exclude_pops option that will exclude the provided populations.
-)
-```
-
-## Supported Imputation Methods
-
-There are numerous supported algorithms to impute missing data. Each one can be run by calling the corresponding class. You must provide a GenotypeData instance as the first positional argument.
-
-You can import all the supported methods with:
-
-```
-from pgsui import *
-```
-
-Or you can import them one at a time.
-
-```
-from pgsui import ImputeVAE
-```
-
-### Supervised Imputers
-
-Various supervised imputation options are supported:
-
-```
-# Supervised IterativeImputer classifiers
-knn = ImputeKNN(data)  # K-Nearest Neighbors
-rf = ImputeRandomForest(data)  # Random Forest or Extra Trees
-xgb = ImputeXGBoost(data)  # XGBoost
-```
-
-### Non-machine learning methods
-
-Use a phylogeny to inform imputation:
-
-```
-phylo = ImputePhylo(data)
-```
-
-Use by-population or global allele frequency to inform imputation:
-
-```
-pop_af = ImputeAlleleFreq(data, by_populations=True)
-global_af = ImputeAlleleFreq(data, by_populations=False)
-ref_af = ImputeRefAllele(data)
-```
-
-Matrix factorization:
-
-```
-mf = ImputeMF(*args)  # Matrix factorization
-```
-
-### Unsupervised Neural Networks
-
-``` python
-vae = ImputeVAE(data)  # Variational autoencoder
-nlpca = ImputeNLPCA(data)  # Nonlinear PCA
-ubp = ImputeUBP(data)  # Unsupervised backpropagation
-sae = ImputeStandardAutoEncoder(data)  # Standard autoencoder
-```
-
-## Command-Line Interface
-
-Run the PG-SUI CLI with ``pg-sui`` (installed alongside the library). The CLI follows the same precedence model as the Python API:
-
-``code defaults < preset (--preset) < YAML (--config) < explicit CLI flags < --set key=value``.
-
-Recent releases add explicit switches for the simulated-missingness workflow shared by the neural and supervised models:
-
-- ``--sim-strategy`` selects one of ``random``, ``random_weighted``, ``random_weighted_inv``, ``nonrandom``, ``nonrandom_weighted``.
-- ``--sim-prop`` sets the proportion of observed calls to temporarily mask when building the evaluation set.
-- ``--simulate-missing`` disables simulated masking entirely (store-false flag); omit it to inherit preset/YAML defaults, or re-enable via ``--set sim.simulate_missing=True``.
-
-Example:
-
-```
-pg-sui \
-    --vcf data.vcf.gz \
-    --popmap pops.popmap \
-    --models ImputeUBP ImputeVAE \
-    --preset balanced \
-    --sim-strategy random_weighted_inv \
-    --sim-prop 0.25 \
-    --set io.prefix=vae_vs_ubp
-```
-
-CLI overrides cascade into every selected model, so a single invocation can evaluate multiple imputers with a consistent simulation strategy and output prefix.
-
-## To-Dos
-
-- Simulations
-- Documentation
-
-## References:
-
-<a name="1">1. </a>Stef van Buuren, Karin Groothuis-Oudshoorn (2011). mice: Multivariate Imputation by Chained Equations in R. Journal of Statistical Software, 45, 1-67.
-
-<a name="2">2. </a>Kingma, D. P., & Welling, M. (2013). Auto-encoding variational Bayes. In: Proceedings of the International Conference on Learning Representations (ICLR). arXiv:1312.6114 [stat.ML].
-
-<a name="3">3. </a>Hinton, G. E., & Salakhutdinov, R. R. (2006). Reducing the dimensionality of data with neural networks. Science, 313(5786), 504-507.
-
-<a name="4">4. </a>Scholz, M., Kaplan, F., Guy, C. L., Kopka, J., & Selbig, J. (2005). Non-linear PCA: a missing data approach. Bioinformatics, 21(20), 3887-3895.
-
-<a name="5">5. </a>Gashler, M. S., Smith, M. R., Morris, R., & Martinez, T. (2016). Missing value imputation with unsupervised backpropagation. Computational Intelligence, 32(2), 196-215.
```