pg-sui 1.6.14.dev9__py3-none-any.whl → 1.6.16a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ import matplotlib.pyplot as plt
  import numpy as np
  import optuna
  import torch
+ import torch.nn.functional as F
  from sklearn.exceptions import NotFittedError
  from sklearn.model_selection import train_test_split
  from snpio.analysis.genotype_encoder import GenotypeEncoder
@@ -152,6 +153,7 @@ class ImputeVAE(BaseNNImputer):
  self.verbose = self.cfg.io.verbose
  self.debug = self.cfg.io.debug
  self.rng = np.random.default_rng(self.seed)
+ self.pos_weights_: torch.Tensor | None = None

  # Simulated-missing controls (config defaults + ctor overrides)
  sim_cfg = getattr(self.cfg, "sim", None)
@@ -300,9 +302,10 @@ class ImputeVAE(BaseNNImputer):
  )
  self.ploidy = 1 if self.is_haploid else 2
  self.num_classes_ = 2 if self.is_haploid else 3
+ self.output_classes_ = 2
  self.logger.info(
      f"Data is {'haploid' if self.is_haploid else 'diploid'}; "
-     f"using {self.num_classes_} classes."
+     f"using {self.num_classes_} classes for scoring and {self.output_classes_} output channels."
  )

  if self.is_haploid:
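The split introduced just above is easy to miss: diploid data keeps three *scoring* classes, but the decoder now emits only two *channels*. A restatement in miniature (illustrative, not library code):

```python
# Scoring vs. output dimensionality for diploid data (values as set above).
is_haploid = False
num_classes = 2 if is_haploid else 3  # scored genotype states: REF-only, HET, ALT-only
output_classes = 2                    # decoder channels: REF present, ALT present
# A heterozygote (scored class 1) turns BOTH channels on: [1.0, 1.0].
```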
@@ -314,7 +317,7 @@ class ImputeVAE(BaseNNImputer):
  # Model params (decoder outputs L*K logits)
  self.model_params = {
      "n_features": self.num_features_,
-     "num_classes": self.num_classes_,
+     "num_classes": self.output_classes_,
      "latent_dim": self.latent_dim,
      "dropout_rate": self.dropout_rate,
      "activation": self.activation,
@@ -352,6 +355,10 @@ class ImputeVAE(BaseNNImputer):
  self.class_weights_ = self._normalize_class_weights(
      self._class_weights_from_zygosity(self.X_train_)
  )
+ if not self.is_haploid:
+     self.pos_weights_ = self._compute_pos_weights(self.X_train_)
+ else:
+     self.pos_weights_ = None

  # DataLoader
  train_loader = self._get_data_loader(self.X_train_)
@@ -370,7 +377,7 @@ class ImputeVAE(BaseNNImputer):
  X_val=self.X_val_,
  params=self.best_params_,
  prune_metric=self.tune_metric,
- prune_warmup_epochs=5,
+ prune_warmup_epochs=10,
  eval_interval=1,
  eval_requires_latents=False,  # no latent refinement for eval
  eval_latent_steps=0,
@@ -480,7 +487,7 @@ class ImputeVAE(BaseNNImputer):
  X_val: np.ndarray | None = None,
  params: dict | None = None,
  prune_metric: str | None = None,  # "f1" | "accuracy" | "pr_macro"
- prune_warmup_epochs: int = 3,
+ prune_warmup_epochs: int = 10,
  eval_interval: int = 1,
  eval_requires_latents: bool = False,  # VAE: no latent eval refinement
  eval_latent_steps: int = 0,
@@ -562,7 +569,7 @@ class ImputeVAE(BaseNNImputer):
  X_val: np.ndarray | None = None,
  params: dict | None = None,
  prune_metric: str | None = None,
- prune_warmup_epochs: int = 3,
+ prune_warmup_epochs: int = 10,
  eval_interval: int = 1,
  eval_requires_latents: bool = False,
  eval_latent_steps: int = 0,
@@ -755,14 +762,14 @@ class ImputeVAE(BaseNNImputer):
  for _, y_batch in loader:
      optimizer.zero_grad(set_to_none=True)

-     # targets: (B, L) int in {0,1,2,-1}
      y_int = y_batch.to(self.device, non_blocking=True).long()

-     # inputs: one-hot with zeros for missing
-     x_ohe = self._one_hot_encode_012(y_int)  # (B, L, K)
+     if self.is_haploid:
+         x_in = self._one_hot_encode_012(y_int)  # (B, L, 2)
+     else:
+         x_in = self._encode_multilabel_inputs(y_int)  # (B, L, 2)

-     # Forward. Expect model to return recon_logits, mu, logvar, ...
-     out = model(x_ohe)
+     out = model(x_in)
      if isinstance(out, (list, tuple)):
          recon_logits, mu, logvar = out[0], out[1], out[2]
      else:
@@ -780,15 +787,30 @@ class ImputeVAE(BaseNNImputer):
  beta = float(getattr(model, "beta", getattr(self, "kl_beta_final", 0.0)))
  gamma = max(0.0, min(gamma, 10.0))

- loss = compute_vae_loss(
-     recon_logits=recon_logits,
-     targets=y_int,
-     mu=mu,
-     logvar=logvar,
-     class_weights=class_weights,
-     gamma=gamma,
-     beta=beta,
- )
+ if self.is_haploid:
+     loss = compute_vae_loss(
+         recon_logits=recon_logits,
+         targets=y_int,
+         mu=mu,
+         logvar=logvar,
+         class_weights=class_weights,
+         gamma=gamma,
+         beta=beta,
+     )
+ else:
+     targets = self._multi_hot_targets(y_int)
+     pos_w = getattr(self, "pos_weights_", None)
+     bce = F.binary_cross_entropy_with_logits(
+         recon_logits, targets, pos_weight=pos_w, reduction="none"
+     )
+     mask = (y_int != -1).unsqueeze(-1).float()
+     recon_loss = (bce * mask).sum() / mask.sum().clamp_min(1e-8)
+     kl = (
+         -0.5
+         * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
+         / (y_int.shape[0] + 1e-8)
+     )
+     loss = recon_loss + beta * kl

  if l1_penalty > 0:
      l1 = torch.zeros((), device=self.device)
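The diploid branch above replaces the categorical cross-entropy with a masked multilabel BCE over the two allele channels. A minimal, self-contained sketch of that objective (toy shapes; the random logits stand in for the decoder output):

```python
import torch
import torch.nn.functional as F

# 012-encoded diploid genotypes; -1 marks missing calls.
y = torch.tensor([[0, 1, 2, -1]])                # (B=1, L=4)

# Multi-hot targets: channel 0 = REF allele present, channel 1 = ALT allele present.
targets = torch.zeros(*y.shape, 2)
targets[..., 0] = ((y == 0) | (y == 1)).float()  # genotypes 0 and 1 carry a REF allele
targets[..., 1] = ((y == 2) | (y == 1)).float()  # genotypes 1 and 2 carry an ALT allele

logits = torch.randn(1, 4, 2)                    # stand-in for decoder output
bce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
mask = (y != -1).unsqueeze(-1).float()           # exclude missing calls from the loss
recon_loss = (bce * mask).sum() / mask.sum().clamp_min(1e-8)
```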
@@ -845,11 +867,25 @@ class ImputeVAE(BaseNNImputer):
  with torch.no_grad():
      X_tensor = torch.from_numpy(X) if isinstance(X, np.ndarray) else X
      X_tensor = X_tensor.to(self.device).long()
-     x_ohe = self._one_hot_encode_012(X_tensor)
-     outputs = model(x_ohe)  # first element must be recon logits
-     logits = outputs[0].view(-1, self.num_features_, self.num_classes_)
-     probas = torch.softmax(logits, dim=-1)
-     labels = torch.argmax(probas, dim=-1)
+     if self.is_haploid:
+         x_ohe = self._one_hot_encode_012(X_tensor)
+         outputs = model(x_ohe)
+         logits = outputs[0].view(-1, self.num_features_, self.output_classes_)
+         probas = torch.softmax(logits, dim=-1)
+         labels = torch.argmax(probas, dim=-1)
+     else:
+         x_in = self._encode_multilabel_inputs(X_tensor)
+         outputs = model(x_in)
+         logits = outputs[0].view(-1, self.num_features_, self.output_classes_)
+         probas2 = torch.sigmoid(logits)
+         p_ref = probas2[..., 0]
+         p_alt = probas2[..., 1]
+         p_het = p_ref * p_alt
+         p_ref_only = p_ref * (1 - p_alt)
+         p_alt_only = p_alt * (1 - p_ref)
+         probas = torch.stack([p_ref_only, p_het, p_alt_only], dim=-1)
+         probas = probas / probas.sum(dim=-1, keepdim=True).clamp_min(1e-8)
+         labels = torch.argmax(probas, dim=-1)

  if return_proba:
      return labels.cpu().numpy(), probas.cpu().numpy()
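The else-branch composes three genotype probabilities from the two sigmoid channels by treating them as independent Bernoullis; the leftover "neither allele" mass (1 - p_ref)(1 - p_alt) is discarded by the renormalization. The composition in isolation:

```python
import torch

# Channel probabilities for one site: P(REF allele present), P(ALT allele present).
p = torch.sigmoid(torch.tensor([[2.0, -1.0]]))
p_ref, p_alt = p[..., 0], p[..., 1]

probas = torch.stack(
    [
        p_ref * (1 - p_alt),  # 0: homozygous REF
        p_ref * p_alt,        # 1: heterozygous (both alleles present)
        p_alt * (1 - p_ref),  # 2: homozygous ALT
    ],
    dim=-1,
)
# Renormalize; the (1 - p_ref) * (1 - p_alt) "no allele" mass is dropped.
probas = probas / probas.sum(dim=-1, keepdim=True).clamp_min(1e-8)
```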
@@ -1047,12 +1083,21 @@ class ImputeVAE(BaseNNImputer):
  try:
      params = self._sample_hyperparameters(trial)

-     X_train = getattr(self, "X_train_", self.ground_truth_[self.train_idx_])
-     X_val = getattr(self, "X_val_", self.ground_truth_[self.test_idx_])
+     # Use tune subsets when available (tune_fast)
+     X_train = getattr(self, "_tune_X_train", None)
+     X_val = getattr(self, "_tune_X_test", None)
+     if X_train is None or X_val is None:
+         X_train = getattr(self, "X_train_", self.ground_truth_[self.train_idx_])
+         X_val = getattr(self, "X_val_", self.ground_truth_[self.test_idx_])

      class_weights = self._normalize_class_weights(
          self._class_weights_from_zygosity(X_train)
      )
+     # Pos weights for diploid multilabel BCE during tuning
+     if not self.is_haploid:
+         self.pos_weights_ = self._compute_pos_weights(X_train)
+     else:
+         self.pos_weights_ = None
      train_loader = self._get_data_loader(X_train)

      model = self.build_model(self.Model, params["model_params"])
@@ -1073,7 +1118,7 @@ class ImputeVAE(BaseNNImputer):
  X_val=X_val,
  params=params,
  prune_metric=self.tune_metric,
- prune_warmup_epochs=5,
+ prune_warmup_epochs=10,
  eval_interval=self.tune_eval_interval,
  eval_requires_latents=False,
  eval_latent_steps=0,
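The warm-up bump (5 → 10 epochs) gives trials more time before Optuna may prune them. A generic sketch of that contract, assuming the trainer reports its validation metric once per epoch (illustrative names, not PG-SUI's trainer API):

```python
import optuna

def objective(trial: optuna.Trial) -> float:
    score = 0.0
    for epoch in range(50):
        score = 1.0 - 1.0 / (epoch + 1)           # stand-in validation metric
        trial.report(score, step=epoch)
        if epoch >= 10 and trial.should_prune():  # honor a 10-epoch warm-up
            raise optuna.TrialPruned()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)
```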
@@ -1116,27 +1161,32 @@ class ImputeVAE(BaseNNImputer):
      Dict[str, int | float | str]: Sampled hyperparameters.
  """
  params = {
-     "latent_dim": trial.suggest_int("latent_dim", 2, 64),
-     "lr": trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True),
-     "dropout_rate": trial.suggest_float("dropout_rate", 0.0, 0.6),
-     "num_hidden_layers": trial.suggest_int("num_hidden_layers", 1, 8),
+     "latent_dim": trial.suggest_int("latent_dim", 4, 16, step=2),
+     "lr": trial.suggest_float("learning_rate", 3e-4, 1e-3, log=True),
+     "dropout_rate": trial.suggest_float("dropout_rate", 0.0, 0.30, step=0.05),
+     "num_hidden_layers": trial.suggest_int("num_hidden_layers", 1, 6),
      "activation": trial.suggest_categorical(
-         "activation", ["relu", "elu", "selu"]
+         "activation", ["relu", "elu", "selu", "leaky_relu"]
      ),
-     "l1_penalty": trial.suggest_float("l1_penalty", 1e-7, 1e-2, log=True),
+     "l1_penalty": trial.suggest_float("l1_penalty", 1e-6, 1e-3, log=True),
      "layer_scaling_factor": trial.suggest_float(
-         "layer_scaling_factor", 2.0, 10.0
+         "layer_scaling_factor", 2.0, 4.0, step=0.5
      ),
      "layer_schedule": trial.suggest_categorical(
-         "layer_schedule", ["pyramid", "constant", "linear"]
+         "layer_schedule", ["pyramid", "linear"]
      ),
      # VAE-specific β (final value after anneal)
-     "beta": trial.suggest_float("beta", 0.25, 4.0),
+     "beta": trial.suggest_float("beta", 0.5, 2.0, step=0.5),
      # focal gamma (if used in VAE recon CE)
-     "gamma": trial.suggest_float("gamma", 0.0, 5.0),
+     "gamma": trial.suggest_float("gamma", 0.5, 3.0, step=0.5),
  }

- input_dim = self.num_features_ * self.num_classes_
+ use_n_features = (
+     self._tune_num_features
+     if (self.tune and self.tune_fast and hasattr(self, "_tune_num_features"))
+     else self.num_features_
+ )
+ input_dim = use_n_features * self.output_classes_
  hidden_layer_sizes = self._compute_hidden_layer_sizes(
      n_inputs=input_dim,
      n_outputs=input_dim,
@@ -1150,8 +1200,8 @@ class ImputeVAE(BaseNNImputer):
  hidden_only = [hidden_layer_sizes[0]] + hidden_layer_sizes[1:-1]

  params["model_params"] = {
-     "n_features": self.num_features_,
-     "num_classes": self.num_classes_,
+     "n_features": use_n_features,
+     "num_classes": self.output_classes_,
      "latent_dim": params["latent_dim"],
      "dropout_rate": params["dropout_rate"],
      "hidden_layer_sizes": hidden_only,
@@ -1182,8 +1232,8 @@ class ImputeVAE(BaseNNImputer):
  self.gamma = best_params.get("gamma", self.gamma)

  hidden_layer_sizes = self._compute_hidden_layer_sizes(
-     n_inputs=self.num_features_ * self.num_classes_,
-     n_outputs=self.num_features_ * self.num_classes_,
+     n_inputs=self.num_features_ * self.output_classes_,
+     n_outputs=self.num_features_ * self.output_classes_,
      n_samples=len(self.train_idx_),
      n_hidden=best_params["num_hidden_layers"],
      alpha=best_params["layer_scaling_factor"],
@@ -1197,7 +1247,7 @@ class ImputeVAE(BaseNNImputer):
  "hidden_layer_sizes": hidden_only,
  "dropout_rate": self.dropout_rate,
  "activation": self.activation,
- "num_classes": self.num_classes_,
+ "num_classes": self.output_classes_,
  "beta": self.kl_beta_final,
  "gamma": self.gamma,
  }
@@ -1209,8 +1259,8 @@ class ImputeVAE(BaseNNImputer):
      Dict[str, int | float | str | list]: VAE model parameters.
  """
  hidden_layer_sizes = self._compute_hidden_layer_sizes(
-     n_inputs=self.num_features_ * self.num_classes_,
-     n_outputs=self.num_features_ * self.num_classes_,
+     n_inputs=self.num_features_ * self.output_classes_,
+     n_outputs=self.num_features_ * self.output_classes_,
      n_samples=len(self.ground_truth_),
      n_hidden=self.num_hidden_layers,
      alpha=self.layer_scaling_factor,
@@ -1222,7 +1272,45 @@ class ImputeVAE(BaseNNImputer):
  "hidden_layer_sizes": hidden_layer_sizes,
  "dropout_rate": self.dropout_rate,
  "activation": self.activation,
- "num_classes": self.num_classes_,
+ "num_classes": self.output_classes_,
  "beta": self.kl_beta_final,
  "gamma": self.gamma,
  }
+
+ def _encode_multilabel_inputs(self, y: torch.Tensor) -> torch.Tensor:
+     """Two-channel multi-hot for diploid: REF-only, ALT-only; HET sets both."""
+     if self.is_haploid:
+         return self._one_hot_encode_012(y)
+     y = y.to(self.device)
+     shape = y.shape + (2,)
+     out = torch.zeros(shape, device=self.device, dtype=torch.float32)
+     valid = y != -1
+     ref_mask = valid & (y != 2)
+     alt_mask = valid & (y != 0)
+     out[ref_mask, 0] = 1.0
+     out[alt_mask, 1] = 1.0
+     return out
+
+ def _multi_hot_targets(self, y: torch.Tensor) -> torch.Tensor:
+     """Targets aligned with _encode_multilabel_inputs for diploid training."""
+     if self.is_haploid:
+         raise RuntimeError("_multi_hot_targets called for haploid data.")
+     y = y.to(self.device)
+     out = torch.zeros(y.shape + (2,), device=self.device, dtype=torch.float32)
+     valid = y != -1
+     ref_mask = valid & (y != 2)
+     alt_mask = valid & (y != 0)
+     out[ref_mask, 0] = 1.0
+     out[alt_mask, 1] = 1.0
+     return out
+
+ def _compute_pos_weights(self, X: np.ndarray) -> torch.Tensor:
+     """Balance REF/ALT channels for multilabel BCE."""
+     ref_pos = np.count_nonzero((X == 0) | (X == 1))
+     alt_pos = np.count_nonzero((X == 2) | (X == 1))
+     total_valid = np.count_nonzero(X != -1)
+     pos_counts = np.array([ref_pos, alt_pos], dtype=np.float32)
+     neg_counts = np.maximum(total_valid - pos_counts, 1.0)
+     pos_counts = np.maximum(pos_counts, 1.0)
+     weights = neg_counts / pos_counts
+     return torch.tensor(weights, device=self.device, dtype=torch.float32)
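For intuition on `_compute_pos_weights`: each channel's weight is its negative/positive count ratio, the same convention as `pos_weight` in `torch.nn.BCEWithLogitsLoss`, so the rarer allele channel is upweighted. A worked example with hypothetical counts (the clamps in the method guard degenerate cases omitted here):

```python
import numpy as np

# Hypothetical 012 matrix: two samples, four sites; -1 = missing.
X = np.array([[0, 1, 2, -1],
              [2, 2, 1, 0]])
ref_pos = np.count_nonzero((X == 0) | (X == 1))  # calls carrying a REF allele -> 4
alt_pos = np.count_nonzero((X == 2) | (X == 1))  # calls carrying an ALT allele -> 5
total_valid = np.count_nonzero(X != -1)          # 7 observed calls
pos = np.array([ref_pos, alt_pos], dtype=np.float32)
print((total_valid - pos) / pos)                 # [0.75 0.4] -> rarer channel upweighted
```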
@@ -1,344 +0,0 @@
- Metadata-Version: 2.4
- Name: pg-sui
- Version: 1.6.14.dev9
- Summary: Python machine and deep learning API to impute missing genotypes
- Author-email: "Drs. Bradley T. Martin and Tyler K. Chafin" <evobio721@gmail.com>
- Maintainer-email: "Dr. Bradley T. Martin" <evobio721@gmail.com>
- License: GNU General Public License v3 (GPLv3)
- Project-URL: Homepage, https://github.com/btmartin721/PG-SUI
- Project-URL: Documentation, https://pg-sui.readthedocs.io/en/latest/
- Project-URL: Source, https://github.com/btmartin721/PG-SUI.git
- Project-URL: BugTracker, https://github.com/btmartin721/PG-SUI/issues
- Keywords: impute,imputation,AI,deep learning,machine learning,neural network,vae,autoencoder,ubp,nlpca,population genetics,unsupervised,supervised,bioinformatics,snp,genomics,genotype,missing data,data analysis,data science,statistics,data visualization,python
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Development Status :: 4 - Beta
- Classifier: Environment :: Console
- Classifier: Intended Audience :: Science/Research
- Classifier: Intended Audience :: Developers
- Classifier: Intended Audience :: Education
- Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
- Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
- Classifier: Topic :: Scientific/Engineering :: Information Analysis
- Classifier: Topic :: Scientific/Engineering :: Visualization
- Classifier: Operating System :: MacOS
- Classifier: Operating System :: MacOS :: MacOS X
- Classifier: Operating System :: Unix
- Classifier: Operating System :: POSIX
- Classifier: Natural Language :: English
- Requires-Python: >=3.11
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: matplotlib
- Requires-Dist: numpy>=2.1
- Requires-Dist: pandas>=2.2.2
- Requires-Dist: scikit-learn>=1.4
- Requires-Dist: scipy
- Requires-Dist: seaborn
- Requires-Dist: torch
- Requires-Dist: tqdm
- Requires-Dist: toytree
- Requires-Dist: optuna
- Requires-Dist: rich
- Requires-Dist: rich[jupyter]
- Requires-Dist: snpio
- Provides-Extra: intel
- Requires-Dist: scikit-learn-intelex; extra == "intel"
- Provides-Extra: docs
- Requires-Dist: sphinx; extra == "docs"
- Requires-Dist: sphinx-rtd-theme; extra == "docs"
- Requires-Dist: sphinx_autodoc_typehints; extra == "docs"
- Requires-Dist: sphinxcontrib-napoleon; extra == "docs"
- Requires-Dist: sphinxcontrib-programoutput; extra == "docs"
- Provides-Extra: dev
- Requires-Dist: twine; extra == "dev"
- Requires-Dist: wheel; extra == "dev"
- Requires-Dist: pytest; extra == "dev"
- Requires-Dist: sphinx; extra == "dev"
- Requires-Dist: sphinx-rtd-theme; extra == "dev"
- Requires-Dist: sphinx-autodoc-typehints; extra == "dev"
- Requires-Dist: sphinxcontrib-napoleon; extra == "dev"
- Requires-Dist: sphinxcontrib-programoutput; extra == "dev"
- Requires-Dist: requests; extra == "dev"
- Provides-Extra: optional
- Requires-Dist: PyObjC; extra == "optional"
- Provides-Extra: gui
- Requires-Dist: fastapi>=0.110; extra == "gui"
- Requires-Dist: uvicorn[standard]>=0.23; extra == "gui"
- Dynamic: license-file
-
-
- <img src="https://github.com/btmartin721/PG-SUI/blob/master/img/pgsui-logo-faded.png" alt="PG-SUI Logo" width="50%" height="50%">
-
-
- # PG-SUI
-
- Population Genomic Supervised and Unsupervised Imputation.
-
- ## About PG-SUI
-
- PG-SUI is a Python 3 API that uses machine learning to impute missing values from population genomic SNP data. There are several supervised and unsupervised machine learning algorithms available to impute missing data, as well as some useful non-machine-learning imputers.
-
- Below is some general information and a basic tutorial. For more detailed information, see our [API Documentation](https://pg-sui.readthedocs.io/en/latest/).
-
- ### Supervised Imputation Methods
-
- Supervised methods utilize scikit-learn's IterativeImputer, which is based on the MICE (Multivariate Imputation by Chained Equations) algorithm ([1](#1)). It iterates over each SNP site (i.e., feature), using the N nearest-neighbor features to inform the imputation; the number of nearest features is user-adjustable. IterativeImputer currently works with any of the following scikit-learn classifiers:
-
- + K-Nearest Neighbors
- + Random Forest
- + XGBoost
-
- See the scikit-learn documentation (https://scikit-learn.org) for more information on IterativeImputer and each of the classifiers.
-
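To make the IterativeImputer workflow above concrete, here is a minimal scikit-learn sketch (illustrative assumptions: NaN as the missing code, a KNN regressor as the per-feature estimator, and rounding back to hard calls; this is not PG-SUI's exact wrapper):

```python
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor

# 012-encoded genotypes with NaN marking missing calls (toy data).
X = np.array([[0, 1, 2], [1, np.nan, 2], [0, 1, np.nan], [2, 0, 1]], dtype=float)

imputer = IterativeImputer(
    estimator=KNeighborsRegressor(n_neighbors=2),
    n_nearest_features=2,  # the "N nearest features" knob described above
    max_iter=10,
    random_state=0,
)
X_imputed = np.clip(np.rint(imputer.fit_transform(X)), 0, 2)  # back to hard 0/1/2 calls
```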
- ### Unsupervised Imputation Methods
-
- Unsupervised imputers include four custom neural network models:
-
- + Variational Autoencoder (VAE) ([2](#2))
- + Standard Autoencoder (SAE) ([3](#3))
- + Non-linear Principal Component Analysis (NLPCA) ([4](#4))
- + Unsupervised Backpropagation (UBP) ([5](#5))
-
- VAE models train themselves to reconstruct their input (i.e., the genotypes). To use a VAE for imputation, the missing values are masked and the model is trained to reconstruct only the known values. The trained model is then used to predict the missing values.
-
- SAE is a standard autoencoder that trains the input to predict itself. As with the VAE, missing values are masked and the model is trained only on known values. Predictions are then made for the missing values.
-
- NLPCA initializes random, reduced-dimensional input, then trains itself by using the known values (i.e., genotypes) as targets, refining the random input until it accurately predicts the genotype output. The trained model can then predict the missing values.
-
- UBP is an extension of NLPCA that runs over three phases. Phase 1 refines the randomly generated, reduced-dimensional input in a single-layer perceptron to obtain good initial input values. Phase 2 feeds the refined input from Phase 1 into a multi-layer perceptron (MLP) but refines only the network weights. Phase 3 uses an MLP to refine both the weights and the reduced-dimensional input. Once trained, the model is used to predict the missing values.
-
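All four models share the same masked-reconstruction trick. Stripped to its essentials it looks like the sketch below (deliberately simplified: the linear layer and MSE loss are stand-ins for the real architectures and objectives):

```python
import torch
import torch.nn.functional as F

x = torch.tensor([[0.0, 1.0, 2.0, -1.0]])             # one sample; -1 = missing genotype
observed = x != -1
x_in = torch.where(observed, x, torch.zeros_like(x))  # zero-fill missing inputs

net = torch.nn.Linear(4, 4)                           # stand-in for a real encoder/decoder
recon = net(x_in)
loss = F.mse_loss(recon[observed], x[observed])       # train on known values only
loss.backward()                                       # missing entries never drive gradients
```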
- ### Non-Machine Learning Methods
-
- We also include several non-machine-learning options for imputing missing data:
-
- + Per-population mode per SNP site
- + Global mode per SNP site
- + Using a phylogeny as input to inform the imputation
- + Matrix Factorization
-
- These four "simple" imputation methods can be used as standalone imputers, as the initial imputation strategy for IterativeImputer (at least one of them must be chosen for that role), and to validate the accuracy of both IterativeImputer and the neural network models.
-
- ## Installing PG-SUI
-
- The easiest way to install PG-SUI is with pip:
-
- ```
- pip install pg-sui
- ```
-
- If you have an Intel CPU and want to use the scikit-learn-intelex package to speed up scikit-learn computations, you can instead run:
-
- ```
- pip install pg-sui[intel]
- ```
-
- ### Optional GUI (Electron)
-
- PG-SUI ships an Electron GUI wrapper around the Python CLI.
-
- 1. Install the Python-side extras (FastAPI/uvicorn helper) if you want to serve from Python:
-    `pip install pg-sui[gui]`
- 2. Install Node.js (https://nodejs.org) and fetch the app dependencies once:
-    `pgsui-gui-setup`
- 3. Launch the GUI:
-    `pgsui-gui`
-
- The GUI shells out to the same CLI underneath, so presets/overrides and YAML configs behave identically.
-
- ## Manual Installation
-
- ### Dependencies
-
- + python >= 3.11
- + pandas
- + numpy
- + scipy
- + matplotlib
- + seaborn
- + plotly
- + kaleido
- + tqdm
- + toytree
- + scikit-learn
- + xgboost
- + snpio
- + optuna
-
- #### Installation troubleshooting
-
- ##### "use_2to3 is invalid" error
-
- Users running setuptools v58 may encounter this error during the last step of installation, when pip installs sklearn-genetic-opt:
-
- ```
- ERROR: Command errored out with exit status 1:
- command: /Users/tyler/miniforge3/envs/pg-sui/bin/python3.8 -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/private/var/folders/6x/t6g4kn711z5cxmc2_tvq0mlw0000gn/T/pip-install-6y5g_mhs/deap_1d32f65d60a44056bd7031f3aad44571/setup.py'"'"'; __file__='"'"'/private/var/folders/6x/t6g4kn711z5cxmc2_tvq0mlw0000gn/T/pip-install-6y5g_mhs/deap_1d32f65d60a44056bd7031f3aad44571/setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base /private/var/folders/6x/t6g4kn711z5cxmc2_tvq0mlw0000gn/T/pip-pip-egg-info-7hg3hcq2
- cwd: /private/var/folders/6x/t6g4kn711z5cxmc2_tvq0mlw0000gn/T/pip-install-6y5g_mhs/deap_1d32f65d60a44056bd7031f3aad44571/
- Complete output (1 lines):
- error in deap setup command: use_2to3 is invalid.
- ```
-
- This occurs during the installation of DEAP, one of the dependencies of sklearn-genetic-opt. As a workaround, first downgrade setuptools, then proceed with the installation as normal:
-
- ```
- pip install setuptools==57
- pip install sklearn-genetic-opt[all]
- ```
-
- ##### Mac ARM architecture
-
- PG-SUI has been tested on the Mac M1 chips and works, but some changes to the installation process were necessary as of 9 December 2021. Installation was successful using the following:
-
- ```
- ### Install Miniforge3 instead of Miniconda3
- ### Download: https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
- bash ~/Downloads/Miniforge3-MacOSX-arm64.sh
-
- # Close and re-open terminal #
-
- # Create the conda environment
- conda create -n pg-sui python
-
- # Activate the environment
- conda activate pg-sui
-
- # Install packages
- conda install -c conda-forge matplotlib seaborn jupyterlab scikit-learn tqdm pandas numpy scipy xgboost lightgbm tensorflow keras sklearn-genetic-opt toytree
- conda install -c bioconda pyvolve
-
- # Downgrade setuptools (may or may not be necessary)
- pip install setuptools==57
-
- # Install sklearn-genetic-opt and mlflow
- pip install sklearn-genetic-opt mlflow
- ```
-
- Any other problems we run into while testing on the Mac ARM architecture will be documented here. Note that the scikit-learn-intelex step was skipped here: PG-SUI automatically detects the CPU architecture you are running and forgoes importing that package (which only works on Intel processors).
-
- ## Input Data
-
- You can read your input files into a GenotypeData object from the [SNPio](https://snpio.readthedocs.io/en/latest/) package:
-
- ```
- # Import snpio. It is automatically installed with pgsui when using pip.
- from snpio import GenotypeData
-
- # Read in PHYLIP, VCF, or STRUCTURE-formatted alignments.
- data = GenotypeData(
-     filename="example_data/phylip_files/phylogen_nomx.u.snps.phy",
-     popmapfile="example_data/popmaps/phylogen_nomx.popmap",
-     force_popmap=True,
-     filetype="auto",
-     qmatrix_iqtree="example_data/trees/test.qmat",
-     siterates_iqtree="example_data/trees/test.rate",
-     guidetree="example_data/trees/test.tre",
-     # Only include these populations. There is also an exclude_pops option
-     # that excludes the provided populations.
-     include_pops=["EA", "TT", "GU"],
- )
- ```
-
- ## Supported Imputation Methods
-
- There are numerous supported algorithms to impute missing data. Each one can be run by calling the corresponding class. You must provide a GenotypeData instance as the first positional argument.
-
- You can import all the supported methods with:
-
- ```
- from pgsui import *
- ```
-
- Or you can import them one at a time:
-
- ```
- from pgsui import ImputeVAE
- ```
-
- ### Supervised Imputers
-
- Various supervised imputation options are supported:
-
- ```
- # Supervised IterativeImputer classifiers
- knn = ImputeKNN(data)  # K-Nearest Neighbors
- rf = ImputeRandomForest(data)  # Random Forest or Extra Trees
- xgb = ImputeXGBoost(data)  # XGBoost
- ```
-
- ### Non-machine learning methods
-
- Use a phylogeny to inform imputation:
-
- ```
- phylo = ImputePhylo(data)
- ```
-
- Use by-population or global allele frequencies to inform imputation:
-
- ```
- pop_af = ImputeAlleleFreq(data, by_populations=True)
- global_af = ImputeAlleleFreq(data, by_populations=False)
- ref_af = ImputeRefAllele(data)
- ```
-
- Matrix factorization:
-
- ```
- mf = ImputeMF(*args)  # Matrix factorization
- ```
-
- ### Unsupervised Neural Networks
-
- ```python
- vae = ImputeVAE(data)  # Variational autoencoder
- nlpca = ImputeNLPCA(data)  # Nonlinear PCA
- ubp = ImputeUBP(data)  # Unsupervised backpropagation
- sae = ImputeStandardAutoEncoder(data)  # Standard autoencoder
- ```
-
- ## Command-Line Interface
-
- Run the PG-SUI CLI with ``pg-sui`` (installed alongside the library). The CLI follows the same precedence model as the Python API:
-
- ``code defaults < preset (--preset) < YAML (--config) < explicit CLI flags < --set key=value``.
-
- Recent releases add explicit switches for the simulated-missingness workflow shared by the neural and supervised models:
-
- - ``--sim-strategy`` selects one of ``random``, ``random_weighted``, ``random_weighted_inv``, ``nonrandom``, ``nonrandom_weighted``.
- - ``--sim-prop`` sets the proportion of observed calls to temporarily mask when building the evaluation set.
- - ``--simulate-missing`` disables simulated masking entirely (a store-false flag); omit it to inherit preset/YAML defaults, or re-enable via ``--set sim.simulate_missing=True``.
-
- Example:
-
- ```
- pg-sui \
-   --vcf data.vcf.gz \
-   --popmap pops.popmap \
-   --models ImputeUBP ImputeVAE \
-   --preset balanced \
-   --sim-strategy random_weighted_inv \
-   --sim-prop 0.25 \
-   --set io.prefix=vae_vs_ubp
- ```
-
- CLI overrides cascade into every selected model, so a single invocation can evaluate multiple imputers with a consistent simulation strategy and output prefix.
-
- ## To-Dos
-
- - Simulations
- - Documentation
-
- ## References
-
- <a name="1">1. </a>van Buuren, S., & Groothuis-Oudshoorn, K. (2011). mice: Multivariate Imputation by Chained Equations in R. Journal of Statistical Software, 45, 1-67.
-
- <a name="2">2. </a>Kingma, D. P., & Welling, M. (2013). Auto-Encoding Variational Bayes. In: Proceedings of the International Conference on Learning Representations (ICLR). arXiv:1312.6114 [stat.ML].
-
- <a name="3">3. </a>Hinton, G. E., & Salakhutdinov, R. R. (2006). Reducing the dimensionality of data with neural networks. Science, 313(5786), 504-507.
-
- <a name="4">4. </a>Scholz, M., Kaplan, F., Guy, C. L., Kopka, J., & Selbig, J. (2005). Non-linear PCA: a missing data approach. Bioinformatics, 21(20), 3887-3895.
-
- <a name="5">5. </a>Gashler, M. S., Smith, M. R., Morris, R., & Martinez, T. (2016). Missing value imputation with unsupervised backpropagation. Computational Intelligence, 32(2), 196-215.