pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pg-sui might be problematic.
Files changed (112)
  1. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
  2. pg_sui-1.6.8.dist-info/RECORD +78 -0
  3. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
  5. pg_sui-1.6.8.dist-info/top_level.txt +1 -0
  6. pgsui/__init__.py +35 -54
  7. pgsui/_version.py +34 -0
  8. pgsui/cli.py +635 -0
  9. pgsui/data_processing/config.py +576 -0
  10. pgsui/data_processing/containers.py +1782 -0
  11. pgsui/data_processing/transformers.py +121 -1103
  12. pgsui/electron/app/__main__.py +5 -0
  13. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  14. pgsui/electron/app/icons/icons/128x128.png +0 -0
  15. pgsui/electron/app/icons/icons/16x16.png +0 -0
  16. pgsui/electron/app/icons/icons/24x24.png +0 -0
  17. pgsui/electron/app/icons/icons/256x256.png +0 -0
  18. pgsui/electron/app/icons/icons/32x32.png +0 -0
  19. pgsui/electron/app/icons/icons/48x48.png +0 -0
  20. pgsui/electron/app/icons/icons/512x512.png +0 -0
  21. pgsui/electron/app/icons/icons/64x64.png +0 -0
  22. pgsui/electron/app/icons/icons/icon.icns +0 -0
  23. pgsui/electron/app/icons/icons/icon.ico +0 -0
  24. pgsui/electron/app/main.js +189 -0
  25. pgsui/electron/app/package-lock.json +6893 -0
  26. pgsui/electron/app/package.json +50 -0
  27. pgsui/electron/app/preload.js +15 -0
  28. pgsui/electron/app/server.py +146 -0
  29. pgsui/electron/app/ui/logo.png +0 -0
  30. pgsui/electron/app/ui/renderer.js +130 -0
  31. pgsui/electron/app/ui/styles.css +59 -0
  32. pgsui/electron/app/ui/ui_shim.js +72 -0
  33. pgsui/electron/bootstrap.py +43 -0
  34. pgsui/electron/launch.py +59 -0
  35. pgsui/electron/package.json +14 -0
  36. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  37. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  38. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  39. pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
  40. pgsui/impute/deterministic/imputers/mode.py +679 -0
  41. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  42. pgsui/impute/deterministic/imputers/phylo.py +971 -0
  43. pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
  44. pgsui/impute/supervised/base.py +339 -0
  45. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
  46. pgsui/impute/supervised/imputers/random_forest.py +287 -0
  47. pgsui/impute/unsupervised/base.py +924 -0
  48. pgsui/impute/unsupervised/callbacks.py +89 -263
  49. pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
  50. pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
  51. pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
  52. pgsui/impute/unsupervised/imputers/vae.py +957 -0
  53. pgsui/impute/unsupervised/loss_functions.py +158 -0
  54. pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
  55. pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
  56. pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
  57. pgsui/impute/unsupervised/models/vae_model.py +259 -618
  58. pgsui/impute/unsupervised/nn_scorers.py +215 -0
  59. pgsui/utils/classification_viz.py +591 -0
  60. pgsui/utils/misc.py +35 -480
  61. pgsui/utils/plotting.py +514 -824
  62. pgsui/utils/scorers.py +212 -438
  63. pg_sui-1.0.2.1.dist-info/RECORD +0 -75
  64. pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
  65. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  66. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  67. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  68. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  69. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  70. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  71. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  72. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  73. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  74. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  75. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  76. pgsui/example_data/trees/test.iqtree +0 -376
  77. pgsui/example_data/trees/test.qmat +0 -5
  78. pgsui/example_data/trees/test.rate +0 -2033
  79. pgsui/example_data/trees/test.tre +0 -1
  80. pgsui/example_data/trees/test_n10.rate +0 -19
  81. pgsui/example_data/trees/test_n100.rate +0 -109
  82. pgsui/example_data/trees/test_n500.rate +0 -509
  83. pgsui/example_data/trees/test_siterates.txt +0 -2024
  84. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  85. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  86. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  87. pgsui/example_data/vcf_files/test.vcf +0 -244
  88. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  89. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  90. pgsui/impute/estimators.py +0 -735
  91. pgsui/impute/impute.py +0 -1486
  92. pgsui/impute/simple_imputers.py +0 -1439
  93. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
  94. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
  95. pgsui/impute/unsupervised/keras_classifiers.py +0 -702
  96. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  97. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
  98. pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
  99. pgsui/pg_sui.py +0 -261
  100. pgsui/utils/sequence_tools.py +0 -407
  101. simulation/sim_benchmarks.py +0 -333
  102. simulation/sim_treeparams.py +0 -475
  103. test/__init__.py +0 -0
  104. test/pg_sui_simtest.py +0 -215
  105. test/pg_sui_testing.py +0 -523
  106. test/test.py +0 -297
  107. test/test_pgsui.py +0 -374
  108. test/test_tkc.py +0 -214
  109. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
  110. /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  111. /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  112. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
pgsui/impute/unsupervised/neural_network_imputers.py (removed; 1,424 deletions)
@@ -1,1424 +0,0 @@
- # Standard Library Imports
- import logging
- import os
- import pprint
- import sys
- import warnings
-
- warnings.simplefilter(action="ignore", category=FutureWarning)
-
- from pathlib import Path
-
- # Third-party Imports
- import numpy as np
- import pandas as pd
- from matplotlib import pyplot as plt
-
- # Grid search imports
- from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
-
- # Scikit-learn imports
- from sklearn.base import BaseEstimator, TransformerMixin
-
- # Genetic algorithm grid search imports
- from sklearn_genetic import GASearchCV
- from sklearn_genetic.callbacks import ConsecutiveStopping, ProgressBar
- from sklearn_genetic.plots import plot_fitness_evolution
-
- # Import tensorflow with reduced warnings.
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
- logging.getLogger("tensorflow").disabled = True
- warnings.filterwarnings("ignore", category=UserWarning)
-
- # noinspection PyPackageRequirements
- import tensorflow as tf
-
- # Disable "can't find cuda .dll" errors. Also turns off GPU support.
- tf.config.set_visible_devices([], "GPU")
-
- from tensorflow.python.util import deprecation
-
- # Disable warnings and info logs.
- tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
- tf.get_logger().setLevel(logging.ERROR)
-
-
- # Monkey patching deprecation utils to suppress warnings.
- # noinspection PyUnusedLocal
- def deprecated(
-     date, instructions, warn_once=True
- ):  # pylint: disable=unused-argument
-     def deprecated_wrapper(func):
-         return func
-
-     return deprecated_wrapper
-
-
- deprecation.deprecated = deprecated
-
- from tensorflow.keras.callbacks import (
-     ReduceLROnPlateau,
-     CSVLogger,
- )
-
- # For development purposes
- # from memory_profiler import memory_usage
-
- # Custom module imports
- try:
-     from ...utils.misc import timer
-     from ...utils.misc import isnotebook
-     from ...utils.misc import validate_input_type
-     from .neural_network_methods import NeuralNetworkMethods, DisabledCV
-     from ...utils.scorers import Scorers
-     from ...utils.plotting import Plotting
-     from .callbacks import (
-         UBPCallbacks,
-         VAECallbacks,
-         CyclicalAnnealingCallback,
-     )
-     from .keras_classifiers import VAEClassifier, MLPClassifier, SAEClassifier
-     from ...data_processing.transformers import (
-         SimGenotypeDataTransformer,
-         AutoEncoderFeatureTransformer,
-     )
- except (ModuleNotFoundError, ValueError, ImportError):
-     from utils.misc import timer
-     from utils.misc import isnotebook
-     from utils.misc import validate_input_type
-     from impute.unsupervised.neural_network_methods import (
-         NeuralNetworkMethods,
-         DisabledCV,
-     )
-     from utils.scorers import Scorers
-     from utils.plotting import Plotting
-     from impute.unsupervised.callbacks import (
-         UBPCallbacks,
-         VAECallbacks,
-         CyclicalAnnealingCallback,
-     )
-     from impute.unsupervised.keras_classifiers import (
-         VAEClassifier,
-         MLPClassifier,
-         SAEClassifier,
-     )
-     from data_processing.transformers import (
-         SimGenotypeDataTransformer,
-         AutoEncoderFeatureTransformer,
-     )
-
- is_notebook = isnotebook()
-
- if is_notebook:
-     from tqdm.notebook import tqdm as progressbar
- else:
-     from tqdm import tqdm as progressbar
-
- from tqdm.keras import TqdmCallback
-
-
- class BaseNNImputer(BaseEstimator, TransformerMixin):
-     """Base transformer class for neural network imputers.
-
-     Args:
-         genotype_data (GenotypeData): Input GenotypeData instance.
-
-         prefix (str, optional): Prefix for output files. Defaults to "output".
-
-         gridparams (Dict[str, Any] or None, optional): Dictionary with keys=keyword arguments for the specified estimator and values=lists of parameter values or distributions. If using RandomizedSearchCV, distributions can be specified by using scipy.stats.uniform(low, high) (for a uniform distribution) or scipy.stats.loguniform(low, high) (useful if the range of values spans orders of magnitude). ``gridparams`` will be used for a randomized grid search with cross-validation. If using the genetic algorithm grid search (GASearchCV) by setting ``ga=True``\, the parameters can be specified as ``sklearn_genetic.space`` objects. The grid search will determine the optimal parameters as those that maximize accuracy (or minimize root mean squared error for the BayesianRidge regressor). NOTE: This takes a long time, so run it with a small subset of the data just to find the optimal parameters for the classifier, then run a full imputation using the optimal parameters. If ``gridparams=None``\, a grid search is not performed. Defaults to None.
-
-         disable_progressbar (bool, optional): Whether to disable the tqdm progress bar. Useful if you are doing the imputation on, e.g., a high-performance computing cluster, where sometimes tqdm does not work correctly. If False, uses the tqdm progress bar. If True, does not use tqdm. Defaults to False.
-
-         batch_size (int, optional): Batch size to train the model with. Defaults to 32.
-
-         n_components (int, optional): Number of components to use as the input data. Defaults to 3.
-
-         early_stop_gen (int, optional): Early stopping criterion for epochs. Training will stop if the loss (error) does not decrease past the tolerance level for ``early_stop_gen`` epochs. Will save the optimal model and reload it once ``early_stop_gen`` has been reached. Defaults to 25.
-
-         num_hidden_layers (int, optional): Number of hidden layers to use in the model. Adjust if overfitting occurs. Defaults to 3.
-
-         hidden_layer_sizes (str, List[int], List[str], or int, optional): Number of neurons to use in hidden layers. If string or a list of strings is supplied, the strings must be either "midpoint", "sqrt", or "log2". "midpoint" will calculate the midpoint as ``(n_features + n_components) / 2``. If "sqrt" is supplied, the square root of the number of features will be used to calculate the output units. If "log2" is supplied, the units will be calculated as ``log2(n_features)``. hidden_layer_sizes will calculate and set the number of output units for each hidden layer. If one string or integer is supplied, the model will use the same number of output units for each hidden layer. If a list of integers or strings is supplied, the model will use the values supplied in the list, which can differ. The list length must be equal to the ``num_hidden_layers``. Defaults to "midpoint".
-
-         optimizer (str, optional): The optimizer to use with gradient descent. Possible values include "adam", "sgd", "adagrad", "adadelta", "adamax", "ftrl", "nadam", and "rmsprop". See tf.keras.optimizers for more info. Defaults to "adam".
-
-         hidden_activation (str, optional): The activation function to use for the hidden layers. See tf.keras.activations for more info. Commonly used activation functions include "elu", "relu", and "sigmoid". Defaults to "elu".
-
-         learning_rate (float, optional): The learning rate for the optimizers. Adjust if the model is learning too slowly. Defaults to 0.01.
-
-         lr_patience (int, optional): Number of epochs with no loss improvement to wait before reducing the learning rate. Defaults to 1.
-
-         epochs (int, optional): Maximum number of epochs to run if the ``early_stop_gen`` criterion is not met. Defaults to 100.
-
-         weights_initializer (str, optional): Initializer to use for the model weights. See tf.keras.initializers for more info. Defaults to "glorot_normal".
-
-         l1_penalty (float, optional): L1 regularization penalty to apply to reduce overfitting. Defaults to 0.0001.
-
-         l2_penalty (float, optional): L2 regularization penalty to apply to reduce overfitting. Defaults to 0.0001.
-
-         dropout_rate (float, optional): Dropout rate during training to reduce overfitting. Must be a float between 0 and 1. Defaults to 0.2.
-
-         recurrent_weight (float, optional): Recurrent weight to calculate predictions. Defaults to 0.5.
-
-         sample_weights (str or Dict[int, float], optional): Whether to weight each genotype by its class frequency. If ``sample_weights='auto'``, sample weights are calculated automatically from genotype class frequencies per locus; for example, if there are many more 0s and fewer 2s, the classes are balanced by weighting each genotype accordingly. ``sample_weights`` can also be a dictionary with the genotypes (0, 1, and 2) as the keys and the weights as the values. If ``sample_weights`` is anything else, weights are not calculated. Defaults to False.
-
-         grid_iter (int, optional): Number of iterations for grid search. Defaults to 80.
-
-         gridsearch_method (str, optional): Grid search method to use. Possible options include: 'gridsearch', 'randomized_gridsearch', and 'genetic_algorithm'. 'gridsearch' runs all possible permutations of parameters, 'randomized_gridsearch' runs a random subset of parameters, and 'genetic_algorithm' uses a genetic algorithm grid search (via GASearchCV). Defaults to 'gridsearch'.
-
-         ga_kwargs (Dict[str, Any] or None): Keyword arguments to be passed to a Genetic Algorithm grid search. Only used if ``ga==True``\.
-
-         scoring_metric (str, optional): Scoring metric to use for randomized or genetic algorithm grid searches. See https://scikit-learn.org/stable/modules/model_evaluation.html for supported options. Defaults to "auc_macro".
-
-         sim_strategy (str, optional): Strategy to use for simulating missing data. Only used to validate the accuracy of the imputation. The final model will be trained with the non-simulated dataset. Supported options include: "random", "nonrandom", and "nonrandom_weighted". "random" randomly simulates missing data. When set to "nonrandom", branches from ``GenotypeData.guidetree`` will be randomly sampled to generate missing data on descendant nodes. For "nonrandom_weighted", missing data will be placed on nodes proportionally to their branch lengths (e.g., to generate data distributed as might be the case with mutation-disruption of RAD sites). Defaults to "random".
-
-         sim_prop_missing (float, optional): Proportion of missing data to simulate with the SimGenotypeDataTransformer. Defaults to 0.2.
-
-         n_jobs (int, optional): Number of parallel jobs to use in the grid search if ``gridparams`` is not None. -1 means use all available processors. Defaults to 1.
-
-         verbose (int, optional): Verbosity setting. Can be 0, 1, or 2. 0 is the least and 2 is the most verbose. Defaults to 0.
-
-     ToDo:
-         Fix sample_weight for multi-label encodings.
-     """
-
-     def __init__(
-         self,
-         activate,
-         nn_method,
-         num_classes,
-         act_func,
-         *,
-         genotype_data=None,
-         prefix="imputer",
-         gridparams=None,
-         disable_progressbar=False,
-         batch_size=32,
-         n_components=3,
-         early_stop_gen=25,
-         num_hidden_layers=3,
-         hidden_layer_sizes="midpoint",
-         optimizer="adam",
-         hidden_activation="elu",
-         learning_rate=0.01,
-         lr_patience=1,
-         epochs=100,
-         weights_initializer="glorot_normal",
-         l1_penalty=0.0001,
-         l2_penalty=0.0001,
-         dropout_rate=0.2,
-         sample_weights=False,
-         grid_iter=80,
-         gridsearch_method="gridsearch",
-         ga_kwargs=None,
-         scoring_metric="auc_macro",
-         sim_strategy="random",
-         sim_prop_missing=0.2,
-         n_jobs=1,
-         verbose=0,
-         kl_beta=tf.Variable(1.0, trainable=False),
-         validation_split=0.2,
-         nlpca=False,
-         testing=False,
-     ):
-         self.activate = activate
-         self.act_func_ = act_func
-         self.num_classes = num_classes
-         self.testing = testing
-         self.nn_method_ = nn_method
-
-         self.genotype_data = genotype_data
-         self.prefix = prefix
-         self.gridparams = gridparams
-         self.disable_progressbar = disable_progressbar
-         self.batch_size = batch_size
-         self.n_components = n_components
-
-         self.early_stop_gen = early_stop_gen
-         self.num_hidden_layers = num_hidden_layers
-         self.hidden_layer_sizes = hidden_layer_sizes
-         self.optimizer = optimizer
-         self.hidden_activation = hidden_activation
-         self.learning_rate = learning_rate
-         self.lr_patience = lr_patience
-         self.epochs = epochs
-         self.weights_initializer = weights_initializer
-         self.l1_penalty = l1_penalty
-         self.l2_penalty = l2_penalty
-         self.dropout_rate = dropout_rate
-         self.sample_weights = sample_weights
-         self.grid_iter = grid_iter
-         self.gridsearch_method = gridsearch_method
-         self.ga_kwargs = ga_kwargs
-         self.scoring_metric = scoring_metric
-         self.sim_strategy = sim_strategy
-         self.sim_prop_missing = sim_prop_missing
-         self.n_jobs = n_jobs
-         self.verbose = verbose
-
-         self.kl_beta = kl_beta
-         self.validation_split = validation_split
-         self.nlpca = nlpca
-
-         self.run_gridsearch_ = False if self.gridparams is None else True
-         self.is_multiclass_ = True if self.num_classes != 4 else False
-
-         # Simulate missing data and get missing masks.
-         self.sim = SimGenotypeDataTransformer(
-             self.genotype_data,
-             prop_missing=self.sim_prop_missing,
-             strategy=self.sim_strategy,
-             mask_missing=True,
-         )
-
-         # Binary encode y to get y_train.
-         self.tt_ = AutoEncoderFeatureTransformer(
-             num_classes=self.num_classes, activate=self.activate
-         )
-
-     @timer
-     def fit(self, X):
-         """Train the neural network model on input data X.
-
-         Args:
-             X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Input 012-encoded genotypes.
-
-         Returns:
-             self: Current instance; allows method chaining.
-
-         Raises:
-             TypeError: Must be either pandas.DataFrame, numpy.ndarray, or List[List[int]].
-         """
-         # Treating y as X here for compatibility with UBP/NLPCA.
-         # With VAE, y=X anyways.
-         y = X
-         y = validate_input_type(y, return_type="array")
-
-         self.nn_ = NeuralNetworkMethods()
-         plotting = Plotting()
-
-         if self.gridsearch_method == "genetic_algorithm":
-             self.ga_ = True
-         else:
-             self.ga_ = False
-
-         self.y_original_ = y.copy()
-         self.y_simulated_ = self.sim.fit_transform(self.y_original_)
-
-         # Get values where original value was not missing but missing data
-         # was simulated.
-         self.sim_missing_mask_ = self.sim.sim_missing_mask_
-
-         # Original missing data.
-         self.original_missing_mask_ = self.sim.original_missing_mask_
-
-         # Both simulated and original missing data.
-         self.all_missing_ = self.sim.all_missing_mask_
-
-         # Just y_original with missing values encoded as -1.
-         y_train = self.tt_.fit_transform(self.y_original_)
-
-         if self.gridparams is not None:
-             self.scoring_metrics_ = [
-                 "average_precision_macro",
-                 "average_precision_micro",
-                 "average_precision_weighted",
-                 "f1_micro",
-                 "f1_macro",
-                 "f1_weighted",
-                 "roc_auc_macro",
-                 "roc_auc_micro",
-                 "roc_auc_weighted",
-                 "accuracy",
-                 "hamming",
-             ]
-
-         (
-             logfile,
-             callbacks,
-             compile_params,
-             model_params,
-             fit_params,
-         ) = self._initialize_parameters(y_train)
-
-         if self.nn_method_ == "VAE":
-             func = self.run_vae
-         elif self.nn_method_ == "SAE":
-             func = self.run_sae
-         elif self.nn_method_ == "NLPCA":
-             func = self.run_nlpca
-         elif self.nn_method_ == "UBP":
-             func = self.run_ubp
-         else:
-             raise ValueError(f"Invalid nn_method specified: {self.nn_method_}")
-
-         (
-             self.models_,
-             self.histories_,
-             self.best_params_,
-             self.best_score_,
-             self.best_estimator_,
-             self.search_,
-             self.metrics_,
-         ) = func(
-             self.y_original_,
-             y_train,
-             model_params,
-             compile_params,
-             fit_params,
-         )
-
-         if (
-             self.best_params_ is not None
-             and "optimizer__learning_rate" in self.best_params_
-         ):
-             self.best_params_["learning_rate"] = self.best_params_.pop(
-                 "optimizer__learning_rate"
-             )
-
-         if self.gridparams is not None:
-             if self.verbose > 0:
-                 print("\nBest found parameters:")
-                 pprint.pprint(self.best_params_)
-                 print(f"\nBest score: {self.best_score_}")
-             plotting.plot_grid_search(
-                 self.search_.cv_results_, self.nn_method_, self.prefix
-             )
-
-         plotting.plot_history(
-             self.histories_, self.nn_method_, prefix=self.prefix
-         )
-         plotting.plot_metrics(
-             self.metrics_, self.num_classes, self.prefix, self.nn_method_
-         )
-
-         if self.ga_:
-             plot_fitness_evolution(self.search_)
-             plt.savefig(
-                 os.path.join(
-                     f"{self.prefix}_output",
-                     "plots",
-                     "Unsupervised",
-                     self.nn_method_,
-                     "fitness_evolution.pdf",
-                 ),
-                 bbox_inches="tight",
-                 facecolor="white",
-             )
-             plt.cla()
-             plt.clf()
-             plt.close()
-
-             g = plotting.plot_search_space(self.search_)
-             plt.savefig(
-                 os.path.join(
-                     f"{self.prefix}_output",
-                     "plots",
-                     "Unsupervised",
-                     self.nn_method_,
-                     "search_space.pdf",
-                 ),
-                 bbox_inches="tight",
-                 facecolor="white",
-             )
-             plt.cla()
-             plt.clf()
-             plt.close()
-
-         return self
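The three masks captured in fit() drive everything downstream: entries that were observed but deliberately masked out are the held-out ground truth for scoring. A toy sketch of the bookkeeping (mask names follow the attributes above; the transformer internals are elided):

    import numpy as np

    y = np.array([[0, 1, -1],
                  [2, -1, 0]])            # 012-encoded; -1 = truly missing
    original_missing_mask = y == -1

    # Suppose the simulation additionally hid the known value at (0, 0):
    sim_missing_mask = np.array([[True, False, False],
                                 [False, False, False]])
    all_missing_mask = original_missing_mask | sim_missing_mask

    # Scoring compares predictions against y[sim_missing_mask], i.e., the
    # entries that were known but hidden from the model.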
-
-     def transform(self, X):
-         """Predict and decode imputations and return transformed array.
-
-         Args:
-             X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Input data to transform.
-
-         Returns:
-             numpy.ndarray: Imputed data.
-         """
-         y = X
-         y = validate_input_type(y, return_type="array")
-
-         if self.nn_method_ not in ["UBP", "NLPCA"]:
-             model = self.models_[0]
-         else:
-             if len(self.models_) == 1:
-                 model = self.models_[0]
-             else:
-                 model = self.models_[-1]
-
-         y_true = y.copy()
-         y_train = self.tt_.transform(y_true)
-         y_true_1d = y_true.ravel()
-         y_size = y_true.size
-         y_missing_idx = np.flatnonzero(self.original_missing_mask_)
-
-         if self.nn_method_ == "VAE":
-             y_pred = model(
-                 tf.convert_to_tensor(y_train),
-                 training=False,
-             )
-         elif self.nn_method_ == "SAE":
-             y_pred = model(y_train, training=False)
-         else:
-             y_pred = model(model.V_latent, training=False)
-         y_pred = self.tt_.inverse_transform(y_pred)
-
-         y_pred_decoded = self.nn_.decode_masked(
-             y_train,
-             y_pred,
-             is_multiclass=self.is_multiclass_,
-         )
-         # y_pred_decoded, y_pred_certainty = self.nn_.decode_masked(
-         #     y_train, y_pred, return_proba=True
-         # )
-
-         y_pred_1d = y_pred_decoded.ravel()
-
-         # Only replace originally missing values at missing indexes.
-         for i in np.arange(y_size):
-             if i in y_missing_idx:
-                 y_true_1d[i] = y_pred_1d[i]
-
-         self.nn_.write_gt_state_probs(
-             y_pred,
-             y_pred_1d,
-             y_true,
-             y_true_1d,
-             self.nn_method_,
-             self.sim_missing_mask_,
-             self.original_missing_mask_,
-             prefix=self.prefix,
-         )
-
-         Plotting.plot_confusion_matrix(
-             y_true_1d, y_pred_1d, self.nn_method_, prefix=self.prefix
-         )
-
-         # if self.nn_method_ == "VAE":
-         #     Plotting.plot_label_clusters(z_mean, y_true_1d)
-
-         # Return to original shape.
-         return np.reshape(y_true_1d, y_true.shape)
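The replacement loop in transform() scans every element and tests membership in y_missing_idx, which is O(n * m); NumPy fancy indexing gives a behavior-identical one-liner. A self-contained sketch (toy arrays, not pg-sui data):

    import numpy as np

    y_true_1d = np.array([0, 1, 2, 0, 1])
    y_pred_1d = np.array([2, 2, 2, 2, 2])
    y_missing_idx = np.array([1, 3])  # positions that were originally missing

    # Equivalent to the loop: overwrite only the originally missing entries.
    y_true_1d[y_missing_idx] = y_pred_1d[y_missing_idx]  # -> [0, 2, 2, 2, 1]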
-
-     def run_clf(
-         self,
-         y_train,
-         y_true,
-         model_params,
-         compile_params,
-         fit_params,
-         ubp_weights=None,
-         phase=None,
-         testing=False,
-         **kwargs,
-     ):
-         """Run KerasClassifier with neural network model and grid search.
-
-         Args:
-             y_train (numpy.ndarray): Onehot-encoded training input data of shape (n_samples, n_features, num_classes).
-
-             y_true (numpy.ndarray): Original 012-encoded input data of shape (n_samples, n_features).
-
-             model_params (Dict[str, Any]): Dictionary with model parameters to be passed to the KerasClassifier model.
-
-             compile_params (Dict[str, Any]): Dictionary with params to be passed to Keras model.compile() in KerasClassifier.
-
-             fit_params (Dict[str, Any]): Dictionary with parameters to be passed to fit in KerasClassifier.
-
-         Returns:
-             List[tf.keras.Model]: List of keras model objects. One for each phase (len=1 if NLPCA, len=3 if UBP).
-
-             List[Dict[str, float]]: List of dictionaries with best neural network model history.
-
-             Dict[str, Any] or None: Best parameters found during a grid search, or None if a grid search was not run.
-
-             float: Best score obtained during grid search.
-
-             tf.keras.Model: Best model found during grid search.
-
-             sklearn.model_selection object (GridSearchCV, RandomizedSearchCV) or GASearchCV object.
-
-             Dict[str, Any]: Per-class, micro, and macro-averaged metrics including accuracy, ROC-AUC, and Precision-Recall with Average Precision scores.
-         """
-         # This reduces memory usage.
-         # tensorflow builds graphs that
-         # will stack if not cleared before
-         # building a new model.
-         tf.keras.backend.clear_session()
-         self.nn_.reset_seeds()
-
-         model = None
-         if self.nn_method_ in ["UBP", "NLPCA"]:
-             V = model_params.pop("V")
-             if phase is not None:
-                 desc = f"Epoch (Phase {phase}): "
-             else:
-                 desc = "Epoch: "
-
-         else:
-             desc = "Epoch: "
-
-         if not self.disable_progressbar and not self.run_gridsearch_:
-             fit_params["callbacks"][-1] = TqdmCallback(
-                 epochs=self.epochs, verbose=self.verbose, desc=desc
-             )
-
-         if self.nn_method_ == "VAE":
-             clf = VAEClassifier(
-                 **model_params,
-                 optimizer=compile_params["optimizer"],
-                 optimizer__learning_rate=compile_params["learning_rate"],
-                 loss=compile_params["loss"],
-                 metrics=compile_params["metrics"],
-                 run_eagerly=compile_params["run_eagerly"],
-                 callbacks=fit_params["callbacks"],
-                 epochs=fit_params["epochs"],
-                 verbose=0,
-                 num_classes=self.num_classes,
-                 activate=self.act_func_,
-                 fit__validation_split=fit_params["validation_split"],
-                 score__missing_mask=self.sim_missing_mask_,
-                 score__scoring_metric=self.scoring_metric,
-                 score__num_classes=self.num_classes,
-                 score__n_classes=self.num_classes,
-             )
-         elif self.nn_method_ == "SAE":
-             clf = SAEClassifier(
-                 **model_params,
-                 optimizer=compile_params["optimizer"],
-                 optimizer__learning_rate=compile_params["learning_rate"],
-                 loss=compile_params["loss"],
-                 metrics=compile_params["metrics"],
-                 callbacks=fit_params["callbacks"],
-                 epochs=fit_params["epochs"],
-                 verbose=0,
-                 activate=self.act_func_,
-                 fit__validation_split=fit_params["validation_split"],
-                 score__missing_mask=self.sim_missing_mask_,
-                 score__scoring_metric=self.scoring_metric,
-                 score__num_classes=self.num_classes,
-                 score__n_classes=self.num_classes,
-             )
-         else:
-             clf = MLPClassifier(
-                 V,
-                 y_train,
-                 **model_params,
-                 ubp_weights=ubp_weights,
-                 optimizer=compile_params["optimizer"],
-                 optimizer__learning_rate=compile_params["learning_rate"],
-                 loss=compile_params["loss"],
-                 metrics=compile_params["metrics"],
-                 epochs=fit_params["epochs"],
-                 phase=phase,
-                 callbacks=fit_params["callbacks"],
-                 validation_split=fit_params["validation_split"],
-                 verbose=0,
-                 activate=self.act_func_,
-                 score__missing_mask=self.sim_missing_mask_,
-                 score__scoring_metric=self.scoring_metric,
-             )
-
-         if self.run_gridsearch_:
-             # Cannot do CV because there is no way to use test splits
-             # given that the input gets refined. If using a test split,
-             # then it would just be the randomly initialized values and
-             # would not accurately represent the model.
-             # Thus, we disable cross-validation for the grid searches.
-             cross_val = DisabledCV()
-             verbose = False if self.verbose == 0 else True
-
-             scorers = Scorers()
-             scoring = scorers.make_multimetric_scorer(
-                 self.scoring_metrics_,
-                 self.sim_missing_mask_,
-                 num_classes=self.num_classes,
-             )
-
-             if self.ga_:
-                 # Stop searching if GA sees no improvement.
-                 callback = [
-                     ConsecutiveStopping(
-                         generations=self.early_stop_gen, metric="fitness"
-                     )
-                 ]
-
-                 if not self.disable_progressbar:
-                     callback.append(ProgressBar())
-
-                 # Do genetic algorithm
-                 # with HiddenPrints():
-                 search = GASearchCV(
-                     estimator=clf,
-                     cv=cross_val,
-                     scoring=scoring,
-                     generations=self.grid_iter,
-                     param_grid=self.gridparams,
-                     n_jobs=self.n_jobs,
-                     refit=self.scoring_metric,
-                     verbose=verbose,
-                     **self.ga_kwargs,
-                     error_score="raise",
-                 )
-
-                 if self.nn_method_ in ["UBP", "NLPCA"]:
-                     search.fit(V[self.n_components], y=y_true)
-                 else:
-                     search.fit(y_true, y_true, callbacks=callback)
-
-             else:
-                 # Write GridSearchCV to log file instead of STDOUT.
-                 if self.verbose >= 10:
-                     old_stdout = sys.stdout
-                     log_file = open(
-                         os.path.join(
-                             f"{self.prefix}_output",
-                             "logs",
-                             "Unsupervised",
-                             self.nn_method_,
-                             "gridsearch_progress_log.txt",
-                         ),
-                         "w",
-                     )
-                     sys.stdout = log_file
-
-                 if self.gridsearch_method.lower() == "gridsearch":
-                     # Do GridSearchCV
-                     search = GridSearchCV(
-                         clf,
-                         param_grid=self.gridparams,
-                         n_jobs=self.n_jobs,
-                         cv=cross_val,
-                         scoring=scoring,
-                         refit=self.scoring_metric,
-                         verbose=self.verbose * 4,
-                         error_score="raise",
-                     )
-
-                 elif self.gridsearch_method.lower() == "randomized_gridsearch":
-                     search = RandomizedSearchCV(
-                         clf,
-                         param_distributions=self.gridparams,
-                         n_iter=self.grid_iter,
-                         n_jobs=self.n_jobs,
-                         cv=cross_val,
-                         scoring=scoring,
-                         refit=self.scoring_metric,
-                         verbose=verbose * 4,
-                         error_score="raise",
-                     )
-
-                 else:
-                     raise ValueError(
-                         f"Invalid gridsearch_method specified: "
-                         f"{self.gridsearch_method}"
-                     )
-
-                 if self.nn_method_ in ["UBP", "NLPCA"]:
-                     search.fit(V[self.n_components], y=y_true)
-                 else:
-                     search.fit(y_true, y=y_true)
-
-                 if self.verbose >= 10:
-                     # Make sure to revert STDOUT back to original.
-                     sys.stdout = old_stdout
-                     log_file.close()
-
-             best_params = search.best_params_
-             best_score = search.best_score_
-             best_clf = search.best_estimator_
-
-             fp = os.path.join(
-                 f"{self.prefix}_output",
-                 "reports",
-                 "Unsupervised",
-                 self.nn_method_,
-                 f"cvresults_{self.nn_method_}.csv",
-             )
-
-             cv_results = pd.DataFrame(search.cv_results_)
-             cv_results.to_csv(fp, index=False)
-
-         else:
-             if self.nn_method_ in ["UBP", "NLPCA"]:
-                 clf.fit(V[self.n_components], y=y_true)
-             else:
-                 clf.fit(y_true, y=y_true)
-             best_params = None
-             best_score = None
-             search = None
-             best_clf = clf
-
-         model = best_clf.model_
-         best_history = best_clf.history_
-
-         if self.nn_method_ == "VAE":
-             y_pred = model(
-                 tf.convert_to_tensor(y_train),
-                 training=False,
-             )
-             y_pred = self.tt_.inverse_transform(y_pred)
-         elif self.nn_method_ == "SAE":
-             y_pred = model(y_train, training=False)
-             y_pred = self.tt_.inverse_transform(y_pred)
-         elif self.nn_method_ in ["UBP", "NLPCA"]:
-             # Third run_clf function
-             y_pred_proba = model(model.V_latent, training=False)
-             y_pred = self.tt_.inverse_transform(y_pred_proba)
-
-         # Get metric scores.
-         metrics = Scorers.scorer(
-             y_true,
-             y_pred,
-             missing_mask=self.sim_missing_mask_,
-             num_classes=self.num_classes,
-             testing=self.testing,
-         )
-
-         if self.nn_method_ in ["UBP", "NLPCA"]:
-             return (
-                 V,
-                 model,
-                 best_history,
-                 best_params,
-                 best_score,
-                 best_clf,
-                 search,
-                 metrics,
-             )
-         else:
-             return (
-                 model,
-                 best_history,
-                 best_params,
-                 best_score,
-                 best_clf,
-                 search,
-                 metrics,
-             )
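Cross-validation is disabled in run_clf() because the latent input V is refined during training, so a held-out fold would score only its random initialization. DisabledCV itself lives in neural_network_methods and is not shown in this diff; a single-split stand-in compatible with scikit-learn's CV interface might look like this (an assumption, not the actual implementation):

    import numpy as np

    class DisabledCV:
        """Yield one 'split' whose train and test sets are the full dataset."""

        def __init__(self):
            self.n_splits = 1

        def split(self, X, y=None, groups=None):
            indices = np.arange(len(X))
            yield indices, indices  # train == test == all rows

        def get_n_splits(self, X=None, y=None, groups=None):
            return self.n_splits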
-
-     def _initialize_parameters(self, y_train):
-         """Initialize important parameters.
-
-         Args:
-             y_train (numpy.ndarray): Training subset of original input data.
-
-         Returns:
-             str: Path to the CSVLogger() training log file.
-             List[tf.keras.callbacks.Callback]: Callbacks for model fitting.
-             Dict[str, Any]: Parameters to use for model.compile().
-             Dict[str, Any]: Other parameters to pass to KerasClassifier().
-             Dict[str, Any]: Parameters to pass to fit_params() in grid search.
-         """
-         # For CSVLogger() callback.
-
-         append = True if self.nn_method_ == "UBP" else False
-         logdir = os.path.join(
-             f"{self.prefix}_output",
-             "logs",
-             "Unsupervised",
-             self.nn_method_,
-         )
-
-         Path(logdir).mkdir(parents=True, exist_ok=True)
-         logfile = os.path.join(logdir, "training_log.csv")
-
-         callbacks = [
-             CSVLogger(filename=logfile, append=append),
-             ReduceLROnPlateau(
-                 patience=self.lr_patience, min_lr=1e-6, min_delta=1e-6
-             ),
-         ]
-
-         if self.nn_method_ in ["VAE", "SAE"]:
-             callbacks.append(VAECallbacks())
-
-             if self.nn_method_ == "VAE":
-                 callbacks.append(
-                     CyclicalAnnealingCallback(
-                         self.epochs, schedule_type="sigmoid"
-                     )
-                 )
-         else:
-             callbacks.append(UBPCallbacks())
-
-         search_mode = True if self.run_gridsearch_ else False
-
-         if not self.disable_progressbar and not search_mode:
-             callbacks.append(
-                 TqdmCallback(epochs=self.epochs, verbose=0, desc="Epoch: ")
-             )
-
-         if self.nn_method_ in ["UBP", "NLPCA"]:
-             vinput = self._initV(y_train, search_mode)
-
-         if self.sample_weights == "auto" or self.sample_weights == "logsmooth":
-             # Get class weights for each column.
-             sample_weights = self.nn_.get_class_weights(
-                 self.y_original_,
-                 self.original_missing_mask_,
-                 return_1d=False,
-                 method=self.sample_weights,
-             )
-             sample_weights = self.nn_.normalize_data(sample_weights)
-
-         elif isinstance(self.sample_weights, dict):
-             for i in range(self.num_classes):
-                 if self.sample_weights[i] == 0.0:
-                     self.sim_missing_mask_[self.y_original_ == i] = False
-
-             sample_weights = self.nn_.get_class_weights(
-                 self.y_original_, user_weights=self.sample_weights
-             )
-
-         else:
-             sample_weights = None
-
-         compile_params = self.nn_.set_compile_params(
-             self.optimizer,
-             sample_weights,
-             vae=False,
-             act_func="sigmoid",
-         )
-
-         compile_params["learning_rate"] = self.learning_rate
-
-         if self.nn_method_ in ["VAE", "SAE"]:
-             model_params = {
-                 "y": y_train,
-                 "batch_size": self.batch_size,
-                 "sample_weight": sample_weights,
-                 "missing_mask": self.original_missing_mask_,
-                 "output_shape": y_train.shape[1],
-                 "weights_initializer": self.weights_initializer,
-                 "n_components": self.n_components,
-                 "hidden_layer_sizes": self.hidden_layer_sizes,
-                 "num_hidden_layers": self.num_hidden_layers,
-                 "hidden_activation": self.hidden_activation,
-                 "l1_penalty": self.l1_penalty,
-                 "l2_penalty": self.l2_penalty,
-                 "dropout_rate": self.dropout_rate,
-             }
-
-             if self.nn_method_ == "VAE":
-                 model_params["kl_beta"] = (1.0 / y_train.shape[0],)
-         else:
-             model_params = {
-                 "V": vinput,
-                 "y_train": y_train,
-                 "batch_size": self.batch_size,
-                 "missing_mask": self.original_missing_mask_,
-                 "output_shape": y_train.shape[1],
-                 "weights_initializer": self.weights_initializer,
-                 "n_components": self.n_components,
-                 "hidden_layer_sizes": self.hidden_layer_sizes,
-                 "num_hidden_layers": self.num_hidden_layers,
-                 "hidden_activation": self.hidden_activation,
-                 "l1_penalty": self.l1_penalty,
-                 "l2_penalty": self.l2_penalty,
-                 "dropout_rate": self.dropout_rate,
-                 "num_classes": self.num_classes,
-             }
-
-         model_params["sample_weight"] = sample_weights
-
-         fit_verbose = 1 if self.verbose == 2 else 0
-
-         fit_params = {
-             "batch_size": self.batch_size,
-             "epochs": self.epochs,
-             "callbacks": callbacks,
-             "verbose": fit_verbose,
-             "sample_weight": sample_weights,
-         }
-
-         if self.nn_method_ in ["VAE", "SAE"]:
-             fit_params["validation_split"] = self.validation_split
-         else:
-             fit_params["validation_split"] = self.validation_split
-
-         fit_params["shuffle"] = False
-
-         if self.run_gridsearch_ and "learning_rate" in self.gridparams:
-             self.gridparams["optimizer__learning_rate"] = self.gridparams[
-                 "learning_rate"
-             ]
-
-             self.gridparams.pop("learning_rate")
-
-         return (
-             logfile,
-             callbacks,
-             compile_params,
-             model_params,
-             fit_params,
-         )
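For the VAE, _initialize_parameters() attaches CyclicalAnnealingCallback(self.epochs, schedule_type="sigmoid"), which cycles the KL weight kl_beta between 0 and 1 during training, a common tactic against posterior collapse. The callback's implementation is not in this diff; a minimal sketch of one sigmoid cyclical schedule (the cycle count and steepness below are assumptions, not pg-sui's values):

    import numpy as np

    def sigmoid_cyclical_beta(epoch, total_epochs, n_cycles=4, ratio=0.5):
        """KL weight in [0, 1] that rises along a sigmoid within each cycle."""
        period = total_epochs / n_cycles
        t = (epoch % period) / period      # position within the cycle, [0, 1)
        if t > ratio:
            return 1.0                     # hold at full KL weight
        # Map [0, ratio] onto a sigmoid rising from ~0 to ~1.
        return float(1.0 / (1.0 + np.exp(-12.0 * (t / ratio - 0.5))))

    betas = [sigmoid_cyclical_beta(e, 100) for e in range(100)]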
-
-
- class VAE(BaseNNImputer):
-     """Class to impute missing data using a Variational Autoencoder neural network."""
-
-     def __init__(
-         self,
-         kl_beta=tf.Variable(1.0, trainable=False),
-         validation_split=0.2,
-         **kwargs,
-     ):
-         self.kl_beta = kl_beta
-         self.validation_split = validation_split
-
-         self.nn_method_ = "VAE"
-         self.num_classes = 4
-         self.activate = None
-         self.is_multiclass_ = True if self.num_classes != 4 else False
-         self.testing = kwargs.get("testing", False)
-         self.do_act_in_model_ = True if self.activate is None else False
-
-         self.act_func_ = None
-
-         super().__init__(
-             self.activate,
-             self.nn_method_,
-             self.num_classes,
-             self.act_func_,
-             **kwargs,
-             kl_beta=self.kl_beta,
-             validation_split=self.validation_split,
-         )
-
-     def run_vae(
-         self,
-         y_true,
-         y_train,
-         model_params,
-         compile_params,
-         fit_params,
-     ):
-         """Run VAE using custom subclassed model.
-
-         Args:
-             y_true (numpy.ndarray): Original genotypes (training dataset) with known and missing values, of shape (n_samples, n_features).
-
-             y_train (numpy.ndarray): Onehot encoded genotypes (training dataset) with known and missing values, of shape (n_samples, n_features, num_classes).
-
-             model_params (Dict[str, Any]): Dictionary with parameters to pass to the classifier model.
-
-             compile_params (Dict[str, Any]): Dictionary with parameters to pass to the tensorflow compile function.
-
-             fit_params (Dict[str, Any]): Dictionary with parameters to pass to the fit() function.
-
-         Returns:
-             List[tf.keras.Model]: List of keras model objects. One for each phase (len=1 if NLPCA, len=3 if UBP).
-
-             List[Dict[str, float]]: List of dictionaries with best neural network model history.
-
-             Dict[str, Any] or None: Best parameters found during a grid search, or None if a grid search was not run.
-
-             float: Best score obtained during grid search.
-
-             tf.keras.Model: Best model found during grid search.
-
-             sklearn.model_selection object (GridSearchCV, RandomizedSearchCV) or GASearchCV object.
-
-             Dict[str, Any]: Per-class, micro, and macro-averaged metrics including accuracy, ROC-AUC, and Precision-Recall with Average Precision scores.
-         """
-         scorers = Scorers()
-         scoring = None
-
-         histories = list()
-         models = list()
-
-         (
-             model,
-             best_history,
-             best_params,
-             best_score,
-             best_clf,
-             search,
-             metrics,
-         ) = self.run_clf(
-             y_train,
-             y_true,
-             model_params,
-             compile_params,
-             fit_params,
-         )
-
-         histories.append(best_history)
-         models.append(model)
-         del model
-
-         return (
-             models,
-             histories,
-             best_params,
-             best_score,
-             best_clf,
-             search,
-             metrics,
-         )
-
-
- class SAE(BaseNNImputer):
-     def __init__(
-         self,
-         **kwargs,
-     ):
-         self.num_classes = 4
-         self.is_multiclass_ = True if self.num_classes != 4 else False
-         self.activate = None
-         self.nn_method_ = "SAE"
-         self.act_func_ = None
-         self.testing = kwargs.get("testing", False)
-
-         super().__init__(
-             self.activate,
-             self.nn_method_,
-             self.num_classes,
-             self.act_func_,
-             **kwargs,
-         )
-
-     def run_sae(
-         self,
-         y_true,
-         y_train,
-         model_params,
-         compile_params,
-         fit_params,
-     ):
-         """Run standard autoencoder using custom subclassed model.
-
-         Args:
-             y_true (numpy.ndarray): Original genotypes (training dataset) with known and missing values of shape (n_samples, n_features).
-
-             y_train (numpy.ndarray): Onehot-encoded genotypes (training dataset) with known and missing values of shape (n_samples, n_features, num_classes).
-
-             model_params (Dict[str, Any]): Dictionary with parameters to pass to the classifier model.
-
-             compile_params (Dict[str, Any]): Dictionary with parameters to pass to the tensorflow compile function.
-
-             fit_params (Dict[str, Any]): Dictionary with parameters to pass to the fit() function.
-
-         Returns:
-             List[tf.keras.Model]: List of keras model objects. One for each phase (len=1 if NLPCA, len=3 if UBP).
-
-             List[Dict[str, float]]: List of dictionaries with best neural network model history.
-
-             Dict[str, Any] or None: Best parameters found during a grid search, or None if a grid search was not run.
-
-             float: Best score obtained during grid search.
-
-             tf.keras.Model: Best model found during grid search.
-
-             sklearn.model_selection object (GridSearchCV, RandomizedSearchCV) or GASearchCV object.
-
-             Dict[str, Any]: Per-class, micro, and macro-averaged metrics including accuracy, ROC-AUC, and Precision-Recall with Average Precision scores.
-         """
-         scorers = Scorers()
-         scoring = None
-
-         histories = list()
-         models = list()
-
-         (
-             model,
-             best_history,
-             best_params,
-             best_score,
-             best_clf,
-             search,
-             metrics,
-         ) = self.run_clf(
-             y_train,
-             y_true,
-             model_params,
-             compile_params,
-             fit_params,
-             testing=False,
-         )
-
-         histories.append(best_history)
-         models.append(model)
-         del model
-
-         return (
-             models,
-             histories,
-             best_params,
-             best_score,
-             best_clf,
-             search,
-             metrics,
-         )
-
-
- class UBP(BaseNNImputer):
-     def __init__(
-         self,
-         *,
-         nlpca=False,
-         **kwargs,
-     ):
-         # TODO: Make estimators compatible with variable number of classes.
-         # E.g., with morphological data.
-         self.nlpca = nlpca
-         self.nn_method_ = "NLPCA" if self.nlpca else "UBP"
-         self.num_classes = 4
-         self.is_multiclass_ = True if self.num_classes != 4 else False
-         self.testing = kwargs.get("testing", False)
-         self.activate = None
-         self.act_func_ = None
-
-         super().__init__(
-             self.activate,
-             self.nn_method_,
-             self.num_classes,
-             self.act_func_,
-             **kwargs,
-             nlpca=self.nlpca,
-         )
-
-     def run_nlpca(
-         self,
-         y_true,
-         y_train,
-         model_params,
-         compile_params,
-         fit_params,
-     ):
-         """Run NLPCA using custom subclassed model.
-
-         Args:
-             y_true (numpy.ndarray): Original genotypes with known and missing values.
-
-             y_train (numpy.ndarray): For compatibility with VAE and SAE. Not used.
-
-             model_params (Dict[str, Any]): Dictionary with parameters to pass to the classifier model.
-
-             compile_params (Dict[str, Any]): Dictionary with parameters to pass to the tensorflow compile function.
-
-             fit_params (Dict[str, Any]): Dictionary with parameters to pass to the fit() function.
-
-         Returns:
-             List[tf.keras.Model]: List of keras model objects. One for each phase (len=1 if NLPCA, len=3 if UBP).
-
-             List[Dict[str, float]]: List of dictionaries with best neural network model history.
-
-             Dict[str, Any] or None: Best parameters found during a grid search, or None if a grid search was not run.
-
-             float: Best score obtained during grid search.
-
-             tf.keras.Model: Best model found during grid search.
-
-             sklearn.model_selection object (GridSearchCV, RandomizedSearchCV) or GASearchCV object.
-
-             Dict[str, Any]: Per-class, micro, and macro-averaged metrics including accuracy, ROC-AUC, and Precision-Recall with Average Precision scores.
-         """
-         scorers = Scorers()
-
-         histories = list()
-         models = list()
-         y_train = model_params.pop("y_train")
-         ubp_weights = None
-         phase = None
-
-         (
-             V,
-             model,
-             best_history,
-             best_params,
-             best_score,
-             best_clf,
-             search,
-             metrics,
-         ) = self.run_clf(
-             y_train,
-             y_true,
-             model_params,
-             compile_params,
-             fit_params,
-             ubp_weights=ubp_weights,
-             phase=phase,
-             testing=False,
-         )
-
-         histories.append(best_history)
-         models.append(model)
-         del model
-
-         return (
-             models,
-             histories,
-             best_params,
-             best_score,
-             best_clf,
-             search,
-             metrics,
-         )
-
-     def run_ubp(
-         self,
-         y_true,
-         y_train,
-         model_params,
-         compile_params,
-         fit_params,
-     ):
-         """Run UBP using custom subclassed model.
-
-         Args:
-             y_true (numpy.ndarray): Original genotypes with known and missing values.
-
-             y_train (numpy.ndarray): For compatibility with VAE and SAE. Not used.
-
-             model_params (Dict[str, Any]): Dictionary with parameters to pass to the classifier model.
-
-             compile_params (Dict[str, Any]): Dictionary with parameters to pass to the tensorflow compile function.
-
-             fit_params (Dict[str, Any]): Dictionary with parameters to pass to the fit() function.
-
-         Returns:
-             List[tf.keras.Model]: List of keras model objects. One for each phase (len=1 if NLPCA, len=3 if UBP).
-
-             List[Dict[str, float]]: List of dictionaries with best neural network model history.
-
-             Dict[str, Any] or None: Best parameters found during a grid search, or None if a grid search was not run.
-
-             float: Best score obtained during grid search.
-
-             tf.keras.Model: Best model found during grid search.
-
-             sklearn.model_selection object (GridSearchCV, RandomizedSearchCV) or GASearchCV object.
-
-             Dict[str, Any]: Per-class, micro, and macro-averaged metrics including accuracy, ROC-AUC, and Precision-Recall with Average Precision scores.
-         """
-         scorers = Scorers()
-
-         histories = list()
-         models = list()
-         search_n_components = False
-
-         y_train = model_params.pop("y_train")
-
-         if self.run_gridsearch_:
-             # Cannot do CV because there is no way to use test splits
-             # given that the input gets refined. If using a test split,
-             # then it would just be the randomly initialized values and
-             # would not accurately represent the model.
-             # Thus, we disable cross-validation for the grid searches.
-             scoring = scorers.make_multimetric_scorer(
-                 self.scoring_metrics_,
-                 self.sim_missing_mask_,
-                 num_classes=self.num_classes,
-             )
-
-             if "n_components" in self.gridparams:
-                 search_n_components = True
-                 n_components_searched = self.n_components
-         else:
-             scoring = None
-
-         for phase in range(1, 4):
-             ubp_weights = models[1].get_weights() if phase == 3 else None
-
-             (
-                 V,
-                 model,
-                 best_history,
-                 best_params,
-                 best_score,
-                 best_clf,
-                 search,
-                 metrics,
-             ) = self.run_clf(
-                 y_train,
-                 y_true,
-                 model_params,
-                 compile_params,
-                 fit_params,
-                 ubp_weights=ubp_weights,
-                 phase=phase,
-                 testing=False,
-             )
-
-             if phase == 1:
-                 # Cannot have V input with different n_components
-                 # in other phases than are in phase 1.
-                 # So the n_components search has to happen in phase 1.
-                 if best_params is not None and search_n_components:
-                     n_components_searched = best_params["n_components"]
-                     model_params["V"] = {
-                         n_components_searched: model.V_latent.copy()
-                     }
-                     model_params["n_components"] = n_components_searched
-                     self.n_components = n_components_searched
-                     self.gridparams.pop("n_components")
-
-                 else:
-                     model_params["V"] = V
-             elif phase == 2:
-                 model_params["V"] = V
-
-             elif phase == 3:
-                 if best_params is not None and search_n_components:
-                     best_params["n_components"] = n_components_searched
-
-             histories.append(best_history)
-             models.append(model)
-             del model
-
-         return (
-             models,
-             histories,
-             best_params,
-             best_score,
-             best_clf,
-             search,
-             metrics,
-         )
-
-     def _initV(self, y_train, search_mode):
-         """Initialize random input V as dictionary of numpy arrays.
-
-         Args:
-             y_train (numpy.ndarray): One-hot encoded training dataset (actual data).
-
-             search_mode (bool): Whether doing grid search.
-
-         Returns:
-             Dict[int, numpy.ndarray]: Dictionary with n_components: V as key-value pairs.
-
-         Raises:
-             ValueError: Number of components must be >= 2.
-         """
-         vinput = dict()
-         if search_mode:
-             if "n_components" in self.gridparams:
-                 n_components = self.gridparams["n_components"]
-             else:
-                 n_components = self.n_components
-
-             if not isinstance(n_components, int):
-                 if min(n_components) < 2:
-                     raise ValueError(
-                         f"n_components must be >= 2, but a value of {n_components} was specified."
-                     )
-
-                 elif len(n_components) == 1:
-                     vinput[n_components[0]] = self.nn_.init_weights(
-                         y_train.shape[0], n_components[0]
-                     )
-
-                 else:
-                     for c in n_components:
-                         vinput[c] = self.nn_.init_weights(y_train.shape[0], c)
-             else:
-                 vinput[self.n_components] = self.nn_.init_weights(
-                     y_train.shape[0], self.n_components
-                 )
-
-         else:
-             vinput[self.n_components] = self.nn_.init_weights(
-                 y_train.shape[0], self.n_components
-             )
-
-         return vinput
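For context, the classes removed here followed the scikit-learn fit/transform idiom. A hypothetical usage sketch of the deleted 1.0.x-era interface, reconstructed from the constructors above (gd and gt012 are placeholder names for a GenotypeData instance and a 012-encoded genotype matrix; this is not a tested recipe):

    # Hypothetical placeholders: gd = GenotypeData(...); gt012 = 012-encoded array.
    vae = VAE(genotype_data=gd, prefix="run1", epochs=100, n_components=3)
    imputed = vae.fit_transform(gt012)        # returns a 012-encoded array

    ubp = UBP(genotype_data=gd, nlpca=False)  # nlpca=True runs NLPCA instead
    imputed_ubp = ubp.fit_transform(gt012)

In 1.6.8 these imputers live under pgsui/impute/unsupervised/imputers/ (see the file list above).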