pg-sui 0.2.3__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +99 -77
  2. pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
  3. {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
  5. {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
  6. pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
  7. pgsui/__init__.py +35 -54
  8. pgsui/_version.py +34 -0
  9. pgsui/cli.py +909 -0
  10. pgsui/data_processing/__init__.py +0 -0
  11. pgsui/data_processing/config.py +565 -0
  12. pgsui/data_processing/containers.py +1424 -0
  13. pgsui/data_processing/transformers.py +557 -907
  14. pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  15. pgsui/electron/app/__main__.py +5 -0
  16. pgsui/electron/app/extra-resources/.gitkeep +1 -0
  17. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  18. pgsui/electron/app/icons/icons/128x128.png +0 -0
  19. pgsui/electron/app/icons/icons/16x16.png +0 -0
  20. pgsui/electron/app/icons/icons/24x24.png +0 -0
  21. pgsui/electron/app/icons/icons/256x256.png +0 -0
  22. pgsui/electron/app/icons/icons/32x32.png +0 -0
  23. pgsui/electron/app/icons/icons/48x48.png +0 -0
  24. pgsui/electron/app/icons/icons/512x512.png +0 -0
  25. pgsui/electron/app/icons/icons/64x64.png +0 -0
  26. pgsui/electron/app/icons/icons/icon.icns +0 -0
  27. pgsui/electron/app/icons/icons/icon.ico +0 -0
  28. pgsui/electron/app/main.js +227 -0
  29. pgsui/electron/app/package-lock.json +6894 -0
  30. pgsui/electron/app/package.json +51 -0
  31. pgsui/electron/app/preload.js +15 -0
  32. pgsui/electron/app/server.py +157 -0
  33. pgsui/electron/app/ui/logo.png +0 -0
  34. pgsui/electron/app/ui/renderer.js +131 -0
  35. pgsui/electron/app/ui/styles.css +59 -0
  36. pgsui/electron/app/ui/ui_shim.js +72 -0
  37. pgsui/electron/bootstrap.py +43 -0
  38. pgsui/electron/launch.py +57 -0
  39. pgsui/electron/package.json +14 -0
  40. pgsui/example_data/__init__.py +0 -0
  41. pgsui/example_data/phylip_files/__init__.py +0 -0
  42. pgsui/example_data/phylip_files/test.phy +0 -0
  43. pgsui/example_data/popmaps/__init__.py +0 -0
  44. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  45. pgsui/example_data/structure_files/__init__.py +0 -0
  46. pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
  47. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  48. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  49. pgsui/impute/__init__.py +0 -0
  50. pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
  51. pgsui/impute/deterministic/imputers/mode.py +844 -0
  52. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  53. pgsui/impute/deterministic/imputers/phylo.py +973 -0
  54. pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
  55. pgsui/impute/supervised/__init__.py +0 -0
  56. pgsui/impute/supervised/base.py +343 -0
  57. pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  58. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
  59. pgsui/impute/supervised/imputers/random_forest.py +291 -0
  60. pgsui/impute/unsupervised/__init__.py +0 -0
  61. pgsui/impute/unsupervised/base.py +1118 -0
  62. pgsui/impute/unsupervised/callbacks.py +92 -262
  63. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
  64. pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
  65. pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
  66. pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
  67. pgsui/impute/unsupervised/imputers/vae.py +1228 -0
  68. pgsui/impute/unsupervised/loss_functions.py +261 -0
  69. pgsui/impute/unsupervised/models/__init__.py +0 -0
  70. pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
  71. pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
  72. pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
  73. pgsui/impute/unsupervised/models/vae_model.py +269 -630
  74. pgsui/impute/unsupervised/nn_scorers.py +255 -0
  75. pgsui/utils/__init__.py +0 -0
  76. pgsui/utils/classification_viz.py +608 -0
  77. pgsui/utils/logging_utils.py +22 -0
  78. pgsui/utils/misc.py +35 -480
  79. pgsui/utils/plotting.py +996 -829
  80. pgsui/utils/pretty_metrics.py +290 -0
  81. pgsui/utils/scorers.py +213 -666
  82. pg_sui-0.2.3.dist-info/RECORD +0 -75
  83. pg_sui-0.2.3.dist-info/top_level.txt +0 -3
  84. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  85. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  86. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  87. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  88. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  89. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  90. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  91. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  92. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  93. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  94. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  95. pgsui/example_data/trees/test.iqtree +0 -376
  96. pgsui/example_data/trees/test.qmat +0 -5
  97. pgsui/example_data/trees/test.rate +0 -2033
  98. pgsui/example_data/trees/test.tre +0 -1
  99. pgsui/example_data/trees/test_n10.rate +0 -19
  100. pgsui/example_data/trees/test_n100.rate +0 -109
  101. pgsui/example_data/trees/test_n500.rate +0 -509
  102. pgsui/example_data/trees/test_siterates.txt +0 -2024
  103. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  104. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  105. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  106. pgsui/example_data/vcf_files/test.vcf +0 -244
  107. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  108. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  109. pgsui/impute/estimators.py +0 -1268
  110. pgsui/impute/impute.py +0 -1463
  111. pgsui/impute/simple_imputers.py +0 -1431
  112. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
  113. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
  114. pgsui/impute/unsupervised/keras_classifiers.py +0 -697
  115. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  116. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
  117. pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
  118. pgsui/pg_sui.py +0 -261
  119. pgsui/utils/sequence_tools.py +0 -407
  120. simulation/sim_benchmarks.py +0 -333
  121. simulation/sim_treeparams.py +0 -475
  122. test/__init__.py +0 -0
  123. test/pg_sui_simtest.py +0 -215
  124. test/pg_sui_testing.py +0 -523
  125. test/test.py +0 -151
  126. test/test_pgsui.py +0 -374
  127. test/test_tkc.py +0 -185
@@ -1,1440 +0,0 @@
1
- # Standard Library Imports
2
- import logging
3
- import os
4
- import pprint
5
- import sys
6
- import warnings
7
-
8
- # Third-party Imports
9
- import numpy as np
10
- import pandas as pd
11
- from matplotlib import pyplot as plt
12
-
13
- # Grid search imports
14
- from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
15
-
16
- # Scikit-learn imports
17
- from sklearn.base import BaseEstimator, TransformerMixin
18
-
19
- # Genetic algorithm grid search imports
20
- from sklearn_genetic import GASearchCV
21
- from sklearn_genetic.callbacks import ConsecutiveStopping, ProgressBar
22
- from sklearn_genetic.plots import plot_fitness_evolution
23
-
24
- # Import tensorflow with reduced warnings.
25
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
26
- logging.getLogger("tensorflow").disabled = True
27
- warnings.filterwarnings("ignore", category=UserWarning)
28
-
29
- # noinspection PyPackageRequirements
30
- import tensorflow as tf
31
-
32
- # Disable "can't find CUDA .dll" errors. Also turns off GPU support.
33
- tf.config.set_visible_devices([], "GPU")
34
-
35
- from tensorflow.python.util import deprecation
36
-
37
- # Disable warnings and info logs.
38
- tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
39
- tf.get_logger().setLevel(logging.ERROR)
40
-
41
-
42
- # Monkey patching deprecation utils to suppress warnings.
43
- # noinspection PyUnusedLocal
44
- def deprecated(
45
- date, instructions, warn_once=True
46
- ): # pylint: disable=unused-argument
47
- def deprecated_wrapper(func):
48
- return func
49
-
50
- return deprecated_wrapper
51
-
52
-
53
- deprecation.deprecated = deprecated
54
-
55
- from tensorflow.keras.callbacks import (
56
- ReduceLROnPlateau,
57
- CSVLogger,
58
- )
59
-
60
- # For development purposes
61
- # from memory_profiler import memory_usage
62
-
63
- # Custom module imports
64
- try:
65
- from ...utils.misc import timer
66
- from ...utils.misc import isnotebook
67
- from ...utils.misc import validate_input_type
68
- from .neural_network_methods import NeuralNetworkMethods, DisabledCV
69
- from ...utils.scorers import Scorers
70
- from ...utils.plotting import Plotting
71
- from .callbacks import (
72
- UBPCallbacks,
73
- VAECallbacks,
74
- CyclicalAnnealingCallback,
75
- )
76
- from .keras_classifiers import VAEClassifier, MLPClassifier, SAEClassifier
77
- from ...data_processing.transformers import (
78
- SimGenotypeDataTransformer,
79
- AutoEncoderFeatureTransformer,
80
- )
81
- except (ModuleNotFoundError, ValueError, ImportError):
82
- from utils.misc import timer
83
- from utils.misc import isnotebook
84
- from utils.misc import validate_input_type
85
- from impute.unsupervised.neural_network_methods import (
86
- NeuralNetworkMethods,
87
- DisabledCV,
88
- )
89
- from utils.scorers import Scorers
90
- from utils.plotting import Plotting
91
- from impute.unsupervised.callbacks import (
92
- UBPCallbacks,
93
- VAECallbacks,
94
- CyclicalAnnealingCallback,
95
- )
96
- from impute.unsupervised.keras_classifiers import (
97
- VAEClassifier,
98
- MLPClassifier,
99
- SAEClassifier,
100
- )
101
- from data_processing.transformers import (
102
- SimGenotypeDataTransformer,
103
- AutoEncoderFeatureTransformer,
104
- )
105
-
106
- is_notebook = isnotebook()
107
-
108
- if is_notebook:
109
- from tqdm.notebook import tqdm as progressbar
110
- else:
111
- from tqdm import tqdm as progressbar
112
-
113
- from tqdm.keras import TqdmCallback
114
-
115
-
116
- class BaseNNImputer(BaseEstimator, TransformerMixin):
117
- """Base transformer class for neural network imputers.
118
-
119
- Args:
120
- genotype_data (GenotypeData): Input GenotypeData instance.
121
-
122
- prefix (str, optional): Prefix for output files. Defaults to "imputer".
123
-
124
- gridparams (Dict[str, Any] or None, optional): Dictionary with keys=keyword arguments for the specified estimator and values=lists of parameter values or distributions. If using the randomized grid search (RandomizedSearchCV), distributions can be specified by using scipy.stats.uniform(low, high) (for a uniform distribution) or scipy.stats.loguniform(low, high) (useful if the range of values spans orders of magnitude). ``gridparams`` is passed to the search selected by ``gridsearch_method``. If using the genetic algorithm grid search (GASearchCV) by setting ``gridsearch_method="genetic_algorithm"``, the parameters can be specified as ``sklearn_genetic.space`` objects. The grid search will determine the optimal parameters as those that maximize the metric given by ``scoring_metric``. NOTE: Takes a long time, so run it with a small subset of the data just to find the optimal parameters for the classifier, then run a full imputation using the optimal parameters. If ``gridparams=None``, a grid search is not performed. Defaults to None.
125
-
126
- disable_progressbar (bool, optional): Whether to disable the tqdm progress bar. Useful if you are doing the imputation on e.g. a high-performance computing cluster, where sometimes tqdm does not work correctly. If False, uses tqdm progress bar. If True, does not use tqdm. Defaults to False.
127
-
128
- batch_size (int, optional): Batch size per epoch to train the model with. Defaults to 32.
129
-
130
- n_components (int, optional): Number of components to use as the input data. Defaults to 3.
131
-
132
- early_stop_gen (int, optional): Early stopping criterion for epochs. Training will stop if the loss (error) does not decrease past the tolerance level for ``early_stop_gen`` epochs. Will save the optimal model and reload it once ``early_stop_gen`` has been reached. Defaults to 25.
133
-
134
- num_hidden_layers (int, optional): Number of hidden layers to use in the model. Adjust if overfitting occurs. Defaults to 3.
135
-
136
- hidden_layer_sizes (str, List[int], List[str], or int, optional): Number of neurons to use in hidden layers. If string or a list of strings is supplied, the strings must be either "midpoint", "sqrt", or "log2". "midpoint" will calculate the midpoint as ``(n_features + n_components) / 2``. If "sqrt" is supplied, the square root of the number of features will be used to calculate the output units. If "log2" is supplied, the units will be calculated as ``log2(n_features)``. hidden_layer_sizes will calculate and set the number of output units for each hidden layer. If one string or integer is supplied, the model will use the same number of output units for each hidden layer. If a list of integers or strings is supplied, the model will use the values supplied in the list, which can differ. The list length must be equal to the ``num_hidden_layers``. Defaults to "midpoint".
137
-
138
- optimizer (str, optional): The optimizer to use with gradient descent. Supported values include: "adam", "sgd", "adagrad", "adadelta", "adamax", "ftrl", "nadam", and "rmsprop". See tf.keras.optimizers for more info. Defaults to "adam".
139
-
140
- hidden_activation (str, optional): The activation function to use for the hidden layers. See tf.keras.activations for more info. Commonly used activation functions include "elu", "relu", and "sigmoid". Defaults to "elu".
141
-
142
- learning_rate (float, optional): The learning rate for the optimizers. Adjust if the model is learning too slowly. Defaults to 0.01.
143
-
144
- lr_patience (int, optional): Number of epochs with no loss improvement to wait before reducing the learning rate. Defaults to 1.
145
-
146
- epochs (int, optional): Maximum number of epochs to run if the ``early_stop_gen`` criterion is not met. Defaults to 100.
147
-
148
- weights_initializer (str, optional): Initializer to use for the model weights. See tf.keras.initializers for more info. Defaults to "glorot_normal".
149
-
150
- l1_penalty (float, optional): L1 regularization penalty to apply to reduce overfitting. Defaults to 0.0001.
151
-
152
- l2_penalty (float, optional): L2 regularization penalty to apply to reduce overfitting. Defaults to 0.0001.
153
-
154
- dropout_rate (float, optional): Dropout rate during training to reduce overfitting. Must be a float between 0 and 1. Defaults to 0.2.
155
-
156
- recurrent_weight (float, optional): Recurrent weight to calculate predictions. Defaults to 0.5.
157
-
158
- sample_weights (str or Dict[int, float], optional): Whether to weight each genotype by its class frequency. If ``sample_weights='auto'`` then it automatically calculates sample weights based on genotype class frequencies per locus; for example, if there are a lot more 0s and fewer 2s, then it will balance out the classes by weighting each genotype accordingly. ``sample_weights`` can also be a dictionary with the genotypes (0, 1, and 2) as the keys and the weights as the values. If ``sample_weights`` is anything else, then they are not calculated. Defaults to False.
159
-
160
- grid_iter (int, optional): Number of iterations for grid search. Defaults to 80.
161
-
162
- gridsearch_method (str, optional): Grid search method to use. Possible options include: 'gridsearch', 'randomized_gridsearch', and 'genetic_algorithm'. 'gridsearch' runs all possible permutations of parameters, 'randomized_gridsearch' runs a random subset of parameters, and 'genetic_algorithm' uses a genetic algorithm gridsearch (via GASearchCV). Defaults to 'gridsearch'.
163
-
164
- ga_kwargs (Dict[str, Any] or None, optional): Keyword arguments to be passed to the genetic algorithm grid search (GASearchCV). Only used if ``gridsearch_method="genetic_algorithm"``. Defaults to None.
165
-
166
- scoring_metric (str, optional): Scoring metric to use for randomized or genetic algorithm grid searches. See https://scikit-learn.org/stable/modules/model_evaluation.html for supported options. Defaults to "auc_macro".
167
-
168
- sim_strategy (str, optional): Strategy to use for simulating missing data. Only used to validate the accuracy of the imputation. The final model will be trained with the non-simulated dataset. Supported options include: "random", "nonrandom", and "nonrandom_weighted". "random" randomly simulates missing data. When set to "nonrandom", branches from ``GenotypeData.guidetree`` will be randomly sampled to generate missing data on descendant nodes. For "nonrandom_weighted", missing data will be placed on nodes proportionally to their branch lengths (e.g., to generate data distributed as might be the case with mutation-disruption of RAD sites). Defaults to "random".
169
-
170
- sim_prop_missing (float, optional): Proportion of missing data to simulate with the SimGenotypeDataTransformer. Defaults to 0.2.
171
-
172
- n_jobs (int, optional): Number of parallel jobs to use in the grid search if ``gridparams`` is not None. -1 means use all available processors. Defaults to 1.
173
-
174
- verbose (int, optional): Verbosity setting. Can be 0, 1, or 2. 0 is the least and 2 is the most verbose. Defaults to 0.
175
-
176
- ToDo:
177
- Fix sample_weight for multi-label encodings.
178
- """
179
-
180
- def __init__(
181
- self,
182
- activate,
183
- nn_method,
184
- num_classes,
185
- act_func,
186
- *,
187
- genotype_data=None,
188
- prefix="imputer",
189
- gridparams=None,
190
- disable_progressbar=False,
191
- batch_size=32,
192
- n_components=3,
193
- early_stop_gen=25,
194
- num_hidden_layers=3,
195
- hidden_layer_sizes="midpoint",
196
- optimizer="adam",
197
- hidden_activation="elu",
198
- learning_rate=0.01,
199
- lr_patience=1,
200
- epochs=100,
201
- weights_initializer="glorot_normal",
202
- l1_penalty=0.0001,
203
- l2_penalty=0.0001,
204
- dropout_rate=0.2,
205
- sample_weights=False,
206
- grid_iter=80,
207
- gridsearch_method="gridsearch",
208
- ga_kwargs=None,
209
- scoring_metric="auc_macro",
210
- sim_strategy="random",
211
- sim_prop_missing=0.2,
212
- n_jobs=1,
213
- verbose=0,
214
- kl_beta=tf.Variable(1.0, trainable=False),
215
- validation_split=0.0,
216
- nlpca=False,
217
- testing=False,
218
- ):
219
- self.activate = activate
220
- self.act_func_ = act_func
221
- self.num_classes = num_classes
222
- self.testing = testing
223
- self.nn_method_ = nn_method
224
-
225
- self.genotype_data = genotype_data
226
- self.prefix = prefix
227
- self.gridparams = gridparams
228
- self.disable_progressbar = disable_progressbar
229
- self.batch_size = batch_size
230
- self.n_components = n_components
231
-
232
- self.early_stop_gen = early_stop_gen
233
- self.num_hidden_layers = num_hidden_layers
234
- self.hidden_layer_sizes = hidden_layer_sizes
235
- self.optimizer = optimizer
236
- self.hidden_activation = hidden_activation
237
- self.learning_rate = learning_rate
238
- self.lr_patience = lr_patience
239
- self.epochs = epochs
240
- self.weights_initializer = weights_initializer
241
- self.l1_penalty = l1_penalty
242
- self.l2_penalty = l2_penalty
243
- self.dropout_rate = dropout_rate
244
- self.sample_weights = sample_weights
245
- self.grid_iter = grid_iter
246
- self.gridsearch_method = gridsearch_method
247
- self.ga_kwargs = ga_kwargs
248
- self.scoring_metric = scoring_metric
249
- self.sim_strategy = sim_strategy
250
- self.sim_prop_missing = sim_prop_missing
251
- self.n_jobs = n_jobs
252
- self.verbose = verbose
253
-
254
- self.kl_beta = kl_beta
255
- self.validation_split = validation_split
256
- self.nlpca = nlpca
257
-
258
- self.run_gridsearch_ = False if self.gridparams is None else True
259
- self.is_multiclass_ = True if self.num_classes != 4 else False
260
-
261
- # Simulate missing data and get missing masks.
262
- self.sim = SimGenotypeDataTransformer(
263
- self.genotype_data,
264
- prop_missing=self.sim_prop_missing,
265
- strategy=self.sim_strategy,
266
- mask_missing=True,
267
- )
268
-
269
- # Binary encode y to get y_train.
270
- self.tt_ = AutoEncoderFeatureTransformer(
271
- num_classes=self.num_classes, activate=self.activate
272
- )
273
-
274
- @timer
275
- def fit(self, X):
276
- """Train the selected neural network model (VAE, SAE, NLPCA, or UBP) on input data X.
277
-
278
- Args:
279
- X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Input 012-encoded genotypes.
280
-
281
- Returns:
282
- self: Current instance; allows method chaining.
283
-
284
- Raises:
285
- TypeError: Must be either pandas.DataFrame, numpy.ndarray, or List[List[int]].
286
- """
287
- # Treating y as X here for compatibility with UBP/NLPCA.
288
- # With VAE, y=X anyways.
289
- y = X
290
- y = validate_input_type(y, return_type="array")
291
-
292
- self.nn_ = NeuralNetworkMethods()
293
- plotting = Plotting()
294
-
295
- if self.gridsearch_method == "genetic_algorithm":
296
- self.ga_ = True
297
- else:
298
- self.ga_ = False
299
-
300
- self.y_original_ = y.copy()
301
- self.y_simulated_ = self.sim.fit_transform(self.y_original_)
302
-
303
- # Get values where original value was not missing but missing data
304
- # was simulated.
305
- self.sim_missing_mask_ = self.sim.sim_missing_mask_
306
-
307
- # Original missing data.
308
- self.original_missing_mask_ = self.sim.original_missing_mask_
309
-
310
- # Both simulated and original missing data.
311
- self.all_missing_ = self.sim.all_missing_mask_
312
-
313
- # Just y_original with missing values encoded as -1.
314
- y_train = self.tt_.fit_transform(self.y_original_)
315
-
316
- if self.gridparams is not None:
317
- self.scoring_metrics_ = [
318
- "precision_recall_macro",
319
- "precision_recall_micro",
320
- "f1_score",
321
- "auc_macro",
322
- "auc_micro",
323
- "accuracy",
324
- "hamming",
325
- ]
326
-
327
- (
328
- logfile,
329
- callbacks,
330
- compile_params,
331
- model_params,
332
- fit_params,
333
- ) = self._initialize_parameters(y_train)
334
-
335
- if self.nn_method_ == "VAE":
336
- func = self.run_vae
337
- elif self.nn_method_ == "SAE":
338
- func = self.run_sae
339
- elif self.nn_method_ == "NLPCA":
340
- func = self.run_nlpca
341
- elif self.nn_method_ == "UBP":
342
- func = self.run_ubp
343
- else:
344
- raise ValueError(f"Invalid nn_method specified: {self.nn_method_}")
345
-
346
- (
347
- self.models_,
348
- self.histories_,
349
- self.best_params_,
350
- self.best_score_,
351
- self.best_estimator_,
352
- self.search_,
353
- self.metrics_,
354
- ) = func(
355
- self.y_original_,
356
- y_train,
357
- model_params,
358
- compile_params,
359
- fit_params,
360
- )
361
-
362
- if (
363
- self.best_params_ is not None
364
- and "optimizer__learning_rate" in self.best_params_
365
- ):
366
- self.best_params_["learning_rate"] = self.best_params_.pop(
367
- "optimizer__learning_rate"
368
- )
369
-
370
- if self.gridparams is not None:
371
- if self.verbose > 0:
372
- print("\nBest found parameters:")
373
- pprint.pprint(self.best_params_)
374
- print(f"\nBest score: {self.best_score_}")
375
- plotting.plot_grid_search(
376
- self.search_.cv_results_, self.nn_method_, self.prefix
377
- )
378
-
379
- plotting.plot_history(
380
- self.histories_, self.nn_method_, prefix=self.prefix
381
- )
382
- plotting.plot_metrics(
383
- self.metrics_, self.num_classes, self.prefix, self.nn_method_
384
- )
385
-
386
- if self.ga_:
387
- plot_fitness_evolution(self.search_)
388
- plt.savefig(
389
- os.path.join(
390
- f"{self.prefix}_output",
391
- "plots",
392
- "Unsupervised",
393
- self.nn_method_,
394
- "fitness_evolution.pdf",
395
- ),
396
- bbox_inches="tight",
397
- facecolor="white",
398
- )
399
- plt.cla()
400
- plt.clf()
401
- plt.close()
402
-
403
- g = plotting.plot_search_space(self.search_)
404
- plt.savefig(
405
- os.path.join(
406
- f"{self.prefix}_output",
407
- "plots",
408
- "Unsupervised",
409
- self.nn_method_,
410
- "search_space.pdf",
411
- ),
412
- bbox_inches="tight",
413
- facecolor="white",
414
- )
415
- plt.cla()
416
- plt.clf()
417
- plt.close()
418
-
419
- return self
420
-
421
- def transform(self, X):
422
- """Predict and decode imputations and return transformed array.
423
-
424
- Args:
425
- X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Input data to transform.
426
-
427
- Returns:
428
- numpy.ndarray: Imputed data.
429
- """
430
- y = X
431
- y = validate_input_type(y, return_type="array")
432
-
433
- if self.nn_method_ not in ["UBP", "NLPCA"]:
434
- model = self.models_[0]
435
- else:
436
- if len(self.models_) == 1:
437
- model = self.models_[0]
438
- else:
439
- model = self.models_[-1]
440
-
441
- y_true = y.copy()
442
- y_train = self.tt_.transform(y_true)
443
- y_true_1d = y_true.ravel()
444
- y_size = y_true.size
445
- y_missing_idx = np.flatnonzero(self.original_missing_mask_)
446
-
447
- if self.nn_method_ == "VAE":
448
- y_pred = model(
449
- tf.convert_to_tensor(y_train),
450
- training=False,
451
- )
452
- elif self.nn_method_ == "SAE":
453
- y_pred = model(y_train, training=False)
454
- else:
455
- y_pred = model(model.V_latent, training=False)
456
- y_pred = self.tt_.inverse_transform(y_pred)
457
-
458
- y_pred_decoded = self.nn_.decode_masked(
459
- y_train,
460
- y_pred,
461
- is_multiclass=self.is_multiclass_,
462
- )
463
- # y_pred_decoded, y_pred_certainty = self.nn_.decode_masked(
464
- # y_train, y_pred, return_proba=True
465
- # )
466
-
467
- y_pred_1d = y_pred_decoded.ravel()
468
-
469
- # Only replace originally missing values at missing indexes.
470
- for i in np.arange(y_size):
471
- if i in y_missing_idx:
472
- y_true_1d[i] = y_pred_1d[i]
473
-
474
- self.nn_.write_gt_state_probs(
475
- y_pred,
476
- y_pred_1d,
477
- y_true,
478
- y_true_1d,
479
- self.nn_method_,
480
- self.sim_missing_mask_,
481
- self.original_missing_mask_,
482
- prefix=self.prefix,
483
- )
484
-
485
- Plotting.plot_confusion_matrix(
486
- y_true_1d, y_pred_1d, self.nn_method_, prefix=self.prefix
487
- )
488
-
489
- # if self.nn_method_ == "VAE":
490
- # Plotting.plot_label_clusters(z_mean, y_true_1d)
491
-
492
- # Return to original shape.
493
- return np.reshape(y_true_1d, y_true.shape)
494
-
495
- def run_clf(
496
- self,
497
- y_train,
498
- y_true,
499
- model_params,
500
- compile_params,
501
- fit_params,
502
- ubp_weights=None,
503
- phase=None,
504
- scoring=None,
505
- testing=False,
506
- **kwargs,
507
- ):
508
- """Run KerasClassifier with neural network model and grid search.
509
-
510
- Args:
511
- y_train (numpy.ndarray): Onehot-encoded training input data of shape (n_samples, n_features, num_classes).
512
-
513
- y_true (numpy.ndarray): Original 012-encoded input data of shape (n_samples, n_features).
514
-
515
- model_params (Dict[str, Any]): Dictionary with model parameters to be passed to KerasClassifier model.
516
-
517
- compile_params (Dict[str, Any]): Dictionary with params to be passed to Keras model.compile() in KerasClassifier.
518
-
519
- fit_params (Dict[str, Any]): Dictionary with parameters to be passed to fit in KerasClassifier.
520
-
521
- scoring (Dict[str, Callable], optional): Multimetric scorer made using sklearn.metrics.make_scorer. To be used with grid search.
522
-
523
- Returns:
524
- List[tf.keras.Model]: List of keras model objects. One for each phase (len=1 if NLPCA, len=3 if UBP).
525
-
526
- List[Dict[str, float]]: List of dictionaries with best neural network model history.
527
-
528
- Dict[str, Any] or None: Best parameters found during a grid search, or None if a grid search was not run.
529
-
530
- float: Best score obtained during grid search.
531
-
532
- tf.keras.Model: Best model found during grid search.
533
-
534
- sklearn.model_selection object (GridSearchCV, RandomizedSearchCV) or GASearchCV object.
535
-
536
- Dict[str, Any]: Per-class, micro, and macro-averaged metrics including accuracy, ROC-AUC, and Precision-Recall with Average Precision scores.
537
- """
538
- # This reduces memory usage.
539
- # tensorflow builds graphs that
540
- # will stack if not cleared before
541
- # building a new model.
542
- tf.keras.backend.clear_session()
543
- self.nn_.reset_seeds()
544
-
545
- model = None
546
- if self.nn_method_ in ["UBP", "NLPCA"]:
547
- V = model_params.pop("V")
548
- if phase is not None:
549
- desc = f"Epoch (Phase {phase}): "
550
- else:
551
- desc = "Epoch: "
552
-
553
- else:
554
- desc = "Epoch: "
555
-
556
- if not self.disable_progressbar and not self.run_gridsearch_:
557
- fit_params["callbacks"][-1] = TqdmCallback(
558
- epochs=self.epochs, verbose=self.verbose, desc=desc
559
- )
560
-
561
- if self.nn_method_ == "VAE":
562
- clf = VAEClassifier(
563
- **model_params,
564
- optimizer=compile_params["optimizer"],
565
- optimizer__learning_rate=compile_params["learning_rate"],
566
- loss=compile_params["loss"],
567
- metrics=compile_params["metrics"],
568
- run_eagerly=compile_params["run_eagerly"],
569
- callbacks=fit_params["callbacks"],
570
- epochs=fit_params["epochs"],
571
- verbose=0,
572
- num_classes=self.num_classes,
573
- activate=self.act_func_,
574
- fit__validation_split=fit_params["validation_split"],
575
- score__missing_mask=self.sim_missing_mask_,
576
- score__scoring_metric=self.scoring_metric,
577
- score__num_classes=self.num_classes,
578
- score__n_classes=self.num_classes,
579
- )
580
- elif self.nn_method_ == "SAE":
581
- clf = SAEClassifier(
582
- **model_params,
583
- optimizer=compile_params["optimizer"],
584
- optimizer__learning_rate=compile_params["learning_rate"],
585
- loss=compile_params["loss"],
586
- metrics=compile_params["metrics"],
587
- callbacks=fit_params["callbacks"],
588
- epochs=fit_params["epochs"],
589
- verbose=0,
590
- activate=self.act_func_,
591
- fit__validation_split=fit_params["validation_split"],
592
- score__missing_mask=self.sim_missing_mask_,
593
- score__scoring_metric=self.scoring_metric,
594
- score__num_classes=self.num_classes,
595
- score__n_classes=self.num_classes,
596
- )
597
- else:
598
- clf = MLPClassifier(
599
- V,
600
- y_train,
601
- **model_params,
602
- ubp_weights=ubp_weights,
603
- optimizer=compile_params["optimizer"],
604
- optimizer__learning_rate=compile_params["learning_rate"],
605
- loss=compile_params["loss"],
606
- metrics=compile_params["metrics"],
607
- epochs=fit_params["epochs"],
608
- phase=phase,
609
- callbacks=fit_params["callbacks"],
610
- validation_split=fit_params["validation_split"],
611
- verbose=0,
612
- score__missing_mask=self.sim_missing_mask_,
613
- score__scoring_metric=self.scoring_metric,
614
- )
615
-
616
- if self.run_gridsearch_:
617
- # Cannot do CV because there is no way to use test splits
618
- # given that the input gets refined. If using a test split,
619
- # then it would just be the randomly initialized values and
620
- # would not accurately represent the model.
621
- # Thus, we disable cross-validation for the grid searches.
622
- cross_val = DisabledCV()
623
- verbose = False if self.verbose == 0 else True
624
-
625
- if self.ga_:
626
- # Stop searching if GA sees no improvement.
627
- callback = [
628
- ConsecutiveStopping(
629
- generations=self.early_stop_gen, metric="fitness"
630
- )
631
- ]
632
-
633
- if not self.disable_progressbar:
634
- callback.append(ProgressBar())
635
-
636
- # Do genetic algorithm
637
- # with HiddenPrints():
638
- search = GASearchCV(
639
- estimator=clf,
640
- cv=cross_val,
641
- scoring=scoring,
642
- generations=self.grid_iter,
643
- param_grid=self.gridparams,
644
- n_jobs=self.n_jobs,
645
- refit=self.scoring_metric,
646
- verbose=verbose,
647
- **self.ga_kwargs,
648
- error_score="raise",
649
- )
650
-
651
- if self.nn_method_ in ["UBP", "NLPCA"]:
652
- search.fit(V[self.n_components], y=y_true)
653
- else:
654
- search.fit(y_true, y_true, callbacks=callback)
655
-
656
- else:
657
- # Write GridSearchCV to log file instead of STDOUT.
658
- if self.verbose >= 10:
659
- old_stdout = sys.stdout
660
- log_file = open(
661
- os.path.join(
662
- f"{self.prefix}_output",
663
- "logs",
664
- "Unsupervised",
665
- self.nn_method_,
666
- "gridsearch_progress_log.txt",
667
- ),
668
- "w",
669
- )
670
- sys.stdout = log_file
671
-
672
- if self.gridsearch_method.lower() == "gridsearch":
673
- # Do GridSearchCV
674
- search = GridSearchCV(
675
- clf,
676
- param_grid=self.gridparams,
677
- n_jobs=self.n_jobs,
678
- cv=cross_val,
679
- scoring=scoring,
680
- refit=self.scoring_metric,
681
- verbose=self.verbose * 4,
682
- error_score="raise",
683
- )
684
-
685
- elif self.gridsearch_method.lower() == "randomized_gridsearch":
686
- search = RandomizedSearchCV(
687
- clf,
688
- param_distributions=self.gridparams,
689
- n_iter=self.grid_iter,
690
- n_jobs=self.n_jobs,
691
- cv=cross_val,
692
- scoring=scoring,
693
- refit=self.scoring_metric,
694
- verbose=verbose * 4,
695
- error_score="raise",
696
- )
697
-
698
- else:
699
- raise ValueError(
700
- f"Invalid gridsearch_method specified: "
701
- f"{self.gridsearch_method}"
702
- )
703
-
704
- if self.nn_method_ in ["UBP", "NLPCA"]:
705
- search.fit(V[self.n_components], y=y_true)
706
- else:
707
- search.fit(y_true, y=y_true)
708
-
709
- if self.verbose >= 10:
710
- # Make sure to revert STDOUT back to original.
711
- sys.stdout = old_stdout
712
- log_file.close()
713
-
714
- best_params = search.best_params_
715
- best_score = search.best_score_
716
- best_clf = search.best_estimator_
717
-
718
- fp = os.path.join(
719
- f"{self.prefix}_output",
720
- "reports",
721
- "Unsupervised",
722
- self.nn_method_,
723
- f"cvresults_{self.nn_method_}.csv",
724
- )
725
-
726
- cv_results = pd.DataFrame(search.cv_results_)
727
- cv_results.to_csv(fp, index=False)
728
-
729
- else:
730
- if self.nn_method_ in ["UBP", "NLPCA"]:
731
- clf.fit(V[self.n_components], y=y_true)
732
- else:
733
- clf.fit(y_true, y=y_true)
734
- best_params = None
735
- best_score = None
736
- search = None
737
- best_clf = clf
738
-
739
- model = best_clf.model_
740
- best_history = best_clf.history_
741
-
742
- if self.nn_method_ == "VAE":
743
- y_pred = model(
744
- tf.convert_to_tensor(y_train),
745
- training=False,
746
- )
747
- y_pred = self.tt_.inverse_transform(y_pred)
748
- elif self.nn_method_ == "SAE":
749
- y_pred = model(y_train, training=False)
750
- y_pred = self.tt_.inverse_transform(y_pred)
751
- elif self.nn_method_ in ["UBP", "NLPCA"]:
752
- # Third run_clf function
753
- y_pred_proba = model(model.V_latent, training=False)
754
- y_pred = self.tt_.inverse_transform(y_pred_proba)
755
-
756
- # Get metric scores.
757
- metrics = Scorers.scorer(
758
- y_true,
759
- y_pred,
760
- missing_mask=self.sim_missing_mask_,
761
- num_classes=self.num_classes,
762
- testing=self.testing,
763
- )
764
-
765
- if self.nn_method_ in ["UBP", "NLPCA"]:
766
- return (
767
- V,
768
- model,
769
- best_history,
770
- best_params,
771
- best_score,
772
- best_clf,
773
- search,
774
- metrics,
775
- )
776
- else:
777
- return (
778
- model,
779
- best_history,
780
- best_params,
781
- best_score,
782
- best_clf,
783
- search,
784
- metrics,
785
- )
786
-
787
def _initialize_parameters(self, y_train):
    """Build the callbacks and parameter dictionaries used to train a model.

    NOTE(review): this block was recovered from a listing without
    indentation. The nesting of the sample-weight section below is the
    reading under which ``sample_weights`` is defined for every
    ``nn_method_`` -- confirm against the original file.

    Args:
        y_train (numpy.ndarray): Training subset of original input data.

    Returns:
        Tuple[str, List[Any], Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
            ``(logfile, callbacks, compile_params, model_params, fit_params)``:
            the CSV training-log path, the list of Keras callbacks, the
            dictionary returned by ``self.nn_.set_compile_params`` (with
            ``"learning_rate"`` added), the keyword arguments for the model,
            and the keyword arguments for fitting.
    """
    # For CSVLogger() callback.

    # Only UBP appends to an existing training log.
    append = True if self.nn_method_ == "UBP" else False
    logfile = os.path.join(
        f"{self.prefix}_output",
        "logs",
        "Unsupervised",
        self.nn_method_,
        "training_log.csv",
    )

    callbacks = [
        CSVLogger(filename=logfile, append=append),
        ReduceLROnPlateau(
            patience=self.lr_patience, min_lr=1e-6, min_delta=1e-6
        ),
    ]

    # Method-specific callbacks: VAE and SAE share VAECallbacks, VAE
    # additionally gets the annealing callback, everything else gets
    # UBPCallbacks.
    if self.nn_method_ in ["VAE", "SAE"]:
        callbacks.append(VAECallbacks())

        if self.nn_method_ == "VAE":
            callbacks.append(
                CyclicalAnnealingCallback(
                    self.epochs, schedule_type="sigmoid"
                )
            )
    else:
        callbacks.append(UBPCallbacks())

    search_mode = True if self.run_gridsearch_ else False

    # The per-epoch progress bar is only attached outside of grid searches.
    if not self.disable_progressbar and not search_mode:
        callbacks.append(
            TqdmCallback(epochs=self.epochs, verbose=0, desc="Epoch: ")
        )

    if self.nn_method_ in ["UBP", "NLPCA"]:
        vinput = self._initV(y_train, search_mode)
        # NOTE(review): under this nesting the value assigned here is
        # replaced by the set_compile_params() call further down.
        compile_params = self.nn_.set_compile_params(self.optimizer)
    else:
        # NOTE(review): overwritten below before it is read.
        vae = True if self.nn_method_ in ["VAE", "SAE"] else False

    if self.sample_weights == "auto" or self.sample_weights == "logsmooth":
        # Get class weights for each column.
        sample_weights = self.nn_.get_class_weights(
            self.y_original_,
            self.original_missing_mask_,
            return_1d=False,
            method=self.sample_weights,
        )
        sample_weights = self.nn_.normalize_data(sample_weights)

    elif isinstance(self.sample_weights, dict):
        # Classes given a weight of 0.0 are cleared from the simulated
        # missing mask (the mask is modified in place).
        for i in range(self.num_classes):
            if self.sample_weights[i] == 0.0:
                self.sim_missing_mask_[self.y_original_ == i] = False

        sample_weights = self.nn_.get_class_weights(
            self.y_original_, user_weights=self.sample_weights
        )

    else:
        sample_weights = None

    vae = True if self.nn_method_ == "VAE" else False

    compile_params = self.nn_.set_compile_params(
        self.optimizer,
        sample_weights,
        vae=vae,
        act_func=self.act_func_,
    )

    compile_params["learning_rate"] = self.learning_rate

    if self.nn_method_ in ["VAE", "SAE"]:
        model_params = {
            "y": y_train,
            "batch_size": self.batch_size,
            "sample_weight": sample_weights,
            "missing_mask": self.original_missing_mask_,
            "output_shape": y_train.shape[1],
            "weights_initializer": self.weights_initializer,
            "n_components": self.n_components,
            "hidden_layer_sizes": self.hidden_layer_sizes,
            "num_hidden_layers": self.num_hidden_layers,
            "hidden_activation": self.hidden_activation,
            "l1_penalty": self.l1_penalty,
            "l2_penalty": self.l2_penalty,
            "dropout_rate": self.dropout_rate,
        }

        if self.nn_method_ == "VAE":
            # NOTE(review): the trailing comma makes this a one-element
            # tuple rather than a float -- confirm that is intended.
            model_params["kl_beta"] = (1.0 / y_train.shape[0],)
    else:
        model_params = {
            "V": vinput,
            "y_train": y_train,
            "batch_size": self.batch_size,
            "missing_mask": self.original_missing_mask_,
            "output_shape": y_train.shape[1],
            "weights_initializer": self.weights_initializer,
            "n_components": self.n_components,
            "hidden_layer_sizes": self.hidden_layer_sizes,
            "num_hidden_layers": self.num_hidden_layers,
            "hidden_activation": self.hidden_activation,
            "l1_penalty": self.l1_penalty,
            "l2_penalty": self.l2_penalty,
            "dropout_rate": self.dropout_rate,
            "num_classes": self.num_classes,
        }

    model_params["sample_weight"] = sample_weights

    # Keras-level verbosity is switched on only for self.verbose == 2.
    fit_verbose = 1 if self.verbose == 2 else 0

    fit_params = {
        "batch_size": self.batch_size,
        "epochs": self.epochs,
        "callbacks": callbacks,
        "shuffle": True,
        "verbose": fit_verbose,
        "sample_weight": sample_weights,
    }

    # Only VAE/SAE shuffle and hold out a validation split.
    if self.nn_method_ in ["VAE", "SAE"]:
        shuffle = True
        fit_params["validation_split"] = self.validation_split
    else:
        shuffle = False
        fit_params["validation_split"] = 0.0

    fit_params["shuffle"] = shuffle

    # Rename the searched "learning_rate" key to
    # "optimizer__learning_rate". This edits self.gridparams in place.
    if self.run_gridsearch_ and "learning_rate" in self.gridparams:
        self.gridparams["optimizer__learning_rate"] = self.gridparams[
            "learning_rate"
        ]

        self.gridparams.pop("learning_rate")

    return (
        logfile,
        callbacks,
        compile_params,
        model_params,
        fit_params,
    )
947
-
948
-
949
class VAE(BaseNNImputer):
    """Class to impute missing data using a Variational Autoencoder neural network.

    Args:
        kl_beta (tf.Variable, float, or None, optional): Stored as ``self.kl_beta`` and forwarded to ``BaseNNImputer``. If None, a new non-trainable ``tf.Variable(1.0)`` is created for this instance. Defaults to None.

        validation_split (float, optional): Stored as ``self.validation_split`` and forwarded to ``BaseNNImputer``. Defaults to 0.2.

        **kwargs: Forwarded unchanged to ``BaseNNImputer``. The ``testing`` key, if present, is also stored as ``self.testing`` (default False).
    """

    def __init__(
        self,
        kl_beta=None,
        validation_split=0.2,
        **kwargs,
    ):
        # A tf.Variable written directly in the signature is created once,
        # when the class body is executed, and is then shared by every VAE
        # instance; any in-place update made through one instance would
        # leak into the next. Build a fresh one per instance instead.
        if kl_beta is None:
            kl_beta = tf.Variable(1.0, trainable=False)

        self.kl_beta = kl_beta
        self.validation_split = validation_split

        self.nn_method_ = "VAE"
        self.num_classes = 4
        self.activate = None
        self.is_multiclass_ = self.num_classes != 4
        self.testing = kwargs.get("testing", False)
        self.do_act_in_model_ = self.activate is None

        # The output activation is only chosen here when no explicit
        # ``activate`` value is set.
        if not self.do_act_in_model_:
            self.act_func_ = None
        elif self.is_multiclass_:
            self.act_func_ = "softmax"
        else:
            self.act_func_ = "sigmoid"

        super().__init__(
            self.activate,
            self.nn_method_,
            self.num_classes,
            self.act_func_,
            **kwargs,
            kl_beta=self.kl_beta,
            validation_split=self.validation_split,
        )

    def run_vae(
        self,
        y_true,
        y_train,
        model_params,
        compile_params,
        fit_params,
    ):
        """Run VAE using custom subclassed model.

        Args:
            y_true (numpy.ndarray): Original genotypes (training dataset) with known and missing values, of shape (n_samples, n_features).

            y_train (numpy.ndarray): Onehot encoded genotypes (training dataset) with known and missing values, of shape (n_samples, n_features, num_classes).

            model_params (Dict[str, Any]): Dictionary with parameters to pass to the classifier model.

            compile_params (Dict[str, Any]): Dictionary with parameters to pass to the tensorflow compile function.

            fit_params (Dict[str, Any]): Dictionary with parameters to pass to the fit() function.

        Returns:
            List[tf.keras.Model]: List holding the single model returned by ``run_clf``.

            List[Dict[str, float]]: List holding the single training history returned by ``run_clf``.

            Dict[str, Any] or None: Best parameters found during a grid search, or None if a grid search was not run.

            float: Best score obtained during grid search.

            tf.keras.Model: Best model found during grid search.

            sklearn.model_selection object (GridSearchCV, RandomizedSearchCV) or GASearchCV object.

            Dict[str, Any]: Per-class, micro, and macro-averaged metrics including accuracy, ROC-AUC, and Precision-Recall with Average Precision scores.
        """
        scorers = Scorers()
        scoring = None

        # A scorer is only needed when a parameter search is requested.
        if self.run_gridsearch_:
            scoring = scorers.make_multimetric_scorer(
                self.scoring_metrics_,
                self.sim_missing_mask_,
                num_classes=self.num_classes,
            )

        (
            model,
            best_history,
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        ) = self.run_clf(
            y_train,
            y_true,
            model_params,
            compile_params,
            fit_params,
            scoring=scoring,
        )

        # Callers expect lists, matching the multi-phase UBP return shape.
        return (
            [model],
            [best_history],
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        )
1064
-
1065
-
1066
class SAE(BaseNNImputer):
    """Imputer that trains the standard autoencoder ("SAE") model.

    All keyword arguments are forwarded to ``BaseNNImputer``; the
    ``testing`` key, if present, is also stored as ``self.testing``.
    """

    def __init__(self, **kwargs):
        self.nn_method_ = "SAE"
        self.num_classes = 3
        self.activate = "softmax"
        self.act_func_ = "softmax"
        self.testing = kwargs.get("testing", False)

        super().__init__(
            self.activate,
            self.nn_method_,
            self.num_classes,
            self.act_func_,
            **kwargs,
        )

    def run_sae(
        self,
        y_true,
        y_train,
        model_params,
        compile_params,
        fit_params,
    ):
        """Train the standard autoencoder through ``run_clf``.

        Args:
            y_true (numpy.ndarray): Original genotypes (training dataset) with known and missing values of shape (n_samples, n_features).

            y_train (numpy.ndarray): Onehot-encoded genotypes (training dataset) with known and missing values of shape (n_samples, n_features, num_classes).

            model_params (Dict[str, Any]): Parameters to pass to the classifier model.

            compile_params (Dict[str, Any]): Parameters to pass to the tensorflow compile function.

            fit_params (Dict[str, Any]): Parameters to pass to the fit() function.

        Returns:
            tuple: ``(models, histories, best_params, best_score, best_clf, search, metrics)``. ``models`` and ``histories`` are one-element lists wrapping the model and history returned by ``run_clf``; the remaining items are passed through from ``run_clf`` unchanged.
        """
        scorers = Scorers()

        if self.run_gridsearch_:
            scoring = scorers.make_multimetric_scorer(
                self.scoring_metrics_, self.sim_missing_mask_
            )
        else:
            scoring = None

        (
            fitted_model,
            history,
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        ) = self.run_clf(
            y_train,
            y_true,
            model_params,
            compile_params,
            fit_params,
            scoring=scoring,
            testing=False,
        )

        models = [fitted_model]
        histories = [history]

        return (
            models,
            histories,
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        )
1163
-
1164
-
1165
- class UBP(BaseNNImputer):
1166
def __init__(
    self,
    *,
    nlpca=False,
    **kwargs,
):
    """Configure the imputer as UBP or, when ``nlpca`` is true, as NLPCA.

    Args:
        nlpca (bool, optional): Selects ``"NLPCA"`` instead of ``"UBP"`` as ``self.nn_method_``. Defaults to False.

        **kwargs: Forwarded to ``BaseNNImputer``. The ``testing`` key, if present, is also stored as ``self.testing`` (default False).
    """
    # TODO: Make estimators compatible with variable number of classes.
    # E.g., with morphological data.
    self.nlpca = nlpca

    if self.nlpca:
        self.nn_method_ = "NLPCA"
    else:
        self.nn_method_ = "UBP"

    self.num_classes = 3
    self.testing = kwargs.get("testing", False)
    self.activate = None
    self.act_func_ = "softmax"

    super().__init__(
        self.activate,
        self.nn_method_,
        self.num_classes,
        self.act_func_,
        **kwargs,
        nlpca=self.nlpca,
    )
1189
-
1190
def run_nlpca(
    self,
    y_true,
    y_train,
    model_params,
    compile_params,
    fit_params,
):
    """Train the NLPCA model with a single ``run_clf`` call.

    Args:
        y_true (numpy.ndarray): Original genotypes with known and missing values.

        y_train (numpy.ndarray): For compatibility with VAE and SAE. Not used; the training array is taken from ``model_params["y_train"]`` instead.

        model_params (Dict[str, Any]): Parameters to pass to the classifier model. The ``"y_train"`` entry is removed from this dictionary.

        compile_params (Dict[str, Any]): Parameters to pass to the tensorflow compile function.

        fit_params (Dict[str, Any]): Parameters to pass to the fit() function.

    Returns:
        tuple: ``(models, histories, best_params, best_score, best_clf, search, metrics)``. ``models`` and ``histories`` are one-element lists wrapping the model and history returned by ``run_clf``; the remaining items are passed through from ``run_clf`` unchanged.
    """
    scorers = Scorers()

    # The incoming y_train argument is replaced by the array stored in
    # model_params.
    y_train = model_params.pop("y_train")

    if self.run_gridsearch_:
        scoring = scorers.make_multimetric_scorer(
            self.scoring_metrics_, self.sim_missing_mask_
        )
    else:
        scoring = None

    # NLPCA is a single pass: no phase and no carried-over weights.
    (
        _latent,
        fitted_model,
        history,
        best_params,
        best_score,
        best_clf,
        search,
        metrics,
    ) = self.run_clf(
        y_train,
        y_true,
        model_params,
        compile_params,
        fit_params,
        ubp_weights=None,
        phase=None,
        scoring=scoring,
        testing=False,
    )

    return (
        [fitted_model],
        [history],
        best_params,
        best_score,
        best_clf,
        search,
        metrics,
    )
1274
-
1275
def run_ubp(
    self,
    y_true,
    y_train,
    model_params,
    compile_params,
    fit_params,
):
    """Train the UBP model by calling ``run_clf`` for phases 1, 2 and 3.

    Args:
        y_true (numpy.ndarray): Original genotypes with known and missing values.

        y_train (numpy.ndarray): For compatibility with VAE and SAE. Not used; the training array is taken from ``model_params["y_train"]`` instead.

        model_params (Dict[str, Any]): Parameters to pass to the classifier model. The ``"y_train"`` entry is removed and ``"V"`` (and possibly ``"n_components"``) is rewritten between phases.

        compile_params (Dict[str, Any]): Parameters to pass to the tensorflow compile function.

        fit_params (Dict[str, Any]): Parameters to pass to the fit() function.

    Returns:
        tuple: ``(models, histories, best_params, best_score, best_clf, search, metrics)``. ``models`` and ``histories`` hold one entry per phase; the remaining items are those returned by the phase 3 ``run_clf`` call.
    """
    scorers = Scorers()

    models = []
    histories = []

    # The incoming y_train argument is replaced by the array stored in
    # model_params.
    y_train = model_params.pop("y_train")

    scoring = None
    tune_n_components = False
    if self.run_gridsearch_:
        scoring = scorers.make_multimetric_scorer(
            self.scoring_metrics_, self.sim_missing_mask_
        )

        if "n_components" in self.gridparams:
            tune_n_components = True
            chosen_n_components = self.n_components

    for phase in (1, 2, 3):
        # Phase 3 starts from the weights of the phase 2 model.
        ubp_weights = None
        if phase == 3:
            ubp_weights = models[1].get_weights()

        (
            V,
            model,
            history,
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        ) = self.run_clf(
            y_train,
            y_true,
            model_params,
            compile_params,
            fit_params,
            ubp_weights=ubp_weights,
            phase=phase,
            scoring=scoring,
            testing=False,
        )

        n_components_found = best_params is not None and tune_n_components

        if phase == 1 and n_components_found:
            # Cannot have V input with different n_components
            # in other phases than are in phase 1.
            # So the n_components search has to happen in phase 1.
            chosen_n_components = best_params["n_components"]
            model_params["V"] = {
                chosen_n_components: model.V_latent.copy()
            }
            model_params["n_components"] = chosen_n_components
            self.n_components = chosen_n_components
            self.gridparams.pop("n_components")
        elif phase == 3:
            # n_components was dropped from the grid after phase 1, so
            # put the searched value back into the reported parameters.
            if n_components_found:
                best_params["n_components"] = chosen_n_components
        else:
            # Phase 1 without an n_components search, or phase 2.
            model_params["V"] = V

        histories.append(history)
        models.append(model)

    return (
        models,
        histories,
        best_params,
        best_score,
        best_clf,
        search,
        metrics,
    )
1394
-
1395
def _initV(self, y_train, search_mode):
    """Initialize random input V as dictionary of numpy arrays.

    One array is created per candidate number of components by calling
    ``self.nn_.init_weights(y_train.shape[0], n_components)``.

    Args:
        y_train (numpy.ndarray): One-hot encoded training dataset (actual data). Only ``y_train.shape[0]`` is used.

        search_mode (bool): Whether doing grid search. When true and ``"n_components"`` is a key of ``self.gridparams``, every value listed there gets its own V; otherwise only ``self.n_components`` is used.

    Returns:
        Dict[int, numpy.ndarray]: Dictionary with n_components: V as key-value pairs.

    Raises:
        ValueError: Number of components must be >= 2. Checked for both the searched values and ``self.n_components``.
    """
    n_components = self.n_components
    if search_mode and "n_components" in self.gridparams:
        n_components = self.gridparams["n_components"]

    if isinstance(n_components, int):
        # A bare int (even one taken from gridparams) is keyed by
        # self.n_components.
        n_components = self.n_components
        candidates = [n_components]
    else:
        candidates = list(n_components)

    # Validate every path, not only the searched list, so the documented
    # lower bound holds for plain (non-search) runs as well.
    if not candidates or min(candidates) < 2:
        raise ValueError(
            f"n_components must be >= 2, but a value of {n_components} was specified."
        )

    n_samples = y_train.shape[0]
    return {c: self.nn_.init_weights(n_samples, c) for c in candidates}