pg-sui 0.2.3__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +99 -77
- pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
- pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
- pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +909 -0
- pgsui/data_processing/__init__.py +0 -0
- pgsui/data_processing/config.py +565 -0
- pgsui/data_processing/containers.py +1424 -0
- pgsui/data_processing/transformers.py +557 -907
- pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/extra-resources/.gitkeep +1 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +227 -0
- pgsui/electron/app/package-lock.json +6894 -0
- pgsui/electron/app/package.json +51 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +157 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +131 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +57 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/__init__.py +0 -0
- pgsui/example_data/phylip_files/__init__.py +0 -0
- pgsui/example_data/phylip_files/test.phy +0 -0
- pgsui/example_data/popmaps/__init__.py +0 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/structure_files/__init__.py +0 -0
- pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/__init__.py +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
- pgsui/impute/deterministic/imputers/mode.py +844 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +973 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
- pgsui/impute/supervised/__init__.py +0 -0
- pgsui/impute/supervised/base.py +343 -0
- pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
- pgsui/impute/supervised/imputers/random_forest.py +291 -0
- pgsui/impute/unsupervised/__init__.py +0 -0
- pgsui/impute/unsupervised/base.py +1118 -0
- pgsui/impute/unsupervised/callbacks.py +92 -262
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
- pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
- pgsui/impute/unsupervised/imputers/vae.py +1228 -0
- pgsui/impute/unsupervised/loss_functions.py +261 -0
- pgsui/impute/unsupervised/models/__init__.py +0 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
- pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
- pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
- pgsui/impute/unsupervised/models/vae_model.py +269 -630
- pgsui/impute/unsupervised/nn_scorers.py +255 -0
- pgsui/utils/__init__.py +0 -0
- pgsui/utils/classification_viz.py +608 -0
- pgsui/utils/logging_utils.py +22 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +996 -829
- pgsui/utils/pretty_metrics.py +290 -0
- pgsui/utils/scorers.py +213 -666
- pg_sui-0.2.3.dist-info/RECORD +0 -75
- pg_sui-0.2.3.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -1268
- pgsui/impute/impute.py +0 -1463
- pgsui/impute/simple_imputers.py +0 -1431
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
- pgsui/impute/unsupervised/keras_classifiers.py +0 -697
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -151
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -185
|
@@ -1,1440 +0,0 @@
|
|
|
1
|
-
# Standard Library Imports
|
|
2
|
-
import logging
|
|
3
|
-
import os
|
|
4
|
-
import pprint
|
|
5
|
-
import sys
|
|
6
|
-
import warnings
|
|
7
|
-
|
|
8
|
-
# Third-party Imports
|
|
9
|
-
import numpy as np
|
|
10
|
-
import pandas as pd
|
|
11
|
-
from matplotlib import pyplot as plt
|
|
12
|
-
|
|
13
|
-
# Grid search imports
|
|
14
|
-
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
|
|
15
|
-
|
|
16
|
-
# Scikit-learn imports
|
|
17
|
-
from sklearn.base import BaseEstimator, TransformerMixin
|
|
18
|
-
|
|
19
|
-
# Genetic algorithm grid search imports
|
|
20
|
-
from sklearn_genetic import GASearchCV
|
|
21
|
-
from sklearn_genetic.callbacks import ConsecutiveStopping, ProgressBar
|
|
22
|
-
from sklearn_genetic.plots import plot_fitness_evolution
|
|
23
|
-
|
|
24
|
-
# Import tensorflow with reduced warnings.
|
|
25
|
-
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
|
26
|
-
logging.getLogger("tensorflow").disabled = True
|
|
27
|
-
warnings.filterwarnings("ignore", category=UserWarning)
|
|
28
|
-
|
|
29
|
-
# noinspection PyPackageRequirements
|
|
30
|
-
import tensorflow as tf
|
|
31
|
-
|
|
32
|
-
# Disable "can't find CUDA .dll" errors. Also turns off GPU support.
|
|
33
|
-
tf.config.set_visible_devices([], "GPU")
|
|
34
|
-
|
|
35
|
-
from tensorflow.python.util import deprecation
|
|
36
|
-
|
|
37
|
-
# Disable warnings and info logs.
|
|
38
|
-
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
|
|
39
|
-
tf.get_logger().setLevel(logging.ERROR)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
# Monkey patching deprecation utils to suppress warnings.
|
|
43
|
-
# noinspection PyUnusedLocal
|
|
44
|
-
def deprecated(date, instructions, warn_once=True):  # pylint: disable=unused-argument
    """Build a decorator that leaves the decorated function untouched.

    All three arguments are accepted and ignored; the returned decorator
    hands back the function it receives without wrapping it.

    Args:
        date: Ignored.
        instructions: Ignored.
        warn_once: Ignored. Defaults to True.

    Returns:
        Callable: A decorator that returns its argument unchanged.
    """

    def _passthrough(func):
        # Identity decorator: no wrapping, no warning.
        return func

    return _passthrough
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
deprecation.deprecated = deprecated
|
|
54
|
-
|
|
55
|
-
from tensorflow.keras.callbacks import (
|
|
56
|
-
ReduceLROnPlateau,
|
|
57
|
-
CSVLogger,
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
# For development purposes
|
|
61
|
-
# from memory_profiler import memory_usage
|
|
62
|
-
|
|
63
|
-
# Custom module imports
|
|
64
|
-
try:
|
|
65
|
-
from ...utils.misc import timer
|
|
66
|
-
from ...utils.misc import isnotebook
|
|
67
|
-
from ...utils.misc import validate_input_type
|
|
68
|
-
from .neural_network_methods import NeuralNetworkMethods, DisabledCV
|
|
69
|
-
from ...utils.scorers import Scorers
|
|
70
|
-
from ...utils.plotting import Plotting
|
|
71
|
-
from .callbacks import (
|
|
72
|
-
UBPCallbacks,
|
|
73
|
-
VAECallbacks,
|
|
74
|
-
CyclicalAnnealingCallback,
|
|
75
|
-
)
|
|
76
|
-
from .keras_classifiers import VAEClassifier, MLPClassifier, SAEClassifier
|
|
77
|
-
from ...data_processing.transformers import (
|
|
78
|
-
SimGenotypeDataTransformer,
|
|
79
|
-
AutoEncoderFeatureTransformer,
|
|
80
|
-
)
|
|
81
|
-
except (ModuleNotFoundError, ValueError, ImportError):
|
|
82
|
-
from utils.misc import timer
|
|
83
|
-
from utils.misc import isnotebook
|
|
84
|
-
from utils.misc import validate_input_type
|
|
85
|
-
from impute.unsupervised.neural_network_methods import (
|
|
86
|
-
NeuralNetworkMethods,
|
|
87
|
-
DisabledCV,
|
|
88
|
-
)
|
|
89
|
-
from utils.scorers import Scorers
|
|
90
|
-
from utils.plotting import Plotting
|
|
91
|
-
from impute.unsupervised.callbacks import (
|
|
92
|
-
UBPCallbacks,
|
|
93
|
-
VAECallbacks,
|
|
94
|
-
CyclicalAnnealingCallback,
|
|
95
|
-
)
|
|
96
|
-
from impute.unsupervised.keras_classifiers import (
|
|
97
|
-
VAEClassifier,
|
|
98
|
-
MLPClassifier,
|
|
99
|
-
SAEClassifier,
|
|
100
|
-
)
|
|
101
|
-
from data_processing.transformers import (
|
|
102
|
-
SimGenotypeDataTransformer,
|
|
103
|
-
AutoEncoderFeatureTransformer,
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
is_notebook = isnotebook()
|
|
107
|
-
|
|
108
|
-
if is_notebook:
|
|
109
|
-
from tqdm.notebook import tqdm as progressbar
|
|
110
|
-
else:
|
|
111
|
-
from tqdm import tqdm as progressbar
|
|
112
|
-
|
|
113
|
-
from tqdm.keras import TqdmCallback
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
class BaseNNImputer(BaseEstimator, TransformerMixin):
|
|
117
|
-
"""Base transformer class for neural network imputers.
|
|
118
|
-
|
|
119
|
-
Args:
|
|
120
|
-
genotype_data (GenotypeData): Input GenotypeData instance.
|
|
121
|
-
|
|
122
|
-
prefix (str, optional): Prefix for output files. Defaults to "imputer".
|
|
123
|
-
|
|
124
|
-
gridparams (Dict[str, Any] or None, optional): Dictionary with keys=keyword arguments for the specified estimator and values=lists of parameter values or distributions. If using GridSearchCV, distributions can be specified by using scipy.stats.uniform(low, high) (for a uniform distribution) or scipy.stats.loguniform(low, high) (useful if range of values spans orders of magnitude). ``gridparams`` will be used for a randomized grid search with cross-validation. If using the genetic algorithm grid search (GASearchCV) by setting ``ga=True``\, the parameters can be specified as ``sklearn_genetic.space`` objects. The grid search will determine the optimal parameters as those that maximize accuracy (or minimize root mean squared error for BayesianRidge regressor). NOTE: Takes a long time, so run it with a small subset of the data just to find the optimal parameters for the classifier, then run a full imputation using the optimal parameters. If ``gridparams=None``\, a grid search is not performed. Defaults to None.
|
|
125
|
-
|
|
126
|
-
disable_progressbar (bool, optional): Whether to disable the tqdm progress bar. Useful if you are doing the imputation on e.g. a high-performance computing cluster, where sometimes tqdm does not work correctly. If False, uses tqdm progress bar. If True, does not use tqdm. Defaults to False.
|
|
127
|
-
|
|
128
|
-
batch_size (int, optional): Batch size per epoch to train the model with.
|
|
129
|
-
|
|
130
|
-
n_components (int, optional): Number of components to use as the input data. Defaults to 3.
|
|
131
|
-
|
|
132
|
-
early_stop_gen (int, optional): Early stopping criterion for epochs. Training will stop if the loss (error) does not decrease past the tolerance level for ``early_stop_gen`` epochs. Will save the optimal model and reload it once ``early_stop_gen`` has been reached. Defaults to 25.
|
|
133
|
-
|
|
134
|
-
num_hidden_layers (int, optional): Number of hidden layers to use in the model. Adjust if overfitting occurs. Defaults to 3.
|
|
135
|
-
|
|
136
|
-
hidden_layer_sizes (str, List[int], List[str], or int, optional): Number of neurons to use in hidden layers. If string or a list of strings is supplied, the strings must be either "midpoint", "sqrt", or "log2". "midpoint" will calculate the midpoint as ``(n_features + n_components) / 2``. If "sqrt" is supplied, the square root of the number of features will be used to calculate the output units. If "log2" is supplied, the units will be calculated as ``log2(n_features)``. hidden_layer_sizes will calculate and set the number of output units for each hidden layer. If one string or integer is supplied, the model will use the same number of output units for each hidden layer. If a list of integers or strings is supplied, the model will use the values supplied in the list, which can differ. The list length must be equal to the ``num_hidden_layers``. Defaults to "midpoint".
|
|
137
|
-
|
|
138
|
-
optimizer (str, optional): The optimizer to use with gradient descent. Supported values include: "adam", "sgd", "adagrad", "adadelta", "adamax", "ftrl", "nadam", and "rmsprop". See tf.keras.optimizers for more info. Defaults to "adam".
|
|
139
|
-
|
|
140
|
-
hidden_activation (str, optional): The activation function to use for the hidden layers. See tf.keras.activations for more info. Commonly used activation functions include "elu", "relu", and "sigmoid". Defaults to "elu".
|
|
141
|
-
|
|
142
|
-
learning_rate (float, optional): The learning rate for the optimizers. Adjust if the loss is decreasing too slowly. Defaults to 0.01.
|
|
143
|
-
|
|
144
|
-
lr_patience (int, optional): Number of epochs with no loss improvement to wait before reducing the learning rate.
|
|
145
|
-
|
|
146
|
-
epochs (int, optional): Maximum number of epochs to run if the ``early_stop_gen`` criterion is not met.
|
|
147
|
-
|
|
148
|
-
weights_initializer (str, optional): Initializer to use for the model weights. See tf.keras.initializers for more info. Defaults to "glorot_normal".
|
|
149
|
-
|
|
150
|
-
l1_penalty (float, optional): L1 regularization penalty to apply to reduce overfitting. Defaults to 0.0001.
|
|
151
|
-
|
|
152
|
-
l2_penalty (float, optional): L2 regularization penalty to apply to reduce overfitting. Defaults to 0.0001.
|
|
153
|
-
|
|
154
|
-
dropout_rate (float, optional): Dropout rate during training to reduce overfitting. Must be a float between 0 and 1. Defaults to 0.2.
|
|
155
|
-
|
|
156
|
-
recurrent_weight (float, optional): Recurrent weight to calculate predictions. Defaults to 0.5.
|
|
157
|
-
|
|
158
|
-
sample_weights (str or Dict[int, float], optional): Whether to weight each genotype by its class frequency. If ``sample_weights='auto'`` then it automatically calculates sample weights based on genotype class frequencies per locus; for example, if there are a lot more 0s and fewer 2s, then it will balance out the classes by weighting each genotype accordingly. ``sample_weights`` can also be a dictionary with the genotypes (0, 1, and 2) as the keys and the weights as the values. If ``sample_weights`` is anything else, then they are not calculated. Defaults to False.
|
|
159
|
-
|
|
160
|
-
grid_iter (int, optional): Number of iterations for grid search. Defaults to 80.
|
|
161
|
-
|
|
162
|
-
gridsearch_method (str, optional): Grid search method to use. Possible options include: 'gridsearch', 'randomized_gridsearch', and 'genetic_algorithm'. 'gridsearch' runs all possible permutations of parameters, 'randomized_gridsearch' runs a random subset of parameters, and 'genetic_algorithm' uses a genetic algorithm gridsearch (via GASearchCV). Defaults to 'gridsearch'.
|
|
163
|
-
|
|
164
|
-
ga_kwargs (Dict[str, Any] or None): Keyword arguments to be passed to a Genetic Algorithm grid search. Only used if ``gridsearch_method="genetic_algorithm"``\. Defaults to None.
|
|
165
|
-
|
|
166
|
-
scoring_metric (str, optional): Scoring metric to use for randomized or genetic algorithm grid searches. See https://scikit-learn.org/stable/modules/model_evaluation.html for supported options. Defaults to "auc_macro".
|
|
167
|
-
|
|
168
|
-
sim_strategy (str, optional): Strategy to use for simulating missing data. Only used to validate the accuracy of the imputation. The final model will be trained with the non-simulated dataset. Supported options include: "random", "nonrandom", and "nonrandom_weighted". "random" randomly simulates missing data. When set to "nonrandom", branches from ``GenotypeData.guidetree`` will be randomly sampled to generate missing data on descendant nodes. For "nonrandom_weighted", missing data will be placed on nodes proportionally to their branch lengths (e.g., to generate data distributed as might be the case with mutation-disruption of RAD sites). Defaults to "random".
|
|
169
|
-
|
|
170
|
-
sim_prop_missing (float, optional): Proportion of missing data to simulate with the SimGenotypeDataTransformer. Defaults to 0.2.
|
|
171
|
-
|
|
172
|
-
n_jobs (int, optional): Number of parallel jobs to use in the grid search if ``gridparams`` is not None. -1 means use all available processors. Defaults to 1.
|
|
173
|
-
|
|
174
|
-
verbose (int, optional): Verbosity setting. Can be 0, 1, or 2. 0 is the least and 2 is the most verbose. Defaults to 0.
|
|
175
|
-
|
|
176
|
-
ToDo:
|
|
177
|
-
Fix sample_weight for multi-label encodings.
|
|
178
|
-
"""
|
|
179
|
-
|
|
180
|
-
def __init__(
    self,
    activate,
    nn_method,
    num_classes,
    act_func,
    *,
    genotype_data=None,
    prefix="imputer",
    gridparams=None,
    disable_progressbar=False,
    batch_size=32,
    n_components=3,
    early_stop_gen=25,
    num_hidden_layers=3,
    hidden_layer_sizes="midpoint",
    optimizer="adam",
    hidden_activation="elu",
    learning_rate=0.01,
    lr_patience=1,
    epochs=100,
    weights_initializer="glorot_normal",
    l1_penalty=0.0001,
    l2_penalty=0.0001,
    dropout_rate=0.2,
    sample_weights=False,
    grid_iter=80,
    gridsearch_method="gridsearch",
    ga_kwargs=None,
    scoring_metric="auc_macro",
    sim_strategy="random",
    sim_prop_missing=0.2,
    n_jobs=1,
    verbose=0,
    kl_beta=tf.Variable(1.0, trainable=False),
    validation_split=0.0,
    nlpca=False,
    testing=False,
):
    """Store the hyperparameters and build the helper transformers.

    Every argument is stored on the instance; ``act_func`` and
    ``nn_method`` are stored as ``act_func_`` and ``nn_method_``. Two
    helpers are created: ``self.sim`` (a ``SimGenotypeDataTransformer``
    that simulates missing data) and ``self.tt_`` (an
    ``AutoEncoderFeatureTransformer`` used to encode the targets).

    NOTE: the default ``kl_beta`` is a ``tf.Variable`` created once, when
    this function is defined, so every instance that relies on the
    default shares the same variable object.
    """
    # Positional configuration chosen by the concrete imputer.
    self.activate = activate
    self.act_func_ = act_func
    self.num_classes = num_classes
    self.testing = testing
    self.nn_method_ = nn_method

    # Data, output and search settings.
    self.genotype_data = genotype_data
    self.prefix = prefix
    self.gridparams = gridparams
    self.disable_progressbar = disable_progressbar
    self.batch_size = batch_size
    self.n_components = n_components

    # Architecture and optimisation hyperparameters.
    self.early_stop_gen = early_stop_gen
    self.num_hidden_layers = num_hidden_layers
    self.hidden_layer_sizes = hidden_layer_sizes
    self.optimizer = optimizer
    self.hidden_activation = hidden_activation
    self.learning_rate = learning_rate
    self.lr_patience = lr_patience
    self.epochs = epochs
    self.weights_initializer = weights_initializer
    self.l1_penalty = l1_penalty
    self.l2_penalty = l2_penalty
    self.dropout_rate = dropout_rate
    self.sample_weights = sample_weights

    # Grid-search and validation settings.
    self.grid_iter = grid_iter
    self.gridsearch_method = gridsearch_method
    self.ga_kwargs = ga_kwargs
    self.scoring_metric = scoring_metric
    self.sim_strategy = sim_strategy
    self.sim_prop_missing = sim_prop_missing
    self.n_jobs = n_jobs
    self.verbose = verbose

    self.kl_beta = kl_beta
    self.validation_split = validation_split
    self.nlpca = nlpca

    # Derived flags: a search runs only when a parameter grid was given,
    # and any class count other than 4 is flagged as multiclass.
    self.run_gridsearch_ = self.gridparams is not None
    self.is_multiclass_ = bool(self.num_classes != 4)

    # Simulate missing data and get missing masks.
    self.sim = SimGenotypeDataTransformer(
        self.genotype_data,
        prop_missing=self.sim_prop_missing,
        strategy=self.sim_strategy,
        mask_missing=True,
    )

    # Encoder that turns y into y_train.
    self.tt_ = AutoEncoderFeatureTransformer(
        num_classes=self.num_classes, activate=self.activate
    )
|
|
273
|
-
|
|
274
|
-
@timer
def fit(self, X):
    """Fit the imputer selected by ``self.nn_method_`` to ``X``.

    The input serves as both features and targets. Missing data are
    simulated with ``self.sim``, the targets are encoded with
    ``self.tt_``, the matching ``run_*`` method trains the model(s), and
    history, metric and (when applicable) search plots are written.

    Args:
        X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Input 012-encoded genotypes.

    Returns:
        self: Current instance; allows method chaining.

    Raises:
        ValueError: If ``self.nn_method_`` is not one of "VAE", "SAE", "NLPCA", or "UBP".
    """
    # X is used as y so that all methods share the same code path.
    y = validate_input_type(X, return_type="array")

    self.nn_ = NeuralNetworkMethods()
    plotting = Plotting()

    self.ga_ = self.gridsearch_method == "genetic_algorithm"

    self.y_original_ = y.copy()
    self.y_simulated_ = self.sim.fit_transform(self.y_original_)

    # Masks exposed by the simulator after fitting: simulated-only,
    # originally missing, and the union of both.
    simulator = self.sim
    self.sim_missing_mask_ = simulator.sim_missing_mask_
    self.original_missing_mask_ = simulator.original_missing_mask_
    self.all_missing_ = simulator.all_missing_mask_

    # Encode the original (non-simulated) data for training.
    y_train = self.tt_.fit_transform(self.y_original_)

    if self.gridparams is not None:
        self.scoring_metrics_ = [
            "precision_recall_macro",
            "precision_recall_micro",
            "f1_score",
            "auc_macro",
            "auc_micro",
            "accuracy",
            "hamming",
        ]

    # The log file and callbacks returned here are not used in this method.
    (
        _logfile,
        _callbacks,
        compile_params,
        model_params,
        fit_params,
    ) = self._initialize_parameters(y_train)

    # Resolve the runner by name so that only the selected method has to
    # exist on the instance.
    runner_names = {
        "VAE": "run_vae",
        "SAE": "run_sae",
        "NLPCA": "run_nlpca",
        "UBP": "run_ubp",
    }
    if self.nn_method_ not in runner_names:
        raise ValueError(f"Invalid nn_method specified: {self.nn_method_}")
    func = getattr(self, runner_names[self.nn_method_])

    (
        self.models_,
        self.histories_,
        self.best_params_,
        self.best_score_,
        self.best_estimator_,
        self.search_,
        self.metrics_,
    ) = func(
        self.y_original_,
        y_train,
        model_params,
        compile_params,
        fit_params,
    )

    # Report the tuned learning rate under its plain name.
    best = self.best_params_
    if best is not None and "optimizer__learning_rate" in best:
        best["learning_rate"] = best.pop("optimizer__learning_rate")

    if self.gridparams is not None:
        if self.verbose > 0:
            print("\nBest found parameters:")
            pprint.pprint(self.best_params_)
            print(f"\nBest score: {self.best_score_}")
        plotting.plot_grid_search(
            self.search_.cv_results_, self.nn_method_, self.prefix
        )

    plotting.plot_history(
        self.histories_, self.nn_method_, prefix=self.prefix
    )
    plotting.plot_metrics(
        self.metrics_, self.num_classes, self.prefix, self.nn_method_
    )

    if self.ga_:
        plot_dir = os.path.join(
            f"{self.prefix}_output",
            "plots",
            "Unsupervised",
            self.nn_method_,
        )

        def _save_and_close(filename):
            # Save the current figure into the plot directory, then reset
            # matplotlib's state for the next figure.
            plt.savefig(
                os.path.join(plot_dir, filename),
                bbox_inches="tight",
                facecolor="white",
            )
            plt.cla()
            plt.clf()
            plt.close()

        plot_fitness_evolution(self.search_)
        _save_and_close("fitness_evolution.pdf")

        plotting.plot_search_space(self.search_)
        _save_and_close("search_space.pdf")

    return self
|
|
420
|
-
|
|
421
|
-
def transform(self, X):
    """Predict and decode imputations and return transformed array.

    Values at the originally-missing positions recorded during ``fit``
    (``self.original_missing_mask_``) are replaced with the decoded
    predictions; all other values are kept. Genotype state probabilities
    and a confusion matrix are written as side effects.

    Args:
        X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Input data to transform.

    Returns:
        numpy.ndarray: Imputed data with the same shape as the validated input.
    """
    y = validate_input_type(X, return_type="array")

    # UBP/NLPCA use the last stored model, every other method the first.
    # With a single stored model the two are the same object.
    if self.nn_method_ not in ["UBP", "NLPCA"]:
        model = self.models_[0]
    else:
        model = self.models_[-1]

    y_true = y.copy()
    y_train = self.tt_.transform(y_true)
    y_true_1d = y_true.ravel()
    y_size = y_true.size
    y_missing_idx = np.flatnonzero(self.original_missing_mask_)

    if self.nn_method_ == "VAE":
        y_pred = model(
            tf.convert_to_tensor(y_train),
            training=False,
        )
    elif self.nn_method_ == "SAE":
        y_pred = model(y_train, training=False)
    else:
        # UBP/NLPCA predict from the model's own latent input.
        y_pred = model(model.V_latent, training=False)
    # NOTE(review): assumed to apply to every method's output, not only
    # the UBP/NLPCA branch -- confirm against the original indentation.
    y_pred = self.tt_.inverse_transform(y_pred)

    y_pred_decoded = self.nn_.decode_masked(
        y_train,
        y_pred,
        is_multiclass=self.is_multiclass_,
    )

    y_pred_1d = y_pred_decoded.ravel()

    # Only replace originally missing values at missing indexes. A single
    # indexed assignment replaces the former per-element membership test,
    # which scanned the whole index array for every value. Indices that
    # fall outside this array are skipped, as before.
    fill_idx = y_missing_idx[y_missing_idx < y_size]
    y_true_1d[fill_idx] = y_pred_1d[fill_idx]

    self.nn_.write_gt_state_probs(
        y_pred,
        y_pred_1d,
        y_true,
        y_true_1d,
        self.nn_method_,
        self.sim_missing_mask_,
        self.original_missing_mask_,
        prefix=self.prefix,
    )

    Plotting.plot_confusion_matrix(
        y_true_1d, y_pred_1d, self.nn_method_, prefix=self.prefix
    )

    # Return to original shape.
    return np.reshape(y_true_1d, y_true.shape)
|
|
494
|
-
|
|
495
|
-
def run_clf(
    self,
    y_train,
    y_true,
    model_params,
    compile_params,
    fit_params,
    ubp_weights=None,
    phase=None,
    scoring=None,
    testing=False,
    **kwargs,
):
    """Build, fit and score the classifier for the configured method.

    A ``VAEClassifier``, ``SAEClassifier`` or ``MLPClassifier`` is built
    depending on ``self.nn_method_``. It is fitted directly, or through
    ``GASearchCV`` / ``GridSearchCV`` / ``RandomizedSearchCV`` when
    ``self.run_gridsearch_`` is set. The fitted model's predictions are then
    passed to ``Scorers.scorer``.

    Args:
        y_train (numpy.ndarray): Onehot-encoded training input data of shape (n_samples, n_features, num_classes).

        y_true (numpy.ndarray): Original 012-encoded input data of shape (n_samples, n_features).

        model_params (Dict[str, Any]): Parameters passed to the classifier. For UBP and NLPCA the ``"V"`` entry is popped, so the caller's dictionary is modified.

        compile_params (Dict[str, Any]): Supplies the ``optimizer``, ``learning_rate``, ``loss`` and ``metrics`` entries (plus ``run_eagerly`` for VAE).

        fit_params (Dict[str, Any]): Supplies the ``callbacks``, ``epochs`` and ``validation_split`` entries. The last callback is replaced by a progress bar when progress bars are enabled and no grid search is run.

        ubp_weights (optional): Forwarded to ``MLPClassifier`` (UBP / NLPCA only). Defaults to None.

        phase (int, optional): Forwarded to ``MLPClassifier`` and shown in the progress-bar description. Defaults to None.

        scoring (Dict[str, Callable], optional): Multimetric scorer used by the grid search. Defaults to None.

        testing (bool, optional): Accepted for call compatibility but not read here; ``self.testing`` is what reaches the scorer.

        **kwargs: Accepted for call compatibility; not read here.

    Returns:
        tuple: ``(model, best_history, best_params, best_score, best_clf, search, metrics)``. For UBP and NLPCA the popped ``V`` is prepended as an extra first element. ``best_params``, ``best_score`` and ``search`` are None when no grid search was run.

    Raises:
        ValueError: If a non-GA grid search is requested with an unsupported ``self.gridsearch_method``.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm against the packaged module.

    # tensorflow builds graphs that stack up unless they are cleared before
    # a new model is built; clearing reduces memory usage.
    tf.keras.backend.clear_session()
    self.nn_.reset_seeds()

    uses_latent_input = self.nn_method_ in ["UBP", "NLPCA"]

    desc = "Epoch: "
    if uses_latent_input:
        V = model_params.pop("V")
        if phase is not None:
            desc = f"Epoch (Phase {phase}): "

    if not self.disable_progressbar and not self.run_gridsearch_:
        fit_params["callbacks"][-1] = TqdmCallback(
            epochs=self.epochs, verbose=self.verbose, desc=desc
        )

    if self.nn_method_ == "VAE":
        clf = VAEClassifier(
            **model_params,
            optimizer=compile_params["optimizer"],
            optimizer__learning_rate=compile_params["learning_rate"],
            loss=compile_params["loss"],
            metrics=compile_params["metrics"],
            run_eagerly=compile_params["run_eagerly"],
            callbacks=fit_params["callbacks"],
            epochs=fit_params["epochs"],
            verbose=0,
            num_classes=self.num_classes,
            activate=self.act_func_,
            fit__validation_split=fit_params["validation_split"],
            score__missing_mask=self.sim_missing_mask_,
            score__scoring_metric=self.scoring_metric,
            score__num_classes=self.num_classes,
            score__n_classes=self.num_classes,
        )
    elif self.nn_method_ == "SAE":
        clf = SAEClassifier(
            **model_params,
            optimizer=compile_params["optimizer"],
            optimizer__learning_rate=compile_params["learning_rate"],
            loss=compile_params["loss"],
            metrics=compile_params["metrics"],
            callbacks=fit_params["callbacks"],
            epochs=fit_params["epochs"],
            verbose=0,
            activate=self.act_func_,
            fit__validation_split=fit_params["validation_split"],
            score__missing_mask=self.sim_missing_mask_,
            score__scoring_metric=self.scoring_metric,
            score__num_classes=self.num_classes,
            score__n_classes=self.num_classes,
        )
    else:
        clf = MLPClassifier(
            V,
            y_train,
            **model_params,
            ubp_weights=ubp_weights,
            optimizer=compile_params["optimizer"],
            optimizer__learning_rate=compile_params["learning_rate"],
            loss=compile_params["loss"],
            metrics=compile_params["metrics"],
            epochs=fit_params["epochs"],
            phase=phase,
            callbacks=fit_params["callbacks"],
            validation_split=fit_params["validation_split"],
            verbose=0,
            score__missing_mask=self.sim_missing_mask_,
            score__scoring_metric=self.scoring_metric,
        )

    # UBP / NLPCA are fitted on the latent input, the autoencoders on the
    # genotypes themselves.
    fit_input = V[self.n_components] if uses_latent_input else y_true

    if self.run_gridsearch_:
        # Cannot do CV because there is no way to use test splits given
        # that the input gets refined. A test split would just hold the
        # randomly initialized values and would not represent the model,
        # so cross-validation is disabled for the searches.
        cross_val = DisabledCV()
        verbose = self.verbose != 0

        if self.ga_:
            # Stop searching if the GA sees no improvement.
            callback = [
                ConsecutiveStopping(
                    generations=self.early_stop_gen, metric="fitness"
                )
            ]

            if not self.disable_progressbar:
                callback.append(ProgressBar())

            search = GASearchCV(
                estimator=clf,
                cv=cross_val,
                scoring=scoring,
                generations=self.grid_iter,
                param_grid=self.gridparams,
                n_jobs=self.n_jobs,
                refit=self.scoring_metric,
                verbose=verbose,
                **self.ga_kwargs,
                error_score="raise",
            )

            if uses_latent_input:
                # NOTE(review): the GA callbacks are not forwarded on this
                # path, unlike the VAE / SAE path below. Kept as-is;
                # confirm whether that is intended.
                search.fit(fit_input, y=y_true)
            else:
                search.fit(y_true, y_true, callbacks=callback)

        else:
            # With verbose >= 10 the search progress goes to a log file
            # instead of STDOUT. The finally clause guarantees STDOUT is
            # restored and the file closed even if the search raises
            # (error_score="raise" makes that a realistic path).
            old_stdout = sys.stdout
            log_file = None
            try:
                if self.verbose >= 10:
                    log_file = open(
                        os.path.join(
                            f"{self.prefix}_output",
                            "logs",
                            "Unsupervised",
                            self.nn_method_,
                            "gridsearch_progress_log.txt",
                        ),
                        "w",
                    )
                    sys.stdout = log_file

                if self.gridsearch_method.lower() == "gridsearch":
                    search = GridSearchCV(
                        clf,
                        param_grid=self.gridparams,
                        n_jobs=self.n_jobs,
                        cv=cross_val,
                        scoring=scoring,
                        refit=self.scoring_metric,
                        verbose=self.verbose * 4,
                        error_score="raise",
                    )

                elif self.gridsearch_method.lower() == "randomized_gridsearch":
                    search = RandomizedSearchCV(
                        clf,
                        param_distributions=self.gridparams,
                        n_iter=self.grid_iter,
                        n_jobs=self.n_jobs,
                        cv=cross_val,
                        scoring=scoring,
                        refit=self.scoring_metric,
                        # Boolean flag times 4 (0 or 4), unlike the
                        # GridSearchCV branch; kept as in the original.
                        verbose=verbose * 4,
                        error_score="raise",
                    )

                else:
                    raise ValueError(
                        f"Invalid gridsearch_method specified: "
                        f"{self.gridsearch_method}"
                    )

                search.fit(fit_input, y=y_true)
            finally:
                if log_file is not None:
                    sys.stdout = old_stdout
                    log_file.close()

        best_params = search.best_params_
        best_score = search.best_score_
        best_clf = search.best_estimator_

        fp = os.path.join(
            f"{self.prefix}_output",
            "reports",
            "Unsupervised",
            self.nn_method_,
            f"cvresults_{self.nn_method_}.csv",
        )

        cv_results = pd.DataFrame(search.cv_results_)
        cv_results.to_csv(fp, index=False)

    else:
        clf.fit(fit_input, y=y_true)
        best_params = None
        best_score = None
        search = None
        best_clf = clf

    model = best_clf.model_
    best_history = best_clf.history_

    if self.nn_method_ == "VAE":
        y_pred_proba = model(
            tf.convert_to_tensor(y_train),
            training=False,
        )
    elif self.nn_method_ == "SAE":
        y_pred_proba = model(y_train, training=False)
    elif uses_latent_input:
        y_pred_proba = model(model.V_latent, training=False)
    y_pred = self.tt_.inverse_transform(y_pred_proba)

    # Get metric scores.
    metrics = Scorers.scorer(
        y_true,
        y_pred,
        missing_mask=self.sim_missing_mask_,
        num_classes=self.num_classes,
        testing=self.testing,
    )

    results = (
        model,
        best_history,
        best_params,
        best_score,
        best_clf,
        search,
        metrics,
    )
    if uses_latent_input:
        return (V,) + results
    return results
|
|
786
|
-
|
|
787
|
-
def _initialize_parameters(self, y_train):
    """Assemble callbacks and the compile / model / fit parameter sets.

    Args:
        y_train (numpy.ndarray): Training subset of original input data.

    Returns:
        tuple: ``(logfile, callbacks, compile_params, model_params, fit_params)`` where ``logfile`` is the CSVLogger path, ``callbacks`` the list of Keras callbacks, and the three dictionaries hold the parameters for compiling, building and fitting the model.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation. In particular the sample-weight section is
    # assumed to run for every method (fit_params reads its result
    # unconditionally) -- confirm against the packaged module.
    method = self.nn_method_
    is_ubp_like = method in ["UBP", "NLPCA"]
    is_autoencoder = method in ["VAE", "SAE"]

    # For CSVLogger() callback.
    logfile = os.path.join(
        f"{self.prefix}_output",
        "logs",
        "Unsupervised",
        method,
        "training_log.csv",
    )

    callbacks = [
        CSVLogger(filename=logfile, append=(method == "UBP")),
        ReduceLROnPlateau(
            patience=self.lr_patience, min_lr=1e-6, min_delta=1e-6
        ),
    ]

    if is_autoencoder:
        callbacks.append(VAECallbacks())
        if method == "VAE":
            callbacks.append(
                CyclicalAnnealingCallback(self.epochs, schedule_type="sigmoid")
            )
    else:
        callbacks.append(UBPCallbacks())

    search_mode = bool(self.run_gridsearch_)

    # The progress bar goes last; run_clf replaces the last callback.
    if not (self.disable_progressbar or search_mode):
        callbacks.append(
            TqdmCallback(epochs=self.epochs, verbose=0, desc="Epoch: ")
        )

    if is_ubp_like:
        vinput = self._initV(y_train, search_mode)
        compile_params = self.nn_.set_compile_params(self.optimizer)

    weighting = self.sample_weights
    if weighting == "auto" or weighting == "logsmooth":
        # Get class weights for each column.
        raw_weights = self.nn_.get_class_weights(
            self.y_original_,
            self.original_missing_mask_,
            return_1d=False,
            method=weighting,
        )
        sample_weights = self.nn_.normalize_data(raw_weights)

    elif isinstance(weighting, dict):
        # Classes with a user weight of 0.0 are removed from the simulated
        # missing mask.
        for cls in range(self.num_classes):
            if weighting[cls] == 0.0:
                self.sim_missing_mask_[self.y_original_ == cls] = False

        sample_weights = self.nn_.get_class_weights(
            self.y_original_, user_weights=weighting
        )

    else:
        sample_weights = None

    compile_params = self.nn_.set_compile_params(
        self.optimizer,
        sample_weights,
        vae=(method == "VAE"),
        act_func=self.act_func_,
    )
    compile_params["learning_rate"] = self.learning_rate

    # Entries shared by both kinds of model, in their original order.
    architecture = {
        "missing_mask": self.original_missing_mask_,
        "output_shape": y_train.shape[1],
        "weights_initializer": self.weights_initializer,
        "n_components": self.n_components,
        "hidden_layer_sizes": self.hidden_layer_sizes,
        "num_hidden_layers": self.num_hidden_layers,
        "hidden_activation": self.hidden_activation,
        "l1_penalty": self.l1_penalty,
        "l2_penalty": self.l2_penalty,
        "dropout_rate": self.dropout_rate,
    }

    if is_autoencoder:
        model_params = {
            "y": y_train,
            "batch_size": self.batch_size,
            "sample_weight": sample_weights,
            **architecture,
        }
        if method == "VAE":
            # NOTE(review): this is a one-element tuple (trailing comma),
            # not a float -- confirm that is what the VAE model expects.
            model_params["kl_beta"] = (1.0 / y_train.shape[0],)
    else:
        model_params = {
            "V": vinput,
            "y_train": y_train,
            "batch_size": self.batch_size,
            **architecture,
            "num_classes": self.num_classes,
            "sample_weight": sample_weights,
        }

    fit_params = {
        "batch_size": self.batch_size,
        "epochs": self.epochs,
        "callbacks": callbacks,
        # Only the autoencoders shuffle and hold out a validation split.
        "shuffle": is_autoencoder,
        "verbose": 1 if self.verbose == 2 else 0,
        "sample_weight": sample_weights,
        "validation_split": self.validation_split if is_autoencoder else 0.0,
    }

    # Rename the grid key so it reaches the optimizer.
    if self.run_gridsearch_ and "learning_rate" in self.gridparams:
        self.gridparams["optimizer__learning_rate"] = self.gridparams.pop(
            "learning_rate"
        )

    return (
        logfile,
        callbacks,
        compile_params,
        model_params,
        fit_params,
    )
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
class VAE(BaseNNImputer):
    """Class to impute missing data using a Variational Autoencoder neural network."""

    def __init__(
        self,
        kl_beta=None,
        validation_split=0.2,
        **kwargs,
    ):
        """Set the VAE-specific settings and initialize the base imputer.

        Args:
            kl_beta (optional): Stored on the instance and forwarded to ``BaseNNImputer.__init__``. When None (the default) a fresh ``tf.Variable(1.0, trainable=False)`` is created for this instance.

            validation_split (float, optional): Stored on the instance and forwarded to ``BaseNNImputer.__init__``. Defaults to 0.2.

            **kwargs: Forwarded to ``BaseNNImputer.__init__``. ``testing`` is also read from here (default False).
        """
        # The variable used to be the default argument itself, so it was
        # created once at class-definition time and shared by every
        # instance. Build it per instance instead.
        if kl_beta is None:
            kl_beta = tf.Variable(1.0, trainable=False)

        self.kl_beta = kl_beta
        self.validation_split = validation_split

        self.nn_method_ = "VAE"
        self.num_classes = 4
        self.activate = None
        self.is_multiclass_ = self.num_classes != 4
        self.testing = kwargs.get("testing", False)
        self.do_act_in_model_ = self.activate is None

        # The activation is only named when it is applied inside the model.
        if not self.do_act_in_model_:
            self.act_func_ = None
        elif self.is_multiclass_:
            self.act_func_ = "softmax"
        else:
            self.act_func_ = "sigmoid"

        super().__init__(
            self.activate,
            self.nn_method_,
            self.num_classes,
            self.act_func_,
            **kwargs,
            kl_beta=self.kl_beta,
            validation_split=self.validation_split,
        )

    def run_vae(
        self,
        y_true,
        y_train,
        model_params,
        compile_params,
        fit_params,
    ):
        """Run VAE using custom subclassed model.

        Args:
            y_true (numpy.ndarray): Original genotypes (training dataset) with known and missing values, of shape (n_samples, n_features).

            y_train (numpy.ndarray): Onehot encoded genotypes (training dataset) with known and missing values, of shape (n_samples, n_features, num_classes).

            model_params (Dict[str, Any]): Dictionary with parameters to pass to the classifier model.

            compile_params (Dict[str, Any]): Dictionary with parameters to pass to the tensorflow compile function.

            fit_params (Dict[str, Any]): Dictionary with parameters to pass to the fit() function.

        Returns:
            tuple: ``(models, histories, best_params, best_score, best_clf, search, metrics)``. ``models`` and ``histories`` are one-element lists holding the model and history returned by ``run_clf``; the remaining items are passed through from ``run_clf`` unchanged.
        """
        # A multimetric scorer is only needed for a grid search.
        scoring = None
        if self.run_gridsearch_:
            scoring = Scorers().make_multimetric_scorer(
                self.scoring_metrics_,
                self.sim_missing_mask_,
                num_classes=self.num_classes,
            )

        (
            model,
            best_history,
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        ) = self.run_clf(
            y_train,
            y_true,
            model_params,
            compile_params,
            fit_params,
            scoring=scoring,
        )

        return (
            [model],
            [best_history],
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        )
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
class SAE(BaseNNImputer):
    """Impute missing data using a standard autoencoder neural network."""

    def __init__(self, **kwargs):
        """Set the SAE-specific settings and initialize the base imputer.

        Args:
            **kwargs: Forwarded to ``BaseNNImputer.__init__``. ``testing`` is also read from here (default False).
        """
        self.nn_method_ = "SAE"
        self.num_classes = 3
        self.activate = "softmax"
        self.act_func_ = "softmax"
        self.testing = kwargs.get("testing", False)

        super().__init__(
            self.activate,
            self.nn_method_,
            self.num_classes,
            self.act_func_,
            **kwargs,
        )

    def run_sae(
        self,
        y_true,
        y_train,
        model_params,
        compile_params,
        fit_params,
    ):
        """Run standard autoencoder using custom subclassed model.

        Args:
            y_true (numpy.ndarray): Original genotypes (training dataset) with known and missing values of shape (n_samples, n_features).

            y_train (numpy.ndarray): Onehot-encoded genotypes (training dataset) with known and missing values of shape (n_samples, n_features, num_classes).

            model_params (Dict[str, Any]): Dictionary with parameters to pass to the classifier model.

            compile_params (Dict[str, Any]): Dictionary with parameters to pass to the tensorflow compile function.

            fit_params (Dict[str, Any]): Dictionary with parameters to pass to the fit() function.

        Returns:
            tuple: ``(models, histories, best_params, best_score, best_clf, search, metrics)``. ``models`` and ``histories`` are one-element lists holding the model and history returned by ``run_clf``; the remaining items are passed through from ``run_clf`` unchanged.
        """
        scorer_factory = Scorers()

        # A multimetric scorer is only needed for a grid search.
        if self.run_gridsearch_:
            scoring = scorer_factory.make_multimetric_scorer(
                self.scoring_metrics_, self.sim_missing_mask_
            )
        else:
            scoring = None

        outcome = self.run_clf(
            y_train,
            y_true,
            model_params,
            compile_params,
            fit_params,
            scoring=scoring,
            testing=False,
        )
        (
            model,
            best_history,
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        ) = outcome

        return (
            [model],
            [best_history],
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        )
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
class UBP(BaseNNImputer):
|
|
1166
|
-
def __init__(self, *, nlpca=False, **kwargs):
    """Set the UBP / NLPCA settings and initialize the base imputer.

    Args:
        nlpca (bool, optional): When true the method name is set to "NLPCA", otherwise to "UBP". Keyword-only. Defaults to False.

        **kwargs: Forwarded to ``BaseNNImputer.__init__``. ``testing`` is also read from here (default False).
    """
    # TODO: Make estimators compatible with variable number of classes.
    # E.g., with morphological data.
    self.nlpca = nlpca
    if self.nlpca:
        self.nn_method_ = "NLPCA"
    else:
        self.nn_method_ = "UBP"

    self.num_classes = 3
    self.activate = None
    self.act_func_ = "softmax"
    self.testing = kwargs.get("testing", False)

    super().__init__(
        self.activate,
        self.nn_method_,
        self.num_classes,
        self.act_func_,
        **kwargs,
        nlpca=self.nlpca,
    )
|
|
1189
|
-
|
|
1190
|
-
def run_nlpca(
    self,
    y_true,
    y_train,
    model_params,
    compile_params,
    fit_params,
):
    """Run NLPCA using custom subclassed model.

    Args:
        y_true (numpy.ndarray): Original genotypes with known and missing values.

        y_train (numpy.ndarray): For compatibility with VAE and SAE. Not used; it is replaced by the ``"y_train"`` entry popped from ``model_params``.

        model_params (Dict[str, Any]): Dictionary with parameters to pass to the classifier model. Its ``"y_train"`` entry is removed here.

        compile_params (Dict[str, Any]): Dictionary with parameters to pass to the tensorflow compile function.

        fit_params (Dict[str, Any]): Dictionary with parameters to pass to the fit() function.

    Returns:
        tuple: ``(models, histories, best_params, best_score, best_clf, search, metrics)``. ``models`` and ``histories`` are one-element lists holding the model and history returned by ``run_clf``; the remaining items are passed through from ``run_clf`` unchanged.
    """
    scorer_factory = Scorers()
    y_train = model_params.pop("y_train")

    # A multimetric scorer is only needed for a grid search.
    if self.run_gridsearch_:
        scoring = scorer_factory.make_multimetric_scorer(
            self.scoring_metrics_, self.sim_missing_mask_
        )
    else:
        scoring = None

    # NLPCA is a single run: no phase and no weights from a previous phase.
    # The latent input returned first by run_clf is not needed here.
    (
        _latent,
        model,
        best_history,
        best_params,
        best_score,
        best_clf,
        search,
        metrics,
    ) = self.run_clf(
        y_train,
        y_true,
        model_params,
        compile_params,
        fit_params,
        ubp_weights=None,
        phase=None,
        scoring=scoring,
        testing=False,
    )

    return (
        [model],
        [best_history],
        best_params,
        best_score,
        best_clf,
        search,
        metrics,
    )
|
|
1274
|
-
|
|
1275
|
-
def run_ubp(
    self,
    y_true,
    y_train,
    model_params,
    compile_params,
    fit_params,
):
    """Run UBP using custom subclassed model.

    ``run_clf`` is called once per phase (1, 2, 3). Phase 3 receives the
    weights of the phase 2 model.

    Args:
        y_true (numpy.ndarray): Original genotypes with known and missing values.

        y_train (numpy.ndarray): For compatibility with VAE and SAE. Not used; it is replaced by the ``"y_train"`` entry popped from ``model_params``.

        model_params (Dict[str, Any]): Dictionary with parameters to pass to the classifier model. Its ``"y_train"`` entry is removed here and its ``"V"`` (and possibly ``"n_components"``) entry is rewritten between phases.

        compile_params (Dict[str, Any]): Dictionary with parameters to pass to the tensorflow compile function.

        fit_params (Dict[str, Any]): Dictionary with parameters to pass to the fit() function.

    Returns:
        tuple: ``(models, histories, best_params, best_score, best_clf, search, metrics)``. ``models`` and ``histories`` hold one entry per phase; the remaining items come from the final phase's ``run_clf`` call.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm against the packaged module.
    scorer_factory = Scorers()
    models = []
    histories = []
    y_train = model_params.pop("y_train")

    scoring = None
    search_n_components = False
    if self.run_gridsearch_:
        # Cannot do CV because there is no way to use test splits given
        # that the input gets refined. A test split would just hold the
        # randomly initialized values and would not represent the model,
        # so cross-validation is disabled for the grid searches.
        scoring = scorer_factory.make_multimetric_scorer(
            self.scoring_metrics_, self.sim_missing_mask_
        )

        if "n_components" in self.gridparams:
            search_n_components = True
            n_components_searched = self.n_components

    for phase in (1, 2, 3):
        ubp_weights = None
        if phase == 3:
            ubp_weights = models[1].get_weights()

        (
            V,
            model,
            best_history,
            best_params,
            best_score,
            best_clf,
            search,
            metrics,
        ) = self.run_clf(
            y_train,
            y_true,
            model_params,
            compile_params,
            fit_params,
            ubp_weights=ubp_weights,
            phase=phase,
            scoring=scoring,
            testing=False,
        )

        tuned_components = best_params is not None and search_n_components

        if phase == 1 and tuned_components:
            # Cannot have V input with different n_components in other
            # phases than are in phase 1, so the n_components search has
            # to happen in phase 1 and is then fixed for the later ones.
            n_components_searched = best_params["n_components"]
            model_params["V"] = {
                n_components_searched: model.V_latent.copy()
            }
            model_params["n_components"] = n_components_searched
            self.n_components = n_components_searched
            self.gridparams.pop("n_components")
        elif phase in (1, 2):
            # run_clf popped "V"; hand it back for the next phase.
            model_params["V"] = V
        elif tuned_components:
            # Phase 3: report the n_components chosen in phase 1.
            best_params["n_components"] = n_components_searched

        histories.append(best_history)
        models.append(model)

    return (
        models,
        histories,
        best_params,
        best_score,
        best_clf,
        search,
        metrics,
    )
|
|
1394
|
-
|
|
1395
|
-
def _initV(self, y_train, search_mode):
    """Initialize random input V as dictionary of numpy arrays.

    Args:
        y_train (numpy.ndarray): One-hot encoded training dataset (actual data). Only its first dimension is read.

        search_mode (bool): Whether doing grid search.

    Returns:
        Dict[int, numpy.ndarray]: Dictionary with n_components: V as key-value pairs, each V produced by ``self.nn_.init_weights``.

    Raises:
        ValueError: In search mode, when the candidate n_components is not a single int and its smallest value is below 2.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm against the packaged module.

    def make_v(size):
        # One randomly initialized V with a row per sample.
        return self.nn_.init_weights(y_train.shape[0], size)

    if not search_mode:
        return {self.n_components: make_v(self.n_components)}

    if "n_components" in self.gridparams:
        n_components = self.gridparams["n_components"]
    else:
        n_components = self.n_components

    if isinstance(n_components, int):
        return {self.n_components: make_v(self.n_components)}

    if min(n_components) < 2:
        raise ValueError(
            f"n_components must be >= 2, but a value of {n_components} was specified."
        )

    if len(n_components) == 1:
        return {n_components[0]: make_v(n_components[0])}

    # One V per candidate so the search can try each size.
    vinput = dict()
    for size in n_components:
        vinput[size] = make_v(size)
    return vinput
|