pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
- pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
- pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
- pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +909 -0
- pgsui/data_processing/__init__.py +0 -0
- pgsui/data_processing/config.py +565 -0
- pgsui/data_processing/containers.py +1424 -0
- pgsui/data_processing/transformers.py +557 -907
- pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/extra-resources/.gitkeep +1 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +227 -0
- pgsui/electron/app/package-lock.json +6894 -0
- pgsui/electron/app/package.json +51 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +157 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +131 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +57 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/__init__.py +0 -0
- pgsui/example_data/phylip_files/__init__.py +0 -0
- pgsui/example_data/phylip_files/test.phy +0 -0
- pgsui/example_data/popmaps/__init__.py +0 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/structure_files/__init__.py +0 -0
- pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/__init__.py +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
- pgsui/impute/deterministic/imputers/mode.py +844 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +973 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
- pgsui/impute/supervised/__init__.py +0 -0
- pgsui/impute/supervised/base.py +343 -0
- pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
- pgsui/impute/supervised/imputers/random_forest.py +291 -0
- pgsui/impute/unsupervised/__init__.py +0 -0
- pgsui/impute/unsupervised/base.py +1118 -0
- pgsui/impute/unsupervised/callbacks.py +92 -262
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
- pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
- pgsui/impute/unsupervised/imputers/vae.py +1228 -0
- pgsui/impute/unsupervised/loss_functions.py +261 -0
- pgsui/impute/unsupervised/models/__init__.py +0 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
- pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
- pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
- pgsui/impute/unsupervised/models/vae_model.py +269 -630
- pgsui/impute/unsupervised/nn_scorers.py +255 -0
- pgsui/utils/__init__.py +0 -0
- pgsui/utils/classification_viz.py +608 -0
- pgsui/utils/logging_utils.py +22 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +996 -829
- pgsui/utils/pretty_metrics.py +290 -0
- pgsui/utils/scorers.py +213 -666
- pg_sui-0.2.0.dist-info/RECORD +0 -75
- pg_sui-0.2.0.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -1268
- pgsui/impute/impute.py +0 -1463
- pgsui/impute/simple_imputers.py +0 -1431
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
- pgsui/impute/unsupervised/keras_classifiers.py +0 -697
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -151
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -185
pgsui/impute/impute.py
DELETED
|
@@ -1,1463 +0,0 @@
|
|
|
1
|
-
# Standard library imports
|
|
2
|
-
import errno
|
|
3
|
-
import gc
|
|
4
|
-
import math
|
|
5
|
-
import os
|
|
6
|
-
import pprint
|
|
7
|
-
import sys
|
|
8
|
-
from collections import Counter
|
|
9
|
-
from collections import defaultdict
|
|
10
|
-
from operator import itemgetter
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
from statistics import mean, median
|
|
13
|
-
from contextlib import redirect_stdout
|
|
14
|
-
from typing import Optional, Union, List, Dict, Tuple, Any, Callable
|
|
15
|
-
from copy import deepcopy
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
# Third party imports
|
|
19
|
-
import numpy as np
|
|
20
|
-
import pandas as pd
|
|
21
|
-
|
|
22
|
-
from scipy import stats as st
|
|
23
|
-
|
|
24
|
-
# from memory_profiler import memory_usage
|
|
25
|
-
|
|
26
|
-
# Scikit-learn imports
|
|
27
|
-
from sklearn.experimental import enable_iterative_imputer
|
|
28
|
-
from sklearn import metrics
|
|
29
|
-
|
|
30
|
-
from sklearn_genetic.space import Continuous, Categorical, Integer
|
|
31
|
-
|
|
32
|
-
# Custom module imports
|
|
33
|
-
try:
|
|
34
|
-
from .supervised.iterative_imputer_gridsearch import (
|
|
35
|
-
IterativeImputerGridSearch,
|
|
36
|
-
)
|
|
37
|
-
from .supervised.iterative_imputer_fixedparams import (
|
|
38
|
-
IterativeImputerFixedParams,
|
|
39
|
-
)
|
|
40
|
-
from .unsupervised.neural_network_imputers import VAE, UBP, SAE
|
|
41
|
-
from ..utils.misc import isnotebook
|
|
42
|
-
from ..utils.misc import timer
|
|
43
|
-
from ..data_processing.transformers import (
|
|
44
|
-
SimGenotypeDataTransformer,
|
|
45
|
-
)
|
|
46
|
-
except (ModuleNotFoundError, ValueError, ImportError):
|
|
47
|
-
from impute.supervised.iterative_imputer_gridsearch import (
|
|
48
|
-
IterativeImputerGridSearch,
|
|
49
|
-
)
|
|
50
|
-
from impute.supervised.iterative_imputer_fixedparams import (
|
|
51
|
-
IterativeImputerFixedParams,
|
|
52
|
-
)
|
|
53
|
-
from impute.unsupervised.neural_network_imputers import VAE, UBP, SAE
|
|
54
|
-
from utils.misc import isnotebook
|
|
55
|
-
from utils.misc import timer
|
|
56
|
-
from data_processing.transformers import (
|
|
57
|
-
SimGenotypeDataTransformer,
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
is_notebook = isnotebook()
|
|
61
|
-
|
|
62
|
-
if is_notebook:
|
|
63
|
-
from tqdm.notebook import tqdm as progressbar
|
|
64
|
-
else:
|
|
65
|
-
from tqdm import tqdm as progressbar
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class Impute:
|
|
69
|
-
"""Class to impute missing data from the provided classifier.
|
|
70
|
-
|
|
71
|
-
The Impute class will either run a variational autoencoder or IterativeImputer with the provided estimator. The settings for the provided estimator should be provided as the ``kwargs`` argument as a dictionary object with the estimator's keyword arguments as the keys and the corresponding values. E.g., ``kwargs={"n_jobs": 4, "initial_strategy": "populations"}``\. ``clf_type`` just specifies either "classifier" or "regressor". "regressor" is primarily just for quick and dirty testing and is intended for internal use only.
|
|
72
|
-
|
|
73
|
-
Once the Impute class is initialized, the imputation should be performed with ``fit_predict()``\.
|
|
74
|
-
|
|
75
|
-
The imputed data can then be written to a file with ``write_imputed()``
|
|
76
|
-
|
|
77
|
-
Args:
|
|
78
|
-
clf (str or Callable estimator object): The estimator object to use. If using a variational autoencoder, the provided value should be "VAE". Otherwise, it should be a callable estimator object that is compatible with scikit-learn's IterativeImputer.
|
|
79
|
-
|
|
80
|
-
clf_type (str): Specify whether to use a "classifier" or "regressor". The "regressor" option is just for quick and dirty testing, and "classifier" should almost always be used.
|
|
81
|
-
|
|
82
|
-
kwargs (Dict[str, Any]): Settings to use with the estimator. The keys should be the estimator's keywords, and the values should be their corresponding settings.
|
|
83
|
-
|
|
84
|
-
Raises:
|
|
85
|
-
TypeError: Check whether the ``gridparams`` values are of the correct format if ``ga=True`` or ``ga=False``\.
|
|
86
|
-
|
|
87
|
-
Examples:
|
|
88
|
-
# Don't use parentheses after estimator object.
|
|
89
|
-
>>>imputer = Impute(sklearn.ensemble.RandomForestClassifier,
|
|
90
|
-
"classifier",
|
|
91
|
-
{"n_jobs": 4, "initial_strategy": "populations", "max_iter": 25, "n_estimators": 100, "ga": True})
|
|
92
|
-
>>>self.imputed, self.best_params = imputer.fit_predict(df)
|
|
93
|
-
>>>imputer.write_imputed(self.imputed)
|
|
94
|
-
>>>print(self.imputed)
|
|
95
|
-
[[0, 0, 0, 0],
|
|
96
|
-
[0, 0, 0, 0],
|
|
97
|
-
[0, 1, 1, 0],
|
|
98
|
-
[2, 1, 2, 2]]
|
|
99
|
-
"""
|
|
100
|
-
|
|
101
|
-
def __init__(
    self, clf: Union[str, Callable], clf_type: str, kwargs: Dict[str, Any]
) -> None:
    """Record the estimator, unpack the settings, and create output folders.

    Args:
        clf: Estimator to impute with. Its ``__name__`` is used to label
            the output sub-directories.
        clf_type: Stored unchanged as ``self.clf_type``.
        kwargs: Settings dictionary. ``"genotype_data"`` and ``"verbose"``
            are read directly; ``"gridparams"`` and ``"gridsearch_method"``
            are read when a grid search is requested. The whole dictionary
            is handed to ``_gather_impute_settings``.

    Raises:
        TypeError: If any ``gridparams`` value is a ``sklearn_genetic.space``
            object while ``gridsearch_method`` is not "genetic_algorithm".
    """
    self.clf = clf
    self.clf_type = clf_type
    self.original_num_cols = None

    # The neural-network imputers take the "nn" path; everything else is
    # treated as an estimator for the iterative imputer ("ii").
    uses_nn = self.clf == VAE or self.clf == SAE or self.clf == UBP
    if uses_nn:
        self.algorithm, self.imp_method = "nn", "Unsupervised"
    else:
        self.algorithm, self.imp_method = "ii", "Supervised"

    self.imp_name = self.clf.__name__

    # genotype_data objects without a ``populations`` attribute are allowed.
    try:
        self.pops = kwargs["genotype_data"].populations
    except AttributeError:
        self.pops = None

    self.genotype_data = kwargs["genotype_data"]
    self.verbose = kwargs["verbose"]

    # Split the flat kwargs dictionary into the individual settings groups.
    settings = self._gather_impute_settings(kwargs)
    (
        self.imp_kwargs,
        self.clf_kwargs,
        self.ga_kwargs,
        self.cv,
        self.verbose,
        self.n_jobs,
        self.prefix,
        self.column_subset,
        self.disable_progressbar,
        self.chunk_size,
        self.do_validation,
        self.do_gridsearch,
        self.testing,
    ) = settings

    if self.algorithm == "ii":
        self.imp_kwargs["pops"] = self.pops

    if self.do_gridsearch:
        for space in kwargs["gridparams"].values():
            if not isinstance(space, (Categorical, Integer, Continuous)):
                continue
            if kwargs["gridsearch_method"].lower() != "genetic_algorithm":
                raise TypeError(
                    "gridsearch_method argument must equal 'genetic_algorithm' if gridparams values are of type sklearn_genetic.space"
                )

    output_root = f"{self.prefix}_output"
    leaf = (self.imp_method, self.imp_name)

    self.logfilepath = os.path.join(
        output_root, "logs", *leaf, "imputer_progress_log.txt"
    )

    self.invalid_indexes = None

    # Start from a fresh progress log; a missing file is not an error.
    try:
        os.remove(self.logfilepath)
    except OSError:
        pass

    for category in ("plots", "logs", "reports", "alignments"):
        Path(os.path.join(output_root, category, *leaf)).mkdir(
            parents=True, exist_ok=True
        )
|
|
204
|
-
|
|
205
|
-
@timer
def fit_predict(
    self, X: pd.DataFrame
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """Impute ``X`` and wrap the result with ``_imputed2genotypedata``.

    The output path ``<prefix>_output/alignments/<method>/<name>/imputed_012.csv``
    is first opened for writing as a writability check (this truncates any
    existing file). Imputation then runs through ``_impute_gridsearch`` when
    ``self.do_gridsearch`` is set, otherwise through ``_impute_single``.

    Args:
        X (pandas.DataFrame): DataFrame with 012-encoded genotypes.

    Returns:
        The object produced by ``_imputed2genotypedata`` (a copy of
        ``self.genotype_data`` carrying the imputed genotypes), followed by
        the best parameters reported by the imputation step
        (``_impute_single`` always reports ``None``).
    """
    outfile = os.path.join(
        f"{self.prefix}_output",
        "alignments",
        self.imp_method,
        self.imp_name,
        "imputed_012.csv",
    )

    # Fail early if the final output location cannot be written to.
    try:
        with open(outfile, "w"):
            pass
    except IOError as err:
        print(f"Error: {err.errno}, {err.strerror}")
        if err.errno == errno.EACCES:
            sys.exit(f"Permission denied: Cannot write to {outfile}")
        if err.errno == errno.EISDIR:
            sys.exit(f"Could not write to {outfile}; It is a directory")

    if self.do_gridsearch:
        # Search for parameters, then impute with the best ones found.
        imputed_df, df_scores, best_params = self._impute_gridsearch(X)

        if self.verbose > 0:
            print("\nBest Parameters:")
            pprint.pprint(best_params)
    else:
        imputed_df, df_scores, best_params = self._impute_single(X)

        if df_scores is not None:
            self._print_scores(df_scores)

    imp_data = self._imputed2genotypedata(imputed_df, self.genotype_data)

    print("\nDone!\n")
    return imp_data, best_params
|
|
264
|
-
|
|
265
|
-
def _df2chunks(
    self, df: pd.DataFrame, chunk_size: Union[int, float]
) -> List[pd.DataFrame]:
    """Split a DataFrame column-wise into consecutive chunks.

    Chunking is not supported when ``initial_strategy`` is 'phylogeny' or
    'mf'; in that case any ``chunk_size`` other than the float ``1.0`` is
    replaced by ``1.0`` (with a warning) and the whole dataset is returned
    as a single chunk.

    Args:
        df (pandas.DataFrame): DataFrame to split into chunks.

        chunk_size (int or float): If an integer, the number of columns per
            chunk (must be >= 1). If a float, the fraction of columns per
            chunk (must satisfy 0.0 < chunk_size <= 1.0); ``1.0`` returns
            one chunk holding all the data.

    Returns:
        List[pandas.DataFrame]: List of pandas DataFrames of shape
        (n_samples, n_features_in_chunk).

    Raises:
        ValueError: ``chunk_size`` is not an int or float, or is outside
            the valid range for its type.
    """
    strategy = self.imp_kwargs.get("initial_strategy")

    # Only the float 1.0 means "no chunking". An integer 1 compares equal
    # to 1.0 but means one column per chunk, so test the type explicitly.
    is_single_chunk = isinstance(chunk_size, float) and chunk_size == 1.0

    if strategy in ("phylogeny", "mf") and not is_single_chunk:
        print(
            "WARNING: Chunking is not supported with initial_strategy == "
            f"'{strategy}'; Setting chunk_size to 1.0 and imputing entire "
            "dataset"
        )
        chunk_size = 1.0

    if not isinstance(chunk_size, (int, float)):
        raise ValueError(
            f"chunk_size must be of type float or integer, "
            f"but type {type(chunk_size)} was passed"
        )

    n_cols = len(df.columns)
    df_cp = df.copy()

    if isinstance(chunk_size, float):
        # The negated range test also rejects NaN.
        if not 0.0 < chunk_size <= 1.0:
            raise ValueError(
                f"If chunk_size is of type float, must be "
                f"between 0.0 and 1.0; Value supplied was {chunk_size}"
            )

        if chunk_size == 1.0:
            # All data in one chunk
            if self.verbose > 1:
                print(
                    "Imputing all features at once since chunk_size is "
                    "set to 1.0"
                )
            return [df_cp]

        # max() keeps the step positive for a DataFrame with no columns.
        cols_per_chunk = max(1, math.ceil(n_cols * chunk_size))
    else:
        if chunk_size < 1:
            raise ValueError(
                f"If chunk_size is of type int, must be "
                f"at least 1; Value supplied was {chunk_size}"
            )
        cols_per_chunk = chunk_size

    chunks = [
        df_cp.iloc[:, start : start + cols_per_chunk]
        for start in range(0, n_cols, cols_per_chunk)
    ]

    if self.verbose > 1:
        chunk_len = ",".join(str(len(chunk.columns)) for chunk in chunks)
        print(
            f"Data split into {len(chunks)} chunks with {chunk_len} features"
        )

    return chunks
|
|
355
|
-
|
|
356
|
-
def _imputed2genotypedata(self, imp012, genotype_data):
    """Return a deep copy of ``genotype_data`` carrying the imputed genotypes.

    For the VAE imputer the attribute that receives the data depends on the
    shape of ``imp012``: a 3-D array whose last axis has length 4 is stored
    as ``genotypes_onehot``; a 2-D input is cast to ``int`` and stored as
    ``genotypes_int`` when its maximum exceeds 2, otherwise as
    ``genotypes_012``. For every other imputer ``imp012`` is stored
    unchanged as ``genotypes_012``.

    Args:
        imp012 (pandas.DataFrame or numpy.ndarray): Imputed genotypes.

        genotype_data (GenotypeData): Original GenotypeData object to copy.

    Returns:
        GenotypeData: Copy of ``genotype_data`` with imputed data attached.

    Raises:
        ValueError: VAE output that is neither 2-D nor 3-D with a last axis
            of length 4.
    """
    result = deepcopy(genotype_data)

    if self.clf == VAE:
        rank = len(imp012.shape)

        if rank == 3:
            if imp012.shape[-1] != 4:
                raise ValueError("Invalid shape for imputed output.")
            result.genotypes_onehot = imp012
            return result

        if rank != 2:
            raise ValueError(
                f"Invalid shape for imputed output: {imp012.shape}"
            )

        values = imp012.to_numpy() if isinstance(imp012, pd.DataFrame) else imp012
        values = values.astype(int)

        # Values above 2 cannot be 012 encodings.
        if np.max(values) > 2:
            result.genotypes_int = values
        else:
            result.genotypes_012 = values
        return result

    result.genotypes_012 = imp012
    return result
|
|
393
|
-
|
|
394
|
-
def _subset_data_for_gridsearch(
    self,
    df: pd.DataFrame,
    columns_to_subset: Union[int, float],
    original_num_cols: int,
) -> Tuple[pd.DataFrame, np.ndarray]:
    """Pick a random subset of columns from ``df``.

    Args:
        df (pandas.DataFrame): DataFrame with 012-encoded genotypes.

        columns_to_subset (int or float): If float, the number of columns
            kept is ``int(original_num_cols * columns_to_subset)``. If
            integer, that many columns are kept.

        original_num_cols (int): Number of columns in original DataFrame.

    Returns:
        pandas.DataFrame: New DataFrame holding the selected columns in
        sorted order, with column labels converted to strings.
        numpy.ndarray: Sorted numpy array of the column labels kept.

    Raises:
        TypeError: column_subset must be of type float or int.
    """
    if isinstance(columns_to_subset, float):
        n_keep = int(original_num_cols * columns_to_subset)
    elif isinstance(columns_to_subset, int):
        n_keep = columns_to_subset
    else:
        raise TypeError(
            f"column_subset must be of type float or int, "
            f"but got {type(columns_to_subset)}"
        )

    all_cols = np.array(df.columns)

    if n_keep > len(df.columns):
        # Cannot sample more columns than exist; fall back to all of them.
        if self.verbose > 0:
            print(
                "Warning: Column_subset is greater than remaining columns following filtering. Using all columns"
            )

        subset = df.copy()
        kept = np.sort(all_cols.copy())
    else:
        kept = np.sort(np.random.choice(all_cols, n_keep, replace=False))
        subset = df.loc[:, kept]

    subset.columns = subset.columns.astype(str)

    return subset, kept
|
|
448
|
-
|
|
449
|
-
def _print_scores(self, df_scores: pd.DataFrame) -> None:
    """Print the validation scores when ``self.verbose`` is above zero.

    Args:
        df_scores (pandas.DataFrame): DataFrame with score statistics.
    """
    if not self.verbose > 0:
        return

    print("Validation scores:")
    print(df_scores)
|
|
458
|
-
|
|
459
|
-
def _write_imputed_params_score(
    self, df_scores: pd.DataFrame, best_params: Dict[str, Any]
) -> None:
    """Write the scores and the best parameters to the reports directory.

    Two files are written under
    ``<prefix>_output/reports/<imp_method>/<imp_name>``:
    ``imputed_best_score.csv`` and ``imputed_best_params.csv``.

    Args:
        df_scores (pandas.DataFrame or scalar): Scores to save. A DataFrame
            is written with ``to_csv`` (two decimal places, no index); any
            other value is written as a single ``accuracy,<value>`` line.

        best_params (dict): Best parameters found in grid search, written
            one ``parameter,best_value`` row per entry.
    """
    report_dir = os.path.join(
        f"{self.prefix}_output", "reports", self.imp_method, self.imp_name
    )
    score_path = os.path.join(report_dir, "imputed_best_score.csv")
    params_path = os.path.join(report_dir, "imputed_best_params.csv")

    if not isinstance(df_scores, pd.DataFrame):
        with open(score_path, "w") as fout:
            fout.write(f"accuracy,{df_scores}\n")
    else:
        df_scores.to_csv(
            score_path, header=True, index=False, float_format="%.2f"
        )

    with open(params_path, "w") as fout:
        fout.write("parameter,best_value\n")
        fout.writelines(
            f"{name},{value}\n" for name, value in best_params.items()
        )
|
|
501
|
-
|
|
502
|
-
def _impute_single(
    self, df: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame, None]:
    """Impute ``df`` with the configured settings, without a grid search.

    When ``self.do_validation`` is set, validation scores are estimated
    first via ``_imputer_validation``. When ``self.disable_progressbar`` is
    set, progress messages are appended to ``self.logfilepath``.

    Note that if ``self.invalid_indexes`` is set, those columns are dropped
    from ``df`` in place.

    Args:
        df (pandas.DataFrame): DataFrame of 012-encoded genotypes.

    Returns:
        pandas.DataFrame: Imputed DataFrame, cast to ``str``.
        pandas.DataFrame: Validation scores, or None if validation is off.
        NoneType: Always None; keeps the return shape compatible with
        ``_impute_gridsearch``.
    """

    def _append_to_log(*messages):
        # Route print() into the progress logfile for the given messages.
        with open(self.logfilepath, "a") as fout:
            with redirect_stdout(fout):
                for message in messages:
                    print(message)

    chatty = self.verbose > 0
    uses_nn = self.algorithm == "nn"

    if chatty:
        print(
            f"\nDoing {self.clf.__name__} imputation without grid search..."
        )

    # The neural-network path needs no estimator instance here.
    clf = None if uses_nn else self.clf(**self.clf_kwargs)

    df_scores = None
    if self.do_validation:
        if chatty:
            print(f"Estimating {self.clf.__name__} validation scores...")

        if self.disable_progressbar:
            # The first line is logged regardless of verbosity.
            lines = [
                f"Doing {self.clf.__name__} imputation "
                f"without grid search...\n"
            ]
            if chatty:
                lines.append(
                    f"Estimating {self.clf.__name__} "
                    f"validation scores...\n"
                )
            _append_to_log(*lines)

        df_scores = self._imputer_validation(df, clf)

        if chatty:
            print(f"\nDone with {self.clf.__name__} validation!\n")

        if self.disable_progressbar and chatty:
            _append_to_log(f"\nDone with {self.clf.__name__} validation!\n")

    if uses_nn:
        imputer = None
    else:
        imputer = self._define_iterative_imputer(
            clf,
            self.logfilepath,
            clf_kwargs=self.clf_kwargs,
            imp_kwargs=self.imp_kwargs,
        )

    if self.original_num_cols is None:
        self.original_num_cols = len(df.columns)

    # Remove non-biallelic loci
    # Only used if initial_strategy == 'phylogeny'
    if self.invalid_indexes is not None:
        df.drop(labels=self.invalid_indexes, axis=1, inplace=True)

    if self.disable_progressbar and chatty:
        _append_to_log(f"Doing {self.clf.__name__} imputation...\n")

    df_chunks = self._df2chunks(df, self.chunk_size)
    imputed_df = self._impute_df(df_chunks, imputer).astype(str)

    if self.disable_progressbar and chatty:
        _append_to_log(f"\nDone with {self.clf.__name__} imputation!\n")

    gc.collect()

    self._validate_imputed(imputed_df)

    if chatty:
        print(f"\nDone with {self.clf.__name__} imputation!\n")

    return imputed_df, df_scores, None
|
|
615
|
-
|
|
616
|
-
def _impute_gridsearch(
    self, df: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any]]:
    """Search for the best estimator parameters, then impute with them.

    A column subset is drawn with ``_subset_data_for_gridsearch()``. For
    ``algorithm == "ii"`` a grid-search IterativeImputer is fitted, the
    per-feature parameters are condensed with ``_get_best_params()`` and
    the data are imputed again with a fresh estimator built from them.
    For ``algorithm == "nn"`` the fitted search object supplies
    ``best_params_`` and ``best_score_`` directly.

    Args:
        df (pandas.DataFrame): DataFrame with 012-encoded genotypes.

    Returns:
        pandas.DataFrame: Genotypes imputed using the best parameters found.
        pandas.DataFrame or float: For ``"ii"``, a one-row frame with the mean, median, min and max of the absolute search scores; for ``"nn"``, the best score rounded to two decimals and multiplied by 100.
        dict: Best parameters found during the search.
    """
    original_num_cols = len(df.columns)
    df_subset, cols_to_keep = self._subset_data_for_gridsearch(
        df, self.column_subset, original_num_cols
    )
    clf_name = self.clf.__name__

    def _log(message: str) -> None:
        # Append one status message to the progress logfile.
        with open(self.logfilepath, "a") as fout:
            print(message, file=fout)

    print(f"Doing {clf_name} grid search...")

    if self.verbose > 0:
        print(f"Validation dataset size: {len(df_subset.columns)}\n")

    if self.disable_progressbar:
        _log(f"Doing {clf_name} grid search...\n")

    if self.algorithm == "nn":
        # Popped with a default: the key is removed from the shared
        # settings dict, so a second call must not raise KeyError.
        self.imp_kwargs.pop("str_encodings", None)
        imputer = self.clf(
            **self.clf_kwargs,
            **self.imp_kwargs,
            ga_kwargs=self.ga_kwargs,
        )

        df_imp = pd.DataFrame(
            imputer.fit_transform(df_subset), columns=cols_to_keep
        )
        df_imp = df_imp.astype("float").astype("int64")

    else:
        clf = self.clf()
        df_subset = df_subset.astype("float32")
        df_subset.replace(-9.0, np.nan, inplace=True)

        imputer = self._define_iterative_imputer(
            clf,
            self.logfilepath,
            clf_kwargs=self.clf_kwargs,
            ga_kwargs=self.ga_kwargs,
            n_jobs=self.n_jobs,
            clf_type=self.clf_type,
            imp_kwargs=self.imp_kwargs,
        )

        # None tells the imputer that every column is in the search.
        if len(cols_to_keep) == original_num_cols:
            cols_to_keep = None

        Xt, params_list, score_list = imputer.fit_transform(
            df, cols_to_keep
        )

    if self.verbose > 0:
        print(f"\nDone with {clf_name} grid search!")
        if self.disable_progressbar:
            _log(f"\nDone with {clf_name} grid search!")

    if self.algorithm == "ii":
        # Release the search objects before the final imputation.
        del imputer
        del Xt

        # Average or mode of the best parameters.
        best_params = self._get_best_params(params_list)

        # -9 marks features that could not be scored; drop them once.
        valid_scores = [abs(x) for x in score_list if x != -9]

        df_scores = pd.DataFrame(
            {
                "Mean": mean(valid_scores),
                "Median": median(valid_scores),
                "Min": min(valid_scores),
                "Max": max(valid_scores),
            },
            index=[0],
        ).round(2)

        gc.collect()

    else:
        # Using neural network.
        best_params = imputer.best_params_
        df_scores = round(imputer.best_score_, 2) * 100
        best_imputer = None

    if self.clf_type == "classifier" and self.algorithm != "nn":
        # Report classifier scores as percentages.
        df_scores = df_scores * 100

    self._write_imputed_params_score(df_scores, best_params)

    # Change values to the ones in best_params
    self.clf_kwargs.update(best_params)

    if self.algorithm == "ii":
        if hasattr(self.clf(), "n_jobs"):
            self.clf_kwargs["n_jobs"] = self.n_jobs

        best_clf = self.clf(**self.clf_kwargs)

    gc.collect()

    start_msg = (
        f"\nDoing {clf_name} imputation "
        f"with best found parameters...\n"
    )
    if self.verbose > 0:
        print(start_msg)

    if self.disable_progressbar:
        _log(start_msg)

    if self.algorithm == "ii":
        best_imputer = self._define_iterative_imputer(
            best_clf,
            self.logfilepath,
            clf_kwargs=self.clf_kwargs,
            imp_kwargs=self.imp_kwargs,
        )

    # NOTE(review): df is never rebound in this method, so this branch
    # only fires if an earlier call removed columns from df in place.
    final_cols = None
    if len(df.columns) < original_num_cols:
        final_cols = np.array(df.columns)

    if self.algorithm == "nn" and self.column_subset == 1.0:
        # The search already imputed every column; reuse that result.
        imputed_df = df_imp.copy()
    else:
        df_chunks = self._df2chunks(df, self.chunk_size)
        imputed_df = self._impute_df(
            df_chunks, best_imputer, cols_to_keep=final_cols
        )
        del df_chunks

    gc.collect()

    self._validate_imputed(imputed_df)

    if self.verbose > 0:
        print(f"Done with {clf_name} imputation!\n")

    if self.disable_progressbar:
        _log(f"Done with {clf_name} imputation!\n")

    return imputed_df, df_scores, best_params
|
|
800
|
-
|
|
801
|
-
def _imputer_validation(
    self, df: pd.DataFrame, clf: Optional[Callable]
) -> pd.DataFrame:
    """Validate imputation with a validation test set.

    ``_impute_eval()`` is run ``self.cv`` times. Each run returns, per
    metric, one score per validated feature, with -9 marking features
    that could not be scored. The summary is written to
    ``imputed_best_score.csv`` under the reports directory and returned.

    Args:
        df (pandas.DataFrame): 012-encoded genotypes to impute.

        clf (sklearn classifier instance or None): sklearn classifier instance with which to run the imputation.

    Raises:
        ValueError: If no replicate produced any valid score.

    Returns:
        pandas.DataFrame: One row per metric with the mean, median, minimum and maximum absolute score among features, plus the lower and upper 95% confidence bounds. Each bound is the t-interval of the feature scores within a replicate, averaged over replicates.
    """
    reps = defaultdict(list)
    for cnt, _ in enumerate(
        progressbar(
            range(self.cv),
            desc="Validation replicates: ",
            leave=True,
            disable=self.disable_progressbar,
        ),
        start=1,
    ):
        if self.disable_progressbar:
            perc = int((cnt / self.cv) * 100)
            status = f"Validation replicate {cnt}/{self.cv} ({perc}%)"
            if self.verbose > 0:
                print(status)

            # Mirror the status line in the progress logfile.
            with open(self.logfilepath, "a") as fout:
                print(status, file=fout)

        scores = self._impute_eval(df, clf)

        for k, score_list in scores.items():
            # Materialize the filtered scores: a lazy ``filter`` object
            # is always truthy, which made this emptiness check dead.
            valid = [x for x in score_list if x != -9]
            if valid:
                reps[k].append(valid)

    if not reps:
        raise ValueError("None of the features could be validated!")

    ci_lower = dict()
    ci_upper = dict()
    for k, replicate_scores in reps.items():
        if len(replicate_scores) > 1:
            cis = list()
            for rep in replicate_scores:
                rep = [abs(x) for x in rep]

                # Confidence level is passed positionally because its
                # keyword name differs between SciPy releases.
                cis.append(
                    st.t.interval(
                        0.95,
                        df=len(rep) - 1,
                        loc=np.mean(rep),
                        scale=st.sem(rep),
                    )
                )

            ci_lower[k] = mean(x[0] for x in cis)
            ci_upper[k] = mean(x[1] for x in cis)
        else:
            print(
                "Warning: There was no variance among replicates; "
                "the 95% CI could not be calculated"
            )

            ci_lower[k] = np.nan
            ci_upper[k] = np.nan

    # NOTE(review): ``scores`` holds only the last replicate here, so
    # these summary statistics describe that replicate alone.
    results_list = list()
    for k, score_list in scores.items():
        valid = [abs(x) for x in score_list if x != -9]

        results_list.append(
            {
                "Metric": k,
                "Mean": mean(valid),
                "Median": median(valid),
                "Min": min(valid),
                "Max": max(valid),
                "Lower 95% CI": ci_lower[k],
                "Upper 95% CI": ci_upper[k],
            }
        )

    df_scores = pd.DataFrame(results_list).round(2)

    outfile = os.path.join(
        f"{self.prefix}_output",
        "reports",
        self.imp_method,
        self.imp_name,
        "imputed_best_score.csv",
    )
    df_scores.to_csv(outfile, header=True, index=False)

    del results_list
    gc.collect()

    return df_scores
|
|
930
|
-
|
|
931
|
-
def _impute_df(
|
|
932
|
-
self,
|
|
933
|
-
df_chunks: List[pd.DataFrame],
|
|
934
|
-
imputer: Optional[
|
|
935
|
-
Union[IterativeImputerFixedParams, IterativeImputerGridSearch]
|
|
936
|
-
] = None,
|
|
937
|
-
cols_to_keep: Optional[np.ndarray] = None,
|
|
938
|
-
) -> pd.DataFrame:
|
|
939
|
-
"""Impute list of pandas.DataFrame objects using custom IterativeImputer class.
|
|
940
|
-
|
|
941
|
-
The DataFrames are chunks of the whole input data, with each chunk correspoding to ``chunk_size`` features from ``_df2chunks()``\.
|
|
942
|
-
|
|
943
|
-
Args:
|
|
944
|
-
df_chunks (List[pandas.DataFrame]): List of Dataframes of shape(n_samples, n_features_in_chunk).
|
|
945
|
-
|
|
946
|
-
imputer (imputer or classifier instance or None): Imputer or classifier instance to perform the imputation.
|
|
947
|
-
|
|
948
|
-
cols_to_keep (numpy.ndarray or None): Final bi-allelic columns to keep. If some columns were non-biallelic, it will be a subset of columns.
|
|
949
|
-
|
|
950
|
-
Returns:
|
|
951
|
-
pandas.DataFrame: Single DataFrame object, with all the imputed chunks concatenated together.
|
|
952
|
-
"""
|
|
953
|
-
imputed_chunks = list()
|
|
954
|
-
for i, Xchunk in enumerate(df_chunks, start=1):
|
|
955
|
-
if self.clf_type == "classifier":
|
|
956
|
-
if self.algorithm == "nn":
|
|
957
|
-
if self.clf == VAE:
|
|
958
|
-
self.clf_kwargs["testing"] = self.testing
|
|
959
|
-
imputer = self.clf(
|
|
960
|
-
genotype_data=self.imp_kwargs["genotype_data"],
|
|
961
|
-
disable_progressbar=self.disable_progressbar,
|
|
962
|
-
prefix=self.prefix,
|
|
963
|
-
**self.clf_kwargs,
|
|
964
|
-
)
|
|
965
|
-
df_imp = pd.DataFrame(
|
|
966
|
-
imputer.fit_transform(Xchunk),
|
|
967
|
-
)
|
|
968
|
-
df_imp = df_imp.astype("float")
|
|
969
|
-
df_imp = df_imp.astype("Int8")
|
|
970
|
-
|
|
971
|
-
else:
|
|
972
|
-
imp, _, __ = imputer.fit_transform(
|
|
973
|
-
Xchunk, valid_cols=cols_to_keep
|
|
974
|
-
)
|
|
975
|
-
df_imp = pd.DataFrame(imp)
|
|
976
|
-
|
|
977
|
-
imputed_chunks.append(df_imp)
|
|
978
|
-
|
|
979
|
-
else:
|
|
980
|
-
# Regressor. Needs to be rounded to integer first.
|
|
981
|
-
imp, _, __ = imputer.fit_transform(
|
|
982
|
-
Xchunk, valid_cols=cols_to_keep
|
|
983
|
-
)
|
|
984
|
-
df_imp = pd.DataFrame(imp)
|
|
985
|
-
df_imp = df_imp.round(0).astype("Int8")
|
|
986
|
-
|
|
987
|
-
imputed_chunks.append(df_imp)
|
|
988
|
-
|
|
989
|
-
concat_df = pd.concat(imputed_chunks, axis=1)
|
|
990
|
-
|
|
991
|
-
del imputed_chunks
|
|
992
|
-
gc.collect()
|
|
993
|
-
|
|
994
|
-
return concat_df
|
|
995
|
-
|
|
996
|
-
def _validate_imputed(self, df: pd.DataFrame) -> None:
|
|
997
|
-
"""Asserts that there is no missing data left in the imputed DataFrame.
|
|
998
|
-
|
|
999
|
-
Args:
|
|
1000
|
-
df (pandas.DataFrame): DataFrame with imputed 012-encoded genotypes.
|
|
1001
|
-
|
|
1002
|
-
Raises:
|
|
1003
|
-
AssertionError: Error if missing values are still found in the dataset after imputation.
|
|
1004
|
-
"""
|
|
1005
|
-
assert (
|
|
1006
|
-
not df.isnull().values.any()
|
|
1007
|
-
), "Imputation failed...Missing values found in the imputed dataset"
|
|
1008
|
-
|
|
1009
|
-
def _get_best_params(
|
|
1010
|
-
self, params_list: List[Dict[str, Any]]
|
|
1011
|
-
) -> Dict[str, Any]:
|
|
1012
|
-
"""[Gets the best parameters from the grid search. Determines the parameter types and either gets the mean or mode if the type is numeric or string/ boolean]
|
|
1013
|
-
|
|
1014
|
-
Args:
|
|
1015
|
-
params_list (List[dict]): List of grid search parameter values.
|
|
1016
|
-
|
|
1017
|
-
Returns:
|
|
1018
|
-
Dict[str, Any]: Dictionary with parameters as keys and their best values.
|
|
1019
|
-
"""
|
|
1020
|
-
best_params = dict()
|
|
1021
|
-
keys = list(params_list[0].keys())
|
|
1022
|
-
first_key = keys[0]
|
|
1023
|
-
|
|
1024
|
-
params_list = list(filter(lambda i: i[first_key] != -9, params_list))
|
|
1025
|
-
|
|
1026
|
-
for k in keys:
|
|
1027
|
-
if all(
|
|
1028
|
-
isinstance(x[k], (int, float)) for x in params_list if x[k]
|
|
1029
|
-
):
|
|
1030
|
-
if all(isinstance(y[k], int) for y in params_list):
|
|
1031
|
-
best_params[k] = self._average_list_of_dicts(
|
|
1032
|
-
params_list, k, is_int=True
|
|
1033
|
-
)
|
|
1034
|
-
|
|
1035
|
-
elif all(isinstance(z[k], float) for z in params_list):
|
|
1036
|
-
best_params[k] = self._average_list_of_dicts(
|
|
1037
|
-
params_list, k
|
|
1038
|
-
)
|
|
1039
|
-
|
|
1040
|
-
elif all(isinstance(x[k], (str, bool)) for x in params_list):
|
|
1041
|
-
best_params[k] = self._mode_list_of_dicts(params_list, k)
|
|
1042
|
-
|
|
1043
|
-
else:
|
|
1044
|
-
best_params[k] = self._mode_list_of_dicts(params_list, k)
|
|
1045
|
-
|
|
1046
|
-
return best_params
|
|
1047
|
-
|
|
1048
|
-
def _mode_list_of_dicts(
|
|
1049
|
-
self, l: List[Dict[str, Union[str, bool]]], k: str
|
|
1050
|
-
) -> str:
|
|
1051
|
-
"""Get mode for key k in a list of dictionaries.
|
|
1052
|
-
|
|
1053
|
-
Args:
|
|
1054
|
-
l (list(dict)): List of dictionaries.
|
|
1055
|
-
k (str): Key to find the mode across all dictionaries in l.
|
|
1056
|
-
|
|
1057
|
-
Returns:
|
|
1058
|
-
str or bool: Most common value across list of dictionaries for one key.
|
|
1059
|
-
"""
|
|
1060
|
-
k_count = Counter(map(itemgetter(k), l))
|
|
1061
|
-
return k_count.most_common()[0][0]
|
|
1062
|
-
|
|
1063
|
-
def _average_list_of_dicts(
|
|
1064
|
-
self,
|
|
1065
|
-
l: List[Dict[str, Union[int, float]]],
|
|
1066
|
-
k: str,
|
|
1067
|
-
is_int: bool = False,
|
|
1068
|
-
) -> Union[int, float]:
|
|
1069
|
-
"""Get average of a given key in a list of dictionaries.
|
|
1070
|
-
|
|
1071
|
-
Args:
|
|
1072
|
-
l (List[Dict[str, Union[int, float]]]): List of dictionaries.
|
|
1073
|
-
|
|
1074
|
-
k (str): Key to find average across list of dictionaries.
|
|
1075
|
-
|
|
1076
|
-
is_int (bool, optional): Whether or not the value for key k is an integer. If False, it is expected to be of type float. Defaults to False.
|
|
1077
|
-
|
|
1078
|
-
Returns:
|
|
1079
|
-
int or float: average of given key across list of dictionaries.
|
|
1080
|
-
"""
|
|
1081
|
-
if is_int:
|
|
1082
|
-
return int(sum(d[k] for d in l) / len(l))
|
|
1083
|
-
else:
|
|
1084
|
-
return sum(d[k] for d in l) / len(l)
|
|
1085
|
-
|
|
1086
|
-
def _gather_impute_settings(
|
|
1087
|
-
self, kwargs: Dict[str, Any]
|
|
1088
|
-
) -> Tuple[
|
|
1089
|
-
Optional[Dict[str, Any]],
|
|
1090
|
-
Optional[Dict[str, Any]],
|
|
1091
|
-
Optional[Dict[str, Any]],
|
|
1092
|
-
Optional[int],
|
|
1093
|
-
Optional[int],
|
|
1094
|
-
Optional[int],
|
|
1095
|
-
Optional[str],
|
|
1096
|
-
Optional[Union[int, float]],
|
|
1097
|
-
Optional[bool],
|
|
1098
|
-
Optional[Union[int, float]],
|
|
1099
|
-
Optional[bool],
|
|
1100
|
-
Optional[bool],
|
|
1101
|
-
]:
|
|
1102
|
-
"""Gather impute settings from kwargs object.
|
|
1103
|
-
|
|
1104
|
-
Gather impute settings from the various imputation classes and IterativeImputer. Gathers them for use with the ``Impute`` class. Returns dictionary with keys as keyword arguments and the values as the settings. The imputation can then be run by specifying IterativeImputer(imp_kwargs).
|
|
1105
|
-
|
|
1106
|
-
Args:
|
|
1107
|
-
kwargs (Dict[str, Any]): Dictionary with keys as the keyword arguments and their corresponding values.
|
|
1108
|
-
|
|
1109
|
-
Returns:
|
|
1110
|
-
Dict[str, Any]: IterativeImputer keyword arguments.
|
|
1111
|
-
Dict[str, Any]: Classifier keyword arguments.
|
|
1112
|
-
Dict[str, Any]: Genetic algorithm keyword arguments.
|
|
1113
|
-
int: Number of cross-validation folds to use with non-grid search validation.
|
|
1114
|
-
int: Verbosity setting. 0 is silent, 2 is most verbose.
|
|
1115
|
-
int: Number of processors to use with grid search.
|
|
1116
|
-
str or None: Prefix for output files.
|
|
1117
|
-
int or float: Proportion of dataset (if float) or number of columns (if int) to use for grid search.
|
|
1118
|
-
bool: If True, disables the tqdm progress bar and just prints status updates to a file. If False, uses tqdm progress bar.
|
|
1119
|
-
int or float: Chunk sizes for doing full imputation following grid search. If int, then splits into chunks of ``chunk_size``\. If float, then splits into chunks of ``n_features * chunk_size``\.
|
|
1120
|
-
bool: Whether to do validation if ``gridparams is None``.
|
|
1121
|
-
bool: True if doing grid search, False otherwise.
|
|
1122
|
-
"""
|
|
1123
|
-
n_jobs = kwargs.pop("n_jobs", 1)
|
|
1124
|
-
cv = kwargs.pop("cv", None)
|
|
1125
|
-
column_subset = kwargs.pop("column_subset", None)
|
|
1126
|
-
chunk_size = kwargs.pop("chunk_size", 1.0)
|
|
1127
|
-
do_validation = kwargs.pop("do_validation", False)
|
|
1128
|
-
verbose = kwargs.get("verbose", 0)
|
|
1129
|
-
disable_progressbar = kwargs.get("disable_progressbar", False)
|
|
1130
|
-
prefix = kwargs.get("prefix", "imputer")
|
|
1131
|
-
testing = kwargs.get("testing", False)
|
|
1132
|
-
do_gridsearch = False if kwargs["gridparams"] is None else True
|
|
1133
|
-
|
|
1134
|
-
if prefix is None:
|
|
1135
|
-
prefix = "imputer"
|
|
1136
|
-
|
|
1137
|
-
imp_kwargs = kwargs.copy()
|
|
1138
|
-
clf_kwargs = kwargs.copy()
|
|
1139
|
-
ga_kwargs = kwargs.copy()
|
|
1140
|
-
|
|
1141
|
-
imp_keys = [
|
|
1142
|
-
"grid_iter",
|
|
1143
|
-
"tol",
|
|
1144
|
-
"verbose",
|
|
1145
|
-
"genotype_data",
|
|
1146
|
-
"str_encodings",
|
|
1147
|
-
"progress_update_percent",
|
|
1148
|
-
"sim_strategy",
|
|
1149
|
-
"sim_prop_missing",
|
|
1150
|
-
"gridparams",
|
|
1151
|
-
"gridsearch_method",
|
|
1152
|
-
"scoring_metric",
|
|
1153
|
-
"disable_progressbar",
|
|
1154
|
-
"prefix",
|
|
1155
|
-
]
|
|
1156
|
-
|
|
1157
|
-
if self.algorithm == "ii":
|
|
1158
|
-
imp_keys.extend(
|
|
1159
|
-
[
|
|
1160
|
-
"n_nearest_features",
|
|
1161
|
-
"max_iter",
|
|
1162
|
-
"initial_strategy",
|
|
1163
|
-
"imputation_order",
|
|
1164
|
-
"skip_complete",
|
|
1165
|
-
"random_state",
|
|
1166
|
-
"sample_posterior",
|
|
1167
|
-
]
|
|
1168
|
-
)
|
|
1169
|
-
|
|
1170
|
-
ga_keys = [
|
|
1171
|
-
"population_size",
|
|
1172
|
-
"tournament_size",
|
|
1173
|
-
"elitism",
|
|
1174
|
-
"crossover_probability",
|
|
1175
|
-
"mutation_probability",
|
|
1176
|
-
"ga_algorithm",
|
|
1177
|
-
"early_stop_gen",
|
|
1178
|
-
]
|
|
1179
|
-
|
|
1180
|
-
to_remove = ["self", "__class__"]
|
|
1181
|
-
|
|
1182
|
-
for k, v in clf_kwargs.copy().items():
|
|
1183
|
-
if k in to_remove:
|
|
1184
|
-
clf_kwargs.pop(k)
|
|
1185
|
-
if k in imp_keys:
|
|
1186
|
-
clf_kwargs.pop(k)
|
|
1187
|
-
if k in ga_keys:
|
|
1188
|
-
clf_kwargs.pop(k)
|
|
1189
|
-
|
|
1190
|
-
if "clf_random_state" in clf_kwargs:
|
|
1191
|
-
clf_kwargs["random_state"] = clf_kwargs.pop("clf_random_state")
|
|
1192
|
-
|
|
1193
|
-
if "clf_tol" in clf_kwargs:
|
|
1194
|
-
clf_kwargs["tol"] = clf_kwargs.pop("clf_tol")
|
|
1195
|
-
|
|
1196
|
-
for k, v in imp_kwargs.copy().items():
|
|
1197
|
-
if k not in imp_keys:
|
|
1198
|
-
imp_kwargs.pop(k)
|
|
1199
|
-
|
|
1200
|
-
for k, v in ga_kwargs.copy().items():
|
|
1201
|
-
if k not in ga_keys:
|
|
1202
|
-
ga_kwargs.pop(k)
|
|
1203
|
-
|
|
1204
|
-
if "ga_algorithm" in ga_kwargs:
|
|
1205
|
-
ga_kwargs["algorithm"] = ga_kwargs.pop("ga_algorithm")
|
|
1206
|
-
|
|
1207
|
-
if self.clf_type == "regressor":
|
|
1208
|
-
ga_kwargs["criteria"] = "min"
|
|
1209
|
-
|
|
1210
|
-
elif self.clf_type == "classifier":
|
|
1211
|
-
ga_kwargs["criteria"] = "max"
|
|
1212
|
-
|
|
1213
|
-
return (
|
|
1214
|
-
imp_kwargs,
|
|
1215
|
-
clf_kwargs,
|
|
1216
|
-
ga_kwargs,
|
|
1217
|
-
cv,
|
|
1218
|
-
verbose,
|
|
1219
|
-
n_jobs,
|
|
1220
|
-
prefix,
|
|
1221
|
-
column_subset,
|
|
1222
|
-
disable_progressbar,
|
|
1223
|
-
chunk_size,
|
|
1224
|
-
do_validation,
|
|
1225
|
-
do_gridsearch,
|
|
1226
|
-
testing,
|
|
1227
|
-
)
|
|
1228
|
-
|
|
1229
|
-
def _impute_eval(
    self, df: pd.DataFrame, clf: Optional[Callable]
) -> Dict[str, List[Union[float, int]]]:
    """Impute a validation copy of the data and score every test column.

    A random subset of columns is drawn without replacement. Values
    flagged by the missing-data mask are compared between the original
    data and the imputed output, column by column.

    Args:
        df (pandas.DataFrame): Original DataFrame with 012-encoded genotypes.

        clf (sklearn Classifier or None): Classifier instance to use with IterativeImputer.

    Returns:
        Dict[str, List[float or int]]: One score per validated column for each metric: accuracy, precision, f1, recall and jaccard for classifiers; explained_var and rmse otherwise. Columns without any masked value get -9.
    """
    cols = np.random.choice(
        df.columns,
        int(len(df.columns) * self.column_subset),
        replace=False,
    )

    if self.verbose > 0:
        print(
            f"\nSimulating validation data with missing data proportion "
            f"{self.sim_prop_missing} and strategy {self.sim_strategy}"
        )

    df_known = df.copy()

    if self.algorithm == "nn":
        # Neural networks
        df_stg = df_known[cols].copy()

        for col in df_stg.columns:
            df_stg[col] = df_stg[col].replace({pd.NA: np.nan})

        imputer = self.clf(
            prefix=self.prefix, **self.clf_kwargs, **self.imp_kwargs
        )

        df_imp = pd.DataFrame(
            imputer.fit_transform(df_stg.to_numpy()),
            columns=cols,
        )

        df_known_slice = pd.DataFrame(imputer.y_original_, columns=cols)

        # The fitted network exposes the mask of simulated missing values.
        df_missing_mask = pd.DataFrame(
            imputer.sim_missing_mask_, columns=cols
        )

        df_imp = df_imp.astype("float").astype("int64")

    else:
        # Using IterativeImputer
        df_unknown = pd.DataFrame(
            SimGenotypeDataTransformer(
                self.genotype_data,
                prop_missing=self.imp_kwargs["sim_prop_missing"],
                strategy=self.imp_kwargs["sim_strategy"],
            ).fit_transform(df_known)
        )

        df_known_slice = df_known[cols]

        # NOTE(review): the mask is taken before -9 is converted to NaN
        # below, so only values that are already null are scored.
        # Confirm the simulator emits NaN for simulated values.
        df_missing_mask = df_unknown[cols].isnull()

        df_unknown.replace(-9, np.nan, inplace=True)
        df_stg = df_unknown.copy()

        imputer = self._define_iterative_imputer(
            clf,
            self.logfilepath,
            clf_kwargs=self.clf_kwargs,
            imp_kwargs=self.imp_kwargs,
        )

        imp_arr = imputer.fit_transform(df_stg)

        # Get only subset of validation columns
        # get_loc returns the index of the value
        df_imp = pd.DataFrame(
            imp_arr[:, [df_unknown.columns.get_loc(i) for i in cols]],
            columns=cols,
        )

        del imp_arr

    classifier_metrics = ("accuracy", "precision", "f1", "recall", "jaccard")
    regressor_metrics = ("explained_var", "rmse")

    # Get score of each column
    scores = defaultdict(list)
    for col in df_known_slice.columns:
        # Adapted from: https://medium.com/analytics-vidhya/using-scikit-learns-iterative-imputer-694c3cca34de
        mask = df_missing_mask[col]
        y_true = df_known[col][mask]
        y_pred = df_imp[col][mask]

        if self.clf_type == "classifier":
            if y_pred.empty:
                # Nothing to score in this column; flag it with -9.
                for metric_name in classifier_metrics:
                    scores[metric_name].append(-9)
                continue

            # Had to do this because get incompatible type error if using
            # initial_imputation="populations"
            if y_true.dtypes != "int64":
                y_true = y_true.astype("int64")
            if y_pred.dtypes != "int64":
                y_pred = y_pred.astype("int64")

            scores["accuracy"].append(
                metrics.accuracy_score(y_true, y_pred)
            )

            scores["precision"].append(
                metrics.precision_score(
                    y_true, y_pred, average="macro", zero_division=0
                )
            )

            scores["f1"].append(
                metrics.f1_score(
                    y_true, y_pred, average="macro", zero_division=0
                )
            )

            scores["recall"].append(
                metrics.recall_score(
                    y_true, y_pred, average="macro", zero_division=0
                )
            )

            scores["jaccard"].append(
                metrics.jaccard_score(
                    y_true, y_pred, average="macro", zero_division=0
                )
            )

        else:
            if y_pred.empty:
                # Same -9 flag as the classifier path, so an unscorable
                # column does not abort the whole replicate.
                for metric_name in regressor_metrics:
                    scores[metric_name].append(-9)
                continue

            scores["explained_var"].append(
                metrics.explained_variance_score(y_true, y_pred)
            )

            # Root taken explicitly: the ``squared`` keyword is not
            # accepted by every scikit-learn release.
            scores["rmse"].append(
                np.sqrt(metrics.mean_squared_error(y_true, y_pred))
            )

    # Drop the large intermediates before returning.
    del df_stg, df_imp, df_known, df_known_slice, df_missing_mask, imputer
    gc.collect()

    return scores
|
|
1410
|
-
|
|
1411
|
-
def _define_iterative_imputer(
    self,
    clf: Callable,
    logfilepath: str,
    clf_kwargs: Optional[Dict[str, Any]] = None,
    imp_kwargs: Optional[Dict[str, Any]] = None,
    ga_kwargs: Optional[Dict[str, Any]] = None,
    n_jobs: Optional[int] = None,
    clf_type: Optional[str] = None,
) -> Union[IterativeImputerGridSearch, IterativeImputerFixedParams]:
    """Define an IterativeImputer instance.

    Builds ``IterativeImputerGridSearch`` when ``self.do_gridsearch`` is
    true and ``IterativeImputerFixedParams`` otherwise.

    Args:
        clf (sklearn Classifier instance): Estimator to use with IterativeImputer.

        logfilepath (str): Path to progress log file.

        clf_kwargs (dict, optional): Keyword arguments for classifier. Defaults to None.

        imp_kwargs (Dict[str, Any], optional): Keyword arguments for imputation settings, unpacked into the imputer constructor. None is treated as no extra settings. Defaults to None.

        ga_kwargs (dict, optional): Keyword arguments for genetic algorithm grid search. Only passed on when doing a grid search. Defaults to None.

        n_jobs (int, optional): Passed as ``grid_n_jobs`` when doing a grid search. Defaults to None.

        clf_type (str, optional): Type of estimator. Only passed on when doing a grid search. Defaults to None.

    Returns:
        IterativeImputerGridSearch or IterativeImputerFixedParams: The configured imputer instance.
    """
    # ``**None`` raises TypeError, so the documented default of None
    # must be mapped to an empty mapping before unpacking.
    imp_settings = {} if imp_kwargs is None else imp_kwargs

    if not self.do_gridsearch:
        return IterativeImputerFixedParams(
            logfilepath,
            clf_kwargs,
            estimator=clf,
            **imp_settings,
        )

    return IterativeImputerGridSearch(
        logfilepath,
        clf_kwargs,
        ga_kwargs,
        estimator=clf,
        grid_n_jobs=n_jobs,
        clf_type=clf_type,
        **imp_settings,
    )
|