pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
  2. pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
  3. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
  5. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
  6. pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
  7. pgsui/__init__.py +35 -54
  8. pgsui/_version.py +34 -0
  9. pgsui/cli.py +909 -0
  10. pgsui/data_processing/__init__.py +0 -0
  11. pgsui/data_processing/config.py +565 -0
  12. pgsui/data_processing/containers.py +1424 -0
  13. pgsui/data_processing/transformers.py +557 -907
  14. pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  15. pgsui/electron/app/__main__.py +5 -0
  16. pgsui/electron/app/extra-resources/.gitkeep +1 -0
  17. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  18. pgsui/electron/app/icons/icons/128x128.png +0 -0
  19. pgsui/electron/app/icons/icons/16x16.png +0 -0
  20. pgsui/electron/app/icons/icons/24x24.png +0 -0
  21. pgsui/electron/app/icons/icons/256x256.png +0 -0
  22. pgsui/electron/app/icons/icons/32x32.png +0 -0
  23. pgsui/electron/app/icons/icons/48x48.png +0 -0
  24. pgsui/electron/app/icons/icons/512x512.png +0 -0
  25. pgsui/electron/app/icons/icons/64x64.png +0 -0
  26. pgsui/electron/app/icons/icons/icon.icns +0 -0
  27. pgsui/electron/app/icons/icons/icon.ico +0 -0
  28. pgsui/electron/app/main.js +227 -0
  29. pgsui/electron/app/package-lock.json +6894 -0
  30. pgsui/electron/app/package.json +51 -0
  31. pgsui/electron/app/preload.js +15 -0
  32. pgsui/electron/app/server.py +157 -0
  33. pgsui/electron/app/ui/logo.png +0 -0
  34. pgsui/electron/app/ui/renderer.js +131 -0
  35. pgsui/electron/app/ui/styles.css +59 -0
  36. pgsui/electron/app/ui/ui_shim.js +72 -0
  37. pgsui/electron/bootstrap.py +43 -0
  38. pgsui/electron/launch.py +57 -0
  39. pgsui/electron/package.json +14 -0
  40. pgsui/example_data/__init__.py +0 -0
  41. pgsui/example_data/phylip_files/__init__.py +0 -0
  42. pgsui/example_data/phylip_files/test.phy +0 -0
  43. pgsui/example_data/popmaps/__init__.py +0 -0
  44. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  45. pgsui/example_data/structure_files/__init__.py +0 -0
  46. pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
  47. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  48. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  49. pgsui/impute/__init__.py +0 -0
  50. pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
  51. pgsui/impute/deterministic/imputers/mode.py +844 -0
  52. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  53. pgsui/impute/deterministic/imputers/phylo.py +973 -0
  54. pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
  55. pgsui/impute/supervised/__init__.py +0 -0
  56. pgsui/impute/supervised/base.py +343 -0
  57. pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  58. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
  59. pgsui/impute/supervised/imputers/random_forest.py +291 -0
  60. pgsui/impute/unsupervised/__init__.py +0 -0
  61. pgsui/impute/unsupervised/base.py +1118 -0
  62. pgsui/impute/unsupervised/callbacks.py +92 -262
  63. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
  64. pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
  65. pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
  66. pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
  67. pgsui/impute/unsupervised/imputers/vae.py +1228 -0
  68. pgsui/impute/unsupervised/loss_functions.py +261 -0
  69. pgsui/impute/unsupervised/models/__init__.py +0 -0
  70. pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
  71. pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
  72. pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
  73. pgsui/impute/unsupervised/models/vae_model.py +269 -630
  74. pgsui/impute/unsupervised/nn_scorers.py +255 -0
  75. pgsui/utils/__init__.py +0 -0
  76. pgsui/utils/classification_viz.py +608 -0
  77. pgsui/utils/logging_utils.py +22 -0
  78. pgsui/utils/misc.py +35 -480
  79. pgsui/utils/plotting.py +996 -829
  80. pgsui/utils/pretty_metrics.py +290 -0
  81. pgsui/utils/scorers.py +213 -666
  82. pg_sui-0.2.0.dist-info/RECORD +0 -75
  83. pg_sui-0.2.0.dist-info/top_level.txt +0 -3
  84. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  85. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  86. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  87. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  88. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  89. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  90. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  91. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  92. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  93. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  94. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  95. pgsui/example_data/trees/test.iqtree +0 -376
  96. pgsui/example_data/trees/test.qmat +0 -5
  97. pgsui/example_data/trees/test.rate +0 -2033
  98. pgsui/example_data/trees/test.tre +0 -1
  99. pgsui/example_data/trees/test_n10.rate +0 -19
  100. pgsui/example_data/trees/test_n100.rate +0 -109
  101. pgsui/example_data/trees/test_n500.rate +0 -509
  102. pgsui/example_data/trees/test_siterates.txt +0 -2024
  103. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  104. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  105. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  106. pgsui/example_data/vcf_files/test.vcf +0 -244
  107. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  108. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  109. pgsui/impute/estimators.py +0 -1268
  110. pgsui/impute/impute.py +0 -1463
  111. pgsui/impute/simple_imputers.py +0 -1431
  112. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
  113. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
  114. pgsui/impute/unsupervised/keras_classifiers.py +0 -697
  115. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  116. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
  117. pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
  118. pgsui/pg_sui.py +0 -261
  119. pgsui/utils/sequence_tools.py +0 -407
  120. simulation/sim_benchmarks.py +0 -333
  121. simulation/sim_treeparams.py +0 -475
  122. test/__init__.py +0 -0
  123. test/pg_sui_simtest.py +0 -215
  124. test/pg_sui_testing.py +0 -523
  125. test/test.py +0 -151
  126. test/test_pgsui.py +0 -374
  127. test/test_tkc.py +0 -185
pgsui/impute/impute.py DELETED
@@ -1,1463 +0,0 @@
- # Standard library imports
- import errno
- import gc
- import math
- import os
- import pprint
- import sys
- from collections import Counter
- from collections import defaultdict
- from operator import itemgetter
- from pathlib import Path
- from statistics import mean, median
- from contextlib import redirect_stdout
- from typing import Optional, Union, List, Dict, Tuple, Any, Callable
- from copy import deepcopy
-
-
- # Third party imports
- import numpy as np
- import pandas as pd
-
- from scipy import stats as st
-
- # from memory_profiler import memory_usage
-
- # Scikit-learn imports
- from sklearn.experimental import enable_iterative_imputer
- from sklearn import metrics
-
- from sklearn_genetic.space import Continuous, Categorical, Integer
-
- # Custom module imports
- try:
-     from .supervised.iterative_imputer_gridsearch import (
-         IterativeImputerGridSearch,
-     )
-     from .supervised.iterative_imputer_fixedparams import (
-         IterativeImputerFixedParams,
-     )
-     from .unsupervised.neural_network_imputers import VAE, UBP, SAE
-     from ..utils.misc import isnotebook
-     from ..utils.misc import timer
-     from ..data_processing.transformers import (
-         SimGenotypeDataTransformer,
-     )
- except (ModuleNotFoundError, ValueError, ImportError):
-     from impute.supervised.iterative_imputer_gridsearch import (
-         IterativeImputerGridSearch,
-     )
-     from impute.supervised.iterative_imputer_fixedparams import (
-         IterativeImputerFixedParams,
-     )
-     from impute.unsupervised.neural_network_imputers import VAE, UBP, SAE
-     from utils.misc import isnotebook
-     from utils.misc import timer
-     from data_processing.transformers import (
-         SimGenotypeDataTransformer,
-     )
-
- is_notebook = isnotebook()
-
- if is_notebook:
-     from tqdm.notebook import tqdm as progressbar
- else:
-     from tqdm import tqdm as progressbar
-
-
- class Impute:
-     """Class to impute missing data with the provided classifier.
-
-     The Impute class will either run a variational autoencoder or IterativeImputer with the provided estimator. The settings for the provided estimator should be supplied via the ``kwargs`` argument as a dictionary object, with the estimator's keyword arguments as the keys and the corresponding values. E.g., ``kwargs={"n_jobs": 4, "initial_strategy": "populations"}``\. ``clf_type`` just specifies either "classifier" or "regressor"; "regressor" is primarily for quick and dirty testing and is intended for internal use only.
-
-     Once the Impute class is initialized, the imputation should be performed with ``fit_predict()``\.
-
-     The imputed data can then be written to a file with ``write_imputed()``\.
-
-     Args:
-         clf (str or Callable estimator object): The estimator object to use. If using a variational autoencoder, the provided value should be "VAE". Otherwise, it should be a callable estimator object that is compatible with scikit-learn's IterativeImputer.
-
-         clf_type (str): Specify whether to use a "classifier" or "regressor". The "regressor" option is just for quick and dirty testing, and "classifier" should almost always be used.
-
-         kwargs (Dict[str, Any]): Settings to use with the estimator. The keys should be the estimator's keywords, and the values should be their corresponding settings.
-
-     Raises:
-         TypeError: If the ``gridparams`` values are not of the correct format for the chosen ``gridsearch_method``\.
-
-     Examples:
-         # Don't use parentheses after the estimator object.
-         >>> imputer = Impute(
-         ...     sklearn.ensemble.RandomForestClassifier,
-         ...     "classifier",
-         ...     {"n_jobs": 4, "initial_strategy": "populations", "max_iter": 25, "n_estimators": 100, "ga": True},
-         ... )
-         >>> imputed, best_params = imputer.fit_predict(df)
-         >>> imputer.write_imputed(imputed)
-         >>> print(imputed)
-         [[0, 0, 0, 0],
-          [0, 0, 0, 0],
-          [0, 1, 1, 0],
-          [2, 1, 2, 2]]
-     """
-
-     def __init__(
-         self, clf: Union[str, Callable], clf_type: str, kwargs: Dict[str, Any]
-     ) -> None:
-         self.clf = clf
-         self.clf_type = clf_type
-         self.original_num_cols = None
-
-         if self.clf == VAE or self.clf == SAE or self.clf == UBP:
-             self.algorithm = "nn"
-             self.imp_method = "Unsupervised"
-         else:
-             self.algorithm = "ii"
-             self.imp_method = "Supervised"
-
-         self.imp_name = self.clf.__name__
-
-         try:
-             self.pops = kwargs["genotype_data"].populations
-         except AttributeError:
-             self.pops = None
-
-         self.genotype_data = kwargs["genotype_data"]
-         self.verbose = kwargs["verbose"]
-
-         # Separate local variables into settings objects
-         (
-             self.imp_kwargs,
-             self.clf_kwargs,
-             self.ga_kwargs,
-             self.cv,
-             self.verbose,
-             self.n_jobs,
-             self.prefix,
-             self.column_subset,
-             self.disable_progressbar,
-             self.chunk_size,
-             self.do_validation,
-             self.do_gridsearch,
-             self.testing,
-         ) = self._gather_impute_settings(kwargs)
-
-         if self.algorithm == "ii":
-             self.imp_kwargs["pops"] = self.pops
-
-         if self.do_gridsearch:
-             for v in kwargs["gridparams"].values():
-                 if (
-                     isinstance(v, (Categorical, Integer, Continuous))
-                     and kwargs["gridsearch_method"].lower()
-                     != "genetic_algorithm"
-                 ):
-                     raise TypeError(
-                         "gridsearch_method argument must equal 'genetic_algorithm' if gridparams values are of type sklearn_genetic.space"
-                     )
-
-         self.logfilepath = os.path.join(
-             f"{self.prefix}_output",
-             "logs",
-             self.imp_method,
-             self.imp_name,
-             "imputer_progress_log.txt",
-         )
-
-         self.invalid_indexes = None
-
-         # Remove the logfile if it exists
-         try:
-             os.remove(self.logfilepath)
-         except OSError:
-             pass
-
-         Path(
-             os.path.join(
-                 f"{self.prefix}_output",
-                 "plots",
-                 self.imp_method,
-                 self.imp_name,
-             )
-         ).mkdir(parents=True, exist_ok=True)
-
-         Path(
-             os.path.join(
-                 f"{self.prefix}_output", "logs", self.imp_method, self.imp_name
-             )
-         ).mkdir(parents=True, exist_ok=True)
-
-         Path(
-             os.path.join(
-                 f"{self.prefix}_output",
-                 "reports",
-                 self.imp_method,
-                 self.imp_name,
-             )
-         ).mkdir(parents=True, exist_ok=True)
-
-         Path(
-             os.path.join(
-                 f"{self.prefix}_output",
-                 "alignments",
-                 self.imp_method,
-                 self.imp_name,
-             )
-         ).mkdir(parents=True, exist_ok=True)
-
-     @timer
-     def fit_predict(
-         self, X: pd.DataFrame
-     ) -> Tuple["GenotypeData", Dict[str, Any]]:
-         """Fit and predict imputations with IterativeImputer(estimator).
-
-         Fits and predicts imputed 012-encoded genotypes using IterativeImputer with any of the supported estimator objects. If ``gridparams=None``\, a grid search is not performed. If ``gridparams!=None``\, a RandomizedSearchCV is performed on a subset of the data and a final imputation is done on the whole dataset using the best parameters found.
-
-         Args:
-             X (pandas.DataFrame): DataFrame with 012-encoded genotypes.
-
-         Returns:
-             GenotypeData: GenotypeData object with missing genotypes imputed.
-             Dict[str, Any]: Best parameters found during grid search.
-         """
-
-         # Test whether the output file can be written to
-         try:
-             outfile = os.path.join(
-                 f"{self.prefix}_output",
-                 "alignments",
-                 self.imp_method,
-                 self.imp_name,
-                 "imputed_012.csv",
-             )
-
-             with open(outfile, "w") as fout:
-                 pass
-         except IOError as e:
-             print(f"Error: {e.errno}, {e.strerror}")
-             if e.errno == errno.EACCES:
-                 sys.exit(f"Permission denied: Cannot write to {outfile}")
-             elif e.errno == errno.EISDIR:
-                 sys.exit(f"Could not write to {outfile}; it is a directory")
-
-         # mem_usage = memory_usage((self._impute_single, (X,)))
-         # with open(f"profiling_results/memUsage_{self.prefix}.txt", "w") as fout:
-         #     fout.write(f"{max(mem_usage)}")
-         # sys.exit()
-
-         # Don't do a grid search
-         if not self.do_gridsearch:
-             imputed_df, df_scores, best_params = self._impute_single(X)
-
-             if df_scores is not None:
-                 self._print_scores(df_scores)
-
-         # Do a grid search and get the transformed data with the best parameters
-         else:
-             imputed_df, df_scores, best_params = self._impute_gridsearch(X)
-
-             if self.verbose > 0:
-                 print("\nBest Parameters:")
-                 pprint.pprint(best_params)
-
-         imp_data = self._imputed2genotypedata(imputed_df, self.genotype_data)
-
-         print("\nDone!\n")
-         return imp_data, best_params
-
-     def _df2chunks(
-         self, df: pd.DataFrame, chunk_size: Union[int, float]
-     ) -> List[pd.DataFrame]:
-         """Break up a pandas.DataFrame into chunks for imputation.
-
-         If ``chunk_size`` is the float 1.0, returns a single chunk containing all the data.
-
-         Args:
-             df (pandas.DataFrame): DataFrame to split into chunks.
-
-             chunk_size (int or float): If type is integer, each chunk contains ``chunk_size`` features. If type is float, each chunk contains ``ceil(chunk_size * len(df.columns))`` features.
-
-         Returns:
-             List[pandas.DataFrame]: List of pandas DataFrames of shape (n_samples, n_features_in_chunk).
-
-         Raises:
-             ValueError: ``chunk_size`` must be of type int or float.
-         """
-         if (
-             "initial_strategy" in self.imp_kwargs
-             and self.imp_kwargs["initial_strategy"] == "phylogeny"
-             and chunk_size != 1.0
-         ):
-             print(
-                 "WARNING: Chunking is not supported with initial_strategy == "
-                 "'phylogeny'; Setting chunk_size to 1.0 and imputing entire "
-                 "dataset"
-             )
-
-             chunk_size = 1.0
-
-         if (
-             "initial_strategy" in self.imp_kwargs
-             and self.imp_kwargs["initial_strategy"] == "mf"
-             and chunk_size != 1.0
-         ):
-             print(
-                 "WARNING: Chunking is not supported with initial_strategy == "
-                 "'mf'; Setting chunk_size to 1.0 and imputing entire "
-                 "dataset"
-             )
-
-             chunk_size = 1.0
-
-         if isinstance(chunk_size, (int, float)):
-             chunks = list()
-             df_cp = df.copy()
-
-             if isinstance(chunk_size, float):
-                 if chunk_size > 1.0:
-                     raise ValueError(
-                         f"If chunk_size is of type float, must be "
-                         f"between 0.0 and 1.0; Value supplied was {chunk_size}"
-                     )
-
-                 elif chunk_size == 1.0:
-                     # All data in one chunk
-                     chunks.append(df_cp)
-                     if self.verbose > 1:
-                         print(
-                             "Imputing all features at once since chunk_size is "
-                             "set to 1.0"
-                         )
-
-                     return chunks
-
-                 # Convert the fractional chunk_size into a number of columns
-                 chunk_size = math.ceil(len(df.columns) * chunk_size)
-
-         else:
-             raise ValueError(
-                 f"chunk_size must be of type float or integer, "
-                 f"but type {type(chunk_size)} was passed"
-             )
-
-         chunk_len_list = list()
-         num_chunks = math.ceil(len(df.columns) / chunk_size)
-         for i in range(num_chunks):
-             chunks.append(df_cp.iloc[:, i * chunk_size : (i + 1) * chunk_size])
-             chunk_len_list.append(len(chunks[i].columns))
-
-         chunk_len = ",".join([str(x) for x in chunk_len_list])
-
-         if self.verbose > 1:
-             print(
-                 f"Data split into {num_chunks} chunks with {chunk_len} features"
-             )
-
-         return chunks
-
-     def _imputed2genotypedata(self, imp012, genotype_data):
-         """Create a new instance of a GenotypeData object from the imputed DataFrame.
-
-         The imputed, decoded DataFrame gets written to file and re-loaded to instantiate a new GenotypeData object.
-
-         Args:
-             imp012 (pandas.DataFrame): Imputed 012-encoded DataFrame.
-
-             genotype_data (GenotypeData): Original GenotypeData object to load attributes from.
-
-         Returns:
-             GenotypeData: GenotypeData object with imputed data.
-         """
-         imputed_gd = deepcopy(genotype_data)
-
-         if self.clf == VAE:
-             if len(imp012.shape) == 3:
-                 if imp012.shape[-1] == 4:
-                     imputed_gd.genotypes_onehot = imp012
-                 else:
-                     raise ValueError("Invalid shape for imputed output.")
-             elif len(imp012.shape) == 2:
-                 if isinstance(imp012, pd.DataFrame):
-                     imp012 = imp012.to_numpy()
-                 imp012 = imp012.astype(int)
-                 if np.max(imp012) > 2:
-                     imputed_gd.genotypes_int = imp012
-                 else:
-                     imputed_gd.genotypes_012 = imp012
-             else:
-                 raise ValueError(
-                     f"Invalid shape for imputed output: {imp012.shape}"
-                 )
-         else:
-             imputed_gd.genotypes_012 = imp012
-
-         return imputed_gd
-
-     def _subset_data_for_gridsearch(
-         self,
-         df: pd.DataFrame,
-         columns_to_subset: Union[int, float],
-         original_num_cols: int,
-     ) -> Tuple[pd.DataFrame, np.ndarray]:
-         """Randomly subset a pandas.DataFrame.
-
-         Subsets the pandas DataFrame to a ``columns_to_subset`` fraction (or count) of the data. Allows for faster validation.
-
-         Args:
-             df (pandas.DataFrame): DataFrame with 012-encoded genotypes.
-
-             columns_to_subset (int or float): If float, the proportion of the DataFrame to randomly subset; should be between 0 and 1. If integer, subsets ``columns_to_subset`` random columns.
-
-             original_num_cols (int): Number of columns in the original DataFrame.
-
-         Returns:
-             pandas.DataFrame: New DataFrame with a random subset of features.
-             numpy.ndarray: Sorted numpy array of the column names to keep.
-
-         Raises:
-             TypeError: column_subset must be of type float or int.
-         """
-
-         # Get a random numpy array of column names to select
-         if isinstance(columns_to_subset, float):
-             n = int(original_num_cols * columns_to_subset)
-         elif isinstance(columns_to_subset, int):
-             n = columns_to_subset
-         else:
-             raise TypeError(
-                 f"column_subset must be of type float or int, "
-                 f"but got {type(columns_to_subset)}"
-             )
-
-         col_arr = np.array(df.columns)
-
-         if n > len(df.columns):
-             if self.verbose > 0:
-                 print(
-                     "Warning: column_subset is greater than remaining columns following filtering. Using all columns"
-                 )
-
-             df_sub = df.copy()
-             cols = col_arr.copy()
-         else:
-             cols = np.random.choice(col_arr, n, replace=False)
-             df_sub = df.loc[:, np.sort(cols)]
-             # df_sub = df.sample(n=n, axis="columns", replace=False)
-
-         df_sub.columns = df_sub.columns.astype(str)
-
-         return df_sub, np.sort(cols)
-
-     def _print_scores(self, df_scores: pd.DataFrame) -> None:
-         """Print validation scores as a pandas.DataFrame.
-
-         Args:
-             df_scores (pandas.DataFrame): DataFrame with score statistics.
-         """
-         if self.verbose > 0:
-             print("Validation scores:")
-             print(df_scores)
-
-     def _write_imputed_params_score(
-         self, df_scores: pd.DataFrame, best_params: Dict[str, Any]
-     ) -> None:
-         """Save the best score(s) and best parameters to files on disk.
-
-         Args:
-             df_scores (pandas.DataFrame or float): Best RMSE or accuracy scores for the regressor or classifier, respectively.
-
-             best_params (dict): Best parameters found in the grid search.
-         """
-
-         best_score_outfile = os.path.join(
-             f"{self.prefix}_output",
-             "reports",
-             self.imp_method,
-             self.imp_name,
-             "imputed_best_score.csv",
-         )
-         best_params_outfile = os.path.join(
-             f"{self.prefix}_output",
-             "reports",
-             self.imp_method,
-             self.imp_name,
-             "imputed_best_params.csv",
-         )
-
-         if isinstance(df_scores, pd.DataFrame):
-             df_scores.to_csv(
-                 best_score_outfile,
-                 header=True,
-                 index=False,
-                 float_format="%.2f",
-             )
-
-         else:
-             with open(best_score_outfile, "w") as fout:
-                 fout.write(f"accuracy,{df_scores}\n")
-
-         with open(best_params_outfile, "w") as fout:
-             fout.write("parameter,best_value\n")
-             for k, v in best_params.items():
-                 fout.write(f"{k},{v}\n")
-
-     def _impute_single(
-         self, df: pd.DataFrame
-     ) -> Tuple[pd.DataFrame, pd.DataFrame, None]:
-         """Run IterativeImputer without a grid search.
-
-         Will do a different type of validation if ``do_validation == True``\.
-
-         Args:
-             df (pandas.DataFrame): DataFrame of 012-encoded genotypes.
-
-         Returns:
-             pandas.DataFrame: Imputed DataFrame of 012-encoded genotypes.
-             pandas.DataFrame: DataFrame with validation scores.
-             NoneType: Only used with _impute_gridsearch. Set to None here for compatibility.
-         """
-         if self.verbose > 0:
-             print(
-                 f"\nDoing {self.clf.__name__} imputation without grid search..."
-             )
-
-         if self.algorithm == "nn":
-             clf = None
-
-         else:
-             clf = self.clf(**self.clf_kwargs)
-
-         if self.do_validation:
-             if self.verbose > 0:
-                 print(f"Estimating {self.clf.__name__} validation scores...")
-
-             if self.disable_progressbar:
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(
-                             f"Doing {self.clf.__name__} imputation "
-                             f"without grid search...\n"
-                         )
-
-                         if self.verbose > 0:
-                             print(
-                                 f"Estimating {self.clf.__name__} "
-                                 f"validation scores...\n"
-                             )
-
-             df_scores = self._imputer_validation(df, clf)
-
-             if self.verbose > 0:
-                 print(f"\nDone with {self.clf.__name__} validation!\n")
-
-             if self.disable_progressbar:
-                 if self.verbose > 0:
-                     with open(self.logfilepath, "a") as fout:
-                         # Redirect to progress logfile
-                         with redirect_stdout(fout):
-                             print(
-                                 f"\nDone with {self.clf.__name__} validation!\n"
-                             )
-
-         else:
-             df_scores = None
-
-         if self.algorithm == "nn":
-             imputer = None
-
-         else:
-             imputer = self._define_iterative_imputer(
-                 clf,
-                 self.logfilepath,
-                 clf_kwargs=self.clf_kwargs,
-                 imp_kwargs=self.imp_kwargs,
-             )
-
-         if self.original_num_cols is None:
-             self.original_num_cols = len(df.columns)
-
-         # Remove non-biallelic loci
-         # Only used if initial_strategy == 'phylogeny'
-         if self.invalid_indexes is not None:
-             df.drop(
-                 labels=self.invalid_indexes,
-                 axis=1,
-                 inplace=True,
-             )
-
-         if self.disable_progressbar:
-             if self.verbose > 0:
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(f"Doing {self.clf.__name__} imputation...\n")
-
-         df_chunks = self._df2chunks(df, self.chunk_size)
-         imputed_df = self._impute_df(df_chunks, imputer)
-         imputed_df = imputed_df.astype(str)
-
-         if self.disable_progressbar:
-             if self.verbose > 0:
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(f"\nDone with {self.clf.__name__} imputation!\n")
-
-         del df_chunks
-         gc.collect()
-
-         self._validate_imputed(imputed_df)
-
-         if self.verbose > 0:
-             print(f"\nDone with {self.clf.__name__} imputation!\n")
-
-         return imputed_df, df_scores, None
-
-     def _impute_gridsearch(
-         self, df: pd.DataFrame
-     ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any]]:
-         """Do IterativeImputer with RandomizedSearchCV or GASearchCV.
-
-         Args:
-             df (pandas.DataFrame): DataFrame with 012-encoded genotypes.
-
-         Returns:
-             pandas.DataFrame: DataFrame with 012-encoded genotypes imputed using the best parameters found with the grid search.
-             pandas.DataFrame or float: Validation score(s) found during the grid search.
-             dict: Best parameters found during the grid search.
-         """
-         original_num_cols = len(df.columns)
-         df_subset, cols_to_keep = self._subset_data_for_gridsearch(
-             df, self.column_subset, original_num_cols
-         )
-
-         print(f"Doing {self.clf.__name__} grid search...")
-
-         if self.verbose > 0:
-             print(f"Validation dataset size: {len(df_subset.columns)}\n")
-
-         if self.disable_progressbar:
-             with open(self.logfilepath, "a") as fout:
-                 # Redirect to progress logfile
-                 with redirect_stdout(fout):
-                     print(f"Doing {self.clf.__name__} grid search...\n")
-
-         if self.algorithm == "nn":
-             self.imp_kwargs.pop("str_encodings")
-             imputer = self.clf(
-                 **self.clf_kwargs,
-                 **self.imp_kwargs,
-                 ga_kwargs=self.ga_kwargs,
-             )
-
-             df_imp = pd.DataFrame(
-                 imputer.fit_transform(df_subset), columns=cols_to_keep
-             )
-
-             df_imp = df_imp.astype("float")
-             df_imp = df_imp.astype("int64")
-
-         else:
-             clf = self.clf()
-             df_subset = df_subset.astype("float32")
-             df_subset.replace(-9.0, np.nan, inplace=True)
-
-             imputer = self._define_iterative_imputer(
-                 clf,
-                 self.logfilepath,
-                 clf_kwargs=self.clf_kwargs,
-                 ga_kwargs=self.ga_kwargs,
-                 n_jobs=self.n_jobs,
-                 clf_type=self.clf_type,
-                 imp_kwargs=self.imp_kwargs,
-             )
-
-             if len(cols_to_keep) == original_num_cols:
-                 cols_to_keep = None
-
-             Xt, params_list, score_list = imputer.fit_transform(
-                 df, cols_to_keep
-             )
-
-         if self.verbose > 0:
-             print(f"\nDone with {self.clf.__name__} grid search!")
-
-         if self.disable_progressbar:
-             if self.verbose > 0:
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(
-                             f"\nDone with {self.clf.__name__} grid search!"
-                         )
-
-         if self.algorithm == "ii":
-             # Iterative Imputer.
-             del imputer
-             del Xt
-
-             # Average or mode of best parameters
-             # and write them to a file
-             best_params = self._get_best_params(params_list)
-
-             # -9 is a sentinel for scores that could not be estimated
-             avg_score = mean(abs(x) for x in score_list if x != -9)
-             median_score = median(abs(x) for x in score_list if x != -9)
-             max_score = max(abs(x) for x in score_list if x != -9)
-             min_score = min(abs(x) for x in score_list if x != -9)
-
-             df_scores = pd.DataFrame(
-                 {
-                     "Mean": avg_score,
-                     "Median": median_score,
-                     "Min": min_score,
-                     "Max": max_score,
-                 },
-                 index=[0],
-             )
-
-             df_scores = df_scores.round(2)
-
-             del avg_score
-             del median_score
-             del max_score
-             del min_score
-             gc.collect()
-         else:
-             # Using neural network.
-             best_params = imputer.best_params_
-             df_scores = imputer.best_score_
-             df_scores = round(df_scores * 100, 2)
-             best_imputer = None
-
-         if self.clf_type == "classifier" and self.algorithm != "nn":
-             df_scores = df_scores.apply(lambda x: x * 100)
-
-         self._write_imputed_params_score(df_scores, best_params)
-
-         # Change values to the ones in best_params
-         self.clf_kwargs.update(best_params)
-
-         if self.algorithm == "ii":
-             if hasattr(self.clf(), "n_jobs"):
-                 self.clf_kwargs["n_jobs"] = self.n_jobs
-
-             best_clf = self.clf(**self.clf_kwargs)
-
-         gc.collect()
-
-         if self.verbose > 0:
-             print(
-                 f"\nDoing {self.clf.__name__} imputation "
-                 f"with best found parameters...\n"
-             )
-
-         if self.disable_progressbar:
-             with open(self.logfilepath, "a") as fout:
-                 # Redirect to progress logfile
-                 with redirect_stdout(fout):
-                     print(
-                         f"\nDoing {self.clf.__name__} imputation "
-                         f"with best found parameters...\n"
-                     )
-
-         if self.algorithm == "ii":
-             best_imputer = self._define_iterative_imputer(
-                 best_clf,
-                 self.logfilepath,
-                 clf_kwargs=self.clf_kwargs,
-                 imp_kwargs=self.imp_kwargs,
-             )
-
-         final_cols = None
-         if len(df.columns) < original_num_cols:
-             final_cols = np.array(df.columns)
-
-         if self.algorithm == "nn" and self.column_subset == 1.0:
-             imputed_df = df_imp.copy()
-             df_chunks = None
-         else:
-             df_chunks = self._df2chunks(df, self.chunk_size)
-             imputed_df = self._impute_df(
-                 df_chunks, best_imputer, cols_to_keep=final_cols
-             )
-
-         del df_chunks, df
-         gc.collect()
-
-         self._validate_imputed(imputed_df)
-
-         if self.verbose > 0:
-             print(f"Done with {self.clf.__name__} imputation!\n")
-
-         if self.disable_progressbar:
-             with open(self.logfilepath, "a") as fout:
-                 # Redirect to progress logfile
-                 with redirect_stdout(fout):
-                     print(f"Done with {self.clf.__name__} imputation!\n")
-
-         return imputed_df, df_scores, best_params
-
-     def _imputer_validation(
-         self, df: pd.DataFrame, clf: Optional[Callable]
-     ) -> pd.DataFrame:
-         """Validate imputation with a validation test set.
-
-         Validates imputation by running it on a validation test set ``cv`` times. Actual missing values are imputed with sklearn.impute.SimpleImputer, and then missing values are randomly introduced to known genotypes. The dataset with no missing data is compared to the dataset with known missing data to obtain validation scores.
-
-         Args:
-             df (pandas.DataFrame): 012-encoded genotypes to impute.
-
-             clf (sklearn classifier instance or None): sklearn classifier instance with which to run the imputation.
-
-         Raises:
-             ValueError: If none of the scores could be estimated and the reps variable is empty.
-
-         Returns:
-             pandas.DataFrame: Validation scores in a pandas DataFrame object. Contains the scoring metric, mean, median, minimum, and maximum validation scores among all features, and the lower and upper 95% confidence interval among the replicates.
-         """
-         reps = defaultdict(list)
-         for cnt, rep in enumerate(
-             progressbar(
-                 range(self.cv),
-                 desc="Validation replicates: ",
-                 leave=True,
-                 disable=self.disable_progressbar,
-             ),
-             start=1,
-         ):
-             if self.disable_progressbar:
-                 perc = int((cnt / self.cv) * 100)
-                 if self.verbose > 0:
-                     print(f"Validation replicate {cnt}/{self.cv} ({perc}%)")
-
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(
-                             f"Validation replicate {cnt}/{self.cv} ({perc}%)"
-                         )
-
-             scores = self._impute_eval(df, clf)
-
-             for k, score_list in scores.items():
-                 # -9 marks features that could not be scored; drop them
-                 score_list_filtered = [x for x in score_list if x != -9]
-
-                 if score_list_filtered:
-                     reps[k].append(score_list_filtered)
-
-         if not reps:
-             raise ValueError("None of the features could be validated!")
-
-         ci_lower = dict()
-         ci_upper = dict()
-         for k, v in reps.items():
-             reps_t = np.array(v).T.tolist()
-
-             cis = list()
-             if len(reps_t) > 1:
-                 for rep in reps_t:
-                     rep = [abs(x) for x in rep]
-
-                     cis.append(
-                         st.t.interval(
-                             alpha=0.95,
-                             df=len(rep) - 1,
-                             loc=np.mean(rep),
-                             scale=st.sem(rep),
-                         )
-                     )
-
-                 ci_lower[k] = mean(x[0] for x in cis)
-                 ci_upper[k] = mean(x[1] for x in cis)
-             else:
-                 print(
-                     "Warning: There was no variance among replicates; "
-                     "the 95% CI could not be calculated"
-                 )
-
-                 ci_lower[k] = np.nan
-                 ci_upper[k] = np.nan
-
-         results_list = list()
-         for k, score_list in scores.items():
-             avg_score = mean(abs(x) for x in score_list if x != -9)
-             median_score = median(abs(x) for x in score_list if x != -9)
-             max_score = max(abs(x) for x in score_list if x != -9)
-             min_score = min(abs(x) for x in score_list if x != -9)
-
-             results_list.append(
-                 {
-                     "Metric": k,
-                     "Mean": avg_score,
-                     "Median": median_score,
-                     "Min": min_score,
-                     "Max": max_score,
-                     "Lower 95% CI": ci_lower[k],
-                     "Upper 95% CI": ci_upper[k],
-                 }
-             )
-
-         df_scores = pd.DataFrame(results_list)
-
-         if self.clf_type == "classifier":
-             columns_list = [
-                 "Mean",
-                 "Median",
-                 "Min",
-                 "Max",
-                 "Lower 95% CI",
-                 "Upper 95% CI",
-             ]
-
-             df_scores = df_scores.round(2)
-
-         outfile = os.path.join(
-             f"{self.prefix}_output",
-             "reports",
-             self.imp_method,
-             self.imp_name,
-             "imputed_best_score.csv",
-         )
-         df_scores.to_csv(outfile, header=True, index=False)
-
-         del results_list
-         gc.collect()
-
-         return df_scores
-
-     def _impute_df(
-         self,
-         df_chunks: List[pd.DataFrame],
-         imputer: Optional[
-             Union[IterativeImputerFixedParams, IterativeImputerGridSearch]
-         ] = None,
-         cols_to_keep: Optional[np.ndarray] = None,
-     ) -> pd.DataFrame:
-         """Impute a list of pandas.DataFrame objects using the custom IterativeImputer class.
-
-         The DataFrames are chunks of the whole input data, with each chunk corresponding to ``chunk_size`` features from ``_df2chunks()``\.
-
-         Args:
-             df_chunks (List[pandas.DataFrame]): List of DataFrames of shape (n_samples, n_features_in_chunk).
-
-             imputer (imputer or classifier instance or None): Imputer or classifier instance to perform the imputation.
-
-             cols_to_keep (numpy.ndarray or None): Final bi-allelic columns to keep. If some columns were non-biallelic, it will be a subset of columns.
-
-         Returns:
-             pandas.DataFrame: Single DataFrame object, with all the imputed chunks concatenated together.
-         """
-         imputed_chunks = list()
-         for i, Xchunk in enumerate(df_chunks, start=1):
-             if self.clf_type == "classifier":
-                 if self.algorithm == "nn":
-                     if self.clf == VAE:
-                         self.clf_kwargs["testing"] = self.testing
-                     imputer = self.clf(
-                         genotype_data=self.imp_kwargs["genotype_data"],
-                         disable_progressbar=self.disable_progressbar,
-                         prefix=self.prefix,
-                         **self.clf_kwargs,
-                     )
-                     df_imp = pd.DataFrame(
-                         imputer.fit_transform(Xchunk),
-                     )
-                     df_imp = df_imp.astype("float")
-                     df_imp = df_imp.astype("Int8")
-
-                 else:
-                     imp, _, __ = imputer.fit_transform(
-                         Xchunk, valid_cols=cols_to_keep
-                     )
-                     df_imp = pd.DataFrame(imp)
-
-                 imputed_chunks.append(df_imp)
-
-             else:
-                 # Regressor. Needs to be rounded to integer first.
-                 imp, _, __ = imputer.fit_transform(
-                     Xchunk, valid_cols=cols_to_keep
-                 )
-                 df_imp = pd.DataFrame(imp)
-                 df_imp = df_imp.round(0).astype("Int8")
-
-                 imputed_chunks.append(df_imp)
-
-         concat_df = pd.concat(imputed_chunks, axis=1)
-
-         del imputed_chunks
-         gc.collect()
-
-         return concat_df
-
-     def _validate_imputed(self, df: pd.DataFrame) -> None:
-         """Assert that there is no missing data left in the imputed DataFrame.
-
-         Args:
-             df (pandas.DataFrame): DataFrame with imputed 012-encoded genotypes.
-
-         Raises:
-             AssertionError: Error if missing values are still found in the dataset after imputation.
-         """
-         assert (
-             not df.isnull().values.any()
-         ), "Imputation failed...Missing values found in the imputed dataset"
-
-     def _get_best_params(
-         self, params_list: List[Dict[str, Any]]
-     ) -> Dict[str, Any]:
-         """Get the best parameters from the grid search.
-
-         Determines each parameter's type and takes either the mean (if the type is numeric) or the mode (if the type is string or boolean).
-
-         Args:
-             params_list (List[dict]): List of grid search parameter values.
-
-         Returns:
-             Dict[str, Any]: Dictionary with parameters as keys and their best values.
-         """
-         best_params = dict()
-         keys = list(params_list[0].keys())
-         first_key = keys[0]
-
-         params_list = list(filter(lambda i: i[first_key] != -9, params_list))
-
-         for k in keys:
-             if all(
-                 isinstance(x[k], (int, float)) for x in params_list if x[k]
-             ):
-                 if all(isinstance(y[k], int) for y in params_list):
-                     best_params[k] = self._average_list_of_dicts(
-                         params_list, k, is_int=True
-                     )
-
-                 elif all(isinstance(z[k], float) for z in params_list):
-                     best_params[k] = self._average_list_of_dicts(
-                         params_list, k
-                     )
-
-             elif all(isinstance(x[k], (str, bool)) for x in params_list):
-                 best_params[k] = self._mode_list_of_dicts(params_list, k)
-
-             else:
-                 best_params[k] = self._mode_list_of_dicts(params_list, k)
-
-         return best_params
-
-     def _mode_list_of_dicts(
-         self, l: List[Dict[str, Union[str, bool]]], k: str
-     ) -> str:
-         """Get the mode for key k in a list of dictionaries.
-
-         Args:
-             l (list(dict)): List of dictionaries.
-             k (str): Key to find the mode across all dictionaries in l.
-
-         Returns:
-             str or bool: Most common value across the list of dictionaries for one key.
-         """
-         k_count = Counter(map(itemgetter(k), l))
-         return k_count.most_common()[0][0]
-
-     def _average_list_of_dicts(
-         self,
-         l: List[Dict[str, Union[int, float]]],
-         k: str,
-         is_int: bool = False,
-     ) -> Union[int, float]:
-         """Get the average of a given key in a list of dictionaries.
-
-         Args:
-             l (List[Dict[str, Union[int, float]]]): List of dictionaries.
-
-             k (str): Key to average across the list of dictionaries.
-
-             is_int (bool, optional): Whether the value for key k is an integer. If False, it is expected to be of type float. Defaults to False.
-
-         Returns:
-             int or float: Average of the given key across the list of dictionaries.
-         """
-         if is_int:
-             return int(sum(d[k] for d in l) / len(l))
-         else:
-             return sum(d[k] for d in l) / len(l)
-
-     def _gather_impute_settings(
-         self, kwargs: Dict[str, Any]
-     ) -> Tuple[
-         Optional[Dict[str, Any]],
-         Optional[Dict[str, Any]],
-         Optional[Dict[str, Any]],
-         Optional[int],
-         Optional[int],
-         Optional[int],
-         Optional[str],
-         Optional[Union[int, float]],
-         Optional[bool],
-         Optional[Union[int, float]],
-         Optional[bool],
-         Optional[bool],
-         Optional[bool],
-     ]:
-         """Gather impute settings from the kwargs object.
-
-         Gathers impute settings from the various imputation classes and IterativeImputer for use with the ``Impute`` class. Returns dictionaries with the keyword arguments as keys and the settings as values. The imputation can then be run by specifying IterativeImputer(**imp_kwargs).
-
-         Args:
-             kwargs (Dict[str, Any]): Dictionary with keys as the keyword arguments and their corresponding values.
-
-         Returns:
-             Dict[str, Any]: IterativeImputer keyword arguments.
-             Dict[str, Any]: Classifier keyword arguments.
-             Dict[str, Any]: Genetic algorithm keyword arguments.
-             int: Number of cross-validation folds to use with non-grid search validation.
-             int: Verbosity setting. 0 is silent, 2 is most verbose.
-             int: Number of processors to use with grid search.
-             str or None: Prefix for output files.
-             int or float: Proportion of dataset (if float) or number of columns (if int) to use for grid search.
-             bool: If True, disables the tqdm progress bar and just prints status updates to a file. If False, uses the tqdm progress bar.
-             int or float: Chunk sizes for doing full imputation following grid search. If int, then splits into chunks of ``chunk_size``\. If float, then splits into chunks of ``n_features * chunk_size``\.
-             bool: Whether to do validation if ``gridparams is None``.
-             bool: True if doing grid search, False otherwise.
-             bool: Whether testing mode is enabled (internal use).
-         """
-         n_jobs = kwargs.pop("n_jobs", 1)
-         cv = kwargs.pop("cv", None)
-         column_subset = kwargs.pop("column_subset", None)
-         chunk_size = kwargs.pop("chunk_size", 1.0)
-         do_validation = kwargs.pop("do_validation", False)
-         verbose = kwargs.get("verbose", 0)
-         disable_progressbar = kwargs.get("disable_progressbar", False)
-         prefix = kwargs.get("prefix", "imputer")
-         testing = kwargs.get("testing", False)
-         do_gridsearch = False if kwargs["gridparams"] is None else True
-
-         if prefix is None:
-             prefix = "imputer"
-
-         imp_kwargs = kwargs.copy()
-         clf_kwargs = kwargs.copy()
-         ga_kwargs = kwargs.copy()
-
-         imp_keys = [
-             "grid_iter",
-             "tol",
-             "verbose",
-             "genotype_data",
-             "str_encodings",
-             "progress_update_percent",
-             "sim_strategy",
-             "sim_prop_missing",
-             "gridparams",
-             "gridsearch_method",
-             "scoring_metric",
-             "disable_progressbar",
-             "prefix",
-         ]
-
-         if self.algorithm == "ii":
-             imp_keys.extend(
-                 [
-                     "n_nearest_features",
-                     "max_iter",
-                     "initial_strategy",
-                     "imputation_order",
-                     "skip_complete",
-                     "random_state",
-                     "sample_posterior",
-                 ]
-             )
-
-         ga_keys = [
-             "population_size",
-             "tournament_size",
-             "elitism",
-             "crossover_probability",
-             "mutation_probability",
-             "ga_algorithm",
-             "early_stop_gen",
-         ]
-
-         to_remove = ["self", "__class__"]
-
-         for k, v in clf_kwargs.copy().items():
-             if k in to_remove:
-                 clf_kwargs.pop(k)
-             if k in imp_keys:
-                 clf_kwargs.pop(k)
-             if k in ga_keys:
-                 clf_kwargs.pop(k)
-
-         if "clf_random_state" in clf_kwargs:
-             clf_kwargs["random_state"] = clf_kwargs.pop("clf_random_state")
-
-         if "clf_tol" in clf_kwargs:
-             clf_kwargs["tol"] = clf_kwargs.pop("clf_tol")
-
-         for k, v in imp_kwargs.copy().items():
-             if k not in imp_keys:
-                 imp_kwargs.pop(k)
-
-         for k, v in ga_kwargs.copy().items():
-             if k not in ga_keys:
-                 ga_kwargs.pop(k)
-
-         if "ga_algorithm" in ga_kwargs:
-             ga_kwargs["algorithm"] = ga_kwargs.pop("ga_algorithm")
-
-         if self.clf_type == "regressor":
-             ga_kwargs["criteria"] = "min"
-
-         elif self.clf_type == "classifier":
-             ga_kwargs["criteria"] = "max"
-
-         return (
-             imp_kwargs,
-             clf_kwargs,
-             ga_kwargs,
-             cv,
-             verbose,
-             n_jobs,
-             prefix,
-             column_subset,
-             disable_progressbar,
-             chunk_size,
-             do_validation,
-             do_gridsearch,
-             testing,
-         )
-
-     def _impute_eval(
-         self, df: pd.DataFrame, clf: Optional[Callable]
-     ) -> Dict[str, List[Union[float, int]]]:
-         """Run IterativeImputer on a pandas.DataFrame and evaluate it.
-
-         The DataFrame columns are randomly subset, and a fraction of the known, true values is converted to missing data to allow evaluation of the model with either accuracy or mean_squared_error scores.
-
-         Args:
-             df (pandas.DataFrame): Original DataFrame with 012-encoded genotypes.
-
-             clf (sklearn Classifier or None): Classifier instance to use with IterativeImputer.
-
-         Returns:
-             Dict[str, List[float or int]]: Validation scores for the current imputation.
-         """
-         cols = np.random.choice(
-             df.columns,
-             int(len(df.columns) * self.column_subset),
-             replace=False,
-         )
-
-         if self.verbose > 0:
-             print(
-                 f"\nSimulating validation data with missing data proportion "
-                 f"{self.imp_kwargs['sim_prop_missing']} and strategy "
-                 f"{self.imp_kwargs['sim_strategy']}"
-             )
-
-         df_known = df.copy()
-
-         if self.algorithm == "nn":
-             df_unknown = df_known.copy()
-
-         else:
-             df_unknown = pd.DataFrame(
-                 SimGenotypeDataTransformer(
-                     self.genotype_data,
-                     prop_missing=self.imp_kwargs["sim_prop_missing"],
-                     strategy=self.imp_kwargs["sim_strategy"],
-                 ).fit_transform(df_known)
-             )
-
-         df_unknown_slice = df_unknown[cols]
-
-         # Neural networks
-         if self.algorithm == "nn":
-             df_stg = df_unknown_slice.copy()
-
-             for col in df_stg.columns:
-                 df_stg[col] = df_stg[col].replace({pd.NA: np.nan})
-             # df_stg.fillna(-9, inplace=True)
-
-             imputer = self.clf(
-                 prefix=self.prefix, **self.clf_kwargs, **self.imp_kwargs
-             )
-
-             df_imp = pd.DataFrame(
-                 imputer.fit_transform(df_stg.to_numpy()),
-                 columns=cols,
-             )
-
-             df_unknown_slice = pd.DataFrame(imputer.y_simulated_, columns=cols)
-             df_known_slice = pd.DataFrame(imputer.y_original_, columns=cols)
-
-             df_missing_mask = pd.DataFrame(
-                 imputer.sim_missing_mask_, columns=cols
-             )
-
-             df_imp = df_imp.astype("float")
-             df_imp = df_imp.astype("int64")
-
-         else:
-             df_known_slice = df_known[cols]
-             df_missing_mask = df_unknown_slice.isnull()
-
-             df_unknown.replace(-9, np.nan, inplace=True)
-
-             # Using IterativeImputer
-             df_stg = df_unknown.copy()
-
-             imputer = self._define_iterative_imputer(
-                 clf,
-                 self.logfilepath,
-                 clf_kwargs=self.clf_kwargs,
-                 imp_kwargs=self.imp_kwargs,
-             )
-
-             imp_arr = imputer.fit_transform(df_stg)
-
-             # Get only the subset of validation columns
-             # get_loc returns the index of the value
-             df_imp = pd.DataFrame(
-                 imp_arr[:, [df_unknown.columns.get_loc(i) for i in cols]],
-                 columns=cols,
-             )
-
-         # Get the score of each column
-         scores = defaultdict(list)
-         for col in df_known_slice.columns:
-             # Adapted from: https://medium.com/analytics-vidhya/using-scikit-learns-iterative-imputer-694c3cca34de
-
-             mask = df_missing_mask[col]
-             y_true = df_known[col]
-             y_true = y_true[mask]
-
-             y_pred = df_imp[col]
-             y_pred = y_pred[mask]
-
-             if self.clf_type == "classifier":
-                 if y_pred.empty:
-                     # Nothing to score for this column; use the -9 sentinel
-                     scores["accuracy"].append(-9)
-                     scores["precision"].append(-9)
-                     scores["f1"].append(-9)
-                     scores["recall"].append(-9)
-                     scores["jaccard"].append(-9)
-                     continue
-
-                 # Had to do this because we get an incompatible type error
-                 # if using initial_imputation="populations"
-                 if y_true.dtypes != "int64":
-                     y_true = y_true.astype("int64")
-                 if y_pred.dtypes != "int64":
-                     y_pred = y_pred.astype("int64")
-
-                 scores["accuracy"].append(
-                     metrics.accuracy_score(y_true, y_pred)
-                 )
-
-                 scores["precision"].append(
-                     metrics.precision_score(
-                         y_true, y_pred, average="macro", zero_division=0
-                     )
-                 )
-
-                 scores["f1"].append(
-                     metrics.f1_score(
-                         y_true, y_pred, average="macro", zero_division=0
-                     )
-                 )
-
-                 scores["recall"].append(
-                     metrics.recall_score(
-                         y_true, y_pred, average="macro", zero_division=0
-                     )
-                 )
-
-                 scores["jaccard"].append(
-                     metrics.jaccard_score(
-                         y_true, y_pred, average="macro", zero_division=0
-                     )
-                 )
-
-             else:
-                 scores["explained_var"].append(
-                     metrics.explained_variance_score(y_true, y_pred)
-                 )
-
-                 scores["rmse"].append(
-                     metrics.mean_squared_error(y_true, y_pred, squared=False)
-                 )
-
-         del df_stg, df_imp, df_known, df_known_slice, df_unknown
-
-         if self.algorithm == "nn":
-             del cols
-         else:
-             del imp_arr
-             del imputer
-             del cols
-
-         gc.collect()
-
-         return scores
-
-     def _define_iterative_imputer(
-         self,
-         clf: Callable,
-         logfilepath: str,
-         clf_kwargs: Optional[Dict[str, Any]] = None,
-         imp_kwargs: Optional[Dict[str, Any]] = None,
-         ga_kwargs: Optional[Dict[str, Any]] = None,
-         n_jobs: Optional[int] = None,
-         clf_type: Optional[str] = None,
-     ) -> Union[IterativeImputerGridSearch, IterativeImputerFixedParams]:
-         """Define an IterativeImputer instance.
-
-         The instances are of custom, overloaded IterativeImputer classes.
-
-         Args:
-             clf (sklearn Classifier instance): Estimator to use with IterativeImputer.
-
-             logfilepath (str): Path to the progress log file.
-
-             clf_kwargs (dict, optional): Keyword arguments for the classifier. Defaults to None.
-
-             imp_kwargs (Dict[str, Any], optional): Keyword arguments for imputation settings. Defaults to None.
-
-             ga_kwargs (dict, optional): Keyword arguments for the genetic algorithm grid search. Defaults to None.
-
-             n_jobs (int, optional): Number of parallel jobs to use with the IterativeImputer grid search. Ignored if not doing a grid search. Defaults to None.
-
-             clf_type (str, optional): Type of estimator. Valid options are "classifier" or "regressor". Ignored if not doing a grid search. Defaults to None.
-
-         Returns:
-             IterativeImputerGridSearch or IterativeImputerFixedParams: IterativeImputer instance.
-         """
-         if not self.do_gridsearch:
-             imp = IterativeImputerFixedParams(
-                 logfilepath,
-                 clf_kwargs,
-                 estimator=clf,
-                 **imp_kwargs,
-             )
-
-         else:
-             # Create iterative imputer
-             imp = IterativeImputerGridSearch(
-                 logfilepath,
-                 clf_kwargs,
-                 ga_kwargs,
-                 estimator=clf,
-                 grid_n_jobs=n_jobs,
-                 clf_type=clf_type,
-                 **imp_kwargs,
-             )
-
-         return imp
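
For orientation, the 0.2.0 entry point removed by this file was the Impute class itself. Below is a minimal usage sketch reconstructed from the class's own docstring above; it is not a verified 0.2.0 script. The names genotype_data and df are placeholders for a loaded GenotypeData object and its 012-encoded genotype DataFrame, and the pgsui.impute.impute import path refers to the module deleted by this diff (the modules under pgsui/impute/*/imputers/ in 1.6.14.dev9 appear to supersede it).

    # Sketch only: `genotype_data` and `df` are assumed to already exist.
    from sklearn.ensemble import RandomForestClassifier

    from pgsui.impute.impute import Impute  # module removed in 1.6.14.dev9

    imputer = Impute(
        RandomForestClassifier,  # pass the estimator class, not an instance
        "classifier",
        {
            "genotype_data": genotype_data,  # placeholder GenotypeData object
            "gridparams": None,  # None skips the grid-search code path
            "verbose": 1,
            "n_jobs": 4,
            "initial_strategy": "populations",
            "max_iter": 25,
            "n_estimators": 100,
        },
    )

    # Returns a GenotypeData object with imputed genotypes, plus the best
    # parameters found (None when no grid search is run).
    imputed_gd, best_params = imputer.fit_predict(df)

As a side note on _df2chunks above: a float chunk_size is a fraction of the columns, so with 500 loci and chunk_size=0.25 each chunk holds math.ceil(500 * 0.25) = 125 features, i.e. four chunks.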