pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pg-sui might be problematic.

Files changed (112)
  1. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
  2. pg_sui-1.6.8.dist-info/RECORD +78 -0
  3. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
  5. pg_sui-1.6.8.dist-info/top_level.txt +1 -0
  6. pgsui/__init__.py +35 -54
  7. pgsui/_version.py +34 -0
  8. pgsui/cli.py +635 -0
  9. pgsui/data_processing/config.py +576 -0
  10. pgsui/data_processing/containers.py +1782 -0
  11. pgsui/data_processing/transformers.py +121 -1103
  12. pgsui/electron/app/__main__.py +5 -0
  13. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  14. pgsui/electron/app/icons/icons/128x128.png +0 -0
  15. pgsui/electron/app/icons/icons/16x16.png +0 -0
  16. pgsui/electron/app/icons/icons/24x24.png +0 -0
  17. pgsui/electron/app/icons/icons/256x256.png +0 -0
  18. pgsui/electron/app/icons/icons/32x32.png +0 -0
  19. pgsui/electron/app/icons/icons/48x48.png +0 -0
  20. pgsui/electron/app/icons/icons/512x512.png +0 -0
  21. pgsui/electron/app/icons/icons/64x64.png +0 -0
  22. pgsui/electron/app/icons/icons/icon.icns +0 -0
  23. pgsui/electron/app/icons/icons/icon.ico +0 -0
  24. pgsui/electron/app/main.js +189 -0
  25. pgsui/electron/app/package-lock.json +6893 -0
  26. pgsui/electron/app/package.json +50 -0
  27. pgsui/electron/app/preload.js +15 -0
  28. pgsui/electron/app/server.py +146 -0
  29. pgsui/electron/app/ui/logo.png +0 -0
  30. pgsui/electron/app/ui/renderer.js +130 -0
  31. pgsui/electron/app/ui/styles.css +59 -0
  32. pgsui/electron/app/ui/ui_shim.js +72 -0
  33. pgsui/electron/bootstrap.py +43 -0
  34. pgsui/electron/launch.py +59 -0
  35. pgsui/electron/package.json +14 -0
  36. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  37. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  38. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  39. pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
  40. pgsui/impute/deterministic/imputers/mode.py +679 -0
  41. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  42. pgsui/impute/deterministic/imputers/phylo.py +971 -0
  43. pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
  44. pgsui/impute/supervised/base.py +339 -0
  45. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
  46. pgsui/impute/supervised/imputers/random_forest.py +287 -0
  47. pgsui/impute/unsupervised/base.py +924 -0
  48. pgsui/impute/unsupervised/callbacks.py +89 -263
  49. pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
  50. pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
  51. pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
  52. pgsui/impute/unsupervised/imputers/vae.py +957 -0
  53. pgsui/impute/unsupervised/loss_functions.py +158 -0
  54. pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
  55. pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
  56. pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
  57. pgsui/impute/unsupervised/models/vae_model.py +259 -618
  58. pgsui/impute/unsupervised/nn_scorers.py +215 -0
  59. pgsui/utils/classification_viz.py +591 -0
  60. pgsui/utils/misc.py +35 -480
  61. pgsui/utils/plotting.py +514 -824
  62. pgsui/utils/scorers.py +212 -438
  63. pg_sui-1.0.2.1.dist-info/RECORD +0 -75
  64. pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
  65. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  66. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  67. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  68. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  69. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  70. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  71. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  72. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  73. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  74. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  75. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  76. pgsui/example_data/trees/test.iqtree +0 -376
  77. pgsui/example_data/trees/test.qmat +0 -5
  78. pgsui/example_data/trees/test.rate +0 -2033
  79. pgsui/example_data/trees/test.tre +0 -1
  80. pgsui/example_data/trees/test_n10.rate +0 -19
  81. pgsui/example_data/trees/test_n100.rate +0 -109
  82. pgsui/example_data/trees/test_n500.rate +0 -509
  83. pgsui/example_data/trees/test_siterates.txt +0 -2024
  84. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  85. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  86. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  87. pgsui/example_data/vcf_files/test.vcf +0 -244
  88. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  89. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  90. pgsui/impute/estimators.py +0 -735
  91. pgsui/impute/impute.py +0 -1486
  92. pgsui/impute/simple_imputers.py +0 -1439
  93. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
  94. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
  95. pgsui/impute/unsupervised/keras_classifiers.py +0 -702
  96. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  97. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
  98. pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
  99. pgsui/pg_sui.py +0 -261
  100. pgsui/utils/sequence_tools.py +0 -407
  101. simulation/sim_benchmarks.py +0 -333
  102. simulation/sim_treeparams.py +0 -475
  103. test/__init__.py +0 -0
  104. test/pg_sui_simtest.py +0 -215
  105. test/pg_sui_testing.py +0 -523
  106. test/test.py +0 -297
  107. test/test_pgsui.py +0 -374
  108. test/test_tkc.py +0 -214
  109. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
  110. /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  111. /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  112. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
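The headline change in this diff is the removal of the monolithic Impute driver (pgsui/impute/impute.py, shown in full below) in favor of per-method imputer modules under pgsui/impute/{deterministic,supervised,unsupervised}/imputers/ and a new CLI (pgsui/cli.py). For orientation, here is a minimal sketch of the 1.0.2.1-era entry point being deleted, reconstructed from the docstring of the removed Impute class; the import path and the `genotype_data` / `df_012` placeholders are assumptions, not verified against either release:

    # Sketch only: reconstructs the removed 1.0.2.1 API from the Impute
    # docstring below. The import path and both input objects are assumed.
    from sklearn.ensemble import RandomForestClassifier
    from pgsui.impute.impute import Impute  # module removed in 1.6.8

    imputer = Impute(
        RandomForestClassifier,  # estimator class, not an instance
        "classifier",
        {
            "genotype_data": genotype_data,  # hypothetical GenotypeData instance
            "gridparams": None,              # None disables the grid search
            "verbose": 1,
            "n_jobs": 4,
            "initial_strategy": "populations",
            "max_iter": 25,
            "n_estimators": 100,
        },
    )
    # Returns a GenotypeData-like object plus best parameters (None without a grid search).
    imputed_gd, best_params = imputer.fit_predict(df_012)  # df_012: 012-encoded genotypes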
pgsui/impute/impute.py DELETED
@@ -1,1486 +0,0 @@
- # Standard library imports
- import errno
- import gc
- import math
- import os
- import pprint
- import sys
- from collections import Counter
- from collections import defaultdict
- from operator import itemgetter
- from pathlib import Path
- from statistics import mean, median
- from contextlib import redirect_stdout
- from typing import Optional, Union, List, Dict, Tuple, Any, Callable
-
- import warnings
-
- warnings.simplefilter(action="ignore", category=FutureWarning)
-
- # Third party imports
- import numpy as np
- import pandas as pd
-
- from scipy import stats as st
-
- # from memory_profiler import memory_usage
-
- # Scikit-learn imports
- from sklearn.experimental import enable_iterative_imputer
- from sklearn import metrics
-
- from sklearn_genetic.space import Continuous, Categorical, Integer
-
- # Custom module imports
- try:
-     from .supervised.iterative_imputer_gridsearch import (
-         IterativeImputerGridSearch,
-     )
-     from .supervised.iterative_imputer_fixedparams import (
-         IterativeImputerFixedParams,
-     )
-     from .unsupervised.neural_network_imputers import VAE, UBP, SAE
-     from ..utils.misc import isnotebook
-     from ..utils.misc import timer
-     from ..data_processing.transformers import (
-         SimGenotypeDataTransformer,
-     )
- except (ModuleNotFoundError, ValueError, ImportError):
-     from impute.supervised.iterative_imputer_gridsearch import (
-         IterativeImputerGridSearch,
-     )
-     from impute.supervised.iterative_imputer_fixedparams import (
-         IterativeImputerFixedParams,
-     )
-     from impute.unsupervised.neural_network_imputers import VAE, UBP, SAE
-     from utils.misc import isnotebook
-     from utils.misc import timer
-     from data_processing.transformers import (
-         SimGenotypeDataTransformer,
-     )
-
- is_notebook = isnotebook()
-
- if is_notebook:
-     from tqdm.notebook import tqdm as progressbar
- else:
-     from tqdm import tqdm as progressbar
-
-
- class Impute:
-     """Class to impute missing data with the provided classifier.
-
-     The Impute class will either run a variational autoencoder or IterativeImputer with the provided estimator. The settings for the provided estimator should be passed as the ``kwargs`` argument: a dictionary with the estimator's keyword arguments as the keys and the corresponding values, e.g., ``kwargs={"n_jobs": 4, "initial_strategy": "populations"}``\. ``clf_type`` specifies either "classifier" or "regressor"; "regressor" is primarily for quick and dirty testing and is intended for internal use only.
-
-     Once the Impute class is initialized, the imputation should be performed with ``fit_predict()``\.
-
-     The imputed data can then be written to a file with ``write_imputed()``\.
-
-     Args:
-         clf (str or Callable estimator object): The estimator object to use. If using a variational autoencoder, the provided value should be "VAE". Otherwise, it should be a callable estimator object that is compatible with scikit-learn's IterativeImputer.
-
-         clf_type (str): Specify whether to use a "classifier" or "regressor". The "regressor" option is just for quick and dirty testing, and "classifier" should almost always be used.
-
-         kwargs (Dict[str, Any]): Settings to use with the estimator. The keys should be the estimator's keywords, and the values should be their corresponding settings.
-
-     Raises:
-         TypeError: Check whether the ``gridparams`` values are of the correct format if ``ga=True`` or ``ga=False``\.
-
-     Examples:
-         # Don't use parentheses after the estimator object.
-         >>> imputer = Impute(sklearn.ensemble.RandomForestClassifier,
-         ...     "classifier",
-         ...     {"n_jobs": 4, "initial_strategy": "populations", "max_iter": 25, "n_estimators": 100, "ga": True})
-         >>> self.imputed, self.best_params = imputer.fit_predict(df)
-         >>> imputer.write_imputed(self.imputed)
-         >>> print(self.imputed)
-         [[0, 0, 0, 0],
-         [0, 0, 0, 0],
-         [0, 1, 1, 0],
-         [2, 1, 2, 2]]
-     """
-
-     def __init__(
-         self, clf: Union[str, Callable], clf_type: str, kwargs: Dict[str, Any]
-     ) -> None:
-         self.clf = clf
-         self.clf_type = clf_type
-         self.original_num_cols = None
-
-         if self.clf == VAE or self.clf == SAE or self.clf == UBP:
-             self.algorithm = "nn"
-             self.imp_method = "Unsupervised"
-         else:
-             self.algorithm = "ii"
-             self.imp_method = "Supervised"
-
-         if "nlpca" in kwargs:
-             nlpca = kwargs.pop("nlpca")
-             if nlpca:
-                 self.imp_name = "NLPCA"
-                 self.nlpca = True
-             else:
-                 self.imp_name = "UBP"
-                 self.nlpca = False
-         else:
-             self.imp_name = self.clf.__name__
-
-         try:
-             self.pops = kwargs["genotype_data"].populations
-         except AttributeError:
-             self.pops = None
-
-         self.genotype_data = kwargs["genotype_data"]
-         self.verbose = kwargs["verbose"]
-
-         # Separate local variables into settings objects
-         (
-             self.imp_kwargs,
-             self.clf_kwargs,
-             self.ga_kwargs,
-             self.cv,
-             self.verbose,
-             self.n_jobs,
-             self.prefix,
-             self.column_subset,
-             self.disable_progressbar,
-             self.chunk_size,
-             self.do_validation,
-             self.do_gridsearch,
-             self.testing,
-         ) = self._gather_impute_settings(kwargs)
-
-         if self.algorithm == "ii":
-             self.imp_kwargs["pops"] = self.pops
-
-         if self.do_gridsearch:
-             for v in kwargs["gridparams"].values():
-                 if (
-                     isinstance(v, (Categorical, Integer, Continuous))
-                     and kwargs["gridsearch_method"].lower()
-                     != "genetic_algorithm"
-                 ):
-                     raise TypeError(
-                         "gridsearch_method argument must equal 'genetic_algorithm' if gridparams values are of type sklearn_genetic.space"
-                     )
-
-         self.logfilepath = os.path.join(
-             f"{self.prefix}_output",
-             "logs",
-             self.imp_method,
-             self.imp_name,
-             "imputer_progress_log.txt",
-         )
-
-         self.invalid_indexes = None
-
-         # Remove logfile if it exists
-         try:
-             os.remove(self.logfilepath)
-         except OSError:
-             pass
-
-         Path(
-             os.path.join(
-                 f"{self.prefix}_output",
-                 "plots",
-                 self.imp_method,
-                 self.imp_name,
-             )
-         ).mkdir(parents=True, exist_ok=True)
-
-         Path(
-             os.path.join(
-                 f"{self.prefix}_output", "logs", self.imp_method, self.imp_name
-             )
-         ).mkdir(parents=True, exist_ok=True)
-
-         Path(
-             os.path.join(
-                 f"{self.prefix}_output",
-                 "reports",
-                 self.imp_method,
-                 self.imp_name,
-             )
-         ).mkdir(parents=True, exist_ok=True)
-
-         Path(
-             os.path.join(
-                 f"{self.prefix}_output",
-                 "alignments",
-                 self.imp_method,
-                 self.imp_name,
-             )
-         ).mkdir(parents=True, exist_ok=True)
-
-     @timer
-     def fit_predict(
-         self, X: pd.DataFrame
-     ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
-         """Fit and predict imputations with IterativeImputer(estimator).
-
-         Fits and predicts imputed 012-encoded genotypes using IterativeImputer with any of the supported estimator objects. If ``gridparams=None``\, then a grid search is not performed. If ``gridparams!=None``\, then a RandomizedSearchCV is performed on a subset of the data and a final imputation is done on the whole dataset using the best found parameters.
-
-         Args:
-             X (pandas.DataFrame): DataFrame with 012-encoded genotypes.
-
-         Returns:
-             GenotypeData: GenotypeData object with missing genotypes imputed.
-             Dict[str, Any]: Best parameters found during grid search.
-         """
-
-         # Test if the output file can be written to
-         try:
-             outfile = os.path.join(
-                 f"{self.prefix}_output",
-                 "alignments",
-                 self.imp_method,
-                 self.imp_name,
-                 "imputed_012.csv",
-             )
-
-             with open(outfile, "w") as fout:
-                 pass
-         except IOError as e:
-             print(f"Error: {e.errno}, {e.strerror}")
-             if e.errno == errno.EACCES:
-                 sys.exit(f"Permission denied: Cannot write to {outfile}")
-             elif e.errno == errno.EISDIR:
-                 sys.exit(f"Could not write to {outfile}; It is a directory")
-
-         # mem_usage = memory_usage((self._impute_single, (X,)))
-         # with open(f"profiling_results/memUsage_{self.prefix}.txt", "w") as fout:
-         #     fout.write(f"{max(mem_usage)}")
-         # sys.exit()
-
-         # Don't do a grid search
-         if not self.do_gridsearch:
-             imputed_df, df_scores, best_params = self._impute_single(X)
-
-             if df_scores is not None:
-                 self._print_scores(df_scores)
-
-         # Do a grid search and get the transformed data with the best parameters
-         else:
-             imputed_df, df_scores, best_params = self._impute_gridsearch(X)
-
-             if self.verbose > 0:
-                 print("\nBest Parameters:")
-                 pprint.pprint(best_params)
-
-         imp_data = self._imputed2genotypedata(imputed_df, self.genotype_data)
-
-         print("\nDone!\n")
-         return imp_data, best_params
-
-     def _df2chunks(
-         self, df: pd.DataFrame, chunk_size: Union[int, float]
-     ) -> List[pd.DataFrame]:
-         """Break up pandas.DataFrame into chunks and impute chunks.
-
-         If set to 1.0 of type float, then returns only one chunk containing all the data.
-
-         Args:
-             df (pandas.DataFrame): DataFrame to split into chunks.
-
-             chunk_size (int or float): If type is integer, then breaks DataFrame into ``chunk_size`` chunks. If type is float, breaks DataFrame up into ``chunk_size * len(df.columns)`` chunks.
-
-         Returns:
-             List[pandas.DataFrame]: List of pandas DataFrames of shape (n_samples, n_features_in_chunk).
-
-         Raises:
-             ValueError: ``chunk_size`` must be of type int or float.
-         """
-         if (
-             "initial_strategy" in self.imp_kwargs
-             and self.imp_kwargs["initial_strategy"] == "phylogeny"
-             and chunk_size != 1.0
-         ):
-             print(
-                 "WARNING: Chunking is not supported with initial_strategy == "
-                 "'phylogeny'; Setting chunk_size to 1.0 and imputing entire "
-                 "dataset"
-             )
-
-             chunk_size = 1.0
-
-         if (
-             "initial_strategy" in self.imp_kwargs
-             and self.imp_kwargs["initial_strategy"] == "mf"
-             and chunk_size != 1.0
-         ):
-             print(
-                 "WARNING: Chunking is not supported with initial_strategy == "
-                 "'mf'; Setting chunk_size to 1.0 and imputing entire "
-                 "dataset"
-             )
-
-             chunk_size = 1.0
-
-         if isinstance(chunk_size, (int, float)):
-             chunks = list()
-             df_cp = df.copy()
-
-             if isinstance(chunk_size, float):
-                 if chunk_size > 1.0:
-                     raise ValueError(
-                         f"If chunk_size is of type float, must be "
-                         f"between 0.0 and 1.0; Value supplied was {chunk_size}"
-                     )
-
-                 elif chunk_size == 1.0:
-                     # All data in one chunk
-                     chunks.append(df_cp)
-                     if self.verbose > 1:
-                         print(
-                             "Imputing all features at once since chunk_size is "
-                             "set to 1.0"
-                         )
-
-                     return chunks
-
-                 tmp = chunk_size
-                 chunk_size = None
-                 chunk_size = math.ceil(len(df.columns) * tmp)
-
-         else:
-             raise ValueError(
-                 f"chunk_size must be of type float or integer, "
-                 f"but type {type(chunk_size)} was passed"
-             )
-
-         chunk_len_list = list()
-         num_chunks = math.ceil(len(df.columns) / chunk_size)
-         for i in range(num_chunks):
-             chunks.append(df_cp.iloc[:, i * chunk_size : (i + 1) * chunk_size])
-             chunk_len_list.append(len(chunks[i].columns))
-
-         chunk_len = ",".join([str(x) for x in chunk_len_list])
-
-         if self.verbose > 1:
-             print(
-                 f"Data split into {num_chunks} chunks with {chunk_len} features"
-             )
-
-         return chunks
-
-     def _imputed2genotypedata(self, imp012, genotype_data):
-         """Create new instance of GenotypeData object from imputed DataFrame.
-
-         The imputed, decoded DataFrame gets written to file and re-loaded to instantiate a new GenotypeData object.
-
-         Args:
-             imp012 (pandas.DataFrame): Imputed 012-encoded DataFrame.
-
-             genotype_data (GenotypeData): Original GenotypeData object to load attributes from.
-
-         Returns:
-             GenotypeData: GenotypeData object with imputed data.
-         """
-         imputed_gd = genotype_data.copy()
-
-         # if self.clf == VAE:
-         if len(imp012.shape) == 3:
-             if imp012.shape[-1] == 4:
-                 imputed_gd.genotypes_onehot = imp012
-             else:
-                 raise ValueError("Invalid shape for imputed output.")
-         elif len(imp012.shape) == 2:
-             if isinstance(imp012, pd.DataFrame):
-                 imp012 = imp012.to_numpy()
-             imp012 = imp012.astype(int)
-             if np.max(imp012) > 2:
-                 imputed_gd.genotypes_int = imp012
-             else:
-                 imputed_gd.genotypes_012 = imp012
-         else:
-             raise ValueError(
-                 f"Invalid shape for imputed output: {imp012.shape}"
-             )
-         # else:
-         #     imputed_gd.genotypes_012 = imp012
-
-         return imputed_gd
-
-     def _subset_data_for_gridsearch(
-         self,
-         df: pd.DataFrame,
-         columns_to_subset: Union[int, float],
-         original_num_cols: int,
-     ) -> Tuple[pd.DataFrame, np.ndarray]:
-         """Randomly subset a pandas.DataFrame.
-
-         Subset pandas DataFrame with ``column_percent`` fraction of the data. Allows for faster validation.
-
-         Args:
-             df (pandas.DataFrame): DataFrame with 012-encoded genotypes.
-
-             columns_to_subset (int or float): If float, proportion of DataFrame to randomly subset; should be between 0 and 1. If integer, subsets ``columns_to_subset`` random columns.
-
-             original_num_cols (int): Number of columns in original DataFrame.
-
-         Returns:
-             pandas.DataFrame: New DataFrame with random subset of features.
-             numpy.ndarray: Sorted numpy array of column indices to keep.
-
-         Raises:
-             TypeError: column_subset must be of type float or int.
-         """
-
-         # Get a random numpy array of column names to select
-         if isinstance(columns_to_subset, float):
-             n = int(original_num_cols * columns_to_subset)
-         elif isinstance(columns_to_subset, int):
-             n = columns_to_subset
-         else:
-             raise TypeError(
-                 f"column_subset must be of type float or int, "
-                 f"but got {type(columns_to_subset)}"
-             )
-
-         col_arr = np.array(df.columns)
-
-         if n > len(df.columns):
-             if self.verbose > 0:
-                 print(
-                     "Warning: column_subset is greater than remaining columns following filtering. Using all columns"
-                 )
-
-             df_sub = df.copy()
-             cols = col_arr.copy()
-         else:
-             cols = np.random.choice(col_arr, n, replace=False)
-             df_sub = df.loc[:, np.sort(cols)]
-             # df_sub = df.sample(n=n, axis="columns", replace=False)
-
-         df_sub.columns = df_sub.columns.astype(str)
-
-         return df_sub, np.sort(cols)
-
-     def _print_scores(self, df_scores: pd.DataFrame) -> None:
-         """Print validation scores as pandas.DataFrame.
-
-         Args:
-             df_scores (pandas.DataFrame): DataFrame with score statistics.
-         """
-         if self.verbose > 0:
-             print("Validation scores:")
-             print(df_scores)
-
-     def _write_imputed_params_score(
-         self, df_scores: pd.DataFrame, best_params: Dict[str, Any]
-     ) -> None:
-         """Save best_score and best_params to files on disk.
-
-         Args:
-             df_scores (pandas.DataFrame or float): Best RMSE or accuracy score for the regressor or classifier, respectively.
-
-             best_params (dict): Best parameters found in grid search.
-         """
-
-         best_score_outfile = os.path.join(
-             f"{self.prefix}_output",
-             "reports",
-             self.imp_method,
-             self.imp_name,
-             "imputed_best_score.csv",
-         )
-         best_params_outfile = os.path.join(
-             f"{self.prefix}_output",
-             "reports",
-             self.imp_method,
-             self.imp_name,
-             "imputed_best_params.csv",
-         )
-
-         if isinstance(df_scores, pd.DataFrame):
-             df_scores.to_csv(
-                 best_score_outfile,
-                 header=True,
-                 index=False,
-                 float_format="%.2f",
-             )
-
-         else:
-             with open(best_score_outfile, "w") as fout:
-                 fout.write(f"accuracy,{df_scores}\n")
-
-         with open(best_params_outfile, "w") as fout:
-             fout.write("parameter,best_value\n")
-             for k, v in best_params.items():
-                 fout.write(f"{k},{v}\n")
-
-     def _impute_single(
-         self, df: pd.DataFrame
-     ) -> Tuple[pd.DataFrame, pd.DataFrame, None]:
-         """Run IterativeImputer without a grid search.
-
-         Will do a different type of validation if ``do_validation == True``\.
-
-         Args:
-             df (pandas.DataFrame): DataFrame of 012-encoded genotypes.
-
-         Returns:
-             pandas.DataFrame: Imputed DataFrame of 012-encoded genotypes.
-             pandas.DataFrame: DataFrame with validation scores.
-             NoneType: Only used with _impute_gridsearch. Set to None here for compatibility.
-         """
-         if self.verbose > 0:
-             print(
-                 f"\nDoing {self.clf.__name__} imputation without grid search..."
-             )
-
-         if self.algorithm == "nn":
-             clf = None
-
-         else:
-             if "early_stop_gen" in self.clf_kwargs:
-                 self.clf_kwargs.pop("early_stop_gen")
-             clf = self.clf(**self.clf_kwargs)
-
-         if self.do_validation:
-             if self.verbose > 0:
-                 print(f"Estimating {self.clf.__name__} validation scores...")
-
-             if self.disable_progressbar:
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(
-                             f"Doing {self.clf.__name__} imputation "
-                             f"without grid search...\n"
-                         )
-
-                         if self.verbose > 0:
-                             print(
-                                 f"Estimating {self.clf.__name__} "
-                                 f"validation scores...\n"
-                             )
-
-             df_scores = self._imputer_validation(df, clf)
-
-             if self.verbose > 0:
-                 print(f"\nDone with {self.clf.__name__} validation!\n")
-
-             if self.disable_progressbar:
-                 if self.verbose > 0:
-                     with open(self.logfilepath, "a") as fout:
-                         # Redirect to progress logfile
-                         with redirect_stdout(fout):
-                             print(
-                                 f"\nDone with {self.clf.__name__} validation!\n"
-                             )
-
-         else:
-             df_scores = None
-
-         if self.algorithm == "nn":
-             imputer = None
-
-         else:
-             imputer = self._define_iterative_imputer(
-                 clf,
-                 self.logfilepath,
-                 clf_kwargs=self.clf_kwargs,
-                 imp_kwargs=self.imp_kwargs,
-             )
-
-         if self.original_num_cols is None:
-             self.original_num_cols = len(df.columns)
-
-         # Remove non-biallelic loci
-         # Only used if initial_strategy == 'phylogeny'
-         if self.invalid_indexes is not None:
-             df.drop(
-                 labels=self.invalid_indexes,
-                 axis=1,
-                 inplace=True,
-             )
-
-         if self.disable_progressbar:
-             if self.verbose > 0:
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(f"Doing {self.clf.__name__} imputation...\n")
-
-         df_chunks = self._df2chunks(df, self.chunk_size)
-         imputed_df = self._impute_df(df_chunks, imputer)
-         imputed_df = imputed_df.astype(str)
-
-         if self.disable_progressbar:
-             if self.verbose > 0:
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(f"\nDone with {self.clf.__name__} imputation!\n")
-
-         lst2del = [df_chunks]
-         del lst2del
-         gc.collect()
-
-         self._validate_imputed(imputed_df)
-
-         if self.verbose > 0:
-             print(f"\nDone with {self.clf.__name__} imputation!\n")
-
-         return imputed_df, df_scores, None
-
-     def _impute_gridsearch(
-         self, df: pd.DataFrame
-     ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any]]:
-         """Do IterativeImputer with RandomizedSearchCV or GASearchCV.
-
-         Args:
-             df (pandas.DataFrame): DataFrame with 012-encoded genotypes.
-
-         Returns:
-             pandas.DataFrame: DataFrame with 012-encoded genotypes imputed using the best parameters found with the grid search.
-             float: Absolute value of best score found during the grid search.
-             dict: Best parameters found during the grid search.
-         """
-         original_num_cols = len(df.columns)
-         df_subset, cols_to_keep = self._subset_data_for_gridsearch(
-             df, self.column_subset, original_num_cols
-         )
-
-         print(f"Doing {self.clf.__name__} grid search...")
-
-         if self.verbose > 0:
-             print(f"Validation dataset size: {len(df_subset.columns)}\n")
-
-         if self.disable_progressbar:
-             with open(self.logfilepath, "a") as fout:
-                 # Redirect to progress logfile
-                 with redirect_stdout(fout):
-                     print(f"Doing {self.clf.__name__} grid search...\n")
-
-         if self.algorithm == "nn":
-             if self.clf == UBP:
-                 self.clf_kwargs["nlpca"] = self.nlpca
-             self.imp_kwargs.pop("str_encodings")
-             imputer = self.clf(
-                 **self.clf_kwargs,
-                 **self.imp_kwargs,
-                 ga_kwargs=self.ga_kwargs,
-             )
-
-             df_imp = pd.DataFrame(
-                 imputer.fit_transform(df_subset), columns=cols_to_keep
-             )
-
-             df_imp = df_imp.astype("float")
-             df_imp = df_imp.astype("int64")
-
-         else:
-             if "early_stop_gen" in self.clf_kwargs:
-                 self.clf_kwargs.pop("early_stop_gen")
-             clf = self.clf()
-             df_subset = df_subset.astype("float32")
-             df_subset.replace(-9.0, np.nan, inplace=True)
-
-             imputer = self._define_iterative_imputer(
-                 clf,
-                 self.logfilepath,
-                 clf_kwargs=self.clf_kwargs,
-                 ga_kwargs=self.ga_kwargs,
-                 n_jobs=self.n_jobs,
-                 clf_type=self.clf_type,
-                 imp_kwargs=self.imp_kwargs,
-             )
-
-             if len(cols_to_keep) == original_num_cols:
-                 cols_to_keep = None
-
-             Xt, params_list, score_list = imputer.fit_transform(
-                 df, cols_to_keep
-             )
-
-         if self.verbose > 0:
-             print(f"\nDone with {self.clf.__name__} grid search!")
-
-         if self.disable_progressbar:
-             if self.verbose > 0:
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(
-                             f"\nDone with {self.clf.__name__} grid search!"
-                         )
-
-         if self.algorithm == "ii":
-             # Iterative Imputer.
-             del imputer
-             del Xt
-
-             # Average or mode of best parameters
-             # and write them to a file
-             best_params = self._get_best_params(params_list)
-
-             avg_score = mean(abs(x) for x in score_list if x != -9)
-             median_score = median(abs(x) for x in score_list if x != -9)
-             max_score = max(abs(x) for x in score_list if x != -9)
-             min_score = min(abs(x) for x in score_list if x != -9)
-
-             df_scores = pd.DataFrame(
-                 {
-                     "Mean": avg_score,
-                     "Median": median_score,
-                     "Min": min_score,
-                     "Max": max_score,
-                 },
-                 index=[0],
-             )
-
-             df_scores = df_scores.round(2)
-
-             del avg_score
-             del median_score
-             del max_score
-             del min_score
-             gc.collect()
-         else:
-             # Using neural network.
-             best_params = imputer.best_params_
-             df_scores = imputer.best_score_
-             df_scores = round(df_scores, 2) * 100
-             best_imputer = None
-
-         if self.clf_type == "classifier" and self.algorithm != "nn":
-             df_scores = df_scores.apply(lambda x: x * 100)
-
-         self._write_imputed_params_score(df_scores, best_params)
-
-         # Change values to the ones in best_params
-         self.clf_kwargs.update(best_params)
-
-         if self.algorithm == "ii":
-             if hasattr(self.clf(), "n_jobs"):
-                 self.clf_kwargs["n_jobs"] = self.n_jobs
-
-             best_clf = self.clf(**self.clf_kwargs)
-
-         gc.collect()
-
-         if self.verbose > 0:
-             print(
-                 f"\nDoing {self.clf.__name__} imputation "
-                 f"with best found parameters...\n"
-             )
-
-         if self.disable_progressbar:
-             with open(self.logfilepath, "a") as fout:
-                 # Redirect to progress logfile
-                 with redirect_stdout(fout):
-                     print(
-                         f"\nDoing {self.clf.__name__} imputation "
-                         f"with best found parameters...\n"
-                     )
-
-         if self.algorithm == "ii":
-             best_imputer = self._define_iterative_imputer(
-                 best_clf,
-                 self.logfilepath,
-                 clf_kwargs=self.clf_kwargs,
-                 imp_kwargs=self.imp_kwargs,
-             )
-
-         final_cols = None
-         if len(df.columns) < original_num_cols:
-             final_cols = np.array(df.columns)
-
-         if self.algorithm == "nn" and self.column_subset == 1.0:
-             imputed_df = df_imp.copy()
-             df_chunks = None
-         else:
-             df_chunks = self._df2chunks(df, self.chunk_size)
-             imputed_df = self._impute_df(
-                 df_chunks, best_imputer, cols_to_keep=final_cols
-             )
-
-         lst2del = [df_chunks, df]
-         del lst2del
-         gc.collect()
-
-         self._validate_imputed(imputed_df)
-
-         if self.verbose > 0:
-             print(f"Done with {self.clf.__name__} imputation!\n")
-
-         if self.disable_progressbar:
-             with open(self.logfilepath, "a") as fout:
-                 # Redirect to progress logfile
-                 with redirect_stdout(fout):
-                     print(f"Done with {self.clf.__name__} imputation!\n")
-
-         return imputed_df, df_scores, best_params
-
-     def _imputer_validation(
-         self, df: pd.DataFrame, clf: Optional[Callable]
-     ) -> pd.DataFrame:
-         """Validate imputation with a validation test set.
-
-         Validates imputation by running it on a validation test set ``cv`` times. Actual missing values are imputed with sklearn.impute.SimpleImputer, and then missing values are randomly introduced to known genotypes. The dataset with no missing data is compared to the dataset with known missing data to obtain validation scores.
-
-         Args:
-             df (pandas.DataFrame): 012-encoded genotypes to impute.
-
-             clf (sklearn classifier instance or None): sklearn classifier instance with which to run the imputation.
-
-         Raises:
-             ValueError: If none of the scores could be estimated and the reps variable is empty.
-
-         Returns:
-             pandas.DataFrame: Validation scores in a pandas DataFrame object. Contains the scoring metric, mean, median, minimum, and maximum validation scores among all features, and the lower and upper 95% confidence interval among the replicates.
-         """
-         reps = defaultdict(list)
-         for cnt, rep in enumerate(
-             progressbar(
-                 range(self.cv),
-                 desc="Validation replicates: ",
-                 leave=True,
-                 disable=self.disable_progressbar,
-             ),
-             start=1,
-         ):
-             if self.disable_progressbar:
-                 perc = int((cnt / self.cv) * 100)
-                 if self.verbose > 0:
-                     print(f"Validation replicate {cnt}/{self.cv} ({perc}%)")
-
-                 with open(self.logfilepath, "a") as fout:
-                     # Redirect to progress logfile
-                     with redirect_stdout(fout):
-                         print(
-                             f"Validation replicate {cnt}/{self.cv} ({perc}%)"
-                         )
-
-             scores = self._impute_eval(df, clf)
-
-             for k, score_list in scores.items():
-                 score_list_filtered = filter(lambda x: x != -9, score_list)
-
-                 if score_list_filtered:
-                     reps[k].append(score_list_filtered)
-                 else:
-                     continue
-
-         if not reps:
-             raise ValueError("None of the features could be validated!")
-
-         ci_lower = dict()
-         ci_upper = dict()
-         for k, v in reps.items():
-             reps_t = np.array(v).T.tolist()
-
-             cis = list()
-             if len(reps_t) > 1:
-                 for rep in reps_t:
-                     rep = [abs(x) for x in rep]
-
-                     cis.append(
-                         st.t.interval(
-                             alpha=0.95,
-                             df=len(rep) - 1,
-                             loc=np.mean(rep),
-                             scale=st.sem(rep),
-                         )
-                     )
-
-                 ci_lower[k] = mean(x[0] for x in cis)
-                 ci_upper[k] = mean(x[1] for x in cis)
-             else:
-                 print(
-                     "Warning: There was no variance among replicates; "
-                     "the 95% CI could not be calculated"
-                 )
-
-                 ci_lower[k] = np.nan
-                 ci_upper[k] = np.nan
-
-         results_list = list()
-         for k, score_list in scores.items():
-             avg_score = mean(abs(x) for x in score_list if x != -9)
-             median_score = median(abs(x) for x in score_list if x != -9)
-             max_score = max(abs(x) for x in score_list if x != -9)
-             min_score = min(abs(x) for x in score_list if x != -9)
-
-             results_list.append(
-                 {
-                     "Metric": k,
-                     "Mean": avg_score,
-                     "Median": median_score,
-                     "Min": min_score,
-                     "Max": max_score,
-                     "Lower 95% CI": ci_lower[k],
-                     "Upper 95% CI": ci_upper[k],
-                 }
-             )
-
-         df_scores = pd.DataFrame(results_list)
-
-         if self.clf_type == "classifier":
-             columns_list = [
-                 "Mean",
-                 "Median",
-                 "Min",
-                 "Max",
-                 "Lower 95% CI",
-                 "Upper 95% CI",
-             ]
-
-             df_scores = df_scores.round(2)
-
-         outfile = os.path.join(
-             f"{self.prefix}_output",
-             "reports",
-             self.imp_method,
-             self.imp_name,
-             "imputed_best_score.csv",
-         )
-         df_scores.to_csv(outfile, header=True, index=False)
-
-         del results_list
-         gc.collect()
-
-         return df_scores
-
-     def _impute_df(
-         self,
-         df_chunks: List[pd.DataFrame],
-         imputer: Optional[
-             Union[IterativeImputerFixedParams, IterativeImputerGridSearch]
-         ] = None,
-         cols_to_keep: Optional[np.ndarray] = None,
-     ) -> pd.DataFrame:
-         """Impute list of pandas.DataFrame objects using custom IterativeImputer class.
-
-         The DataFrames are chunks of the whole input data, with each chunk corresponding to ``chunk_size`` features from ``_df2chunks()``\.
-
-         Args:
-             df_chunks (List[pandas.DataFrame]): List of DataFrames of shape (n_samples, n_features_in_chunk).
-
-             imputer (imputer or classifier instance or None): Imputer or classifier instance to perform the imputation.
-
-             cols_to_keep (numpy.ndarray or None): Final bi-allelic columns to keep. If some columns were non-biallelic, it will be a subset of columns.
-
-         Returns:
-             pandas.DataFrame: Single DataFrame object, with all the imputed chunks concatenated together.
-         """
-         imputed_chunks = list()
-         for i, Xchunk in enumerate(df_chunks, start=1):
-             if self.clf_type == "classifier":
-                 if self.algorithm == "nn":
-                     if self.clf == VAE:
-                         self.clf_kwargs["testing"] = self.testing
-                     if self.clf == UBP:
-                         self.clf_kwargs["nlpca"] = self.nlpca
-                     imputer = self.clf(
-                         genotype_data=self.imp_kwargs["genotype_data"],
-                         disable_progressbar=self.disable_progressbar,
-                         prefix=self.prefix,
-                         **self.clf_kwargs,
-                     )
-                     df_imp = pd.DataFrame(
-                         imputer.fit_transform(Xchunk),
-                     )
-                     df_imp = df_imp.astype("float")
-                     df_imp = df_imp.astype("Int8")
-
-                 else:
-                     imp, _, __ = imputer.fit_transform(
-                         Xchunk, valid_cols=cols_to_keep
-                     )
-                     df_imp = pd.DataFrame(imp)
-
-                 imputed_chunks.append(df_imp)
-
-             else:
-                 # Regressor. Needs to be rounded to integer first.
-                 imp, _, __ = imputer.fit_transform(
-                     Xchunk, valid_cols=cols_to_keep
-                 )
-                 df_imp = pd.DataFrame(imp)
-                 df_imp = df_imp.round(0).astype("Int8")
-
-                 imputed_chunks.append(df_imp)
-
-         concat_df = pd.concat(imputed_chunks, axis=1)
-
-         del imputed_chunks
-         gc.collect()
-
-         return concat_df
-
-     def _validate_imputed(self, df: pd.DataFrame) -> None:
-         """Assert that there is no missing data left in the imputed DataFrame.
-
-         Args:
-             df (pandas.DataFrame): DataFrame with imputed 012-encoded genotypes.
-
-         Raises:
-             AssertionError: Error if missing values are still found in the dataset after imputation.
-         """
-         assert (
-             not df.isnull().values.any()
-         ), "Imputation failed...Missing values found in the imputed dataset"
-
-     def _get_best_params(
-         self, params_list: List[Dict[str, Any]]
-     ) -> Dict[str, Any]:
-         """Get the best parameters from the grid search.
-
-         Determines the parameter types and takes either the mean (numeric types) or the mode (string and boolean types).
-
-         Args:
-             params_list (List[dict]): List of grid search parameter values.
-
-         Returns:
-             Dict[str, Any]: Dictionary with parameters as keys and their best values.
-         """
-         best_params = dict()
-         keys = list(params_list[0].keys())
-         first_key = keys[0]
-
-         params_list = list(filter(lambda i: i[first_key] != -9, params_list))
-
-         for k in keys:
-             if all(
-                 isinstance(x[k], (int, float)) for x in params_list if x[k]
-             ):
-                 if all(isinstance(y[k], int) for y in params_list):
-                     best_params[k] = self._average_list_of_dicts(
-                         params_list, k, is_int=True
-                     )
-
-                 elif all(isinstance(z[k], float) for z in params_list):
-                     best_params[k] = self._average_list_of_dicts(
-                         params_list, k
-                     )
-
-             elif all(isinstance(x[k], (str, bool)) for x in params_list):
-                 best_params[k] = self._mode_list_of_dicts(params_list, k)
-
-             else:
-                 best_params[k] = self._mode_list_of_dicts(params_list, k)
-
-         return best_params
-
-     def _mode_list_of_dicts(
-         self, l: List[Dict[str, Union[str, bool]]], k: str
-     ) -> str:
-         """Get mode for key k in a list of dictionaries.
-
-         Args:
-             l (List[dict]): List of dictionaries.
-             k (str): Key to find the mode across all dictionaries in l.
-
-         Returns:
-             str or bool: Most common value across the list of dictionaries for one key.
-         """
-         k_count = Counter(map(itemgetter(k), l))
-         return k_count.most_common()[0][0]
-
-     def _average_list_of_dicts(
-         self,
-         l: List[Dict[str, Union[int, float]]],
-         k: str,
-         is_int: bool = False,
-     ) -> Union[int, float]:
-         """Get average of a given key in a list of dictionaries.
-
-         Args:
-             l (List[Dict[str, Union[int, float]]]): List of dictionaries.
-
-             k (str): Key to find the average across the list of dictionaries.
-
-             is_int (bool, optional): Whether the value for key k is an integer. If False, it is expected to be of type float. Defaults to False.
-
-         Returns:
-             int or float: Average of the given key across the list of dictionaries.
-         """
-         if is_int:
-             return int(sum(d[k] for d in l) / len(l))
-         else:
-             return sum(d[k] for d in l) / len(l)
-
-     def _gather_impute_settings(
-         self, kwargs: Dict[str, Any]
-     ) -> Tuple[
-         Optional[Dict[str, Any]],
-         Optional[Dict[str, Any]],
-         Optional[Dict[str, Any]],
-         Optional[int],
-         Optional[int],
-         Optional[int],
-         Optional[str],
-         Optional[Union[int, float]],
-         Optional[bool],
-         Optional[Union[int, float]],
-         Optional[bool],
-         Optional[bool],
-     ]:
-         """Gather impute settings from kwargs object.
-
-         Gathers impute settings from the various imputation classes and IterativeImputer for use with the ``Impute`` class. Returns a dictionary with keys as keyword arguments and the values as the settings. The imputation can then be run by specifying IterativeImputer(imp_kwargs).
-
-         Args:
-             kwargs (Dict[str, Any]): Dictionary with keys as the keyword arguments and their corresponding values.
-
-         Returns:
-             Dict[str, Any]: IterativeImputer keyword arguments.
-             Dict[str, Any]: Classifier keyword arguments.
-             Dict[str, Any]: Genetic algorithm keyword arguments.
-             int: Number of cross-validation folds to use with non-grid search validation.
-             int: Verbosity setting. 0 is silent, 2 is most verbose.
-             int: Number of processors to use with grid search.
-             str or None: Prefix for output files.
-             int or float: Proportion of dataset (if float) or number of columns (if int) to use for grid search.
-             bool: If True, disables the tqdm progress bar and just prints status updates to a file. If False, uses the tqdm progress bar.
-             int or float: Chunk sizes for doing full imputation following the grid search. If int, then splits into chunks of ``chunk_size``\. If float, then splits into chunks of ``n_features * chunk_size``\.
-             bool: Whether to do validation if ``gridparams is None``.
-             bool: True if doing grid search, False otherwise.
-         """
-         n_jobs = kwargs.pop("n_jobs", 1)
-         cv = kwargs.pop("cv", None)
-         column_subset = kwargs.pop("column_subset", None)
-         chunk_size = kwargs.pop("chunk_size", 1.0)
-         do_validation = kwargs.pop("do_validation", False)
-         verbose = kwargs.get("verbose", 0)
-         disable_progressbar = kwargs.get("disable_progressbar", False)
-         prefix = kwargs.get("prefix", "imputer")
-         testing = kwargs.get("testing", False)
-         do_gridsearch = False if kwargs["gridparams"] is None else True
-
-         if prefix is None:
-             prefix = "imputer"
-
-         imp_kwargs = kwargs.copy()
-         clf_kwargs = kwargs.copy()
-         ga_kwargs = kwargs.copy()
-
-         imp_keys = [
-             "grid_iter",
-             "tol",
-             "verbose",
-             "genotype_data",
-             "str_encodings",
-             "progress_update_percent",
-             "sim_strategy",
-             "sim_prop_missing",
-             "gridparams",
-             "gridsearch_method",
-             "scoring_metric",
-             "disable_progressbar",
-             "prefix",
-         ]
-
-         if self.algorithm == "ii":
-             imp_keys.extend(
-                 [
-                     "n_nearest_features",
-                     "max_iter",
-                     "initial_strategy",
-                     "imputation_order",
-                     "skip_complete",
-                     "random_state",
-                     "sample_posterior",
-                 ]
-             )
-
-         ga_keys = [
-             "population_size",
-             "tournament_size",
-             "elitism",
-             "crossover_probability",
-             "mutation_probability",
-             "ga_algorithm",
-         ]
-
-         to_remove = ["self", "__class__"]
-
-         for k, v in clf_kwargs.copy().items():
-             if k in to_remove:
-                 clf_kwargs.pop(k)
-             if k in imp_keys:
-                 clf_kwargs.pop(k)
-             if k in ga_keys:
-                 clf_kwargs.pop(k)
-
-         if "clf_random_state" in clf_kwargs:
-             clf_kwargs["random_state"] = clf_kwargs.pop("clf_random_state")
-
-         if "clf_tol" in clf_kwargs:
-             clf_kwargs["tol"] = clf_kwargs.pop("clf_tol")
-
-         for k, v in imp_kwargs.copy().items():
-             if k not in imp_keys:
-                 imp_kwargs.pop(k)
-
-         for k, v in ga_kwargs.copy().items():
-             if k not in ga_keys:
-                 ga_kwargs.pop(k)
-
-         if "ga_algorithm" in ga_kwargs:
-             ga_kwargs["algorithm"] = ga_kwargs.pop("ga_algorithm")
-
-         if self.clf_type == "regressor":
-             ga_kwargs["criteria"] = "min"
-
-         elif self.clf_type == "classifier":
-             ga_kwargs["criteria"] = "max"
-
-         return (
-             imp_kwargs,
-             clf_kwargs,
-             ga_kwargs,
-             cv,
-             verbose,
-             n_jobs,
-             prefix,
-             column_subset,
-             disable_progressbar,
-             chunk_size,
-             do_validation,
-             do_gridsearch,
-             testing,
-         )
-
-     def _impute_eval(
-         self, df: pd.DataFrame, clf: Optional[Callable]
-     ) -> Dict[str, List[Union[float, int]]]:
-         """Function to run IterativeImputer on a pandas.DataFrame.
-
-         The DataFrame columns are randomly subset, and a fraction of the known, true values are converted to missing data to allow evaluation of the model with either accuracy or mean_squared_error scores.
-
-         Args:
-             df (pandas.DataFrame): Original DataFrame with 012-encoded genotypes.
-
-             clf (sklearn Classifier or None): Classifier instance to use with IterativeImputer.
-
-         Returns:
-             Dict[str, List[float or int]]: Validation scores for the current imputation.
-         """
-         cols = np.random.choice(
-             df.columns,
-             int(len(df.columns) * self.column_subset),
-             replace=False,
-         )
-
-         if self.verbose > 0:
-             print(
-                 f"\nSimulating validation data with missing data proportion "
-                 f"{self.sim_prop_missing} and strategy {self.sim_strategy}"
-             )
-
-         df_known = df.copy()
-
-         if self.algorithm == "nn":
-             df_unknown = df_known.copy()
-
-         else:
-             df_unknown = pd.DataFrame(
-                 SimGenotypeDataTransformer(
-                     self.genotype_data,
-                     prop_missing=self.imp_kwargs["sim_prop_missing"],
-                     strategy=self.imp_kwargs["sim_strategy"],
-                 ).fit_transform(df_known)
-             )
-
-         df_unknown_slice = df_unknown[cols]
-
-         # Neural networks
-         if self.algorithm == "nn":
-             df_stg = df_unknown_slice.copy()
-
-             for col in df_stg.columns:
-                 df_stg[col] = df_stg[col].replace({pd.NA: np.nan})
-             # df_stg.fillna(-9, inplace=True)
-
-             if self.clf == UBP:
-                 self.clf_kwargs["nlpca"] = self.nlpca
-
-             imputer = self.clf(
-                 prefix=self.prefix,
-                 **self.clf_kwargs,
-                 **self.imp_kwargs,
-             )
-
-             df_imp = pd.DataFrame(
-                 imputer.fit_transform(df_stg.to_numpy()),
-                 columns=cols,
-             )
-
-             df_unknown_slice = pd.DataFrame(imputer.y_simulated_, columns=cols)
-             df_known_slice = pd.DataFrame(imputer.y_original_, columns=cols)
-
-             df_missing_mask = pd.DataFrame(
-                 imputer.sim_missing_mask_, columns=cols
-             )
-
-             df_imp = df_imp.astype("float")
-             df_imp = df_imp.astype("int64")
-
-         else:
-             df_known_slice = df_known[cols]
-             df_missing_mask = df_unknown_slice.isnull()
-
-             df_unknown.replace(-9, np.nan, inplace=True)
-
-             # Using IterativeImputer
-             df_stg = df_unknown.copy()
-
-             imputer = self._define_iterative_imputer(
-                 clf,
-                 self.logfilepath,
-                 clf_kwargs=self.clf_kwargs,
-                 imp_kwargs=self.imp_kwargs,
-             )
-
-             imp_arr = imputer.fit_transform(df_stg)
-
-             # Get only the subset of validation columns
-             # get_loc returns the index of the value
-             df_imp = pd.DataFrame(
-                 imp_arr[:, [df_unknown.columns.get_loc(i) for i in cols]],
-                 columns=cols,
-             )
-
-         # Get score of each column
-         scores = defaultdict(list)
-         for col in df_known_slice.columns:
-             # Adapted from: https://medium.com/analytics-vidhya/using-scikit-learns-iterative-imputer-694c3cca34de
-
-             mask = df_missing_mask[col]
-             y_true = df_known[col]
-             y_true = y_true[mask]
-
-             y_pred = df_imp[col]
-             y_pred = y_pred[mask]
-
-             if self.clf_type == "classifier":
-                 if y_pred.empty:
-                     scores["accuracy"].append(-9)
-                     scores["precision"].append(-9)
-                     scores["f1"].append(-9)
-                     scores["recall"].append(-9)
-                     scores["jaccard"].append(-9)
-                     continue
-
-                 # Had to do this because we get an incompatible type error
-                 # if using initial_imputation="populations"
-                 if y_true.dtypes != "int64":
-                     y_true = y_true.astype("int64")
-                 if y_pred.dtypes != "int64":
-                     y_pred = y_pred.astype("int64")
-
-                 scores["accuracy"].append(
-                     metrics.accuracy_score(y_true, y_pred)
-                 )
-
-                 scores["precision"].append(
-                     metrics.precision_score(
-                         y_true, y_pred, average="macro", zero_division=0
-                     )
-                 )
-
-                 scores["f1"].append(
-                     metrics.f1_score(
-                         y_true, y_pred, average="macro", zero_division=0
-                     )
-                 )
-
-                 scores["recall"].append(
-                     metrics.recall_score(
-                         y_true, y_pred, average="macro", zero_division=0
-                     )
-                 )
-
-                 scores["jaccard"].append(
-                     metrics.jaccard_score(
-                         y_true, y_pred, average="macro", zero_division=0
-                     )
-                 )
-
-             else:
-                 scores["explained_var"].append(
-                     metrics.explained_variance_score(y_true, y_pred)
-                 )
-
-                 scores["rmse"].append(
-                     metrics.mean_squared_error(y_true, y_pred, squared=False)
-                 )
-
-         lst2del = [
-             df_stg,
-             df_imp,
-             df_known,
-             df_known_slice,
-             df_unknown,
-         ]
-
-         if self.algorithm == "nn":
-             del lst2del
-             del cols
-         else:
-             del lst2del
-             del imp_arr
-             del imputer
-             del cols
-
-         gc.collect()
-
-         return scores
-
-     def _define_iterative_imputer(
-         self,
-         clf: Callable,
-         logfilepath: str,
-         clf_kwargs: Optional[Dict[str, Any]] = None,
-         imp_kwargs: Optional[Dict[str, Any]] = None,
-         ga_kwargs: Optional[Dict[str, Any]] = None,
-         n_jobs: Optional[int] = None,
-         clf_type: Optional[str] = None,
-     ) -> Union[IterativeImputerGridSearch, IterativeImputerFixedParams]:
-         """Define an IterativeImputer instance.
-
-         The instances are of custom, overloaded IterativeImputer classes.
-
-         Args:
-             clf (sklearn Classifier instance): Estimator to use with IterativeImputer.
-
-             logfilepath (str): Path to progress log file.
-
-             clf_kwargs (dict, optional): Keyword arguments for classifier. Defaults to None.
-
-             imp_kwargs (Dict[str, Any], optional): Keyword arguments for imputation settings. Defaults to None.
-
-             ga_kwargs (dict, optional): Keyword arguments for genetic algorithm grid search. Defaults to None.
-
-             n_jobs (int, optional): Number of parallel jobs to use with the IterativeImputer grid search. Ignored if ``search_space=None``\. Defaults to None.
-
-             clf_type (str, optional): Type of estimator. Valid options are "classifier" or "regressor". Ignored if ``search_space=None``\. Defaults to None.
-
-         Returns:
-             sklearn.impute.IterativeImputer: IterativeImputer instance.
-         """
-         if not self.do_gridsearch:
-             imp = IterativeImputerFixedParams(
-                 logfilepath,
-                 clf_kwargs,
-                 estimator=clf,
-                 **imp_kwargs,
-             )
-
-         else:
-             # Create iterative imputer
-             imp = IterativeImputerGridSearch(
-                 logfilepath,
-                 clf_kwargs,
-                 ga_kwargs,
-                 estimator=clf,
-                 grid_n_jobs=n_jobs,
-                 clf_type=clf_type,
-                 **imp_kwargs,
-             )
-
-         return imp