pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
- pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
- pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
- pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +909 -0
- pgsui/data_processing/__init__.py +0 -0
- pgsui/data_processing/config.py +565 -0
- pgsui/data_processing/containers.py +1424 -0
- pgsui/data_processing/transformers.py +557 -907
- pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/extra-resources/.gitkeep +1 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +227 -0
- pgsui/electron/app/package-lock.json +6894 -0
- pgsui/electron/app/package.json +51 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +157 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +131 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +57 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/__init__.py +0 -0
- pgsui/example_data/phylip_files/__init__.py +0 -0
- pgsui/example_data/phylip_files/test.phy +0 -0
- pgsui/example_data/popmaps/__init__.py +0 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/structure_files/__init__.py +0 -0
- pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/__init__.py +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
- pgsui/impute/deterministic/imputers/mode.py +844 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +973 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
- pgsui/impute/supervised/__init__.py +0 -0
- pgsui/impute/supervised/base.py +343 -0
- pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
- pgsui/impute/supervised/imputers/random_forest.py +291 -0
- pgsui/impute/unsupervised/__init__.py +0 -0
- pgsui/impute/unsupervised/base.py +1118 -0
- pgsui/impute/unsupervised/callbacks.py +92 -262
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
- pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
- pgsui/impute/unsupervised/imputers/vae.py +1228 -0
- pgsui/impute/unsupervised/loss_functions.py +261 -0
- pgsui/impute/unsupervised/models/__init__.py +0 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
- pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
- pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
- pgsui/impute/unsupervised/models/vae_model.py +269 -630
- pgsui/impute/unsupervised/nn_scorers.py +255 -0
- pgsui/utils/__init__.py +0 -0
- pgsui/utils/classification_viz.py +608 -0
- pgsui/utils/logging_utils.py +22 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +996 -829
- pgsui/utils/pretty_metrics.py +290 -0
- pgsui/utils/scorers.py +213 -666
- pg_sui-0.2.0.dist-info/RECORD +0 -75
- pg_sui-0.2.0.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -1268
- pgsui/impute/impute.py +0 -1463
- pgsui/impute/simple_imputers.py +0 -1431
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
- pgsui/impute/unsupervised/keras_classifiers.py +0 -697
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -151
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -185
pgsui/impute/supervised/iterative_imputer_gridsearch.py
@@ -1,1024 +0,0 @@
-# Standard library imports
-import gc
-import math
-import os
-import shutil
-import sys
-import warnings
-
-# from collections import namedtuple
-from contextlib import redirect_stdout
-from time import time
-from typing import Optional, Union, List, Dict, Tuple, Any, Callable
-
-# Third-party imports
-## For plotting
-import matplotlib.pyplot as plt
-import seaborn as sns
-from matplotlib.backends.backend_pdf import PdfPages
-
-## For stats and numeric operations
-import numpy as np
-import pandas as pd
-from scipy import stats
-
-# scikit-learn imports
-from sklearn.base import clone
-from sklearn.experimental import enable_iterative_imputer
-from sklearn.impute import IterativeImputer
-from sklearn.impute import SimpleImputer
-from sklearn.impute._base import _check_inputs_dtype
-
-## For warnings
-from sklearn.exceptions import ConvergenceWarning
-from sklearn.utils._testing import ignore_warnings
-
-## Required for IterativeImputer.fit_transform()
-from sklearn.utils import check_random_state, _safe_indexing, is_scalar_nan
-from sklearn.utils._mask import _get_mask
-from sklearn.utils.validation import FLOAT_DTYPES
-
-# Grid search imports
-from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
-from sklearn.model_selection import StratifiedKFold
-
-# Genetic algorithm grid search imports
-from sklearn_genetic import GASearchCV
-from sklearn_genetic.callbacks import ConsecutiveStopping, DeltaThreshold
-from sklearn_genetic.plots import plot_fitness_evolution
-from sklearn.preprocessing import (
-    LabelEncoder,
-    OneHotEncoder,
-    OrdinalEncoder,
-    TargetEncoder,
-)
-from sklearn.exceptions import NotFittedError
-from sklearn.utils.class_weight import compute_sample_weight
-from sklearn.pipeline import make_pipeline
-
-from xgboost import XGBClassifier
-
-# Custom function imports
-try:
-    from .. import simple_imputers
-    from ...utils.plotting import Plotting
-    from ...utils.misc import get_processor_name
-    from ...utils.misc import HiddenPrints
-    from ...utils.misc import isnotebook
-except (ModuleNotFoundError, ValueError, ImportError):
-    from impute import simple_imputers
-    from utils.plotting import Plotting
-    from utils.misc import get_processor_name
-    from utils.misc import HiddenPrints
-    from utils.misc import isnotebook
-
-# Uses scikit-learn-intelex package if CPU is Intel
-if get_processor_name().strip().startswith("Intel"):
-    try:
-        from sklearnex import patch_sklearn
-
-        patch_sklearn(verbose=False)
-    except (ImportError, TypeError):
-        print(
-            "Processor not compatible with scikit-learn-intelex; using "
-            "default configuration"
-        )
-
-is_notebook = isnotebook()
-
-if is_notebook:
-    from tqdm.notebook import tqdm as progressbar
-else:
-    if sys.platform == "linux" or sys.platform == "linux2":
-        from tqdm.auto import tqdm as progressbar
-    else:
-        from tqdm import tqdm as progressbar
-
-# NOTE: Removed ImputeTriplets to save memory.
-# ImputerTriplet is there so that the missing values
-# can be predicted on an already-fit model using just the
-# transform method. I didn't need it, so I removed it
-# because it was saving thousands of fit estimator models into the object
-
-# _ImputerTripletGrid = namedtuple(
-#     '_ImputerTripletGrid', ['feat_idx', 'neighbor_feat_idx', 'estimator'])
-
-
-class IterativeImputerGridSearch(IterativeImputer):
-    """Overridden IterativeImputer methods.
-
-    Herein, two types of grid searches (RandomizedSearchCV and GASearchCV), progress status updates, and several other improvements have been added. IterativeImputer is a multivariate imputer that estimates each feature from all the others: a strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion. Read more in the scikit-learn User Guide for IterativeImputer. scikit-learn version added: 0.21. NOTE: This estimator is still **experimental** for now: the predictions and the API might change without any deprecation cycle. To use it, you need to explicitly import ``enable_iterative_imputer``\.
-
-    IterativeImputer is based on the R MICE (Multivariate Imputation by Chained Equations) package [van Buuren & Groothuis-Oudshoorn, 2011]_. See [Buck, 1960]_ for more information about multiple versus single imputations.
-
-    >>> # explicitly require this experimental feature
-    >>> from sklearn.experimental import enable_iterative_imputer
-    >>>
-    >>> # now you can import normally from sklearn.impute
-    >>> from sklearn.impute import IterativeImputer
-
-    Args:
-        logfilepath (str): Path to the progress log file.
-
-        gridparams (sklearn_genetic.space object or Dict[str, Any]): The parameter distributions or values to use for the grid search.
-
-        clf_kwargs (Dict[str, Any]): A dictionary with the classifier keyword arguments.
-
-        ga_kwargs (Dict[str, Any]): A dictionary with the genetic algorithm arguments.
-
-        prefix (str): Prefix for output files.
-
-        estimator (estimator object, optional): The estimator to use at each step of the round-robin imputation. Defaults to BayesianRidge().
-
-        grid_cv (int, optional): The number of cross-validation folds to use with the grid search. CV folds will be stratified, and it will attempt to balance the genotypes in each fold. IMPORTANT: Sites with fewer than ``2 * grid_cv`` of each genotype will be removed prior to doing the random subsetting because otherwise the imputation would fail. At least two of each genotype are required to be present in each fold. Defaults to 5.
-
-        grid_n_jobs (int, optional): The number of processors to use with the grid search. Set ``grid_n_jobs`` to -1 to use all available processors. Defaults to 1.
-
-        grid_iter (int, optional): The number of iterations (for RandomizedSearchCV) or generations (for the genetic algorithm) to run with the grid search. For the genetic algorithm, an early stopping callback method is implemented that will stop if the accuracy does not improve for more than 5 consecutive generations. Defaults to 10.
-
-        clf_type (str, optional): Whether to run ``"classifier"`` or ``"regression"`` based imputation. Defaults to "classifier".
-
-        ga (bool, optional): Whether or not to use the genetic algorithm. If True, does genetic algorithm, otherwise does RandomizedSearchCV. Defaults to False.
-
-        disable_progressbar (bool, optional): Whether or not to disable the tqdm progress bar. If True, disables the progress bar. If False, tqdm is used for the progress bar. This can be useful if you are running the imputation on an HPC cluster or are saving the standard output to a file. If True, progress updates will be printed to the screen every ``progress_update_percent`` iterations. Defaults to False.
-
-        progress_update_percent (int, optional): How often to display progress updates (as a percentage) if ``disable_progressbar`` is True. If ``progress_update_percent=10``\, then it displays progress updates every 10%. Defaults to 10.
-
-        pops (List[Union[int, str]]): List of population IDs of shape (n_samples,).
-
-        scoring_metric (str, optional): Scoring metric to use for the grid search. Defaults to "accuracy".
-
-        early_stop_gen (int, optional): Number of consecutive generations lacking improvement for which to implement the early stopping callback. Defaults to 5.
-
-        missing_values (int or np.nan, optional): The placeholder for the missing values. All occurrences of ``missing_values`` will be imputed. For pandas dataframes with nullable integer dtypes with missing values, ``missing_values`` should be set to ``np.nan``\, since ``pd.NA`` will be converted to ``np.nan``\. Defaults to np.nan.
-
-        sample_posterior (bool, optional): CURRENTLY NOT SUPPORTED. Whether to sample from the (Gaussian) predictive posterior of the fitted estimator for each imputation. Estimator must support ``return_std`` in its ``predict`` method if set to ``True``\. Set to ``True`` if using ``IterativeImputer`` for multiple imputations. Defaults to False.
-
-        max_iter (int, optional): Maximum number of imputation rounds to perform before returning the imputations computed during the final round. A round is a single imputation of each feature with missing values. The stopping criterion is met once ``max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol``\, where ``X_t`` is ``X`` at iteration `t`. Note that early stopping is only applied if ``sample_posterior=False``\. Defaults to 10.
-
-        tol (float, optional): Tolerance of the stopping condition. Defaults to 1e-3.
-
-        n_nearest_features (int, optional): Number of other features to use to estimate the missing values of each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after initial imputation). To ensure coverage of features throughout the imputation process, the neighbor features are not necessarily nearest, but are drawn with probability proportional to correlation for each imputed target feature. Can provide significant speed-up when the number of features is huge. If ``None``\, all features will be used. Defaults to None.
-
-        initial_strategy (str, optional): Which strategy to use to initialize the missing values. Same as the ``strategy`` parameter in :class:`~sklearn.impute.SimpleImputer`. Valid values: "most_frequent", "populations", "mf", or "phylogeny". Defaults to "populations".
-
-        imputation_order (str, optional): The order in which the features will be imputed. Possible values: "ascending" (from features with fewest missing values to most), "descending" (from features with most missing values to fewest), "roman" (left to right), "arabic" (right to left), "random" (a random order for each round). Defaults to 'ascending'.
-
-        skip_complete (bool, optional): If True then features with missing values during ``transform`` that did not have any missing values during ``fit`` will be imputed with the initial imputation method only. Set to ``True`` if you have many features with no missing values at both ``fit`` and ``transform`` time to save compute. Defaults to False.
-
-        min_value (float or array-like of shape (n_features,), optional): Minimum possible imputed value. Broadcast to shape (n_features,) if scalar. If array-like, expects shape (n_features,), one min value for each feature. .. versionchanged:: 0.23 (Added support for array-like). Defaults to -np.inf.
-
-        max_value (float or array-like of shape (n_features,), optional): Maximum possible imputed value. Broadcast to shape (n_features,) if scalar. If array-like, expects shape (n_features,), one max value for each feature. .. versionchanged:: 0.23 (Added support for array-like). Defaults to np.inf.
-
-        verbose (int, optional): Verbosity flag, controls the debug messages that are issued as functions are evaluated. The higher, the more verbose. Can be 0, 1, or 2. Defaults to 0.
-
-        random_state (int or RandomState instance, optional): The seed of the pseudo random number generator to use. Randomizes selection of estimator features if n_nearest_features is not None, the ``imputation_order`` if ``random``\, and the sampling from posterior if ``sample_posterior`` is True. Use an integer for determinism. Defaults to None.
-
-        add_indicator (bool, optional): If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no missing values at fit/train time, the feature won't appear on the missing indicator even if there are missing values at transform/test time. Defaults to False.
-
-        genotype_data (GenotypeData object, optional): GenotypeData object containing dictionary with keys=sampleIds and values=list of genotypes for the corresponding key. If using ``initial_strategy="phylogeny"``\, then this object also needs to contain the treefile and qmatrix objects. Defaults to None.
-
-        str_encodings (Dict[str, int], optional): Integer encodings used in STRUCTURE-formatted file. Should be a dictionary with keys=nucleotides and values=integer encodings. The missing data encoding should also be included. Argument is ignored if using a PHYLIP-formatted file. Defaults to {"A": 1, "C": 2, "G": 3, "T": 4, "N": -9}
-
-    Attributes:
-        initial_imputer_ (:class:`~sklearn.impute.SimpleImputer`): Imputer used to initialize the missing values.
-
-        imputation_sequence_ (List[Tuple[numpy.ndarray]]): Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``\, where ``feat_idx`` is the current feature to be imputed, ``neighbor_feat_idx`` is the array of other features used to impute the current feature, and ``estimator`` is the trained estimator used for the imputation. Length is ``self.n_features_with_missing_ * self.n_iter_``\.
-
-        n_iter_ (int): Number of iteration rounds that occurred. Will be less than ``self.max_iter`` if early stopping criterion was reached.
-
-        n_features_with_missing_ (int): Number of features with missing values.
-
-        indicator_ (:class:`~sklearn.impute.MissingIndicator`): Indicator used to add binary indicators for missing values. ``None`` if add_indicator is False.
-
-        random_state_ (RandomState instance): RandomState instance that is generated either from a seed, the random number generator or by ``np.random``\.
-
-        genotype_data (GenotypeData object): GenotypeData object.
-
-        str_encodings (Dict[str, int]): Dictionary with integer encodings for converting from STRUCTURE-formatted file to IUPAC nucleotides.
-
-    See Also:
-        SimpleImputer : Univariate imputation of missing values.
-
-    Examples:
-        >>> import numpy as np
-        >>> from sklearn.experimental import enable_iterative_imputer
-        >>> from sklearn.impute import IterativeImputer
-        >>> imp_mean = IterativeImputer(random_state=0)
-        >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
-        >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
-        >>> imp_mean.transform(X)
-        array([[ 6.9584...,  2.       ,  3.        ],
-               [ 4.       ,  2.6000...,  6.        ],
-               [10.       ,  4.9999...,  9.        ]])
-
-    Notes:
-        To support imputation in inductive mode we store each feature's estimator during the ``fit`` phase, and predict without refitting (in order) during the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``\.
-
-    References:
-        .. [van Buuren & Groothuis-Oudshoorn, 2011] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). mice: Multivariate Imputation by Chained Equations in R. Journal of Statistical Software 45: 1-67.
-
-        .. [Buck, 1960] S. F. Buck. (1960). A Method of Estimation of Missing Values in Multivariate Data Suitable for use with an Electronic Computer. Journal of the Royal Statistical Society 22(2): 302-306.
-    """
-
-    def __init__(
-        self,
-        logfilepath: str,
-        clf_kwargs: Dict[str, Any],
-        ga_kwargs: Dict[str, Any],
-        *,
-        estimator: Callable = None,
-        gridparams: Dict[str, Any] = None,
-        prefix: str = "output",
-        grid_cv: int = 5,
-        grid_n_jobs: int = 1,
-        grid_iter: int = 10,
-        clf_type: str = "classifier",
-        gridsearch_method: str = "gridsearch",
-        disable_progressbar: bool = False,
-        progress_update_percent: Optional[int] = None,
-        pops: Optional[List[Union[str, int]]] = None,
-        scoring_metric: str = "accuracy",
-        early_stop_gen: int = 5,
-        missing_values: Union[int, float] = np.nan,
-        sample_posterior: bool = False,
-        max_iter: int = 10,
-        tol: float = 1e-3,
-        n_nearest_features: Optional[int] = None,
-        initial_strategy: str = "populations",
-        imputation_order: str = "ascending",
-        skip_complete: bool = False,
-        min_value: Union[float, int] = -np.inf,
-        max_value: Union[float, int] = np.inf,
-        verbose: int = 0,
-        random_state: Optional[int] = None,
-        add_indicator: bool = False,
-        genotype_data: Optional[Any] = None,
-        str_encodings: Optional[Dict[str, int]] = None,
-    ) -> None:
-        super().__init__(
-            estimator=estimator,
-            missing_values=missing_values,
-            sample_posterior=sample_posterior,
-            max_iter=max_iter,
-            tol=tol,
-            n_nearest_features=n_nearest_features,
-            initial_strategy=initial_strategy,
-            imputation_order=imputation_order,
-            skip_complete=skip_complete,
-            min_value=min_value,
-            max_value=max_value,
-            verbose=verbose,
-            random_state=random_state,
-            add_indicator=add_indicator,
-        )
-
-        self.logfilepath = logfilepath
-        self.gridparams = gridparams
-        self.clf_kwargs = clf_kwargs
-        self.ga_kwargs = ga_kwargs
-        self.prefix = prefix
-        self.estimator = estimator
-        self.sample_posterior = sample_posterior
-        self.max_iter = max_iter
-        self.tol = tol
-        self.n_nearest_features = n_nearest_features
-        self.initial_strategy = initial_strategy
-        self.imputation_order = imputation_order
-        self.skip_complete = skip_complete
-        self.min_value = min_value
-        self.max_value = max_value
-        self.verbose = verbose
-        self.random_state = random_state
-        self.genotype_data = genotype_data
-        self.str_encodings = str_encodings
-        self.grid_cv = grid_cv
-        self.grid_n_jobs = grid_n_jobs
-        self.grid_iter = grid_iter
-        self.clf_type = clf_type
-        self.gridsearch_method = gridsearch_method
-        self.disable_progressbar = disable_progressbar
-        self.progress_update_percent = progress_update_percent
-        self.pops = pops
-        self.scoring_metric = scoring_metric
-        self.early_stop_gen = early_stop_gen
-        self.missing_values = missing_values
-
-    def _mode_1d(self, array_1d):
-        """Get the mode of a 1D array.
-
-        Args:
-            array_1d (np.ndarray): 1D array to calculate the mode for.
-
-        Returns:
-            int: Most common value of the 1D array.
-        """
-        # Get unique elements and their corresponding counts
-        vals, counts = np.unique(array_1d, return_counts=True)
-
-        # Get the index of the most frequent element
-        mode_idx = np.argmax(counts)
-
-        # Return the mode
-        return vals[mode_idx]
-
-    # Define a function to remove columns with a unique count of 1 or 0
-    def remove_single_unique_val_cols(self, array_2d, return_idx=False):
-        # Apply np.unique to each column and get the counts of unique values (excluding np.nan)
-        unique_counts = np.apply_along_axis(
-            lambda x: len(np.unique(x[~np.isnan(x)])), axis=0, arr=array_2d
-        )
-
-        # Get the column indices where the count of unique values is greater than 1
-        cols_to_keep = np.where(unique_counts > 1)[0]
-
-        # Index into array_2d to keep only these columns
-        array_2d_filtered = array_2d[:, cols_to_keep]
-
-        if return_idx:
-            return cols_to_keep
-        else:
-            return array_2d_filtered
-
-    def _initial_imputation(
-        self, X: np.ndarray, cols_to_keep: np.ndarray, in_fit: bool = False
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
-        """Perform initial imputation for input X.
-
-        Initially imputes training data using a simple imputation method.
-
-        Args:
-            X (numpy.ndarray, shape (n_samples, n_features)): Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
-
-            cols_to_keep (numpy.ndarray, shape (n_features,)): Column indices to keep. Only used if ``initial_strategy="phylogeny"``\.
-
-            in_fit (bool, optional): Whether function is called in fit. Defaults to False.
-
-        Returns:
-            Xt (numpy.ndarray, shape (n_samples, n_features)): Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
-
-            X_filled (numpy.ndarray, shape (n_samples, n_features)): Input data with the most recent imputations.
-
-            mask_missing_values (numpy.ndarray, shape (n_samples, n_features)): Input data's missing indicator matrix, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
-
-            X_missing_mask (numpy.ndarray, shape (n_samples, n_features)): Input data's mask matrix indicating missing datapoints, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
-        """
-        if is_scalar_nan(self.missing_values):
-            force_all_finite = "allow-nan"
-        else:
-            force_all_finite = True
-
-        X = self._validate_data(
-            X,
-            dtype=FLOAT_DTYPES,
-            order="F",
-            reset=in_fit,
-            force_all_finite=force_all_finite,
-        )
-
-        X[X < 0] = np.nan
-
-        _check_inputs_dtype(X, self.missing_values)
-
-        X_missing_mask = _get_mask(X, self.missing_values)
-        mask_missing_values = X_missing_mask.copy()
-
-        if self.initial_strategy == "populations":
-            self.initial_imputer_ = simple_imputers.ImputeAlleleFreq(
-                self.genotype_data,
-                pops=self.pops,
-                by_populations=True,
-                missing=-9,
-                write_output=False,
-                output_format="array",
-                verbose=False,
-                iterative_mode=True,
-                validation_mode=True,
-            )
-
-            X_filled = self.initial_imputer_.imputed
-
-        elif self.initial_strategy == "phylogeny":
-            if (
-                self.genotype_data.qmatrix is None
-                and self.genotype_data.qmatrix_iqtree is None
-            ) or self.genotype_data.guidetree is None:
-                raise AttributeError(
-                    "GenotypeData object was not initialized with "
-                    "qmatrix/qmatrix_iqtree or guidetree arguments, "
-                    "but initial_strategy == 'phylogeny'"
-                )
-
-            else:
-                self.initial_imputer_ = simple_imputers.ImputePhylo(
-                    genotype_data=self.genotype_data,
-                    str_encodings=self.str_encodings,
-                    write_output=False,
-                    disable_progressbar=True,
-                    column_subset=cols_to_keep,
-                    validation_mode=True,
-                )
-
-                X_filled = self.initial_imputer_.imputed
-                valid_sites = self.initial_imputer_.valid_sites
-
-        elif self.initial_strategy == "mf":
-            self.initial_imputer_ = simple_imputers.ImputeMF(
-                self.genotype_data,
-                missing=-9,
-                write_output=False,
-                verbose=False,
-                output_format="array",
-                validation_mode=True,
-            )
-
-            X_filled = self.initial_imputer_.imputed
-
-        else:
-            if self.initial_imputer_ is None:
-                self.initial_imputer_ = SimpleImputer(
-                    missing_values=self.missing_values,
-                    strategy=self.initial_strategy,
-                )
-                X_filled = self.initial_imputer_.fit_transform(X)
-
-            else:
-                X_filled = self.initial_imputer_.transform(X)
-
-            valid_sites = self.initial_imputer_.statistics_
-
-        Xt = X.copy()
-
-        if self.initial_imputer_ == "phylogeny":
-            valid_sites = np.apply_along_axis(self._mode_1d, axis=0, arr=Xt)
-            valid_sites = valid_sites[cols_to_keep]
-            valid_mask = np.flatnonzero(np.logical_not(np.isnan(valid_sites)))
-            Xt = X[:, valid_mask]
-            mask_missing_values = mask_missing_values[:, valid_mask]
-            X_filled = X_filled[:, valid_mask]
-
-        return Xt, X_filled, mask_missing_values, X_missing_mask
-
-    @ignore_warnings(category=UserWarning)
-    def _impute_one_feature(
-        self,
-        X_filled: np.ndarray,
-        mask_missing_values: np.ndarray,
-        feat_idx: int,
-        neighbor_feat_idx: np.ndarray,
-        estimator: Optional[Any] = None,
-        fit_mode: bool = True,
-    ) -> Tuple[np.ndarray, Optional[Any]]:
-        """Impute a single feature from the others provided.
-
-        This function predicts the missing values of one of the features using the current estimates of all the other features. The ``estimator`` must support ``return_std=True`` in its ``predict`` method for this function to work.
-
-        Args:
-            X_filled (numpy.ndarray): Input data with the most recent imputations.
-
-            mask_missing_values (numpy.ndarray): Input data's missing indicator matrix.
-
-            feat_idx (int): Index of the feature currently being imputed.
-
-            neighbor_feat_idx (numpy.ndarray): Indices of the features to be used in imputing ``feat_idx``\.
-
-            estimator (sklearn estimator object, optional): The estimator to use at this step of the round-robin imputation. If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. If None, it will be cloned from self._estimator. Defaults to None.
-
-            fit_mode (bool, optional): Whether to fit and predict with the estimator or just predict. Defaults to True.
-
-        Returns:
-            X_filled (ndarray): Input data with ``X_filled[missing_row_mask, feat_idx]`` updated.
-
-            estimator (estimator with sklearn API): The fitted estimator used to impute ``X_filled[missing_row_mask, feat_idx]``\.
-        """
-        if estimator is None and fit_mode is False:
-            raise ValueError(
-                "If fit_mode is False, then an already-fitted "
-                "estimator should be passed in."
-            )
-
-        if type(self._estimator).__name__ == "XGBClassifier":
-            self.gridparams["objective"] = ["multi:softmax"]
-            self.gridparams["num_class"] = [3]
-
-        if estimator is None:
-            estimator = clone(self._estimator)
-
-        # Modified code
-        cross_val = StratifiedKFold(n_splits=self.grid_cv, shuffle=True)
-
-        # Modified code
-        # If regressor
-        if self.clf_type == "regressor":
-            if self.gridsearch_method == "genetic_algorithm":
-                callback = DeltaThreshold(threshold=1e-3, metric="fitness")
-
-        else:
-            if self.gridsearch_method == "genetic_algorithm":
-                callback = ConsecutiveStopping(
-                    generations=self.early_stop_gen, metric="fitness"
-                )
-
-        # Do randomized grid search
-        if self.gridsearch_method == "randomized_gridsearch":
-            search = RandomizedSearchCV(
-                estimator,
-                param_distributions=self.gridparams,
-                n_iter=self.grid_iter,
-                scoring=self.scoring_metric,
-                n_jobs=self.grid_n_jobs,
-                cv=cross_val,
-            )
-
-        elif self.gridsearch_method == "gridsearch":
-            search = GridSearchCV(
-                estimator,
-                param_grid=self.gridparams,
-                scoring=self.scoring_metric,
-                n_jobs=self.grid_n_jobs,
-                cv=cross_val,
-                error_score="raise",
-            )
-
-        # Do genetic algorithm
-        elif self.gridsearch_method == "genetic_algorithm":
-            with HiddenPrints():
-                search = GASearchCV(
-                    estimator=estimator,
-                    cv=cross_val,
-                    scoring=self.scoring_metric,
-                    generations=self.grid_iter,
-                    param_grid=self.gridparams,
-                    n_jobs=self.grid_n_jobs,
-                    verbose=False,
-                    **self.ga_kwargs,
-                )
-
-        else:
-            raise ValueError(
-                f"Invalid gridsearch_method provided: {self.gridsearch_method}. Supported options are 'gridsearch', 'randomized_gridsearch', and 'genetic_algorithm'"
-            )
-
-        missing_row_mask = mask_missing_values[:, feat_idx]
-
-        if fit_mode:
-            X_train = _safe_indexing(
-                X_filled[:, neighbor_feat_idx], ~missing_row_mask
-            )
-
-            y_train = _safe_indexing(X_filled[:, feat_idx], ~missing_row_mask)
-            X_train = X_train.astype(int)
-            y_train = y_train.astype(int)
-
-            if self.gridsearch_method == "genetic_algorithm":
-                if type(self._estimator).__name__ == "XGBClassifier":
-                    raise NotImplementedError(
-                        "genetic_algorithm is not currently supported with ImputeXGBoost."
-                    )
-                search.fit(X_train, y_train, callbacks=callback)
-            else:
-                if type(self._estimator).__name__ == "XGBClassifier":
-                    oe = OrdinalEncoder(
-                        categories="auto",
-                        dtype=int,
-                        handle_unknown="use_encoded_value",
-                        unknown_value=-1,
-                    )
-                    X_train = oe.fit_transform(X_train)
-
-                    # Add one dummy sample for each possible class
-                    for class_label in range(
-                        3
-                    ):  # assuming there are 3 classes 0, 1, 2
-                        class_count = np.count_nonzero(y_train == class_label)
-                        if (
-                            class_label not in y_train
-                            or class_count < self.grid_cv
-                        ):
-                            for _ in range(
-                                self.grid_cv - class_count
-                            ):  # add as many dummy samples as there are folds
-                                dummy_sample = np.zeros((1, X_train.shape[1]))
-                                X_train = np.vstack([X_train, dummy_sample])
-                                y_train = np.append(y_train, class_label)
-
-                    # Fit the LabelEncoder after adding the dummy samples
-                    le = LabelEncoder()
-                    y_train = le.fit_transform(y_train)
-                    sample_weight = compute_sample_weight("balanced", y_train)
-
-                    search.fit(X_train, y_train, sample_weight=sample_weight)
-                else:
-                    search.fit(X_train, y_train)
-
-        # if no missing values, don't predict
-        if np.sum(missing_row_mask) == 0:
-            return X_filled, None
-
-        X_test = _safe_indexing(
-            X_filled[:, neighbor_feat_idx], missing_row_mask
-        )
-
-        X_test = X_test.astype(int)
-
-        if type(self._estimator).__name__ == "XGBClassifier":
-            X_test = oe.transform(X_test)
-
-        # Currently un-tested with grid search
-        if self.sample_posterior:
-            raise NotImplementedError(
-                "sample_posterior is not implemented in PG-SUI"
-            )
-
-        else:
-            imputed_values = search.predict(X_test)
-
-            if type(self._estimator).__name__ == "XGBClassifier":
-                imputed_values = le.inverse_transform(imputed_values)
-
-            imputed_values = np.clip(
-                imputed_values,
-                self._min_value[feat_idx],
-                self._max_value[feat_idx],
-            )
-
-        # update the feature
-        X_filled[missing_row_mask, feat_idx] = imputed_values
-
-        del X_train
-        del y_train
-        del X_test
-        del imputed_values
-        gc.collect()
-
-        return X_filled, search
-
-    @ignore_warnings(category=UserWarning)
-    def fit_transform(
-        self,
-        X: np.ndarray,
-        valid_cols: Optional[np.ndarray] = None,
-        y: None = None,
-    ) -> Tuple[np.ndarray, Optional[List[Any]], Optional[List[Any]]]:
-        """Fits the imputer on X and return the transformed X.
-
-        The basic functionality is to get the nearest neighbors for each feature (column) and loop through all the features and their correlated neighbors to predict missing values.
-
-        Functionality has been added to perform grid searches using two methods (genetic algorithm and RandomizedSearchCV). It also makes several useful plots if using the genetic algorithm, and a tqdm progress bar and status updates have been added.
-
-        Args:
-            X (array-like, shape (n_samples, n_features)): Input data, where "n_samples" is the number of samples and "n_features" is the number of features.
-
-            valid_cols (numpy.ndarray, optional): Array with column indices to keep. Defaults to None.
-
-            y (None, optional): Ignored. Here for compatibility with other sklearn classes. Defaults to None.
-
-        Returns:
-            array-like, shape (n_samples, n_features): The imputed input data.
-            List[Union[str, int, float]]: List of parameter settings found.
-            List[Union[int, float]]: List of scores.
-        """
-        self.random_state_ = getattr(
-            self, "random_state_", check_random_state(self.random_state)
-        )
-
-        if self.max_iter < 0:
-            raise ValueError(
-                f"'max_iter' should be a positive integer. Got {self.max_iter} instead."
-            )
-
-        if self.tol < 0:
-            raise ValueError(
-                f"'tol' should be a non-negative float. Got {self.tol} instead"
-            )
-
-        self._estimator = clone(self.estimator)
-
-        self.initial_imputer_ = None
-
-        X, Xt, mask_missing_values, complete_mask = self._initial_imputation(
-            X.copy(), valid_cols, in_fit=True
-        )
-
-        super(IterativeImputer, self)._fit_indicator(complete_mask)
-        X_indicator = super(IterativeImputer, self)._transform_indicator(
-            complete_mask
-        )
-
-        if self.max_iter == 0 or np.all(mask_missing_values):
-            self.n_iter_ = 0
-            return (
-                super(IterativeImputer, self)._concatenate_indicator(
-                    Xt, X_indicator
-                ),
-                None,
-                None,
-            )
-
-        # Edge case: a single feature. We return the initial ...
-        if Xt.shape[1] == 1:
-            self.n_iter_ = 0
-            return (
-                super(IterativeImputer, self)._concatenate_indicator(
-                    Xt, X_indicator
-                ),
-                None,
-                None,
-            )
-
-        self._min_value = self._validate_limit(
-            self.min_value, "min", X.shape[1]
-        )
-        self._max_value = self._validate_limit(
-            self.max_value, "max", X.shape[1]
-        )
-
-        if not np.all(np.greater(self._max_value, self._min_value)):
-            raise ValueError(
-                "One (or more) features have min_value >= max_value."
-            )
-
-        # order in which to impute
-        # note this is probably too slow for large feature data (d > 100000)
-        # and a better way would be good.
-        # see: https://goo.gl/KyCNwj and subsequent comments
-        ordered_idx = self._get_ordered_idx(mask_missing_values)
-        total_features = len(ordered_idx)
-
-        self.n_features_with_missing_ = len(ordered_idx)
-
-        abs_corr_mat = self._get_abs_corr_mat(Xt)
-
-        _, n_features = Xt.shape
-
-        if self.verbose > 0:
-            print(
-                f"[IterativeImputer] Completing matrix with shape ({X.shape},)"
-            )
-        start_t = time()
-
-        if not self.sample_posterior:
-            Xt_previous = Xt.copy()
-            normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values]))
-
-        params_list = list()
-        score_list = list()
-        iter_list = list()
-
-        if self.gridsearch_method == "genetic_algorithm":
-            sns.set_style("white")
-
-        total_iter = self.max_iter
-
-        #######################################
-        ### Iterations
-        #######################################
-        for self.n_iter_ in progressbar(
-            range(1, total_iter + 1),
-            desc="Iteration: ",
-            disable=self.disable_progressbar,
-        ):
-            if self.gridsearch_method == "genetic_algorithm":
-                iter_list.append(self.n_iter_)
-
-                pp_oneline = PdfPages(
-                    f".score_traces_separate_{self.n_iter_}.pdf"
-                )
-
-                pp_lines = PdfPages(
-                    f".score_traces_combined_{self.n_iter_}.pdf"
-                )
-
-                pp_space = PdfPages(f".search_space_{self.n_iter_}.pdf")
-
-            if self.imputation_order == "random":
-                ordered_idx = self._get_ordered_idx(mask_missing_values)
-
-            # Reset lists for current iteration
-            params_list.clear()
-            score_list.clear()
-            searches = list()
-
-            if self.disable_progressbar:
-                with open(self.logfilepath, "a") as fout:
-                    # Redirect to progress logfile
-                    with redirect_stdout(fout):
-                        print(
-                            f"Iteration Progress: {self.n_iter_}/{self.max_iter} ({int((self.n_iter_ / total_iter) * 100)}%)"
-                        )
-
-                if self.progress_update_percent is not None:
-                    print_perc_interval = self.progress_update_percent
-
-            ########################################
-            ### Features
-            ########################################
-            for i, feat_idx in enumerate(
-                progressbar(
-                    ordered_idx,
-                    desc="Feature: ",
-                    leave=False,
-                    position=1,
-                    disable=self.disable_progressbar,
-                ),
-                start=1,
-            ):
-                neighbor_feat_idx = self._get_neighbor_feat_idx(
-                    n_features, feat_idx, abs_corr_mat
-                )
-
-                Xt, search = self._impute_one_feature(
-                    Xt,
-                    mask_missing_values,
-                    feat_idx,
-                    neighbor_feat_idx,
-                    estimator=None,
-                    fit_mode=True,
-                )
-
-                searches.append(search)
-
-                # NOTE: The below source code has been commented out to save
-                # RAM. estimator_triplet object contains numerous fit estimators
-                # that demand a lot of resources. It is primarily used for the
-                # transform function, which is not needed in this application.
-
-                # estimator_triplet = _ImputerTripletGrid(feat_idx,
-                #                                         neighbor_feat_idx,
-                #                                         estimator)
-
-                # self.imputation_sequence_.append(estimator_triplet)
-
-                if search is not None:
-                    # There was missing data in the feature
-                    params_list.append(search.best_params_)
-                    score_list.append(search.best_score_)
-
-                    if self.gridsearch_method == "genetic_algorithm":
-                        plt.cla()
-                        plt.clf()
-                        plt.close()
-
-                        plot_fitness_evolution(search)
-                        pp_oneline.savefig(bbox_inches="tight")
-                        plt.cla()
-                        plt.clf()
-                        plt.close()
-
-                        Plotting.plot_search_space(search)
-                        pp_space.savefig(bbox_inches="tight")
-                        plt.cla()
-                        plt.clf()
-                        plt.close()
-
-                else:
-                    # Search is None
-                    # Thus, there was no missing data in the given feature
-                    tmp_dict = dict()
-                    for k in self.gridparams.keys():
-                        tmp_dict[k] = -9
-                    params_list.append(tmp_dict)
-
-                    score_list.append(-9)
-
-                # Only print feature updates at each progress_update_percent
-                # interval
-                if (
-                    self.progress_update_percent is not None
-                    and self.disable_progressbar
-                ):
-                    current_perc = math.ceil((i / total_features) * 100)
-
-                    if current_perc >= print_perc_interval:
-                        with open(self.logfilepath, "a") as fout:
-                            # Redirect progress to file
-                            with redirect_stdout(fout):
-                                print(
-                                    f"Feature Progress (Iteration "
-                                    f"{self.n_iter_}/{self.max_iter}): "
-                                    f"{i}/{total_features} ({current_perc}"
-                                    f"%)"
-                                )
-
-                            if i == len(ordered_idx):
-                                with redirect_stdout(fout):
-                                    print("")
-
-                        while print_perc_interval <= current_perc:
-                            print_perc_interval += self.progress_update_percent
-
-            if self.verbose > 1:
-                print(
-                    f"[IterativeImputer] Ending imputation round "
-                    f"{self.n_iter_}/{self.max_iter}, "
-                    f"elapsed time {(time() - start_t):0.2f}"
-                )
-
-            if not self.sample_posterior:
-                inf_norm = np.linalg.norm(
-                    Xt - Xt_previous, ord=np.inf, axis=None
-                )
-                if self.verbose > 0:
-                    print(
-                        f"[IterativeImputer] Change: {inf_norm}, "
-                        f"scaled tolerance: {normalized_tol} "
-                    )
-
-                if inf_norm < normalized_tol:
-                    if self.disable_progressbar:
-                        # Early stopping criteria has been reached
-                        with open(self.logfilepath, "a") as fout:
-                            # Redirect to progress logfile
-                            with redirect_stdout(fout):
-                                print(
-                                    "[IterativeImputer] Early stopping "
-                                    "criterion reached."
-                                )
-                    else:
-                        print(
-                            "[IterativeImputer] Early stopping criterion "
-                            "reached."
-                        )
-
-                    if self.gridsearch_method == "genetic_algorithm":
-                        pp_oneline.close()
-                        pp_space.close()
-
-                        plt.cla()
-                        plt.clf()
-                        plt.close()
-                        for iter_search in searches:
-                            if iter_search is not None:
-                                plot_fitness_evolution(iter_search)
-
-                        pp_lines.savefig(bbox_inches="tight")
-
-                        plt.cla()
-                        plt.clf()
-                        plt.close()
-                        pp_lines.close()
-
-                    break
-                Xt_previous = Xt.copy()
-
-            if self.gridsearch_method == "genetic_algorithm":
-                pp_oneline.close()
-                pp_space.close()
-
-                plt.cla()
-                plt.clf()
-                plt.close()
-                for iter_search in searches:
-                    if iter_search is not None:
-                        plot_fitness_evolution(iter_search)
-
-                pp_lines.savefig(bbox_inches="tight")
-
-                plt.cla()
-                plt.clf()
-                plt.close()
-                pp_lines.close()
-
-        else:
-            if not self.sample_posterior:
-                warnings.warn(
-                    "[IterativeImputer] Early stopping criterion not"
-                    " reached.",
-                    ConvergenceWarning,
-                )
-
-        Xt[~mask_missing_values] = X[~mask_missing_values]
-        Xt = Xt.astype(int)
-
-        if self.gridsearch_method == "genetic_algorithm":
-            # Remove all files except last iteration
-            final_iter = iter_list.pop()
-
-            [os.remove(f".score_traces_separate_{x}.pdf") for x in iter_list]
-            [os.remove(f".score_traces_combined_{x}.pdf") for x in iter_list]
-            [os.remove(f".search_space_{x}.pdf") for x in iter_list]
-
-            shutil.move(
-                f".score_traces_separate_{final_iter}.pdf",
-                f"{self.prefix}_score_traces_separate.pdf",
-            )
-
-            shutil.move(
-                f".score_traces_combined_{final_iter}.pdf",
-                f"{self.prefix}_score_traces_combined.pdf",
-            )
-
-            shutil.move(
-                f".search_space_{final_iter}.pdf",
-                f"{self.prefix}_search_space.pdf",
-            )
-
-        return (
-            super(IterativeImputer, self)._concatenate_indicator(
-                Xt, X_indicator
-            ),
-            params_list,
-            score_list,
-        )