pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
  2. pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
  3. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
  5. {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
  6. pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
  7. pgsui/__init__.py +35 -54
  8. pgsui/_version.py +34 -0
  9. pgsui/cli.py +909 -0
  10. pgsui/data_processing/__init__.py +0 -0
  11. pgsui/data_processing/config.py +565 -0
  12. pgsui/data_processing/containers.py +1424 -0
  13. pgsui/data_processing/transformers.py +557 -907
  14. pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  15. pgsui/electron/app/__main__.py +5 -0
  16. pgsui/electron/app/extra-resources/.gitkeep +1 -0
  17. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  18. pgsui/electron/app/icons/icons/128x128.png +0 -0
  19. pgsui/electron/app/icons/icons/16x16.png +0 -0
  20. pgsui/electron/app/icons/icons/24x24.png +0 -0
  21. pgsui/electron/app/icons/icons/256x256.png +0 -0
  22. pgsui/electron/app/icons/icons/32x32.png +0 -0
  23. pgsui/electron/app/icons/icons/48x48.png +0 -0
  24. pgsui/electron/app/icons/icons/512x512.png +0 -0
  25. pgsui/electron/app/icons/icons/64x64.png +0 -0
  26. pgsui/electron/app/icons/icons/icon.icns +0 -0
  27. pgsui/electron/app/icons/icons/icon.ico +0 -0
  28. pgsui/electron/app/main.js +227 -0
  29. pgsui/electron/app/package-lock.json +6894 -0
  30. pgsui/electron/app/package.json +51 -0
  31. pgsui/electron/app/preload.js +15 -0
  32. pgsui/electron/app/server.py +157 -0
  33. pgsui/electron/app/ui/logo.png +0 -0
  34. pgsui/electron/app/ui/renderer.js +131 -0
  35. pgsui/electron/app/ui/styles.css +59 -0
  36. pgsui/electron/app/ui/ui_shim.js +72 -0
  37. pgsui/electron/bootstrap.py +43 -0
  38. pgsui/electron/launch.py +57 -0
  39. pgsui/electron/package.json +14 -0
  40. pgsui/example_data/__init__.py +0 -0
  41. pgsui/example_data/phylip_files/__init__.py +0 -0
  42. pgsui/example_data/phylip_files/test.phy +0 -0
  43. pgsui/example_data/popmaps/__init__.py +0 -0
  44. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  45. pgsui/example_data/structure_files/__init__.py +0 -0
  46. pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
  47. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  48. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  49. pgsui/impute/__init__.py +0 -0
  50. pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
  51. pgsui/impute/deterministic/imputers/mode.py +844 -0
  52. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  53. pgsui/impute/deterministic/imputers/phylo.py +973 -0
  54. pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
  55. pgsui/impute/supervised/__init__.py +0 -0
  56. pgsui/impute/supervised/base.py +343 -0
  57. pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  58. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
  59. pgsui/impute/supervised/imputers/random_forest.py +291 -0
  60. pgsui/impute/unsupervised/__init__.py +0 -0
  61. pgsui/impute/unsupervised/base.py +1118 -0
  62. pgsui/impute/unsupervised/callbacks.py +92 -262
  63. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
  64. pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
  65. pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
  66. pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
  67. pgsui/impute/unsupervised/imputers/vae.py +1228 -0
  68. pgsui/impute/unsupervised/loss_functions.py +261 -0
  69. pgsui/impute/unsupervised/models/__init__.py +0 -0
  70. pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
  71. pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
  72. pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
  73. pgsui/impute/unsupervised/models/vae_model.py +269 -630
  74. pgsui/impute/unsupervised/nn_scorers.py +255 -0
  75. pgsui/utils/__init__.py +0 -0
  76. pgsui/utils/classification_viz.py +608 -0
  77. pgsui/utils/logging_utils.py +22 -0
  78. pgsui/utils/misc.py +35 -480
  79. pgsui/utils/plotting.py +996 -829
  80. pgsui/utils/pretty_metrics.py +290 -0
  81. pgsui/utils/scorers.py +213 -666
  82. pg_sui-0.2.0.dist-info/RECORD +0 -75
  83. pg_sui-0.2.0.dist-info/top_level.txt +0 -3
  84. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  85. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  86. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  87. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  88. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  89. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  90. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  91. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  92. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  93. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  94. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  95. pgsui/example_data/trees/test.iqtree +0 -376
  96. pgsui/example_data/trees/test.qmat +0 -5
  97. pgsui/example_data/trees/test.rate +0 -2033
  98. pgsui/example_data/trees/test.tre +0 -1
  99. pgsui/example_data/trees/test_n10.rate +0 -19
  100. pgsui/example_data/trees/test_n100.rate +0 -109
  101. pgsui/example_data/trees/test_n500.rate +0 -509
  102. pgsui/example_data/trees/test_siterates.txt +0 -2024
  103. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  104. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  105. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  106. pgsui/example_data/vcf_files/test.vcf +0 -244
  107. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  108. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  109. pgsui/impute/estimators.py +0 -1268
  110. pgsui/impute/impute.py +0 -1463
  111. pgsui/impute/simple_imputers.py +0 -1431
  112. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
  113. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
  114. pgsui/impute/unsupervised/keras_classifiers.py +0 -697
  115. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  116. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
  117. pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
  118. pgsui/pg_sui.py +0 -261
  119. pgsui/utils/sequence_tools.py +0 -407
  120. simulation/sim_benchmarks.py +0 -333
  121. simulation/sim_treeparams.py +0 -475
  122. test/__init__.py +0 -0
  123. test/pg_sui_simtest.py +0 -215
  124. test/pg_sui_testing.py +0 -523
  125. test/test.py +0 -151
  126. test/test_pgsui.py +0 -374
  127. test/test_tkc.py +0 -185
@@ -1,782 +0,0 @@
1
- # Standard library imports
2
- import gc
3
- import math
4
- import os
5
- import sys
6
- import warnings
7
-
8
- # from collections import namedtuple
9
- from contextlib import redirect_stdout
10
- from time import time
11
- from typing import Optional, Union, List, Dict, Tuple, Any, Callable
12
-
13
- # Third-party imports
14
- ## For stats and numeric operations
15
- import numpy as np
16
- import pandas as pd
17
- from scipy import stats
18
-
19
- # scikit-learn imports
20
- from sklearn.base import clone
21
- from sklearn.experimental import enable_iterative_imputer
22
- from sklearn.impute import IterativeImputer
23
- from sklearn.impute import SimpleImputer
24
- from sklearn.impute._base import _check_inputs_dtype
25
-
26
- ## For warnings
27
- from sklearn.exceptions import ConvergenceWarning
28
- from sklearn.utils._testing import ignore_warnings
29
-
30
- ## Required for IterativeImputer.fit_transform()
31
- from sklearn.utils import check_random_state, _safe_indexing, is_scalar_nan
32
- from sklearn.utils._mask import _get_mask
33
- from sklearn.utils.validation import FLOAT_DTYPES
34
- from sklearn.preprocessing import LabelEncoder
35
-
36
- # Custom function imports
37
- try:
38
- from .. import simple_imputers
39
- from ...utils.misc import get_processor_name
40
- from ...utils.misc import HiddenPrints
41
- from ...utils.misc import isnotebook
42
- except (ModuleNotFoundError, ValueError, ImportError):
43
- from pgsui.impute import simple_imputers
44
- from pgsui.utils.misc import get_processor_name
45
- from pgsui.utils.misc import HiddenPrints
46
- from pgsui.utils.misc import isnotebook
47
-
48
# Patch scikit-learn with scikit-learn-intelex when the CPU reports itself as
# Intel. If the extension cannot be imported or applied, report it and carry
# on with the stock scikit-learn implementation.
if get_processor_name().strip().startswith("Intel"):
    try:
        from sklearnex import patch_sklearn

        patch_sklearn(verbose=False)
    except (ImportError, TypeError):
        print(
            "Processor not compatible with scikit-learn-intelex; using "
            "default configuration"
        )

is_notebook = isnotebook()

# Choose the tqdm flavour: the notebook widget inside a notebook, tqdm.auto
# on Linux, and the plain console progress bar everywhere else.
if is_notebook:
    from tqdm.notebook import tqdm as progressbar
elif sys.platform in ("linux", "linux2"):
    from tqdm.auto import tqdm as progressbar
else:
    from tqdm import tqdm as progressbar
69
-
70
- # NOTE: Removed ImputeTriplets to save memory.
71
- # ImputerTriplet is there so that the missing values
72
- # can be predicted on an already-fit model using just the
73
- # transform method. I didn't need it, so I removed it
74
- # because it was saving thousands of fit estimator models into the object
75
-
76
- # _ImputerTripletAll = namedtuple(
77
- # '_ImputerTripletAll', ['feat_idx', 'neighbor_feat_idx', 'estimator'])
78
-
79
-
80
- class IterativeImputerFixedParams(IterativeImputer):
81
- """Overridden IterativeImputer methods.
82
-
83
- Herein, progress status updates, optimizations to save RAM, and several other improvements have been added. IterativeImputer is a multivariate imputer that estimates each feature from all the others. A strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion.Read more in the scikit-learn User Guide for IterativeImputer. scikit-learn versionadded: 0.21. NOTE: This estimator is still **experimental** for now: the predictions and the API might change without any deprecation cycle. To use it, you need to explicitly import ``enable_iterative_imputer``\.
84
-
85
- IterativeImputer is based on the R MICE (Multivariate Imputation by Chained Equationspackage) [3]_. See [4]_ for more information about multiple versus single imputations.
86
-
87
- >>> # explicitly require this experimental feature
88
- >>> from sklearn.experimental import enable_iterative_imputer
89
- >>>
90
- >>> # now you can import normally from sklearn.impute
91
- >>> from sklearn.impute import IterativeImputer
92
-
93
- Args:
94
- logfilepath (str): Path to the progress log file.
95
-
96
- clf_kwargs (Dict[str, Any]): A dictionary with the classifier keyword arguments.
97
-
98
- prefix (str): Prefix for output files.
99
-
100
- estimator (callable estimator object, optional): The estimator to use at each step of the round-robin imputation. If ``sample_posterior`` is True, the estimator must support ``return_std`` in its ``predict`` method. Defaults to BayesianRidge().
101
-
102
- clf_type (str, optional): Whether to run ```'classifier'``` or ``'regression'`` based imputation. Defaults to 'classifier'
103
-
104
- disable_progressbar (bool, optional): Whether or not to disable the tqdm progress bar. If True, disables the progress bar. If False, tqdm is used for the progress bar. This can be useful if you are running the imputation on an HPC cluster or are saving the standard output to a file. If True, progress updates will be printed to the screen every ``progress_update_percent`` iterations. Defaults to False.
105
-
106
- progress_update_percent (int, optional): How often to display progress updates (as a percentage) if ``disable_progressbar`` is True. If ``progress_update_frequency=10``\, then it displays progress updates every 10%. Defaults to 10.
107
-
108
- pops (List[Union[str, int]] or None): List of population IDs to be used with ImputeAlleleFreq if ``initial_strategy="populations"``\.
109
-
110
- missing_values (int or np.nan, optional): The placeholder for the missing values. All occurrences of ``missing_values`` will be imputed. For pandas dataframes with nullable integer dtypes with missing values, ``missing_values`` should be set to ``np.nan``\, since ``pd.NA`` will be converted to ``np.nan``\. Defaults to np.nan.
111
-
112
- Sample_posterior (bool, optional): Whether to sample from the (Gaussian) predictive posterior of the fitted estimator for each imputation. Estimator must support ``return_std`` in its ``predict`` method if set to ``True``\. Set to ``True`` if using ``IterativeImputer`` for multiple imputations. Defaults to False.
113
-
114
- max_iter (int, optional): Maximum number of imputation rounds to perform before returning the imputations computed during the final round. A round is a single imputation of each feature with missing values. The stopping criterion is met once ``max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol``\, where ``X_t`` is ``X`` at iteration ``t``\. Note that early stopping is only applied if ``sample_posterior=False``\. Defaults to 10.
115
-
116
- tol (float, optional): Tolerance of the stopping condition. Defaults to 1e-3.
117
-
118
- n_nearest_features (int, optional): Number of other features to use to estimate the missing values of each feature column. Nearness between features is measured using the absolute correlation coefficient between each feature pair (after initial imputation). To ensure coverage of features throughout the imputation process, the neighbor features are not necessarily nearest, but are drawn with probability proportional to correlation for each imputed target feature. Can provide significant speed-up when the number of features is huge. If ``None``\, all features will be used. Defaults to None.
119
-
120
- initial_strategy (str, optional): Which strategy to use to initialize the missing values. Same as the ``strategy`` parameter in :class:`~sklearn.impute.SimpleImputer` Valid values: "mean", "median", "most_frequent", "populations", "phylogeny", "mf", or "constant". Defaults to "mean".
121
-
122
- imputation_order (str, optional): The order in which the features will be imputed. Possible values: "ascending" (From features with fewest missing values to most), "descending" (From features with most missing values to fewest, "roman" (Left to right), "arabic" (Right to left), random" (A random order for each round). Defaults to "ascending".
123
-
124
- skip_complete (bool, optional): If ``True`` then features with missing values during ``transform`` that did not have any missing values during ``fit`` will be imputed with the initial imputation method only. Set to ``True`` if you have many features with no missing values at both ``fit`` and ``transform`` time to save compute. Defaults to False.
125
-
126
- min_value (float or array-like of shape (n_features,), optional): Minimum possible imputed value. Broadcast to shape (n_features,) if scalar. If array-like, expects shape (n_features,), one min value for each feature. The default is `-np.inf`...sklearn versionchanged:: 0.23 (Added support for array-like). Defaults to -np.inf.
127
-
128
- max_value (float or array-like of shape (n_features,), optional): Maximum possible imputed value. Broadcast to shape (n_features,) if scalar. If array-like, expects shape (n_features,), one max value for each feature..sklearn versionchanged:: 0.23 (Added support for array-like). Defaults to np.inf.
129
-
130
- verbose (int, optional): Verbosity flag, controls the debug messages that are issued as functions are evaluated. The higher, the more verbose. Can be 0, 1, or 2. Defaults to 0.
131
-
132
- random_state (int or RandomState instance, optional): The seed of the pseudo random number generator to use. Randomizes selection of estimator features if n_nearest_features is not None, the ``imputation_order`` if ``random``\, and the sampling from posterior if ``sample_posterior`` is True. Use an integer for determinism. Defaults to None.
133
-
134
- add_indicator (bool, optional): If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no missing values at fit/train time, the feature won't appear on the missing indicator even if there are missing values at transform/test time. Defaults to False.
135
-
136
- genotype_data (GenotypeData object, optional): GenotypeData object containing dictionary with keys=sampleIds and values=list of genotypes for the corresponding key. If using ``initial_strategy="phylogeny``\, then this object also needs contain the treefile and qmatrix objects. Defaults to None.
137
-
138
- str_encodings (dict(str: int), optional): Integer encodings used in STRUCTURE-formatted file. Should be a dictionary with keys=nucleotides and values=integer encodings. The missing data encoding should also be included. Argument is ignored if using a PHYLIP-formatted file. Defaults to {"A": 1, "C": 2, "G": 3, "T": 4, "N": -9}
139
-
140
- kwargs (Dict[str, Any]): For compatibility with grid search IterativeImputer.
141
-
142
- Attributes:
143
- initial_imputer_ (sklearn.impute.SimpleImputer): Imputer used to initialize the missing values.
144
-
145
- n_iter_ (int): Number of iteration rounds that occurred. Will be less than ``self.max_iter`` if early stopping criterion was reached.
146
-
147
- n_features_with_missing_ (int): Number of features with missing values.
148
-
149
- indicator_ (sklearn.impute.MissingIndicator): Indicator used to add binary indicators for missing values ``None`` if add_indicator is False.
150
-
151
- random_state_ (RandomState instance): RandomState instance that is generated either from a seed, the random number generator or by ``np.random``\.
152
-
153
- logfilepath (str): Path to status logfile.
154
-
155
- clf_kwargs (Dict[str, Any]): Keyword arguments for estimator.
156
-
157
- prefix (str): Prefix for output files.
158
-
159
- clf_type (str): Type of estimator, either "classifier" or "regressor".
160
-
161
- disable_progressbar (bool): Whether to disable the tqdm progress bar. If True, writes status updates to file instead of tqdm progress bar.
162
-
163
- progress_update_percent (float or None): Print feature progress update every ``progress_update_percent`` percent.
164
-
165
- pops (List[Union[str, int]]): List of population IDs of shape (n_samples,).
166
-
167
- estimator (estimator object): Estimator to impute data with.
168
-
169
- sample_posterior (bool): Whether to use the sample_posterior option. This overridden class does not currently support sample_posterior.
170
-
171
- max_iter (int): The maximum number of iterations to run.
172
-
173
- tol (float): Convergence criteria.
174
-
175
- n_nearest_features (int): Number of nearest features to impute target with.
176
-
177
- initial_strategy (str): Strategy to use with SimpleImputer for training data.
178
-
179
- imputation_order (str): Order to impute.
180
-
181
- skip_complete (bool): Whether to skip features with no missing data.
182
-
183
- min_value (int or float): Minimum value of imputed data.
184
-
185
- max_value (int or float): Maximum value of imputed data.
186
-
187
- verbose (int): Verbosity level.
188
-
189
- genotype_data (GenotypeData object): GenotypeData object.
190
-
191
- str_encodings (Dict[str, int]): Dictionary with integer encodings for converting from STRUCTURE-formatted file to IUPAC nucleotides.
192
-
193
- See Also:
194
- SimpleImputer : Univariate imputation of missing values.
195
-
196
- Examples:
197
- >>> import numpy as np
198
- >>> from sklearn.experimental import enable_iterative_imputer
199
- >>> from sklearn.impute import IterativeImputer
200
- >>> imp_mean = IterativeImputer(random_state=0)
201
- >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
202
- >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
203
- >>> imp_mean.transform(X)
204
- array([[ 6.9584..., 2. , 3. ],
205
- [ 4. , 2.6000..., 6. ],
206
- [10. , 4.9999..., 9. ]])
207
-
208
- Notes:
209
- To support imputation in inductive mode we store each feature's estimator during the ``fit`` phase, and predict without refitting (in order) during the ``transform`` phase. Features which contain all missing values at ``fit`` are discarded upon ``transform``\.
210
-
211
- NOTE: Inductive mode support was removed herein.
212
-
213
- References:
214
- .. [3] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). mice: Multivariate Imputation by Chained Equations in R. Journal of Statistical Software 45: 1-67.
215
-
216
- .. [4] S. F. Buck, (1960). A Method of Estimation of Missing Values in Multivariate Data Suitable for use with an Electronic Computer. Journal of the Royal Statistical Society 22(2): 302-306.
217
- """
218
-
219
def __init__(
    self,
    logfilepath: str,
    clf_kwargs: Dict[str, Any],
    *,
    estimator: Callable = None,
    clf_type: str = "classifier",
    disable_progressbar: bool = False,
    progress_update_percent: Optional[int] = None,
    pops: Optional[List[Union[str, int]]] = None,
    missing_values: Union[float, int] = np.nan,
    sample_posterior: bool = False,
    max_iter: int = 10,
    tol: float = 1e-3,
    n_nearest_features: Optional[int] = None,
    initial_strategy: str = "mean",
    imputation_order: str = "ascending",
    skip_complete: bool = False,
    min_value: Union[int, float] = -np.inf,
    max_value: Union[int, float] = np.inf,
    verbose: int = 0,
    random_state: Optional[int] = None,
    add_indicator: bool = False,
    genotype_data: Optional[Any] = None,
    str_encodings: Optional[Dict[str, int]] = None,
    prefix="imputer",
    **kwargs,
) -> None:
    """Store the imputer configuration on the instance.

    The options understood by the parent ``IterativeImputer`` are passed
    to its constructor. Every option except ``add_indicator`` is then
    also assigned directly on ``self``; ``add_indicator`` is only
    forwarded to the parent.

    Args:
        logfilepath (str): Path to the progress log file.

        clf_kwargs (Dict[str, Any]): Keyword arguments for the estimator.

        estimator (Callable, optional): Estimator used at each imputation step. Defaults to None.

        clf_type (str, optional): Estimator type. Defaults to "classifier".

        disable_progressbar (bool, optional): Whether to disable the tqdm progress bar. Defaults to False.

        progress_update_percent (int, optional): Progress update interval, as a percentage. Defaults to None.

        pops (List[Union[str, int]], optional): Population IDs. Defaults to None.

        genotype_data (Any, optional): GenotypeData object. Defaults to None.

        str_encodings (Dict[str, int], optional): Integer encodings for STRUCTURE-formatted input. Defaults to None.

        prefix (str, optional): Prefix for output files. Defaults to "imputer".

        **kwargs: Accepted but not used by this constructor.

    The remaining keyword arguments (``missing_values``, ``sample_posterior``, ``max_iter``, ``tol``, ``n_nearest_features``, ``initial_strategy``, ``imputation_order``, ``skip_complete``, ``min_value``, ``max_value``, ``verbose``, ``random_state``, ``add_indicator``) are forwarded unchanged to ``IterativeImputer.__init__``.
    """
    # Let the parent class record the options it knows about.
    super().__init__(
        estimator=estimator,
        missing_values=missing_values,
        sample_posterior=sample_posterior,
        max_iter=max_iter,
        tol=tol,
        n_nearest_features=n_nearest_features,
        initial_strategy=initial_strategy,
        imputation_order=imputation_order,
        skip_complete=skip_complete,
        min_value=min_value,
        max_value=max_value,
        verbose=verbose,
        random_state=random_state,
        add_indicator=add_indicator,
    )

    # Options that only this subclass defines.
    self.logfilepath = logfilepath
    self.clf_kwargs = clf_kwargs
    self.prefix = prefix
    self.clf_type = clf_type
    self.disable_progressbar = disable_progressbar
    self.progress_update_percent = progress_update_percent
    self.pops = pops
    self.genotype_data = genotype_data
    self.str_encodings = str_encodings

    # Options shared with the parent, assigned again on the instance.
    self.estimator = estimator
    self.missing_values = missing_values
    self.sample_posterior = sample_posterior
    self.max_iter = max_iter
    self.tol = tol
    self.n_nearest_features = n_nearest_features
    self.initial_strategy = initial_strategy
    self.imputation_order = imputation_order
    self.skip_complete = skip_complete
    self.min_value = min_value
    self.max_value = max_value
    self.verbose = verbose
    self.random_state = random_state
286
-
287
def _initial_imputation(
    self, X: np.ndarray, cols_to_keep: np.ndarray, in_fit: bool = False
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Validate X and fill its missing values with the initial strategy.

    The imputer used depends on ``self.initial_strategy``:
    ``"populations"`` uses ``ImputeAlleleFreq``, ``"phylogeny"`` uses
    ``ImputePhylo``, ``"mf"`` uses ``ImputeMF``, and any other value is
    passed to ``SimpleImputer`` as its ``strategy``. The imputer is
    stored in ``self.initial_imputer_``.

    Args:
        X (numpy.ndarray): Input data of shape (n_samples, n_features). Negative entries are converted to NaN after validation.

        cols_to_keep (numpy.ndarray): Column indices forwarded to ``ImputePhylo`` as ``column_subset``. Only used if ``initial_strategy == "phylogeny"``.

        in_fit (bool, optional): Passed to ``_validate_data`` as ``reset``. Defaults to False.

    Returns:
        Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]:
            ``Xt``: The validated input. For ``"phylogeny"`` and the ``SimpleImputer`` strategies it is restricted to the valid columns; for ``"populations"`` and ``"mf"`` it is a full copy.

            ``X_filled``: The data with the initial imputations applied.

            ``mask_missing_values``: Missing-value indicator matrix, restricted to the same columns as ``Xt`` whenever ``Xt`` is restricted.

            ``X_missing_mask``: Missing-value indicator matrix for all columns of the validated input.

    Raises:
        AttributeError: If ``initial_strategy == "phylogeny"`` and the GenotypeData object lacks a guide tree or both Q-matrix attributes.
    """
    force_all_finite = (
        "allow-nan" if is_scalar_nan(self.missing_values) else True
    )

    X = self._validate_data(
        X,
        dtype=FLOAT_DTYPES,
        order="F",
        reset=in_fit,
        force_all_finite=force_all_finite,
    )

    # Negative entries are converted to NaN before the mask is computed.
    X[X < 0] = np.nan

    _check_inputs_dtype(X, self.missing_values)

    X_missing_mask = _get_mask(X, self.missing_values)
    mask_missing_values = X_missing_mask.copy()

    strategy = self.initial_strategy

    if strategy == "populations":
        self.initial_imputer_ = simple_imputers.ImputeAlleleFreq(
            self.genotype_data,
            gt=np.nan_to_num(X, nan=-9).tolist(),
            pops=self.pops,
            by_populations=True,
            missing=-9,
            verbose=False,
            iterative_mode=True,
            validation_mode=True,
        )

        X_filled = np.array(self.initial_imputer_.imputed)
        Xt = X.copy()

    elif strategy == "phylogeny":
        gd = self.genotype_data
        no_qmatrix = gd.qmatrix is None and gd.qmatrix_iqtree is None
        if no_qmatrix or gd.guidetree is None:
            raise AttributeError(
                "GenotypeData object was not initialized with "
                "qmatrix/ qmatrix_iqtree or guidetree arguments, "
                "but initial_strategy == 'phylogeny'"
            )

        self.initial_imputer_ = simple_imputers.ImputePhylo(
            genotype_data=self.genotype_data,
            str_encodings=self.str_encodings,
            write_output=False,
            disable_progressbar=True,
            column_subset=cols_to_keep,
            validation_mode=True,
        )

        X_filled = self.initial_imputer_.imputed.to_numpy()
        valid_sites = self.initial_imputer_.valid_sites

        # Keep only the columns whose entry in valid_sites is not NaN.
        valid_mask = np.flatnonzero(np.logical_not(np.isnan(valid_sites)))

        Xt = X[:, valid_mask]
        mask_missing_values = mask_missing_values[:, valid_mask]

    elif strategy == "mf":
        self.initial_imputer_ = simple_imputers.ImputeMF(
            self.genotype_data,
            gt=np.nan_to_num(X, nan=-9),
            missing=-9,
            verbose=False,
            validation_mode=True,
        )

        X_filled = np.array(self.initial_imputer_.imputed)
        Xt = X.copy()

    else:
        if self.initial_imputer_ is None:
            self.initial_imputer_ = SimpleImputer(
                missing_values=self.missing_values,
                strategy=self.initial_strategy,
            )
            X_filled = self.initial_imputer_.fit_transform(X)
        else:
            X_filled = self.initial_imputer_.transform(X)

        # Keep only the columns for which SimpleImputer computed a
        # statistic (its statistics_ entry is not NaN).
        valid_mask = np.flatnonzero(
            np.logical_not(np.isnan(self.initial_imputer_.statistics_))
        )

        Xt = X[:, valid_mask]
        mask_missing_values = mask_missing_values[:, valid_mask]

    return Xt, X_filled, mask_missing_values, X_missing_mask
436
-
437
@ignore_warnings(category=UserWarning)
def _impute_one_feature(
    self,
    X_filled: np.ndarray,
    mask_missing_values: np.ndarray,
    feat_idx: int,
    neighbor_feat_idx: np.ndarray,
    estimator: Optional[Any] = None,
    fit_mode: bool = True,
) -> np.ndarray:
    """Impute a single feature from the others provided.

    Predicts the missing values of one feature using the current
    estimates of the neighbor features. ``sample_posterior`` is not
    supported: if it is set and there are values to predict, a
    ``NotImplementedError`` is raised.

    Args:
        X_filled (numpy.ndarray): Input data with the most recent imputations. Updated in place.

        mask_missing_values (numpy.ndarray): Input data's missing indicator matrix.

        feat_idx (int): Index of the feature currently being imputed.

        neighbor_feat_idx (numpy.ndarray): Indices of the features to be used in imputing ``feat_idx``.

        estimator (object, optional): The estimator to use at this step. If None, it is cloned from ``self._estimator``. Defaults to None.

        fit_mode (bool, optional): Whether to fit and predict with the estimator or just predict. Defaults to True.

    Returns:
        numpy.ndarray: ``X_filled`` with ``X_filled[missing_row_mask, feat_idx]`` updated.

    Raises:
        ValueError: If ``fit_mode`` is False and no estimator is given, or if fitting fails for a reason other than the label-encoding error handled below while there are values to predict.
        NotImplementedError: If ``self.sample_posterior`` is True and there are values to predict.
    """
    if estimator is None and fit_mode is False:
        raise ValueError(
            "If fit_mode is False, then an already-fitted "
            "estimator should be passed in."
        )

    if estimator is None:
        estimator = clone(self._estimator)

    missing_row_mask = mask_missing_values[:, feat_idx]
    has_missing = np.sum(missing_row_mask) != 0

    # Bound up front so the decode step below is safe when fit_mode is
    # False or when fitting needed no label encoding.
    # NOTE(review): with fit_mode=False no encoder is available, so a
    # pre-fitted estimator trained on encoded labels cannot be decoded
    # here -- confirm callers do not rely on that.
    le = None

    if fit_mode:
        observed_rows = ~missing_row_mask
        X_train = _safe_indexing(
            X_filled[:, neighbor_feat_idx], observed_rows
        )
        y_train = _safe_indexing(X_filled[:, feat_idx], observed_rows)

        try:
            estimator.fit(X_train, y_train)
        except ValueError as err:
            # Happens in newer versions of XGBClassifier.
            if str(err).startswith(
                "Invalid classes inferred from unique values of `y`"
            ):
                le = LabelEncoder()
                estimator.fit(X_train, le.fit_transform(y_train))
            elif has_missing:
                # Any other fit failure would leave the estimator
                # unusable for the prediction below, so surface it.
                # When nothing needs predicting the failure stays
                # silent, as it always has on that path.
                raise

        del X_train, y_train

    # if no missing values, don't predict
    if not has_missing:
        return X_filled

    X_test = _safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask)

    if self.sample_posterior:
        raise NotImplementedError(
            "sample_posterior is not currently supported. "
            "Please set sample_posterior to False"
        )

    imputed_values = estimator.predict(X_test)

    if le is not None:
        imputed_values = le.inverse_transform(imputed_values)

    imputed_values = np.clip(
        imputed_values,
        self._min_value[feat_idx],
        self._max_value[feat_idx],
    )

    # update the feature
    X_filled[missing_row_mask, feat_idx] = imputed_values

    # Drop the per-feature temporaries before collecting garbage.
    del estimator, X_test, imputed_values
    gc.collect()

    return X_filled
534
-
535
@ignore_warnings(category=UserWarning)
def fit_transform(
    self,
    X: np.ndarray,
    valid_cols: Optional[np.ndarray] = None,
    y: None = None,
) -> np.ndarray:
    """Fit the imputer on ``X`` and return the imputed data.

    An initial imputation is obtained from ``self._initial_imputation``;
    every feature selected by ``self._get_ordered_idx`` is then re-imputed
    round-robin with a fresh clone of the estimator (via
    ``self._impute_one_feature``) for up to ``max_iter`` rounds, or until
    the largest absolute change between two rounds falls below the scaled
    tolerance.

    Args:
        X (array-like, shape (n_samples, n_features)): Input data.

        valid_cols (numpy.ndarray, optional): Array with column indices to
            keep; forwarded to ``self._initial_imputation``. Defaults to
            None.

        y (None): Ignored. Here for compatibility with other sklearn
            classes.

    Returns:
        On the full iterative path, a 3-tuple ``(Xt, None, None)`` where
        ``Xt`` is the imputed data cast to ``int`` and passed through
        ``_concatenate_indicator``. On the early-exit paths
        (``max_iter == 0``, every entry flagged in the missing mask, or a
        single feature) only the initially imputed array is returned.

    Raises:
        ValueError: ``max_iter`` is negative.
        ValueError: ``tol`` is negative.
        ValueError: One or more features has min_value >= max_value.

    Warns:
        ConvergenceWarning: All ``max_iter`` rounds ran without reaching
            the early stopping criterion (only when ``sample_posterior``
            is False).
    """
    # NOTE(review): the early-exit paths return a bare array while the main
    # path returns a 3-tuple (and casts to int). Left unchanged here so that
    # existing callers keep working; confirm which shape callers expect.

    def _log(message: str) -> None:
        # Append one line to the progress logfile. Writing through
        # ``print(..., file=...)`` avoids swapping the global ``sys.stdout``.
        with open(self.logfilepath, "a") as fout:
            print(message, file=fout)

    self.random_state_ = getattr(
        self, "random_state_", check_random_state(self.random_state)
    )

    if self.max_iter < 0:
        raise ValueError(
            f"'max_iter' should be a positive integer. Got {self.max_iter} instead."
        )

    if self.tol < 0:
        raise ValueError(
            f"'tol' should be a non-negative float. Got {self.tol} instead"
        )

    self._estimator = clone(self.estimator)

    self.initial_imputer_ = None

    # X is the input data subset to only valid features (not nan)
    # Xt is the data after the initial imputation
    # mask_missing_values is the missing indicator matrix
    # complete_mask is the input data's mask matrix
    X, Xt, mask_missing_values, complete_mask = self._initial_imputation(
        X, valid_cols, in_fit=True
    )

    super(IterativeImputer, self)._fit_indicator(complete_mask)
    X_indicator = super(IterativeImputer, self)._transform_indicator(
        complete_mask
    )

    # Nothing to iterate on: no rounds requested, or every entry is flagged
    # in the missing mask.
    if self.max_iter == 0 or np.all(mask_missing_values):
        self.n_iter_ = 0
        return super(IterativeImputer, self)._concatenate_indicator(
            Xt, X_indicator
        )

    # Edge case: a single feature has no neighbors to predict from, so the
    # initial imputation is returned.
    if Xt.shape[1] == 1:
        self.n_iter_ = 0
        return super(IterativeImputer, self)._concatenate_indicator(
            Xt, X_indicator
        )

    self._min_value = self._validate_limit(self.min_value, "min", X.shape[1])
    self._max_value = self._validate_limit(self.max_value, "max", X.shape[1])

    if not np.all(np.greater(self._max_value, self._min_value)):
        raise ValueError("One (or more) features have min_value >= max_value.")

    # Order in which to impute.
    # Note this is probably too slow for large feature data (d > 100000)
    # and a better way would be good.
    # see: https://goo.gl/KyCNwj and subsequent comments
    ordered_idx = self._get_ordered_idx(mask_missing_values)
    total_features = len(ordered_idx)

    self.n_features_with_missing_ = total_features

    abs_corr_mat = self._get_abs_corr_mat(Xt)

    n_features = Xt.shape[1]

    if self.verbose > 0:
        print(f"[IterativeImputer] Completing matrix with shape {X.shape}")
    start_t = time()

    if not self.sample_posterior:
        Xt_previous = Xt.copy()
        # Scale the tolerance by the largest observed absolute value.
        normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values]))

    total_iter = self.max_iter

    ###########################################
    ### Iteration Start
    ###########################################
    for self.n_iter_ in progressbar(
        range(1, total_iter + 1),
        desc="Iteration: ",
        disable=self.disable_progressbar,
    ):
        if self.imputation_order == "random":
            # Draw a new feature order for every round.
            ordered_idx = self._get_ordered_idx(mask_missing_values)

        if self.disable_progressbar:
            # With the progress bar disabled, progress goes to the logfile.
            _log(
                f"Iteration Progress: "
                f"{self.n_iter_}/{self.max_iter} "
                f"({int((self.n_iter_ / total_iter) * 100)}%)"
            )

        if self.progress_update_percent is not None:
            # Next percentage threshold at which feature progress is logged;
            # reset at the start of every round.
            print_perc_interval = self.progress_update_percent

        ##########################
        ### Feature Start
        ##########################
        for i, feat_idx in enumerate(
            progressbar(
                ordered_idx,
                desc="Feature: ",
                leave=False,
                position=1,
                disable=self.disable_progressbar,
            ),
            start=1,
        ):
            neighbor_feat_idx = self._get_neighbor_feat_idx(
                n_features, feat_idx, abs_corr_mat
            )

            # estimator=None makes _impute_one_feature clone and fit a
            # fresh estimator for this feature.
            Xt = self._impute_one_feature(
                Xt,
                mask_missing_values,
                feat_idx,
                neighbor_feat_idx,
                estimator=None,
                fit_mode=True,
            )

            # NOTE: The fitted estimators are intentionally not stored in
            # self.imputation_sequence_ to save RAM. They would only be
            # needed by transform, which is not used in this application.

            # Only log feature updates at each progress_update_percent
            # interval.
            if (
                self.progress_update_percent is not None
                and self.disable_progressbar
            ):
                current_perc = math.ceil((i / total_features) * 100)

                if current_perc >= print_perc_interval:
                    _log(
                        f"Feature Progress (Iteration "
                        f"{self.n_iter_}/{self.max_iter}): "
                        f"{i}/{total_features} ({current_perc}"
                        f"%)"
                    )

                    if i == total_features:
                        # Blank line after the last feature of the round.
                        _log("")

                    # Advance the threshold past the current percentage so
                    # each interval is reported at most once.
                    while print_perc_interval <= current_perc:
                        print_perc_interval += self.progress_update_percent

        if self.verbose > 1:
            print(
                f"[IterativeImputer] Ending imputation round "
                f"{self.n_iter_}/{self.max_iter}, "
                f"elapsed time {(time() - start_t):0.2f}"
            )

        if not self.sample_posterior:
            # Largest absolute element-wise change since the previous round.
            inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None)

            if self.verbose > 0:
                print(
                    f"[IterativeImputer] Change: {inf_norm}, "
                    f"scaled tolerance: {normalized_tol} "
                )

            if inf_norm < normalized_tol:
                # Early stopping criterion has been reached.
                if self.disable_progressbar:
                    _log("[IterativeImputer] Early stopping criterion reached.")
                else:
                    print("[IterativeImputer] Early stopping criterion reached.")

                break
            Xt_previous = Xt.copy()

    else:
        # for-else: reached only when the loop was never broken out of.
        if not self.sample_posterior:
            warnings.warn(
                "[IterativeImputer] Early stopping criterion not reached.",
                ConvergenceWarning,
            )

    # Restore the originally observed values, then cast to integers.
    Xt[~mask_missing_values] = X[~mask_missing_values]
    Xt = Xt.astype(int)

    return (
        super(IterativeImputer, self)._concatenate_indicator(Xt, X_indicator),
        None,
        None,
    )