cpgtools 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cpgtools might be problematic. Click here for more details.

Files changed (94) hide show
  1. cpgmodule/_version.py +1 -0
  2. cpgmodule/utils.py +35 -0
  3. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
  4. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
  5. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +1 -2
  6. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
  7. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
  8. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
  9. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
  10. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
  11. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
  12. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
  13. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
  14. cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
  15. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
  16. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
  17. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
  18. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
  19. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
  20. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
  21. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
  22. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
  23. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
  24. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
  25. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
  26. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
  27. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
  28. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
  29. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
  30. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
  31. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +1 -1
  32. cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
  33. cpgtools-2.0.2.dist-info/LICENSE +19 -0
  34. cpgtools-2.0.2.dist-info/METADATA +76 -0
  35. cpgtools-2.0.2.dist-info/RECORD +82 -0
  36. {cpgtools-2.0.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
  37. cpgtools-2.0.2.dist-info/top_level.txt +3 -0
  38. impyute/__init__.py +3 -0
  39. impyute/contrib/__init__.py +7 -0
  40. impyute/contrib/compare.py +69 -0
  41. impyute/contrib/count_missing.py +30 -0
  42. impyute/contrib/describe.py +63 -0
  43. impyute/cs/__init__.py +11 -0
  44. impyute/cs/buck_iterative.py +82 -0
  45. impyute/cs/central_tendency.py +84 -0
  46. impyute/cs/em.py +52 -0
  47. impyute/cs/fast_knn.py +130 -0
  48. impyute/cs/random.py +27 -0
  49. impyute/dataset/__init__.py +6 -0
  50. impyute/dataset/base.py +137 -0
  51. impyute/dataset/corrupt.py +55 -0
  52. impyute/deletion/__init__.py +5 -0
  53. impyute/deletion/complete_case.py +21 -0
  54. impyute/ops/__init__.py +12 -0
  55. impyute/ops/error.py +9 -0
  56. impyute/ops/inverse_distance_weighting.py +31 -0
  57. impyute/ops/matrix.py +47 -0
  58. impyute/ops/testing.py +20 -0
  59. impyute/ops/util.py +76 -0
  60. impyute/ops/wrapper.py +179 -0
  61. impyute/ts/__init__.py +6 -0
  62. impyute/ts/locf.py +57 -0
  63. impyute/ts/moving_window.py +128 -0
  64. missingpy/__init__.py +4 -0
  65. missingpy/knnimpute.py +328 -0
  66. missingpy/missforest.py +556 -0
  67. missingpy/pairwise_external.py +315 -0
  68. missingpy/tests/__init__.py +0 -0
  69. missingpy/tests/test_knnimpute.py +605 -0
  70. missingpy/tests/test_missforest.py +409 -0
  71. missingpy/utils.py +124 -0
  72. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  73. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  74. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  75. cpgmodule/data/GA_Bohlin.pkl +0 -0
  76. cpgmodule/data/GA_Haftorn.pkl +0 -0
  77. cpgmodule/data/GA_Knight.pkl +0 -0
  78. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  79. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  80. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  81. cpgmodule/data/GA_Mayne.pkl +0 -0
  82. cpgmodule/data/Hannum.pkl +0 -0
  83. cpgmodule/data/Horvath_2013.pkl +0 -0
  84. cpgmodule/data/Horvath_2018.pkl +0 -0
  85. cpgmodule/data/Levine.pkl +0 -0
  86. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  87. cpgmodule/data/Ped_McEwen.pkl +0 -0
  88. cpgmodule/data/Ped_Wu.pkl +0 -0
  89. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  90. cpgmodule/data/Zhang_EN.pkl +0 -0
  91. cpgtools-2.0.0.dist-info/LICENSE.txt +0 -674
  92. cpgtools-2.0.0.dist-info/METADATA +0 -28
  93. cpgtools-2.0.0.dist-info/RECORD +0 -64
  94. cpgtools-2.0.0.dist-info/top_level.txt +0 -2
@@ -0,0 +1,128 @@
1
+ import numpy as np
2
+ from impyute.ops import matrix
3
+ from impyute.ops import wrapper
4
+ # pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-branches, broad-except, len-as-condition
5
+
6
@wrapper.wrappers
@wrapper.checks
def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
                  inplace=False):
    """ Interpolate the missing values based on nearby values.

    For example, with an array like this:

        array([[-1.24940, -1.38673, -0.03214945,  0.08255145, -0.007415],
               [ 2.14662,  0.32758 , -0.82601414,  1.78124027,  0.873998],
               [-0.41400, -0.977629,         nan, -1.39255344,  1.680435],
               [ 0.40975,  1.067599,  0.29152388, -1.70160145, -0.565226],
               [-0.54592, -1.126187,  2.04004377,  0.16664863, -0.010677]])

    Using a `k` or window size of 3. The one missing value would be set
    to -1.18509122. The window operates on the horizontal axis.

    Usage
    -----

    The parameters default the function to a moving mean. You may want to change
    the default window size:

        moving_window(data, wsize=10)

    To only look at past data (null value is at the rightmost index in the window):

        moving_window(data, nindex=-1)

    To use a custom function:

        moving_window(data, func=np.median)

    You can also do something like take 1.5x the max of previous values in the window:

        moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    nindex: int
        Null index. Index of the null value inside the moving average window.
        Use cases: Say you wanted to make value skewed toward the left or right
        side. 0 would only take the average of values from the right and -1
        would only take the average of values from the left
    wsize: int
        Window size. Size of the moving average window/area of values being used
        for each local imputation. This number includes the missing value.
    errors: {"raise", "coerce", "ignore"}
        Errors will occur with the indexing of the windows - for example if there
        is a nan at data[x][0] and `nindex` is set to -1 or there is a nan at
        data[x][-1] and `nindex` is set to 0. `"raise"` will raise an error,
        `"coerce"` will try again using an nindex set to the middle and `"ignore"`
        will just leave it as a nan.
    func: callable
        Aggregate applied to the non-null values of the window; its return
        value replaces the missing cell. Defaults to `np.mean`.
    inplace: {True, False}
        Whether to return a copy or run on the passed-in array

    Returns
    -------
    numpy.ndarray
        Imputed data.

    Raises
    ------
    Exception
        If `errors` is `"ignore"` (not implemented), or re-raised from `func`
        when `errors` is `"raise"`.
    AssertionError
        If `wsize` is even while `nindex` is unset, or `nindex >= wsize`.

    """
    if errors == "ignore":
        raise Exception("`errors` value `ignore` not implemented yet. Sorry!")

    if not inplace:
        data = data.copy()

    if nindex is None: # If using equal window side lengths
        assert wsize % 2 == 1, "The parameter `wsize` should not be even "\
        "if the value `nindex` is not set since it defaults to the midpoint "\
        "and an even `wsize` makes the midpoint ambiguous"
        wside_left = wsize // 2
        wside_right = wsize // 2
    else: # If using custom window side lengths
        assert nindex < wsize, "The null index must be smaller than the window size"
        if nindex == -1:
            wside_left = wsize - 1
            wside_right = 0
        else:
            wside_left = nindex
            wside_right = wsize - nindex - 1

    # Keep sweeping until a full pass fills no further cells: values imputed
    # in one pass can make previously all-null windows usable in the next.
    while True:
        nan_xy = matrix.nan_indices(data)
        n_nan_prev = len(nan_xy)
        for x_i, y_i in nan_xy:
            window_not_null = _window_not_null(
                data, x_i, y_i, wside_left, wside_right)

            if len(window_not_null) > 0:
                try:
                    data[x_i, y_i] = func(window_not_null)
                    continue
                except Exception:
                    if errors == "raise":
                        # Bare raise keeps the original traceback intact.
                        raise

            if errors == "coerce":
                # If either the window has a length of 0 or the aggregate function fails somehow,
                # do a fallback of just trying the best we can by using it as the middle and trying
                # to recalculate. Use temporary wside_left/wside_right, for only the calculation of
                # this specific problematic value
                wside_tmp = wsize // 2
                window_not_null = _window_not_null(
                    data, x_i, y_i, wside_tmp, wside_tmp)
                try:
                    data[x_i, y_i] = func(window_not_null)
                except Exception as e:
                    print("Exception:", e)
        if n_nan_prev == len(matrix.nan_indices(data)):
            break

    return data


def _window_not_null(data, x_i, y_i, wside_left, wside_right):
    """Return the non-NaN values of row `x_i` in the window around column `y_i`."""
    left_i = max(0, y_i - wside_left)
    # The window slides along a row, so it must be clamped to the number of
    # columns (data.shape[1]); len(data) is the number of rows and wrongly
    # truncates the window whenever the matrix has fewer rows than columns.
    right_i = min(data.shape[1], y_i + wside_right + 1)
    window = data[x_i, left_i:right_i]
    return window[~np.isnan(window)]
missingpy/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ #from .knnimpute import KNNImputer
2
+ from .missforest import MissForest
3
+
4
+ __all__ = ['MissForest']
missingpy/knnimpute.py ADDED
@@ -0,0 +1,328 @@
1
+ """KNN Imputer for Missing Data"""
2
+ # Author: Ashim Bhattarai
3
+ # License: GNU General Public License v3 (GPLv3)
4
+
5
+ import warnings
6
+
7
+ import numpy as np
8
+
9
+ from sklearn.base import BaseEstimator, TransformerMixin
10
+ from sklearn.utils import check_array
11
+ from sklearn.utils.validation import check_is_fitted
12
+ from sklearn.utils.validation import FLOAT_DTYPES
13
+ from sklearn.neighbors.base import _check_weights
14
+ from sklearn.neighbors.base import _get_weights
15
+
16
+ from .pairwise_external import pairwise_distances
17
+ from .pairwise_external import _get_mask
18
+ from .pairwise_external import _MASKED_METRICS
19
+
20
+ __all__ = [
21
+ 'KNNImputer',
22
+ ]
23
+
24
+
25
class KNNImputer(BaseEstimator, TransformerMixin):
    """Imputation for completing missing values using k-Nearest Neighbors.

    Each sample's missing values are imputed using values from ``n_neighbors``
    nearest neighbors found in the training set. Each missing feature is then
    imputed as the average, either weighted or unweighted, of these neighbors.
    Note that if a sample has more than one feature missing, then the
    neighbors for that sample can be different depending on the particular
    feature being imputed. Finally, where the number of donor neighbors is
    less than ``n_neighbors``, the training set average for that feature is
    used during imputation.

    Parameters
    ----------
    missing_values : integer or "NaN", optional (default = "NaN")
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For missing values encoded as
        ``np.nan``, use the string value "NaN".

    n_neighbors : int, optional (default = 5)
        Number of neighboring samples to use for imputation.

    weights : str or callable, optional (default = "uniform")
        Weight function used in prediction.  Possible values:

        - 'uniform' : uniform weights.  All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          in this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    metric : str or callable, optional (default = "masked_euclidean")
        Distance metric for searching neighbors. Possible values:
        - 'masked_euclidean'
        - [callable] : a user-defined function which conforms to the
        definition of _pairwise_callable(X, Y, metric, **kwds). In other
        words, the function accepts two arrays, X and Y, and a
        ``missing_values`` keyword in **kwds and returns a scalar distance
        value.

    row_max_missing : float, optional (default = 0.5)
        The maximum fraction of columns (i.e. features) that can be missing
        before the sample is excluded from nearest neighbor imputation. It
        means that such rows will not be considered a potential donor in
        ``fit()``, and in ``transform()`` their missing feature values will be
        imputed to be the column mean for the entire dataset.

    col_max_missing : float, optional (default = 0.8)
        The maximum fraction of rows (or samples) that can be missing
        for any feature beyond which an error is raised.

    copy : boolean, optional (default = True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, if metric is
        "masked_euclidean" and copy=False then missing_values in the
        input matrix X will be overwritten with zeros.

    Attributes
    ----------
    statistics_ : 1-D array of length {n_features}
        The 1-D array contains the mean of each feature calculated using
        observed (i.e. non-missing) values. This is used for imputing
        missing values in samples that are either excluded from nearest
        neighbors search because they have too many ( > row_max_missing)
        missing features or because all of the sample's k-nearest neighbors
        (i.e., the potential donors) also have the relevant feature value
        missing.

    References
    ----------
    * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
      Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
      value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
      no. 6, 2001 Pages 520-525.

    Examples
    --------
    >>> from missingpy import KNNImputer
    >>> nan = float("NaN")
    >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
    >>> imputer = KNNImputer(n_neighbors=2, weights="uniform")
    >>> imputer.fit_transform(X)
    array([[1. , 2. , 4. ],
           [3. , 4. , 3. ],
           [5.5, 6. , 5. ],
           [8. , 8. , 7. ]])
    """

    def __init__(self, missing_values="NaN", n_neighbors=5,
                 weights="uniform", metric="masked_euclidean",
                 row_max_missing=0.5, col_max_missing=0.8, copy=True):

        self.missing_values = missing_values
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.row_max_missing = row_max_missing
        self.col_max_missing = col_max_missing
        self.copy = copy

    def _impute(self, dist, X, fitted_X, mask, mask_fx):
        """Helper function to find and impute missing values

        Parameters
        ----------
        dist : array indexed as [row of X, row of fitted_X]
            Pairwise distances between the rows of ``X`` and ``fitted_X``.
        X : array
            Data being imputed; missing cells are overwritten in place.
        fitted_X : array
            Donor samples retained by ``fit()``.
        mask : boolean array, same shape as X
            True where ``X`` is missing.
        mask_fx : boolean array, same shape as fitted_X
            True where ``fitted_X`` is missing.

        Returns
        -------
        X : array
            The same array object, with missing cells filled.
        """

        # For each column, find and impute
        _, n_cols_X = X.shape
        for c in range(n_cols_X):
            if not np.any(mask[:, c], axis=0):
                continue

            # Row index for receivers and potential donors (pdonors)
            receivers_row_idx = np.where(mask[:, c])[0]
            pdonors_row_idx = np.where(~mask_fx[:, c])[0]

            # Impute using column mean if n_neighbors are not available
            if len(pdonors_row_idx) < self.n_neighbors:
                warnings.warn("Insufficient number of neighbors! "
                              "Filling in column mean.")
                X[receivers_row_idx, c] = self.statistics_[c]
                continue

            # Get distance from potential donors
            dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx]
            dist_pdonors = dist_pdonors.reshape(-1,
                                                len(pdonors_row_idx))

            # Argpartition to separate actual donors from the rest
            pdonors_idx = np.argpartition(
                dist_pdonors, self.n_neighbors - 1, axis=1)

            # Get final donors row index from pdonors
            donors_idx = pdonors_idx[:, :self.n_neighbors]

            # Get weights or None
            dist_pdonors_rows = np.arange(len(donors_idx))[:, None]
            weight_matrix = _get_weights(
                dist_pdonors[
                    dist_pdonors_rows, donors_idx], self.weights)
            donor_row_idx_ravel = donors_idx.ravel()

            # Retrieve donor values and calculate kNN score
            fitted_X_temp = fitted_X[pdonors_row_idx]
            donors = fitted_X_temp[donor_row_idx_ravel, c].reshape(
                (-1, self.n_neighbors))
            donors_mask = _get_mask(donors, self.missing_values)
            donors = np.ma.array(donors, mask=donors_mask)

            # Final imputation
            imputed = np.ma.average(donors, axis=1,
                                    weights=weight_matrix)
            X[receivers_row_idx, c] = imputed.data
        return X

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : ignored
            Accepted for API compatibility; not used.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If the metric cannot handle NaN, X contains +/- inf, a column
            exceeds ``col_max_missing``, or fewer than ``n_neighbors`` rows
            remain after dropping rows that exceed ``row_max_missing``.
        """

        # Check data integrity and calling arguments
        force_all_finite = self.missing_values not in ["NaN", np.nan]
        if not force_all_finite:
            if self.metric not in _MASKED_METRICS and not callable(
                    self.metric):
                raise ValueError(
                    "The selected metric does not support NaN values.")
        X = check_array(X, accept_sparse=False, dtype=np.float64,
                        force_all_finite=force_all_finite, copy=self.copy)
        self.weights = _check_weights(self.weights)

        # Check for +/- inf
        if np.any(np.isinf(X)):
            raise ValueError("+/- inf values are not allowed.")

        # Check if % missing in any column > col_max_missing
        mask = _get_mask(X, self.missing_values)
        if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)):
            raise ValueError("Some column(s) have more than {}% missing values"
                             .format(self.col_max_missing * 100))
        # Column means are taken over all rows, before bad rows are dropped.
        X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data

        # Check if % missing in any row > row_max_missing
        bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing)
        if np.any(bad_rows):
            warnings.warn(
                "There are rows with more than {0}% missing values. These "
                "rows are not included as donor neighbors."
                .format(self.row_max_missing * 100))

            # Remove rows that have more than row_max_missing % missing
            X = X[~bad_rows, :]

        # Check if sufficient neighboring samples available
        if X.shape[0] < self.n_neighbors:
            raise ValueError("There are only %d samples, but n_neighbors=%d."
                             % (X.shape[0], self.n_neighbors))
        self.fitted_X_ = X
        self.statistics_ = X_col_means

        return self

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
            The input data to complete.

        Returns
        -------
        X : {array-like}, shape = [n_samples, n_features]
            The imputed dataset.

        Raises
        ------
        ValueError
            If X contains +/- inf or its number of columns differs from the
            fitted data.
        """

        check_is_fitted(self, ["fitted_X_", "statistics_"])
        force_all_finite = self.missing_values not in ["NaN", np.nan]
        X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
                        force_all_finite=force_all_finite, copy=self.copy)

        # Check for +/- inf
        if np.any(np.isinf(X)):
            raise ValueError("+/- inf values are not allowed in data to be "
                             "transformed.")

        # Get fitted data and ensure correct dimension
        n_cols_fit_X = self.fitted_X_.shape[1]
        n_rows_X, n_cols_X = X.shape

        if n_cols_X != n_cols_fit_X:
            raise ValueError("Incompatible dimension between the fitted "
                             "dataset and the one to be transformed.")
        mask = _get_mask(X, self.missing_values)

        row_total_missing = mask.sum(axis=1)
        if not np.any(row_total_missing):
            return X

        # Check for excessive missingness in rows
        bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing)
        if np.any(bad_rows):
            warnings.warn(
                "There are rows with more than {0}% missing values. The "
                "missing features in these rows are imputed with column means."
                .format(self.row_max_missing * 100))
            X_bad = X[bad_rows, :]
            X = X[~bad_rows, :]
            mask = mask[~bad_rows]
            row_total_missing = mask.sum(axis=1)
        # Builtin bool: the `np.bool` alias was removed in NumPy 1.24.
        row_has_missing = row_total_missing.astype(bool)

        if np.any(row_has_missing):

            # Mask for fitted_X
            mask_fx = _get_mask(self.fitted_X_, self.missing_values)

            # Pairwise distances between receivers and fitted samples
            # (rows without missing values are left uninitialised; they are
            # never read because _impute only indexes receiver rows)
            dist = np.empty((len(X), len(self.fitted_X_)))
            dist[row_has_missing] = pairwise_distances(
                X[row_has_missing], self.fitted_X_, metric=self.metric,
                squared=False, missing_values=self.missing_values)

            # Find and impute missing
            X = self._impute(dist, X, self.fitted_X_, mask, mask_fx)

        # Merge bad rows to X and mean impute their missing values
        if np.any(bad_rows):
            bad_missing_index = np.where(_get_mask(X_bad, self.missing_values))
            X_bad[bad_missing_index] = np.take(self.statistics_,
                                               bad_missing_index[1])
            X_merged = np.empty((n_rows_X, n_cols_X))
            X_merged[bad_rows, :] = X_bad
            X_merged[~bad_rows, :] = X
            X = X_merged
        return X

    def fit_transform(self, X, y=None, **fit_params):
        """Fit KNNImputer and impute all missing values in X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : ignored
            Accepted for API compatibility; not used.

        **fit_params : ignored
            Accepted for API compatibility; not forwarded to ``fit``.

        Returns
        -------
        X : {array-like}, shape (n_samples, n_features)
            Returns imputed dataset.
        """
        return self.fit(X).transform(X)