cpgtools 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cpgtools might be problematic. Click here for more details.
- cpgmodule/_version.py +1 -0
- cpgmodule/utils.py +35 -0
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +1 -2
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
- cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +1 -1
- cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.2.dist-info/LICENSE +19 -0
- cpgtools-2.0.2.dist-info/METADATA +76 -0
- cpgtools-2.0.2.dist-info/RECORD +82 -0
- {cpgtools-2.0.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
- cpgtools-2.0.2.dist-info/top_level.txt +3 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +76 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgtools-2.0.0.dist-info/LICENSE.txt +0 -674
- cpgtools-2.0.0.dist-info/METADATA +0 -28
- cpgtools-2.0.0.dist-info/RECORD +0 -64
- cpgtools-2.0.0.dist-info/top_level.txt +0 -2
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from impyute.ops import matrix
|
|
3
|
+
from impyute.ops import wrapper
|
|
4
|
+
# pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-branches, broad-except, len-as-condition
|
|
5
|
+
|
|
6
|
+
@wrapper.wrappers
@wrapper.checks
def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
                  inplace=False):
    """ Interpolate the missing values based on nearby values.

    For example, with an array like this:

        array([[-1.24940, -1.38673, -0.03214945,  0.08255145, -0.007415],
               [ 2.14662,  0.32758 , -0.82601414,  1.78124027,  0.873998],
               [-0.41400, -0.977629,         nan, -1.39255344,  1.680435],
               [ 0.40975,  1.067599,  0.29152388, -1.70160145, -0.565226],
               [-0.54592, -1.126187,  2.04004377,  0.16664863, -0.010677]])

    Using a `k` or window size of 3. The one missing value would be set
    to -1.18509122. The window operates on the horizontal axis.

    Usage
    -----

    The parameters default the function to a moving mean. You may want to change
    the default window size:

        moving_window(data, wsize=10)

    To only look at past data (null value is at the rightmost index in the window):

        moving_window(data, nindex=-1)

    To use a custom function:

        moving_window(data, func=np.median)

    You can also do something like take 1.5x the max of previous values in the window:

        moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    nindex: int
        Null index. Index of the null value inside the moving average window.
        Use cases: Say you wanted to make value skewed toward the left or right
        side. 0 would only take the average of values from the right and -1
        would only take the average of values from the left. When left as
        `None` the null value sits in the middle of the window, which requires
        an odd `wsize`.
    wsize: int
        Window size. Size of the moving average window/area of values being used
        for each local imputation. This number includes the missing value.
    errors: {"raise", "coerce", "ignore"}
        Errors will occur with the indexing of the windows - for example if there
        is a nan at data[x][0] and `nindex` is set to -1 or there is a nan at
        data[x][-1] and `nindex` is set to 0. `"raise"` will re-raise an error
        thrown by `func`, `"coerce"` will try again using an nindex set to the
        middle. `"ignore"` is not implemented yet and raises an exception.
    func: callable
        Aggregate applied to the non-null values of each window; its result
        replaces the missing value.
    inplace: {True, False}
        Whether to return a copy or run on the passed-in array

    Returns
    -------
    numpy.ndarray
        Imputed data.

    Raises
    ------
    Exception
        If `errors` is `"ignore"`.
    AssertionError
        If `nindex` is None and `wsize` is even, or if `nindex >= wsize`.

    """
    if errors == "ignore":
        raise Exception("`errors` value `ignore` not implemented yet. Sorry!")

    if not inplace:
        data = data.copy()

    if nindex is None: # If using equal window side lengths
        assert wsize % 2 == 1, "The parameter `wsize` should not be even "\
        "if the value `nindex` is not set since it defaults to the midpoint "\
        "and an even `wsize` makes the midpoint ambiguous"
        wside_left = wsize // 2
        wside_right = wsize // 2
    else: # If using custom window side lengths
        assert nindex < wsize, "The null index must be smaller than the window size"
        if nindex == -1:
            wside_left = wsize - 1
            wside_right = 0
        else:
            wside_left = nindex
            wside_right = wsize - nindex - 1

    # The window slides along a row, so its right edge is bounded by the
    # number of columns (not the number of rows, which `len(data)` gives).
    n_cols = data.shape[1]

    # Repeat full passes until a pass no longer reduces the number of nans.
    while True:
        nan_xy = matrix.nan_indices(data)
        n_nan_prev = len(nan_xy)
        for x_i, y_i in nan_xy:
            left_i = max(0, y_i-wside_left)
            right_i = min(n_cols, y_i+wside_right+1)
            window = data[x_i, left_i: right_i]
            window_not_null = window[~np.isnan(window)]

            if len(window_not_null) > 0:
                try:
                    data[x_i][y_i] = func(window_not_null)
                    continue
                except Exception:
                    if errors == "raise":
                        raise

            if errors == "coerce":
                # If either the window has a length of 0 or the aggregate function fails somehow,
                # do a fallback of just trying the best we can by using it as the middle and trying
                # to recalculate. Use temporary wside_left/wside_right, for only the calculation of
                # this specific problematic value
                wside_left_tmp = wsize // 2
                wside_right_tmp = wside_left_tmp

                left_i_tmp = max(0, y_i-wside_left_tmp)
                right_i_tmp = min(n_cols, y_i+wside_right_tmp+1)

                window = data[x_i, left_i_tmp:right_i_tmp]
                window_not_null = window[~np.isnan(window)]
                try:
                    data[x_i][y_i] = func(window_not_null)
                except Exception as e:
                    print("Exception:", e)
        if n_nan_prev == len(matrix.nan_indices(data)):
            break

    return data
|
missingpy/__init__.py
ADDED
missingpy/knnimpute.py
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
"""KNN Imputer for Missing Data"""
|
|
2
|
+
# Author: Ashim Bhattarai
|
|
3
|
+
# License: GNU General Public License v3 (GPLv3)
|
|
4
|
+
|
|
5
|
+
import warnings
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
10
|
+
from sklearn.utils import check_array
|
|
11
|
+
from sklearn.utils.validation import check_is_fitted
|
|
12
|
+
from sklearn.utils.validation import FLOAT_DTYPES
|
|
13
|
+
from sklearn.neighbors.base import _check_weights
|
|
14
|
+
from sklearn.neighbors.base import _get_weights
|
|
15
|
+
|
|
16
|
+
from .pairwise_external import pairwise_distances
|
|
17
|
+
from .pairwise_external import _get_mask
|
|
18
|
+
from .pairwise_external import _MASKED_METRICS
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
'KNNImputer',
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class KNNImputer(BaseEstimator, TransformerMixin):
    """Imputation for completing missing values using k-Nearest Neighbors.

    Each sample's missing values are imputed using values from ``n_neighbors``
    nearest neighbors found in the training set. Each missing feature is then
    imputed as the average, either weighted or unweighted, of these neighbors.
    Note that if a sample has more than one feature missing, then the
    neighbors for that sample can be different depending on the particular
    feature being imputed. Finally, where the number of donor neighbors is
    less than ``n_neighbors``, the training set average for that feature is
    used during imputation.

    Parameters
    ----------
    missing_values : integer or "NaN", optional (default = "NaN")
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For missing values encoded as
        ``np.nan``, use the string value "NaN".

    n_neighbors : int, optional (default = 5)
        Number of neighboring samples to use for imputation.

    weights : str or callable, optional (default = "uniform")
        Weight function used in prediction.  Possible values:

        - 'uniform' : uniform weights.  All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          in this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    metric : str or callable, optional (default = "masked_euclidean")
        Distance metric for searching neighbors. Possible values:
        - 'masked_euclidean'
        - [callable] : a user-defined function which conforms to the
        definition of _pairwise_callable(X, Y, metric, **kwds). In other
        words, the function accepts two arrays, X and Y, and a
        ``missing_values`` keyword in **kwds and returns a scalar distance
        value.

    row_max_missing : float, optional (default = 0.5)
        The maximum fraction of columns (i.e. features) that can be missing
        before the sample is excluded from nearest neighbor imputation. It
        means that such rows will not be considered a potential donor in
        ``fit()``, and in ``transform()`` their missing feature values will be
        imputed to be the column mean for the entire dataset.

    col_max_missing : float, optional (default = 0.8)
        The maximum fraction of rows (or samples) that can be missing
        for any feature beyond which an error is raised.

    copy : boolean, optional (default = True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, if metric is
        "masked_euclidean" and copy=False then missing_values in the
        input matrix X will be overwritten with zeros.

    Attributes
    ----------
    statistics_ : 1-D array of length {n_features}
        The 1-D array contains the mean of each feature calculated using
        observed (i.e. non-missing) values. This is used for imputing
        missing values in samples that are either excluded from nearest
        neighbors search because they have too many ( > row_max_missing)
        missing features or because all of the sample's k-nearest neighbors
        (i.e., the potential donors) also have the relevant feature value
        missing.

    References
    ----------
    * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
      Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
      value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
      no. 6, 2001 Pages 520-525.

    Examples
    --------
    >>> from missingpy import KNNImputer
    >>> nan = float("NaN")
    >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
    >>> imputer = KNNImputer(n_neighbors=2, weights="uniform")
    >>> imputer.fit_transform(X)
    array([[1. , 2. , 4. ],
           [3. , 4. , 3. ],
           [5.5, 6. , 5. ],
           [8. , 8. , 7. ]])
    """

    def __init__(self, missing_values="NaN", n_neighbors=5,
                 weights="uniform", metric="masked_euclidean",
                 row_max_missing=0.5, col_max_missing=0.8, copy=True):

        self.missing_values = missing_values
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.row_max_missing = row_max_missing
        self.col_max_missing = col_max_missing
        self.copy = copy

    def _impute(self, dist, X, fitted_X, mask, mask_fx):
        """Helper function to find and impute missing values.

        Parameters
        ----------
        dist : array, indexed as [row of X, row of fitted_X]
            Pairwise distances between the rows of ``X`` and the fitted rows.
        X : array, shape (n_samples, n_features)
            Data to impute; it is modified in place and also returned.
        fitted_X : array
            The donor samples retained by ``fit()``.
        mask : boolean array, same shape as ``X``
            True where ``X`` holds a missing value.
        mask_fx : boolean array, same shape as ``fitted_X``
            True where ``fitted_X`` holds a missing value.

        Returns
        -------
        X : array
            ``X`` with the masked entries filled in.
        """

        # For each column, find and impute
        n_cols_X = X.shape[1]
        for c in range(n_cols_X):
            if not np.any(mask[:, c], axis=0):
                continue

            # Row index for receivers and potential donors (pdonors)
            receivers_row_idx = np.where(mask[:, c])[0]
            pdonors_row_idx = np.where(~mask_fx[:, c])[0]

            # Impute using column mean if n_neighbors are not available
            if len(pdonors_row_idx) < self.n_neighbors:
                warnings.warn("Insufficient number of neighbors! "
                              "Filling in column mean.")
                X[receivers_row_idx, c] = self.statistics_[c]
                continue

            # Get distance from potential donors
            dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx]
            dist_pdonors = dist_pdonors.reshape(-1,
                                                len(pdonors_row_idx))

            # Argpartition to separate actual donors from the rest
            pdonors_idx = np.argpartition(
                dist_pdonors, self.n_neighbors - 1, axis=1)

            # Get final donors row index from pdonors
            donors_idx = pdonors_idx[:, :self.n_neighbors]

            # Get weights or None
            dist_pdonors_rows = np.arange(len(donors_idx))[:, None]
            weight_matrix = _get_weights(
                dist_pdonors[
                    dist_pdonors_rows, donors_idx], self.weights)
            donor_row_idx_ravel = donors_idx.ravel()

            # Retrieve donor values and calculate kNN score
            fitted_X_temp = fitted_X[pdonors_row_idx]
            donors = fitted_X_temp[donor_row_idx_ravel, c].reshape(
                (-1, self.n_neighbors))
            donors_mask = _get_mask(donors, self.missing_values)
            donors = np.ma.array(donors, mask=donors_mask)

            # Final imputation
            imputed = np.ma.average(donors, axis=1,
                                    weights=weight_matrix)
            X[receivers_row_idx, c] = imputed.data
        return X

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : ignored
            Not used by this method.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If the metric cannot handle NaN values, if X contains +/- inf,
            if any column exceeds ``col_max_missing``, or if fewer than
            ``n_neighbors`` samples remain after dropping rows that exceed
            ``row_max_missing``.
        """

        # Check data integrity and calling arguments
        # NaN placeholders must be let through validation, so finiteness is
        # only enforced when missing values use some other placeholder.
        force_all_finite = self.missing_values not in ["NaN", np.nan]
        if not force_all_finite:
            if self.metric not in _MASKED_METRICS and not callable(
                    self.metric):
                raise ValueError(
                    "The selected metric does not support NaN values.")
        X = check_array(X, accept_sparse=False, dtype=np.float64,
                        force_all_finite=force_all_finite, copy=self.copy)
        self.weights = _check_weights(self.weights)

        # Check for +/- inf
        if np.any(np.isinf(X)):
            raise ValueError("+/- inf values are not allowed.")

        # Check if % missing in any column > col_max_missing
        mask = _get_mask(X, self.missing_values)
        if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)):
            raise ValueError("Some column(s) have more than {}% missing values"
                             .format(self.col_max_missing * 100))
        # Column means are taken over all rows, before bad rows are dropped.
        X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data

        # Check if % missing in any row > row_max_missing
        bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing)
        if np.any(bad_rows):
            warnings.warn(
                "There are rows with more than {0}% missing values. These "
                "rows are not included as donor neighbors."
                    .format(self.row_max_missing * 100))

            # Remove rows that have more than row_max_missing % missing
            X = X[~bad_rows, :]

        # Check if sufficient neighboring samples available
        if X.shape[0] < self.n_neighbors:
            raise ValueError("There are only %d samples, but n_neighbors=%d."
                             % (X.shape[0], self.n_neighbors))
        self.fitted_X_ = X
        self.statistics_ = X_col_means

        return self

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
            The input data to complete.

        Returns
        -------
        X : {array-like}, shape = [n_samples, n_features]
            The imputed dataset.

        Raises
        ------
        ValueError
            If X contains +/- inf or its number of columns differs from the
            fitted data.
        """

        check_is_fitted(self, ["fitted_X_", "statistics_"])
        force_all_finite = self.missing_values not in ["NaN", np.nan]
        X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
                        force_all_finite=force_all_finite, copy=self.copy)

        # Check for +/- inf
        if np.any(np.isinf(X)):
            raise ValueError("+/- inf values are not allowed in data to be "
                             "transformed.")

        # Get fitted data and ensure correct dimension
        n_cols_fit_X = self.fitted_X_.shape[1]
        n_rows_X, n_cols_X = X.shape

        if n_cols_X != n_cols_fit_X:
            raise ValueError("Incompatible dimension between the fitted "
                             "dataset and the one to be transformed.")
        mask = _get_mask(X, self.missing_values)

        row_total_missing = mask.sum(axis=1)
        if not np.any(row_total_missing):
            return X

        # Check for excessive missingness in rows
        bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing)
        if np.any(bad_rows):
            warnings.warn(
                "There are rows with more than {0}% missing values. The "
                "missing features in these rows are imputed with column means."
                    .format(self.row_max_missing * 100))
            X_bad = X[bad_rows, :]
            X = X[~bad_rows, :]
            mask = mask[~bad_rows]
            row_total_missing = mask.sum(axis=1)
        # Builtin ``bool``: the ``np.bool`` alias was removed from NumPy.
        row_has_missing = row_total_missing.astype(bool)

        if np.any(row_has_missing):

            # Mask for fitted_X
            mask_fx = _get_mask(self.fitted_X_, self.missing_values)

            # Pairwise distances between receivers and fitted samples
            # Only rows with missing values are filled in; the remaining rows
            # of ``dist`` stay uninitialised and are never read by _impute.
            dist = np.empty((len(X), len(self.fitted_X_)))
            dist[row_has_missing] = pairwise_distances(
                X[row_has_missing], self.fitted_X_, metric=self.metric,
                squared=False, missing_values=self.missing_values)

            # Find and impute missing
            X = self._impute(dist, X, self.fitted_X_, mask, mask_fx)

        # Merge bad rows to X and mean impute their missing values
        if np.any(bad_rows):
            bad_missing_index = np.where(_get_mask(X_bad, self.missing_values))
            X_bad[bad_missing_index] = np.take(self.statistics_,
                                               bad_missing_index[1])
            X_merged = np.empty((n_rows_X, n_cols_X))
            X_merged[bad_rows, :] = X_bad
            X_merged[~bad_rows, :] = X
            X = X_merged
        return X

    def fit_transform(self, X, y=None, **fit_params):
        """Fit KNNImputer and impute all missing values in X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : ignored
            Not used by this method.

        **fit_params : ignored
            Accepted but not forwarded to ``fit()``.

        Returns
        -------
        X : {array-like}, shape (n_samples, n_features)
            Returns imputed dataset.
        """
        return self.fit(X).transform(X)
|