cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
|
@@ -0,0 +1,605 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
from sklearn.utils.testing import assert_array_equal
|
|
4
|
+
from sklearn.utils.testing import assert_array_almost_equal
|
|
5
|
+
from sklearn.utils.testing import assert_raise_message
|
|
6
|
+
from sklearn.utils.testing import assert_equal
|
|
7
|
+
|
|
8
|
+
from missingpy import KNNImputer
|
|
9
|
+
from missingpy.pairwise_external import masked_euclidean_distances
|
|
10
|
+
from missingpy.pairwise_external import pairwise_distances
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_knn_imputation_shape():
    """The imputed matrix must keep the input's (rows, cols) shape for
    every weighting scheme and neighbor count."""
    n_rows, n_cols = 10, 2
    X = np.random.rand(n_rows, n_cols)
    X[0, 0] = np.nan

    for weights in ('uniform', 'distance'):
        for k in range(1, 6):
            result = KNNImputer(n_neighbors=k, weights=weights).fit_transform(X)
            assert_equal(result.shape, (n_rows, n_cols))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_knn_imputation_zero():
    """Error paths when missing_values == 0.

    Two invalid inputs are checked: a matrix that also contains NaN
    (ambiguous with 0-coded missingness) and a matrix whose zero
    fraction in a column exceeds ``col_max_missing``.
    """
    # Test imputation when missing_values == 0
    missing_values = 0
    n_neighbors = 2
    imputer = KNNImputer(missing_values=missing_values,
                         n_neighbors=n_neighbors,
                         weights="uniform")

    # Test with missing_values=0 when NaN present: fit must refuse the
    # input because NaN cannot coexist with 0-coded missing values.
    X = np.array([
        [np.nan, 0, 0, 0, 5],
        [np.nan, 1, 0, np.nan, 3],
        [np.nan, 2, 0, 0, 0],
        [np.nan, 6, 0, 5, 13],
    ])
    msg = "Input contains NaN, infinity or a value too large for %r." % X.dtype
    assert_raise_message(ValueError, msg, imputer.fit, X)

    # Test with % zeros in column > col_max_missing: column 2 is all
    # zeros (100% missing under missing_values=0), so fit must raise.
    X = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    msg = "Some column(s) have more than {}% missing values".format(
        imputer.col_max_missing * 100)
    assert_raise_message(ValueError, msg, imputer.fit, X)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_knn_imputation_zero_p2():
    """Imputation with missing_values=0 must agree with imputing an
    equivalent matrix whose missing entries are coded as NaN."""
    # Test with an imputable matrix and also compare with missing_values="NaN"
    X_zero = np.array([
        [1, 0, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 0],
        [6, 6, 0, 6, 6],
    ])

    # Same data with the missing entries marked as NaN instead of 0.
    X_nan = np.array([
        [1, np.nan, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, np.nan],
        [6, 6, np.nan, 6, 6],
    ])
    # Column means ignoring missing entries — what statistics_ should hold.
    statistics_mean = np.nanmean(X_nan, axis=0)

    # Expected result of 2-NN uniform-mean imputation.
    X_imputed = np.array([
        [1, 2.5, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 1.5],
        [6, 6, 2.5, 6, 6],
    ])

    imputer_zero = KNNImputer(missing_values=0, n_neighbors=2,
                              weights="uniform")

    imputer_nan = KNNImputer(missing_values="NaN",
                             n_neighbors=2,
                             weights="uniform")

    assert_array_equal(imputer_zero.fit_transform(X_zero), X_imputed)
    assert_array_equal(imputer_zero.statistics_, statistics_mean)
    # Zero-coded and NaN-coded inputs must produce identical imputations.
    assert_array_equal(imputer_zero.fit_transform(X_zero),
                       imputer_nan.fit_transform(X_nan))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_knn_imputation_default():
    """Imputation with default parameters.

    Covers four scenarios: a plainly imputable matrix; a row whose
    missing fraction exceeds ``row_max_missing`` (falls back to column
    means); donors that are themselves missing the target feature
    (falls back to averaging over further donors); and fitting on one
    matrix while transforming another.
    """
    # Test with an imputable matrix
    X = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, np.nan],
        [3, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
        [6, np.nan, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    # Expected output of 5-NN uniform imputation on X.
    X_imputed = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])

    imputer = KNNImputer()
    assert_array_equal(imputer.fit_transform(X), X_imputed)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with % missing in row > row_max_missing: the last row has
    # 3/4 features missing, so those entries are filled with the
    # column means rather than neighbor averages.
    X = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, np.nan],
        [3, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
        [6, np.nan, 6, 7],
        [8, 8, 8, 8],
        [19, 19, 19, 19],
        [np.nan, np.nan, np.nan, 19],
    ])
    statistics_mean = np.nanmean(X, axis=0)
    # Column means used to fill the over-missing row (row index 7).
    r7c0, r7c1, r7c2, _ = statistics_mean

    X_imputed = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [19, 19, 19, 19],
        [r7c0, r7c1, r7c2, 19],
    ])

    imputer = KNNImputer()
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6)
    assert_array_almost_equal(imputer.statistics_, statistics_mean, decimal=6)

    # Test with all neighboring donors also having missing feature values:
    # rows 0-5 lack column 3, so only rows 6 and 7 can donate — their
    # mean (21) is the expected fill value.
    X = np.array([
        [1, 0, 0, np.nan],
        [2, 1, 2, np.nan],
        [3, 2, 3, np.nan],
        [4, 4, 5, np.nan],
        [6, 7, 6, np.nan],
        [8, 8, 8, np.nan],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])
    statistics_mean = np.nanmean(X, axis=0)

    X_imputed = np.array([
        [1, 0, 0, 21],
        [2, 1, 2, 21],
        [3, 2, 3, 21],
        [4, 4, 5, 21],
        [6, 7, 6, 21],
        [8, 8, 8, 21],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])

    imputer = KNNImputer()
    assert_array_equal(imputer.fit_transform(X), X_imputed)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test when data in fit() and transform() are different: neighbors
    # for Y's missing entry are looked up in the fitted X.
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 16]
    ])
    statistics_mean = np.nanmean(X, axis=0)

    Y = np.array([
        [1, 0],
        [3, 2],
        [4, np.nan]
    ])

    # 4.8 is the mean of the 5 nearest fitted neighbors' column-1 values.
    Y_imputed = np.array([
        [1, 0],
        [3, 2],
        [4, 4.8]
    ])

    imputer = KNNImputer()
    assert_array_equal(imputer.fit(X).transform(Y), Y_imputed)
    assert_array_equal(imputer.statistics_, statistics_mean)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def test_default_with_invalid_input():
    """Error paths with default parameters: over-missing columns, too
    few samples for n_neighbors, and inf values in fit or transform."""
    # Test with % missing in a column > col_max_missing: column 0 is
    # entirely NaN, so fit must raise.
    X = np.array([
        [np.nan, 0, 0, 0, 5],
        [np.nan, 1, 0, np.nan, 3],
        [np.nan, 2, 0, 0, 0],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [np.nan, 8, 0, 8, 9],
    ])
    imputer = KNNImputer()
    msg = "Some column(s) have more than {}% missing values".format(
        imputer.col_max_missing * 100)
    assert_raise_message(ValueError, msg, imputer.fit, X)

    # Test with insufficient number of neighbors: only 4 samples but
    # the default n_neighbors is larger.
    X = np.array([
        [1, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [6, 6, 2, 5, 13],
    ])
    msg = "There are only %d samples, but n_neighbors=%d." % \
        (X.shape[0], imputer.n_neighbors)
    assert_raise_message(ValueError, msg, imputer.fit, X)

    # Test with inf present: infinities are rejected at fit time.
    X = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    msg = "+/- inf values are not allowed."
    assert_raise_message(ValueError, msg, KNNImputer().fit, X)

    # Test with inf present in matrix passed in transform(): a clean
    # fit followed by a transform on data containing inf must raise.
    X = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])

    X_fit = np.array([
        [0, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    msg = "+/- inf values are not allowed in data to be transformed."
    assert_raise_message(ValueError, msg, KNNImputer().fit(X_fit).transform, X)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def test_knn_n_neighbors():
    """Imputation with varying n_neighbors.

    With 1 neighbor the single closest donor value is copied; with 6
    neighbors all donors are averaged.  Because only 6 donors exist,
    requesting 7 neighbors must give the same result as requesting 6.
    """
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, np.nan],
        [7, 7],
        [np.nan, 8],
        [14, 13]
    ])
    statistics_mean = np.nanmean(X, axis=0)

    # Test with 1 neighbor: each missing entry takes its nearest
    # donor's value verbatim.
    X_imputed_1NN = np.array([
        [0, 0],
        [4, 2],
        [4, 3],
        [5, 3],
        [7, 7],
        [7, 8],
        [14, 13]
    ])

    n_neighbors = 1
    imputer = KNNImputer(n_neighbors=n_neighbors)

    assert_array_equal(imputer.fit_transform(X), X_imputed_1NN)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with 6 neighbors
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, np.nan],
        [7, 7],
        [np.nan, 8],
        [14, 13]
    ])

    X_imputed_6NN = np.array([
        [0, 0],
        [6, 2],
        [4, 3],
        [5, 5.5],
        [7, 7],
        [6, 8],
        [14, 13]
    ])

    n_neighbors = 6
    # Fix: use the n_neighbors variable rather than a hard-coded 6 so
    # the imputer and the "+1" comparison below stay in sync.
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1)

    assert_array_equal(imputer.fit_transform(X), X_imputed_6NN)
    assert_array_equal(imputer.statistics_, statistics_mean)
    # Only 6 donors are available, so asking for 7 changes nothing.
    assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit(
        X).transform(X))
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def test_weight_uniform():
    """weights="uniform" averages donors equally, and a callable weight
    that returns None must behave exactly the same way."""
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # Unweighted mean of the 5 nearest donors fills the missing cell.
    expected = np.array([
        [0, 0],
        [5, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    uniform_result = KNNImputer(weights="uniform").fit_transform(X)
    assert_array_equal(uniform_result, expected)

    # A callable weight returning None means "treat all donors equally".
    def no_weight(dist=None):
        return None

    callable_result = KNNImputer(weights=no_weight).fit_transform(X)
    assert_array_equal(callable_result, expected)
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def test_weight_distance():
    """Imputation with weights="distance".

    The imputer's inverse-distance weighted average is checked against
    (a) hard-coded expected values, (b) a manual recomputation from the
    pairwise distance matrix, and (c) masked-array weighted averages
    under varying missingness patterns.
    """
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # Test with "distance" weight

    # Get distance of "n_neighbors" neighbors of row 1
    dist_matrix = pairwise_distances(X, metric="masked_euclidean")

    # Nearest 5 neighbors of row 1 (skipping itself at position 0).
    index = np.argsort(dist_matrix)[1, 1:6]
    dist = dist_matrix[1, index]
    weights = 1 / dist
    values = X[index, 0]
    # Inverse-distance weighted average of the donors' column-0 values.
    imputed = np.dot(values, weights) / np.sum(weights)

    # Manual calculation
    X_imputed_distance1 = np.array([
        [0, 0],
        [3.850394, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array([
        [0, 0],
        [imputed, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    imputer = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance1,
                              decimal=6)
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2,
                              decimal=6)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array([
        [np.nan, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    X_imputed = np.array([
        [2.3828, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    imputer = KNNImputer(n_neighbors=2, weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed,
                              decimal=4)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with varying missingness patterns
    X = np.array([
        [1, 0, 0, 1],
        [0, np.nan, 1, np.nan],
        [1, 1, 1, np.nan],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    # Get weights of donor neighbors
    dist = masked_euclidean_distances(X)
    r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
    r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
    r1c1_nbor_wt = (1/r1c1_nbor_dists)
    r1c3_nbor_wt = (1 / r1c3_nbor_dists)

    r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
    r2c3_nbor_wt = 1/r2c3_nbor_dists

    # Collect donor values (masked so NaN donors are excluded).
    col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    # Fix: removed a leftover debug print() of the imputed values.
    X_imputed = np.array([
        [1, 0, 0, 1],
        [0, r1c1_imp, 1, r1c3_imp],
        [1, 1, 1, r2c3_imp],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    imputer = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6)
    assert_array_equal(imputer.statistics_, statistics_mean)
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def test_metric_type():
    """A metric with no NaN support must be rejected at fit time."""
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # Plain euclidean cannot handle missing entries, so fitting on
    # data containing NaN must raise a ValueError.
    imputer = KNNImputer(metric="euclidean")
    expected_msg = "The selected metric does not support NaN values."
    assert_raise_message(ValueError, expected_msg, imputer.fit, X)
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def test_callable_metric():
    """A user-supplied callable metric (here an l1-style distance) must
    be usable for neighbor selection."""
    # Define callable metric that returns the l1 norm:
    def custom_callable(x, y, missing_values="NaN", squared=False):
        # NOTE(review): x and y are wrapped as masked arrays, then
        # np.nansum is applied to the masked difference — presumably so
        # entries missing in either vector are excluded from the sum;
        # confirm the masking is actually required given nansum.
        x = np.ma.array(x, mask=np.isnan(x))
        y = np.ma.array(y, mask=np.isnan(y))
        dist = np.nansum(np.abs(x-y))
        return dist

    X = np.array([
        [4, 3, 3, np.nan],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [np.nan, 9, 11, 10.]
    ])

    # Expected 2-NN imputation under the custom l1 distance.
    X_imputed = np.array([
        [4, 3, 3, 9],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [5, 9, 11, 10.]
    ])

    imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_array_equal(imputer.fit_transform(X), X_imputed)
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def test_complete_features():
    """Unweighted imputation where expected values are recomputed from
    slices of fully-observed donor rows."""
    # Test with use_complete=True
    X = np.array([
        [0, np.nan, 0, np.nan],
        [1, 1, 1, np.nan],
        [2, 2, np.nan, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [np.nan, 7, 7, 7]
    ])

    # Expected fills: plain means over each missing cell's donor rows.
    r0c1 = np.mean(X[1:6, 1])
    r0c3 = np.mean(X[2:-1, -1])
    r1c3 = np.mean(X[2:-1, -1])
    r2c2 = np.nanmean(X[:6, 2])
    r7c0 = np.mean(X[2:-1, 0])

    X_imputed = np.array([
        [0, r0c1, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7]
    ])

    imputer_comp = KNNImputer()
    assert_array_almost_equal(imputer_comp.fit_transform(X), X_imputed)
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def test_complete_features_weighted():
    """Distance-weighted imputation where expected values are recomputed
    manually from the masked-euclidean distance matrix."""
    # Test with use_complete=True
    X = np.array([
        [0, 0, 0, np.nan],
        [1, 1, 1, np.nan],
        [2, 2, np.nan, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [np.nan, 7, 7, 7]
    ])

    dist = pairwise_distances(X,
                              metric="masked_euclidean",
                              squared=False)

    # Calculate weights: inverse distance to each missing cell's donors.
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array([
        [0, 0, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7]
    ])

    imputer_comp_wt = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer_comp_wt.fit_transform(X), X_imputed)
|