cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,605 @@
1
+ import numpy as np
2
+
3
+ from sklearn.utils.testing import assert_array_equal
4
+ from sklearn.utils.testing import assert_array_almost_equal
5
+ from sklearn.utils.testing import assert_raise_message
6
+ from sklearn.utils.testing import assert_equal
7
+
8
+ from missingpy import KNNImputer
9
+ from missingpy.pairwise_external import masked_euclidean_distances
10
+ from missingpy.pairwise_external import pairwise_distances
11
+
12
+
13
def test_knn_imputation_shape():
    """Check that imputation preserves the shape of the input matrix.

    Runs ``KNNImputer.fit_transform`` for both weighting schemes and for
    1 through 5 neighbours on a matrix with a single missing entry, and
    asserts that the output shape equals the input shape.
    """
    n_rows = 10
    n_cols = 2
    # Use a seeded generator so the fixture is identical on every run;
    # drawing from the global ``np.random`` state would make any failure
    # impossible to reproduce.
    rng = np.random.RandomState(0)
    X = rng.rand(n_rows, n_cols)
    X[0, 0] = np.nan

    for weights in ['uniform', 'distance']:
        for n_neighbors in range(1, 6):
            imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
            X_imputed = imputer.fit_transform(X)
            assert_equal(X_imputed.shape, (n_rows, n_cols))
26
+
27
+
28
def test_knn_imputation_zero():
    """Check the error paths of ``fit`` when 0 marks a missing value."""
    imputer = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform")
    nan = np.nan

    # Case 1: the matrix also contains NaN while missing_values is 0;
    # fit() is expected to raise a ValueError with the message below.
    with_nan = np.array([
        [nan, 0, 0, 0, 5],
        [nan, 1, 0, nan, 3],
        [nan, 2, 0, 0, 0],
        [nan, 6, 0, 5, 13],
    ])
    nan_msg = ("Input contains NaN, infinity or a value too large for %r."
               % with_nan.dtype)
    assert_raise_message(ValueError, nan_msg, imputer.fit, with_nan)

    # Case 2: the share of zeros in a column exceeds col_max_missing.
    mostly_zero = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    limit_pct = imputer.col_max_missing * 100
    col_msg = "Some column(s) have more than {}% missing values".format(
        limit_pct)
    assert_raise_message(ValueError, col_msg, imputer.fit, mostly_zero)
56
+
57
+
58
def test_knn_imputation_zero_p2():
    """Impute a matrix that marks missing entries with 0.

    The same matrix is also expressed with NaN markers; both imputers are
    expected to produce the same filled matrix.
    """
    nan = np.nan

    zero_marked = np.array([
        [1, 0, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 0],
        [6, 6, 0, 6, 6],
    ])
    nan_marked = np.array([
        [1, nan, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, nan],
        [6, 6, nan, 6, 6],
    ])
    expected = np.array([
        [1, 2.5, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 1.5],
        [6, 6, 2.5, 6, 6],
    ])
    # Column means over the observed (non-NaN) entries.
    column_means = np.nanmean(nan_marked, axis=0)

    knn_zero = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform")
    knn_nan = KNNImputer(missing_values="NaN", n_neighbors=2,
                         weights="uniform")

    assert_array_equal(knn_zero.fit_transform(zero_marked), expected)
    assert_array_equal(knn_zero.statistics_, column_means)
    assert_array_equal(knn_zero.fit_transform(zero_marked),
                       knn_nan.fit_transform(nan_marked))
93
+
94
+
95
def test_knn_imputation_default():
    """Exercise ``KNNImputer()`` with its default parameters.

    Four scenarios are checked in turn: an imputable matrix, a matrix
    with a row that is mostly missing, a column whose nearest donors are
    themselves missing, and fitting on one matrix while transforming
    another.
    """
    nan = np.nan

    # --- Scenario 1: an imputable matrix --------------------------------
    data = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, nan],
        [3, 2, 3, nan],
        [nan, 4, 5, 5],
        [6, nan, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    col_means = np.nanmean(data, axis=0)
    expected = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])

    knn = KNNImputer()
    assert_array_equal(knn.fit_transform(data), expected)
    assert_array_equal(knn.statistics_, col_means)

    # --- Scenario 2: % missing in a row > row_max_missing ---------------
    data = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, nan],
        [3, 2, 3, nan],
        [nan, 4, 5, 5],
        [6, nan, 6, 7],
        [8, 8, 8, 8],
        [19, 19, 19, 19],
        [nan, nan, nan, 19],
    ])
    col_means = np.nanmean(data, axis=0)
    # The last row is expected to be filled with the column means.
    expected = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [19, 19, 19, 19],
        [col_means[0], col_means[1], col_means[2], 19],
    ])

    knn = KNNImputer()
    assert_array_almost_equal(knn.fit_transform(data), expected, decimal=6)
    assert_array_almost_equal(knn.statistics_, col_means, decimal=6)

    # --- Scenario 3: neighbouring donors also lack the feature ----------
    data = np.array([
        [1, 0, 0, nan],
        [2, 1, 2, nan],
        [3, 2, 3, nan],
        [4, 4, 5, nan],
        [6, 7, 6, nan],
        [8, 8, 8, nan],
        [20, 20, 20, 20],
        [22, 22, 22, 22],
    ])
    col_means = np.nanmean(data, axis=0)
    expected = np.array([
        [1, 0, 0, 21],
        [2, 1, 2, 21],
        [3, 2, 3, 21],
        [4, 4, 5, 21],
        [6, 7, 6, 21],
        [8, 8, 8, 21],
        [20, 20, 20, 20],
        [22, 22, 22, 22],
    ])

    knn = KNNImputer()
    assert_array_equal(knn.fit_transform(data), expected)
    assert_array_equal(knn.statistics_, col_means)

    # --- Scenario 4: fit() and transform() receive different data -------
    train = np.array([
        [0, 0],
        [nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 16],
    ])
    col_means = np.nanmean(train, axis=0)
    query = np.array([
        [1, 0],
        [3, 2],
        [4, nan],
    ])
    query_expected = np.array([
        [1, 0],
        [3, 2],
        [4, 4.8],
    ])

    knn = KNNImputer()
    assert_array_equal(knn.fit(train).transform(query), query_expected)
    assert_array_equal(knn.statistics_, col_means)
208
+
209
+
210
def test_default_with_invalid_input():
    """Check the ValueErrors a default ``KNNImputer`` raises on bad input."""
    nan = np.nan
    inf = np.inf

    # 1) % missing in a column > col_max_missing
    all_missing_col = np.array([
        [nan, 0, 0, 0, 5],
        [nan, 1, 0, nan, 3],
        [nan, 2, 0, 0, 0],
        [nan, 6, 0, 5, 13],
        [nan, 7, 0, 7, 8],
        [nan, 8, 0, 8, 9],
    ])
    imputer = KNNImputer()
    col_msg = "Some column(s) have more than {}% missing values".format(
        imputer.col_max_missing * 100)
    assert_raise_message(ValueError, col_msg, imputer.fit, all_missing_col)

    # 2) fewer samples than the requested number of neighbours
    too_few_rows = np.array([
        [1, 1, 1, 2, nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [6, 6, 2, 5, 13],
    ])
    n_samples = too_few_rows.shape[0]
    nbr_msg = "There are only %d samples, but n_neighbors=%d." % (
        n_samples, imputer.n_neighbors)
    assert_raise_message(ValueError, nbr_msg, imputer.fit, too_few_rows)

    # 3) inf present in the matrix given to fit()
    with_inf = np.array([
        [inf, 1, 1, 2, nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [nan, 6, 0, 5, 13],
        [nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    assert_raise_message(ValueError, "+/- inf values are not allowed.",
                         KNNImputer().fit, with_inf)

    # 4) inf present in the matrix given to transform()
    fit_data = np.array([
        [0, 1, 1, 2, nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [nan, 6, 0, 5, 13],
        [nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    # Same values as the fit matrix, except for an inf in the first cell.
    transform_data = fit_data.copy()
    transform_data[0, 0] = inf

    fitted = KNNImputer().fit(fit_data)
    transform_msg = "+/- inf values are not allowed in data to be transformed."
    assert_raise_message(ValueError, transform_msg, fitted.transform,
                         transform_data)
270
+
271
+
272
def test_knn_n_neighbors():
    """Compare imputation results for 1 and 6 neighbours.

    Also asserts that asking for 7 neighbours gives the same output as 6
    on this 7-row matrix.
    """
    nan = np.nan

    data = np.array([
        [0, 0],
        [nan, 2],
        [4, 3],
        [5, nan],
        [7, 7],
        [nan, 8],
        [14, 13],
    ])
    # Column means of the observed entries; reused for both checks below.
    col_means = np.nanmean(data, axis=0)

    # --- 1 neighbour ----------------------------------------------------
    expected_1nn = np.array([
        [0, 0],
        [4, 2],
        [4, 3],
        [5, 3],
        [7, 7],
        [7, 8],
        [14, 13],
    ])
    knn = KNNImputer(n_neighbors=1)
    assert_array_equal(knn.fit_transform(data), expected_1nn)
    assert_array_equal(knn.statistics_, col_means)

    # --- 6 neighbours ---------------------------------------------------
    data = np.array([
        [0, 0],
        [nan, 2],
        [4, 3],
        [5, nan],
        [7, 7],
        [nan, 8],
        [14, 13],
    ])
    expected_6nn = np.array([
        [0, 0],
        [6, 2],
        [4, 3],
        [5, 5.5],
        [7, 7],
        [6, 8],
        [14, 13],
    ])
    k = 6
    knn = KNNImputer(n_neighbors=k)
    knn_one_more = KNNImputer(n_neighbors=k + 1)

    assert_array_equal(knn.fit_transform(data), expected_6nn)
    assert_array_equal(knn.statistics_, col_means)
    assert_array_equal(knn.fit_transform(data),
                       knn_one_more.fit(data).transform(data))
331
+
332
+
333
def test_weight_uniform():
    """Check uniform weighting, given by name and as a callable.

    The callable returns ``None``; the test expects that to give the same
    result as ``weights="uniform"``.
    """
    data = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10],
    ])
    expected = np.array([
        [0, 0],
        [5, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10],
    ])

    # "uniform" weight (i.e. unweighted)
    by_name = KNNImputer(weights="uniform")
    assert_array_equal(by_name.fit_transform(data), expected)

    # callable weight that supplies no weights at all
    def unweighted(dist=None):
        return None

    by_callable = KNNImputer(weights=unweighted)
    assert_array_equal(by_callable.fit_transform(data), expected)
364
+
365
+
366
def test_weight_distance():
    """Check inverse-distance weighting (``weights="distance"``).

    Three scenarios are covered: a single missing value with the default
    number of neighbours, ``n_neighbors=2``, and a matrix with several
    missingness patterns whose expected values are recomputed here from
    the masked Euclidean distances.
    """
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # Distances from row 1 to its five nearest rows; position 0 of the
    # argsort is skipped (presumably row 1 itself at distance 0).
    dist_matrix = pairwise_distances(X, metric="masked_euclidean")

    index = np.argsort(dist_matrix)[1, 1:6]
    dist = dist_matrix[1, index]
    weights = 1 / dist
    values = X[index, 0]
    imputed = np.dot(values, weights) / np.sum(weights)

    # Manual calculation
    X_imputed_distance1 = np.array([
        [0, 0],
        [3.850394, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array([
        [0, 0],
        [imputed, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    imputer = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance1,
                              decimal=6)
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2,
                              decimal=6)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array([
        [np.nan, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    X_imputed = np.array([
        [2.3828, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    imputer = KNNImputer(n_neighbors=2, weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed,
                              decimal=4)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with varying missingness patterns
    X = np.array([
        [1, 0, 0, 1],
        [0, np.nan, 1, np.nan],
        [1, 1, 1, np.nan],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    # Donor rows used for each imputed column.
    col1_donors = [0, 2, 3, 4, 5]
    col3_donors = [0, 3, 4, 5, 6]

    # Get weights of donor neighbors
    dist = masked_euclidean_distances(X)
    r1c1_nbor_wt = 1 / dist[1, col1_donors]
    r1c3_nbor_wt = 1 / dist[1, col3_donors]
    r2c3_nbor_wt = 1 / dist[2, col3_donors]

    # Collect donor values
    col1_donor_values = np.ma.masked_invalid(X[col1_donors, 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[col3_donors, 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array([
        [1, 0, 0, 1],
        [0, r1c1_imp, 1, r1c3_imp],
        [1, 1, 1, r2c3_imp],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    imputer = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6)
    assert_array_equal(imputer.statistics_, statistics_mean)
482
+
483
+
484
def test_metric_type():
    """Check that ``fit`` rejects the plain "euclidean" metric.

    The expected ValueError message states that the selected metric does
    not support NaN values.
    """
    data = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10],
    ])

    euclidean_imputer = KNNImputer(metric="euclidean")
    assert_raise_message(
        ValueError,
        "The selected metric does not support NaN values.",
        euclidean_imputer.fit,
        data,
    )
499
+
500
+
501
def test_callable_metric():
    """Check imputation when the metric is a user-supplied callable."""

    def custom_callable(x, y, missing_values="NaN", squared=False):
        # Sum of absolute differences, with NaN positions masked out.
        # ``missing_values`` and ``squared`` are accepted but not used.
        x_masked = np.ma.array(x, mask=np.isnan(x))
        y_masked = np.ma.array(y, mask=np.isnan(y))
        return np.nansum(np.abs(x_masked - y_masked))

    data = np.array([
        [4, 3, 3, np.nan],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [np.nan, 9, 11, 10.],
    ])
    expected = np.array([
        [4, 3, 3, 9],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [5, 9, 11, 10.],
    ])

    knn = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_array_equal(knn.fit_transform(data), expected)
526
+
527
+
528
def test_complete_features():
    """Check default (unweighted) imputation on a mostly complete matrix.

    Each expected value is the plain mean of a slice of donor rows taken
    from the same column.
    """
    nan = np.nan
    data = np.array([
        [0, nan, 0, nan],
        [1, 1, 1, nan],
        [2, 2, nan, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [nan, 7, 7, 7],
    ])

    # Expected fills, named r<row>c<col>.
    last_col_mean = np.mean(data[2:-1, -1])
    fill_r0c1 = np.mean(data[1:6, 1])
    fill_r0c3 = last_col_mean
    fill_r1c3 = last_col_mean
    fill_r2c2 = np.nanmean(data[:6, 2])
    fill_r7c0 = np.mean(data[2:-1, 0])

    expected = np.array([
        [0, fill_r0c1, 0, fill_r0c3],
        [1, 1, 1, fill_r1c3],
        [2, 2, fill_r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [fill_r7c0, 7, 7, 7],
    ])

    knn = KNNImputer()
    assert_array_almost_equal(knn.fit_transform(data), expected)
561
+
562
+
563
def test_complete_features_weighted():
    """Check distance-weighted imputation on a mostly complete matrix.

    Expected values are weighted averages of donor rows, with weights
    equal to the reciprocal of the masked Euclidean distance.
    """
    nan = np.nan
    data = np.array([
        [0, 0, 0, nan],
        [1, 1, 1, nan],
        [2, 2, nan, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [nan, 7, 7, 7],
    ])

    distances = pairwise_distances(data,
                                   metric="masked_euclidean",
                                   squared=False)

    # Donor rows used for the missing entry in row 2, column 2.
    col2_donors = (0, 1, 3, 4, 5)

    # Inverse-distance weights, named w_r<row>c<col>.
    w_r0c3 = 1.0 / distances[0, 2:-1]
    w_r1c3 = 1.0 / distances[1, 2:-1]
    w_r2c2 = 1.0 / distances[2, col2_donors]
    w_r7c0 = 1.0 / distances[7, 2:7]

    # Weighted averages of the donor values.
    fill_r0c3 = np.average(data[2:-1, -1], weights=w_r0c3)
    fill_r1c3 = np.average(data[2:-1, -1], weights=w_r1c3)
    fill_r2c2 = np.average(data[col2_donors, 2], weights=w_r2c2)
    fill_r7c0 = np.average(data[2:7, 0], weights=w_r7c0)

    expected = np.array([
        [0, 0, 0, fill_r0c3],
        [1, 1, 1, fill_r1c3],
        [2, 2, fill_r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [fill_r7c0, 7, 7, 7],
    ])

    knn = KNNImputer(weights="distance")
    assert_array_almost_equal(knn.fit_transform(data), expected)