cpgtools 1.12.0__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cpgtools might be problematic. Click here for more details.

Files changed (77) hide show
  1. cpgmodule/_version.py +1 -0
  2. cpgmodule/data/__init__.py +0 -0
  3. cpgmodule/methylClock.py +53 -0
  4. cpgmodule/utils.py +38 -1
  5. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
  6. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
  7. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +6 -4
  8. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
  9. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
  10. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
  11. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
  12. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
  13. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
  14. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
  15. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
  16. cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
  17. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
  18. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
  19. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
  20. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
  21. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
  22. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
  23. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
  24. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
  25. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
  26. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
  27. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
  28. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
  29. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
  30. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
  31. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
  32. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
  33. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +3 -3
  34. cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
  35. cpgtools-2.0.2.dist-info/LICENSE +19 -0
  36. cpgtools-2.0.2.dist-info/METADATA +76 -0
  37. cpgtools-2.0.2.dist-info/RECORD +82 -0
  38. {cpgtools-1.12.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
  39. cpgtools-2.0.2.dist-info/top_level.txt +3 -0
  40. impyute/__init__.py +3 -0
  41. impyute/contrib/__init__.py +7 -0
  42. impyute/contrib/compare.py +69 -0
  43. impyute/contrib/count_missing.py +30 -0
  44. impyute/contrib/describe.py +63 -0
  45. impyute/cs/__init__.py +11 -0
  46. impyute/cs/buck_iterative.py +82 -0
  47. impyute/cs/central_tendency.py +84 -0
  48. impyute/cs/em.py +52 -0
  49. impyute/cs/fast_knn.py +130 -0
  50. impyute/cs/random.py +27 -0
  51. impyute/dataset/__init__.py +6 -0
  52. impyute/dataset/base.py +137 -0
  53. impyute/dataset/corrupt.py +55 -0
  54. impyute/deletion/__init__.py +5 -0
  55. impyute/deletion/complete_case.py +21 -0
  56. impyute/ops/__init__.py +12 -0
  57. impyute/ops/error.py +9 -0
  58. impyute/ops/inverse_distance_weighting.py +31 -0
  59. impyute/ops/matrix.py +47 -0
  60. impyute/ops/testing.py +20 -0
  61. impyute/ops/util.py +76 -0
  62. impyute/ops/wrapper.py +179 -0
  63. impyute/ts/__init__.py +6 -0
  64. impyute/ts/locf.py +57 -0
  65. impyute/ts/moving_window.py +128 -0
  66. missingpy/__init__.py +4 -0
  67. missingpy/knnimpute.py +328 -0
  68. missingpy/missforest.py +556 -0
  69. missingpy/pairwise_external.py +315 -0
  70. missingpy/tests/__init__.py +0 -0
  71. missingpy/tests/test_knnimpute.py +605 -0
  72. missingpy/tests/test_missforest.py +409 -0
  73. missingpy/utils.py +124 -0
  74. cpgtools-1.12.0.dist-info/LICENSE.txt +0 -674
  75. cpgtools-1.12.0.dist-info/METADATA +0 -30
  76. cpgtools-1.12.0.dist-info/RECORD +0 -43
  77. cpgtools-1.12.0.dist-info/top_level.txt +0 -2
@@ -0,0 +1,409 @@
1
+ import numpy as np
2
+ from scipy.stats import mode
3
+
4
+ from sklearn.utils.testing import assert_array_equal
5
+ from sklearn.utils.testing import assert_raise_message
6
+ from sklearn.utils.testing import assert_equal
7
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
8
+
9
+ from missingpy import MissForest
10
+
11
def gen_array(n_rows=20, n_cols=5, missingness=0.2, min_val=0, max_val=10,
              missing_values=np.nan, rand_seed=1337):
    """Generate a reproducible random float matrix with missing cells.

    Parameters
    ----------
    n_rows, n_cols : int
        Shape of the generated matrix.
    missingness : float or int
        ``0`` disables missing values. A value in ``(0, 1)`` is read as the
        approximate fraction of cells to mark as missing; a value ``>= 1``
        is read as the approximate number of cells to mark.
    min_val, max_val : int
        Integer values are drawn from ``[min_val, max_val)``.
    missing_values : number
        Marker written into the cells chosen as missing.
    rand_seed : int
        Seed of the private ``RandomState`` so results are repeatable.

    Returns
    -------
    X : ndarray of float64, shape (n_rows, n_cols)
    """
    rand_gen = np.random.RandomState(seed=rand_seed)
    # ``np.float`` was removed in NumPy 1.24; ``np.float64`` is the dtype the
    # old alias resolved to.
    X = rand_gen.randint(
        min_val, max_val, n_rows * n_cols).reshape(n_rows, n_cols).astype(
            np.float64)

    # Introduce missing markers only if requested
    if missingness > 0:
        if missingness >= 1:
            # Treated as an approximate count; randint needs an integer size
            n_missing = int(missingness)
        else:
            # Treated as an approximate fraction of all cells
            n_missing = int(np.ceil(missingness * n_rows * n_cols))

        # NOTE: index pairs may repeat, so the number of marked cells can be
        # lower than requested.
        nan_row_idx = rand_gen.randint(0, n_rows, n_missing)
        nan_col_idx = rand_gen.randint(0, n_cols, n_missing)
        X[nan_row_idx, nan_col_idx] = missing_values

    return X
38
+
39
+
40
def test_missforest_imputation_shape():
    """The imputed matrix must have the same shape as the input matrix."""
    expected_shape = (10, 2)
    data = gen_array(*expected_shape)
    imputed = MissForest().fit_transform(data)
    assert_equal(imputed.shape, expected_shape)
48
+
49
+
50
def test_missforest_zero():
    """fit() must reject unusable input when 0 is the missing-value marker."""
    zero_imputer = MissForest(missing_values=0, random_state=0)

    # Case 1: NaNs are present although the declared marker is 0
    with_nans = gen_array(min_val=0)
    nan_msg = ("Input contains NaN, infinity or a value too large for %r."
               % with_nans.dtype)
    assert_raise_message(ValueError, nan_msg, zero_imputer.fit, with_nans)

    # Case 2: the third column consists solely of the missing marker
    all_missing_column = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    column_msg = "One or more columns have all rows missing."
    assert_raise_message(ValueError, column_msg, zero_imputer.fit,
                         all_missing_column)
70
+
71
+
72
def test_missforest_zero_part2():
    """Imputing with marker 0 must match imputing the same data with NaN."""
    # Same seed, same cells marked; only the marker differs. min_val=1 keeps
    # genuine zeros out of the data.
    data_zero_marked = gen_array(min_val=1, missing_values=0)
    data_nan_marked = gen_array(min_val=1, missing_values=np.nan)
    expected_means = np.nanmean(data_nan_marked, axis=0)

    zero_imputer = MissForest(missing_values=0, random_state=1337)
    nan_imputer = MissForest(missing_values=np.nan, random_state=1337)

    result_zero = zero_imputer.fit_transform(data_zero_marked)
    result_nan = nan_imputer.fit_transform(data_nan_marked)

    assert_array_equal(result_zero, result_nan)
    assert_array_equal(zero_imputer.statistics_.get("col_means"),
                       expected_means)
85
+
86
+
87
def test_missforest_numerical_single():
    """A single missing numeric cell is imputed by a random-forest regressor.

    The expected value is produced by fitting a ``RandomForestRegressor``
    (same ``n_estimators`` and ``random_state`` as the imputer) on the rows
    where column 0 is observed and predicting the row where it is missing.
    """
    # Matrix with exactly one missing value, in column 0
    df = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # Build the expected matrix by assignment. ``pred_val`` is a 1-element
    # array; nesting it inside a list of scalars creates a ragged sequence,
    # which NumPy >= 1.24 refuses to turn into an array.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
124
+
125
+
126
def test_missforest_numerical_multiple():
    """Two missing numeric cells: replay the iterations by hand and compare."""
    df = np.array([
        [1, 0, np.nan, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    col_means = np.nanmean(df, axis=0)
    n_cols = df.shape[1]

    # Result under test, plus the number of iterations the imputer ran
    imputer = MissForest(random_state=1337)
    actual = imputer.fit_transform(df)
    n_iterations = imputer.iter_count_

    # Locate the missing cells
    missing = np.isnan(df)
    missing_rows, missing_cols = np.where(missing)

    # Reference matrix, seeded with the column means
    expected = df.copy()
    expected[missing_rows, missing_cols] = np.take(col_means, missing_cols)

    all_cols = np.arange(n_cols)
    for _ in range(n_iterations):
        for col in missing_cols:
            # Every other column acts as a predictor for `col`
            predictors = np.setdiff1d(all_cols, col)
            target = expected[:, col]
            features = expected[:, predictors]
            observed = np.where(~missing[:, col])[0]
            to_fill = np.where(missing[:, col])[0]

            # Train on observed rows, predict the originally missing ones
            forest = RandomForestRegressor(n_estimators=100,
                                           random_state=1337)
            forest.fit(X=features[observed], y=target[observed])
            expected[to_fill, col] = forest.predict(features[to_fill])

    assert_array_equal(actual, expected)
    assert_array_equal(imputer.statistics_.get('col_means'), col_means)
176
+
177
+
178
def test_missforest_categorical_single():
    """A single missing categorical cell is imputed by a forest classifier.

    The expected value is produced by fitting a ``RandomForestClassifier``
    (same ``n_estimators`` and ``random_state`` as the imputer) on the rows
    where column 0 is observed. ``cat_vars`` is passed both as a scalar and
    as a list; both forms must give the same result.
    """
    # Matrix with exactly one missing value, in the categorical column 0
    df = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # Build the expected matrix by assignment. ``pred_val`` is a 1-element
    # array; nesting it inside a list of scalars creates a ragged sequence,
    # which NumPy >= 1.24 refuses to turn into an array.
    df_imputed = df.copy()
    df_imputed[bad_rows, 0] = pred_val

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)
214
+
215
+
216
def test_missforest_categorical_multiple():
    """Two missing categorical cells: replay the iterations and compare.

    The reference result starts from the per-column modes and then repeats,
    for the number of iterations the imputer reports, a
    ``RandomForestClassifier`` fit/predict for each column holding a
    missing value.
    """
    df = np.array([
        [0, 0, np.nan, 1],
        [0, 1, 1, 2],
        [0, 2, 1, 2],
        [np.nan, 4, 1, 5],
        [1, 7, 0, 7],
        [1, 8, 0, 8],
        [1, 15, 0, 19],
        [1, 18, 0, 17],
    ])
    cat_vars = [0, 2]
    # Older SciPy returns the modes with shape (1, n_cols); SciPy >= 1.11
    # drops that axis by default. ``atleast_2d`` gives the per-column row
    # in both cases.
    statistics_mode = np.atleast_2d(
        mode(df, axis=0, nan_policy='omit').mode)[0]
    n_rows, n_cols = df.shape

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mode, nan_cols)

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            rf = RandomForestClassifier(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_modes')[0],
                       statistics_mode[cat_vars])
269
+
270
+
271
def test_missforest_mixed_multiple():
    """Mixed categorical/numeric data: replay the iterations and compare.

    Column 0 is categorical (classifier, seeded with the column mode); the
    remaining columns are numeric (regressor, seeded with the column mean).
    """
    df = np.array([
        [np.nan, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [1, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, np.nan],
    ])

    n_rows, n_cols = df.shape
    cat_vars = [0]
    num_vars = np.setdiff1d(range(n_cols), cat_vars)
    # Older SciPy returns the modes with shape (1, n_cols); SciPy >= 1.11
    # drops that axis by default. ``atleast_2d`` gives the per-column row
    # in both cases.
    statistics_mode = np.atleast_2d(
        mode(df, axis=0, nan_policy='omit').mode)[0]
    statistics_mean = np.nanmean(df, axis=0)

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values: mode for the categorical cell,
    # mean for the numeric cell
    df_imp2 = df.copy()
    df_imp2[0, 0] = statistics_mode[0]
    df_imp2[6, 3] = statistics_mean[3]

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            if c in cat_vars:
                rf = RandomForestClassifier(n_estimators=100,
                                            random_state=1337)
            else:
                rf = RandomForestRegressor(n_estimators=100,
                                           random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_means'),
                       statistics_mean[num_vars])
    assert_array_equal(imputer.statistics_.get('col_modes')[0],
                       statistics_mode[cat_vars])
334
+
335
+
336
def test_statstics_fit_transform():
    """statistics_ must reflect the data given to fit(), not transform()."""
    fit_data = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    expected_means = np.nanmean(fit_data, axis=0)

    # A different matrix, used only for transform()
    transform_data = np.array([
        [0, 0, 0, 0],
        [2, 2, 2, 1],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [9, 9, 8, 8],
        [16, 15, 18, 19],
    ])

    imputer = MissForest()
    fitted = imputer.fit(fit_data)
    fitted.transform(transform_data)
    assert_array_equal(imputer.statistics_.get('col_means'), expected_means)
362
+
363
+
364
def test_default_with_invalid_input():
    """Default-configured imputers must raise ValueError on invalid input."""
    inf_msg = "+/- inf values are not supported."

    # A column in which every row is missing
    empty_column = np.array([
        [np.nan, 0, 0, 1],
        [np.nan, 1, 2, np.nan],
        [np.nan, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
    ])
    seeded_imputer = MissForest(random_state=1337)
    assert_raise_message(ValueError,
                         "One or more columns have all rows missing.",
                         seeded_imputer.fit, empty_column)

    # Infinity in the matrix given to fit()
    inf_for_fit = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    assert_raise_message(ValueError, inf_msg, MissForest().fit, inf_for_fit)

    # Infinity in the matrix given to transform() after a valid fit()
    inf_for_transform = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    valid_fit_data = np.array([
        [0, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    fitted_imputer = MissForest().fit(valid_fit_data)
    assert_raise_message(ValueError, inf_msg, fitted_imputer.transform,
                         inf_for_transform)
missingpy/utils.py ADDED
@@ -0,0 +1,124 @@
1
+ """Utility Functions"""
2
+ # Author: Ashim Bhattarai
3
+ # License: BSD 3 clause
4
+
5
+ import numpy as np
6
+
7
+
8
def masked_euclidean_distances(X, Y=None, squared=False,
                               missing_values="NaN", copy=True):
    """Calculates euclidean distances in the presence of missing values

    Computes the euclidean distance between each pair of samples (rows) in X
    and Y, where Y=X is assumed if Y=None.
    When calculating the distance between a pair of samples, this formulation
    essentially zero-weights feature coordinates with a missing value in either
    sample and scales up the weight of the remaining coordinates:

        dist(x,y) = sqrt(weight * sq. distance from non-missing coordinates)
        where,
        weight = Total # of coordinates / # of non-missing coordinates

    Note that if all the coordinates are missing or if there are no common
    non-missing coordinates then NaN is returned for that pair.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples_1, n_features)

    Y : {array-like, sparse matrix}, shape (n_samples_2, n_features)

    squared : boolean, optional
        Return squared Euclidean distances.

    missing_values : "NaN" or integer, optional
        Representation of missing value

    copy : boolean, optional
        Make and use a deep copy of X and Y (if Y exists)

    Returns
    -------
    distances : {array}, shape (n_samples_1, n_samples_2)

    Raises
    ------
    ValueError
        If X or Y contains +/- infinity, if any row contains only missing
        values, or if NaN is present while ``missing_values`` is not NaN.

    Examples
    --------
    >>> from missingpy.utils import masked_euclidean_distances
    >>> nan = float("NaN")
    >>> X = [[0, 1], [1, nan]]
    >>> # distance between rows of X
    >>> masked_euclidean_distances(X, X)
    array([[0.        , 1.41421356],
           [1.41421356, 0.        ]])

    >>> # get distance to origin
    >>> masked_euclidean_distances(X, [[0, 0]])
    array([[1.        ],
           [1.41421356]])

    References
    ----------
    * John K. Dixon, "Pattern Recognition with Partly Missing Data",
      IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:
      10, pp. 617 - 621, Oct. 1979.
      http://ieeexplore.ieee.org/abstract/document/4310090/

    See also
    --------
    paired_distances : distances between pairs of elements of X and Y.
    """
    # Import here to prevent circular import
    from .pairwise_external import _get_mask, check_pairwise_arrays

    # NOTE: force_all_finite=False allows not only NaN but also +/- inf
    X, Y = check_pairwise_arrays(X, Y, accept_sparse=False,
                                 force_all_finite=False, copy=copy)
    if (np.any(np.isinf(X)) or
            (Y is not X and np.any(np.isinf(Y)))):
        raise ValueError(
            "+/- Infinite values are not allowed.")

    # Get missing mask for X and Y.T
    mask_X = _get_mask(X, missing_values)

    YT = Y.T
    mask_YT = mask_X.T if Y is X else _get_mask(YT, missing_values)

    # Check if any rows have only missing value
    if np.any(mask_X.sum(axis=1) == X.shape[1])\
            or (Y is not X and np.any(mask_YT.sum(axis=0) == Y.shape[1])):
        raise ValueError("One or more rows only contain missing values.")

    # NaN may only appear in the data when NaN is the declared marker
    if missing_values not in ["NaN", np.nan] and (
            np.any(np.isnan(X)) or (Y is not X and np.any(np.isnan(Y)))):
        raise ValueError(
            "NaN values present but missing_value = {0}".format(
                missing_values))

    # Get mask of non-missing values set Y.T's missing to zero.
    # Further, casting the mask to int to be used in formula later.
    not_YT = (~mask_YT).astype(np.int32)
    YT[mask_YT] = 0

    # Get X's mask of non-missing values and set X's missing to zero
    not_X = (~mask_X).astype(np.int32)
    X[mask_X] = 0

    # Calculate distances
    # The following formula derived by:
    # Shreya Bhattarai <shreya.bhattarai@gmail.com>

    distances = (
        (X.shape[1] / (np.dot(not_X, not_YT))) *
        (np.dot(X * X, not_YT) - 2 * (np.dot(X, YT)) +
         np.dot(not_X, YT * YT)))

    # The expanded form (x^2 - 2xy + y^2) can come out slightly negative due
    # to floating point rounding; clamp at zero so sqrt does not turn
    # near-identical rows into NaN. NaN entries (pairs without common
    # coordinates) pass through unchanged.
    np.clip(distances, 0, None, out=distances)

    if X is Y:
        # Ensure that distances between vectors and themselves are set to 0.0.
        # This may not be the case due to floating point rounding errors.
        distances.flat[::distances.shape[0] + 1] = 0.0

    return distances if squared else np.sqrt(distances, out=distances)