cpgtools 1.12.0__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cpgtools might be problematic. Click here for more details.
- cpgmodule/_version.py +1 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/utils.py +38 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +6 -4
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
- cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +3 -3
- cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.2.dist-info/LICENSE +19 -0
- cpgtools-2.0.2.dist-info/METADATA +76 -0
- cpgtools-2.0.2.dist-info/RECORD +82 -0
- {cpgtools-1.12.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
- cpgtools-2.0.2.dist-info/top_level.txt +3 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +76 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- cpgtools-1.12.0.dist-info/LICENSE.txt +0 -674
- cpgtools-1.12.0.dist-info/METADATA +0 -30
- cpgtools-1.12.0.dist-info/RECORD +0 -43
- cpgtools-1.12.0.dist-info/top_level.txt +0 -2
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from scipy.stats import mode
|
|
3
|
+
|
|
4
|
+
from sklearn.utils.testing import assert_array_equal
|
|
5
|
+
from sklearn.utils.testing import assert_raise_message
|
|
6
|
+
from sklearn.utils.testing import assert_equal
|
|
7
|
+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
|
8
|
+
|
|
9
|
+
from missingpy import MissForest
|
|
10
|
+
|
|
11
|
+
def gen_array(n_rows=20, n_cols=5, missingness=0.2, min_val=0, max_val=10,
              missing_values=np.nan, rand_seed=1337):
    """Generate a reproducible float array with randomly placed missing cells.

    Parameters
    ----------
    n_rows, n_cols : int
        Shape of the generated array.
    missingness : int or float
        ``0`` (or less) leaves the array complete. A value in ``(0, 1)`` is
        the approximate fraction of cells to mark missing; a value ``>= 1``
        is the approximate number of cells to mark missing.
    min_val, max_val : int
        Bounds forwarded to ``RandomState.randint`` (``max_val`` exclusive).
    missing_values : scalar
        Marker written into the cells chosen to be missing.
    rand_seed : int
        Seed of the private ``RandomState`` used for values and positions.

    Returns
    -------
    X : ndarray of shape (n_rows, n_cols), dtype float
    """
    rand_gen = np.random.RandomState(seed=rand_seed)
    # ``np.float`` was only an alias of the builtin ``float`` and is gone in
    # recent NumPy releases; the builtin selects the same float64 dtype.
    X = rand_gen.randint(
        min_val, max_val, n_rows * n_cols).reshape(n_rows, n_cols).astype(
            float)

    # Introduce missing markers only if missingness > 0
    if missingness > 0:
        if missingness >= 1:
            # Treat as an (approximate) count; cast so a float such as 3.0
            # is still a valid ``size`` argument for randint below.
            n_missing = int(missingness)
        else:
            # Strictly between 0 and 1: approximate fraction of all cells
            n_missing = int(np.ceil(missingness * n_rows * n_cols))

        # Generate row, col index pairs and write the missing marker.
        # NOTE: repeated index pairs are not de-duplicated, so the actual
        # count/percentage may be lower than requested.
        nan_row_idx = rand_gen.randint(0, n_rows, n_missing)
        nan_col_idx = rand_gen.randint(0, n_cols, n_missing)
        X[nan_row_idx, nan_col_idx] = missing_values

    return X
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_missforest_imputation_shape():
    """The imputed matrix must have the same shape as the input matrix."""
    expected_shape = (10, 2)
    data = gen_array(*expected_shape)
    forest = MissForest()
    imputed = forest.fit_transform(data)
    assert_equal(imputed.shape, expected_shape)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_missforest_zero():
    """Check the error paths of fit() when ``missing_values == 0``."""
    imputer = MissForest(missing_values=0, random_state=0)

    # Case 1: the data still contains NaNs although 0 is the missing marker.
    nan_data = gen_array(min_val=0)
    nan_msg = ("Input contains NaN, infinity or a value too large for %r."
               % nan_data.dtype)
    assert_raise_message(ValueError, nan_msg, imputer.fit, nan_data)

    # Case 2: the third column consists solely of the missing marker (0).
    zero_column_data = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    all_missing_msg = "One or more columns have all rows missing."
    assert_raise_message(ValueError, all_missing_msg, imputer.fit,
                         zero_column_data)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_missforest_zero_part2():
    """Imputing with 0 as the missing marker must match imputing with NaN.

    Both matrices come from ``gen_array`` with its default seed, differing
    only in the marker written into the missing cells.
    """
    data_zero_marked = gen_array(min_val=1, missing_values=0)
    data_nan_marked = gen_array(min_val=1, missing_values=np.nan)
    expected_col_means = np.nanmean(data_nan_marked, axis=0)

    forest_zero = MissForest(missing_values=0, random_state=1337)
    forest_nan = MissForest(missing_values=np.nan, random_state=1337)

    imputed_from_zero = forest_zero.fit_transform(data_zero_marked)
    imputed_from_nan = forest_nan.fit_transform(data_nan_marked)
    assert_array_equal(imputed_from_zero, imputed_from_nan)

    fitted_means = forest_zero.statistics_.get("col_means")
    assert_array_equal(fitted_means, expected_col_means)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_missforest_numerical_single():
    """Impute a single missing numerical value and compare with a manual fit.

    The expected value is obtained by fitting a ``RandomForestRegressor``
    (same ``n_estimators`` and ``random_state`` as the imputer) on the
    complete rows and predicting the row whose first column is missing.
    """
    # Test with a single missing value
    df = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # ``pred_val`` is a length-1 array (one row was predicted). Use its scalar
    # element: nesting the array itself inside a row of scalars makes the
    # literal ragged, which modern NumPy refuses to build.
    df_imputed = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [pred_val[0], 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def test_missforest_numerical_multiple():
    """Two missing numerical values: replay the imputation loop by hand.

    The imputer's result is compared against a manual re-implementation that
    starts from column means and, for as many iterations as the imputer
    reports in ``iter_count_``, refits a ``RandomForestRegressor`` per
    column that contains a NaN.
    """
    df = np.array([
        [1, 0, np.nan, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    expected_means = np.nanmean(df, axis=0)
    n_cols = df.shape[1]

    # Reference result produced by MissForest itself
    imputer = MissForest(random_state=1337)
    imputed_by_forest = imputer.fit_transform(df)
    n_iterations = imputer.iter_count_

    # Locate the missing cells
    missing = np.isnan(df)
    missing_rows, missing_cols = np.where(missing)

    # Initial guess: fill each missing cell with its column mean
    imputed_by_hand = df.copy()
    imputed_by_hand[missing_rows, missing_cols] = np.take(expected_means,
                                                          missing_cols)

    for _ in range(n_iterations):
        for col in missing_cols:
            # All remaining columns act as predictors
            predictor_cols = np.setdiff1d(np.arange(n_cols), col)
            target = imputed_by_hand[:, col]
            predictors = imputed_by_hand[:, predictor_cols]

            # Rows observed / missing in the target column
            observed_idx = np.where(~missing[:, col])[0]
            missing_idx = np.where(missing[:, col])[0]

            # Fit on observed rows, predict the missing ones, write back
            regressor = RandomForestRegressor(n_estimators=100,
                                              random_state=1337)
            regressor.fit(X=predictors[observed_idx], y=target[observed_idx])
            imputed_by_hand[missing_idx, col] = regressor.predict(
                predictors[missing_idx])

    assert_array_equal(imputed_by_forest, imputed_by_hand)
    assert_array_equal(imputer.statistics_.get('col_means'), expected_means)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def test_missforest_categorical_single():
    """Impute a single missing categorical value and compare with a manual fit.

    The expected value is obtained by fitting a ``RandomForestClassifier``
    (same ``n_estimators`` and ``random_state`` as the imputer) on the
    complete rows. ``cat_vars`` is passed both as an int and as a list.
    """
    # Test with a single missing value
    df = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # ``pred_val`` is a length-1 array (one row was predicted). Use its scalar
    # element: nesting the array itself inside a row of scalars makes the
    # literal ragged, which modern NumPy refuses to build.
    df_imputed = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [pred_val[0], 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def test_missforest_categorical_multiple():
    """Two missing categorical values: replay the imputation loop by hand.

    The imputer's result is compared against a manual re-implementation that
    starts from column modes and, for as many iterations as the imputer
    reports in ``iter_count_``, refits a ``RandomForestClassifier`` per
    column that contains a NaN.
    """
    df = np.array([
        [0, 0, np.nan, 1],
        [0, 1, 1, 2],
        [0, 2, 1, 2],
        [np.nan, 4, 1, 5],
        [1, 7, 0, 7],
        [1, 8, 0, 8],
        [1, 15, 0, 19],
        [1, 18, 0, 17],
    ])
    cat_vars = [0, 2]
    # Older SciPy returns ``.mode`` with shape (1, n_cols), newer releases
    # return shape (n_cols,). Flattening gives one mode per column either
    # way, whereas ``.mode[0]`` collapses to a scalar on newer SciPy.
    statistics_mode = np.ravel(mode(df, axis=0, nan_policy='omit').mode)
    n_rows, n_cols = df.shape

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mode, nan_cols)

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            rf = RandomForestClassifier(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_modes')[0],
                       statistics_mode[cat_vars])
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def test_missforest_mixed_multiple():
    """Mixed categorical/numerical data: replay the imputation loop by hand.

    Column 0 is categorical (initial guess: mode, model: classifier); the
    remaining columns are numerical (initial guess: mean, model: regressor).
    """
    df = np.array([
        [np.nan, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [1, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, np.nan],
    ])

    n_rows, n_cols = df.shape
    cat_vars = [0]
    num_vars = np.setdiff1d(range(n_cols), cat_vars)
    # Older SciPy returns ``.mode`` with shape (1, n_cols), newer releases
    # return shape (n_cols,). Flattening gives one mode per column either
    # way, whereas ``.mode[0]`` collapses to a scalar on newer SciPy.
    statistics_mode = np.ravel(mode(df, axis=0, nan_policy='omit').mode)
    statistics_mean = np.nanmean(df, axis=0)

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values: mode for the categorical cell,
    # mean for the numerical cell
    df_imp2 = df.copy()
    df_imp2[0, 0] = statistics_mode[0]
    df_imp2[6, 3] = statistics_mean[3]

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            if c in cat_vars:
                rf = RandomForestClassifier(n_estimators=100,
                                            random_state=1337)
            else:
                rf = RandomForestRegressor(n_estimators=100,
                                           random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_means'),
                       statistics_mean[num_vars])
    assert_array_equal(imputer.statistics_.get('col_modes')[0],
                       statistics_mode[cat_vars])
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def test_statstics_fit_transform():
    """``statistics_`` must reflect the data given to fit(), not transform()."""
    fit_data = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    # Column means of the matrix used for fitting
    expected_means = np.nanmean(fit_data, axis=0)

    transform_data = np.array([
        [0, 0, 0, 0],
        [2, 2, 2, 1],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [9, 9, 8, 8],
        [16, 15, 18, 19],
    ])

    imputer = MissForest()
    fitted = imputer.fit(fit_data)
    fitted.transform(transform_data)
    col_means = imputer.statistics_.get('col_means')
    assert_array_equal(col_means, expected_means)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def test_default_with_invalid_input():
    """Default-configured imputer must reject invalid input with ValueError.

    Covers a column that is entirely missing, infinity passed to fit(), and
    infinity passed to transform() after a valid fit().
    """
    all_missing_msg = "One or more columns have all rows missing."
    inf_msg = "+/- inf values are not supported."

    # First column has no observed value at all
    empty_column = np.array([
        [np.nan, 0, 0, 1],
        [np.nan, 1, 2, np.nan],
        [np.nan, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
    ])
    seeded_imputer = MissForest(random_state=1337)
    assert_raise_message(ValueError, all_missing_msg, seeded_imputer.fit,
                         empty_column)

    # Infinity in the matrix given to fit()
    with_inf = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    assert_raise_message(ValueError, inf_msg, MissForest().fit, with_inf)

    # Infinity in the matrix given to transform(); a fresh array is built so
    # this check does not depend on the array used above
    with_inf_again = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    finite_fit_data = np.array([
        [0, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    fitted_transform = MissForest().fit(finite_fit_data).transform
    assert_raise_message(ValueError, inf_msg, fitted_transform,
                         with_inf_again)
|
missingpy/utils.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Utility Functions"""
|
|
2
|
+
# Author: Ashim Bhattarai
|
|
3
|
+
# License: BSD 3 clause
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def masked_euclidean_distances(X, Y=None, squared=False,
                               missing_values="NaN", copy=True):
    """Calculates euclidean distances in the presence of missing values

    Computes the euclidean distance between each pair of samples (rows) in X
    and Y, where Y=X is assumed if Y=None.
    When calculating the distance between a pair of samples, this formulation
    essentially zero-weights feature coordinates with a missing value in either
    sample and scales up the weight of the remaining coordinates:

        dist(x,y) = sqrt(weight * sq. distance from non-missing coordinates)
        where,
        weight = Total # of coordinates / # of non-missing coordinates

    Note that if a pair of samples has no common non-missing coordinates
    then NaN is returned for that pair. A sample whose coordinates are all
    missing is rejected with a ``ValueError`` (see Raises).

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array-like, shape (n_samples_1, n_features)
        NOTE(review): validated with ``accept_sparse=False``, so sparse
        input is presumably rejected by ``check_pairwise_arrays`` - confirm.

    Y : array-like, shape (n_samples_2, n_features), optional

    squared : boolean, optional
        Return squared Euclidean distances.

    missing_values : "NaN", NaN or integer, optional
        Representation of missing value

    copy : boolean, optional
        Forwarded to ``check_pairwise_arrays``; intended to make the
        function work on a deep copy of X and Y (if Y exists), since the
        missing entries are overwritten with zero during the computation.

    Returns
    -------
    distances : {array}, shape (n_samples_1, n_samples_2)

    Raises
    ------
    ValueError
        If X or Y contains +/- infinity, if any row consists only of
        missing values, or if NaNs are present while ``missing_values`` is
        not NaN.

    Examples
    --------
    >>> from missingpy.utils import masked_euclidean_distances
    >>> nan = float("NaN")
    >>> X = [[0, 1], [1, nan]]
    >>> # distance between rows of X
    >>> masked_euclidean_distances(X, X)
    array([[0.        , 1.41421356],
           [1.41421356, 0.        ]])

    >>> # get distance to origin
    >>> masked_euclidean_distances(X, [[0, 0]])
    array([[1.        ],
           [1.41421356]])

    References
    ----------
    * John K. Dixon, "Pattern Recognition with Partly Missing Data",
      IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:
      10, pp. 617 - 621, Oct. 1979.
      http://ieeexplore.ieee.org/abstract/document/4310090/

    See also
    --------
    paired_distances : distances between pairs of elements of X and Y.
    """
    # Import here to prevent circular import
    from .pairwise_external import _get_mask, check_pairwise_arrays

    # NOTE: force_all_finite=False allows not only NaN but also +/- inf
    X, Y = check_pairwise_arrays(X, Y, accept_sparse=False,
                                 force_all_finite=False, copy=copy)
    if (np.any(np.isinf(X)) or
            (Y is not X and np.any(np.isinf(Y)))):
        raise ValueError(
            "+/- Infinite values are not allowed.")

    # Get missing mask for X and Y.T
    mask_X = _get_mask(X, missing_values)

    YT = Y.T
    mask_YT = mask_X.T if Y is X else _get_mask(YT, missing_values)

    # Check if any rows have only missing value
    if np.any(mask_X.sum(axis=1) == X.shape[1])\
            or (Y is not X and np.any(mask_YT.sum(axis=0) == Y.shape[1])):
        raise ValueError("One or more rows only contain missing values.")

    # Decide whether NaN itself is the missing marker. A membership test
    # against ``["NaN", np.nan]`` only recognises the ``np.nan`` object by
    # identity (NaN != NaN), so other NaN floats such as ``float("nan")``
    # would be wrongly treated as a non-NaN marker.
    if isinstance(missing_values, str):
        nan_is_missing = missing_values == "NaN"
    else:
        nan_is_missing = (isinstance(missing_values, (float, np.floating))
                          and bool(np.isnan(missing_values)))

    if not nan_is_missing and (
            np.any(np.isnan(X)) or (Y is not X and np.any(np.isnan(Y)))):
        raise ValueError(
            "NaN values present but missing_value = {0}".format(
                missing_values))

    # Get mask of non-missing values set Y.T's missing to zero.
    # Further, casting the mask to int to be used in formula later.
    not_YT = (~mask_YT).astype(np.int32)
    YT[mask_YT] = 0

    # Get X's mask of non-missing values and set X's missing to zero
    not_X = (~mask_X).astype(np.int32)
    X[mask_X] = 0

    # Calculate distances
    # The following formula derived by:
    # Shreya Bhattarai <shreya.bhattarai@gmail.com>

    distances = (
        (X.shape[1] / (np.dot(not_X, not_YT))) *
        (np.dot(X * X, not_YT) - 2 * (np.dot(X, YT)) +
         np.dot(not_X, YT * YT)))

    # Rounding in the expanded formula can leave tiny negative squared
    # distances, which sqrt would turn into NaN. Clamp them to zero; genuine
    # NaNs (pairs without common coordinates) pass through np.maximum.
    np.maximum(distances, 0, out=distances)

    if X is Y:
        # Ensure that distances between vectors and themselves are set to 0.0.
        # This may not be the case due to floating point rounding errors.
        distances.flat[::distances.shape[0] + 1] = 0.0

    return distances if squared else np.sqrt(distances, out=distances)
|