cpgtools-2.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
impyutelib.py
ADDED
@@ -0,0 +1,890 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 8 12:01:45 2024
Adapted and modified from impyute.
"""

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from scipy.spatial import KDTree
from functools import wraps


## Common operations on matrices

def nan_indices(data):
    """ Finds the indices of all missing values.

    Parameters
    ----------
    data: numpy.ndarray

    Returns
    -------
    numpy.ndarray
        One (i, j) row per missing value, as returned by np.argwhere.
    """
    return np.argwhere(np.isnan(data))

def map_nd(fn, arr):
    """ Map fn, a function taking a single value, over the entire n-dim array.

    Parameters
    ----------
    arr: numpy.ndarray

    Returns
    -------
    numpy.ndarray
    """
    return np.vectorize(fn)(arr)

def every_nd(fn, arr):
    """ Returns True if fn is true for all elements of arr.

    Parameters
    ----------
    arr: numpy.ndarray

    Returns
    -------
    bool
    """
    return all(map(fn, arr.flatten()))


## Util

def thread(arg, *fns):
    if len(fns) > 0:
        return thread(fns[0](arg), *fns[1:])
    else:
        return arg
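
# Illustrative sketch (not part of the original module): `thread` pipes a
# value through a sequence of one-argument functions, left to right; the
# `wrappers` decorator below is built with it.
#
#     >>> thread(3, lambda x: x + 1, lambda x: x * 2)
#     8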

def identity(x):
    return x

def constantly(x):
    """ Returns a function that takes any args and returns x """
    def func(*args, **kwargs):
        return x
    return func

def complement(fn):
    """ Returns a fn that outputs the opposite truth values of the
    input function
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        return not fn(*args, **kwargs)
    return wrapper

def execute_fn_with_args_and_or_kwargs(fn, args, kwargs):
    """ Call fn with args and kwargs; if fn doesn't accept the kwargs,
    fall back to passing only args """
    try:
        return fn(*args, **kwargs)
    except TypeError:
        return fn(*args)
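
# Illustrative sketch (not part of the original module): the TypeError
# fallback means a positional-only function still runs even when extra
# keyword options are threaded through the decorator stack.
#
#     >>> execute_fn_with_args_and_or_kwargs(lambda a: a * 2, [21], {"unused": 1})
#     42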

def toy_df(n_rows=20, n_cols=5, missingness=0.2, min_val=0, max_val=1,
           missing_value=np.nan, rand_seed=1234, sample_prefix=None):
    """Generate a random array (or DataFrame, if sample_prefix is given)
    with missing values inserted."""
    np.random.seed(rand_seed)
    X = np.random.uniform(
        low=min_val, high=max_val,
        size=n_rows * n_cols).reshape(n_rows, n_cols).astype(float)
    # check missingness
    if missingness > 0:
        # If missingness >= 1 then use it as an approximate (see below) count
        if missingness >= 1:
            n_missing = int(missingness)
        else:
            n_missing = int(missingness * n_rows * n_cols)
        print(n_missing)

        # Introduce NaNs until n_missing "NAs" are inserted.
        missing_count = 0
        for i, j in zip(np.random.choice(n_rows, n_missing), np.random.choice(n_cols, n_missing)):
            if np.isnan(X[i][j]):
                continue
            else:
                X[i][j] = missing_value
                missing_count += 1
            if missing_count >= n_missing:
                break

    # check sample_prefix
    if sample_prefix is None:
        return X
    else:
        colNames = [sample_prefix + '_' + str(i) for i in range(0, n_cols)]
        return pd.DataFrame(X, columns=colNames)
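
# Illustrative sketch (not part of the original module): with the defaults
# above, toy_df yields a 20x5 matrix with roughly 20% of entries set to NaN
# (duplicate position draws can leave the count slightly short), returned as
# a DataFrame when a column prefix is given.
#
#     >>> arr = toy_df()                       # numpy array
#     >>> df = toy_df(sample_prefix="sample")  # columns sample_0 .. sample_4
#     >>> df.shape
#     (20, 5)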

def insert_na(df, n_miss, seed):
    """ Randomly set n_miss entries of df to NaN, reproducibly via seed. """
    np.random.seed(seed)
    nrow, ncol = df.shape
    na_count = 0
    if n_miss >= nrow * ncol:
        # more requested NaNs than cells: blank out the whole frame
        out_df = df.copy()
        out_df.loc[:, :] = np.nan
    else:
        tmp = df.to_numpy()
        while True:
            if na_count >= n_miss:
                break
            x_ind = np.random.choice(nrow)
            y_ind = np.random.choice(ncol)
            if not np.isnan(tmp[x_ind][y_ind]):
                tmp[x_ind][y_ind] = np.nan
                na_count += 1
        out_df = pd.DataFrame(tmp, index=df.index, columns=df.columns)
    return out_df

def apply_method(df, method_name, **kwargs):
    """Applies a pandas method to a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to apply the method to.
        method_name (str): The name of the method to apply.
        **kwargs: Additional keyword arguments to pass to the method.

    Returns:
        pd.DataFrame: The transformed DataFrame.
    """
    method = getattr(df, method_name)
    return method(**kwargs)
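
# Illustrative sketch (not part of the original module): apply_method
# dispatches by attribute name, so any DataFrame method can be driven from a
# string, e.g. a value read from a config file.
#
#     >>> df = pd.DataFrame({"a": [2.0, 1.0]})
#     >>> apply_method(df, "sort_values", by="a")
#          a
#     1  1.0
#     0  2.0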

def shepards(distances, power=2):
    """ Basic inverse distance weighting function

    Parameters
    ----------
    distances: list/numpy.ndarray
        1D list of numbers (e.g. distance results from a call to KDTree.query)

    power: int
        Default of 2 used since the referenced paper stated an exponent of 2
        "gives seemingly satisfactory results"

    Returns
    -------
    numpy.ndarray
        1D array of numbers that sum to 1: the weights of the provided
        distances, in order.

    References
    ----------
    Shepard, Donald (1968). "A two-dimensional interpolation function for
    irregularly-spaced data". Proceedings of the 1968 ACM National
    Conference. pp. 517-524. doi:10.1145/800186.810616
    """
    return to_percentage(1 / np.power(distances, power))

def to_percentage(vec):
    """ Normalises a list of real numbers into proportions that sum to 1 """
    return vec / np.sum(vec)
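
# Illustrative sketch (not part of the original module): a worked
# inverse-distance weighting example. With distances [1, 2] and the default
# power of 2, the raw weights are [1, 0.25], which normalise to [0.8, 0.2].
#
#     >>> shepards(np.array([1.0, 2.0]))
#     array([0.8, 0.2])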


## Wrapper

def handle_df(fn):
    """ Decorator to handle a pandas DataFrame object as input

    If the first arg is a pandas DataFrame, convert it to a numpy array;
    otherwise don't do anything. Cast back to a pandas DataFrame after
    the imputation function has run.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        is_df = False
        ## convert tuple to list so args can be modified
        args = list(args)
        ## Either make a copy or use a pointer to the original
        if kwargs.get('inplace'):
            args[0] = args[0]
        else:
            args[0] = args[0].copy()

        ## If input data is a dataframe then cast the input to an np.array
        ## and set an indicator flag before continuing
        if isinstance(args[0], pd.DataFrame):
            is_df = True
            in_ind = args[0].index
            in_columns = args[0].columns
            args[0] = args[0].to_numpy()

        ## function invocation
        results = execute_fn_with_args_and_or_kwargs(fn, args, kwargs)

        ## cast the output back to a DataFrame.
        if is_df:
            results = pd.DataFrame(results, index=in_ind, columns=in_columns)
        return results
    return wrapper

def add_inplace_option(fn):
    """ Decorator for an inplace option

    Functions wrapped by this accept an `inplace` kwarg choosing between
    operating on a copy of the data or on the passed-in reference. """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        """ Run input checks """
        ## convert tuple to list so args can be modified
        args = list(args)
        ## Either make a copy or use a pointer to the original
        if kwargs.get('inplace'):
            args[0] = args[0]
        else:
            args[0] = args[0].copy()

        ## function invocation
        return execute_fn_with_args_and_or_kwargs(fn, args, kwargs)
    return wrapper

def conform_output(fn):
    """ Decorator to handle impossible values

    Adds two optional kwargs, `coerce_fn` and `valid_fn`.

    `valid_fn` function stub

        def my_valid_fn(some_literal) -> bool

    `coerce_fn` function stub

        def my_coerce_fn(arr, x_i, y_i) -> some_literal

    The valid function is run on each element of the output; it indicates
    whether the value is valid or not.

    The coerce function takes three arguments: the output matrix and the two
    indices of the invalid value, x_i and y_i. It is run on every invalid
    value.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        def raise_error(arr, x_i, y_i):
            raise Exception("{} does not conform".format(arr[x_i, y_i]))
        ## convert tuple to list so args can be modified
        args = list(args)
        # function that checks if the value is valid
        valid_fn = kwargs.get("valid_fn", constantly(True))
        # function that modifies the invalid value to something valid
        coerce_fn = kwargs.get("coerce_fn", raise_error)

        ## function invocation
        results = execute_fn_with_args_and_or_kwargs(fn, args, kwargs)

        # check each value to see if it's valid
        bool_arr = map_nd(complement(valid_fn), results)
        # get indices of invalid values
        invalid_indices = np.argwhere(bool_arr)
        # run the coerce fn on each invalid index
        for x_i, y_i in invalid_indices:
            results[x_i, y_i] = coerce_fn(results, x_i, y_i)

        return results
    return wrapper

def wrappers(fn):
    """ Helper decorator; applies all the wrapper functions that modify the
    input (matrix with missing values) and output (matrix with imputed
    values)

    NOTE: `handle_df` has to be last as it needs to be the outermost layer
    (first entry point), since every other function assumes you're getting
    an np.array as input
    """
    return thread(
        fn,                  # function that's getting wrapped
        add_inplace_option,  # allow choosing reference/copy
        conform_output,      # allow enforcing of some spec on returned outputs
        handle_df,           # if df type, cast to np.array on in and df on out
    )
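
# Illustrative sketch (not part of the original module): the stack expands to
# handle_df(conform_output(add_inplace_option(fn))), so a wrapped imputer
# accepts arrays or DataFrames plus the `inplace`, `valid_fn` and `coerce_fn`
# keyword options. For example, clamping mean-imputed values into the
# hypothetical bounds [0, 1] via `mean`, defined below:
#
#     >>> X = toy_df(n_rows=4, n_cols=3, missingness=2)
#     >>> mean(X, valid_fn=lambda v: 0 <= v <= 1,
#     ...      coerce_fn=lambda arr, i, j: min(max(arr[i, j], 0.0), 1.0))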

## Central tendency
@wrappers
def mean(data):
    """ Substitute missing values with the mean of that column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = nan_indices(data)
    for x_i, y_i in nan_xy:
        col_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col_wo_nan)
        data[x_i][y_i] = new_value
    return data
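
# Illustrative sketch (not part of the original module): column-mean
# imputation on a small array.
#
#     >>> X = np.array([[1.0, 2.0], [np.nan, 4.0]])
#     >>> mean(X)
#     array([[1., 2.],
#            [1., 4.]])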

@wrappers
def median(data):
    """ Substitute missing values with the median (middle value) of that
    column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = nan_indices(data)
    cols_missing = set(nan_xy.T[1])
    medians = {}
    for y_i in cols_missing:
        cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        median_y = np.median(cols_wo_nan)
        medians[str(y_i)] = median_y
    for x_i, y_i in nan_xy:
        data[x_i][y_i] = medians[str(y_i)]
    return data

@wrappers
def mode(data):
    """ Substitute missing values with the mode (most frequent value) of
    that column.

    In the case of a tie (multiple equally frequent values in a column),
    randomly pick one of them.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = nan_indices(data)
    modes = []
    for y_i in range(np.shape(data)[1]):
        unique_counts = np.unique(data[:, [y_i]], return_counts=True)
        max_count = np.max(unique_counts[1])
        mode_y = [unique for unique, count in np.transpose(unique_counts)
                  if count == max_count and not np.isnan(unique)]
        modes.append(mode_y)  # modes of column y_i
    for x_i, y_i in nan_xy:
        data[x_i][y_i] = np.random.choice(modes[y_i])
    return data


#################
## random impute
#################
@wrappers
def random_impute(data):
    """ Fill missing values in with a randomly selected value from the same
    column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = nan_indices(data)
    for x, y in nan_xy:
        uniques = np.unique(data[:, y])
        uniques = uniques[~np.isnan(uniques)]
        data[x][y] = np.random.choice(uniques)
    return data


########################
## moving window impute
########################
@wrappers
def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
                  inplace=False):
    """ Interpolate the missing values based on nearby values.

    For example, with an array like this:

        array([[-1.24940, -1.38673, -0.03214945,  0.08255145, -0.007415],
               [ 2.14662,  0.32758, -0.82601414,  1.78124027,  0.873998],
               [-0.41400, -0.977629,         nan, -1.39255344,  1.680435],
               [ 0.40975,  1.067599,  0.29152388, -1.70160145, -0.565226],
               [-0.54592, -1.126187,  2.04004377,  0.16664863, -0.010677]])

    using a window size (`wsize`) of 3, the one missing value would be set
    to -1.18509122, the mean of its horizontal neighbours. The window
    operates on the horizontal axis.

    Usage
    -----

    The parameters default the function to a moving mean. You may want to
    change the default window size:

        moving_window(data, wsize=10)

    To only look at past data (null value is at the rightmost index in the
    window):

        moving_window(data, nindex=-1)

    To use a custom function:

        moving_window(data, func=np.median)

    You can also do something like take 1.5x the max of previous values in
    the window:

        moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    nindex: int
        Null index. Index of the null value inside the moving average window.
        Use cases: say you wanted to make the value skewed toward the left or
        right side. 0 would only take the average of values from the right,
        and -1 would only take the average of values from the left.
    wsize: int
        Window size. Size of the moving average window/area of values being
        used for each local imputation. This number includes the missing
        value.
    errors: {"raise", "coerce", "ignore"}
        Errors will occur with the indexing of the windows - for example if
        there is a nan at data[x][0] and `nindex` is set to -1, or there is a
        nan at data[x][-1] and `nindex` is set to 0. `"raise"` will raise an
        error, `"coerce"` will try again using `nindex` set to the middle,
        and `"ignore"` will just leave it as a nan.
    inplace: {True, False}
        Whether to return a copy or run on the passed-in array.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    if errors == "ignore":
        raise Exception("`errors` value `ignore` not implemented yet. Sorry!")

    if not inplace:
        data = data.copy()

    if nindex is None:  # If using equal window side lengths
        assert wsize % 2 == 1, "The parameter `wsize` should not be even "\
            "if the value `nindex` is not set, since it defaults to the "\
            "midpoint and an even `wsize` makes the midpoint ambiguous"
        wside_left = wsize // 2
        wside_right = wsize // 2
    else:  # If using custom window side lengths
        assert nindex < wsize, "The null index must be smaller than the window size"
        if nindex == -1:
            wside_left = wsize - 1
            wside_right = 0
        else:
            wside_left = nindex
            wside_right = wsize - nindex - 1

    while True:
        nan_xy = nan_indices(data)
        n_nan_prev = len(nan_xy)
        for x_i, y_i in nan_xy:
            left_i = max(0, y_i - wside_left)
            # the window runs along the row, so bound by the number of columns
            right_i = min(data.shape[1], y_i + wside_right + 1)
            window = data[x_i, left_i: right_i]
            window_not_null = window[~np.isnan(window)]

            if len(window_not_null) > 0:
                try:
                    data[x_i][y_i] = func(window_not_null)
                    continue
                except Exception as e:
                    if errors == "raise":
                        raise e

            if errors == "coerce":
                # If either the window has a length of 0 or the aggregate
                # function fails somehow, fall back to doing the best we can:
                # treat the missing value as the window midpoint and
                # recalculate. Use temporary wside_left/wside_right for the
                # calculation of this specific problematic value only.
                wside_left_tmp = wsize // 2
                wside_right_tmp = wside_left_tmp

                left_i_tmp = max(0, y_i - wside_left_tmp)
                right_i_tmp = min(data.shape[1], y_i + wside_right_tmp + 1)

                window = data[x_i, left_i_tmp:right_i_tmp]
                window_not_null = window[~np.isnan(window)]
                try:
                    data[x_i][y_i] = func(window_not_null)
                except Exception as e:
                    print("Exception:", e)
        if n_nan_prev == len(nan_indices(data)):
            break
    return data
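
# Illustrative sketch (not part of the original module): filling a gap in a
# single row with the surrounding window mean (the default wsize=5 covers the
# whole row here).
#
#     >>> X = np.array([[1.0, 2.0, np.nan, 4.0, 5.0]])
#     >>> moving_window(X)
#     array([[1., 2., 3., 4., 5.]])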


########################
## fKNN
########################
@wrappers
def fKNN(data, na_locations, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
         idw_fn=shepards, init_impute_fn=mean):
    """ Impute using a variant of the nearest neighbours approach

    Basic idea: pre-impute the array (e.g. with a mean impute) and then use
    the resulting complete array to construct a KDTree. Use this KDTree to
    compute nearest neighbours. After finding the `k` nearest neighbours,
    take the weighted average of them. Basically, find the nearest row in
    terms of distance.

    This approach is much, much faster than the other implementation
    (fit+transform for each subset), which is almost prohibitively expensive.

    Parameters
    ----------
    data: ndarray
        2D matrix to impute. In this adapted version it is expected to be
        pre-imputed already; `na_locations` records where the original
        missing values were.

    na_locations: tuple
        Pre-calculated (x, y) locations of missing values.

    k: int, optional
        Parameter used for method querying the KDTree class object. Number of
        neighbours used in the KNN query. Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).

    eps: nonnegative float, optional
        Parameter used for method querying the KDTree class object. From the
        SciPy docs: "Return approximate nearest neighbors; the kth returned
        value is guaranteed to be no further than (1+eps) times the distance
        to the real kth nearest neighbor". Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).

    p : float, 1<=p<=infinity, optional
        Parameter used for method querying the KDTree class object. Straight
        from the SciPy docs: "Which Minkowski p-norm to use. 1 is the
        sum-of-absolute-values Manhattan distance, 2 is the usual Euclidean
        distance, infinity is the maximum-coordinate-difference distance".
        Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).

    distance_upper_bound : nonnegative float, optional
        Parameter used for method querying the KDTree class object. Straight
        from the SciPy docs: "Return only neighbors within this distance.
        This is used to prune tree searches, so if you are doing a series of
        nearest-neighbor queries, it may help to supply the distance to the
        nearest neighbor of the most recent point." Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).

    leafsize: int, optional
        Parameter used for construction of the `KDTree` class object.
        Straight from the SciPy docs: "The number of points at which the
        algorithm switches over to brute-force. Has to be positive". Refer to
        the docs for
        [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html)
        for more information.

    idw_fn: fn, optional
        Function that takes one argument, a list of distances, and returns
        weighted percentages. You can define a custom one or bootstrap from
        functions defined in `impy.util.inverse_distance_weighting` using
        functools.partial, for example:
        `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`

    init_impute_fn: fn, optional
        Initial imputation function. Not used by this adapted version, which
        expects `data` to be pre-imputed.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    Examples
    --------
    (from the upstream impyute `fast_knn`, which pre-imputes internally)

    >>> data = np.arange(25).reshape((5, 5)).astype(float)
    >>> data[0][2] = np.nan
    >>> data
    array([[ 0.,  1., nan,  3.,  4.],
           [ 5.,  6.,  7.,  8.,  9.],
           [10., 11., 12., 13., 14.],
           [15., 16., 17., 18., 19.],
           [20., 21., 22., 23., 24.]])
    >> fast_knn(data, k=1) # Weighted average (by distance) of nearest 1 neighbour
    array([[ 0.,  1.,  7.,  3.,  4.],
           [ 5.,  6.,  7.,  8.,  9.],
           [10., 11., 12., 13., 14.],
           [15., 16., 17., 18., 19.],
           [20., 21., 22., 23., 24.]])
    >> fast_knn(data, k=2) # Weighted average of nearest 2 neighbours
    array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
           [10.        , 11.        , 12.        , 13.        , 14.        ],
           [15.        , 16.        , 17.        , 18.        , 19.        ],
           [20.        , 21.        , 22.        , 23.        , 24.        ]])
    >> fast_knn(data, k=3)
    array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
           [10.        , 11.        , 12.        , 13.        , 14.        ],
           [15.        , 16.        , 17.        , 18.        , 19.        ],
           [20.        , 21.        , 22.        , 23.        , 24.        ]])
    >> fast_knn(data, k=5) # There are at most only 4 neighbours. Raises error
    ...
    IndexError: index 5 is out of bounds for axis 0 with size 5
    """
    nan_xy = na_locations  # pre-calculated nan_xy
    data_c = data          # data is expected to be pre-imputed
    kdtree = KDTree(data_c, leafsize=leafsize)

    for x_i, y_i in nan_xy:
        distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps, p=p,
                                          distance_upper_bound=distance_upper_bound)
        # Will always return itself in the first index. Delete it.
        distances, indices = distances[1:], indices[1:]
        # Add small constant to distances to avoid division by 0
        distances += 1e-3
        weights = idw_fn(distances)
        # Assign missing value the weighted average of `k` nearest neighbours
        data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
    return data
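
# Illustrative sketch (not part of the original module): in this adapted
# version the matrix must be pre-imputed before the KDTree is built, with the
# original missing positions passed separately.
#
#     >>> X = np.arange(25).reshape((5, 5)).astype(float)
#     >>> X[0][2] = np.nan
#     >>> na_xy = nan_indices(X)
#     >>> X_filled = mean(X)                  # crude pre-imputation
#     >>> fKNN(X_filled, na_locations=na_xy, k=2)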


def external_ref(data, na_locations, ref_data, k=3, eps=0, p=2,
                 distance_upper_bound=np.inf, leafsize=10,
                 idw_fn=shepards):
    """ Impute using a variant of the nearest neighbours approach

    Basic idea: construct a KDTree from an external, complete reference
    matrix and use it to compute nearest neighbours for each row with missing
    values. After finding the `k` nearest neighbours, take the weighted
    average of them.

    This approach is much, much faster than the other implementation
    (fit+transform for each subset), which is almost prohibitively expensive.

    Parameters
    ----------
    data: ndarray
        2D matrix with missing values.

    na_locations: tuple
        Pre-calculated (x, y) locations of missing values.

    ref_data: ndarray
        2D matrix used as external reference data. The k nearest neighbours
        will be identified from this data.

    k: int, optional
        Parameter used for method querying the KDTree class object. Number of
        neighbours used in the KNN query.

    eps: nonnegative float, optional
        Parameter used for method querying the KDTree class object. From the
        SciPy docs: "Return approximate nearest neighbors; the kth returned
        value is guaranteed to be no further than (1+eps) times the distance
        to the real kth nearest neighbor".

    p : float, 1<=p<=infinity, optional
        Parameter used for method querying the KDTree class object. Straight
        from the SciPy docs: "Which Minkowski p-norm to use. 1 is the
        sum-of-absolute-values Manhattan distance, 2 is the usual Euclidean
        distance, infinity is the maximum-coordinate-difference distance".

    distance_upper_bound : nonnegative float, optional
        Parameter used for method querying the KDTree class object. Straight
        from the SciPy docs: "Return only neighbors within this distance.
        This is used to prune tree searches, so if you are doing a series of
        nearest-neighbor queries, it may help to supply the distance to the
        nearest neighbor of the most recent point."

    leafsize: int, optional
        Parameter used for construction of the `KDTree` class object.
        Straight from the SciPy docs: "The number of points at which the
        algorithm switches over to brute-force. Has to be positive".

    idw_fn: fn, optional
        Function that takes one argument, a list of distances, and returns
        weighted percentages. You can define a custom one or bootstrap from
        functions defined in `impy.util.inverse_distance_weighting` using
        functools.partial, for example:
        `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = na_locations  # pre-calculated nan_xy
    kdtree = KDTree(ref_data, leafsize=leafsize)

    for x_i, y_i in nan_xy:
        distances, indices = kdtree.query(data[x_i], k=k, eps=eps, p=p,
                                          distance_upper_bound=distance_upper_bound)
        # Add small constant to distances to avoid division by 0
        distances += 1e-3
        weights = idw_fn(distances)
        # Assign missing value the weighted average of `k` nearest neighbours
        data[x_i][y_i] = np.dot(weights, [ref_data[ind][y_i] for ind in indices])
    return data
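
# Illustrative sketch (not part of the original module), using hypothetical
# data: neighbours come from the complete reference matrix `ref`, not from
# the matrix being imputed, and the queried rows must themselves be NaN-free.
#
#     >>> ref = np.arange(25).reshape((5, 5)).astype(float)
#     >>> X = ref.copy()
#     >>> X[0][2] = np.nan
#     >>> na_xy = nan_indices(X)
#     >>> X[0][2] = X[:, 2][~np.isnan(X[:, 2])].mean()  # pre-impute the gap
#     >>> external_ref(X, na_locations=na_xy, ref_data=ref, k=2)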


############################
## Expectation-maximization
############################
@wrappers
def em(data, eps=0.1):
    """ Imputes given data using expectation maximization.

    E-step: Calculates the expected complete data log likelihood ratio.
    M-step: Finds the parameters that maximize the log likelihood of the
    complete data.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    eps: float
        The minimum change between iterations required to break; converge
        when relative change < eps, where
        relative change = abs(current - previous) / previous.
    inplace: boolean
        If True, operate on the numpy array reference.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = nan_indices(data)
    for x_i, y_i in nan_xy:
        col = data[:, int(y_i)]
        mu = col[~np.isnan(col)].mean()
        std = col[~np.isnan(col)].std()
        col[x_i] = np.random.normal(loc=mu, scale=std)
        previous, i = 1, 1
        while True:
            i += 1
            # Expectation
            mu = col[~np.isnan(col)].mean()
            std = col[~np.isnan(col)].std()
            # Maximization
            col[x_i] = np.random.normal(loc=mu, scale=std)
            # Break out of the loop once the relative change drops below
            # `eps` and the loop has run at least 5 times
            delta = np.abs(col[x_i] - previous) / previous
            if i > 5 and delta < eps:
                data[x_i][y_i] = col[x_i]
                break
            data[x_i][y_i] = col[x_i]
            previous = col[x_i]
    return data
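
# Illustrative sketch (not part of the original module): each missing entry
# is redrawn from a normal distribution fit to its column until the draw
# stabilises to within `eps`.
#
#     >>> X = toy_df(n_rows=50, n_cols=4, missingness=0.1)
#     >>> em(X, eps=0.05)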


#######################
## Buck's method
#######################
@wrappers
def buck_iterative(data, eps=0.1):
    """ Iterative variant of Buck's method

    - Variable to regress on is chosen at random.
    - EM-type infinite regression loop stops after the change in prediction
      from the previous prediction is < eps for all columns with missing
      values

    S. F. Buck, "A Method of Estimation of Missing Values in Multivariate
    Data Suitable for use with an Electronic Computer", Journal of the Royal
    Statistical Society, Series B (Methodological), Vol. 22, No. 2 (1960),
    pp. 302-306.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    eps: float
        The minimum change between iterations required to break; converge
        when relative change < eps, where
        relative change = abs(current - previous) / previous.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = nan_indices(data)

    # Add a column of zeros to the index values
    nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)

    nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]
    temp = []
    cols_missing = {y for _, y, _ in nan_xyz}

    # Step 1: Simple Imputation, these are just placeholders
    for x_i, y_i, value in nan_xyz:
        # Column containing the nan value, without the nan value
        col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]

        new_value = np.mean(col)
        data[x_i][y_i] = new_value
        temp.append([x_i, y_i, new_value])
    nan_xyz = temp

    # Step 5: Repeat steps 2 - 4 until convergence

    converged = [False] * len(nan_xyz)
    while not all(converged):
        # Step 2: Placeholders are set back to missing for one variable/column
        dependent_col = int(np.random.choice(list(cols_missing)))
        missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]

        # Step 3: Perform linear regression using the other variables
        x_train, y_train = [], []
        for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
            x_train.append(np.delete(data[x_i], dependent_col))
            y_train.append(data[x_i][dependent_col])
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Step 4: Missing values for the missing variable/column are replaced
        # with predictions from our new linear regression model
        # For null indices with the dependent column that was randomly chosen
        for i, z in enumerate(nan_xyz):
            x_i = z[0]
            y_i = z[1]
            value = data[x_i, y_i]
            if y_i == dependent_col:
                # Row 'x' without the nan value; take the scalar prediction
                new_value = model.predict([np.delete(data[x_i], dependent_col)])[0]
                data[x_i][y_i] = new_value
                if value == 0.0:
                    delta = (new_value - value) / 0.01
                else:
                    delta = (new_value - value) / value
                converged[i] = abs(delta) < eps
    return data
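
# Illustrative sketch (not part of the original module): regression-based
# refinement of mean-imputed placeholders. Convergence is stochastic, so
# seeding keeps a run reproducible.
#
#     >>> np.random.seed(0)
#     >>> X = toy_df(n_rows=30, n_cols=4, missingness=0.1)
#     >>> buck_iterative(X)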