cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
impyute/ops/util.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
""" Random utility functions """
|
|
2
|
+
from functools import wraps
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
# Things that get exposed from * import
|
|
7
|
+
__all__ = [
|
|
8
|
+
"constantly", "complement", "identity", "thread",
|
|
9
|
+
"execute_fn_with_args_and_or_kwargs", "toy_df",
|
|
10
|
+
"insert_na",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
def thread(arg, *fns):
    """ Pipe `arg` through `fns`, left to right

    `thread(x, f, g)` evaluates to `g(f(x))`. When no functions are supplied
    `arg` is handed back untouched.
    """
    piped = arg
    for step in fns:
        piped = step(piped)
    return piped
|
|
18
|
+
|
|
19
|
+
def identity(x):
    """ Hand back the very object that was passed in """
    return x
|
|
21
|
+
|
|
22
|
+
def constantly(x):
    """ Build a callable that disregards whatever it is called with and
    always yields `x`
    """
    def _constant(*_ignored, **_ignored_kw):
        return x
    return _constant
|
|
27
|
+
|
|
28
|
+
def complement(fn):
    """ Negate a predicate

    The returned callable forwards every positional and keyword argument
    to `fn` and returns `not` of whatever `fn` produced.
    """
    @wraps(fn)
    def negated(*args, **kwargs):
        verdict = fn(*args, **kwargs)
        return not verdict
    return negated
|
|
36
|
+
|
|
37
|
+
def execute_fn_with_args_and_or_kwargs(fn, args, kwargs):
    """ Call `fn(*args, **kwargs)`; on `TypeError` retry as `fn(*args)`

    The retry is triggered by any `TypeError` coming out of the first call,
    including one raised from inside `fn`, and it drops every keyword
    argument, not only the ones `fn` does not accept.
    """
    try:
        outcome = fn(*args, **kwargs)
    except TypeError:
        outcome = fn(*args)
    return outcome
|
|
43
|
+
|
|
44
|
+
def toy_df(n_rows=20, n_cols=5, missingness=0.2, min_val=0, max_val=1,
           missing_value=np.nan, rand_seed=1234, sample_prefix=None):
    """Generate an array or DataFrame with NaNs

    Parameters
    ----------
    n_rows, n_cols: int
        Shape of the generated matrix.
    missingness: float
        Values below 1 are a fraction of all cells, values of 1 or more are
        an absolute number of cells. Either way the number is approximate:
        positions are drawn with replacement, so repeated draws leave fewer
        distinct cells marked than requested.
    min_val, max_val: float
        Bounds handed to `np.random.uniform`.
    missing_value:
        Value written into the cells selected as missing.
    rand_seed: int
        Seed passed to `np.random.seed` (this reseeds numpy's global state).
    sample_prefix: str (optional)
        When given, the result is a DataFrame whose columns are named
        `<sample_prefix>_<column index>`; otherwise a numpy array is returned.

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        Float matrix of shape `(n_rows, n_cols)`.
    """
    np.random.seed(rand_seed)
    X = np.random.uniform(
        low=min_val, high=max_val, size=n_rows * n_cols
    ).reshape(n_rows, n_cols).astype(float)

    if missingness > 0:
        # >= 1 is read as a cell count, anything smaller as a fraction
        if missingness >= 1:
            n_missing = int(missingness)
        else:
            n_missing = int(missingness * n_rows * n_cols)

        # Same two draws, in the same order, as the former element-by-element
        # loop, so a given seed keeps producing the same matrix.
        rows = np.random.choice(n_rows, n_missing)
        cols = np.random.choice(n_cols, n_missing)
        X[rows, cols] = missing_value

    if sample_prefix is None:
        return X
    col_names = [sample_prefix + '_' + str(i) for i in range(n_cols)]
    return pd.DataFrame(X, columns=col_names)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def insert_na(df, n_miss, seed):
    """ Return a copy of `df` with `n_miss` more cells set to NaN

    Cells are picked uniformly at random (row and column drawn
    independently); a draw that lands on a cell that is already NaN is
    rejected and redrawn. The input frame is never modified.

    Parameters
    ----------
    df: pandas.DataFrame
        Numeric frame to punch holes into.
    n_miss: int
        Number of additional NaNs to insert. If it is at least the number of
        cells, every cell becomes NaN. If it exceeds the number of cells that
        are not yet NaN, all remaining cells become NaN.
    seed: int
        Seed passed to `np.random.seed` (this reseeds numpy's global state).

    Returns
    -------
    pandas.DataFrame
        New frame with the index and columns of `df`.
    """
    np.random.seed(seed)
    nrow, ncol = df.shape
    if n_miss >= nrow * ncol:
        return pd.DataFrame(np.nan, index=df.index, columns=df.columns)

    # `to_numpy()` may hand back a view onto the frame's own buffer; take an
    # explicit copy so the caller's data is left alone.
    tmp = df.to_numpy(copy=True)
    if np.issubdtype(tmp.dtype, np.integer):
        # integer arrays cannot hold NaN
        tmp = tmp.astype(float)

    # Cap the target at the number of cells that can still be blanked,
    # otherwise the rejection loop below would never terminate.
    target = min(n_miss, int(np.count_nonzero(~pd.isna(tmp))))
    na_count = 0
    while na_count < target:
        x_ind = np.random.choice(nrow)
        y_ind = np.random.choice(ncol)
        if not np.isnan(tmp[x_ind, y_ind]):
            tmp[x_ind, y_ind] = np.nan
            na_count += 1
    return pd.DataFrame(tmp, index=df.index, columns=df.columns)
|
impyute/ops/wrapper.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
""" Decorator functions to wrap around entry and exit
|
|
2
|
+
|
|
3
|
+
... to easily apply to a function, functions that check/process inputs
|
|
4
|
+
and outputs
|
|
5
|
+
"""
|
|
6
|
+
from functools import wraps
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from . import error
|
|
11
|
+
from . import matrix
|
|
12
|
+
from . import util as u
|
|
13
|
+
|
|
14
|
+
## Hacky way to handle python2 not having `ModuleNotFoundError`
|
|
15
|
+
# pylint: disable=redefined-builtin, missing-docstring
|
|
16
|
+
try:
    # Merely looking the name up is enough: interpreters without the
    # builtin raise NameError here.
    ModuleNotFoundError
except NameError:
    class ModuleNotFoundError(Exception):
        pass
|
|
23
|
+
# pylint: enable=redefined-builtin, missing-docstring
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def handle_df(fn):
    """ Decorator that lets `fn` be called with a pandas DataFrame

    The first positional argument is copied unless a truthy `inplace`
    keyword is present. If that argument is a DataFrame it is converted to
    a numpy array before `fn` runs, and whatever `fn` returns is wrapped
    back into a DataFrame carrying the original index and columns. Any other
    input type is passed through and the result is returned as is.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        ## tuples are immutable, work on a list of the positional args
        call_args = list(args)
        ## hand over a copy unless the caller asked for in-place operation
        if not kwargs.get('inplace'):
            call_args[0] = call_args[0].copy()

        ## remember the labels and strip the frame down to its values
        payload = call_args[0]
        came_in_as_df = isinstance(payload, pd.DataFrame)
        if came_in_as_df:
            row_labels, col_labels = payload.index, payload.columns
            call_args[0] = payload.to_numpy()

        ## function invocation
        results = u.execute_fn_with_args_and_or_kwargs(fn, call_args, kwargs)

        ## only DataFrame inputs get a DataFrame back
        if not came_in_as_df:
            return results
        return pd.DataFrame(results, index=row_labels, columns=col_labels)
    return wrapper
|
|
60
|
+
|
|
61
|
+
def add_inplace_option(fn):
    """ Decorator for inplace option

    Calls to the decorated function may carry an `inplace` keyword. When it
    is truthy the first positional argument is forwarded as is, otherwise
    `fn` receives the result of calling `.copy()` on it.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        ## first positional arg is the data; keep it or duplicate it
        first = args[0] if kwargs.get('inplace') else args[0].copy()
        forwarded = [first] + list(args[1:])

        ## function invocation
        return u.execute_fn_with_args_and_or_kwargs(fn, forwarded, kwargs)
    return wrapper
|
|
80
|
+
|
|
81
|
+
def conform_output(fn):
    """ Decorator to handle impossible values

    Reads two optional kwargs, `valid_fn` and `coerce_fn`.

    `valid_fn` function stub

        def my_valid_fn(some_literal) -> boolean

    `coerce_fn` function stub

        def my_coerce_fn(arr, x_i, y_i) -> some_literal

    `valid_fn` is applied to the values of the result through
    `matrix.map_nd` and tells whether a value is acceptable. It defaults to
    a function that accepts everything.

    `coerce_fn` is called for every value that `valid_fn` rejected. It gets
    the result matrix and the two indices of the offending value, and its
    return value replaces that value. The default raises
    `error.BadOutputError`.

    Both kwargs are also forwarded to `fn` together with all other kwargs.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        def raise_error(arr, x_i, y_i):
            raise error.BadOutputError("{} does not conform".format(arr[x_i, y_i]))

        # predicate deciding whether a value is acceptable
        valid_fn = kwargs.get("valid_fn", u.constantly(True))
        # repair function for the values that are not
        coerce_fn = kwargs.get("coerce_fn", raise_error)

        ## function invocation (positional args go in as a list)
        results = u.execute_fn_with_args_and_or_kwargs(fn, list(args), kwargs)

        # mark the rejected values, then repair them one at a time
        rejected = matrix.map_nd(u.complement(valid_fn), results)
        for x_i, y_i in np.argwhere(rejected):
            results[x_i, y_i] = coerce_fn(results, x_i, y_i)

        return results
    return wrapper
|
|
126
|
+
|
|
127
|
+
def wrappers(fn):
    """ Helper decorator that stacks every input/output wrapper on `fn`

    Application order, innermost first:

    1. `add_inplace_option` - allow choosing reference/copy
    2. `conform_output` - allow enforcing some spec on returned outputs
    3. `handle_df` - if df type, cast to np.array on in and df on out

    NOTE: `handle_df` has to be the outermost layer (first entry point) since
    every other wrapper assumes it is getting an np.array as input
    """
    with_inplace = add_inplace_option(fn)
    with_conform = conform_output(with_inplace)
    return handle_df(with_conform)
|
|
141
|
+
|
|
142
|
+
def _shape_2d(data):
    """ True if `data` has exactly two dimensions"""
    return np.ndim(data) == 2
|
|
145
|
+
|
|
146
|
+
def _shape_3d(data):
    """ True if `data` has exactly three dimensions"""
    return np.ndim(data) == 3
|
|
149
|
+
|
|
150
|
+
def _is_ndarray(data):
    """ True if `data` is a numpy ndarray (or an instance of a subclass)"""
    is_array = isinstance(data, np.ndarray)
    return is_array
|
|
153
|
+
|
|
154
|
+
def _dtype_float(data):
    """ True if the dtype of `data` compares equal to Python's `float`"""
    return data.dtype == np.dtype(float)
|
|
157
|
+
|
|
158
|
+
def _nan_exists(data):
    """ True if `matrix.nan_indices` reports at least one entry for `data`"""
    return len(matrix.nan_indices(data)) > 0
|
|
162
|
+
|
|
163
|
+
def checks(fn):
    """ Decorator that validates the first positional argument

    The checks run in the order listed and stop at the first one that
    fails, raising `error.BadInputError` with the message shown:

    * two dimensional - "No support for arrays that aren't 2D yet."
    * a numpy ndarray - "Not a np.ndarray."
    * of float dtype - "Data is not float."
    * containing a NaN - "No NaN's in given data"

    Once all of them pass `fn` is invoked with the untouched arguments.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        data = args[0]
        requirements = (
            (_shape_2d, "No support for arrays that aren't 2D yet."),
            (_is_ndarray, "Not a np.ndarray."),
            (_dtype_float, "Data is not float."),
            (_nan_exists, "No NaN's in given data"),
        )
        for is_satisfied, complaint in requirements:
            if not is_satisfied(data):
                raise error.BadInputError(complaint)
        return u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs)
    return wrapper
|
impyute/ts/__init__.py
ADDED
impyute/ts/locf.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from impyute.ops import matrix
|
|
3
|
+
from impyute.ops import wrapper
|
|
4
|
+
from impyute.ops import error
|
|
5
|
+
|
|
6
|
+
@wrapper.wrappers
@wrapper.checks
def locf(data, axis=0):
    """ Last Observation Carried Forward

    For each set of missing indices, use the value of one row before(same
    column). In the case that the missing value is the first row, look one
    row ahead instead. If this next row is also NaN, look to the next row.
    Repeat until you find a row in this column that's not NaN. All the rows
    before will be filled with this value.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    axis: {0, 1} (optional)
        0 if time series is in row format (Ex. data[0][:] is 1st data point).
        1 if time series is in col format (Ex. data[:][0] is 1st data point).

    Returns
    -------
    numpy.ndarray
        Imputed data.

    Raises
    ------
    error.BadInputError
        If `axis` is neither 0 nor 1.
    Exception
        If a column has a NaN in its first row and no observed value below.

    """
    if axis == 0:
        # NOTE(review): the array is transposed here and never transposed
        # back, so with axis=0 the result has the transposed shape of the
        # input. Left unchanged because callers may depend on it - confirm.
        data = np.transpose(data)
    elif axis != 1:
        raise error.BadInputError("Error: Axis value is invalid, please use either 0 (row format) or 1 (column format)")

    n_rows = np.shape(data)[0]
    # assumes nan_indices yields the positions row by row, so the row above
    # has already been filled when a NaN is visited - TODO confirm
    for x_i, y_i in matrix.nan_indices(data):
        # Simplest scenario, look one row back
        if x_i > 0:
            data[x_i][y_i] = data[x_i-1][y_i]
            continue

        # NaN in the first row: search forward for the first observed value.
        # The search has to include the last row (index n_rows - 1).
        for ahead in range(1, n_rows):
            if not np.isnan(data[ahead][y_i]):
                # back-fill every row above the observation
                data[:ahead, y_i] = data[ahead][y_i]
                break
        else:
            raise Exception("Error: Entire Column is NaN")
    return data
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from impyute.ops import matrix
|
|
3
|
+
from impyute.ops import wrapper
|
|
4
|
+
# pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-branches, broad-except, len-as-condition
|
|
5
|
+
|
|
6
|
+
@wrapper.wrappers
@wrapper.checks
def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
                  inplace=False):
    """ Interpolate the missing values based on nearby values.

    For example, with an array like this:

        array([[-1.24940, -1.38673, -0.03214945,  0.08255145, -0.007415],
               [ 2.14662,  0.32758 , -0.82601414,  1.78124027,  0.873998],
               [-0.41400, -0.977629,         nan, -1.39255344,  1.680435],
               [ 0.40975,  1.067599,  0.29152388, -1.70160145, -0.565226],
               [-0.54592, -1.126187,  2.04004377,  0.16664863, -0.010677]])

    Using a `k` or window size of 3. The one missing value would be set
    to -1.18509122. The window operates on the horizontal axis.

    Usage
    -----

    The parameters default the function to a moving mean. You may want to change
    the default window size:

        moving_window(data, wsize=10)

    To only look at past data (null value is at the rightmost index in the window):

        moving_window(data, nindex=-1)

    To use a custom function:

        moving_window(data, func=np.median)

    You can also do something like take 1.5x the max of previous values in the window:

        moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    nindex: int
        Null index. Index of the null value inside the moving average window.
        Use cases: Say you wanted to make value skewed toward the left or right
        side. 0 would only take the average of values from the right and -1
        would only take the average of values from the left
    wsize: int
        Window size. Size of the moving average window/area of values being used
        for each local imputation. This number includes the missing value.
    errors: {"raise", "coerce", "ignore"}
        Errors will occur with the indexing of the windows - for example if there
        is a nan at data[x][0] and `nindex` is set to -1 or there is a nan at
        data[x][-1] and `nindex` is set to 0. `"raise"` will re-raise an error
        thrown by `func`, `"coerce"` will try again using an nindex set to the
        middle and `"ignore"` is not implemented yet.
    func: callable
        Aggregate applied to the non-NaN values of the window; its result
        replaces the missing value.
    inplace: {True, False}
        Whether to return a copy or run on the passed-in array

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    if errors == "ignore":
        raise Exception("`errors` value `ignore` not implemented yet. Sorry!")

    if not inplace:
        data = data.copy()

    if nindex is None: # If using equal window side lengths
        assert wsize % 2 == 1, "The parameter `wsize` should not be even "\
        "if the value `nindex` is not set since it defaults to the midpoint "\
        "and an even `wsize` makes the midpoint ambiguous"
        wside_left = wsize // 2
        wside_right = wsize // 2
    else: # If using custom window side lengths
        assert nindex < wsize, "The null index must be smaller than the window size"
        if nindex == -1:
            wside_left = wsize - 1
            wside_right = 0
        else:
            wside_left = nindex
            wside_right = wsize - nindex - 1

    # The window slides along a row, so it has to be clamped to the number of
    # columns. Clamping to len(data) (the number of rows) cut windows short
    # on matrices with fewer rows than columns.
    n_cols = np.shape(data)[1]

    def _observed(x_i, y_i, reach_left, reach_right):
        """ Non-NaN values of row `x_i` inside the window around `y_i`"""
        left_i = max(0, y_i - reach_left)
        right_i = min(n_cols, y_i + reach_right + 1)
        window = data[x_i, left_i:right_i]
        return window[~np.isnan(window)]

    # Keep sweeping until a full pass no longer changes the number of NaNs;
    # values filled in one pass can serve as neighbours in the next.
    while True:
        nan_xy = matrix.nan_indices(data)
        n_nan_prev = len(nan_xy)
        for x_i, y_i in nan_xy:
            window_not_null = _observed(x_i, y_i, wside_left, wside_right)

            if len(window_not_null) > 0:
                try:
                    data[x_i][y_i] = func(window_not_null)
                    continue
                except Exception:
                    if errors == "raise":
                        raise

            if errors == "coerce":
                # If either the window has a length of 0 or the aggregate function fails somehow,
                # do a fallback of just trying the best we can by using it as the middle and trying
                # to recalculate. The centred window is used only for this specific
                # problematic value.
                window_not_null = _observed(x_i, y_i, wsize // 2, wsize // 2)
                try:
                    data[x_i][y_i] = func(window_not_null)
                except Exception as e:
                    print("Exception:", e)
        if n_nan_prev == len(matrix.nan_indices(data)):
            break

    return data
|