cpgtools 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cpgtools might be problematic. Click here for more details.
- cpgmodule/_version.py +1 -0
- cpgmodule/utils.py +35 -0
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +1 -2
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
- cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +1 -1
- cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.2.dist-info/LICENSE +19 -0
- cpgtools-2.0.2.dist-info/METADATA +76 -0
- cpgtools-2.0.2.dist-info/RECORD +82 -0
- {cpgtools-2.0.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
- cpgtools-2.0.2.dist-info/top_level.txt +3 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +76 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgtools-2.0.0.dist-info/LICENSE.txt +0 -674
- cpgtools-2.0.0.dist-info/METADATA +0 -28
- cpgtools-2.0.0.dist-info/RECORD +0 -64
- cpgtools-2.0.0.dist-info/top_level.txt +0 -2
impyute/dataset/base.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
""" Shared functions to load/generate data """
|
|
2
|
+
import itertools
|
|
3
|
+
import math
|
|
4
|
+
import random
|
|
5
|
+
import string
|
|
6
|
+
import numpy as np
|
|
7
|
+
from impyute.dataset.corrupt import Corruptor
|
|
8
|
+
from impyute.ops import error
|
|
9
|
+
|
|
10
|
+
def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
    """ Return randomly generated dataset of numbers with uniformly
    distributed values between bound[0] and bound[1]

    Parameters
    ----------
    bound:tuple (start,stop)
        Determines the range of values in the matrix. Index 0 for start
        value and index 1 for stop value. Start is inclusive, stop is
        exclusive.
    shape:tuple(optional)
        Size of the randomly generated data
    missingness: ('mcar', 'mar', 'mnar')
        Name of the `Corruptor` method used to add the missing values
    thr: float between [0,1]
        Fraction of missing data in generated data
    dtype: ('int','float')
        Type of data. Integer draws are still stored as floats so that
        they can hold NaN.

    Returns
    -------
    numpy.ndarray
        Whatever the selected `Corruptor` method returns.

    Raises
    ------
    error.BadInputError
        If `dtype` is neither 'int' nor 'float'.
    """
    if dtype == "int":
        data = np.random.randint(bound[0], bound[1], size=shape).astype(float)
    elif dtype == "float":
        data = np.random.uniform(bound[0], bound[1], size=shape)
    else:
        # Without this branch `data` would be unbound and the call would die
        # with an unrelated UnboundLocalError further down.
        raise error.BadInputError("dtype must be either 'int' or 'float'")
    corruptor = Corruptor(data, thr=thr)
    return getattr(corruptor, missingness)()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"):
    """ Return randomly generated dataset of numbers with normally
    distributed values with given mean and sigma.

    Parameters
    ----------
    theta: tuple (mu, sigma)
        Mean and standard deviation of the normal distribution
    shape:tuple(optional)
        Size of the randomly generated data
    missingness: ('mcar', 'mar', 'mnar')
        Name of the `Corruptor` method used to add the missing values
    thr: float between [0,1]
        Fraction of missing data in generated data
    dtype: ('int','float')
        'int' rounds every draw to the nearest whole number; any other
        value leaves the draws untouched.

    Returns
    -------
    numpy.ndarray
        Whatever the selected `Corruptor` method returns.
    """
    mu, spread = theta
    samples = np.random.normal(mu, spread, size=shape)
    if dtype == "int":
        samples = np.round(samples)
    add_missing = getattr(Corruptor(samples, thr=thr), missingness)
    return add_missing()
|
|
72
|
+
|
|
73
|
+
def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
    """ Return randomly generated dataset with uniformly distributed categorical data (alphabetic character)

    Parameters
    ----------
    nlevels: int
        Specify the number of different categories in the dataset
    shape: tuple(optional)
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Name of the `Corruptor` method used to add the missing values
    thr: float between [0,1]
        Fraction of missing data in generated data

    Returns
    -------
    numpy.ndarray
        Whatever the selected `Corruptor` method returns.

    Raises
    ------
    error.BadInputError
        If `nlevels` is smaller than 1 or larger than the number of cells.
    """
    if nlevels < 1:
        raise error.BadInputError("nlevel must be a positive integer.")
    n_cells = int(np.prod(shape))
    if shape[0]*shape[1] < nlevels or n_cells < nlevels:
        raise error.BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")

    # Single letters first; when nlevels needs more than 26 names, keep
    # adding all 2-letter, then 3-letter, ... strings until the pool suffices.
    cat_pool = list(string.ascii_lowercase)
    width = 2
    while len(cat_pool) < nlevels:
        cat_pool.extend(''.join(w) for w in itertools.product(string.ascii_lowercase, repeat=width))
        width += 1

    cat = random.sample(cat_pool, nlevels)

    # Guarantee every category shows up: place each one once, fill the rest
    # at random, then shuffle. Redrawing the whole matrix until all levels
    # appear practically never terminates when nlevels is close to n_cells.
    filler = np.random.choice(cat, n_cells - nlevels, replace=True)
    flat = np.concatenate([np.array(cat), filler])
    np.random.shuffle(flat)
    data = flat.reshape(shape)

    # `np.str` was only an alias of the builtin and no longer exists in
    # current NumPy releases.
    corruptor = Corruptor(data, thr=thr, dtype=str)
    return getattr(corruptor, missingness)()
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def mnist(missingness="mcar", thr=0.2):
    """ Loads corrupted MNIST

    The dataset is downloaded through scikit-learn, so this needs network
    access (or a populated scikit-learn data cache).

    Parameters
    ----------
    missingness: ('mcar', 'mar', 'mnar')
        Name of the `Corruptor` method used to add the missing values
    thr: float between [0,1]
        Fraction of missing data in generated data

    Returns
    -------
    dict
        "X" holds the corrupted feature matrix, "Y" the labels.
    """
    try:
        from sklearn.datasets import fetch_openml
    except ImportError:
        # Only very old scikit-learn releases lack fetch_openml; they still
        # ship the loader this function used originally.
        from sklearn.datasets import fetch_mldata
        dataset = fetch_mldata('MNIST original')
    else:
        # fetch_mldata has been removed from scikit-learn.
        # NOTE(review): OpenML labels may arrive as strings rather than
        # numbers -- confirm against callers that consume "Y".
        dataset = fetch_openml('mnist_784', version=1)
    # Newer scikit-learn may hand back pandas objects; Corruptor needs an
    # ndarray (it reads `.dtype` and calls `.astype`).
    features = np.asarray(dataset.data)
    labels = np.asarray(dataset.target)
    corruptor = Corruptor(features, thr=thr)
    data = getattr(corruptor, missingness)()
    return {"X": data, "Y": labels}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
""" impyute.dataset.corrupt """
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Corruptor:
    """ Adds missing values to a complete dataset.

    Attributes
    ----------
    data: np.ndarray
        Copy of the input cast to `dtype`; the input itself is not modified.
    dtype: numpy.dtype
        dtype of the array that was passed in (before the cast).
    shape: tuple
        Shape of the input array.
    thr: float (optional)
        The fraction of null values you want in your dataset, a number
        between 0 and 1.

    Methods
    -------
    mcar()
        Overwrite values with MCAR placed NaN's.
    mar()
        Placeholder, not implemented (returns None).
    mnar()
        Placeholder, not implemented (returns None).
    complete()
        Return the data without adding any NaN's.

    """
    # `np.float` was only an alias of the builtin `float` and no longer
    # exists in current NumPy releases; as a default argument it would make
    # the class definition itself fail.
    def __init__(self, data, thr=0.2, dtype=float):
        self.dtype = data.dtype
        self.shape = np.shape(data)
        self.data = data.astype(dtype)
        self.thr = thr

    def mcar(self):
        """ Overwrites values with MCAR placed NaN's

        Picks int(thr * size) distinct positions uniformly at random and
        sets them to NaN in a copy of the data.
        """
        data_1d = self.data.flatten()  # flatten() copies, self.data stays intact
        n_total = len(data_1d)
        nan_x = np.random.choice(n_total,
                                 size=int(self.thr*n_total),
                                 replace=False)
        data_1d[nan_x] = np.nan
        return data_1d.reshape(self.shape)

    def mar(self):
        """ Overwrites values with MAR placed NaN's

        Not implemented: does nothing and returns None.
        """
        pass

    def mnar(self):
        """ Overwrites values with MNAR placed NaN's

        Not implemented: does nothing and returns None.
        """
        pass

    def complete(self):
        """ Do nothing to the data """
        return self.data
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
""" impyute.deletion.complete_case """
|
|
2
|
+
import numpy as np
|
|
3
|
+
from impyute.ops import wrapper
|
|
4
|
+
|
|
5
|
+
@wrapper.wrappers
@wrapper.checks
def complete_case(data):
    """ Drop every row that contains at least one NaN

    Parameters
    ----------
    data: numpy.ndarray
        2D data, possibly with missing values.

    Returns
    -------
    numpy.ndarray
        The rows of `data` that have a value in every column.

    """
    rows_with_nan = np.isnan(data).any(axis=1)
    return data[np.logical_not(rows_with_nan)]
|
impyute/ops/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
""" Unorganized set of utility functions """
|
|
2
|
+
|
|
3
|
+
from . import error
|
|
4
|
+
from . import inverse_distance_weighting
|
|
5
|
+
from . import matrix
|
|
6
|
+
from . import util
|
|
7
|
+
from . import wrapper
|
|
8
|
+
|
|
9
|
+
# Submodules re-exported by `from impyute.ops import *`
__all__ = [
    "error",
    "inverse_distance_weighting",
    "matrix",
    "util",
    "wrapper",
]
|
impyute/ops/error.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
""" Assign weights to distances in a way such that farther values are weighed less """
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
def shepards(distances, power=2):
    """ Basic inverse distance weighting function

    Parameters
    ----------
    distances: list/numpy.ndarray
        1D list of numbers (ex. distance results from call to KDTree.query)

    power: int
        Default of 2 used since the referenced paper stated an exponent of 2 "gives seemingly
        satisfactory results"

    Returns
    -------
    numpy.ndarray
        1D list of numbers that sum to 1, represents weights of provided distances, in order.
        If one or more distances are exactly 0, those entries share the whole
        weight equally and every other entry gets 0.

    References
    ----------

    Shepard, Donald (1968). "A two-dimensional interpolation function for irregularly-spaced data".
    Proceedings of the 1968 ACM National Conference. pp. 517-524. doi:10.1145/800186.810616
    """
    distances = np.asarray(distances, dtype=float)
    exact_match = distances == 0
    if exact_match.any():
        # 1/0**power is inf and inf/inf is NaN, so handle zero distances
        # separately instead of returning NaN weights.
        return to_percentage(exact_match.astype(float))
    return to_percentage(1/np.power(distances, power))


def to_percentage(vec):
    """ Converts list of real numbers into a list of fractions of their sum

    Parameters
    ----------
    vec: list/numpy.ndarray
        Numbers to normalise.

    Returns
    -------
    numpy.ndarray
        `vec` divided by its sum.
    """
    vec = np.asarray(vec, dtype=float)
    return vec/np.sum(vec)
|
impyute/ops/matrix.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
""" Common operations on matrices
|
|
2
|
+
|
|
3
|
+
*Look into whether it's worth writing these in raw c*
|
|
4
|
+
"""
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
def nan_indices(data):
    """ Locate every missing value in the array.

    Parameters
    ----------
    data: numpy.ndarray

    Returns
    -------
    numpy.ndarray
        One row per NaN, each row holding the indices of that NaN
        (for 2D input: (i, j)).
    """
    missing_mask = np.isnan(data)
    return np.argwhere(missing_mask)
|
|
20
|
+
|
|
21
|
+
def map_nd(fn, arr):
    """ Apply a single-value function to every element of an n-dim array

    Parameters
    ----------
    fn: callable
        Function taking one element and returning one value.
    arr: numpy.ndarray

    Returns
    -------
    numpy.ndarray
        Array of the results, same shape as `arr`.

    """
    elementwise = np.vectorize(fn)
    return elementwise(arr)
|
|
34
|
+
|
|
35
|
+
def every_nd(fn, arr):
    """ Tell whether fn holds for every element of arr

    Parameters
    ----------
    fn: callable
        Predicate applied to each element.
    arr: numpy.ndarray

    Returns
    -------
    bool
        True when no element makes `fn` falsy (also for an empty array).

    """
    for element in arr.flatten():
        if not fn(element):
            return False
    return True
|
impyute/ops/testing.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
""" Utilities used for unit tests """
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def return_na_check(data):
    """Test helper: assert that imputed output is a numpy array
    without any NaN left in it.

    Parameters
    ----------
    data: numpy.ndarray
        Output of an imputation function.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If `data` is not an ndarray or still contains NaN.

    """
    assert isinstance(data, np.ndarray)
    has_missing = np.isnan(data).any()
    assert not has_missing
|
impyute/ops/util.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
""" Random utility functions """
|
|
2
|
+
from functools import wraps
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
# Names exported by `from impyute.ops.util import *`
__all__ = [
    "constantly",
    "complement",
    "identity",
    "thread",
    "execute_fn_with_args_and_or_kwargs",
    "toy_df",
    "insert_na",
]
|
|
12
|
+
|
|
13
|
+
def thread(arg, *fns):
    """ Pipe `arg` through `fns` from left to right

    thread(x, f, g) is g(f(x)); with no functions `arg` is returned as is.
    """
    result = arg
    for step in fns:
        result = step(result)
    return result
|
|
18
|
+
|
|
19
|
+
def identity(x):
    """ Return the argument unchanged """
    return x
|
|
21
|
+
|
|
22
|
+
def constantly(x):
    """ Build a function that ignores all of its arguments and returns x """
    def func(*_ignored_args, **_ignored_kwargs):
        return x
    return func
|
|
27
|
+
|
|
28
|
+
def complement(fn):
    """ Build the logical negation of a predicate

    The returned function forwards all arguments to `fn` and returns
    `not` of its result.
    """
    @wraps(fn)
    def negated(*args, **kwargs):
        outcome = fn(*args, **kwargs)
        return not outcome
    return negated
|
|
36
|
+
|
|
37
|
+
def execute_fn_with_args_and_or_kwargs(fn, args, kwargs):
    """ Call fn with args and with those kwargs that fn can accept

    Keyword arguments that `fn` has no parameter for are dropped; the
    remaining ones are passed through. If `fn` takes `**kwargs`, everything
    is passed.

    Parameters
    ----------
    fn: callable
    args: list/tuple
        Positional arguments.
    kwargs: dict
        Keyword arguments, possibly including ones `fn` does not know.

    Returns
    -------
    Whatever `fn` returns.
    """
    import inspect

    try:
        # follow_wrapped=False: look at the callable itself, not at what a
        # functools.wraps-decorated wrapper claims to wrap.
        params = inspect.signature(fn, follow_wrapped=False).parameters
    except (TypeError, ValueError):
        # No introspectable signature (e.g. some builtins): fall back to
        # trying with kwargs first and without them on a TypeError.
        try:
            return fn(*args, **kwargs)
        except TypeError:
            return fn(*args)

    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()):
        return fn(*args, **kwargs)

    by_name = (inspect.Parameter.POSITIONAL_OR_KEYWORD,
               inspect.Parameter.KEYWORD_ONLY)
    # Catching TypeError around the call instead would drop *all* kwargs as
    # soon as one is unknown, hide genuine TypeErrors raised inside fn and
    # run fn a second time.
    accepted = {name: value for name, value in kwargs.items()
                if name in params and params[name].kind in by_name}
    return fn(*args, **accepted)
|
|
43
|
+
|
|
44
|
+
def toy_df(nrow, ncol, n_miss, sample_prefix, seed):
    """
    Generate a toy DataFrame (nrow x ncol) for testing purposes: random
    values between 0 and 1 with up to `n_miss` cells set to NaN.

    Row and column positions of the NaN's are drawn independently with
    replacement, so positions can repeat and fewer than `n_miss` cells may
    end up missing. Columns are named '<sample_prefix>_<i>'. Reseeds
    numpy's global random state with `seed`.
    """
    np.random.seed(seed)
    values = np.random.rand(nrow * ncol).reshape(nrow, ncol)
    rows = np.random.choice(nrow, n_miss)
    cols = np.random.choice(ncol, n_miss)
    values[rows, cols] = np.nan
    labels = ['{}_{}'.format(sample_prefix, idx) for idx in range(ncol)]
    return pd.DataFrame(values, columns=labels)
|
|
58
|
+
|
|
59
|
+
def insert_na(df, n_miss, seed):
    """
    Return a copy of `df` in which `n_miss` additional cells are NaN.

    Cells are picked at random among those that are not NaN yet. If
    `n_miss` is at least the number of cells, every cell becomes NaN; if it
    exceeds the number of non-NaN cells, all of those become NaN. The input
    frame is left untouched. Reseeds numpy's global random state with
    `seed`.
    """
    np.random.seed(seed)
    nrow, ncol = df.shape
    # Private float copy: keeps the caller's frame intact and lets NaN be
    # stored even when the frame holds integers.
    values = df.to_numpy(dtype=float, copy=True)
    if n_miss >= nrow*ncol:
        values[:] = np.nan
    else:
        # Never ask for more NaN's than there are non-NaN cells, otherwise
        # the sampling loop below could not terminate.
        available = int(np.count_nonzero(~np.isnan(values)))
        target = min(n_miss, available)
        na_count = 0
        while na_count < target:
            x_ind = np.random.choice(nrow)
            y_ind = np.random.choice(ncol)
            if not np.isnan(values[x_ind, y_ind]):
                values[x_ind, y_ind] = np.nan
                na_count += 1
    return pd.DataFrame(values, index=df.index, columns=df.columns)
|
impyute/ops/wrapper.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
""" Decorator functions to wrap around entry and exit
|
|
2
|
+
|
|
3
|
+
... to easily apply to a function, functions that check/process inputs
|
|
4
|
+
and outputs
|
|
5
|
+
"""
|
|
6
|
+
from functools import wraps
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from . import error
|
|
11
|
+
from . import matrix
|
|
12
|
+
from . import util as u
|
|
13
|
+
|
|
14
|
+
## Python 2 has no builtin `ModuleNotFoundError`: define a stand-in there,
## leave the builtin alone everywhere else
# pylint: disable=redefined-builtin, missing-docstring, pointless-statement
try:
    ModuleNotFoundError
except NameError:
    class ModuleNotFoundError(Exception):
        pass
# pylint: enable=redefined-builtin, missing-docstring, pointless-statement
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def handle_df(fn):
    """ Decorator that lets an array-based function accept a pandas DataFrame

    The first positional argument is copied unless `inplace` is passed as a
    truthy keyword. If it is a DataFrame it is converted to a numpy array
    before `fn` runs and the result is wrapped in a DataFrame again, reusing
    the index and columns of the input. Any other input is passed straight
    through and the result is returned as is.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        ## work on the original when inplace is requested, on a copy otherwise
        first = args[0] if kwargs.get('inplace') else args[0].copy()
        call_args = [first] + list(args[1:])

        ## remember the labels of a DataFrame, then hand over the raw array
        labels = None
        if isinstance(first, pd.DataFrame):
            labels = (first.index, first.columns)
            call_args[0] = first.to_numpy()

        results = u.execute_fn_with_args_and_or_kwargs(fn, call_args, kwargs)

        if labels is None:
            return results
        ## DataFrame in, DataFrame out
        row_labels, col_labels = labels
        return pd.DataFrame(results, index=row_labels, columns=col_labels)
    return wrapper
|
|
60
|
+
|
|
61
|
+
def add_inplace_option(fn):
    """ Decorator adding an `inplace` keyword option

    Unless `inplace` is passed as a truthy keyword, the first positional
    argument is replaced by a copy of itself before `fn` is called, so `fn`
    never touches the caller's object. """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        """ Copy the data if needed, then call fn """
        target = args[0]
        if not kwargs.get('inplace'):
            target = target.copy()
        call_args = [target] + list(args[1:])
        return u.execute_fn_with_args_and_or_kwargs(fn, call_args, kwargs)
    return wrapper
|
|
80
|
+
|
|
81
|
+
def conform_output(fn):
    """ Decorator to handle impossible values in the output of `fn`

    Adds two optional kwargs, `valid_fn` and `coerce_fn`.

    `valid_fn` function stub

        def my_valid_fn(some_literal) -> boolean

    `coerce_fn` function stub

        def my_coerce_fn(arr, x_i, y_i) -> some_literal

    The valid function is run on each element of the result and tells
    whether that value is valid or not. By default every value is valid.

    The coerce function has three arguments, the result matrix and the two
    indices x_i and y_i of an invalid value. It is run on all invalid values
    and its return value replaces them. By default it raises
    `error.BadOutputError`.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        def raise_error(arr, x_i, y_i):
            raise error.BadOutputError("{} does not conform".format(arr[x_i, y_i]))

        # predicate telling valid values apart, and the repair function
        is_valid = kwargs.get("valid_fn", u.constantly(True))
        repair = kwargs.get("coerce_fn", raise_error)

        results = u.execute_fn_with_args_and_or_kwargs(fn, list(args), kwargs)

        # boolean matrix, True wherever the value is NOT valid
        invalid_mask = matrix.map_nd(u.complement(is_valid), results)
        # replace each invalid entry by whatever the repair function returns
        for row, col in np.argwhere(invalid_mask):
            results[row, col] = repair(results, row, col)

        return results
    return wrapper
|
|
126
|
+
|
|
127
|
+
def wrappers(fn):
    """ Helper decorator applying all wrappers that modify the input (matrix
    with missing values) and output (matrix with imputed values)

    NOTE: `handle_df` has to be the outermost wrapper (first entry point)
    since every other wrapper assumes it receives an np.array as input
    """
    with_inplace = add_inplace_option(fn)       # allow choosing reference/copy
    with_conform = conform_output(with_inplace)  # allow enforcing a spec on outputs
    return handle_df(with_conform)               # df -> np.array on in, df on out
|
|
141
|
+
|
|
142
|
+
def _shape_2d(data):
|
|
143
|
+
""" True if array is 2D"""
|
|
144
|
+
return len(np.shape(data)) == 2
|
|
145
|
+
|
|
146
|
+
def _shape_3d(data):
|
|
147
|
+
""" True if array is 3D"""
|
|
148
|
+
return len(np.shape(data)) == 3
|
|
149
|
+
|
|
150
|
+
def _is_ndarray(data):
|
|
151
|
+
""" True if the array is an instance of numpy's ndarray"""
|
|
152
|
+
return isinstance(data, np.ndarray)
|
|
153
|
+
|
|
154
|
+
def _dtype_float(data):
|
|
155
|
+
""" True if the values in the array are floating point"""
|
|
156
|
+
return data.dtype == float
|
|
157
|
+
|
|
158
|
+
def _nan_exists(data):
    """ True if the array holds at least one np.nan"""
    return len(matrix.nan_indices(data)) > 0
|
|
162
|
+
|
|
163
|
+
def checks(fn):
    """ Decorator validating the first positional argument before fn runs

    The data has to be a 2D float np.ndarray containing at least one NaN,
    otherwise `error.BadInputError` is raised and `fn` is not called.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        data = args[0]
        # A second 2D test used to follow this one; it could never fire
        # because this check already covers it.
        if not _shape_2d(data):
            raise error.BadInputError("No support for arrays that aren't 2D yet.")
        if not _is_ndarray(data):
            raise error.BadInputError("Not a np.ndarray.")
        if not _dtype_float(data):
            raise error.BadInputError("Data is not float.")
        if not _nan_exists(data):
            raise error.BadInputError("No NaN's in given data")
        return u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs)
    return wrapper
|
impyute/ts/__init__.py
ADDED
impyute/ts/locf.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from impyute.ops import matrix
|
|
3
|
+
from impyute.ops import wrapper
|
|
4
|
+
from impyute.ops import error
|
|
5
|
+
|
|
6
|
+
@wrapper.wrappers
@wrapper.checks
def locf(data, axis=0):
    """ Last Observation Carried Forward

    For each set of missing indices, use the value of one row before(same
    column). In the case that the missing value is the first row, look one
    row ahead instead. If this next row is also NaN, look to the next row.
    Repeat until you find a row in this column that's not NaN. All the rows
    before will be filled with this value.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    axis: boolean (optional)
        0 if time series is in row format (Ex. data[0][:] is 1st data point).
        1 if time series is in col format (Ex. data[:][0] is 1st data point).
        With 0 the array is transposed before it is filled.

    Returns
    -------
    numpy.ndarray
        Imputed data. For axis=0 this is the transposed array.

    Raises
    ------
    error.BadInputError
        If `axis` is neither 0 nor 1.
    Exception
        If a column (after the optional transpose) holds nothing but NaN.

    """
    if axis == 0:
        # NOTE(review): the result is returned in this transposed
        # orientation and never flipped back -- confirm that callers
        # expect this.
        data = np.transpose(data)
    elif axis == 1:
        pass
    else:
        raise error.BadInputError("Error: Axis value is invalid, please use either 0 (row format) or 1 (column format)")

    n_rows = np.shape(data)[0]
    # nan_indices lists positions row by row, so by the time a NaN in row
    # x_i is handled, row x_i-1 of that column has already been filled.
    for x_i, y_i in matrix.nan_indices(data):
        # Simplest scenario, look one row back
        if x_i-1 > -1:
            data[x_i][y_i] = data[x_i-1][y_i]
            continue

        # First row: search forward for the first observed value. The scan
        # has to include the very last row; stopping one row early would
        # wrongly report the column as empty.
        first_valid = None
        for x_next in range(x_i+1, n_rows):
            if not np.isnan(data[x_next][y_i]):
                first_valid = x_next
                break
        if first_valid is None:
            raise Exception("Error: Entire Column is NaN")
        # Back-fill every leading NaN with that value
        for x_nan in range(x_i, first_valid):
            data[x_nan][y_i] = data[first_valid][y_i]
    return data
|