cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
impyute/cs/central_tendency.py
ADDED
@@ -0,0 +1,84 @@
+import numpy as np
+from impyute.ops import matrix
+from impyute.ops import wrapper
+
+@wrapper.wrappers
+@wrapper.checks
+def mean(data):
+    """ Substitute missing values with the mean of that column.
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    for x_i, y_i in nan_xy:
+        row_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+        new_value = np.mean(row_wo_nan)
+        data[x_i][y_i] = new_value
+    return data
+
+@wrapper.wrappers
+@wrapper.checks
+def median(data):
+    """ Substitute missing values with the median of that column (the middle value).
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    cols_missing = set(nan_xy.T[1])
+    medians = {}
+    for y_i in cols_missing:
+        cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+        median_y = np.median(cols_wo_nan)
+        medians[str(y_i)] = median_y
+    for x_i, y_i in nan_xy:
+        data[x_i][y_i] = medians[str(y_i)]
+    return data
+
+@wrapper.wrappers
+@wrapper.checks
+def mode(data):
+    """ Substitute missing values with the mode of that column (the most frequent value).
+
+    In the case of a tie (multiple equally frequent values in a column),
+    randomly pick one of them.
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    modes = []
+    for y_i in range(np.shape(data)[1]):
+        unique_counts = np.unique(data[:, [y_i]], return_counts=True)
+        max_count = np.max(unique_counts[1])
+        mode_y = [unique for unique, count in np.transpose(unique_counts)
+                  if count == max_count and not np.isnan(unique)]
+        modes.append(mode_y)  # One list of modal values per column
+    for x_i, y_i in nan_xy:
+        data[x_i][y_i] = np.random.choice(modes[y_i])
+    return data
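
All three imputers follow the same pattern: locate the NaN coordinates with `matrix.nan_indices`, compute a per-column statistic over the observed entries, and write it back in place. A minimal usage sketch, assuming the vendored `impyute` package is importable and that `impyute.cs` re-exports these functions (as `fast_knn`'s `from . import mean` below suggests):

```python
import numpy as np
from impyute.cs import mean, median, mode  # assumed re-exports

# Toy matrix with two missing cells; a float dtype is needed to hold np.nan.
data = np.array([[1.0, 2.0, np.nan],
                 [4.0, np.nan, 6.0],
                 [7.0, 8.0, 9.0]])

# Each call fills NaNs column-wise and returns the array. Pass a copy if
# you want to keep the original, since cells are assigned in place.
print(mean(data.copy()))    # NaNs become the column means (5.0 and 7.5)
print(median(data.copy()))  # NaNs become the column medians
print(mode(data.copy()))    # NaNs become a most-frequent value per column
```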
impyute/cs/em.py
ADDED
@@ -0,0 +1,52 @@
+import numpy as np
+from impyute.ops import matrix
+from impyute.ops import wrapper
+
+@wrapper.wrappers
+@wrapper.checks
+def em(data, eps=0.1):
+    """ Imputes given data using expectation maximization.
+
+    E-step: Calculates the expected complete data log likelihood ratio.
+    M-step: Finds the parameters that maximize the log likelihood of the
+    complete data.
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+    eps: float
+        Minimum relative change between iterations required to converge;
+        relative change = abs(current - previous) / previous
+    inplace: boolean
+        If True, operate on the numpy array reference
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    for x_i, y_i in nan_xy:
+        col = data[:, int(y_i)]
+        mu = col[~np.isnan(col)].mean()
+        std = col[~np.isnan(col)].std()
+        col[x_i] = np.random.normal(loc=mu, scale=std)
+        previous, i = 1, 1
+        while True:
+            i += 1
+            # Expectation
+            mu = col[~np.isnan(col)].mean()
+            std = col[~np.isnan(col)].std()
+            # Maximization
+            col[x_i] = np.random.normal(loc=mu, scale=std)
+            # Stop once the imputed value changes by less than eps (relative)
+            # and the loop has run at least 5 times
+            delta = np.abs(col[x_i]-previous)/previous
+            if i > 5 and delta < eps:
+                data[x_i][y_i] = col[x_i]
+                break
+            data[x_i][y_i] = col[x_i]
+            previous = col[x_i]
+    return data
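
Despite the name, this `em` routine is a univariate scheme: for each missing cell it repeatedly re-estimates the column's mean and standard deviation and redraws the cell from a normal distribution until the draw stabilizes. A quick sketch of the behaviour, assuming the module layout above:

```python
import numpy as np
from impyute.cs.em import em

rng = np.random.default_rng(0)
data = rng.normal(loc=10.0, scale=2.0, size=(100, 3))
data[5, 1] = np.nan                # knock out a single cell

filled = em(data, eps=0.1)         # eps bounds the relative change at convergence
print(filled[5, 1])                # a draw near the column mean of ~10
```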
impyute/cs/fast_knn.py
ADDED
@@ -0,0 +1,130 @@
+import numpy as np
+import pandas as pd
+from scipy.spatial import KDTree
+from impyute.ops import matrix
+from impyute.ops import wrapper
+from impyute.ops import inverse_distance_weighting as idw
+
+from . import mean
+# pylint: disable=too-many-arguments
+
+@wrapper.wrappers
+@wrapper.checks
+def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
+             idw_fn=idw.shepards, init_impute_fn=mean):
+    """ Impute using a variant of the nearest neighbours approach.
+
+    Basic idea: Impute the array with a passed-in initial impute fn (mean impute)
+    and then use the resulting complete array to construct a KDTree. Use this
+    KDTree to compute nearest neighbours. After finding the `k` nearest
+    neighbours, take their weighted average. Basically, find the nearest
+    rows in terms of distance.
+
+    This approach is much, much faster than the other implementation (fit+transform
+    for each subset), which is almost prohibitively expensive.
+
+    Parameters
+    ----------
+    data: ndarray
+        2D matrix to impute.
+
+    k: int, optional
+        Parameter used for querying the KDTree class object. Number of
+        neighbours used in the KNN query. Refer to the docs for
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+    eps: nonnegative float, optional
+        Parameter used for querying the KDTree class object. From the
+        SciPy docs: "Return approximate nearest neighbors; the kth returned
+        value is guaranteed to be no further than (1+eps) times the distance to
+        the real kth nearest neighbor". Refer to the docs for
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+    p: float, 1<=p<=infinity, optional
+        Parameter used for querying the KDTree class object. Straight from the
+        SciPy docs: "Which Minkowski p-norm to use. 1 is the
+        sum-of-absolute-values Manhattan distance, 2 is the usual Euclidean
+        distance, infinity is the maximum-coordinate-difference distance". Refer to
+        the docs for
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+    distance_upper_bound: nonnegative float, optional
+        Parameter used for querying the KDTree class object. Straight
+        from the SciPy docs: "Return only neighbors within this distance. This
+        is used to prune tree searches, so if you are doing a series of
+        nearest-neighbor queries, it may help to supply the distance to the
+        nearest neighbor of the most recent point." Refer to the docs for
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+    leafsize: int, optional
+        Parameter used for construction of the `KDTree` class object. Straight from
+        the SciPy docs: "The number of points at which the algorithm switches
+        over to brute-force. Has to be positive". Refer to the docs for
+        [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html)
+        for more information.
+
+    idw_fn: fn, optional
+        Function that takes one argument, a list of distances, and returns weighted percentages. You can define a custom
+        one or bootstrap from functions defined in `impy.util.inverse_distance_weighting`, which can be adapted using
+        functools.partial, for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`
+
+    init_impute_fn: fn, optional
+        Function used to fill in missing values before the KDTree is built (mean imputation by default).
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    Examples
+    --------
+
+    >>> data = np.arange(25).reshape((5, 5)).astype(float)
+    >>> data[0][2] = np.nan
+    >>> data
+    array([[ 0.,  1., nan,  3.,  4.],
+           [ 5.,  6.,  7.,  8.,  9.],
+           [10., 11., 12., 13., 14.],
+           [15., 16., 17., 18., 19.],
+           [20., 21., 22., 23., 24.]])
+    >> fast_knn(data, k=1) # Weighted average (by distance) of nearest 1 neighbour
+    array([[ 0.,  1.,  7.,  3.,  4.],
+           [ 5.,  6.,  7.,  8.,  9.],
+           [10., 11., 12., 13., 14.],
+           [15., 16., 17., 18., 19.],
+           [20., 21., 22., 23., 24.]])
+    >> fast_knn(data, k=2) # Weighted average of nearest 2 neighbours
+    array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
+           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+           [10.        , 11.        , 12.        , 13.        , 14.        ],
+           [15.        , 16.        , 17.        , 18.        , 19.        ],
+           [20.        , 21.        , 22.        , 23.        , 24.        ]])
+    >> fast_knn(data, k=3)
+    array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
+           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+           [10.        , 11.        , 12.        , 13.        , 14.        ],
+           [15.        , 16.        , 17.        , 18.        , 19.        ],
+           [20.        , 21.        , 22.        , 23.        , 24.        ]])
+    >> fast_knn(data, k=5) # There are at most only 4 neighbours. Raises error
+    ...
+    IndexError: index 5 is out of bounds for axis 0 with size 5
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    data_c = init_impute_fn(data)
+    kdtree = KDTree(data_c, leafsize=leafsize)
+
+    for x_i, y_i in nan_xy:
+        distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps,
+                                          p=p, distance_upper_bound=distance_upper_bound)
+        # The query always returns the point itself in the first index. Delete it.
+        distances, indices = distances[1:], indices[1:]
+        # Add small constant to distances to avoid division by 0
+        distances += 1e-3
+        weights = idw_fn(distances)
+        # Assign missing value the weighted average of `k` nearest neighbours
+        data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
+    return data
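
The docstring's examples cover the defaults; the `idw_fn` hook it mentions can also be swapped out with `functools.partial`, e.g. to flatten Shepard's weights from the default power of 2 down to 1. A hedged sketch, assuming the module layout above:

```python
import functools
import numpy as np
from impyute.cs.fast_knn import fast_knn
from impyute.ops import inverse_distance_weighting as idw

data = np.arange(25, dtype=float).reshape(5, 5)
data[0, 2] = np.nan

# Shepard's weighting with power=1 gives nearer neighbours less of an edge
# than the default power=2.
flat_idw = functools.partial(idw.shepards, power=1)
print(fast_knn(data, k=3, idw_fn=flat_idw))
```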
impyute/cs/random.py
ADDED
@@ -0,0 +1,27 @@
+import numpy as np
+from impyute.ops import matrix
+from impyute.ops import wrapper
+
+@wrapper.wrappers
+@wrapper.checks
+def random_impute(data):
+    """ Fill missing values with a randomly selected value from the same
+    column.
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    for x, y in nan_xy:
+        uniques = np.unique(data[:, y])
+        uniques = uniques[~np.isnan(uniques)]
+        data[x][y] = np.random.choice(uniques)
+    return data
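
One detail worth noting: because the candidate pool comes from `np.unique`, each distinct observed value in a column is equally likely to be drawn, regardless of how often it occurs. A short sketch:

```python
import numpy as np
from impyute.cs.random import random_impute

data = np.array([[1.0, np.nan],
                 [2.0, 5.0],
                 [np.nan, 5.0],
                 [2.0, 7.0]])
# The NaN in column 1 is filled with 5.0 or 7.0 with equal probability,
# even though 5.0 appears twice in the observed data.
print(random_impute(data))
```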
impyute/dataset/base.py
ADDED
@@ -0,0 +1,137 @@
+""" Shared functions to load/generate data """
+import itertools
+import math
+import random
+import string
+import numpy as np
+from impyute.dataset.corrupt import Corruptor
+from impyute.ops import error
+
+def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
+    """ Return a randomly generated dataset of numbers with uniformly
+    distributed values between bound[0] and bound[1]
+
+    Parameters
+    ----------
+    bound: tuple (start, stop)
+        Determines the range of values in the matrix. Index 0 for start
+        value and index 1 for stop value. Start is inclusive, stop is
+        exclusive.
+    shape: tuple (optional)
+        Size of the randomly generated data
+    missingness: ('mcar', 'mar', 'mnar')
+        Type of missingness you want in your dataset
+    thr: float between [0,1]
+        Percentage of missing data in generated data
+    dtype: ('int', 'float')
+        Type of data
+
+    Returns
+    -------
+    numpy.ndarray
+    """
+    if dtype == "int":
+        data = np.random.randint(bound[0], bound[1], size=shape).astype(float)
+    elif dtype == "float":
+        data = np.random.uniform(bound[0], bound[1], size=shape)
+    corruptor = Corruptor(data, thr=thr)
+    raw_data = getattr(corruptor, missingness)()
+    return raw_data
+
+
+def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"):
+    """ Return a randomly generated dataset of numbers with normally
+    distributed values with the given mu and sigma.
+
+    Parameters
+    ----------
+    theta: tuple (mu, sigma)
+        Determines the range of values in the matrix
+    shape: tuple (optional)
+        Size of the randomly generated data
+    missingness: ('mcar', 'mar', 'mnar')
+        Type of missingness you want in your dataset
+    thr: float between [0,1]
+        Percentage of missing data in generated data
+    dtype: ('int', 'float')
+        Type of data
+
+    Returns
+    -------
+    numpy.ndarray
+    """
+    mean, sigma = theta
+    data = np.random.normal(mean, sigma, size=shape)
+    if dtype == "int":
+        data = np.round(data)
+    elif dtype == "float":
+        pass
+    corruptor = Corruptor(data, thr=thr)
+    raw_data = getattr(corruptor, missingness)()
+    return raw_data
+
+def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
+    """ Return a randomly generated dataset with uniformly distributed categorical data (alphabetic characters)
+
+    Parameters
+    ----------
+    nlevels: int
+        Specify the number of different categories in the dataset
+    shape: tuple (optional)
+        Size of the randomly generated data
+    missingness: string in ('mcar', 'mar', 'mnar')
+        Type of missingness you want in your dataset
+    thr: float between [0,1]
+        Percentage of missing data in generated data
+
+    Returns
+    -------
+    numpy.ndarray
+    """
+    if shape[0]*shape[1] < nlevels:
+        raise error.BadInputError("nlevels exceeds the size of the desired dataset. Please decrease nlevels or increase the shape")
+
+    length = len(string.ascii_lowercase)
+    n_fold = int(math.floor(math.log(nlevels, length)))
+    cat_pool = list(string.ascii_lowercase)
+
+    # when nlevels > 26 the single alphabetic characters are used up; generate extra strings as categorical data
+    if n_fold > 0:
+        for i in range(2, n_fold+2):
+            pool_candidate = list(itertools.product(string.ascii_lowercase, repeat=i))
+            cat_pool.extend([''.join(w) for w in pool_candidate])
+            if len(cat_pool) > nlevels:
+                break
+
+    cat = random.sample(cat_pool, nlevels)
+    data = np.random.choice(cat, shape, replace=True)
+
+    # make sure the generated data contains all nlevels categories
+    while len(np.unique(data)) != nlevels:
+        data = np.random.choice(cat, shape, replace=True)
+
+    corruptor = Corruptor(data, thr=thr, dtype=str)
+    raw_data = getattr(corruptor, missingness)()
+    return raw_data
+
+
+def mnist(missingness="mcar", thr=0.2):
+    """ Loads corrupted MNIST
+
+    Parameters
+    ----------
+    missingness: ('mcar', 'mar', 'mnar')
+        Type of missingness you want in your dataset
+    thr: float between [0,1]
+        Percentage of missing data in generated data
+
+    Returns
+    -------
+    dict
+        {"X": corrupted pixel data, "Y": labels}
+    """
+    # fetch_mldata was removed in scikit-learn 0.22; fetch_openml is its replacement
+    from sklearn.datasets import fetch_openml
+    dataset = fetch_openml('mnist_784', as_frame=False)
+    corruptor = Corruptor(dataset.data, thr=thr)
+    data = getattr(corruptor, missingness)()
+    return {"X": data, "Y": dataset.target}
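
The three generators share the pattern of building a complete matrix and handing it to `Corruptor` for hole-punching. A minimal sketch of each (the argument values here are illustrative):

```python
from impyute.dataset.base import randu, randn, randc

# 5x5 uniform integers in [0, 10), cast to float so ~20% of cells can become NaN.
ints = randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2)

# Standard-normal floats with the same corruption settings.
floats = randn(theta=(0, 1), shape=(5, 5))

# Four single-letter categories scattered over a 5x5 grid; with more than
# 26 levels the pool spills over into 'aa', 'ab', and so on.
cats = randc(nlevels=4, shape=(5, 5))
```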
impyute/dataset/corrupt.py
ADDED
@@ -0,0 +1,55 @@
+""" impyute.dataset.corrupt """
+import numpy as np
+
+
+class Corruptor:
+    """ Adds missing values to a complete dataset.
+
+    Attributes
+    ----------
+    data: np.ndarray
+        Matrix of values with no NaN's that you want to add NaN's to.
+    thr: float (optional)
+        The percentage of null values you want in your dataset, a number
+        between 0 and 1.
+
+    Methods
+    -------
+    mcar()
+        Overwrite values with MCAR placed NaN's.
+    mar()
+        Overwrite values with MAR placed NaN's.
+    mnar()
+        Overwrite values with MNAR placed NaN's.
+
+    """
+    def __init__(self, data, thr=0.2, dtype=float):
+        self.dtype = data.dtype
+        self.shape = np.shape(data)
+        self.data = data.astype(dtype)
+        self.thr = thr
+
+    def mcar(self):
+        """ Overwrites values with MCAR placed NaN's """
+        data_1d = self.data.flatten()
+        n_total = len(data_1d)
+        nan_x = np.random.choice(range(n_total),
+                                 size=int(self.thr*n_total),
+                                 replace=False)
+        for x_i in nan_x:
+            data_1d[x_i] = np.nan
+        output = data_1d.reshape(self.shape)
+        return output
+
+    def mar(self):
+        """ Overwrites values with MAR placed NaN's """
+        pass
+
+    def mnar(self):
+        """ Overwrites values with MNAR placed NaN's """
+        pass
+
+    def complete(self):
+        """ Do nothing to the data """
+        output = self.data
+        return output
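
Only `mcar` (and the no-op `complete`) actually do anything; `mar` and `mnar` are unimplemented stubs that return None. A usage sketch for the MCAR path:

```python
import numpy as np
from impyute.dataset.corrupt import Corruptor

complete = np.arange(20, dtype=float).reshape(4, 5)
corruptor = Corruptor(complete, thr=0.25)

corrupted = corruptor.mcar()         # exactly int(0.25 * 20) = 5 cells become NaN
print(np.isnan(corrupted).mean())    # 0.25
```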
impyute/deletion/complete_case.py
ADDED
@@ -0,0 +1,21 @@
+""" impyute.deletion.complete_case """
+import numpy as np
+from impyute.ops import wrapper
+
+@wrapper.wrappers
+@wrapper.checks
+def complete_case(data):
+    """ Return only the rows that have no missing values
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to filter.
+
+    Returns
+    -------
+    numpy.ndarray
+        Data with incomplete rows removed.
+
+    """
+    return data[~np.isnan(data).any(axis=1)]
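
Complete-case analysis is the one strategy here that shrinks the array instead of filling it: any row containing a NaN is dropped. For example:

```python
import numpy as np
from impyute.deletion.complete_case import complete_case

data = np.array([[1.0, 2.0],
                 [np.nan, 3.0],
                 [4.0, 5.0]])
print(complete_case(data))   # rows 0 and 2 survive; row 1 is dropped
```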
impyute/ops/__init__.py
ADDED
@@ -0,0 +1,12 @@
+""" Unorganized set of utility functions """
+
+from . import error
+from . import inverse_distance_weighting
+from . import matrix
+from . import util
+from . import wrapper
+
+__all__ = [
+    "error", "inverse_distance_weighting", "matrix",
+    "util", "wrapper"
+]
impyute/ops/inverse_distance_weighting.py
ADDED
@@ -0,0 +1,31 @@
+""" Assign weights to distances in a way such that farther values are weighed less """
+import numpy as np
+
+def shepards(distances, power=2):
+    """ Basic inverse distance weighting function
+
+    Parameters
+    ----------
+    distances: list/numpy.ndarray
+        1D list of numbers (ex. distance results from call to KDTree.query)
+
+    power: int
+        Default of 2 used since the referenced paper stated an exponent of 2 "gives seemingly
+        satisfactory results"
+
+    Returns
+    -------
+    numpy.ndarray
+        1D list of numbers that sum to 1, represents weights of provided distances, in order.
+
+    References
+    ----------
+
+    Shepard, Donald (1968). "A two-dimensional interpolation function for irregularly-spaced data".
+    Proceedings of the 1968 ACM National Conference. pp. 517-524. doi:10.1145/800186.810616
+    """
+    return to_percentage(1/np.power(distances, power))
+
+def to_percentage(vec):
+    """ Converts a list of real numbers into a list of percentages """
+    return vec/np.sum(vec)
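
Concretely, `shepards` inverts each distance raised to `power` and normalizes so the weights sum to 1, so the nearest neighbour dominates. A small worked example:

```python
import numpy as np
from impyute.ops.inverse_distance_weighting import shepards

distances = np.array([1.0, 2.0, 4.0])
weights = shepards(distances)   # 1/d**2 -> [1, 0.25, 0.0625], then normalized
print(weights)                  # approx. [0.762, 0.190, 0.048]
print(weights.sum())            # 1.0
```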
impyute/ops/matrix.py
ADDED
@@ -0,0 +1,47 @@
+""" Common operations on matrices
+
+*Look into whether it's worth writing these in raw c*
+"""
+import numpy as np
+
+def nan_indices(data):
+    """ Finds the indices of all missing values.
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+
+    Returns
+    -------
+    numpy.ndarray
+        Two-column array holding the (row, column) index of each missing value.
+    """
+    return np.argwhere(np.isnan(data))
+
+def map_nd(fn, arr):
+    """ Map fn that takes a value over entire n-dim array
+
+    Parameters
+    ----------
+    arr: numpy.ndarray
+
+    Returns
+    -------
+    numpy.ndarray
+
+    """
+    return np.vectorize(fn)(arr)
+
+def every_nd(fn, arr):
+    """ Returns bool, true if fn is true for all elements of arr
+
+    Parameters
+    ----------
+    arr: numpy.ndarray
+
+    Returns
+    -------
+    bool
+
+    """
+    return all(map(fn, arr.flatten()))
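
A quick tour of the three helpers, which the imputers above lean on (`nan_indices` in particular):

```python
import numpy as np
from impyute.ops import matrix

data = np.array([[1.0, np.nan],
                 [np.nan, 4.0]])
print(matrix.nan_indices(data))                          # [[0 1]
                                                         #  [1 0]]
print(matrix.map_nd(lambda v: v * 2, np.ones((2, 2))))   # all cells become 2.0
print(matrix.every_nd(lambda v: v > 0, np.ones((2, 2)))) # True
```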
impyute/ops/testing.py
ADDED
@@ -0,0 +1,20 @@
+""" Utilities used for unit tests """
+import numpy as np
+
+
+def return_na_check(data):
+    """Helper function for tests to check that the returned data is a
+    numpy array and that the imputed data has no NaN's.
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Imputed data to check.
+
+    Returns
+    -------
+    None
+
+    """
+    assert isinstance(data, np.ndarray)
+    assert not np.isnan(data).any()
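
A sketch of how a test might use this helper, assuming `impyute.cs` re-exports `mean` as noted earlier; the bare asserts make it suitable for pytest:

```python
import numpy as np
from impyute.cs import mean            # assumed re-export
from impyute.ops.testing import return_na_check

def test_mean_fills_all_nans():
    data = np.array([[1.0, np.nan],
                     [3.0, 4.0]])
    return_na_check(mean(data))        # raises AssertionError if any NaN remains
```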