cpgtools-2.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
impyute/cs/central_tendency.py ADDED
@@ -0,0 +1,84 @@
+ import numpy as np
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def mean(data):
+     """ Substitute missing values with the mean of that column.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     for x_i, y_i in nan_xy:
+         row_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+         new_value = np.mean(row_wo_nan)
+         data[x_i][y_i] = new_value
+     return data
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def median(data):
+     """ Substitute missing values with the median (middle value) of that column.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     cols_missing = set(nan_xy.T[1])
+     medians = {}
+     for y_i in cols_missing:
+         cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+         median_y = np.median(cols_wo_nan)
+         medians[str(y_i)] = median_y
+     for x_i, y_i in nan_xy:
+         data[x_i][y_i] = medians[str(y_i)]
+     return data
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def mode(data):
+     """ Substitute missing values with the mode (most frequent value) of that column.
+
+     In the case of a tie (multiple most-frequent values) for a column,
+     randomly pick one of them.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     modes = []
+     for y_i in range(np.shape(data)[1]):
+         unique_counts = np.unique(data[:, [y_i]], return_counts=True)
+         max_count = np.max(unique_counts[1])
+         mode_y = [unique for unique, count in np.transpose(unique_counts)
+                   if count == max_count and not np.isnan(unique)]
+         modes.append(mode_y)  # Stores each column's modes, indexed by column
+     for x_i, y_i in nan_xy:
+         data[x_i][y_i] = np.random.choice(modes[y_i])
+     return data
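
A minimal usage sketch of these column imputers (the import path is inferred from the file list above; the array and resulting values are illustrative):

import numpy as np
from impyute.cs.central_tendency import mean

data = np.array([[1.0, 2.0],
                 [np.nan, 4.0],
                 [5.0, np.nan]])
filled = mean(data.copy())  # pass a copy: the function assigns into the array it receives
# column 0: mean of {1, 5} -> 3.0; column 1: mean of {2, 4} -> 3.0
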
impyute/cs/em.py ADDED
@@ -0,0 +1,52 @@
+ import numpy as np
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def em(data, eps=0.1):
+     """ Imputes given data using expectation maximization.
+
+     E-step: Calculates the expected complete data log likelihood ratio.
+     M-step: Finds the parameters that maximize the log likelihood of the
+     complete data.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+     eps: float
+         Convergence threshold: iteration stops once the relative change,
+         abs(current - previous) / previous, falls below eps.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     for x_i, y_i in nan_xy:
+         col = data[:, int(y_i)]
+         mu = col[~np.isnan(col)].mean()
+         std = col[~np.isnan(col)].std()
+         col[x_i] = np.random.normal(loc=mu, scale=std)
+         previous, i = 1, 1
+         while True:
+             i += 1
+             # Expectation
+             mu = col[~np.isnan(col)].mean()
+             std = col[~np.isnan(col)].std()
+             # Maximization
+             col[x_i] = np.random.normal(loc=mu, scale=std)
+             # Stop once the imputed value changes by less than eps (relative)
+             # and the loop has run at least 5 times
+             delta = np.abs(col[x_i] - previous) / previous
+             if i > 5 and delta < eps:
+                 data[x_i][y_i] = col[x_i]
+                 break
+             data[x_i][y_i] = col[x_i]
+             previous = col[x_i]
+     return data
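
A short sketch of calling em as defined above (module path inferred from this diff; the data is made up):

import numpy as np
from impyute.cs.em import em

data = np.random.normal(loc=5.0, scale=2.0, size=(200, 3))
data[::10, 0] = np.nan           # 10% of column 0 missing
filled = em(data, eps=0.01)      # smaller eps -> more draw/re-estimate rounds per cell
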
impyute/cs/fast_knn.py ADDED
@@ -0,0 +1,130 @@
+ import numpy as np
+ import pandas as pd
+ from scipy.spatial import KDTree
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+ from impyute.ops import inverse_distance_weighting as idw
+
+ from . import mean
+ # pylint: disable=too-many-arguments
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
+              idw_fn=idw.shepards, init_impute_fn=mean):
+     """ Impute using a variant of the nearest neighbours approach.
+
+     Basic idea: impute the array with a passed-in initial impute function
+     (mean impute by default) and use the resulting complete array to
+     construct a KDTree. Use this KDTree to compute nearest neighbours.
+     After finding the `k` nearest neighbours (the nearest rows in terms of
+     distance), take their weighted average.
+
+     This approach is much faster than the alternative (fit + transform
+     for each subset), which is almost prohibitively expensive.
+
+     Parameters
+     ----------
+     data: ndarray
+         2D matrix to impute.
+
+     k: int, optional
+         Parameter used for method querying the KDTree class object. Number of
+         neighbours used in the KNN query. Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     eps: nonnegative float, optional
+         Parameter used for method querying the KDTree class object. From the
+         SciPy docs: "Return approximate nearest neighbors; the kth returned
+         value is guaranteed to be no further than (1+eps) times the distance to
+         the real kth nearest neighbor". Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     p: float, 1<=p<=infinity, optional
+         Parameter used for method querying the KDTree class object. Straight from the
+         SciPy docs: "Which Minkowski p-norm to use. 1 is the
+         sum-of-absolute-values Manhattan distance, 2 is the usual Euclidean
+         distance, infinity is the maximum-coordinate-difference distance". Refer to
+         the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     distance_upper_bound: nonnegative float, optional
+         Parameter used for method querying the KDTree class object. Straight
+         from the SciPy docs: "Return only neighbors within this distance. This
+         is used to prune tree searches, so if you are doing a series of
+         nearest-neighbor queries, it may help to supply the distance to the
+         nearest neighbor of the most recent point." Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     leafsize: int, optional
+         Parameter used for construction of the `KDTree` class object. Straight from
+         the SciPy docs: "The number of points at which the algorithm switches
+         over to brute-force. Has to be positive". Refer to the docs for
+         [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.html)
+         for more information.
+
+     idw_fn: fn, optional
+         Function that takes one argument, a list of distances, and returns
+         weighted percentages. You can define a custom one, or build one from
+         the functions in `impyute.ops.inverse_distance_weighting` using
+         `functools.partial`, for example:
+         `functools.partial(impyute.ops.inverse_distance_weighting.shepards, power=1)`
+
+     init_impute_fn: fn, optional
+         Function used for the initial imputation that makes the array complete
+         before the KDTree is built; defaults to mean imputation.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     Examples
+     --------
+
+     >>> data = np.arange(25).reshape((5, 5)).astype(float)
+     >>> data[0][2] = np.nan
+     >>> data
+     array([[ 0.,  1., nan,  3.,  4.],
+            [ 5.,  6.,  7.,  8.,  9.],
+            [10., 11., 12., 13., 14.],
+            [15., 16., 17., 18., 19.],
+            [20., 21., 22., 23., 24.]])
+     >> fast_knn(data, k=1)  # Weighted average (by distance) of nearest 1 neighbour
+     array([[ 0.,  1.,  7.,  3.,  4.],
+            [ 5.,  6.,  7.,  8.,  9.],
+            [10., 11., 12., 13., 14.],
+            [15., 16., 17., 18., 19.],
+            [20., 21., 22., 23., 24.]])
+     >> fast_knn(data, k=2)  # Weighted average of nearest 2 neighbours
+     array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
+            [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+            [10.        , 11.        , 12.        , 13.        , 14.        ],
+            [15.        , 16.        , 17.        , 18.        , 19.        ],
+            [20.        , 21.        , 22.        , 23.        , 24.        ]])
+     >> fast_knn(data, k=3)
+     array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
+            [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+            [10.        , 11.        , 12.        , 13.        , 14.        ],
+            [15.        , 16.        , 17.        , 18.        , 19.        ],
+            [20.        , 21.        , 22.        , 23.        , 24.        ]])
+     >> fast_knn(data, k=5)  # There are at most only 4 neighbours. Raises error
+     ...
+     IndexError: index 5 is out of bounds for axis 0 with size 5
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     data_c = init_impute_fn(data)
+     kdtree = KDTree(data_c, leafsize=leafsize)
+
+     for x_i, y_i in nan_xy:
+         distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps,
+                                           p=p, distance_upper_bound=distance_upper_bound)
+         # The query always returns the point itself first. Delete it.
+         distances, indices = distances[1:], indices[1:]
+         # Add a small constant to distances to avoid division by 0
+         distances += 1e-3
+         weights = idw_fn(distances)
+         # Assign the missing value the weighted average of its `k` nearest neighbours
+         data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
+     return data
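
The idw_fn hook described in the docstring can be exercised with functools.partial; a sketch, assuming the module paths shown in this diff are importable as-is:

import functools
import numpy as np
from impyute.cs.fast_knn import fast_knn
from impyute.ops import inverse_distance_weighting as idw

data = np.arange(25, dtype=float).reshape(5, 5)
data[0, 2] = np.nan
# power=1 weights neighbours by plain inverse distance rather than 1/d**2
filled = fast_knn(data, k=2, idw_fn=functools.partial(idw.shepards, power=1))
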
impyute/cs/random.py ADDED
@@ -0,0 +1,27 @@
+ import numpy as np
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def random_impute(data):
+     """ Fill missing values with a randomly selected value from the same
+     column.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     for x, y in nan_xy:
+         uniques = np.unique(data[:, y])
+         uniques = uniques[~np.isnan(uniques)]
+         data[x][y] = np.random.choice(uniques)
+     return data
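
A sketch of random_impute (illustrative data; note that the sampling pool is the column's unique observed values, so repeated values carry no extra weight):

import numpy as np
from impyute.cs.random import random_impute

data = np.array([[1.0, 2.0],
                 [np.nan, 2.0],
                 [3.0, np.nan]])
filled = random_impute(data)  # each NaN drawn from {1, 3} or {2} respectively
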
impyute/dataset/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """ Real-world/mock datasets and missingness corruptors to experiment with. """
+ from .base import randu
+ from .base import randn
+ from .base import mnist
+
+ __all__ = ["randu", "randn", "mnist"]
impyute/dataset/base.py ADDED
@@ -0,0 +1,137 @@
+ """ Shared functions to load/generate data """
+ import itertools
+ import math
+ import random
+ import string
+ import numpy as np
+ from impyute.dataset.corrupt import Corruptor
+ from impyute.ops import error
+
+ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
+     """ Return a randomly generated dataset of numbers with uniformly
+     distributed values between bound[0] and bound[1].
+
+     Parameters
+     ----------
+     bound: tuple (start, stop)
+         Determines the range of values in the matrix. Index 0 for start
+         value and index 1 for stop value. Start is inclusive, stop is
+         exclusive.
+     shape: tuple (optional)
+         Size of the randomly generated data
+     missingness: ('mcar', 'mar', 'mnar')
+         Type of missingness you want in your dataset
+     thr: float between [0, 1]
+         Percentage of missing data in generated data
+     dtype: ('int', 'float')
+         Type of data
+
+     Returns
+     -------
+     numpy.ndarray
+     """
+     if dtype == "int":
+         data = np.random.randint(bound[0], bound[1], size=shape).astype(float)
+     elif dtype == "float":
+         data = np.random.uniform(bound[0], bound[1], size=shape)
+     corruptor = Corruptor(data, thr=thr)
+     raw_data = getattr(corruptor, missingness)()
+     return raw_data
+
+
+ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"):
+     """ Return a randomly generated dataset of numbers with normally
+     distributed values with the given mu and sigma.
+
+     Parameters
+     ----------
+     theta: tuple (mu, sigma)
+         Mean and standard deviation of the values in the matrix
+     shape: tuple (optional)
+         Size of the randomly generated data
+     missingness: ('mcar', 'mar', 'mnar')
+         Type of missingness you want in your dataset
+     thr: float between [0, 1]
+         Percentage of missing data in generated data
+     dtype: ('int', 'float')
+         Type of data
+
+     Returns
+     -------
+     numpy.ndarray
+     """
+     mean, sigma = theta
+     data = np.random.normal(mean, sigma, size=shape)
+     if dtype == "int":
+         data = np.round(data)
+     elif dtype == "float":
+         pass
+     corruptor = Corruptor(data, thr=thr)
+     raw_data = getattr(corruptor, missingness)()
+     return raw_data
+
+ def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
+     """ Return a randomly generated dataset of uniformly distributed
+     categorical data (alphabetic characters).
+
+     Parameters
+     ----------
+     nlevels: int
+         Number of different categories in the dataset
+     shape: tuple (optional)
+         Size of the randomly generated data
+     missingness: string in ('mcar', 'mar', 'mnar')
+         Type of missingness you want in your dataset
+     thr: float between [0, 1]
+         Percentage of missing data in generated data
+
+     Returns
+     -------
+     numpy.ndarray
+     """
+     if shape[0]*shape[1] < nlevels:
+         raise error.BadInputError("nlevels exceeds the size of the desired dataset; decrease nlevels or increase the shape")
+
+     length = len(string.ascii_lowercase)
+     n_fold = int(math.floor(math.log(nlevels, length)))
+     cat_pool = list(string.ascii_lowercase)
+
+     # When nlevels > 26 the single letters are used up, so generate
+     # multi-letter strings as extra categories
+     if n_fold > 0:
+         for i in range(2, n_fold+2):
+             pool_candidate = list(itertools.product(string.ascii_lowercase, repeat=i))
+             cat_pool.extend([''.join(w) for w in pool_candidate])
+             if len(cat_pool) > nlevels:
+                 break
+
+     cat = random.sample(cat_pool, nlevels)
+     data = np.random.choice(cat, shape, replace=True)
+
+     # Make sure the generated data contains all nlevels categories
+     while len(np.unique(data)) != nlevels:
+         data = np.random.choice(cat, shape, replace=True)
+
+     corruptor = Corruptor(data, thr=thr, dtype=str)
+     raw_data = getattr(corruptor, missingness)()
+     return raw_data
+
+
+ def mnist(missingness="mcar", thr=0.2):
+     """ Loads a corrupted copy of MNIST.
+
+     Parameters
+     ----------
+     missingness: ('mcar', 'mar', 'mnar')
+         Type of missingness you want in your dataset
+     thr: float between [0, 1]
+         Percentage of missing data in generated data
+
+     Returns
+     -------
+     dict with keys "X" (corrupted pixel data) and "Y" (labels)
+     """
+     # fetch_mldata was removed from scikit-learn; fetch_openml is the
+     # current equivalent
+     from sklearn.datasets import fetch_openml
+     dataset = fetch_openml("mnist_784", version=1, as_frame=False)
+     corruptor = Corruptor(dataset.data, thr=thr)
+     data = getattr(corruptor, missingness)()
+     return {"X": data, "Y": dataset.target}
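
A sketch of generating toy datasets with the functions above (argument values are examples only):

from impyute.dataset.base import randu, randn, randc

u = randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2)  # uniform ints, 20% NaN
n = randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2)   # standard normal, 20% NaN
c = randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2)      # 5 letter categories
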
impyute/dataset/corrupt.py ADDED
@@ -0,0 +1,55 @@
+ """ impyute.dataset.corrupt """
+ import numpy as np
+
+
+ class Corruptor:
+     """ Adds missing values to a complete dataset.
+
+     Attributes
+     ----------
+     data: np.ndarray
+         Matrix of values with no NaN's that you want to add NaN's to.
+     thr: float (optional)
+         The percentage of null values you want in your dataset, a number
+         between 0 and 1.
+
+     Methods
+     -------
+     mcar()
+         Overwrite values with MCAR placed NaN's.
+     mar()
+         Overwrite values with MAR placed NaN's.
+     mnar()
+         Overwrite values with MNAR placed NaN's.
+
+     """
+     def __init__(self, data, thr=0.2, dtype=float):
+         self.dtype = data.dtype
+         self.shape = np.shape(data)
+         self.data = data.astype(dtype)
+         self.thr = thr
+
+     def mcar(self):
+         """ Overwrites values with MCAR placed NaN's """
+         data_1d = self.data.flatten()
+         n_total = len(data_1d)
+         nan_x = np.random.choice(range(n_total),
+                                  size=int(self.thr*n_total),
+                                  replace=False)
+         for x_i in nan_x:
+             data_1d[x_i] = np.nan
+         output = data_1d.reshape(self.shape)
+         return output
+
+     def mar(self):
+         """ Overwrites values with MAR placed NaN's (not implemented) """
+         pass
+
+     def mnar(self):
+         """ Overwrites values with MNAR placed NaN's (not implemented) """
+         pass
+
+     def complete(self):
+         """ Do nothing to the data """
+         output = self.data
+         return output
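
A sketch of using Corruptor directly on a complete matrix (made-up data):

import numpy as np
from impyute.dataset.corrupt import Corruptor

complete = np.random.uniform(size=(4, 4))
corruptor = Corruptor(complete, thr=0.25)  # knock out ~25% of entries
with_nans = corruptor.mcar()               # NaN positions chosen uniformly at random
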
impyute/deletion/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """ Missing data approaches that delete values. """
+
+ from .complete_case import complete_case
+
+ __all__ = ["complete_case"]
impyute/deletion/complete_case.py ADDED
@@ -0,0 +1,21 @@
+ """ impyute.deletion.complete_case """
+ import numpy as np
+ from impyute.ops import wrapper
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def complete_case(data):
+     """ Return only the rows with no missing values.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to filter.
+
+     Returns
+     -------
+     numpy.ndarray
+         Rows of the data that contain no NaN's.
+
+     """
+     return data[~np.isnan(data).any(axis=1)]
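
A sketch of complete-case deletion (illustrative array):

import numpy as np
from impyute.deletion.complete_case import complete_case

data = np.array([[1.0, 2.0],
                 [np.nan, 4.0],
                 [5.0, 6.0]])
complete_case(data)  # keeps rows 0 and 2; the row containing NaN is dropped
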
impyute/ops/__init__.py ADDED
@@ -0,0 +1,12 @@
+ """ Unorganized set of utility functions """
+
+ from . import error
+ from . import inverse_distance_weighting
+ from . import matrix
+ from . import util
+ from . import wrapper
+
+ __all__ = [
+     "error", "inverse_distance_weighting", "matrix",
+     "util", "wrapper"
+ ]
impyute/ops/error.py ADDED
@@ -0,0 +1,9 @@
+ """ Impyute specific error messages """
+
+ class BadInputError(Exception):
+     "Error thrown when input args don't match spec"
+     pass
+
+ class BadOutputError(Exception):
+     "Error thrown when outputs don't match spec"
+     pass
impyute/ops/inverse_distance_weighting.py ADDED
@@ -0,0 +1,31 @@
+ """ Assign weights to distances such that farther values are weighted less """
+ import numpy as np
+
+ def shepards(distances, power=2):
+     """ Basic inverse distance weighting function
+
+     Parameters
+     ----------
+     distances: list/numpy.ndarray
+         1D list of numbers (e.g. distance results from a call to KDTree.query)
+
+     power: int
+         Default of 2 used since the referenced paper stated an exponent of 2
+         "gives seemingly satisfactory results"
+
+     Returns
+     -------
+     numpy.ndarray
+         1D list of numbers that sum to 1, representing the weights of the
+         provided distances, in order.
+
+     References
+     ----------
+
+     Shepard, Donald (1968). "A two-dimensional interpolation function for irregularly-spaced data".
+     Proceedings of the 1968 ACM National Conference. pp. 517-524. doi:10.1145/800186.810616
+     """
+     return to_percentage(1/np.power(distances, power))
+
+ def to_percentage(vec):
+     """ Converts a list of real numbers into a list of percentages that sum to 1 """
+     return vec/np.sum(vec)
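
A worked example of shepards: distances 1, 2, 4 give raw weights 1, 1/4, 1/16, which normalise to sum to 1:

import numpy as np
from impyute.ops.inverse_distance_weighting import shepards

shepards(np.array([1.0, 2.0, 4.0]))
# array([0.76190476, 0.19047619, 0.04761905])
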
impyute/ops/matrix.py ADDED
@@ -0,0 +1,47 @@
+ """ Common operations on matrices
+
+ *Look into whether it's worth writing these in raw C*
+ """
+ import numpy as np
+
+ def nan_indices(data):
+     """ Finds the indices of all missing values.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+
+     Returns
+     -------
+     numpy.ndarray
+         N x 2 array of (row, column) indices, one row per missing value.
+
+     """
+     return np.argwhere(np.isnan(data))
+
+ def map_nd(fn, arr):
+     """ Apply fn elementwise over an entire n-dimensional array.
+
+     Parameters
+     ----------
+     fn: callable
+     arr: numpy.ndarray
+
+     Returns
+     -------
+     numpy.ndarray
+
+     """
+     return np.vectorize(fn)(arr)
+
+ def every_nd(fn, arr):
+     """ Return True if fn is true for all elements of arr.
+
+     Parameters
+     ----------
+     fn: callable
+     arr: numpy.ndarray
+
+     Returns
+     -------
+     bool
+
+     """
+     return all(map(fn, arr.flatten()))
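
A quick illustration of nan_indices, which underlies every imputer in this package (illustrative array):

import numpy as np
from impyute.ops import matrix

data = np.array([[1.0, np.nan],
                 [np.nan, 4.0]])
matrix.nan_indices(data)
# array([[0, 1],
#        [1, 0]])
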
impyute/ops/testing.py ADDED
@@ -0,0 +1,20 @@
+ """ Utilities used for unit tests """
+ import numpy as np
+
+
+ def return_na_check(data):
+     """ Helper function for tests: checks that the returned data is a
+     numpy array and that the imputed data has no NaN's.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to check.
+
+     Returns
+     -------
+     None
+
+     """
+     assert isinstance(data, np.ndarray)
+     assert not np.isnan(data).any()
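
A sketch of how a test might use return_na_check with one of the imputers above (the test body is hypothetical, not from the package's test suite):

import numpy as np
from impyute.cs.central_tendency import mean
from impyute.ops.testing import return_na_check

def test_mean_leaves_no_nans():
    data = np.array([[1.0, np.nan], [3.0, 4.0]])
    return_na_check(mean(data))  # asserts ndarray output with no NaN's remaining
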