cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
missingpy/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ #from .knnimpute import KNNImputer
2
+ from .missforest import MissForest
3
+
4
+ __all__ = ['MissForest']
missingpy/knnimpute.py ADDED
@@ -0,0 +1,328 @@
1
+ """KNN Imputer for Missing Data"""
2
+ # Author: Ashim Bhattarai
3
+ # License: GNU General Public License v3 (GPLv3)
4
+
5
+ import warnings
6
+
7
+ import numpy as np
8
+
9
+ from sklearn.base import BaseEstimator, TransformerMixin
10
+ from sklearn.utils import check_array
11
+ from sklearn.utils.validation import check_is_fitted
12
+ from sklearn.utils.validation import FLOAT_DTYPES
13
+ from sklearn.neighbors.base import _check_weights
14
+ from sklearn.neighbors.base import _get_weights
15
+
16
+ from .pairwise_external import pairwise_distances
17
+ from .pairwise_external import _get_mask
18
+ from .pairwise_external import _MASKED_METRICS
19
+
20
+ __all__ = [
21
+ 'KNNImputer',
22
+ ]
23
+
24
+
25
class KNNImputer(BaseEstimator, TransformerMixin):
    """Imputation for completing missing values using k-Nearest Neighbors.

    Each sample's missing values are imputed using values from ``n_neighbors``
    nearest neighbors found in the training set. Each missing feature is then
    imputed as the average, either weighted or unweighted, of these neighbors.
    Note that if a sample has more than one feature missing, then the
    neighbors for that sample can be different depending on the particular
    feature being imputed. Finally, where the number of donor neighbors is
    less than ``n_neighbors``, the training set average for that feature is
    used during imputation.

    Parameters
    ----------
    missing_values : integer or "NaN", optional (default = "NaN")
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For missing values encoded as
        ``np.nan``, use the string value "NaN".

    n_neighbors : int, optional (default = 5)
        Number of neighboring samples to use for imputation.

    weights : str or callable, optional (default = "uniform")
        Weight function used in prediction.  Possible values:

        - 'uniform' : uniform weights.  All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          in this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    metric : str or callable, optional (default = "masked_euclidean")
        Distance metric for searching neighbors. Possible values:
        - 'masked_euclidean'
        - [callable] : a user-defined function which conforms to the
          definition of _pairwise_callable(X, Y, metric, **kwds). In other
          words, the function accepts two arrays, X and Y, and a
          ``missing_values`` keyword in **kwds and returns a scalar distance
          value.

    row_max_missing : float, optional (default = 0.5)
        The maximum fraction of columns (i.e. features) that can be missing
        before the sample is excluded from nearest neighbor imputation. It
        means that such rows will not be considered a potential donor in
        ``fit()``, and in ``transform()`` their missing feature values will be
        imputed to be the column mean for the entire dataset.

    col_max_missing : float, optional (default = 0.8)
        The maximum fraction of rows (or samples) that can be missing
        for any feature beyond which an error is raised.

    copy : boolean, optional (default = True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, if metric is
        "masked_euclidean" and copy=False then missing_values in the
        input matrix X will be overwritten with zeros.

    Attributes
    ----------
    statistics_ : 1-D array of length {n_features}
        The 1-D array contains the mean of each feature calculated using
        observed (i.e. non-missing) values. This is used for imputing
        missing values in samples that are either excluded from nearest
        neighbors search because they have too many ( > row_max_missing)
        missing features or because all of the sample's k-nearest neighbors
        (i.e., the potential donors) also have the relevant feature value
        missing.

    References
    ----------
    * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
      Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
      value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
      no. 6, 2001 Pages 520-525.

    Examples
    --------
    >>> from missingpy.knnimpute import KNNImputer
    >>> nan = float("NaN")
    >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
    >>> imputer = KNNImputer(n_neighbors=2, weights="uniform")
    >>> imputer.fit_transform(X)
    array([[1. , 2. , 4. ],
           [3. , 4. , 3. ],
           [5.5, 6. , 5. ],
           [8. , 8. , 7. ]])
    """

    def __init__(self, missing_values="NaN", n_neighbors=5,
                 weights="uniform", metric="masked_euclidean",
                 row_max_missing=0.5, col_max_missing=0.8, copy=True):
        # Parameters are stored verbatim; they are validated in ``fit``.
        self.missing_values = missing_values
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.row_max_missing = row_max_missing
        self.col_max_missing = col_max_missing
        self.copy = copy

    def _impute(self, dist, X, fitted_X, mask, mask_fx):
        """Helper function to find and impute missing values.

        Parameters
        ----------
        dist : 2-D array
            Distances indexed as ``dist[row_of_X, row_of_fitted_X]``.
        X : 2-D array
            Data to complete; it is modified in place and also returned.
        fitted_X : 2-D array
            Donor samples retained by ``fit``.
        mask : 2-D boolean array
            True where ``X`` holds a missing value.
        mask_fx : 2-D boolean array
            True where ``fitted_X`` holds a missing value.

        Returns
        -------
        X : 2-D array
            The same array object, with the masked entries filled in.
        """
        _, n_cols_X = X.shape

        # For each column, find and impute
        for c in range(n_cols_X):
            if not np.any(mask[:, c], axis=0):
                # Nothing is missing in this column.
                continue

            # Row index for receivers and potential donors (pdonors)
            receivers_row_idx = np.where(mask[:, c])[0]
            pdonors_row_idx = np.where(~mask_fx[:, c])[0]

            # Impute using column mean if n_neighbors are not available
            if len(pdonors_row_idx) < self.n_neighbors:
                warnings.warn("Insufficient number of neighbors! "
                              "Filling in column mean.")
                X[receivers_row_idx, c] = self.statistics_[c]
                continue

            # Get distance from potential donors
            dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx]
            dist_pdonors = dist_pdonors.reshape(-1,
                                                len(pdonors_row_idx))

            # Argpartition to separate actual donors from the rest
            pdonors_idx = np.argpartition(
                dist_pdonors, self.n_neighbors - 1, axis=1)

            # Get final donors row index from pdonors
            donors_idx = pdonors_idx[:, :self.n_neighbors]

            # Get weights or None
            dist_pdonors_rows = np.arange(len(donors_idx))[:, None]
            weight_matrix = _get_weights(
                dist_pdonors[
                    dist_pdonors_rows, donors_idx], self.weights)
            donor_row_idx_ravel = donors_idx.ravel()

            # Retrieve donor values and calculate kNN score.
            # ``donors_idx`` indexes into the pdonor subset, so the subset
            # is materialised first and then indexed.
            fitted_X_temp = fitted_X[pdonors_row_idx]
            donors = fitted_X_temp[donor_row_idx_ravel, c].reshape(
                (-1, self.n_neighbors))
            donors_mask = _get_mask(donors, self.missing_values)
            donors = np.ma.array(donors, mask=donors_mask)

            # Final imputation
            imputed = np.ma.average(donors, axis=1,
                                    weights=weight_matrix)
            X[receivers_row_idx, c] = imputed.data
        return X

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : ignored
            Not used by this method.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If the metric cannot be used with NaN placeholders, if X holds
            +/- inf, if a column exceeds ``col_max_missing``, or if fewer
            than ``n_neighbors`` rows remain after dropping rows that
            exceed ``row_max_missing``.
        """

        # Check data integrity and calling arguments
        force_all_finite = self.missing_values not in ["NaN", np.nan]
        if not force_all_finite:
            if self.metric not in _MASKED_METRICS and not callable(
                    self.metric):
                raise ValueError(
                    "The selected metric does not support NaN values.")
        X = check_array(X, accept_sparse=False, dtype=np.float64,
                        force_all_finite=force_all_finite, copy=self.copy)
        self.weights = _check_weights(self.weights)

        # Check for +/- inf
        if np.any(np.isinf(X)):
            raise ValueError("+/- inf values are not allowed.")

        # Check if % missing in any column > col_max_missing
        mask = _get_mask(X, self.missing_values)
        if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)):
            raise ValueError("Some column(s) have more than {}% missing values"
                             .format(self.col_max_missing * 100))
        # Column means are taken over every row, including the rows that
        # are dropped from the donor pool below.
        X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data

        # Check if % missing in any row > row_max_missing
        bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing)
        if np.any(bad_rows):
            warnings.warn(
                "There are rows with more than {0}% missing values. These "
                "rows are not included as donor neighbors."
                .format(self.row_max_missing * 100))

            # Remove rows that have more than row_max_missing % missing
            X = X[~bad_rows, :]

        # Check if sufficient neighboring samples available
        if X.shape[0] < self.n_neighbors:
            raise ValueError("There are only %d samples, but n_neighbors=%d."
                             % (X.shape[0], self.n_neighbors))
        self.fitted_X_ = X
        self.statistics_ = X_col_means

        return self

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
            The input data to complete.

        Returns
        -------
        X : {array-like}, shape = [n_samples, n_features]
            The imputed dataset. When some rows exceed ``row_max_missing``
            the result is a newly allocated array into which the
            mean-imputed and kNN-imputed rows are merged in their original
            order.

        Raises
        ------
        ValueError
            If X holds +/- inf or its number of columns differs from the
            fitted data.
        """

        check_is_fitted(self, ["fitted_X_", "statistics_"])
        force_all_finite = self.missing_values not in ["NaN", np.nan]
        X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
                        force_all_finite=force_all_finite, copy=self.copy)

        # Check for +/- inf
        if np.any(np.isinf(X)):
            raise ValueError("+/- inf values are not allowed in data to be "
                             "transformed.")

        # Get fitted data and ensure correct dimension
        _, n_cols_fit_X = self.fitted_X_.shape
        n_rows_X, n_cols_X = X.shape

        if n_cols_X != n_cols_fit_X:
            raise ValueError("Incompatible dimension between the fitted "
                             "dataset and the one to be transformed.")
        mask = _get_mask(X, self.missing_values)

        row_total_missing = mask.sum(axis=1)
        if not np.any(row_total_missing):
            # Nothing to impute.
            return X

        # Check for excessive missingness in rows
        bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing)
        if np.any(bad_rows):
            warnings.warn(
                "There are rows with more than {0}% missing values. The "
                "missing features in these rows are imputed with column means."
                .format(self.row_max_missing * 100))
            X_bad = X[bad_rows, :]
            X = X[~bad_rows, :]
            mask = mask[~bad_rows]
            row_total_missing = mask.sum(axis=1)
        # Builtin ``bool`` is used because the ``np.bool`` alias no longer
        # exists in current NumPy releases.
        row_has_missing = row_total_missing.astype(bool)

        if np.any(row_has_missing):

            # Mask for fitted_X
            mask_fx = _get_mask(self.fitted_X_, self.missing_values)

            # Pairwise distances between receivers and fitted samples.
            # Only rows that need imputation are filled; the remaining rows
            # of ``dist`` stay uninitialised and are never read by
            # ``_impute`` because their mask entries are all False.
            dist = np.empty((len(X), len(self.fitted_X_)))
            dist[row_has_missing] = pairwise_distances(
                X[row_has_missing], self.fitted_X_, metric=self.metric,
                squared=False, missing_values=self.missing_values)

            # Find and impute missing
            X = self._impute(dist, X, self.fitted_X_, mask, mask_fx)

        # Merge bad rows to X and mean impute their missing values
        if np.any(bad_rows):
            bad_missing_index = np.where(_get_mask(X_bad, self.missing_values))
            X_bad[bad_missing_index] = np.take(self.statistics_,
                                               bad_missing_index[1])
            X_merged = np.empty((n_rows_X, n_cols_X))
            X_merged[bad_rows, :] = X_bad
            X_merged[~bad_rows, :] = X
            X = X_merged
        return X

    def fit_transform(self, X, y=None, **fit_params):
        """Fit KNNImputer and impute all missing values in X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : ignored
            Not used by this method.

        **fit_params : ignored
            Accepted for signature compatibility; not forwarded to ``fit``.

        Returns
        -------
        X : {array-like}, shape (n_samples, n_features)
            Returns imputed dataset.
        """
        return self.fit(X).transform(X)