cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,315 @@
1
+ # This file is a modification of sklearn.metrics.pairwise
2
+ # Modifications by Ashim Bhattarai
3
+ """
4
+ New BSD License
5
+
6
+ Copyright (c) 2007–2018 The scikit-learn developers.
7
+ All rights reserved.
8
+
9
+
10
+ Redistribution and use in source and binary forms, with or without
11
+ modification, are permitted provided that the following conditions are met:
12
+
13
+ a. Redistributions of source code must retain the above copyright notice,
14
+ this list of conditions and the following disclaimer.
15
+ b. Redistributions in binary form must reproduce the above copyright
16
+ notice, this list of conditions and the following disclaimer in the
17
+ documentation and/or other materials provided with the distribution.
18
+ c. Neither the name of the Scikit-learn Developers nor the names of
19
+ its contributors may be used to endorse or promote products
20
+ derived from this software without specific prior written
21
+ permission.
22
+
23
+
24
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27
+ ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
28
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
34
+ DAMAGE.
35
+ """
36
+
37
+ from __future__ import division
38
+ from functools import partial
39
+ import itertools
40
+
41
+ import numpy as np
42
+ from scipy.spatial import distance
43
+ from scipy.sparse import issparse
44
+
45
+ from sklearn.metrics.pairwise import _VALID_METRICS, _return_float_dtype
46
+ from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS
47
+ from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
48
+ from sklearn.metrics.pairwise import _parallel_pairwise
49
+ from sklearn.utils import check_array
50
+
51
+ from .utils import masked_euclidean_distances
52
+
53
# Metrics provided by this package that tolerate missing values.
_MASKED_METRICS = ['masked_euclidean']

# NOTE: `+=` on a list extends the object imported from
# sklearn.metrics.pairwise in place (assuming it is a list, as the original
# `+= [...]` implies). Guard the registration so that executing this module
# more than once, or alongside another module performing the same
# registration, does not append a duplicate entry.
if 'masked_euclidean' not in _VALID_METRICS:
    _VALID_METRICS += ['masked_euclidean']
55
+
56
+
57
def _get_mask(X, value_to_mask):
    """Compute the boolean mask ``X == value_to_mask``.

    Parameters
    ----------
    X : array-like
        Values to inspect.

    value_to_mask : object
        Sentinel marking a missing entry. The string ``"NaN"`` or a
        floating-point NaN selects NaN-based masking; any other value is
        matched by equality.

    Returns
    -------
    mask : result of ``np.isnan(X)`` or of ``X == value_to_mask``
    """
    if isinstance(value_to_mask, str):
        # np.isnan() rejects strings, so only the literal "NaN" spelling
        # requests NaN masking; other strings are ordinary sentinels.
        mask_nan = value_to_mask == "NaN"
    else:
        try:
            mask_nan = bool(np.isnan(value_to_mask))
        except TypeError:
            # Sentinel of a type np.isnan() cannot handle (e.g. None):
            # it cannot be NaN, so fall through to equality matching.
            mask_nan = False

    if mask_nan:
        return np.isnan(X)
    return X == value_to_mask
63
+
64
+
65
def check_pairwise_arrays(X, Y, precomputed=False, dtype=None,
                          accept_sparse='csr', force_all_finite=True,
                          copy=False):
    """Validate X and Y for a pairwise computation and return them.

    If Y is None (or is the very same object as X), X is validated once and
    Y is returned as a reference to the validated X, not a copy. Otherwise
    both inputs are validated independently with the same settings.

    After validation the shapes are checked: for a precomputed distance
    matrix ``X.shape[1]`` must equal ``Y.shape[0]``; otherwise both inputs
    must have the same size along the second dimension.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples_a, n_features)

    Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)

    precomputed : bool
        True if X is to be treated as precomputed distances to the samples
        in Y.

    dtype : string, type, list of types or None (default=None)
        Data type required for X and Y. If None, the float dtype returned
        by ``_return_float_dtype`` is used.

    accept_sparse : string, boolean or list/tuple of strings
        Allowed sparse matrix formats; forwarded to ``check_array``.

    force_all_finite : bool
        Whether to raise an error on np.inf and np.nan in X (or Y if it
        exists); forwarded to ``check_array``.

    copy : bool
        Whether a forced copy will be triggered; forwarded to
        ``check_array``.

    Returns
    -------
    safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
        The validated X.

    safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
        The validated Y, or a reference to safe_X when Y was None or X.

    Raises
    ------
    ValueError
        If the shapes of X and Y are incompatible.
    """
    # Local import keeps this block self-contained.
    import inspect

    X, Y, dtype_float = _return_float_dtype(X, Y)

    # A dtype-conversion warning is only meaningful when the caller asked
    # for a specific dtype.
    warn_on_dtype = dtype is not None
    if dtype is None:
        dtype = dtype_float

    check_kwargs = {
        'accept_sparse': accept_sparse,
        'dtype': dtype,
        'copy': copy,
        'estimator': 'check_pairwise_arrays',
    }

    # check_array's keyword set differs between scikit-learn releases:
    # `warn_on_dtype` was removed and `force_all_finite` was renamed to
    # `ensure_all_finite`. Pass only what the installed version accepts so
    # the call neither raises TypeError nor emits a deprecation warning.
    accepted = inspect.signature(check_array).parameters
    if 'ensure_all_finite' in accepted:
        check_kwargs['ensure_all_finite'] = force_all_finite
    else:
        check_kwargs['force_all_finite'] = force_all_finite
    if 'warn_on_dtype' in accepted:
        check_kwargs['warn_on_dtype'] = warn_on_dtype

    if Y is X or Y is None:
        # Validate once and alias, so callers can detect the symmetric case
        # with an identity test (`X is Y`).
        X = Y = check_array(X, **check_kwargs)
    else:
        X = check_array(X, **check_kwargs)
        Y = check_array(Y, **check_kwargs)

    if precomputed:
        if X.shape[1] != Y.shape[0]:
            raise ValueError("Precomputed metric requires shape "
                             "(n_queries, n_indexed). Got (%d, %d) "
                             "for %d indexed." %
                             (X.shape[0], X.shape[1], Y.shape[0]))
    elif X.shape[1] != Y.shape[1]:
        raise ValueError("Incompatible dimension for X and Y matrices: "
                         "X.shape[1] == %d while Y.shape[1] == %d" % (
                             X.shape[1], Y.shape[1]))

    return X, Y
152
+
153
+
154
def _pairwise_callable(X, Y, metric, **kwds):
    """Evaluate a user-supplied ``metric`` on every pair of rows.

    X and Y are validated with ``check_pairwise_arrays``; non-finite values
    are tolerated whenever ``metric`` is callable. When the validated X and
    Y are the same object, only the strict upper triangle is evaluated and
    mirrored, and the diagonal is then evaluated separately. Otherwise
    every cell is evaluated.

    Extra keyword arguments are forwarded to ``metric`` on each call.
    """
    X, Y = check_pairwise_arrays(X, Y,
                                 force_all_finite=not callable(metric))
    n_rows = X.shape[0]
    n_cols = Y.shape[0]

    if X is not Y:
        # General case: fill every cell, row by row.
        out = np.empty((n_rows, n_cols), dtype='float')
        for i in range(n_rows):
            for j in range(n_cols):
                out[i, j] = metric(X[i], Y[j], **kwds)
        return out

    # Symmetric case: evaluate each unordered pair (i < j) once.
    upper = np.zeros((n_rows, n_cols), dtype='float')
    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            upper[i, j] = metric(X[i], Y[j], **kwds)

    # Mirror into the lower triangle. A fresh array is built on purpose:
    # an in-place `+=` with the transpose would produce incorrect results.
    out = upper + upper.T

    # The diagonal is computed explicitly because nonzero diagonals are
    # allowed for both metrics and kernels.
    for i in range(n_rows):
        row = X[i]
        out[i, i] = metric(row, row, **kwds)

    return out
185
+
186
+
187
# Register the missing-value-aware Euclidean distance under its metric name
# so that pairwise_distances can dispatch to it by string.
PAIRWISE_DISTANCE_FUNCTIONS.update(
    {'masked_euclidean': masked_euclidean_distances})
189
+
190
+
191
def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
    """Compute the distance matrix from a vector array X and optional Y.

    X is either a feature array or, when ``metric == "precomputed"``, a
    distance matrix that is validated and returned unchanged. If Y is
    given, distances are computed between the rows of X and the rows of Y;
    otherwise between the rows of X themselves.

    Valid string metrics are:

    - the names registered in ``PAIRWISE_DISTANCE_FUNCTIONS``, which
      include the scikit-learn metrics ('cityblock', 'cosine',
      'euclidean', 'l1', 'l2', 'manhattan') and 'masked_euclidean'. The
      latter does not yet support sparse matrices.

    - the remaining ``scipy.spatial.distance`` metrics, e.g. 'braycurtis',
      'canberra', 'chebyshev', 'correlation', 'dice', 'hamming',
      'jaccard', 'mahalanobis', 'minkowski', 'rogerstanimoto',
      'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
      'sqeuclidean', 'yule'. These do not support sparse matrix inputs.

    When a name is available from both sources, the registered
    (scikit-learn) implementation is used.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
            [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        Name of the metric, "precomputed", or a callable. A callable is
        invoked on each pair of rows and must return the distance between
        them.

    n_jobs : int
        The number of jobs to use for the computation; forwarded to
        ``_parallel_pairwise``. With ``n_jobs == 1`` and no distinct Y, the
        scipy metrics are computed directly via ``pdist``.

    **kwds : optional keyword parameters
        Passed on to the distance function. For 'masked_euclidean' and
        callable metrics, ``missing_values`` (default NaN) identifies the
        missing-value marker.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        D_{i, j} is the distance between the ith and jth rows of X if Y is
        None, otherwise between the ith row of X and the jth row of Y.

    Raises
    ------
    ValueError
        If ``metric`` is unknown, or if every value of X is missing for a
        masked or callable metric.
    TypeError
        If a scipy metric is requested for sparse input.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix,
        in order to limit memory usage.
    paired_distances : Computes the distances between corresponding
        elements of two arrays
    """
    is_known = metric in _VALID_METRICS or callable(metric)
    if not is_known and metric != "precomputed":
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric in _MASKED_METRICS or callable(metric):
        # An absent or None `missing_values` means NaN marks missing data.
        missing_values = kwds.get("missing_values")
        if missing_values is None:
            missing_values = np.nan

        # For sparse input only the explicitly stored values are inspected.
        if issparse(X):
            values = X.data
        else:
            values = X

        # This guard fires only when every inspected value is missing.
        if np.all(_get_mask(values, missing_values)):
            raise ValueError(
                "One or more samples(s) only have missing values.")

    if metric == "precomputed":
        # Nothing to compute: validate the shape and hand X back.
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        return X

    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        # Remaining names are handled by scipy.spatial.distance.
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        required_dtype = None
        if metric in PAIRWISE_BOOLEAN_FUNCTIONS:
            required_dtype = bool

        X, Y = check_pairwise_arrays(X, Y, dtype=required_dtype)

        if n_jobs == 1 and X is Y:
            # Single-job symmetric case: compute the condensed form once
            # and expand it to a square matrix.
            condensed = distance.pdist(X, metric=metric, **kwds)
            return distance.squareform(condensed)

        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
File without changes