cpgtools-2.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0

cpgtools-2.0.5.data/scripts/dmc_Bayes.py
@@ -0,0 +1,442 @@
+#!python
+
+'''
+#=========================================================================================
+Description
+-----------
+Unlike statistical testing, this program tries to estimate "how different the means
+of the two groups are" using a Bayesian approach. MCMC is used to estimate the
+"means", the "difference of means", the "95% HDI (highest posterior density interval)",
+and the posterior probability that the HDI does NOT include "0".
+
+It is similar to John Kruschke's BEST algorithm (Bayesian Estimation Supersedes the
+T test) (http://www.indiana.edu/~kruschke/BEST/).
+
+Notes
+-----
+This program is much slower than the t-test due to the MCMC (Markov chain Monte Carlo)
+step. Running it with multiple processes (-p) is highly recommended.
+'''
+import sys,os
+import collections    #used by collections.defaultdict below
+import subprocess
+import numpy as np
+from scipy import stats
+from optparse import OptionParser
+from cpgmodule import ireader
+from cpgmodule.utils import *
+from cpgmodule import BED
+from cpgmodule import padjust
+from cpgmodule._version import __version__
+from multiprocessing import Process, Manager, current_process
+
+__author__ = "Liguo Wang"
+__copyright__ = "Copyleft"
+__credits__ = []
+__license__ = "GPL"
+__maintainer__ = "Liguo Wang"
+__email__ = "wang.liguo@mayo.edu"
+__status__ = "Development"
+
+
+def dt(x, mu, sig):
+    '''
+    The log probability density of the t distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Array of quantiles
+    mu : float
+        The location (mean) of the distribution
+    sig : float
+        The scale (standard deviation) of the distribution
+
+    Returns
+    -------
+    logpdf : array_like
+        Log of the t probability density function evaluated at x, with
+        ``df = len(x) - 1`` degrees of freedom
+    '''
+    return np.log(stats.t.pdf(x, loc = mu, scale = sig, df = len(x)-1))
+
+
+#def dnorm(x, mu, sig):
+#    return np.log(1/(sig * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sig**2)))
+
+def dnorm(x, mu, sig):
+    '''
+    The log probability density of the normal distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Array of quantiles
+    mu : float
+        The mean of the normal distribution
+    sig : float
+        The standard deviation of the normal distribution
+
+    Returns
+    -------
+    logpdf : array_like
+        Log of the normal probability density function evaluated at x
+
+    Notes
+    -----
+    This function must keep returning the *log* density; `like` and `prior`
+    below sum (rather than multiply) its results.
+    '''
+    return np.log(stats.norm.pdf(x, mu, sig))
+
+def dexp(x, l):
+    '''
+    The log probability density of the exponential distribution.
+
+    Parameters
+    ----------
+    x : array_like
+        Array of quantiles
+    l : float
+        The rate parameter (lambda), such that ``pdf = lambda * exp(-lambda * x)``.
+        The `scale` parameter of `stats.expon` corresponds to ``scale = 1/lambda``.
+
+    Returns
+    -------
+    logpdf : array_like
+        Log of the exponential probability density function evaluated at x
+
+    Notes
+    -----
+    This function must keep returning the *log* density; `prior` below sums
+    (rather than multiplies) its results.
+    '''
+    if x > 0:
+        return np.log(stats.expon.pdf(x, scale = 1/l))
+    else:
+        return 0
+
+def like(s1, s2, para):
+    '''
+    Estimate the log likelihood of observing the data (s1 and s2) given parameters `para`.
+
+    Parameters
+    ----------
+    s1 : array_like
+        Beta values in group-1
+    s2 : array_like
+        Beta values in group-2
+    para : list
+        Parameters [mu1, sig1, mu2, sig2]
+
+    Returns
+    -------
+    likelihood : float
+        The log likelihood of observing the data (s1 and s2) given parameters `para`
+    '''
+    [mu1, sig1, mu2, sig2] = para
+    return np.sum(dt(s1, mu1, sig1)) + np.sum(dt(s2, mu2, sig2))
+
+def prior(s1, s2, para):
+    '''
+    The log prior probability of the means and standard deviations.
+
+    Parameters
+    ----------
+    s1 : array_like
+        Beta values in group-1
+    s2 : array_like
+        Beta values in group-2
+    para : list
+        Parameters [mu1, sig1, mu2, sig2]
+
+    Returns
+    -------
+    likelihood : float
+        The log prior of the parameters. Each `mean` follows a normal distribution
+        whose mean is the pooled mean and whose std is the pooled std * 1000 (a
+        deliberately diffuse prior); each `std` follows an exponential distribution.
+    '''
+    [mu1, sig1, mu2, sig2] = para
+    pooled = np.append(s1, s2)
+    prior_mean = pooled.mean()
+    prior_std = 1000.0*pooled.std()
+
+    return np.sum([dnorm(mu1, prior_mean, prior_std), dnorm(mu2, prior_mean, prior_std), dexp(sig1, 0.1), dexp(sig2, 0.1)])
+
+def posterior(s1, s2, para):
+    '''
+    Parameters
+    ----------
+    s1 : array_like
+        Beta values in group-1
+    s2 : array_like
+        Beta values in group-2
+    para : list
+        Parameters [mu1, sig1, mu2, sig2]
+
+    Returns
+    -------
+    likelihood : float
+        The (unnormalized) log posterior: log likelihood plus log prior
+    '''
+    [mu1, sig1, mu2, sig2] = para
+    return like(s1, s2, [mu1, sig1, mu2, sig2]) + prior(s1, s2, [mu1, sig1, mu2, sig2])
+
+def computeHDI(chain, interval = .95):
+    '''
+    Compute the 95% highest density interval (HDI) of an MCMC chain.
+    Returns [lower, upper].
+    '''
+    # sort the chain in place
+    chain.sort()
+    # how many samples were generated?
+    nSample = chain.size
+    # how many samples must go in the HDI?
+    nSampleCred = int(np.ceil(nSample * interval))
+    # number of candidate intervals to be compared
+    nCI = nSample - nSampleCred
+    # width of every proposed interval
+    width = np.array([chain[i+nSampleCred] - chain[i] for i in range(nCI)])
+    # index of the lower bound of the shortest interval (which is the HDI)
+    best = width.argmin()
+    #HDI = {'Lower': chain[best], 'Upper': chain[best + nSampleCred], 'Width': width.min()}
+    HDI = [chain[best], chain[best + nSampleCred]]
+    return HDI
+
+
+def beta_bayes(results, id, s1, s2, seed, niter = 10000, nburn_in = 500):
+    '''
+    Bayesian estimation of the difference between two group means; see
+    https://stats.stackexchange.com/questions/130389/bayesian-equivalent-of-two-sample-t-test
+    '''
+    np.random.seed(seed)
+
+    mu1_samples = []    #means sampled by MCMC for s1
+    mu2_samples = []    #means sampled by MCMC for s2
+
+    # run MCMC (Metropolis-Hastings sampling algorithm)
+    # Initialization: mu1, sig1, mu2, sig2
+    parameters = np.array([np.mean(s1), np.std(s1), np.mean(s2), np.std(s2)])
+    increment = (s1.std() + s2.std())/10    #proposal step; 5 times smaller than the average std
+    for iteration in np.arange(1,niter):
+        candidate = parameters + np.random.normal(0, increment, 4)
+        # reject candidates with negative standard deviations
+        if candidate[1] < 0 or candidate[3] < 0:
+            continue
+        ratio = np.exp(posterior(s1, s2, candidate) - posterior(s1, s2, parameters))
+        if np.random.uniform() < ratio:
+            parameters = candidate
+        if iteration < nburn_in:    #discard burn-in samples
+            continue
+        mu1_samples.append(parameters[0])
+        mu2_samples.append(parameters[2])
+
+    # calculate estimated means
+    mu1_samples = np.array(mu1_samples)
+    mu2_samples = np.array(mu2_samples)
+    est_mu1 = mu1_samples.mean()    #estimated mu1
+    est_mu2 = mu2_samples.mean()    #estimated mu2
+
+    # posterior probability that the difference of means keeps the sign of its median
+    diff = (mu1_samples - mu2_samples)
+    diff_median = np.median(diff)
+    if diff_median < 0:
+        prob = np.mean(diff < 0)
+    elif diff_median > 0:
+        prob = np.mean(diff > 0)
+    else:
+        prob = 0.5    #sampled differences are centered exactly on 0
+
+    # calculate HDI (computeHDI returns [lower, upper])
+    diff_HDI_l, diff_HDI_h = computeHDI(diff)
+
+    # CpG_ID, mean of group1, mean of group2, diff of mean, 95%HDI_low, HDI_high, probability
+    results.append( [id, est_mu1, est_mu2, est_mu1 - est_mu2, diff_HDI_l, diff_HDI_h, prob])
+
+def beta_bayes_new(results, id, s1, s2, seed, niter = 10000, nburn_in = 500):
+    '''
+    Same estimator as beta_bayes(); see
+    https://stats.stackexchange.com/questions/130389/bayesian-equivalent-of-two-sample-t-test
+    '''
+    np.random.seed(seed)
+
+    mu1_samples = []    #means sampled by MCMC for s1
+    mu2_samples = []    #means sampled by MCMC for s2
+
+    # run MCMC (Metropolis-Hastings sampling algorithm)
+    # Initialization: mu1, sig1, mu2, sig2
+    parameters = np.array([np.mean(s1), np.std(s1), np.mean(s2), np.std(s2)])
+    increment = (s1.std() + s2.std())/10    #proposal step; 5 times smaller than the average std
+    for iteration in np.arange(1,niter):
+        candidate = parameters + np.random.normal(0, increment, 4)
+        # reject candidates with negative standard deviations
+        if candidate[1] < 0 or candidate[3] < 0:
+            continue
+        ratio = np.exp(posterior(s1, s2, candidate) - posterior(s1, s2, parameters))
+        if np.random.uniform() < ratio:
+            parameters = candidate
+        if iteration < nburn_in:    #discard burn-in samples
+            continue
+        mu1_samples.append(parameters[0])
+        mu2_samples.append(parameters[2])
+
+    # calculate estimated means
+    mu1_samples = np.array(mu1_samples)
+    mu2_samples = np.array(mu2_samples)
+    est_mu1 = mu1_samples.mean()    #estimated mu1
+    est_mu2 = mu2_samples.mean()    #estimated mu2
+
+    # posterior probability that the difference of means keeps the sign of its median
+    diff = (mu1_samples - mu2_samples)
+    diff_median = np.median(diff)
+    if diff_median < 0:
+        prob = np.mean(diff < 0)
+    elif diff_median > 0:
+        prob = np.mean(diff > 0)
+    else:
+        prob = 0.5    #sampled differences are centered exactly on 0
+
+    # calculate HDI (computeHDI returns [lower, upper])
+    diff_HDI_l, diff_HDI_h = computeHDI(diff)
+
+    # CpG_ID, mean of group1, mean of group2, diff of mean, 95%HDI_low, HDI_high, probability
+    results.append( [id, est_mu1, est_mu2, est_mu1 - est_mu2, diff_HDI_l, diff_HDI_h, prob])
+
+
+def test():
+    np.random.seed(99)
+    sample1 = np.random.normal(100, 10, 8)
+    sample2 = np.random.normal(150, 15, 10)
+    print (','.join([str(i) for i in sample1]))
+    print (','.join([str(i) for i in sample2]))
+    out = []    #beta_bayes appends its result to this list
+    beta_bayes(out, 'test', sample1, sample2, seed = 99)
+    print (out)
+
+if __name__=='__main__':
+    #test()
+
+    usage="%prog [options]" + "\n"
+    parser = OptionParser(usage,version="%prog " + __version__)
+    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (.gz, .bz2).")
+    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated 2-column file with the 1st column containing sample IDs and the 2nd column containing group IDs. It must have a header row. Sample IDs should match the \"Data file\". Note: Only for two-group comparison.")
+    parser.add_option("-n","--niter",action="store",type="int", default=5000,dest="n_iter",help="Number of iterations when using the MCMC Metropolis-Hastings algorithm to draw samples from the posterior distribution. default=%default")
+    parser.add_option("-b","--burnin",action="store",type="int", default=500,dest="n_burn",help="Number of simulated samples to discard. These initial samples are usually not completely valid because the Markov chain has not stabilized to the stationary distribution. default=%default.")
+    parser.add_option("-p","--processor",action="store",type="int",dest="n_process",default=1,help="The number of processes. default=%default")
+    parser.add_option("-s","--seed",action="store",type='int', dest="seed",default=99, help="The seed used by the random number generator. default=%default")
+    parser.add_option("-o","--output",action="store",type="string", dest="out_file",help="The prefix of the output file.")
+    (options,args)=parser.parse_args()
+
+    print ()
+    #print (options.paired)
+    #print (options.welch_ttest)
+    if not (options.input_file):
+        print (__doc__)
+        parser.print_help()
+        sys.exit(101)
+
+    if not (options.group_file):
+        print (__doc__)
+        parser.print_help()
+        sys.exit(102)
+
+    if not (options.out_file):
+        print (__doc__)
+        parser.print_help()
+        sys.exit(103)
+    if options.n_iter <= 0:
+        print ("--niter must be a positive integer")
+        parser.print_help()
+        sys.exit(0)
+    if options.n_burn <= 0:
+        print ("--burnin must be a positive integer")
+        parser.print_help()
+        sys.exit(0)
+    if options.n_process <= 0:
+        print ("--processor must be a positive integer")
+        parser.print_help()
+        sys.exit(0)
+
+    np.random.seed(options.seed)
+    printlog("Read group file \"%s\" ..." % (options.group_file))
+    (ss,gs) = read_grp_file1(options.group_file)
+
+    s2g = {}    #sample ID -> group ID
+    for s,g in zip(ss,gs):
+        s2g[s] = g
+
+    g2s = collections.defaultdict(list)    #group ID -> list of sample IDs
+    for s,g in zip(ss, gs):
+        g2s[g].append(s)
+
+    group_IDs = sorted(g2s.keys())
+    for g in group_IDs:
+        print ("\tGroup %s has %d samples:" % (g, len(g2s[g])), file=sys.stderr)
+        print ('\t\t' + ','.join(g2s[g]), file=sys.stderr)
+
+    if len(group_IDs) != 2:
+        printlog("You must have two groups!")
+        sys.exit(1)
+
+
+    manager = Manager()
+    results = manager.list()    #list of lists shared between the main process and beta_bayes(): ID, group1.mean, group2.mean, prob
+
+    printlog("Read data file \"%s\" ..." % (options.input_file))
+    line_num = 0
+    p_count = 0
+    jobs = []
+    for l in ireader.reader(options.input_file):
+        line_num += 1
+        f = l.split()
+        if len(f) == 0: continue
+        if line_num == 1:
+            sample_IDs = f[1:]
+            # check if sample IDs match
+            for s in s2g:
+                if s not in sample_IDs:
+                    printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
+                    sys.exit(3)
+            g_IDs = [s2g[i] for i in sample_IDs]
+
+        else:
+            probe_ID = f[0]
+            p_count += 1
+            group1 = []    #beta values in group1
+            group2 = []    #beta values in group2
+
+            beta_values = f[1:]
+            for g,b in zip(g_IDs, beta_values):
+                #deal with non-numerical values
+                try:
+                    b = float(b)
+                except ValueError:
+                    continue
+                if g == group_IDs[0]:
+                    group1.append(b)
+                elif g == group_IDs[1]:
+                    group2.append(b)
+
+            group1 = np.array(group1)
+            group2 = np.array(group2)
+            job_name = probe_ID
+            p = Process(name = job_name, target = beta_bayes, args = (results, probe_ID, group1, group2, options.seed, options.n_iter, options.n_burn))
+            p.start()
+            jobs.append(p)
+
+            if p_count == options.n_process:
+                for proc in jobs: proc.join()    #wait for the current batch of processes to finish
+                p_count = 0
+                jobs = []
+                print("Finished %d\r" % (line_num - 1), end = '', file=sys.stderr)
+    for proc in jobs: proc.join()
+
+    OUT = open(options.out_file + '.bayes.tsv','w')
+    print ("\t".join(["ID", "mu1", "mu2", "mu_diff", "mu_diff (95% HDI)", "Probability"]), file=OUT)
+    for r in results:
+        print ("%s\t%f\t%f\t%f\t(%f,%f)\t%f" % (r[0], r[1], r[2], r[3], r[4], r[5], r[6]), file = OUT)
+    OUT.close()
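
A minimal sketch of driving the estimator above directly from Python, for readers who want to try it outside the command-line wrapper. It assumes `beta_bayes` and its helpers from the listing are available in the session, e.g. via `from dmc_Bayes import beta_bayes` with the script on the PYTHONPATH; that import path, the synthetic data, and the probe ID are illustrative assumptions, not part of the package:

    import numpy as np
    # from dmc_Bayes import beta_bayes   # hypothetical import; adjust to your setup

    # Toy "beta values": two groups with clearly different means.
    np.random.seed(99)
    group1 = np.random.normal(0.3, 0.05, 8)
    group2 = np.random.normal(0.6, 0.05, 10)

    # beta_bayes() appends its result to a shared list instead of returning it;
    # the script passes a multiprocessing.Manager().list(), but a plain list works too.
    results = []
    beta_bayes(results, 'cg_demo', group1, group2, seed=99, niter=5000, nburn_in=500)

    # Each record is [ID, mu1, mu2, mu1 - mu2, HDI_low, HDI_high, probability].
    for rec in results:
        print(rec)

From the shell, the equivalent run over a whole beta-value matrix would be something like `dmc_Bayes.py -i beta.tsv.gz -g group.csv -p 8 -o output` (file names hypothetical).
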
cpgtools-2.0.5.data/scripts/dmc_bb.py
@@ -0,0 +1,221 @@
+#!python
+
+"""
+Description
+-----------
+This program performs differential CpG analysis using a beta-binomial model on
+methylation proportions (in the form of "c,n", where "c" is the number of reads
+with methylated C and "n" is the total number of reads; both c and n are
+non-negative integers and c <= n).
+
+Example of input
+----------------
+The example below shows input data for 2 CpGs in 3 groups (A, B and C),
+with each group having 3 replicates:
+cgID    A_1    A_2    A_3    B_1    B_2    B_3    C_1    C_2    C_3
+CpG_1    129,170    166,178    7,9    16,16    10,10    10,15    11,15    16,22    20,36
+CpG_2    0,77    0,99    0,85    0,77    1,37    3,37    0,42    0,153    0,6
+
+Notes
+-----
+1. It can handle covariates.
+2. Input is proportion values, not beta values.
+3. You must install the R package "aod" before running this program
+   (https://cran.r-project.org/web/packages/aod/index.html).
+"""
+
+
+import sys,os
+import collections
+import subprocess
+import numpy as np
+import re
+from scipy import stats
+from optparse import OptionParser
+from cpgmodule import ireader
+from cpgmodule.utils import *
+from cpgmodule import BED
+from cpgmodule import padjust
+from cpgmodule._version import __version__
+
+__author__ = "Liguo Wang"
+__copyright__ = "Copyleft"
+__credits__ = []
+__license__ = "GPL"
+__maintainer__ = "Liguo Wang"
+__email__ = "wang.liguo@mayo.edu"
+__status__ = "Development"
+
+
+def main():
+    usage="%prog [options]" + "\n"
+    parser = OptionParser(usage,version="%prog " + __version__)
+    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", e.g. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (.gz, .bz2).")
+    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample as well as other covariates such as gender and age. The first variable is the grouping variable (must be categorical); all the other variables are considered covariates (can be categorical or continuous). Sample IDs should match the \"Data file\".")
+    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+    (options,args)=parser.parse_args()
+
+    print ()
+    if not (options.input_file):
+        print (__doc__)
+        parser.print_help()
+        sys.exit(101)
+
+    if not (options.group_file):
+        print (__doc__)
+        parser.print_help()
+        sys.exit(102)
+
+    if not (options.out_file):
+        print (__doc__)
+        parser.print_help()
+        sys.exit(103)
+
+    if not os.path.isfile(options.input_file):
+        print ("Input data file \"%s\" does not exist\n" % options.input_file)
+        sys.exit(104)
+    if not os.path.isfile(options.group_file):
+        print ("Input group file \"%s\" does not exist\n" % options.group_file)
+        sys.exit(105)
+
+    ROUT = open(options.out_file + '.r','w')
+
+    print ('library("aod")', file=ROUT)
+
+    printlog("Read group file \"%s\" ..." % (options.group_file))
+    ####
+    (samples, cv_names, cvs, v_types) = read_grp_file2(options.group_file)
+    for cv_name in cv_names:
+        print ("%s: %s" % (cv_name, v_types[cv_name]))
+        for sample in samples:
+            print ('\t' + sample + '\t' + cvs[cv_name][sample])
+    ####
+
+    # bbr1 fits the beta-binomial model for the first CpG and writes the result
+    # row together with the header line
+    print ('bbr1 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
+    print ('\tdat <- data.frame(m=m, t=t, %s)' % ','.join(['='.join(i) for i in zip(cv_names, cv_names)]), file=ROUT)
+    print ('\tfit <- betabin(cbind(m,t - m) ~ %s, ~1, link=c("logit"), data=na.omit(dat))' % '+'.join(cv_names), file=ROUT)
+    print ('\ttest <- summary(fit)',file=ROUT)
+    print ('\tcoefs <- test@Coef$Estimate',file=ROUT)
+    print ('\tpvals = test@Coef$"Pr(> |z|)"',file=ROUT)
+    print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
+    print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
+    print ('\tnames = row.names(test@Coef)',file=ROUT)
+    print ('\tnames = gsub("2","",names)',file=ROUT)
+    print ('\twrite.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(names, "coef",sep="."), paste(names, "pval",sep=".")))' % (options.out_file + '.results.txt'), file = ROUT)
+    print ('}', file=ROUT)
+    print ('\n', file=ROUT)
+
+    # bbr2 does the same for all subsequent CpGs, appending rows without a header
+    print ('bbr2 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
+    print ('\tdat <- data.frame(m=m, t=t, %s)' % ','.join(['='.join(i) for i in zip(cv_names, cv_names)]), file=ROUT)
+    print ('\tfit <- betabin(cbind(m,t - m) ~ %s, ~1, link=c("logit"), data=na.omit(dat))' % '+'.join(cv_names), file=ROUT)
+    print ('\ttest <- summary(fit)',file=ROUT)
+    print ('\tcoefs <- test@Coef$Estimate',file=ROUT)
+    print ('\tpvals = test@Coef$"Pr(> |z|)"',file=ROUT)
+    print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
+    print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
+    print ('\tnames = row.names(test@Coef)',file=ROUT)
+    print ('\tnames = gsub("2","",names)',file=ROUT)
+    print ('\twrite.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1), quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append=TRUE)' % (options.out_file + '.results.txt'), file = ROUT)
+    print ('}', file=ROUT)
+    print ('\n', file=ROUT)
+
+    printlog("Processing file \"%s\" ..." % (options.input_file))
+    line_num = 0
+    probe_list = []
+    p_list = []
+    for l in ireader.reader(options.input_file):
+        line_num += 1
+        f = l.split()
+        if len(f) == 0: continue
+        if line_num == 1:
+            sample_IDs = f[1:]
+            # check if sample IDs match
+            for s in samples:
+                if s not in sample_IDs:
+                    printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
+                    sys.exit(3)
+            ####
+            # emit one R vector per covariate, ordered by the data file's columns
+            for cv_name in cv_names:
+                if v_types[cv_name] == 'continuous':
+                    print (cv_name + ' <- c(%s)' % (','.join([str(cvs[cv_name][s]) for s in sample_IDs ])), file = ROUT)
+                elif v_types[cv_name] == 'categorical':
+                    print (cv_name + ' <- as.factor(c(%s))' % (','.join([str(cvs[cv_name][s]) for s in sample_IDs ])), file = ROUT)
+                else:
+                    printlog("Unknown variable type!")
+                    sys.exit(1)
+            ####
+            print ('\n', file=ROUT)
+
+            continue
+        else:
+            methyl_reads = []    # c
+            total_reads = []    # n
+            cg_id = f[0]
+            for i in f[1:]:
+                m = re.match(r'(\d+)\s*\,\s*(\d+)', i)
+                if m is None:
+                    methyl_reads.append("NaN")
+                    total_reads.append("NaN")
+                    continue
+                else:
+                    c = int(m.group(1))
+                    n = int(m.group(2))
+                    if n >= c and n > 0:
+                        methyl_reads.append(c)
+                        total_reads.append(n)
+                    else:
+                        printlog("Incorrect data format!")
+                        print (f)
+                        sys.exit(1)
+            if line_num == 2:
+                print ('bbr1(\"%s\", c(%s), c(%s), %s)' % (cg_id, ','.join([str(read) for read in methyl_reads]), ','.join([str(read) for read in total_reads]), ','.join(cv_names)), file=ROUT)
+            else:
+                print ('bbr2(\"%s\", c(%s), c(%s), %s)' % (cg_id, ','.join([str(read) for read in methyl_reads]), ','.join([str(read) for read in total_reads]), ','.join(cv_names)), file=ROUT)
+    ROUT.close()
+
+
+    try:
+        printlog("Running Rscript file \"%s\" ..." % (options.out_file + '.r'))
+        subprocess.call("Rscript %s 2>%s" % (options.out_file + '.r', options.out_file + '.warnings.txt' ), shell=True)
+    except:
+        print ("Error: cannot run Rscript: \"%s\"" % (options.out_file + '.r'), file=sys.stderr)
+        sys.exit(1)
+
+    """
+    printlog("Perform Benjamini-Hochberg (aka FDR) correction ...")
+    probe_list0 = []    #probes without a valid p-value
+    probe_list1 = []
+    p_list1 = []
+    if os.path.exists(options.out_file + '.results.txt') and os.path.getsize(options.out_file + '.results.txt') > 0:
+        for l in ireader.reader(options.out_file + '.results.txt'):
+            f = l.split()
+            id = f[0]
+            try:
+                pv = float(f[1])
+                probe_list1.append(id)
+                p_list1.append(pv)
+            except:
+                probe_list0.append(id)
+                continue
+        q_list1 = padjust.multiple_testing_correction(p_list1)
+
+        OUT = open(options.out_file + '.results.txt','w')
+        print ("probe\tP-value\tadj.Pvalue", file = OUT)
+
+        #probes with valid p and q
+        for id,p,q in zip(probe_list1, p_list1, q_list1):
+            print (id + '\t' + str(p) + '\t' + str(q), file=OUT)
+
+        #probes without valid p and q
+        if len(probe_list0) > 0:
+            for id in probe_list0:
+                print (id + '\tNA\tNA', file=OUT)
+
+        OUT.close()
+    """
+
+if __name__=='__main__':
+    main()
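
The "c,n" cell format accepted above is strict: a cell must match the regex `(\d+)\s*,\s*(\d+)` with c <= n and n > 0; any other cell (e.g. "NA") is recorded as NaN, and a numerically invalid pair aborts the run. A small standalone sketch of that rule, using the same regex as the script (the `parse_proportion` helper and the example tokens are illustrative, not part of the package):

    import re

    # Same pattern the script uses to split "methyl_count,total_count" cells.
    pat = re.compile(r'(\d+)\s*\,\s*(\d+)')

    def parse_proportion(token):
        '''Return (c, n) for a valid "c,n" cell, or ("NaN", "NaN") when it cannot be parsed.'''
        m = pat.match(token)
        if m is None:
            return ("NaN", "NaN")    # non-numerical cell -> treated as missing
        c, n = int(m.group(1)), int(m.group(2))
        if n >= c and n > 0:
            return (c, n)
        raise ValueError("Incorrect data format: %s" % token)    # the script exits here instead

    print(parse_proportion("20,30"))    # (20, 30)
    print(parse_proportion("NA"))       # ('NaN', 'NaN')

A typical invocation would be something like `dmc_bb.py -i proportions.tsv.gz -g group.csv -o output` (file names hypothetical); the script writes an R driver to `output.r` and runs it with Rscript, so R and the "aod" package must be installed.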