cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
cpgtools-2.0.5.data/scripts/dmc_Bayes.py
@@ -0,0 +1,442 @@
+ #!python
+
+ '''
+ #=========================================================================================
+ Description
+ -----------
+ Unlike a statistical test, this program estimates "how different the means of the two
+ groups are" using a Bayesian approach. MCMC is used to estimate the "means", the
+ "difference of means", the "95% HDI (highest posterior density interval)", and the
+ posterior probability that the HDI does NOT include "0".
+
+ It is similar to John Kruschke's BEST algorithm (Bayesian Estimation Supersedes the
+ t test) (http://www.indiana.edu/~kruschke/BEST/).
+
+ Notes
+ -----
+ This program is much slower than a t-test because of the MCMC (Markov chain Monte
+ Carlo) sampling step. Running it with multiple processes is highly recommended.
+
+ '''
+ import sys, os
+ import collections
+ import numpy as np
+ from scipy import stats
+ from optparse import OptionParser
+ from cpgmodule import ireader
+ from cpgmodule.utils import *
+ from cpgmodule import BED
+ from cpgmodule import padjust
+ from cpgmodule._version import __version__
+ from multiprocessing import Process, Manager
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+
+ def dt(x, mu, sig):
+     '''
+     The log probability density of Student's t distribution.
+
+     Parameters
+     ----------
+     x : array_like
+         Array of quantiles
+     mu : float
+         The location (mean) of the t distribution
+     sig : float
+         The scale (standard deviation) of the t distribution
+
+     Returns
+     -------
+     logpdf : array_like
+         Log of the t probability density function evaluated at x
+     '''
+     return np.log(stats.t.pdf(x, loc = mu, scale = sig, df = len(x)-1))
+
+
+ def dnorm(x, mu, sig):
+     '''
+     The log probability density of the normal distribution.
+
+     Parameters
+     ----------
+     x : array_like
+         Array of quantiles
+     mu : float
+         The mean of the normal distribution
+     sig : float
+         The standard deviation of the normal distribution
+
+     Returns
+     -------
+     logpdf : array_like
+         Log of the normal probability density function evaluated at x
+     '''
+     return np.log(stats.norm.pdf(x, mu, sig))
+
+ def dexp(x, l):
+     '''
+     The log probability density of the exponential distribution.
+
+     Parameters
+     ----------
+     x : array_like
+         Array of quantiles
+     l : float
+         The rate parameter (lambda) of the exponential distribution, such that
+         ``pdf = lambda * exp(-lambda * x)``. The `scale` parameter of `stats.expon`
+         corresponds to ``scale = 1/lambda``.
+
+     Returns
+     -------
+     logpdf : array_like
+         Log of the exponential probability density function evaluated at x
+     '''
+     if x > 0:
+         return np.log(stats.expon.pdf(x, scale = 1/l))
+     else:
+         # the exponential density is 0 outside its support, so its log is -inf
+         return -np.inf
+
+ def like(s1, s2, para):
+     '''
+     The log likelihood of observing the data (s1 and s2) given parameters `para`.
+
+     Parameters
+     ----------
+     s1 : array_like
+         Beta values in group-1
+     s2 : array_like
+         Beta values in group-2
+     para : list
+         Parameters [mu1, sig1, mu2, sig2]
+
+     Returns
+     -------
+     likelihood : float
+         The log likelihood of observing the data (s1 and s2) given `para`
+     '''
+     [mu1, sig1, mu2, sig2] = para
+     return np.sum(dt(s1, mu1, sig1)) + np.sum(dt(s2, mu2, sig2))
+
+ def prior(s1, s2, para):
+     '''
+     The log prior probability of the means and standard deviations.
+
+     Parameters
+     ----------
+     s1 : array_like
+         Beta values in group-1
+     s2 : array_like
+         Beta values in group-2
+     para : list
+         Parameters [mu1, sig1, mu2, sig2]
+
+     Returns
+     -------
+     likelihood : float
+         The log prior probability. Each `mean` follows a normal distribution whose
+         mean is the pooled mean and whose std is the pooled std * 1000; each `std`
+         follows an exponential distribution.
+     '''
+     [mu1, sig1, mu2, sig2] = para
+     pooled = np.append(s1, s2)
+     prior_mean = pooled.mean()
+     prior_std = 1000.0*pooled.std()
+
+     return np.sum([dnorm(mu1, prior_mean, prior_std), dnorm(mu2, prior_mean, prior_std), dexp(sig1, 0.1), dexp(sig2, 0.1)])
+
+ def posterior(s1, s2, para):
+     '''
+     The (unnormalized) log posterior: log likelihood plus log prior.
+
+     Parameters
+     ----------
+     s1 : array_like
+         Beta values in group-1
+     s2 : array_like
+         Beta values in group-2
+     para : list
+         Parameters [mu1, sig1, mu2, sig2]
+
+     Returns
+     -------
+     likelihood : float
+         The log posterior density (up to an additive constant)
+     '''
+     return like(s1, s2, para) + prior(s1, s2, para)
+
+ def computeHDI(chain, interval = .95):
+     '''
+     Compute the highest density interval (HDI) of `chain` at the given
+     credibility level (default 0.95). Returns [lower, upper].
+     '''
+     # sort the posterior samples in place
+     chain.sort()
+     # how many samples were generated?
+     nSample = chain.size
+     # how many samples must go in the HDI?
+     nSampleCred = int(np.ceil(nSample * interval))
+     # number of candidate intervals to compare
+     nCI = nSample - nSampleCred
+     # width of every candidate interval
+     width = np.array([chain[i+nSampleCred] - chain[i] for i in range(nCI)])
+     # index of the lower bound of the shortest interval (which is the HDI)
+     best = width.argmin()
+     HDI = [chain[best], chain[best + nSampleCred]]
+     return HDI
+
+
+ def beta_bayes(results, id, s1, s2, seed, niter = 10000, nburn_in = 500):
+     '''
+     Estimate the two group means with a Metropolis-Hastings sampler and append
+     [ID, mu1, mu2, mu_diff, HDI_low, HDI_high, probability] to `results`.
+     See: https://stats.stackexchange.com/questions/130389/bayesian-equivalent-of-two-sample-t-test
+     '''
+     np.random.seed(seed)
+
+     mu1_samples = []    # means sampled by MCMC for s1
+     mu2_samples = []    # means sampled by MCMC for s2
+
+     # run MCMC (Metropolis-Hastings sampling algorithm)
+     # initialization: mu1, sig1, mu2, sig2
+     parameters = np.array([np.mean(s1), np.std(s1), np.mean(s2), np.std(s2)])
+     increment = (s1.std() + s2.std())/10    # one fifth of the average std
+     for iteration in np.arange(1, niter):
+         candidate = parameters + np.random.normal(0, increment, 4)
+         # reject candidates with negative standard deviations
+         if candidate[1] < 0 or candidate[3] < 0:
+             continue
+         ratio = np.exp(posterior(s1, s2, candidate) - posterior(s1, s2, parameters))
+         if np.random.uniform() < ratio:
+             parameters = candidate
+         if iteration < nburn_in:
+             continue
+         mu1_samples.append(parameters[0])
+         mu2_samples.append(parameters[2])
+
+     # calculate estimated means
+     mu1_samples = np.array(mu1_samples)
+     mu2_samples = np.array(mu2_samples)
+     est_mu1 = mu1_samples.mean()    # estimated mu1
+     est_mu2 = mu2_samples.mean()    # estimated mu2
+
+     # posterior probability that the difference has the same sign as its median
+     diff = (mu1_samples - mu2_samples)
+     diff_median = np.median(diff)
+     if diff_median < 0:
+         prob = np.mean(diff < 0)
+     else:
+         prob = np.mean(diff > 0)
+
+     # calculate the 95% HDI of the difference
+     diff_HDI_low, diff_HDI_high = computeHDI(diff)
+
+     # CpG_ID, mean of group1, mean of group2, diff of means, 95% HDI low, 95% HDI high, probability
+     results.append([id, est_mu1, est_mu2, est_mu1 - est_mu2, diff_HDI_low, diff_HDI_high, prob])
+
+
+ def test():
+     np.random.seed(99)
+     sample1 = np.random.normal(100, 10, 8)
+     sample2 = np.random.normal(150, 15, 10)
+     print (','.join([str(i) for i in sample1]))
+     print (','.join([str(i) for i in sample2]))
+     results = []
+     beta_bayes(results, 'test', sample1, sample2, seed = 99)
+     print (results)
+
+ if __name__=='__main__':
+     #test()
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or a compressed file (.gz, .bz2).")
+     parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated, 2-column file with the 1st column containing sample IDs and the 2nd column containing group IDs. It must have a header row. Sample IDs must match the \"Data file\". Note: only two-group comparisons are supported.")
+     parser.add_option("-n","--niter",action="store",type="int", default=5000,dest="n_iter",help="Number of iterations of the MCMC Metropolis-Hastings algorithm used to draw samples from the posterior distribution. default=%default")
+     parser.add_option("-b","--burnin",action="store",type="int", default=500,dest="n_burn",help="Number of initial samples to discard. These samples are usually not valid because the Markov chain has not yet stabilized to its stationary distribution. default=%default")
+     parser.add_option("-p","--processor",action="store",type="int",dest="n_process",default=1,help="The number of processes. default=%default")
+     parser.add_option("-s","--seed",action="store",type='int', dest="seed",default=99, help="The seed used by the random number generator. default=%default")
+     parser.add_option("-o","--output",action="store",type="string", dest="out_file",help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+
+     if not (options.group_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(102)
+
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+     if options.n_iter <= 0:
+         print ("--niter must be a positive integer")
+         parser.print_help()
+         sys.exit(104)
+     if options.n_burn <= 0:
+         print ("--burnin must be a positive integer")
+         parser.print_help()
+         sys.exit(105)
+     if options.n_process <= 0:
+         print ("--processor must be a positive integer")
+         parser.print_help()
+         sys.exit(106)
+
+     np.random.seed(options.seed)
+     printlog("Read group file \"%s\" ..." % (options.group_file))
+     (ss,gs) = read_grp_file1(options.group_file)
+
+     # map each sample ID to its group ID, and each group ID to its sample IDs
+     s2g = {}
+     for s,g in zip(ss,gs):
+         s2g[s] = g
+
+     g2s = collections.defaultdict(list)
+     for s,g in zip(ss, gs):
+         g2s[g].append(s)
+
+     group_IDs = sorted(g2s.keys())
+     for g in group_IDs:
+         print ("\tGroup %s has %d samples:" % (g, len(g2s[g])), file=sys.stderr)
+         print ('\t\t' + ','.join(g2s[g]), file=sys.stderr)
+
+     if len(group_IDs) != 2:
+         printlog("You must have two groups!")
+         sys.exit(1)
+
+
+     manager = Manager()
+     # list of lists shared between the main process and beta_bayes():
+     # [ID, mu1, mu2, mu_diff, HDI_low, HDI_high, probability]
+     results = manager.list()
+
+     printlog("Read data file \"%s\" ..." % (options.input_file))
+     line_num = 0
+     p_count = 0
+     jobs = []
+     for l in ireader.reader(options.input_file):
+         line_num += 1
+         f = l.split()
+         if len(f) == 0: continue
+         if line_num == 1:
+             sample_IDs = f[1:]
+             # check that every sample ID in the group file is present
+             for s in s2g:
+                 if s not in sample_IDs:
+                     printlog("Cannot find sample ID \"%s\" in file \"%s\"" % (s, options.input_file))
+                     sys.exit(3)
+             g_IDs = [s2g[i] for i in sample_IDs]
+
+         else:
+             probe_ID = f[0]
+             p_count += 1
+             group1 = []    # beta values in group1
+             group2 = []    # beta values in group2
+
+             beta_values = f[1:]
+             for g,b in zip(g_IDs, beta_values):
+                 # skip non-numerical (missing) values
+                 try:
+                     b = float(b)
+                 except:
+                     continue
+                 if g == group_IDs[0]:
+                     group1.append(b)
+                 elif g == group_IDs[1]:
+                     group2.append(b)
+
+             group1 = np.array(group1)
+             group2 = np.array(group2)
+             p = Process(name = probe_ID, target = beta_bayes, args = (results, probe_ID, group1, group2, options.seed, options.n_iter, options.n_burn))
+             p.start()
+             jobs.append(p)
+
+             # run at most n_process probes at a time
+             if p_count == options.n_process:
+                 for proc in jobs: proc.join()    # wait for the batch to complete
+                 p_count = 0
+                 jobs = []
+                 print("Finished %d\r" % (line_num - 1), end = '', file=sys.stderr)
+     for proc in jobs: proc.join()
+
+     OUT = open(options.out_file + '.bayes.tsv','w')
+     print ("\t".join(["ID", "mu1", "mu2", "mu_diff", "mu_diff (95% HDI)", "Probability"]), file=OUT)
+     for r in results:
+         print ("%s\t%f\t%f\t%f\t(%f,%f)\t%f" % (r[0], r[1], r[2], r[3], r[4], r[5], r[6]), file = OUT)
+     OUT.close()
+
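
For readers who want to see the statistical machinery of dmc_Bayes.py in isolation, the sketch below reproduces its core on toy data: a Metropolis-Hastings random walk over (mu1, sig1, mu2, sig2) with the same t likelihood and weakly informative priors as like() and prior(), followed by the shortest-interval HDI computation. It is an illustration written for this page, not part of the package; the toy beta values and the helper names log_posterior and hdi are invented for the example.

import numpy as np
from scipy import stats

def log_posterior(s1, s2, mu1, sig1, mu2, sig2):
    # t likelihood for each group plus the priors used by like() and prior():
    # normal priors on the means, exponential(rate=0.1) priors on the stds
    if sig1 <= 0 or sig2 <= 0:
        return -np.inf
    pooled = np.append(s1, s2)
    lp  = np.sum(stats.t.logpdf(s1, df=len(s1)-1, loc=mu1, scale=sig1))
    lp += np.sum(stats.t.logpdf(s2, df=len(s2)-1, loc=mu2, scale=sig2))
    lp += stats.norm.logpdf(mu1, pooled.mean(), 1000.0 * pooled.std())
    lp += stats.norm.logpdf(mu2, pooled.mean(), 1000.0 * pooled.std())
    lp += stats.expon.logpdf(sig1, scale=1/0.1) + stats.expon.logpdf(sig2, scale=1/0.1)
    return lp

def hdi(chain, interval=0.95):
    # the shortest interval containing `interval` of the posterior draws
    chain = np.sort(chain)
    n_in = int(np.ceil(chain.size * interval))
    widths = chain[n_in:] - chain[:chain.size - n_in]
    i = widths.argmin()
    return chain[i], chain[i + n_in]

rng = np.random.default_rng(99)
s1 = rng.normal(0.30, 0.05, 8)     # toy beta values, group 1
s2 = rng.normal(0.45, 0.05, 10)    # toy beta values, group 2

theta = np.array([s1.mean(), s1.std(), s2.mean(), s2.std()])
step = (s1.std() + s2.std()) / 10
draws = []
for it in range(10000):
    cand = theta + rng.normal(0, step, 4)
    # accept with probability min(1, posterior ratio)
    if rng.uniform() < np.exp(log_posterior(s1, s2, *cand) - log_posterior(s1, s2, *theta)):
        theta = cand
    if it >= 500:                  # keep draws only after burn-in
        draws.append(theta[0] - theta[2])

low, high = hdi(np.array(draws))
print("mean diff = %.4f, 95%% HDI = (%.4f, %.4f)" % (np.mean(draws), low, high))

On data this well separated the HDI should comfortably exclude 0, which is exactly the per-CpG evidence dmc_Bayes.py reports.
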
cpgtools-2.0.5.data/scripts/dmc_bb.py
@@ -0,0 +1,221 @@
+ #!python
+
+ """
+ Description
+ -----------
+ This program performs differential CpG analysis using a beta-binomial model on
+ methylation proportions (given as "c,n", where "c" is the number of reads with
+ methylated C, "n" is the total number of reads, and both are non-negative
+ integers with c <= n).
+
+ Example of input
+ ----------------
+ The example below shows input data for 2 CpGs across 3 groups (A, B, and C),
+ with each group having 3 replicates:
+ cgID   A_1      A_2      A_3   B_1    B_2    B_3    C_1    C_2    C_3
+ CpG_1  129,170  166,178  7,9   16,16  10,10  10,15  11,15  16,22  20,36
+ CpG_2  0,77     0,99     0,85  0,77   1,37   3,37   0,42   0,153  0,6
+
+ Notes
+ -----
+ 1. It can handle covariates.
+ 2. Input values are proportions, not beta values.
+ 3. You must install the R package "aod" before running this program
+    (https://cran.r-project.org/web/packages/aod/index.html).
+ """
+
+
+ import sys, os
+ import subprocess
+ import re
+ from optparse import OptionParser
+ from cpgmodule import ireader
+ from cpgmodule.utils import *
+ from cpgmodule import BED
+ from cpgmodule import padjust
+ from cpgmodule._version import __version__
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+
+ def main():
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing methylation proportions (represented as \"methyl_count,total_count\", e.g. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or a compressed file (.gz, .bz2).")
+     parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample as well as other covariates such as gender and age. The first variable is the grouping variable (must be categorical); all other variables are treated as covariates (categorical or continuous). Sample IDs must match the \"Data file\".")
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+
+     if not (options.group_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(102)
+
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     if not os.path.isfile(options.input_file):
+         print ("Input data file \"%s\" does not exist\n" % options.input_file)
+         sys.exit(104)
+     if not os.path.isfile(options.group_file):
+         print ("Input group file \"%s\" does not exist\n" % options.group_file)
+         sys.exit(105)
+
+     ROUT = open(options.out_file + '.r','w')
+
+     print ('library("aod")', file=ROUT)
+
+     printlog("Read group file \"%s\" ..." % (options.group_file))
+     (samples, cv_names, cvs, v_types) = read_grp_file2(options.group_file)
+     for cv_name in cv_names:
+         print ("%s: %s" % (cv_name, v_types[cv_name]))
+         for sample in samples:
+             print ('\t' + sample + '\t' + str(cvs[cv_name][sample]))
+
+     # bbr1() writes the header row of the results file; bbr2() appends to it
+     print ('bbr1 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
+     print ('\tdat <- data.frame(m=m, t=t, %s)' % ','.join(['='.join(i) for i in zip(cv_names, cv_names)]), file=ROUT)
+     print ('\tfit <- betabin(cbind(m,t - m) ~ %s, ~1, link=c("logit"), data=na.omit(dat))' % '+'.join(cv_names), file=ROUT)
+     print ('\ttest <- summary(fit)',file=ROUT)
+     print ('\tcoefs <- test@Coef$Estimate',file=ROUT)
+     print ('\tpvals = test@Coef$"Pr(> |z|)"',file=ROUT)
+     print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
+     print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
+     print ('\tnames = row.names(test@Coef)',file=ROUT)
+     print ('\tnames = gsub("2","",names)',file=ROUT)
+     print ('\twrite.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(names, "coef",sep="."), paste(names, "pval",sep=".")))' % (options.out_file + '.results.txt'), file = ROUT)
+     print ('}', file=ROUT)
+     print ('\n', file=ROUT)
+
+     print ('bbr2 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
+     print ('\tdat <- data.frame(m=m, t=t, %s)' % ','.join(['='.join(i) for i in zip(cv_names, cv_names)]), file=ROUT)
+     print ('\tfit <- betabin(cbind(m,t - m) ~ %s, ~1, link=c("logit"), data=na.omit(dat))' % '+'.join(cv_names), file=ROUT)
+     print ('\ttest <- summary(fit)',file=ROUT)
+     print ('\tcoefs <- test@Coef$Estimate',file=ROUT)
+     print ('\tpvals = test@Coef$"Pr(> |z|)"',file=ROUT)
+     print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
+     print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
+     print ('\tnames = row.names(test@Coef)',file=ROUT)
+     print ('\tnames = gsub("2","",names)',file=ROUT)
+     print ('\twrite.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1), quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append=TRUE)' % (options.out_file + '.results.txt'), file = ROUT)
+     print ('}', file=ROUT)
+     print ('\n', file=ROUT)
+
+     printlog("Processing file \"%s\" ..." % (options.input_file))
+     line_num = 0
+     for l in ireader.reader(options.input_file):
+         line_num += 1
+         f = l.split()
+         if len(f) == 0: continue
+         if line_num == 1:
+             sample_IDs = f[1:]
+             # check that every sample ID in the group file is present
+             for s in samples:
+                 if s not in sample_IDs:
+                     printlog("Cannot find sample ID \"%s\" in file \"%s\"" % (s, options.input_file))
+                     sys.exit(3)
+             # write the covariates to the R script
+             for cv_name in cv_names:
+                 if v_types[cv_name] == 'continuous':
+                     print (cv_name + ' <- c(%s)' % (','.join([str(cvs[cv_name][s]) for s in sample_IDs ])), file = ROUT)
+                 elif v_types[cv_name] == 'categorical':
+                     print (cv_name + ' <- as.factor(c(%s))' % (','.join([str(cvs[cv_name][s]) for s in sample_IDs ])), file = ROUT)
+                 else:
+                     printlog("unknown variable type!")
+                     sys.exit(1)
+             print ('\n', file=ROUT)
+             continue
+         else:
+             methyl_reads = []    # c
+             total_reads = []     # n
+             cg_id = f[0]
+             for i in f[1:]:
+                 m = re.match(r'(\d+)\s*\,\s*(\d+)', i)
+                 if m is None:
+                     # missing or malformed cell
+                     methyl_reads.append("NaN")
+                     total_reads.append("NaN")
+                     continue
+                 else:
+                     c = int(m.group(1))
+                     n = int(m.group(2))
+                     if n >= c and n > 0:
+                         methyl_reads.append(c)
+                         total_reads.append(n)
+                     else:
+                         printlog("Incorrect data format!")
+                         print (f)
+                         sys.exit(1)
+             if line_num == 2:
+                 print ('bbr1(\"%s\", c(%s), c(%s), %s)' % (cg_id, ','.join([str(read) for read in methyl_reads]), ','.join([str(read) for read in total_reads]), ','.join(cv_names)), file=ROUT)
+             else:
+                 print ('bbr2(\"%s\", c(%s), c(%s), %s)' % (cg_id, ','.join([str(read) for read in methyl_reads]), ','.join([str(read) for read in total_reads]), ','.join(cv_names)), file=ROUT)
+     ROUT.close()
+
+     printlog("Running Rscript file \"%s\" ..." % (options.out_file + '.r'))
+     ret = subprocess.call("Rscript %s 2>%s" % (options.out_file + '.r', options.out_file + '.warnings.txt'), shell=True)
+     if ret != 0:
+         print ("Error: cannot run Rscript: \"%s\"" % (options.out_file + '.r'), file=sys.stderr)
+         sys.exit(1)
+
+     """
+     printlog("Perform Benjamini-Hochberg (aka FDR) correction ...")
+     probe_list0 = []    # probes without a valid p-value
+     probe_list1 = []
+     p_list1 = []
+     if os.path.exists(options.out_file + '.results.txt') and os.path.getsize(options.out_file + '.results.txt') > 0:
+         for l in ireader.reader(options.out_file + '.results.txt'):
+             f = l.split()
+             id = f[0]
+             try:
+                 pv = float(f[1])
+                 probe_list1.append(id)
+                 p_list1.append(pv)
+             except:
+                 probe_list0.append(id)
+                 continue
+         q_list1 = padjust.multiple_testing_correction(p_list1)
+
+         OUT = open(options.out_file + '.results.txt','w')
+         print ("probe\tP-value\tadj.Pvalue", file = OUT)
+
+         # probes with valid p and q values
+         for id,p,q in zip(probe_list1, p_list1, q_list1):
+             print (id + '\t' + str(p) + '\t' + str(q), file=OUT)
+
+         # probes without valid p and q values
+         if len(probe_list0) > 0:
+             for id in probe_list0:
+                 print (id + '\tNA\tNA', file=OUT)
+
+         OUT.close()
+     """
+
+ if __name__=='__main__':
+     main()
+
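
As a quick illustration of the "c,n" input format that dmc_bb.py consumes, the sketch below mirrors its row-parsing logic: every cell must match "methyl_count,total_count" with c <= n and n > 0, and anything else becomes NaN so the generated R code can drop it with na.omit(). The helper parse_proportions and the demo row are written for this page, not shipped in the package.

import re

PAIR = re.compile(r'(\d+)\s*,\s*(\d+)')

def parse_proportions(cells):
    methyl, total = [], []
    for cell in cells:
        m = PAIR.match(cell)
        if m is None:
            # missing or malformed cell -> NaN placeholders
            methyl.append("NaN")
            total.append("NaN")
            continue
        c, n = int(m.group(1)), int(m.group(2))
        if n >= c and n > 0:
            methyl.append(c)
            total.append(n)
        else:
            raise ValueError("bad cell %r: need c <= n and n > 0" % cell)
    return methyl, total

# Part of the CpG_1 row from the docstring example, plus one missing value:
m, t = parse_proportions(["129,170", "166,178", "7,9", "NA"])
print(m)    # [129, 166, 7, 'NaN']
print(t)    # [170, 178, 9, 'NaN']
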