cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program performs differential CpG analysis using Fisher's exact test. It only applies
|
|
7
|
+
to two-sample comparison with no replicates. If replicates are provided, *methyl reads*
|
|
8
|
+
and *total reads* of all replicates will be summed
|
|
9
|
+
|
|
10
|
+
Example of input data file
|
|
11
|
+
--------------------------
|
|
12
|
+
cgID sample_1 sample_2
|
|
13
|
+
CpG_1 129,170 166,178
|
|
14
|
+
CpG_2 24,77 67,99
|
|
15
|
+
|
|
16
|
+
the number before "," indicates *number of methyl reads*
|
|
17
|
+
the number after "," indicates *number of total reads*
|
|
18
|
+
|
|
19
|
+
Output
|
|
20
|
+
-------
|
|
21
|
+
Three columns ("Odds ratio", "pvalue" and "adjusted pvalue") will be appended to the input data table.
|
|
22
|
+
#=========================================================================================
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
import sys,os
|
|
27
|
+
import collections
|
|
28
|
+
import subprocess
|
|
29
|
+
import numpy as np
|
|
30
|
+
import re
|
|
31
|
+
from scipy import stats
|
|
32
|
+
from optparse import OptionParser
|
|
33
|
+
from cpgmodule import ireader
|
|
34
|
+
from cpgmodule.utils import *
|
|
35
|
+
from cpgmodule import BED
|
|
36
|
+
from cpgmodule import padjust
|
|
37
|
+
from cpgmodule._version import __version__
|
|
38
|
+
|
|
39
|
+
__author__ = "Liguo Wang"
|
|
40
|
+
__copyright__ = "Copyleft"
|
|
41
|
+
__credits__ = []
|
|
42
|
+
__license__ = "GPL"
|
|
43
|
+
__maintainer__ = "Liguo Wang"
|
|
44
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
45
|
+
__status__ = "Development"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def main():
    """Run Fisher's exact test for every CpG of a two-group read-count table.

    Command-line options (parsed with optparse):
      -i/--input_file  data table; the first non-blank row holds the sample
                       IDs, every following row holds a CpG ID and one
                       "methyl_count,total_count" cell per sample.
      -g/--group       group file, read with read_grp_file1() into parallel
                       sequences of sample IDs and group IDs.
      -o/--output      output prefix; results go to "<prefix>.pval.txt".

    For each CpG the methylated and unmethylated read counts of all samples of
    a group are summed, the resulting 2x2 table is handed to
    scipy.stats.fisher_exact, and the collected p-values are adjusted with
    padjust.multiple_testing_correction. Every input row is then written back
    with the odds ratio, p-value and adjusted p-value appended.

    Exit status: 101/102/103 when -i/-g/-o is missing, 1 when the group file
    does not define exactly two groups or a count cell has total < methyl or
    total == 0, 3 when a sample of the group file is missing from the header.
    """
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological groups of each sample. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\".")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.group_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    out_name = options.out_file + '.pval.txt'

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (s,g) = read_grp_file1(options.group_file)
    s2g = dict(zip(s,g))
    g2s = collections.defaultdict(list)
    for sample, group in s2g.items():
        g2s[group].append(sample)

    group_IDs = sorted(g2s.keys())
    for group in group_IDs:
        print ("\tGroup %s has %d samples:" % (group, len(g2s[group])))
        print ('\t\t' + ','.join(g2s[group]))

    if len(group_IDs) != 2:
        # printlog() is called with a single message argument everywhere else
        # in this script, so no extra keyword is passed here either.
        printlog("You must have two groups!")
        sys.exit(1)

    # Compiled once; re.match semantics (anchored at the start only) are kept.
    count_pattern = re.compile(r'(\d+)\s*\,\s*(\d+)')

    header_seen = False
    probe_list = []
    p_list = []
    or_list = []
    for l in ireader.reader(options.input_file):
        f = l.split()
        if len(f) == 0: continue
        if not header_seen:
            # The first non-blank line carries the sample IDs.
            header_seen = True
            sample_IDs = f[1:]
            # check if sample ID matches
            for s in s2g:
                if s not in sample_IDs:
                    printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                    sys.exit(3)
            continue

        cg_id = f[0]
        probe_list.append(cg_id)
        proportions = f[1:]
        g2values = {}
        for group in group_IDs:
            g2values[group] = {'methyl': 0, 'unmethyl': 0}
        for s,p in zip(sample_IDs, proportions):
            gid = s2g.get(s)
            if gid is None:
                # Column present in the data file but not assigned to a group:
                # it cannot contribute to either side of the 2x2 table.
                continue
            m = count_pattern.match(p)
            if m is None:
                # Cells that are not "count,count" are treated as missing.
                continue
            c = int(m.group(1))
            n = int(m.group(2))
            if n >= c and n > 0:
                g2values[gid]['methyl'] += c
                g2values[gid]['unmethyl'] += (n-c)
            else:
                printlog("Incorrect data format!")
                print (f)
                sys.exit(1)
        first, second = g2values[group_IDs[0]], g2values[group_IDs[1]]
        (odds, pval) = stats.fisher_exact([ [first['methyl'], first['unmethyl']],[second['methyl'], second['unmethyl']] ])
        p_list.append(pval)
        or_list.append(odds)

    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    adjusted_p = {}
    q_list = padjust.multiple_testing_correction(p_list)
    for probe,o,p,q in zip(probe_list, or_list, p_list, q_list):
        adjusted_p[probe] = '\t'.join([str(i) for i in (o,p,q)])

    printlog("Writing to %s" % (out_name))
    header_seen = False
    with open(out_name,'w') as FOUT:
        for l in ireader.reader(options.input_file):
            f = l.split()
            # Blank lines were skipped in the first pass; skip them here too so
            # the header/data distinction stays in step and f[0] always exists.
            if len(f) == 0: continue
            if not header_seen:
                header_seen = True
                print (l + '\tOddsRatio\tpval\tadj.pval', file=FOUT)
            else:
                print (l + '\t' + adjusted_p[f[0]], file=FOUT)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
if __name__=='__main__':
|
|
161
|
+
main()
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program performs differential CpG analysis using linear regression model based on
|
|
7
|
+
beta values.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
import sys,os
|
|
12
|
+
import collections
|
|
13
|
+
import subprocess
|
|
14
|
+
import numpy as np
|
|
15
|
+
from scipy import stats
|
|
16
|
+
from optparse import OptionParser
|
|
17
|
+
from cpgmodule import ireader
|
|
18
|
+
from cpgmodule.utils import *
|
|
19
|
+
from cpgmodule import BED
|
|
20
|
+
from cpgmodule import padjust
|
|
21
|
+
from cpgmodule._version import __version__
|
|
22
|
+
|
|
23
|
+
__author__ = "Liguo Wang"
|
|
24
|
+
__copyright__ = "Copyleft"
|
|
25
|
+
__credits__ = []
|
|
26
|
+
__license__ = "GPL"
|
|
27
|
+
__maintainer__ = "Liguo Wang"
|
|
28
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
29
|
+
__status__ = "Development"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def main():
    """Fit a Gaussian GLM per CpG through a generated R script and FDR-adjust.

    Command-line options (parsed with optparse):
      -i/--input_file  table of beta values; the first non-blank row holds the
                       sample IDs, the first column the CpG/probe IDs.
      -g/--group       group/covariate file, read with read_grp_file2(); its
                       first variable is the primary (grouping) variable.
      -o/--output      output prefix.

    Files written, all named from the output prefix:
      <prefix>.r             generated R script (one lrf1/lrf2 call per CpG)
      <prefix>.warnings.txt  stderr of the Rscript run
      <prefix>.results.txt   coefficients and p-values written by the R script
      <prefix>.pval.txt      input rows plus "pval" and "adj.pval" columns

    The p-value of the primary variable is read back from the results file,
    adjusted with padjust.multiple_testing_correction and appended to every
    input row; rows without a usable p-value get "NaN\tNaN".

    Exit status: 101-103 for missing options, 104/105 for non-existent input
    or group file, 3 for a sample missing from the data header, 1 for an
    unknown variable type, a failure to launch Rscript or a missing results
    file.
    """
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\".")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.group_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    if not os.path.isfile(options.input_file):
        print ("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.group_file):
        # Report the group file here, not the data file.
        print ("Input group file \"%s\" does not exist\n" % options.group_file)
        sys.exit(105)

    r_script = options.out_file + '.r'
    results_file = options.out_file + '.results.txt'
    warnings_file = options.out_file + '.warnings.txt'
    pval_file = options.out_file + '.pval.txt'

    with open(r_script,'w') as ROUT:
        printlog("Read group file \"%s\" ..." % (options.group_file))
        (samples,cv_names, cvs, v_types) = read_grp_file2(options.group_file)
        for cv_name in cv_names:
            print ("%s: %s" % (cv_name, v_types[cv_name]))
            for sample in samples:
                print ('\t' + sample + '\t' + cvs[cv_name][sample])

        primary_variable = cv_names[0]

        # lrf1 creates the results file together with its header row ...
        print ('lrf1 <- function (cgid, y, %s){' % ','.join(cv_names), file=ROUT)
        print ('try(fit <- glm(y ~ %s, family=gaussian))' % ('+'.join(cv_names)), file=ROUT)
        print ('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print ('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print ( 'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(names(coefs), "coef", sep="."), paste(names(pvals), "pval",sep=".")))' % (results_file), file = ROUT)
        print ('}', file=ROUT)
        print ('\n', file=ROUT)

        # ... lrf2 appends one row without a header.
        print ('lrf2 <- function (cgid, y,%s){' % ','.join(cv_names), file=ROUT)
        print ('try(fit <- glm(y ~ %s, family=gaussian))' % ('+'.join(cv_names)), file=ROUT)
        print ('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print ('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print ( 'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append = TRUE)' % (results_file), file = ROUT)
        print ('}', file=ROUT)
        print ('\n', file=ROUT)

        printlog("Processing file \"%s\" ..." % (options.input_file))
        header_seen = False
        first_probe = True
        for l in ireader.reader(options.input_file):
            f = l.split()
            if len(f) == 0: continue
            if not header_seen:
                header_seen = True
                sample_IDs = f[1:]
                # check if sample ID matches
                for s in samples:
                    if s not in sample_IDs:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                # NOTE(review): a data column that is absent from the group
                # file makes the cvs[cv_name][s] lookups below fail - confirm
                # whether such columns should be rejected or dropped.
                for cv_name in cv_names:
                    if v_types[cv_name] == 'continuous':
                        print (cv_name + ' <- c(%s)' % (','.join([str(cvs[cv_name][s]) for s in sample_IDs ])), file = ROUT)
                    elif v_types[cv_name] == 'categorical':
                        print (cv_name + ' <- as.factor(c(%s))' % (','.join([str(cvs[cv_name][s]) for s in sample_IDs ])), file = ROUT)
                    else:
                        printlog("unknown vaiable type!")
                        sys.exit(1)
                print ('\n', file=ROUT)
                continue

            beta_values = []
            cg_id = f[0]
            for i in f[1:]:
                try:
                    beta = float(i)
                except ValueError:
                    beta_values.append("NaN")
                    continue
                # float() accepts "nan"/"inf", but str() renders them as
                # nan/inf, which are undefined symbols in R; emit NaN instead.
                beta_values.append(beta if np.isfinite(beta) else "NaN")

            # The first data row must go through lrf1 so that the results file
            # starts with a header, wherever that row sits in the input.
            r_func = 'lrf1' if first_probe else 'lrf2'
            first_probe = False
            print ('%s(\"%s\", c(%s), %s)' % (r_func, cg_id, ','.join([str(i) for i in beta_values]), ','.join(cv_names)), file=ROUT)

    # Drop results of an earlier run so that a failed R run cannot be mistaken
    # for a successful one (lrf1 recreates the file anyway).
    if os.path.exists(results_file):
        os.remove(results_file)

    printlog("Runing Rscript file \"%s\" ..." % (r_script))
    try:
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters work, and a missing Rscript raises OSError.
        with open(warnings_file, 'w') as warn_out:
            status = subprocess.call(["Rscript", r_script], stderr=warn_out)
    except OSError:
        print ("Error: cannot run Rscript: \"%s\"" % (r_script), file=sys.stderr)
        sys.exit(1)
    if status != 0:
        printlog("Rscript exited with status %d, see \"%s\"" % (status, warnings_file))

    # read
    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    if not os.path.isfile(results_file):
        printlog("Cannot find result file \"%s\", see \"%s\"" % (results_file, warnings_file))
        sys.exit(1)

    primary_v_index = None
    p_list = []
    probe_list = []
    with open(results_file, 'r') as RESULTS:
        for line_num, l in enumerate(RESULTS, 1):
            v = l.strip().split()
            if line_num == 1:
                # When several header columns match (e.g. a factor with more
                # than two levels) the last one is used, as before.
                for i,name in enumerate(v):
                    if name.startswith(primary_variable) and name.endswith('.pval'):
                        primary_v_index = i
                if primary_v_index is None:
                    # No p-value can be assigned; every row will get NaN.
                    printlog("Cannot find p-value column of \"%s\" in \"%s\"" % (primary_variable, results_file))
                    break
                continue
            try:
                pv = float(v[primary_v_index])
            except (IndexError, ValueError):
                # e.g. "NA" written by R, or a truncated row
                continue
            if pv >= 0 and pv <= 1:
                p_list.append(pv)
                probe_list.append(v[0])

    # adjust
    q_list = padjust.multiple_testing_correction(p_list)

    # write
    adjusted_p = {}
    for probe,p,q in zip(probe_list, p_list, q_list):
        adjusted_p[probe] = '\t'.join([str(i) for i in (p,q)])

    printlog("Writing to %s" % (pval_file))
    header_seen = False
    with open(pval_file,'w') as FOUT:
        for l in ireader.reader(options.input_file):
            f = l.split()
            # Skip blank lines exactly as the first pass does.
            if len(f) == 0: continue
            if not header_seen:
                header_seen = True
                print (l + '\tpval\tadj.pval', file=FOUT)
            else:
                print (l + '\t' + adjusted_p.get(f[0], 'NaN\tNaN'), file=FOUT)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
if __name__=='__main__':
|
|
191
|
+
main()
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program performs differential CpG analysis using the logistic regression model based on
|
|
7
|
+
methylation proportions (in the form of "c,n" where "c" indicates "Number of reads with
|
|
8
|
+
methylated C", and "n" indicates "Number of total reads". Both c and n are non-negative
|
|
9
|
+
integers and c <= n).
|
|
10
|
+
|
|
11
|
+
Example of input data
|
|
12
|
+
---------------------
|
|
13
|
+
The example below shows input data for 2 CpGs in 3 groups (A, B, and C)
|
|
14
|
+
with each group having 3 replicates:
|
|
15
|
+
|
|
16
|
+
cgID A_1 A_2 A_3 B_1 B_2 B_3 C_1 C_2 C_3
|
|
17
|
+
CpG_1 129,170 166,178 7,9 16,16 10,10 10,15 11,15 16,22 20,36
|
|
18
|
+
CpG_2 0,77 0,99 0,85 0,77 1,37 3,37 0,42 0,153 0,6
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
import sys,os
|
|
24
|
+
import collections
|
|
25
|
+
import subprocess
|
|
26
|
+
import numpy as np
|
|
27
|
+
import re
|
|
28
|
+
from scipy import stats
|
|
29
|
+
from optparse import OptionParser
|
|
30
|
+
from cpgmodule import ireader
|
|
31
|
+
from cpgmodule.utils import *
|
|
32
|
+
from cpgmodule import BED
|
|
33
|
+
from cpgmodule import padjust
|
|
34
|
+
from cpgmodule._version import __version__
|
|
35
|
+
|
|
36
|
+
__author__ = "Liguo Wang"
|
|
37
|
+
__copyright__ = "Copyleft"
|
|
38
|
+
__credits__ = []
|
|
39
|
+
__license__ = "GPL"
|
|
40
|
+
__maintainer__ = "Liguo Wang"
|
|
41
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
42
|
+
__status__ = "Development"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def main():
    """Fit a (quasi)binomial GLM per CpG through a generated R script.

    Command-line options (parsed with optparse):
      -i/--input_file  table of "methyl_count,total_count" cells; the first
                       non-blank row holds the sample IDs, the first column
                       the CpG/probe IDs.
      -g/--group       group/covariate file, read with read_grp_file2(); its
                       first variable is the primary (grouping) variable.
      -f/--family      1 = quasibinomial (default), 2 = binomial.
      -o/--output      output prefix.

    Files written, all named from the output prefix:
      <prefix>.r             generated R script (one lrf1/lrf2 call per CpG)
      <prefix>.warnings.txt  stderr of the Rscript run
      <prefix>.results.txt   coefficients and p-values written by the R script
      <prefix>.pval.txt      input rows plus "pval" and "adj.pval" columns

    The p-value of the primary variable is read back from the results file,
    adjusted with padjust.multiple_testing_correction and appended to every
    input row; rows without a usable p-value get "NaN\tNaN".

    Exit status: 101-103 for missing options, 104/105 for non-existent input
    or group file, 106 for an invalid -f value, 3 for a sample missing from
    the data header, 1 for an unknown variable type, an inconsistent count
    cell, a failure to launch Rscript or a missing results file.
    """
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\".")
    parser.add_option("-f","--family",action="store",type="int",dest="family_func",default=1, help="Error distribution and link function to be used in the GLM model. Can be integer 1 or 2 with 1 = \"quasibinomial\" and 2 = \"binomial\". Default=%default.")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.group_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)
    if not os.path.isfile(options.input_file):
        print ("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.group_file):
        # Report the group file here, not the data file.
        print ("Input group file \"%s\" does not exist\n" % options.group_file)
        sys.exit(105)

    # Validate -f before any output file is created.
    family = {1:'quasibinomial', 2:'binomial',}
    if options.family_func not in family:
        print ("Incorrect value of '-f'!")
        sys.exit(106)
    family_name = family[options.family_func]

    r_script = options.out_file + '.r'
    results_file = options.out_file + '.results.txt'
    warnings_file = options.out_file + '.warnings.txt'
    pval_file = options.out_file + '.pval.txt'

    # Compiled once; re.match semantics (anchored at the start only) are kept.
    count_pattern = re.compile(r'(\d+)\s*\,\s*(\d+)')

    with open(r_script,'w') as ROUT:
        printlog("Read group file \"%s\" ..." % (options.group_file))
        (samples,cv_names, cvs, v_types) = read_grp_file2(options.group_file)
        for cv_name in cv_names:
            print ("%s: %s" % (cv_name, v_types[cv_name]))
            for sample in samples:
                print ('\t' + sample + '\t' + cvs[cv_name][sample])

        primary_variable = cv_names[0]

        # lrf1 creates the results file together with its header row ...
        # NOTE(review): the gsub("2","",...) in the header removes every "2"
        # from the coefficient names, so a variable name containing "2" would
        # no longer match primary_variable below - confirm this is intended.
        print ('lrf1 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
        print ('try(fit <- glm(cbind(m,t - m) ~ %s, family=%s))' % ('+'.join(cv_names),family_name), file=ROUT)
        print ('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print ('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
        print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
        print ( 'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(gsub("2","",names(coefs)), "coef",sep="."), paste(gsub("2","",names(pvals)), "pval",sep=".")))' % (results_file), file = ROUT)
        print ('}', file=ROUT)
        print ('\n', file=ROUT)

        # ... lrf2 appends one row without a header.
        print ('lrf2 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
        print ('try(fit <- glm(cbind(m,t - m) ~ %s, family=%s))' % ('+'.join(cv_names),family_name), file=ROUT)
        print ('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print ('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
        print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
        print ( 'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append = TRUE)' % (results_file), file = ROUT)
        print ('}', file=ROUT)
        print ('\n', file=ROUT)

        printlog("Processing file \"%s\" ..." % (options.input_file))
        header_seen = False
        first_probe = True
        for l in ireader.reader(options.input_file):
            f = l.split()
            if len(f) == 0: continue
            if not header_seen:
                header_seen = True
                sample_IDs = f[1:]
                # check if sample ID matches
                for s in samples:
                    if s not in sample_IDs:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                # NOTE(review): a data column that is absent from the group
                # file makes the cvs[cv_name][s] lookups below fail - confirm
                # whether such columns should be rejected or dropped.
                for cv_name in cv_names:
                    if v_types[cv_name] == 'continuous':
                        print (cv_name + ' <- c(%s)' % (','.join([str(cvs[cv_name][s]) for s in sample_IDs ])), file = ROUT)
                    elif v_types[cv_name] == 'categorical':
                        print (cv_name + ' <- as.factor(c(%s))' % (','.join([str(cvs[cv_name][s]) for s in sample_IDs ])), file = ROUT)
                    else:
                        printlog("unknown vaiable type!")
                        sys.exit(1)

                print ('\n', file=ROUT)
                continue

            methyl_reads = []    # c
            total_reads = []     # n
            cg_id = f[0]
            for i in f[1:]:
                m = count_pattern.match(i)
                if m is None:
                    # Cells that are not "count,count" are passed to R as NaN.
                    methyl_reads.append("NaN")
                    total_reads.append("NaN")
                    continue
                c = int(m.group(1))
                n = int(m.group(2))
                if n >= c and n > 0:
                    methyl_reads.append(c)
                    total_reads.append(n)
                else:
                    printlog("Incorrect data format!")
                    print (f)
                    sys.exit(1)

            # The first data row must go through lrf1 so that the results file
            # starts with a header, wherever that row sits in the input.
            r_func = 'lrf1' if first_probe else 'lrf2'
            first_probe = False
            print ('%s(\"%s\", c(%s), c(%s), %s)' % (r_func, cg_id, ','.join([str(read) for read in methyl_reads]), ','.join([str(read) for read in total_reads]), ','.join(cv_names)), file=ROUT)

    # Drop results of an earlier run so that a failed R run cannot be mistaken
    # for a successful one (lrf1 recreates the file anyway).
    if os.path.exists(results_file):
        os.remove(results_file)

    printlog("Runing Rscript file \"%s\" ..." % (r_script))
    try:
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters work, and a missing Rscript raises OSError.
        with open(warnings_file, 'w') as warn_out:
            status = subprocess.call(["Rscript", r_script], stderr=warn_out)
    except OSError:
        print ("Error: cannot run Rscript: \"%s\"" % (r_script), file=sys.stderr)
        sys.exit(1)
    if status != 0:
        printlog("Rscript exited with status %d, see \"%s\"" % (status, warnings_file))

    # read
    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    if not os.path.isfile(results_file):
        printlog("Cannot find result file \"%s\", see \"%s\"" % (results_file, warnings_file))
        sys.exit(1)

    primary_v_index = None
    p_list = []
    probe_list = []
    with open(results_file, 'r') as RESULTS:
        for line_num, l in enumerate(RESULTS, 1):
            v = l.strip().split()
            if line_num == 1:
                # When several header columns match (e.g. a factor with more
                # than two levels) the last one is used, as before.
                for i,name in enumerate(v):
                    if name.startswith(primary_variable) and name.endswith('.pval'):
                        primary_v_index = i
                if primary_v_index is None:
                    # No p-value can be assigned; every row will get NaN.
                    printlog("Cannot find p-value column of \"%s\" in \"%s\"" % (primary_variable, results_file))
                    break
                continue
            try:
                pv = float(v[primary_v_index])
            except (IndexError, ValueError):
                # e.g. "NA" written by R, or a truncated row
                continue
            if pv >= 0 and pv <= 1:
                p_list.append(pv)
                probe_list.append(v[0])

    # adjust
    q_list = padjust.multiple_testing_correction(p_list)

    # write
    adjusted_p = {}
    for probe,p,q in zip(probe_list, p_list, q_list):
        adjusted_p[probe] = '\t'.join([str(i) for i in (p,q)])

    printlog("Writing to %s" % (pval_file))
    header_seen = False
    with open(pval_file,'w') as FOUT:
        for l in ireader.reader(options.input_file):
            f = l.split()
            # Skip blank lines exactly as the first pass does.
            if len(f) == 0: continue
            if not header_seen:
                header_seen = True
                print (l + '\tpval\tadj.pval', file=FOUT)
            else:
                print (l + '\t' + adjusted_p.get(f[0], 'NaN\tNaN'), file=FOUT)
|
|
223
|
+
|
|
224
|
+
if __name__=='__main__':
|
|
225
|
+
main()
|
|
226
|
+
|