cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Aggregate proportion values of a list of CpGs that are located in given genomic regions
|
|
5
|
+
(eg. CpG islands, promoters, exons, etc.).
|
|
6
|
+
|
|
7
|
+
Outlier CpG will be removed if the probability of observing its proportion value is less
|
|
8
|
+
than p-cutoff. For example, if alpha set to 0.05, and there are 10 CpGs (n = 10) located in a
|
|
9
|
+
particular genomic region, the p-cutoff of this genomic region is 0.005 (0.05/10). Supposing
|
|
10
|
+
the total reads mapped to this region is 100, out of which 25 are methylated reads (i.e.,
|
|
11
|
+
regional methylation level (beta) = 25/100 = 0.25)
|
|
12
|
+
|
|
13
|
+
The probability of observing CpG (3,10) is :
|
|
14
|
+
pbinom(q=3, size=10, prob=0.25) = 0.7759
|
|
15
|
+
The probability of observing CpG (0,10) is :
|
|
16
|
+
pbinom(q=0, size=10, prob=0.25) = 0.05631
|
|
17
|
+
The probability of observing CpG (16,21) is :
|
|
18
|
+
pbinom(q=16, size=21, prob=0.25, lower.tail=FALSE) = 1.19e-07 (outlier)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
**Example of input file**
|
|
22
|
+
|
|
23
|
+
Chrom Start End score
|
|
24
|
+
chr10 100017748 100017749 3,10
|
|
25
|
+
chr10 100017769 100017770 0,10
|
|
26
|
+
chr10 100017853 100017854 16,21
|
|
27
|
+
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
import sys,os
|
|
31
|
+
import collections
|
|
32
|
+
import subprocess
|
|
33
|
+
import numpy as np
|
|
34
|
+
from scipy.stats import binom
|
|
35
|
+
|
|
36
|
+
from optparse import OptionParser
|
|
37
|
+
from cpgmodule._version import __version__
|
|
38
|
+
from cpgmodule import ireader
|
|
39
|
+
from cpgmodule.utils import *
|
|
40
|
+
from cpgmodule import BED
|
|
41
|
+
import pandas as pd
|
|
42
|
+
from bx.intervals import *
|
|
43
|
+
|
|
44
|
+
__author__ = "Liguo Wang"
|
|
45
|
+
__copyright__ = "Copyleft"
|
|
46
|
+
__credits__ = []
|
|
47
|
+
__license__ = "GPL"
|
|
48
|
+
__maintainer__ = "Liguo Wang"
|
|
49
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
50
|
+
__status__ = "Development"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def buildIntervalTree(bed_file):
    '''
    Load a CpG BED file into one interval index per chromosome.

    Lines starting with "track", "#", "browser" or "Chrom" are ignored, as
    are lines with fewer than 4 whitespace-separated columns and lines
    whose coordinates are negative or have start > end. The 4th column is
    stored unparsed as the ``value`` of each interval.

    Returns a dict mapping chromosome name to an Intersecter.
    '''
    skipped_prefixes = ("track", "#", "browser", "Chrom")
    ranges = {}
    printlog("reading "+ bed_file + '...')
    for record in ireader.reader(bed_file):
        if record.startswith(skipped_prefixes):
            continue
        # split() with no argument already discards surrounding whitespace
        columns = record.split()
        if len(columns) < 4:
            continue
        chrom = columns[0]
        start = int(columns[1])
        end = int(columns[2])
        score = columns[3]

        # accept only 0 <= start <= end
        if not 0 <= start <= end:
            continue

        chrom_index = ranges.get(chrom)
        if chrom_index is None:
            chrom_index = ranges[chrom] = Intersecter()
        chrom_index.add_interval(Interval(start, end, value=score))
    return ranges
|
|
87
|
+
|
|
88
|
+
def findIntervals(chrom, start, end, obj, a = 0.01, counts = True):
    '''
    Summarise the CpGs stored in "obj" that overlap chrom:start-end.

    Parameters
    ----------
    chrom : chromosome name used as key into "obj".
    start, end : region boundaries; both are passed through int().
    obj : dict returned by "buildIntervalTree" (chromosome name -> object
        whose find(start, end) yields items carrying a "value" attribute).
    a : alpha. The per-CpG outlier cutoff is a / (number of CpGs in the
        region). Only used when counts is True.
    counts : True if each value is a "methylated,total" read-count string,
        False if each value is a number (beta value).

    Returns
    -------
    list
        * [] if "chrom" is not in "obj".
        * counts=True: [N_CpG_filtered, N_methyl_filtered, N_total_filtered,
          N_CpG_ori, N_methyl_ori, N_total_ori], or ['N/A']*6 if nothing
          overlaps or the region has no reads at all.
        * counts=False: [N_CpG, mean of the values], or ['N/A']*2 if nothing
          overlaps.

    A CpG (m, t) is treated as an outlier when either one-sided binomial
    tail probability under the regional methylation level is below the
    cutoff: P(X <= m) for the lower tail, P(X >= m) for the upper tail.
    The upper tail includes m itself; using P(X > m) would give exactly 0
    for every fully methylated CpG (m == t) and always discard it.
    '''
    if chrom not in obj:
        return []

    hits = [hit.value for hit in obj[chrom].find(int(start), int(end))]
    if not hits:
        return ['N/A'] * (6 if counts else 2)

    if not counts:
        return [len(hits), np.mean([float(v) for v in hits])]

    methyl = []    # methylated reads of each CpG
    total = []     # total reads of each CpG
    for h in hits:
        m, t = h.split(',')
        methyl.append(int(m))
        total.append(int(t))

    ori_CpG_count = len(total)      # number of CpGs of the region
    p_cut = a / ori_CpG_count
    ori_methyl_sum = sum(methyl)    # methylated reads of the region
    ori_total_sum = sum(total)      # total reads of the region

    if ori_total_sum == 0:
        return ['N/A'] * 6
    # completely unmethylated / completely methylated region: the binomial
    # is degenerate, so nothing is filtered
    if ori_methyl_sum == 0 or ori_methyl_sum == ori_total_sum:
        return [ori_CpG_count, ori_methyl_sum, ori_total_sum,
                ori_CpG_count, ori_methyl_sum, ori_total_sum]

    # average methylation level of the *region*, i.e. "prob" of the binomial
    region_beta = ori_methyl_sum / ori_total_sum

    new_methyl = []
    new_total = []
    for m, t in zip(methyl, total):
        lower_tail = binom.cdf(m, t, region_beta)      # P(X <= m)
        upper_tail = binom.sf(m - 1, t, region_beta)   # P(X >= m)
        if lower_tail < p_cut or upper_tail < p_cut:
            continue
        new_methyl.append(m)
        new_total.append(t)

    return [len(new_total), sum(new_methyl), sum(new_total),
            ori_CpG_count, ori_methyl_sum, ori_total_sum]
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def main():
    '''
    Command line entry point: aggregate CpG values over genomic regions.

    Reads the CpG file (-i) into an interval tree, then writes one line per
    region of the BED file (-b) to the file named by -o (used as given).
    For "-t count" each line holds the first three BED columns followed by
    the six values returned by findIntervals(); for "-t beta" each line
    holds the whole input line followed by the CpG count and mean beta.

    Exit codes: 101/102/103/104 when -i/-t/-o/-b is missing, 102 when -t is
    neither "count" nor "beta".
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Input CpG file in BED format. The first 3 columns contain \"Chrom\", \"Start\", and \"End\". The 4th column contains proportion values.")
    parser.add_option("-a","--alpha",action="store",type='float', dest="alpha_cut", default=0.05, help="The chance of mistakingly assign a particular CpG as an outlier for each genomic region. Only applied to count data. default=%default" )
    parser.add_option("-b","--bed",action="store",type="string",dest="bed_file",help="BED3+ file specifying the genomic regions.")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    parser.add_option("-t","--type",action="store",type='string', dest="data_type",help="Data type in the forth column. Must be one of \"count\" (eg 3,10) or \"beta\"(eg, 0.2)")
    (options,args)=parser.parse_args()

    # All four options are mandatory. The region file was previously not
    # checked and a missing -b only failed later while reading it.
    required = (
        (options.input_file, 101),
        (options.data_type, 102),
        (options.out_file, 103),
        (options.bed_file, 104),
    )
    for value, exit_code in required:
        if not value:
            print (__doc__)
            parser.print_help()
            sys.exit(exit_code)

    # Reject a bad data type before any work is done, so that no empty
    # output file is left behind.
    if options.data_type not in ('count', 'beta'):
        print ("-t (--type) must take the value of 'count' or 'beta'", file=sys.stderr)
        sys.exit(102)

    if options.alpha_cut < 0:
        options.alpha_cut = 0.05
    if options.alpha_cut > 1:
        options.alpha_cut = 1

    use_counts = (options.data_type == 'count')
    n_values = 6 if use_counts else 2

    tree = buildIntervalTree(options.input_file)

    with open(options.out_file,'w') as OUT:
        if use_counts:
            print ("#chrom\tstart\tend\tN_CpG_filtered\tN_methyl_filtered\tN_total_filtered\tN_CpG_ori\tN_methy_ori\tN_total_ori", file=OUT)
        for line in ireader.reader(options.bed_file):
            line = line.strip()
            if line.startswith(("track", "#", "browser", "Chrom")):
                continue

            f = line.split()
            if len(f) < 3:
                continue
            try:
                chrom = f[0]
                start = int(f[1])
                end = int(f[2])
            except ValueError:
                # non-numeric coordinates: skip the line
                continue

            tmp = findIntervals(chrom, start, end, tree, a = options.alpha_cut, counts = use_counts)
            if len(tmp) == 0:
                # chromosome absent from the CpG file
                tmp = ['N/A'] * n_values

            # count mode echoes only the first three columns, beta mode the whole line
            prefix = '\t'.join(f[0:3]) if use_counts else line
            print (prefix + '\t' + '\t'.join([str(i) for i in tmp]), file=OUT)
|
|
236
|
+
|
|
237
|
+
if __name__=='__main__':
|
|
238
|
+
main()
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program annotates CpG by its position.
|
|
7
|
+
|
|
8
|
+
Notes:
|
|
9
|
+
- Input CpG and BED files must have at least three columns
|
|
10
|
+
- If multiple regions from the annotation BED file are overlapped with the **same**
|
|
11
|
+
CpG site, their names will be concatenated together.
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import sys,os
|
|
16
|
+
import collections
|
|
17
|
+
import subprocess
|
|
18
|
+
import numpy as np
|
|
19
|
+
from os.path import basename
|
|
20
|
+
from optparse import OptionParser
|
|
21
|
+
from cpgmodule._version import __version__
|
|
22
|
+
from cpgmodule import ireader
|
|
23
|
+
from cpgmodule.utils import *
|
|
24
|
+
from cpgmodule import BED
|
|
25
|
+
import pandas as pd
|
|
26
|
+
from bx.intervals import *
|
|
27
|
+
|
|
28
|
+
__author__ = "Liguo Wang"
|
|
29
|
+
__copyright__ = "Copyleft"
|
|
30
|
+
__credits__ = []
|
|
31
|
+
__license__ = "GPL"
|
|
32
|
+
__maintainer__ = "Liguo Wang"
|
|
33
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
34
|
+
__status__ = "Development"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def buildIntervalTree(bed_file, window_size = 0):
    '''
    Build one interval index per chromosome from the annotation BED file.

    bed_file : annotation file with at least 3 columns. Lines starting with
        "track", "#" or "browser" and lines with fewer than 3 columns are
        ignored, as are regions with negative coordinates or start > end.
    window_size : if > 0, each region is replaced by a window of this size
        centred on the region's middle point and clipped to the region's
        own boundaries. If 0, the original region is used.

    Each interval's value is the 4th column when present, otherwise
    "chrom:start-end" built from the original columns.

    Returns a dict mapping chromosome name to an Intersecter.
    '''
    ranges = {}
    printlog("Build interval tree from annotation file: %s ..." % bed_file)
    for line in ireader.reader(bed_file):
        if line.startswith(("track", "#", "browser")):
            continue
        fields = line.split()
        if len(fields) < 3:
            continue
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])

        # validate regardless of window_size (was only done when a window
        # was requested)
        if start < 0 or end < 0 or start > end:
            continue

        if window_size > 0:
            # window centred on the middle position, never leaving the region
            mid = int(start + (end - start)/2.0)
            extension = int(window_size * 0.5)
            iv_start = max(start, mid - extension)
            iv_end = min(end, mid + extension)
        else:
            iv_start = start
            iv_end = end

        if len(fields) >= 4:
            name = fields[3]
        else:
            name = fields[0] + ':' + fields[1] + '-' + fields[2]

        if chrom not in ranges:
            ranges[chrom] = Intersecter()
        # the window was previously computed but the original start/end
        # were stored, so the window size had no effect
        ranges[chrom].add_interval(Interval(iv_start, iv_end, value=name))
    return ranges
|
|
88
|
+
|
|
89
|
+
def findIntervals(chrom, start, end, obj):
    '''
    Return the sorted, de-duplicated values of all intervals in "obj" that
    overlap chrom:start-end.

    obj is the dict returned by "buildIntervalTree" (chromosome name ->
    object whose find(start, end) yields items with a "value" attribute).
    start and end are passed through int().

    Always returns a list; it is empty when "chrom" is not in "obj" or when
    nothing overlaps.
    '''
    if chrom not in obj:
        # same type as the regular return value (used to be an empty set)
        return []
    return sorted({hit.value for hit in obj[chrom].find(int(start), int(end))})
|
|
101
|
+
|
|
102
|
+
def main():
    '''
    Command line entry point: annotate each CpG with the names of the
    annotation regions it overlaps.

    Writes "<output prefix>.anno.txt": every input line that has at least
    three columns, followed by a tab and the comma-joined names of the
    overlapping regions, or "N/A" when there is none. With -l/--header the
    first such line is treated as a header and gets the annotation file's
    base name appended instead.

    Exit codes: 101/102/103 when -i/-o/-a is missing.
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Input CpG file in BED3+ format.")
    parser.add_option("-a","--annotation",action="store",type="string",dest="anno_file",help="Input annotation file in BED3+ format.")
    parser.add_option("-w","--window",action="store",type='int', dest="window_size", default=0, help="Size of window centering on the middle-point of each genomic region defined in the annotation BED file (i.e., window_size*0.5 will be extended to up- and down-stream from the middle point of each genomic region). if WINDOW_SIZE = 0, use the original region. default=%default" )
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    parser.add_option("-l", "--header", action="store_true", dest="header", default=False, help="If True, the first row of input CpG file is header. default=%default")
    (options,args)=parser.parse_args()

    required = (
        (options.input_file, 101),
        (options.out_file, 102),
        (options.anno_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print (__doc__)
            parser.print_help()
            sys.exit(exit_code)

    tree = buildIntervalTree(options.anno_file, window_size = options.window_size)

    # "with" closes the output even if a malformed coordinate raises below
    with open(options.out_file + '.anno.txt','w') as OUT:
        line_num = 0    # counts only lines with >= 3 columns
        printlog("Reading CpG file: %s ..." % options.input_file)
        for line in ireader.reader(options.input_file):
            f = line.split()
            if len(f) < 3:
                continue
            line_num += 1
            if line_num == 1 and options.header:
                print (line + '\t' + basename(options.anno_file), file=OUT)
                continue

            overlaps = findIntervals(f[0], int(f[1]), int(f[2]), tree)
            if len(overlaps) > 0:
                print (line + '\t' + ','.join(overlaps), file=OUT)
            else:
                print (line + '\tN/A', file=OUT)
|
|
153
|
+
|
|
154
|
+
if __name__=='__main__':
|
|
155
|
+
main()
|
|
156
|
+
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program adds comprehensive annotation information to each 450K/850K probe ID.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sys,os
|
|
10
|
+
from optparse import OptionParser
|
|
11
|
+
from cpgmodule import ireader
|
|
12
|
+
from cpgmodule.utils import *
|
|
13
|
+
from cpgmodule._version import __version__
|
|
14
|
+
|
|
15
|
+
__author__ = "Liguo Wang"
|
|
16
|
+
__copyright__ = "Copyleft"
|
|
17
|
+
__credits__ = []
|
|
18
|
+
__license__ = "GPL"
|
|
19
|
+
__maintainer__ = "Liguo Wang"
|
|
20
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
21
|
+
__status__ = "Development"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def read_annotation(infile):
    '''
    Read the probe annotation file.

    A line starting with "probeID" is taken as the header; every other
    non-blank line is split on whitespace, its first field being the probe
    ID and the remaining fields the annotation.

    Returns (head, cpg_infor): head is the list of header names without the
    first column (empty if the file has no "probeID" line); cpg_infor maps
    probe ID to its annotation fields re-joined with tabs. If an ID occurs
    more than once, the last occurrence wins.
    '''
    head = []
    cpg_infor = {}
    for l in ireader.reader(infile):
        if l.startswith('probeID'):
            head = l.split()[1:]
            continue
        f = l.split()
        if not f:
            # blank line: nothing to index (used to raise IndexError)
            continue
        cpg_infor[f[0]] = '\t'.join(f[1:])
    return (head, cpg_infor)
|
|
36
|
+
|
|
37
|
+
def main():
    '''
    Command line entry point: append probe annotation to every row of a
    data file.

    Writes "<output prefix>.anno.txt". The first output line is a header:
    the input's own first line plus the annotation column names when
    -l/--header is given, otherwise one "NA" per input column plus the
    annotation column names. Every data row is followed by the annotation
    of the probe ID found in column -p, or by "NA" fields when the ID is
    not in the annotation file.

    Exit codes: 101/102/103 when -i/-a/-o is missing, 104/105 when the
    input/annotation file does not exist, 106 when the probe column index
    is out of range for a row.
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Input data file (Tab-separated) with a certain column containing 450K/850K array CpG IDs. This file can be regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-a","--annotation",action="store",type="string",dest="anno_file",help="Annotation file. This file can be regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="Prefix of the output file.")
    parser.add_option("-p","--probe_column",action="store",type='int', dest="probe_col",default=0, help="The number of column that contains probe IDs. Note: the column index starts with 0. default=%default.")
    parser.add_option("-l", "--header", action="store_true", dest="header", default=False, help="Input data file has a header row.")
    (options,args)=parser.parse_args()

    required = (
        (options.input_file, 101),
        (options.anno_file, 102),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print (__doc__)
            parser.print_help()
            sys.exit(exit_code)

    if not os.path.isfile(options.input_file):
        print ("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.anno_file):
        # report the annotation file (the message used to name the input file)
        print ("Input annotation file \"%s\" does not exist\n" % options.anno_file)
        sys.exit(105)

    printlog("Read annotation file \"%s\" ..." % (options.anno_file))
    (header, data)= read_annotation(options.anno_file)
    missing = '\t'.join(['NA']*len(header))

    with open(options.out_file + '.anno.txt','w') as OUT:
        printlog("Add annotation information to \"%s\" ..." % (options.input_file))
        for line_num, l in enumerate(ireader.reader(options.input_file), 1):
            f = l.split()
            if line_num == 1:
                if options.header:
                    print (l + '\t' + '\t'.join(header), file=OUT)
                    continue
                # No header in the input: emit a placeholder header, then
                # fall through so the first data row is annotated as well
                # (it used to be dropped).
                print ('\t'.join(['NA']*len(f)) + '\t' + '\t'.join(header), file=OUT)

            if options.probe_col >= len(f):
                print ("Error: column ID must be smaller than %d!" % len(f), file=sys.stderr)
                # an error must not exit with status 0
                sys.exit(106)
            cgid = f[options.probe_col]
            print (l + '\t' + data.get(cgid, missing), file=OUT)
|
|
93
|
+
|
|
94
|
+
if __name__=='__main__':
|
|
95
|
+
main()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program calculates the CpG density (count) profile over gene body as well as its up-
|
|
7
|
+
down-stream regions. It is useful to visualize how CpGs are distributed around genes.
|
|
8
|
+
|
|
9
|
+
Specifically, the up-stream region, gene region (from TSS to TES) and down-stream region
|
|
10
|
+
will each be equally divided into 100 bins, then the CpG count is aggregated over a total of 300 bins
|
|
11
|
+
from 5' to 3' (upstream bins, gene bins, downstream bins).
|
|
12
|
+
#==========================================================================================
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import sys
|
|
16
|
+
import subprocess
|
|
17
|
+
from optparse import OptionParser
|
|
18
|
+
from cpgmodule import ireader
|
|
19
|
+
from cpgmodule.utils import *
|
|
20
|
+
from cpgmodule import BED
|
|
21
|
+
from cpgmodule import extend_bed
|
|
22
|
+
from cpgmodule._version import __version__
|
|
23
|
+
|
|
24
|
+
__author__ = "Liguo Wang"
|
|
25
|
+
__copyright__ = "Copyleft"
|
|
26
|
+
__credits__ = []
|
|
27
|
+
__license__ = "GPL"
|
|
28
|
+
__maintainer__ = "Liguo Wang"
|
|
29
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
30
|
+
__status__ = "Development"
|
|
31
|
+
|
|
32
|
+
def main():
    '''
    Command line entry point: CpG count profile over the upstream region,
    gene body and downstream region of genes.

    Writes "<prefix>.tsv" (columns Group, Position, CpG_count) and
    "<prefix>.r", then runs the R script with Rscript, which is written to
    produce "<prefix>.pdf". A failure to run Rscript is reported on stderr
    and does not abort the program.

    Exit codes: 101/102/103 when -i/-r/-o is missing.
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED file specifying the C position. This BED file should have at least three columns (Chrom, ChromStart, ChromeEnd). Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-r","--refgene",action="store",type="string",dest="gene_file",help="Reference gene model in standard BED6+ format.")
    parser.add_option("-d","--downstream",action="store",type="int",dest="downstream_size",default=2000,help="Maximum extension size from TES (transcription end site) to down-stream to define the \"downstream intergenic region (DIR)\". Note: (1) The actual used DIR size can be smaller because the extending process could stop earlier if it reaches the boundary of another nearby gene. (2) If the actual used DIR size is smaller than cutoff defined by \"-c/--SizeCut\", the gene will be skipped. default=%default (bp)")
    parser.add_option("-u","--upstream",action="store",type="int",dest="upstream_size",default=2000,help="Maximum extension size from TSS (transcription start site) to up-stream to define the \"upstream intergenic region (UIR)\". Note: (1) The actual used UIR size can be smaller because the extending process could stop earlier if it reaches the boundary of another nearby gene. (2) If the actual used UIR size is smaller than cutoff defined by \"-c/--SizeCut\", the gene will be skipped. default=%default (bp)")
    parser.add_option("-c","--SizeCut",action="store",type="int",dest="minimum_size",default=200,help="The minimum gene size. Gene size is defined as the genomic size between TSS and TES, including both exons and introns. default=%default (bp)")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()

    required = (
        (options.input_file, 101),
        (options.gene_file, 102),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print (__doc__)
            parser.print_help()
            sys.exit(exit_code)

    tsv_file = options.out_file + '.tsv'
    r_file = options.out_file + '.r'

    #step1: read CpG file
    printlog("Reading CpG file: \"%s\"" % (options.input_file))
    cpg_ranges = read_CpG_bed(options.input_file)

    #step2: read gene file
    printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
    tmp1 = extend_bed.getBasalDomains(options.gene_file)
    tmp2 = extend_bed.geteExtendedDomains(basal_ranges = tmp1, bedfile = options.gene_file, up_ext=options.upstream_size, down_ext=options.downstream_size, min_gene = options.minimum_size, printit = False)

    #step3: CpG density
    printlog("Calculating CpG density ...")
    (up_density, gene_density, down_density) = density_over_range(tmp2, cpg_ranges)

    printlog("Wrting data to : \"%s\"" % (tsv_file))
    # outputs are opened only once there is data to write, and are closed
    # even if writing fails
    with open(tsv_file,'w') as FOUT:
        print ("Group\tPosition\tCpG_count", file=FOUT)
        groups = (("Upstream", up_density), ("GeneBody", gene_density), ("Downstream", down_density))
        for label, density in groups:
            for ind in sorted(density):
                print (label + '\t' + str(ind) + '\t' + str(density[ind]), file = FOUT)

    with open(r_file,'w') as ROUT:
        print ('pdf(file=\"%s\", width=10, height=5)' % (options.out_file + '.pdf'),file=ROUT)
        print ("d <- read.table(file = '%s', header = T, sep='\\t')" % (tsv_file), file = ROUT)
        print ("x = 1:length(d$CpG_count)", file=ROUT)
        print ("plot(x,d$CpG_count,type='l',col='red',lwd=1,xaxt='n',ylab='CpG count',xlab='')", file=ROUT)
        print ("abline(v = c(102,203),col='blue', lty='dashed', lwd=0.5)", file=ROUT)
        print ("text(x=c(0,102,203)+50, y=0.1, labels=c('Upstream', 'geneBody','Downstream'))", file=ROUT)
        print ('dev.off()',file=ROUT)

    printlog("Running R script to: '%s'" % (r_file))
    # Argument list instead of a shell string: the prefix may contain
    # spaces or shell metacharacters. A missing Rscript raises OSError; a
    # failing script returns a non-zero status. Both are reported.
    try:
        status = subprocess.call(["Rscript", r_file])
    except OSError:
        status = None
    if status != 0:
        print ("Cannot generate pdf file from " + r_file, file=sys.stderr)
|
|
104
|
+
|
|
105
|
+
if __name__=='__main__':
|
|
106
|
+
main()
|
|
107
|
+
|