cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,238 @@
1
+ #!python
2
+
3
+ """
4
+ Aggregate proportion values of a list of CpGs that are located in given genomic regions
5
+ (eg. CpG islands, promoters, exons, etc.).
6
+
7
+ Outlier CpG will be removed if the probability of observing its proportion value is less
8
+ than p-cutoff. For example, if alpha set to 0.05, and there are 10 CpGs (n = 10) located in a
9
+ particular genomic region, the p-cutoff of this genomic region is 0.005 (0.05/10). Supposing
10
+ the total reads mapped to this region is 100, out of which 25 are methylated reads (i.e.,
11
+ regional methylation level (beta) = 25/100 = 0.25)
12
+
13
+ The probability of observing CpG (3,10) is :
14
+ pbinom(q=3, size=10, prob=0.25) = 0.7759
15
+ The probability of observing CpG (0,10) is :
16
+ pbinom(q=0, size=10, prob=0.25) = 0.05631
17
+ The probability of observing CpG (16,21) is :
18
+ pbinom(q=16, size=21, prob=0.25, lower.tail=FALSE) = 1.19e-07 (outlier)
19
+
20
+
21
+ **Example of input file**
22
+
23
+ Chrom Start End score
24
+ chr10 100017748 100017749 3,10
25
+ chr10 100017769 100017770 0,10
26
+ chr10 100017853 100017854 16,21
27
+
28
+ """
29
+
30
+ import sys,os
31
+ import collections
32
+ import subprocess
33
+ import numpy as np
34
+ from scipy.stats import binom
35
+
36
+ from optparse import OptionParser
37
+ from cpgmodule._version import __version__
38
+ from cpgmodule import ireader
39
+ from cpgmodule.utils import *
40
+ from cpgmodule import BED
41
+ import pandas as pd
42
+ from bx.intervals import *
43
+
44
+ __author__ = "Liguo Wang"
45
+ __copyright__ = "Copyleft"
46
+ __credits__ = []
47
+ __license__ = "GPL"
48
+ __maintainer__ = "Liguo Wang"
49
+ __email__ = "wang.liguo@mayo.edu"
50
+ __status__ = "Development"
51
+
52
+
53
+
54
def buildIntervalTree(bed_file):
    '''
    Build one interval tree per chromosome from a BED file.

    Input BED file must have at least 4 columns. Each usable line is stored as
    Interval(start, end, value=score), where "score" is the unparsed text of
    the 4th column.

    A line is skipped when it:
      - starts with "track", "#", "browser" or "Chrom";
      - has fewer than 4 whitespace-separated fields;
      - has a start or end that is not an integer;
      - has a negative coordinate, or start > end.

    Returns a dict mapping chromosome name -> Intersecter.
    '''
    ranges = {}
    printlog("reading "+ bed_file + '...')
    for line in ireader.reader(bed_file):
        if line.startswith(("track", "#", "browser", "Chrom")):
            continue
        fields = line.rstrip('\n ').split()
        if len(fields) < 4:
            continue
        chrom = fields[0]
        try:
            start = int(fields[1])
            end = int(fields[2])
        except ValueError:
            # Unrecognised header or otherwise malformed row: ignore it rather
            # than aborting the whole run.
            continue
        score = fields[3]

        if start < 0 or end < 0 or start > end:
            continue

        # Create the per-chromosome tree lazily, then add the interval.
        if chrom not in ranges:
            ranges[chrom] = Intersecter()
        ranges[chrom].add_interval(Interval(start, end, value=score))
    return ranges
87
+
88
def findIntervals(chrom, start, end, obj, a = 0.01, counts = True):
    '''
    Summarise the CpGs that overlap the region chrom:start-end.

    obj is the dict of per-chromosome interval trees returned by
    "buildIntervalTree"; the "value" of every overlapping interval is used.

    chrom, start, end : region to query (start/end are converted with int()).
    a      : alpha; the per-region outlier cutoff is a / (number of CpGs).
    counts : if True, values are "methylated,total" read counts (eg "3,10");
             if False, values are beta values (eg "0.2").

    Returns
      - [] if chrom has no interval tree in obj.
      - ['N/A']*6 (counts) or ['N/A']*2 (beta) if no CpG overlaps the region.
      - counts=True : [N_CpG_filtered, N_methyl_filtered, N_total_filtered,
                       N_CpG_ori, N_methyl_ori, N_total_ori]; ['N/A']*6 when
                       the region has no reads at all.
      - counts=False: [N_CpG, mean beta].

    A CpG is treated as an outlier when, under a binomial model whose
    probability is the regional methylation level, either P(X <= m) or
    P(X >= m) is below the cutoff. Both tails include the observed count m,
    so a fully methylated CpG (m == total) is not removed unconditionally.
    '''
    if chrom not in obj:
        return []

    hits = [i.value for i in obj[chrom].find(int(start), int(end))]
    if len(hits) == 0:
        return ['N/A'] * (6 if counts else 2)

    if not counts:
        return [len(hits), np.mean([float(i) for i in hits])]

    methyl = []  # methylated reads of each CpG
    total = []   # total reads of each CpG
    for h in hits:
        m, t = h.split(',')
        methyl.append(int(m))
        total.append(int(t))

    ori_CpG_count = len(total)            # number of CpGs in the region
    p_cut = a / ori_CpG_count             # alpha shared equally by the CpGs
    ori_methyl_sum = int(np.sum(methyl))  # methylated reads in the region
    ori_total_sum = int(np.sum(total))    # total reads in the region

    if ori_total_sum == 0:
        return ['N/A'] * 6
    if ori_methyl_sum == 0 or ori_methyl_sum == ori_total_sum:
        # Regional beta is exactly 0 or 1: nothing can be an outlier.
        return [ori_CpG_count, ori_methyl_sum, ori_total_sum,
                ori_CpG_count, ori_methyl_sum, ori_total_sum]

    # Average methylation level of the *region*, used as prob in the binomial.
    region_beta = ori_methyl_sum / ori_total_sum

    new_methyl = []
    new_total = []
    for m, t in zip(methyl, total):
        # Lower tail: P(X <= m).
        if binom.cdf(m, t, region_beta) < p_cut:
            continue
        # Upper tail: P(X >= m) = sf(m - 1). Using 1 - cdf(m) would give
        # P(X > m), which is 0 whenever m == t.
        if binom.sf(m - 1, t, region_beta) < p_cut:
            continue
        new_methyl.append(m)
        new_total.append(t)

    return [len(new_total), int(np.sum(new_methyl)), int(np.sum(new_total)),
            ori_CpG_count, ori_methyl_sum, ori_total_sum]
148
+
149
+
150
+
151
def main():
    '''
    Command-line entry point.

    Builds interval trees from the CpG file (-i), then writes one line to the
    output file (-o) for every usable region of the BED file (-b):
      - "count" mode: chrom, start, end followed by the 6 values returned by
        findIntervals(..., counts=True); a header line is written first.
      - "beta" mode : the original region line followed by the 2 values
        returned by findIntervals(..., counts=False); no header line.
    Regions with no result are reported as 'N/A'.

    Exit codes: 101 (no -i), 102 (no or invalid -t), 103 (no -o), 104 (no -b).
    All arguments are validated before the output file is opened, so a bad
    command line never truncates an existing file.
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Input CpG file in BED format. The first 3 columns contain \"Chrom\", \"Start\", and \"End\". The 4th column contains proportion values.")
    parser.add_option("-a","--alpha",action="store",type='float', dest="alpha_cut", default=0.05, help="The chance of mistakingly assign a particular CpG as an outlier for each genomic region. Only applied to count data. default=%default" )
    parser.add_option("-b","--bed",action="store",type="string",dest="bed_file",help="BED3+ file specifying the genomic regions.")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    parser.add_option("-t","--type",action="store",type='string', dest="data_type",help="Data type in the forth column. Must be one of \"count\" (eg 3,10) or \"beta\"(eg, 0.2)")
    (options,args)=parser.parse_args()

    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.data_type):
        print (__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)
    if not (options.bed_file):
        print (__doc__)
        parser.print_help()
        sys.exit(104)
    if options.data_type not in ('count', 'beta'):
        print ("-t (--type) must take the value of 'count' or 'beta'", file=sys.stderr)
        sys.exit(102)

    # Keep alpha inside [0, 1]; a negative value falls back to the default.
    if options.alpha_cut < 0:
        options.alpha_cut = 0.05
    if options.alpha_cut > 1:
        options.alpha_cut = 1

    use_counts = (options.data_type == 'count')
    n_values = 6 if use_counts else 2

    tree = buildIntervalTree(options.input_file)

    with open(options.out_file,'w') as OUT:
        if use_counts:
            print ("#chrom\tstart\tend\tN_CpG_filtered\tN_methyl_filtered\tN_total_filtered\tN_CpG_ori\tN_methy_ori\tN_total_ori", file=OUT)
        for line in ireader.reader(options.bed_file):
            line = line.strip()
            if line.startswith(("track", "#", "browser", "Chrom")):
                continue

            f = line.split()
            if len(f) < 3:
                continue
            try:
                chrom = f[0]
                start = int(f[1])
                end = int(f[2])
            except ValueError:
                # Region line with non-integer coordinates: skip it.
                continue

            tmp = findIntervals(chrom, start, end, tree, a = options.alpha_cut, counts = use_counts)
            if len(tmp) == 0:
                tmp = ['N/A'] * n_values

            # "count" output keeps only the first 3 columns of the region;
            # "beta" output keeps the whole region line.
            prefix = '\t'.join(f[0:3]) if use_counts else line
            print (prefix + '\t' + '\t'.join([str(i) for i in tmp]), file=OUT)
236
+
237
+ if __name__=='__main__':
238
+ main()
@@ -0,0 +1,156 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program annotates CpG by its position.
7
+
8
+ Notes:
9
+ - Input CpG and BED files must have at least three columns
10
+ - If multiple regions from the annotation BED file are overlapped with the **same**
11
+ CpG site, their names will be concatenated together.
12
+
13
+ """
14
+
15
+ import sys,os
16
+ import collections
17
+ import subprocess
18
+ import numpy as np
19
+ from os.path import basename
20
+ from optparse import OptionParser
21
+ from cpgmodule._version import __version__
22
+ from cpgmodule import ireader
23
+ from cpgmodule.utils import *
24
+ from cpgmodule import BED
25
+ import pandas as pd
26
+ from bx.intervals import *
27
+
28
+ __author__ = "Liguo Wang"
29
+ __copyright__ = "Copyleft"
30
+ __credits__ = []
31
+ __license__ = "GPL"
32
+ __maintainer__ = "Liguo Wang"
33
+ __email__ = "wang.liguo@mayo.edu"
34
+ __status__ = "Development"
35
+
36
+
37
+
38
def buildIntervalTree(bed_file, window_size = 0):
    '''
    Build one interval tree per chromosome from the annotation BED file.

    bed_file    : annotation file with at least 3 columns (chrom, start, end).
                  The 4th column, when present, is used as the region name;
                  otherwise the name is "chrom:start-end".
    window_size : if > 0, each region is replaced by a window of this size
                  centred on the region's middle point. The window is clamped
                  so that it never extends beyond the original region. If 0,
                  the original region is used.

    Lines starting with "track", "#" or "browser", lines with fewer than 3
    fields, lines with non-integer coordinates and invalid ranges (negative
    coordinate or start > end) are skipped.

    Returns a dict mapping chromosome name -> Intersecter.
    '''
    ranges = {}
    printlog("Build interval tree from annotation file: %s ..." % bed_file)
    for line in ireader.reader(bed_file):
        if line.startswith(("track", "#", "browser")):
            continue
        fields = line.rstrip('\n ').split()
        if len(fields) < 3:
            continue
        chrom = fields[0]
        try:
            start = int(fields[1])
            end = int(fields[2])
        except ValueError:
            # Header or malformed row: ignore it.
            continue

        # Validate every region, not only when a window is requested.
        if start < 0 or end < 0 or start > end:
            continue

        # The name always refers to the original (un-windowed) region.
        if len(fields) >= 4:
            name = fields[3]
        else:
            name = fields[0] + ':' + fields[1] + '-' + fields[2]

        if window_size > 0:
            mid = int(start + (end - start)/2.0)   # window middle position
            extension = int(window_size * 0.5)
            w_start = max(start, mid - extension)  # clamp to the region
            w_end = min(end, mid + extension)
            start, end = w_start, w_end

        if chrom not in ranges:
            ranges[chrom] = Intersecter()
        ranges[chrom].add_interval(Interval(start, end, value=name))
    return ranges
88
+
89
def findIntervals(chrom, start, end, obj):
    '''
    Return the sorted, de-duplicated values of all intervals overlapping
    chrom:start-end.

    obj is the dict of per-chromosome interval trees returned by
    "buildIntervalTree". An empty list is returned when chrom has no tree or
    nothing overlaps, so the return type is always a list.
    '''
    if chrom not in obj:
        return []
    overlaps = obj[chrom].find(int(start), int(end))
    return sorted({i.value for i in overlaps})
101
+
102
def main():
    '''
    Command-line entry point.

    Builds interval trees from the annotation file (-a), then copies every
    line of the CpG file (-i) that has at least 3 fields to
    "<output>.anno.txt", appending one column: the comma-joined names of the
    overlapping annotation regions, or "N/A" when there is none. With
    -l/--header, the first such line is treated as a header and gets the
    annotation file's base name appended instead.

    Exit codes: 101 (no -i), 102 (no -o), 103 (no -a).
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Input CpG file in BED3+ format.")
    parser.add_option("-a","--annotation",action="store",type="string",dest="anno_file",help="Input annotation file in BED3+ format.")
    parser.add_option("-w","--window",action="store",type='int', dest="window_size", default=0, help="Size of window centering on the middle-point of each genomic region defined in the annotation BED file (i.e., window_size*0.5 will be extended to up- and down-stream from the middle point of each genomic region). if WINDOW_SIZE = 0, use the original region. default=%default" )
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    parser.add_option("-l", "--header", action="store_true", dest="header", default=False, help="If True, the first row of input CpG file is header. default=%default")
    (options,args)=parser.parse_args()

    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.anno_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)
    tree = buildIntervalTree(options.anno_file, window_size = options.window_size)

    # "with" guarantees the output is flushed and closed even if a malformed
    # row raises while it is being parsed.
    with open(options.out_file + '.anno.txt','w') as OUT:
        line_num = 0   # counts only lines with >= 3 fields
        printlog("Reading CpG file: %s ..." % options.input_file)
        for line in ireader.reader(options.input_file):
            f = line.split()
            if len(f) < 3:
                continue
            line_num += 1
            if line_num == 1 and options.header:
                print (line + '\t' + basename(options.anno_file), file=OUT)
                continue

            overlaps = findIntervals(f[0], int(f[1]), int(f[2]), tree)
            if len(overlaps) > 0:
                print (line + '\t' + ','.join(overlaps), file=OUT)
            else:
                print (line + '\tN/A', file=OUT)
153
+
154
+ if __name__=='__main__':
155
+ main()
156
+
@@ -0,0 +1,112 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program adds comprehensive annotation information to each 450K/850K probe ID.
7
+ """
8
+
9
+ import sys,os
10
+ from optparse import OptionParser
11
+ from cpgmodule import ireader
12
+ from cpgmodule.utils import *
13
+ from cpgmodule._version import __version__
14
+
15
+ __author__ = "Liguo Wang"
16
+ __copyright__ = "Copyleft"
17
+ __credits__ = []
18
+ __license__ = "GPL"
19
+ __maintainer__ = "Liguo Wang"
20
+ __email__ = "wang.liguo@mayo.edu"
21
+ __status__ = "Development"
22
+
23
+
24
def read_annotation(infile):
    '''
    Read the probe annotation file.

    A line starting with "probeID" is taken as the header; its fields after
    the first become the list of annotation column names. For every other
    non-blank line, the first whitespace-separated field is the probe ID and
    the remaining fields are stored as one tab-joined string.

    Returns (head, cpg_infor): the list of column names (empty if no header
    line was seen) and a dict mapping probe ID -> tab-joined annotation.
    '''
    head = []
    cpg_infor = {}
    for l in ireader.reader(infile):
        if l.startswith('probeID'):
            head = l.split()[1:]
            continue
        f = l.split()
        if not f:
            # Blank line: there is no probe ID to index.
            continue
        cpg_infor[f[0]] = '\t'.join(f[1:])
    return (head, cpg_infor)
36
+
37
def main():
    '''
    Command-line entry point.

    Reads the annotation file (-a), then copies every line of the data file
    (-i) to "<output>.anno.txt" with the annotation of its probe ID appended.
    The probe ID is taken from column -p (0-based). Probes without annotation
    get "NA" in every annotation column.

    Header handling:
      - with -l/--header, the first line is a header and gets the annotation
        column names appended;
      - without it, a pseudo-header ("NA" for every data column, followed by
        the annotation column names) is written first, and the first line is
        then annotated like any other data row.

    Exit codes: 101 (no -i), 102 (no -a), 103 (no -o), 104 (data file not
    found), 105 (annotation file not found), 106 (probe column out of range).
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Input data file (Tab-separated) with a certain column containing 450K/850K array CpG IDs. This file can be regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-a","--annotation",action="store",type="string",dest="anno_file",help="Annotation file. This file can be regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="Prefix of the output file.")
    parser.add_option("-p","--probe_column",action="store",type='int', dest="probe_col",default=0, help="The number of column that contains probe IDs. Note: the column index starts with 0. default=%default.")
    parser.add_option("-l", "--header", action="store_true", dest="header", default=False, help="Input data file has a header row.")
    (options,args)=parser.parse_args()

    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.anno_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    if not os.path.isfile(options.input_file):
        print ("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.anno_file):
        # Report the annotation file name, not the data file name.
        print ("Input annotation file \"%s\" does not exist\n" % options.anno_file)
        sys.exit(105)

    printlog("Read annotation file \"%s\" ..." % (options.anno_file))
    (header, data)= read_annotation(options.anno_file)
    missing_anno = '\t'.join(['NA']*len(header))

    with open(options.out_file + '.anno.txt','w') as OUT:
        printlog("Add annotation information to \"%s\" ..." % (options.input_file))
        line_num = 0
        for l in ireader.reader(options.input_file):
            line_num += 1
            f = l.split()
            if line_num == 1:
                if options.header:
                    print (l + '\t' + '\t'.join(header), file=OUT)
                    continue
                # No header in the data file: write a pseudo-header, then fall
                # through so that this first data row is annotated as well.
                print ('\t'.join(['NA']*len(f)) + '\t' + '\t'.join(header), file=OUT)

            if options.probe_col >= len(f):
                print ("Error: column ID must be smaller than %d!" % len(f), file=sys.stderr)
                sys.exit(106)
            cgid = f[options.probe_col]
            print (l + '\t' + data.get(cgid, missing_anno), file=OUT)
93
+
94
+ if __name__=='__main__':
95
+ main()
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
@@ -0,0 +1,107 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program calculates the CpG density (count) profile over gene body as well as its up-
7
+ down-stream regions. It is useful to visualize how CpGs are distributed around genes.
8
+
9
+ Specifically, the up-stream region, gene region (from TSS to TES) and down-stream region
10
+ will be equally divided into 100 bins, then CpG count was aggregated over a total of 300 bins
11
+ from 5' to 3' (upstream bins, gene bins, downstream bins).
12
+ #==========================================================================================
13
+ """
14
+
15
+ import sys
16
+ import subprocess
17
+ from optparse import OptionParser
18
+ from cpgmodule import ireader
19
+ from cpgmodule.utils import *
20
+ from cpgmodule import BED
21
+ from cpgmodule import extend_bed
22
+ from cpgmodule._version import __version__
23
+
24
+ __author__ = "Liguo Wang"
25
+ __copyright__ = "Copyleft"
26
+ __credits__ = []
27
+ __license__ = "GPL"
28
+ __maintainer__ = "Liguo Wang"
29
+ __email__ = "wang.liguo@mayo.edu"
30
+ __status__ = "Development"
31
+
32
def main():
    '''
    Command-line entry point.

    Reads the CpG positions (-i) and the reference gene model (-r), computes
    the CpG count profile over the upstream, gene-body and downstream bins,
    and writes:
      - "<output>.tsv" : columns Group, Position, CpG_count;
      - "<output>.r"   : an R script that plots the profile to "<output>.pdf".
    Finally the R script is run with "Rscript"; a failure to run it is
    reported on stderr but does not abort the program.

    Exit codes: 101 (no -i), 102 (no -r), 103 (no -o).
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED file specifying the C position. This BED file should have at least three columns (Chrom, ChromStart, ChromeEnd). Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-r","--refgene",action="store",type="string",dest="gene_file",help="Reference gene model in standard BED6+ format.")
    parser.add_option("-d","--downstream",action="store",type="int",dest="downstream_size",default=2000,help="Maximum extension size from TES (transcription end site) to down-stream to define the \"downstream intergenic region (DIR)\". Note: (1) The actual used DIR size can be smaller because the extending process could stop earlier if it reaches the boundary of another nearby gene. (2) If the actual used DIR size is smaller than cutoff defined by \"-c/--SizeCut\", the gene will be skipped. default=%default (bp)")
    parser.add_option("-u","--upstream",action="store",type="int",dest="upstream_size",default=2000,help="Maximum extension size from TSS (transcription start site) to up-stream to define the \"upstream intergenic region (UIR)\". Note: (1) The actual used UIR size can be smaller because the extending process could stop earlier if it reaches the boundary of another nearby gene. (2) If the actual used UIR size is smaller than cutoff defined by \"-c/--SizeCut\", the gene will be skipped. default=%default (bp)")
    parser.add_option("-c","--SizeCut",action="store",type="int",dest="minimum_size",default=200,help="The minimum gene size. Gene size is defined as the genomic size between TSS and TES, including both exons and introns. default=%default (bp)")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()

    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.gene_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    tsv_file = options.out_file + '.tsv'
    r_file = options.out_file + '.r'

    #step1: read CpG file
    printlog("Reading CpG file: \"%s\"" % (options.input_file))
    cpg_ranges = read_CpG_bed(options.input_file)

    #step2: read gene file
    printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
    tmp1 = extend_bed.getBasalDomains(options.gene_file)
    tmp2 = extend_bed.geteExtendedDomains(basal_ranges = tmp1, bedfile = options.gene_file, up_ext=options.upstream_size, down_ext=options.downstream_size, min_gene = options.minimum_size, printit = False)

    printlog("Calculating CpG density ...")
    (up_density, gene_density, down_density) = density_over_range(tmp2, cpg_ranges)

    # Output files are only created once the inputs have been processed, and
    # "with" closes them even if writing fails.
    printlog("Wrting data to : \"%s\"" % (tsv_file))
    with open(tsv_file,'w') as FOUT:
        print ("Group\tPosition\tCpG_count", file=FOUT)
        for group, density in (("Upstream", up_density), ("GeneBody", gene_density), ("Downstream", down_density)):
            for ind in sorted(density):
                print (group + "\t" + str(ind) + '\t' + str(density[ind]), file = FOUT)

    with open(r_file,'w') as ROUT:
        print ('pdf(file=\"%s\", width=10, height=5)' % (options.out_file + '.pdf'),file=ROUT)
        print ("d <- read.table(file = '%s', header = T, sep='\\t')" % (tsv_file), file = ROUT)
        print ("x = 1:length(d$CpG_count)", file=ROUT)
        print ("plot(x,d$CpG_count,type='l',col='red',lwd=1,xaxt='n',ylab='CpG count',xlab='')", file=ROUT)
        print ("abline(v = c(102,203),col='blue', lty='dashed', lwd=0.5)", file=ROUT)
        print ("text(x=c(0,102,203)+50, y=0.1, labels=c('Upstream', 'geneBody','Downstream'))", file=ROUT)
        print ('dev.off()',file=ROUT)

    printlog("Running R script to: '%s'" % (r_file))
    # Pass an argument list (no shell) so that paths containing spaces or
    # shell metacharacters are handled safely. A missing Rscript raises
    # OSError; an R-side failure shows up as a non-zero return code.
    try:
        r_status = subprocess.call(["Rscript", r_file])
    except OSError:
        r_status = -1
    if r_status != 0:
        print ("Cannot generate pdf file from " + r_file, file=sys.stderr)
104
+
105
+ if __name__=='__main__':
106
+ main()
107
+