cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,154 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program calculates the distribution of CpG over chromosomes.
7
+ """
8
+
9
+
10
+ import sys,os
11
+ import collections
12
+ import subprocess
13
+ import numpy as np
14
+ from optparse import OptionParser
15
+ from cpgmodule import ireader
16
+ from cpgmodule.utils import *
17
+ from cpgmodule._version import __version__
18
+
19
+ __author__ = "Liguo Wang"
20
+ __copyright__ = "Copyleft"
21
+ __credits__ = []
22
+ __license__ = "GPL"
23
+ __maintainer__ = "Liguo Wang"
24
+ __email__ = "wang.liguo@mayo.edu"
25
+ __status__ = "Development"
26
+
27
def _ordered_counts(sample_counts, chrom_names):
    """Return one sample's CpG counts in the order of *chrom_names*.

    Chromosomes that are absent from *sample_counts* are reported as 0.
    """
    ordered = []
    for chrom in chrom_names:
        try:
            ordered.append(sample_counts[chrom])
        except KeyError:
            ordered.append(0)
    return ordered


def _run_rscript(script_path):
    """Run *script_path* with Rscript; report (but never raise on) failure."""
    try:
        # An argument list (no shell) passes output prefixes that contain
        # spaces or shell metacharacters to Rscript unchanged.
        status = subprocess.call(["Rscript", script_path])
    except OSError:
        # Rscript is not installed or cannot be executed.
        status = -1
    if status != 0:
        print ("Cannot generate pdf file from " + script_path, file=sys.stderr)


def main():
    """Count CpGs per chromosome for every input BED file and plot them.

    Writes "<prefix>.txt": one row per chromosome listed in the chromosome
    size file and one CpG count column per sample. When at most 12 samples
    are given, also writes "<prefix>.r" and runs it with Rscript to draw
    three bar plots (CpG count, CpG percent, CpG per Mb).

    Exit codes: 101, 102, 103 when -i, -s or -o is missing; 104 when an
    input file does not exist; 105 when -i and -n differ in number.
    """
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_files",action="store",type="string",dest="input_files",help="Input CpG file(s) in BED3+ format. Multiple BED files should be separated by \",\" (eg: \"-i file_1.bed,file_2.bed,file_3.bed\"). BED file can be a regular text file or compressed file (.gz, .bz2). The barplot figures will NOT be generated if you provide more than 12 samples (bed files). [required]")
    parser.add_option("-n","--names",action="store",type="string",dest="file_names",help="Shorter and meaningful names to label samples. Should be separated by \",\" and match CpG BED files in number. If not provided, basenames of CpG BED files will be used to label samples. [optional]")
    parser.add_option("-s","--chrom-size",action="store",type="string",dest="chrom_size",help="Chromosome size file. Tab or space separated text file with two columns: the first column is chromosome name/ID, the second column is chromosome size. This file will determine: (1) which chromosomes are included in the final bar plots, so do NOT include 'unplaced', 'alternative' contigs in this file. (2) The order of chromosomes in the final bar plots. [required]")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file. [required]")
    (options,args)=parser.parse_args()

    print ()

    # -i, -s and -o are mandatory: show the usage and stop with a distinct
    # exit code for the first one that is missing.
    for value, exit_code in ((options.input_files, 101), (options.chrom_size, 102), (options.out_file, 103)):
        if not value:
            print (__doc__)
            parser.print_help()
            sys.exit(exit_code)

    input_files = options.input_files.split(',')
    for path in input_files + [options.chrom_size]:
        if not os.path.exists(path):
            print ('\n' + path + " does NOT exists" + '\n',file=sys.stderr)
            sys.exit(104)

    # Sample labels default to the basenames of the BED files.
    if options.file_names:
        input_names = options.file_names.split(',')
    else:
        input_names = [os.path.basename(f) for f in input_files]
    if len(input_files) != len(input_names):
        print ('-i and -n don\'t match in number',file=sys.stderr)
        sys.exit(105)

    #step1: read chrom sizes files
    printlog("Reading chromosome size file: \"%s\"" % (options.chrom_size))
    cnames,csizes = read_chromSize(options.chrom_size)
    for cname,csize in zip(cnames,csizes):
        print(" " + cname + '\t' + str(csize))

    #step2: read CpG files
    dat = {}    # sampleName -> {chromName: CpG count}
    for f,n in zip(input_files, input_names):
        printlog("Reading CpG BED file \"%s\" named \"%s\"" % (f,n))
        dat[n] = chrom_count(f)
    # One list of counts per sample, ordered like cnames.
    per_sample = [_ordered_counts(dat[n], cnames) for n in input_names]

    #step3: write matrix to file
    table_file = options.out_file + '.txt'
    printlog("Save CpG count to \"%s\"" % (table_file))
    with open(table_file,'w') as FOUT:
        print ("chromID\tchromSize\t" + '\t'.join([ n + '.CpG_count' for n in input_names]), file=FOUT)
        for idx,(cname,csize) in enumerate(zip(cnames,csizes)):
            row = [cname.replace('chr',''), csize] + [counts[idx] for counts in per_sample]
            print ('\t'.join([str(i) for i in row]), file=FOUT)

    #step 4: print R script
    if len(input_names) > 12:
        # Too many samples for a readable grouped bar plot.
        print ("Cannot generate R script file and pdf files.", file=sys.stderr)
        return

    script_file = options.out_file + '.r'
    printlog("Generate R script, save to \"%s\"" % (script_file))
    # Backtick-quote the R variable names: sample names such as
    # "sample-1.bed" are not syntactic R identifiers on their own.
    input_names2 = ['`X_' + i + '`' for i in input_names]
    legend = ','.join(['"' + i + '"' for i in input_names])
    # (pdf suffix, y-axis label, one R expression per sample)
    plots = (
        ('.CpG_total.pdf', 'CpG count', input_names2),
        ('.CpG_percent.pdf', 'CpG percent', [ i + '*100.0/sum(' + i + ')' for i in input_names2]),
        ('.CpG_perMb.pdf', 'CpG per Mb', [ i + '*1000000.0/chromSizes' for i in input_names2]),
    )
    with open(script_file,'w') as ROUT:
        print ("chromNames = c(%s)" % (','.join(['"' + i.replace('chr','') + '"' for i in cnames])),file=ROUT)
        print ("chromSizes = c(%s)" % (','.join([str(i) for i in csizes])),file=ROUT)

        for n2,counts in zip(input_names2, per_sample):
            print ("%s = c(%s)" % (n2, ','.join([str(i) for i in counts])), file=ROUT)

        my_col = colors(len(input_names))
        print ('cols = c(%s)' % ','.join(my_col),file=ROUT)

        for suffix,ylab,series in plots:
            print ('pdf(\"%s\", width=12, height=6)' % (options.out_file + suffix), file=ROUT)
            print ('barplot(rbind(%s),col=cols,beside=T,names.arg=%s, xlab="Chromosome", ylab="%s", legend.text=c(%s), cex.names=0.5, cex.axis=0.6)' % (','.join(series), 'chromNames', ylab, legend), file=ROUT)
            print ('dev.off()', file=ROUT)

    #step 5: Run R script
    printlog("Running R script ...")
    _run_rscript(script_file)
148
+
149
+
150
+
151
+
152
+ if __name__=='__main__':
153
+ main()
154
+
@@ -0,0 +1,193 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program calculates the distribution of CpG over gene-centered genomic regions
7
+ including 'Coding exons', 'UTR exons', 'Introns', 'Upstream intergenic regions', and
8
+ 'Downstream intergenic regions'.
9
+
10
+ Notes
11
+ -----
12
+ Please note, a particular genomic region can be assigned to different groups listed above,
13
+ because most genes have multiple transcripts, and different genes could overlap on the
14
+ genome. For example, an exon of gene A could be located in an intron of gene B. To address
15
+ this issue, we define the priority order as below:
16
+ 0) Coding exons
17
+ 1) UTR exons
18
+ 2) Introns
19
+ 3) Upstream intergenic regions
20
+ 4) Downstream intergenic regions
21
+ Higher-priority groups override lower-priority groups. For example, if a certain part
22
+ of an intron is overlapped with exon of other transcripts/genes, the overlapped part will
23
+ be considered as an exon (i.e., removed from intron) since "exon" has higher priority.
24
+
25
+ #=========================================================================================
26
+ """
27
+
28
+
29
+ import sys,os
30
+ import collections
31
+ import subprocess
32
+ import numpy as np
33
+ from optparse import OptionParser
34
+ from cpgmodule import ireader
35
+ from cpgmodule.utils import *
36
+ from cpgmodule import BED
37
+ from cpgmodule._version import __version__
38
+
39
+ __author__ = "Liguo Wang"
40
+ __copyright__ = "Copyleft"
41
+ __credits__ = []
42
+ __license__ = "GPL"
43
+ __maintainer__ = "Liguo Wang"
44
+ __email__ = "wang.liguo@mayo.edu"
45
+ __status__ = "Development"
46
+
47
def _tally_regions(order, label, regions, cpg_ranges):
    """Build one result row for a class of genomic regions.

    Returns [order, label, number of regions, total size (bp), CpG count,
    CpG count per Kb], based on the (size, count) pair returned by
    count_over_range().
    """
    (size,count) = count_over_range(regions, cpg_ranges)
    # A class can end up empty (e.g. fully covered by higher-priority
    # classes); report a density of 0 instead of dividing by zero.
    density = count*1000.0/size if size else 0.0
    return [order, label, len(regions), size, count, density]


def _subtract_all(regions, higher_priority):
    """Remove from *regions* everything covered by the higher-priority sets."""
    for other in higher_priority:
        regions = BED.subtractBed3(regions, other)
    return regions


def _run_rscript(script_path):
    """Run *script_path* with Rscript; report (but never raise on) failure."""
    try:
        # An argument list (no shell) passes output prefixes that contain
        # spaces or shell metacharacters to Rscript unchanged.
        status = subprocess.call(["Rscript", script_path])
    except OSError:
        # Rscript is not installed or cannot be executed.
        status = -1
    if status != 0:
        print ("Cannot generate pdf file from " + script_path, file=sys.stderr)


def main():
    """Count CpGs in gene-centered region classes and plot CpG density.

    The classes are processed in priority order (coding exons, UTR exons,
    introns, upstream, downstream); each class has all higher-priority
    classes subtracted before its CpGs are counted. Writes "<prefix>.tsv"
    (the summary table) and "<prefix>.r", then runs the R script with
    Rscript to produce "<prefix>.pdf".

    Exit codes: 101, 102, 103 when -i, -r or -o is missing.
    """
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED file specifying the C position. This BED file should have at least three columns (Chrom, ChromStart, ChromeEnd). Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-r","--refgene",action="store",type="string",dest="gene_file",help="Reference gene model in standard BED-12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). ")
    parser.add_option("-d","--downstream",action="store",type="int",dest="downstream_size",default=2000,help="Size of down-stream intergenic region w.r.t. TES (transcription end site). default=%default (bp)")
    parser.add_option("-u","--upstream",action="store",type="int",dest="upstream_size",default=2000,help="Size of up-stream intergenic region w.r.t. TSS (transcription start site). default=%default (bp)")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()

    # -i, -r and -o are mandatory: show the usage and stop with a distinct
    # exit code for the first one that is missing.
    for value, exit_code in ((options.input_file, 101), (options.gene_file, 102), (options.out_file, 103)):
        if not value:
            print (__doc__)
            parser.print_help()
            sys.exit(exit_code)

    script_file = options.out_file + '.r'
    # Both outputs are opened before the (long) computation, as before, so an
    # unwritable prefix fails early; "with" closes them even on error.
    with open(options.out_file + '.tsv','w') as FOUT, open(script_file,'w') as ROUT:
        #step1: read CpG file
        printlog("Reading CpG file: \"%s\"" % (options.input_file))
        cpg_ranges = read_CpG_bed(options.input_file)

        #step2: read gene file
        printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
        ref_gene = BED.ParseBED(options.gene_file)

        result = [("Priority_order", "Name", "Number_of_regions", "Size_of_regions(bp)", "CpG_raw_count", "CpG_count_per_KB")]

        #priority order: #1
        printlog("Extract Coding exons ...")
        cds_exons = ref_gene.getCDSExons(stranded=False)
        printlog("Merge Coding exons ...")
        cds_exons = BED.unionBed3(cds_exons)
        printlog("Count CpGs in Coding exons ...")
        result.append(_tally_regions('0','Coding exons', cds_exons, cpg_ranges))

        #priority order: #2
        printlog("Extract UTR exons ...")
        utr_exons = ref_gene.getUTRs(utr=35, uniquify=True, stranded = False)
        printlog("Merge UTR exons ...")
        utr_exons = BED.unionBed3(utr_exons)
        printlog("Subtract regions with higher priority from UTR exons ...")
        utr_exons = _subtract_all(utr_exons, [cds_exons])
        printlog("Count CpGs in UTR exons ...")
        result.append(_tally_regions('1','UTR exons', utr_exons, cpg_ranges))

        #priority order: #3
        printlog("Extract introns ...")
        introns = ref_gene.getIntrons(itype='all', uniquify=True, stranded=False)
        printlog("Merge introns ...")
        introns = BED.unionBed3(introns)
        printlog("Subtract regions with higher priority from introns ...")
        introns = _subtract_all(introns, [cds_exons, utr_exons])
        printlog("Count CpGs in introns ...")
        result.append(_tally_regions('2','Introns', introns, cpg_ranges))

        #priority order: #4
        printlog("Extract upstream intergenic regions ...")
        upstream = ref_gene.getIntergenic(direction='up', size=options.upstream_size, uniquify=True, stranded = False)
        printlog("Merge upstream intergenic regions ...")
        upstream = BED.unionBed3(upstream)
        printlog("Subtract regions with higher priority from upstream intergenic regions...")
        upstream = _subtract_all(upstream, [cds_exons, utr_exons, introns])
        printlog("Count CpGs in upstream regions ...")
        result.append(_tally_regions('3','Upstream of TSS', upstream, cpg_ranges))

        #priority order: #5
        printlog("Extract downstream intergenic regions ...")
        downstream = ref_gene.getIntergenic(direction='down', size=options.downstream_size, uniquify=True, stranded = False)
        printlog("Merge downstream intergenic regions ...")
        downstream = BED.unionBed3(downstream)
        printlog("Subtract regions with higher priority from downstream intergenic regions...")
        downstream = _subtract_all(downstream, [cds_exons, utr_exons, introns, upstream])
        printlog("Count CpGs in downstream regions ...")
        result.append(_tally_regions('4','Downstream of TES', downstream, cpg_ranges))

        print('\n')
        for row in result:
            print ('\t'.join([str(i) for i in row]), file=FOUT)

        # result[0] is the header row; only the data rows go into the plot.
        names = [row[0] for row in result[1:]]      # priority orders: '0'..'4'
        labels = [row[1] for row in result[1:]]     # class names
        density = [row[5] for row in result[1:]]    # CpG per Kb

        print("name = c(%s)" % ','.join(['"' + i + '"' for i in names]), file=ROUT)
        print("values = c(%s)" % ','.join([str(i) for i in density]), file=ROUT)
        print ('pdf("%s", width=8, height=6)' % (options.out_file + '.pdf'), file=ROUT)
        print ('layout(matrix(c(1,1,2,1,1,2), nrow=2, byrow=TRUE))', file=ROUT)
        print ('barplot(values,names.arg=name,col=c(%s),ylab="CpG per Kb")' % ','.join(colors(5)), file=ROUT)
        # Empty second panel used as a text legend: one "order = label" line per class.
        print ("plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')", file=ROUT)
        for name,label in zip(names, labels):
            x_pos = 0.0
            y_pos = 1-(int(name)*20.0 +5)/100
            print ("text(x=%f, y=%f, labels=c(\"%s = %s\"),adj=c(0,0))" % (x_pos, y_pos,name,label), file=ROUT)
        print ('dev.off()', file=ROUT)

    printlog("Running R script ...")
    _run_rscript(script_file)
190
+
191
+ if __name__=='__main__':
192
+ main()
193
+
@@ -0,0 +1,146 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program calculates the distribution of CpG over user-specified genomic regions.
7
+
8
+ Notes
9
+ ------
10
+ 1. A maximum of 10 BED files (define 10 different genomic regions) can be analyzed
11
+ together.
12
+ 2. The *order* of BED files determines the *priority order*. Overlapped
13
+ genomic regions will be kept in the BED file with the highest priority and removed
14
+ from BED files of lower priorities. For example, users provided 3 BED files via "-i
15
+ promoters.bed,enhancers.bed,intergenic.bed", then if an enhancer region is overlapped
16
+ with promoters, *the overlapped part* will be removed from "enhancers.bed".
17
+ 3. BED files can be regular or compressed by 'gzip' or 'bzip2'.
18
+ #=========================================================================================
19
+ """
20
+
21
+
22
+ import sys,os
23
+ import collections
24
+ import subprocess
25
+ import numpy as np
26
+ from optparse import OptionParser
27
+ from cpgmodule import ireader
28
+ from cpgmodule.utils import *
29
+ from cpgmodule import BED
30
+ from cpgmodule._version import __version__
31
+
32
+ __author__ = "Liguo Wang"
33
+ __copyright__ = "Copyleft"
34
+ __credits__ = []
35
+ __license__ = "GPL"
36
+ __maintainer__ = "Liguo Wang"
37
+ __email__ = "wang.liguo@mayo.edu"
38
+ __status__ = "Development"
39
+
40
def _run_rscript(script_path):
    """Run *script_path* with Rscript; report (but never raise on) failure."""
    try:
        # An argument list (no shell) passes output prefixes that contain
        # spaces or shell metacharacters to Rscript unchanged.
        status = subprocess.call(["Rscript", script_path])
    except OSError:
        # Rscript is not installed or cannot be executed.
        status = -1
    if status != 0:
        print ("Cannot generate pdf file from " + script_path, file=sys.stderr)


def main():
    """Count CpGs in user-supplied region sets and plot CpG density.

    The BED files given with -b are processed in order; each one is merged
    and then has every earlier (higher-priority) file subtracted from it
    before its CpGs are counted. Writes "<prefix>.txt" (the summary table)
    and "<prefix>.r", then runs the R script with Rscript to produce
    "<prefix>.pdf".

    Exit codes: 101 when -i or -b is missing, 102 when -o is missing,
    103 when a BED file listed in -b does not exist.
    """
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--cpg",action="store",type="string",dest="cpg_file",help="BED file specifying the C position. This BED file should have at least three columns (Chrom, ChromStart, ChromeEnd). Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-b","--bed",action="store",type="string",dest="bed_files",help="List of comma separated BED files specifying the genomic regions.")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()

    # All three options are mandatory. -i and -b share exit code 101; the
    # codes are kept as they are so existing wrappers keep working.
    for value, exit_code in ((options.cpg_file, 101), (options.bed_files, 101), (options.out_file, 102)):
        if not value:
            print (__doc__)
            parser.print_help()
            sys.exit(exit_code)

    script_file = options.out_file + '.r'
    # Both outputs are opened before the computation, as before, so an
    # unwritable prefix fails early; "with" closes them even on error.
    with open(options.out_file + '.txt','w') as FOUT, open(script_file,'w') as ROUT:
        #step1: read CpG file
        printlog("Reading CpG file: \"%s\"" % (options.cpg_file))
        cpg_ranges = read_CpG_bed(options.cpg_file)

        #step2: check BED file
        printlog("Checking BED files: \"%s\"" % (options.bed_files))
        input_bed_files = options.bed_files.replace(' ','').split(',')
        for bed_file in input_bed_files:
            if os.path.exists(bed_file):
                print("\t%s" % bed_file, file=sys.stderr)
            else:
                print("\"%s\" does not exist!" % bed_file, file=sys.stderr)
                sys.exit(103)

        #step3: read, merge, and subtract BED file
        merged = []     # merged[i]: regions of the i-th BED file after merging and subtraction
        result = [("Priority_order", "Name", "Number_of_regions", "Size_of_regions(bp)", "CpG_raw_count", "CpG_count_per_KB")]

        for i,bed_file in enumerate(input_bed_files):
            printlog("Reading BED file: \"%s\"" % (bed_file))
            file_name = os.path.basename(bed_file)
            regions = read_bed_as_list(bed_file)
            printlog("Merging overlap entries in BED file: \"%s\"" % (bed_file))
            regions = BED.unionBed3(regions)

            # Files listed earlier have higher priority: remove their regions.
            for j in range(i):
                printlog("Subtract \"%s\" from \"%s\"" % (input_bed_files[j], bed_file))
                regions = BED.subtractBed3(regions, merged[j])
            merged.append(regions)

            printlog("Counting CpGs ...")
            (size,count) = count_over_range(regions, cpg_ranges)
            # An empty BED file, or one fully covered by higher-priority
            # files, has size 0; report a density of 0 instead of crashing.
            density = count*1000.0/size if size else 0.0
            result.append([str(i), file_name, len(regions), size, count, density]) #Class, number_of_region, size_of_region, CpG_raw_count, CpG_count_perKb

        print('\n')
        for row in result:
            print ('\t'.join([str(i) for i in row]), file=FOUT)

        # result[0] is the header row; only the data rows go into the plot.
        names = [row[0] for row in result[1:]]      # priority orders: '0', '1', ...
        labels = [row[1] for row in result[1:]]     # BED file basenames
        density = [row[5] for row in result[1:]]    # CpG per Kb

        print("name = c(%s)" % ','.join(['"' + i + '"' for i in names]), file=ROUT)
        print("values = c(%s)" % ','.join([str(i) for i in density]), file=ROUT)
        print ('pdf("%s", width=8, height=6)' % (options.out_file + '.pdf'), file=ROUT)
        print ('layout(matrix(c(1,1,2,1,1,2), nrow=2, byrow=TRUE))', file=ROUT)
        print ('barplot(values,names.arg=name,col="blue",ylab="CpG per Kb")', file=ROUT)
        # Empty second panel used as a text legend: one "order = file name" line per BED file.
        print ("plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')", file=ROUT)
        for name,label in zip(names, labels):
            x_pos = 0.0
            y_pos = 1-(int(name)*9.0 +5)/200
            print ("text(x=%f, y=%f, labels=c(\"%s = %s\"),adj=c(0,0))" % (x_pos, y_pos,name,label), file=ROUT)
        print ('dev.off()', file=ROUT)

    printlog("Running R script ...")
    _run_rscript(script_file)
143
+
144
+ if __name__=='__main__':
145
+ main()
146
+
@@ -0,0 +1,134 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program generates the DNA motif logo for a given set of CpGs.
7
+ """
8
+
9
+
10
+ import sys,os
11
+ import collections
12
+ import subprocess
13
+ import numpy as np
14
+ import pysam
15
+ from optparse import OptionParser
16
+ from cpgmodule import ireader
17
+ from cpgmodule.utils import *
18
+ from cpgmodule import BED
19
+ from cpgmodule.imotif import PSSM
20
+ from cpgmodule._version import __version__
21
+
22
+ __author__ = "Liguo Wang"
23
+ __copyright__ = "Copyleft"
24
+ __credits__ = []
25
+ __license__ = "GPL"
26
+ __maintainer__ = "Liguo Wang"
27
+ __email__ = "wang.liguo@mayo.edu"
28
+ __status__ = "Development"
29
+
30
def _run_weblogo(fasta_file, logo_file, logo_format, title):
    """Render *fasta_file* as a sequence logo; return True on success."""
    # An argument list (no shell) keeps file names and titles that contain
    # spaces or shell metacharacters intact.
    cmd = ["weblogo", "--format", logo_format, "-D", "fasta", "-c", "classic", "-s", "large", "-f", fasta_file, "-o", logo_file, "-t", title]
    try:
        return subprocess.call(cmd) == 0
    except OSError:
        # weblogo is not installed or cannot be executed.
        return False


def main():
    """Extract the sequence around each CpG and build a motif from it.

    For every usable line of the input BED file, the reference sequence
    from (end - extend - 1) to (end + extend) is written to "<prefix>.fa"
    (reverse-complemented for '-' strand sites). The FASTA file is then
    rendered with weblogo ("<prefix>.logo.pdf" and "<prefix>.logo.png") and
    converted with PSSM to "<prefix>.pfm", ".ppm", ".pwm", ".jaspar" and
    ".meme".

    Exit codes: 101, 102, 103 when -i, -r or -o is missing.
    """
    print (__doc__)
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED file specifying the C position. This BED file should have at least six columns (Chrom, ChromStart, ChromeEnd, name, score, strand). Note: Must provide correct *strand* information. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-r","--refgenome",action="store",type="string",dest="genome_file",help="Reference genome seqeunces in FASTA format. Must be indexed using the samtools \"faidx\" command. ")
    parser.add_option("-e","--extend",action="store",type="int",dest="extend_size",default=5,help="Number of bases extended to up- and down-stream. default=%default (bp)")
    parser.add_option("-n","--name",action="store",type='string', dest="motif_name",default='motif', help="Motif name. default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()

    # Validate every mandatory option before doing any work (in particular
    # before building the genome index, which writes next to the FASTA file).
    for value, exit_code in ((options.input_file, 101), (options.genome_file, 102), (options.out_file, 103)):
        if not value:
            parser.print_help()
            sys.exit(exit_code)

    #index reference genome file if it hasn't been done
    if not os.path.exists(options.genome_file + '.fai'):
        printlog("Creating index for %s" % options.genome_file)
        pysam.faidx(options.genome_file)

    fasta_file = options.out_file + '.fa'
    refFasta = pysam.Fastafile(options.genome_file)
    try:
        with open(fasta_file,'w') as FOUT:
            printlog("Reading %s ..." % options.input_file)
            for l in ireader.reader(options.input_file):
                # Skip comments and UCSC "track" / "browser" header lines.
                if l.startswith(('#', 'track', 'browser')):
                    continue
                f = l.split()
                # Any field that is exactly '-' marks the minus strand,
                # whichever column it is in; everything else is '+'.
                strand = '-' if '-' in f else '+'
                try:
                    chrom = f[0]
                    position = int(f[2])
                except (IndexError, ValueError):
                    # Without "continue" the values of the previous line
                    # would be reused (or undefined on the first line).
                    print ("BED should have at least 3 columns with an integer end position. Skip: " + l, file=sys.stderr)
                    continue

                start = position - options.extend_size - 1
                end = position + options.extend_size
                # Drop windows that start before the chromosome or are inverted.
                if start < 0 or start > end:
                    continue

                fa_name = '>' + '_'.join([str(i) for i in (chrom,start,end,strand)])
                fa_seq = refFasta.fetch(chrom, start, end).upper()
                if strand == '-':
                    fa_seq = revcomp(fa_seq)
                print (fa_name,file=FOUT)
                print (fa_seq,file=FOUT)
    finally:
        refFasta.close()

    printlog("Generate motif logo ... ")
    pdf_logo = options.out_file + '.logo.pdf'
    png_logo = options.out_file + '.logo.png'
    pdf_ok = _run_weblogo(fasta_file, pdf_logo, "PDF", options.motif_name)
    png_ok = _run_weblogo(fasta_file, png_logo, "PNG", options.motif_name)
    if pdf_ok and png_ok:
        printlog("Motif logo saved to \"%s\" and \"%s\"" % (pdf_logo, png_logo))
    else:
        # Best effort: the matrix files below are still written.
        print ("Cannot run weblogo. Please install weblogo (https://github.com/WebLogo/weblogo)", file=sys.stderr)

    m = PSSM(sites=fasta_file, name = options.motif_name)

    # (description used in the log message, file suffix, PSSM writer method)
    exports = (
        ("position frequency matrix (PFM)", '.pfm', m.toPFM),
        ("position probability matrix (PPM)", '.ppm', m.toPPM),
        ("position weight matrix (PWM)", '.pwm', m.toPWM),
        ("Jaspar format matrix", '.jaspar', m.toJaspar),
        ("MEME format matrix", '.meme', m.toMEME),
    )
    for description,suffix,writer in exports:
        printlog("Write %s to \"%s\"" % (description, options.out_file + suffix))
        with open(options.out_file + suffix, 'w') as FF:
            writer(FOUT=FF)
130
+
131
+
132
+ if __name__=='__main__':
133
+ main()
134
+