cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program calculates the distribution of CpG over chromosomes.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
import sys,os
|
|
11
|
+
import collections
|
|
12
|
+
import subprocess
|
|
13
|
+
import numpy as np
|
|
14
|
+
from optparse import OptionParser
|
|
15
|
+
from cpgmodule import ireader
|
|
16
|
+
from cpgmodule.utils import *
|
|
17
|
+
from cpgmodule._version import __version__
|
|
18
|
+
|
|
19
|
+
__author__ = "Liguo Wang"
|
|
20
|
+
__copyright__ = "Copyleft"
|
|
21
|
+
__credits__ = []
|
|
22
|
+
__license__ = "GPL"
|
|
23
|
+
__maintainer__ = "Liguo Wang"
|
|
24
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
25
|
+
__status__ = "Development"
|
|
26
|
+
|
|
27
|
+
def main():
    """Count CpGs per chromosome for one or more BED files and plot them.

    Parses the command line, then:

    1. reads the chromosome-size file (it fixes which chromosomes are
       reported and in which order);
    2. counts CpGs per chromosome in every input BED file;
    3. writes ``<prefix>.txt`` with one row per chromosome and one
       ``<name>.CpG_count`` column per sample;
    4. if at most 12 samples were given, writes ``<prefix>.r`` and runs it
       with ``Rscript`` to draw the total / percent / per-Mb bar plots.

    Exits with status 101-103 when a required option is missing, 104 when an
    input file does not exist and 105 when -i and -n differ in number.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_files",action="store",type="string",dest="input_files",help="Input CpG file(s) in BED3+ format. Multiple BED files should be separated by \",\" (eg: \"-i file_1.bed,file_2.bed,file_3.bed\"). BED file can be a regular text file or compressed file (.gz, .bz2). The barplot figures will NOT be generated if you provide more than 12 samples (bed files). [required]")
    parser.add_option("-n","--names",action="store",type="string",dest="file_names",help="Shorter and meaningful names to label samples. Should be separated by \",\" and match CpG BED files in number. If not provided, basenames of CpG BED files will be used to label samples. [optional]")
    parser.add_option("-s","--chrom-size",action="store",type="string",dest="chrom_size",help="Chromosome size file. Tab or space separated text file with two columns: the first column is chromosome name/ID, the second column is chromosome size. This file will determine: (1) which chromosomes are included in the final bar plots, so do NOT include 'unplaced', 'alternative' contigs in this file. (2) The order of chromosomes in the final bar plots. [required]")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file. [required]")
    (options, args) = parser.parse_args()

    print()

    # Each missing required option has its own exit status.
    for value, status in ((options.input_files, 101),
                          (options.chrom_size, 102),
                          (options.out_file, 103)):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(status)

    input_files = options.input_files.split(',')
    for i in input_files + [options.chrom_size]:
        if not os.path.exists(i):
            print('\n' + i + " does NOT exists" + '\n', file=sys.stderr)
            sys.exit(104)

    # Sample labels: user supplied, otherwise the BED file basenames.
    if options.file_names:
        input_names = options.file_names.split(',')
    else:
        input_names = [os.path.basename(f) for f in input_files]
    if len(input_files) != len(input_names):
        print('-i and -n don\'t match in number', file=sys.stderr)
        sys.exit(105)

    # step1: read chrom sizes files
    printlog("Reading chromosome size file: \"%s\"" % (options.chrom_size))
    cnames, csizes = read_chromSize(options.chrom_size)
    for cname, csize in zip(cnames, csizes):
        print(" " + cname + '\t' + str(csize))

    # step2: read CpG files
    dat = collections.defaultdict(dict)  # sampleName -> chromName -> CpG count
    for f, n in zip(input_files, input_names):
        printlog("Reading CpG BED file \"%s\" named \"%s\"" % (f, n))
        dat[n] = chrom_count(f)

    def cpg_count(sample, chrom):
        # A chromosome that never occurs in a sample is reported as 0.
        try:
            return dat[sample][chrom]
        except KeyError:
            return 0

    # step3: write matrix to file
    table_path = options.out_file + '.txt'
    printlog("Save CpG count to \"%s\"" % table_path)
    with open(table_path, 'w') as FOUT:
        print("chromID\tchromSize\t" + '\t'.join([n + '.CpG_count' for n in input_names]), file=FOUT)
        for cname, csize in zip(cnames, csizes):
            row = [cname.replace('chr', ''), csize]
            row.extend(cpg_count(n, cname) for n in input_names)
            print('\t'.join([str(i) for i in row]), file=FOUT)

    # step 4: print R script (bar plots are only drawn for <= 12 samples)
    if len(input_names) > 12:
        print("Cannot generate R script file and pdf files.", file=sys.stderr)
        return

    r_path = options.out_file + '.r'
    printlog("Generate R script, save to \"%s\"" % r_path)
    # NOTE(review): sample names are used verbatim (after an 'X_' prefix) as
    # R variable names; a name that is not a valid R identifier (for example
    # one containing '-') will make the generated script fail in R.
    input_names2 = ['X_' + i for i in input_names]
    legend = ','.join(['"' + i + '"' for i in input_names])
    with open(r_path, 'w') as ROUT:
        print("chromNames = c(%s)" % (','.join(['"' + i.replace('chr', '') + '"' for i in cnames])), file=ROUT)
        print("chromSizes = c(%s)" % (','.join([str(i) for i in csizes])), file=ROUT)

        for n1, n2 in zip(input_names, input_names2):
            counts = [cpg_count(n1, cname) for cname in cnames]
            print("%s = c(%s)" % (n2, ','.join([str(i) for i in counts])), file=ROUT)

        my_col = colors(len(input_names))
        print('cols = c(%s)' % ','.join(my_col), file=ROUT)

        # (pdf suffix, y-axis label, one R expression per sample)
        plots = (
            ('.CpG_total.pdf', 'CpG count', input_names2),
            ('.CpG_percent.pdf', 'CpG percent', [i + '*100.0/sum(' + i + ')' for i in input_names2]),
            ('.CpG_perMb.pdf', 'CpG per Mb', [i + '*1000000.0/chromSizes' for i in input_names2]),
        )
        for suffix, ylab, series in plots:
            print('pdf(\"%s\", width=12, height=6)' % (options.out_file + suffix), file=ROUT)
            print('barplot(rbind(%s),col=cols,beside=T,names.arg=%s, xlab="Chromosome", ylab="%s", legend.text=c(%s), cex.names=0.5, cex.axis=0.6)' % (','.join(series), 'chromNames', ylab, legend), file=ROUT)
            print('dev.off()', file=ROUT)

    # step 5: Run R script
    printlog("Running R script ...")
    # argv-list form: no shell involved, so prefixes containing spaces or
    # shell metacharacters are passed to Rscript unchanged.
    try:
        status = subprocess.call(["Rscript", r_path])
    except OSError:
        # Rscript is not installed / not executable.
        status = None
    if status != 0:
        print("Cannot generate pdf file from " + r_path, file=sys.stderr)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
if __name__=='__main__':
|
|
153
|
+
main()
|
|
154
|
+
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program calculates the distribution of CpG over gene-centered genomic regions
|
|
7
|
+
including 'Coding exons', 'UTR exons', 'Introns', ' Upstream intergenic regions', and
|
|
8
|
+
'Downstream intergenic regions'.
|
|
9
|
+
|
|
10
|
+
Notes
|
|
11
|
+
-----
|
|
12
|
+
Please note, a particular genomic region can be assigned to different groups listed above,
|
|
13
|
+
because most genes have multiple transcripts, and different genes could overlap on the
|
|
14
|
+
genome. For example, an exon of gene A could be located in an intron of gene B. To address
|
|
15
|
+
this issue, we define the priority order as below:
|
|
16
|
+
0) Coding exons
|
|
17
|
+
1) UTR exons
|
|
18
|
+
2) Introns
|
|
19
|
+
3) Upstream intergenic regions
|
|
20
|
+
4) Downstream intergenic regions
|
|
21
|
+
Higher-priority group override the low-priority group. For example, if a certain part
|
|
22
|
+
of an intron is overlapped with exon of other transcripts/genes, the overlapped part will
|
|
23
|
+
be considered as an exon (i.e., removed from intron) since "exon" has higher priority.
|
|
24
|
+
|
|
25
|
+
#=========================================================================================
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
import sys,os
|
|
30
|
+
import collections
|
|
31
|
+
import subprocess
|
|
32
|
+
import numpy as np
|
|
33
|
+
from optparse import OptionParser
|
|
34
|
+
from cpgmodule import ireader
|
|
35
|
+
from cpgmodule.utils import *
|
|
36
|
+
from cpgmodule import BED
|
|
37
|
+
from cpgmodule._version import __version__
|
|
38
|
+
|
|
39
|
+
__author__ = "Liguo Wang"
|
|
40
|
+
__copyright__ = "Copyleft"
|
|
41
|
+
__credits__ = []
|
|
42
|
+
__license__ = "GPL"
|
|
43
|
+
__maintainer__ = "Liguo Wang"
|
|
44
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
45
|
+
__status__ = "Development"
|
|
46
|
+
|
|
47
|
+
def _cpg_per_kb(count, size):
    """Return CpGs per kilobase, or 0.0 when the region set has size 0."""
    if not size:
        return 0.0
    return count * 1000.0 / size


def main():
    """Report CpG density over gene-centered region classes and plot it.

    Region classes are built from the BED-12 gene model in priority order
    (coding exons, UTR exons, introns, upstream, downstream). Every class
    is merged and then has all higher-priority classes subtracted from it
    before CpGs are counted.

    Writes ``<prefix>.tsv`` (one row per class) and ``<prefix>.r``, then
    runs the R script with ``Rscript`` to produce ``<prefix>.pdf``.

    Exits with status 101, 102 or 103 when -i, -r or -o is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED file specifying the C position. This BED file should have at least three columns (Chrom, ChromStart, ChromeEnd). Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-r","--refgene",action="store",type="string",dest="gene_file",help="Reference gene model in standard BED-12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). ")
    parser.add_option("-d","--downstream",action="store",type="int",dest="downstream_size",default=2000,help="Size of down-stream intergenic region w.r.t. TES (transcription end site). default=%default (bp)")
    parser.add_option("-u","--upstream",action="store",type="int",dest="upstream_size",default=2000,help="Size of up-stream intergenic region w.r.t. TSS (transcription start site). default=%default (bp)")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()

    for value, status in ((options.input_file, 101),
                          (options.gene_file, 102),
                          (options.out_file, 103)):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(status)

    # step1: read CpG file
    printlog("Reading CpG file: \"%s\"" % (options.input_file))
    cpg_ranges = read_CpG_bed(options.input_file)

    # step2: read gene file
    printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
    ref_gene = BED.ParseBED(options.gene_file)

    result = [("Priority_order", "Name", "Number_of_regions", "Size_of_regions(bp)", "CpG_raw_count", "CpG_count_per_KB")]

    def add_row(order, label, regions):
        # Row layout: class, label, number of regions, total size,
        # raw CpG count, CpG count per Kb.
        (size, count) = count_over_range(regions, cpg_ranges)
        result.append([order, label, len(regions), size, count, _cpg_per_kb(count, size)])

    # priority order: #1
    printlog("Extract Coding exons ...")
    cds_exons = ref_gene.getCDSExons(stranded=False)
    printlog("Merge Coding exons ...")
    cds_exons = BED.unionBed3(cds_exons)
    printlog("Count CpGs in Coding exons ...")
    add_row('0', 'Coding exons', cds_exons)

    # priority order: #2
    printlog("Extract UTR exons ...")
    utr_exons = ref_gene.getUTRs(utr=35, uniquify=True, stranded=False)
    printlog("Merge UTR exons ...")
    utr_exons = BED.unionBed3(utr_exons)
    printlog("Subtract regions with higher priority from UTR exons ...")
    # nucleotides of utr_exons that overlap coding exons are removed
    utr_exons = BED.subtractBed3(utr_exons, cds_exons)
    printlog("Count CpGs in UTR exons ...")
    add_row('1', 'UTR exons', utr_exons)

    # priority order: #3
    printlog("Extract introns ...")
    introns = ref_gene.getIntrons(itype='all', uniquify=True, stranded=False)
    printlog("Merge introns ...")
    introns = BED.unionBed3(introns)
    printlog("Subtract regions with higher priority from introns ...")
    for higher in (cds_exons, utr_exons):
        introns = BED.subtractBed3(introns, higher)
    printlog("Count CpGs in introns ...")
    add_row('2', 'Introns', introns)

    # priority order: #4
    printlog("Extract upstream intergenic regions ...")
    upstream = ref_gene.getIntergenic(direction='up', size=options.upstream_size, uniquify=True, stranded=False)
    printlog("Merge upstream intergenic regions ...")
    upstream = BED.unionBed3(upstream)
    printlog("Subtract regions with higher priority from upstream intergenic regions...")
    for higher in (cds_exons, utr_exons, introns):
        upstream = BED.subtractBed3(upstream, higher)
    printlog("Count CpGs in upstream regions ...")
    add_row('3', 'Upstream of TSS', upstream)

    # priority order: #5
    printlog("Extract downstream intergenic regions ...")
    downstream = ref_gene.getIntergenic(direction='down', size=options.downstream_size, uniquify=True, stranded=False)
    printlog("Merge downstream intergenic regions ...")
    downstream = BED.unionBed3(downstream)
    printlog("Subtract regions with higher priority from downstream intergenic regions...")
    for higher in (cds_exons, utr_exons, introns, upstream):
        downstream = BED.subtractBed3(downstream, higher)
    printlog("Count CpGs in downstream regions ...")
    add_row('4', 'Downstream of TES', downstream)

    print('\n')

    # Table: header row plus one row per region class.
    with open(options.out_file + '.tsv', 'w') as FOUT:
        for row in result:
            print('\t'.join([str(i) for i in row]), file=FOUT)

    # The R script only uses the data rows (header skipped).
    names = [row[0] for row in result[1:]]     # ['0','1','2','3','4']
    labels = [row[1] for row in result[1:]]    # region class names
    density = [row[5] for row in result[1:]]

    r_path = options.out_file + '.r'
    with open(r_path, 'w') as ROUT:
        print("name = c(%s)" % ','.join(['"' + i + '"' for i in names]), file=ROUT)
        print("values = c(%s)" % ','.join([str(i) for i in density]), file=ROUT)
        print('pdf("%s", width=8, height=6)' % (options.out_file + '.pdf'), file=ROUT)
        print('layout(matrix(c(1,1,2,1,1,2), nrow=2, byrow=TRUE))', file=ROUT)
        print('barplot(values,names.arg=name,col=c(%s),ylab="CpG per Kb")' % ','.join(colors(5)), file=ROUT)
        # Empty second panel that carries the "number = class name" legend.
        print("plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')", file=ROUT)
        for name, label in zip(names, labels):
            x_pos = 0.0
            y_pos = 1 - (int(name) * 20.0 + 5) / 100
            print("text(x=%f, y=%f, labels=c(\"%s = %s\"),adj=c(0,0))" % (x_pos, y_pos, name, label), file=ROUT)
        print('dev.off()', file=ROUT)

    printlog("Running R script ...")
    # argv-list form avoids the shell, so odd characters in the prefix are safe.
    try:
        status = subprocess.call(["Rscript", r_path])
    except OSError:
        # Rscript is not installed / not executable.
        status = None
    if status != 0:
        print("Cannot generate pdf file from " + r_path, file=sys.stderr)
|
|
190
|
+
|
|
191
|
+
if __name__=='__main__':
|
|
192
|
+
main()
|
|
193
|
+
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program calculates the distribution of CpG over user-specified genomic regions.
|
|
7
|
+
|
|
8
|
+
Notes
|
|
9
|
+
------
|
|
10
|
+
1. A maximum of 10 BED files (define 10 different genomic regions) can be analyzed
|
|
11
|
+
together.
|
|
12
|
+
2. The *order* of BED files determines the *priority order*. Overlapped
|
|
13
|
+
genomic regions will be kept in the BED file with the highest priority and removed
|
|
14
|
+
from BED files of lower priorities. For example, users provided 3 BED files via "-i
|
|
15
|
+
promoters.bed,enhancers.bed,intergenic.bed", then if an enhancer region is overlapped
|
|
16
|
+
with promoters, *the overlapped part* will be removed from "enhancers.bed".
|
|
17
|
+
3. BED files can be regular or compressed by 'gzip' or 'bz'.
|
|
18
|
+
#=========================================================================================
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
import sys,os
|
|
23
|
+
import collections
|
|
24
|
+
import subprocess
|
|
25
|
+
import numpy as np
|
|
26
|
+
from optparse import OptionParser
|
|
27
|
+
from cpgmodule import ireader
|
|
28
|
+
from cpgmodule.utils import *
|
|
29
|
+
from cpgmodule import BED
|
|
30
|
+
from cpgmodule._version import __version__
|
|
31
|
+
|
|
32
|
+
__author__ = "Liguo Wang"
|
|
33
|
+
__copyright__ = "Copyleft"
|
|
34
|
+
__credits__ = []
|
|
35
|
+
__license__ = "GPL"
|
|
36
|
+
__maintainer__ = "Liguo Wang"
|
|
37
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
38
|
+
__status__ = "Development"
|
|
39
|
+
|
|
40
|
+
def _cpg_per_kb(count, size):
    """Return CpGs per kilobase, or 0.0 when the region set has size 0."""
    if not size:
        return 0.0
    return count * 1000.0 / size


def main():
    """Report CpG density over user-supplied BED region sets and plot it.

    The order of the BED files given with -b is their priority order: each
    file is merged and then has every earlier file subtracted from it
    before CpGs are counted.

    Writes ``<prefix>.txt`` (one row per BED file) and ``<prefix>.r``, then
    runs the R script with ``Rscript`` to produce ``<prefix>.pdf``.

    Exits with status 101 when -i or -b is missing, 102 when -o is missing
    and 103 when a BED file does not exist.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--cpg",action="store",type="string",dest="cpg_file",help="BED file specifying the C position. This BED file should have at least three columns (Chrom, ChromStart, ChromeEnd). Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-b","--bed",action="store",type="string",dest="bed_files",help="List of comma separated BED files specifying the genomic regions.")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()

    # Exit statuses are kept exactly as they were (101 is used twice).
    for value, status in ((options.cpg_file, 101),
                          (options.bed_files, 101),
                          (options.out_file, 102)):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(status)

    # step1: read CpG file
    printlog("Reading CpG file: \"%s\"" % (options.cpg_file))
    cpg_ranges = read_CpG_bed(options.cpg_file)

    # step2: check BED file
    printlog("Checking BED files: \"%s\"" % (options.bed_files))
    input_bed_files = options.bed_files.replace(' ', '').split(',')
    for bed_file in input_bed_files:
        if not os.path.exists(bed_file):
            print("\"%s\" does not exist!" % bed_file, file=sys.stderr)
            sys.exit(103)
        print("\t%s" % bed_file, file=sys.stderr)

    # step3: read, merge, and subtract BED files in priority order
    dat = {}  # priority index -> merged regions minus higher-priority regions
    result = [("Priority_order", "Name", "Number_of_regions", "Size_of_regions(bp)", "CpG_raw_count", "CpG_count_per_KB")]
    for i, bed_file in enumerate(input_bed_files):
        printlog("Reading BED file: \"%s\"" % bed_file)
        file_name = os.path.basename(bed_file)
        tmp = read_bed_as_list(bed_file)
        printlog("Merging overlap entries in BED file: \"%s\"" % bed_file)
        dat[i] = BED.unionBed3(tmp)

        # Remove everything already claimed by a higher-priority file
        # (no-op for the first file).
        for j in range(i):
            printlog("Subtract \"%s\" from \"%s\"" % (input_bed_files[j], bed_file))
            dat[i] = BED.subtractBed3(dat[i], dat[j])

        printlog("Counting CpGs ...")
        (size, count) = count_over_range(dat[i], cpg_ranges)
        # Row layout: class, label, number of regions, total size,
        # raw CpG count, CpG count per Kb.
        result.append([str(i), file_name, len(dat[i]), size, count, _cpg_per_kb(count, size)])

    print('\n')

    # Table: header row plus one row per BED file.
    with open(options.out_file + '.txt', 'w') as FOUT:
        for row in result:
            print('\t'.join([str(i) for i in row]), file=FOUT)

    # The R script only uses the data rows (header skipped).
    names = [row[0] for row in result[1:]]     # ['0','1','2',...]
    labels = [row[1] for row in result[1:]]    # BED file basenames
    density = [row[5] for row in result[1:]]

    r_path = options.out_file + '.r'
    with open(r_path, 'w') as ROUT:
        print("name = c(%s)" % ','.join(['"' + i + '"' for i in names]), file=ROUT)
        print("values = c(%s)" % ','.join([str(i) for i in density]), file=ROUT)
        print('pdf("%s", width=8, height=6)' % (options.out_file + '.pdf'), file=ROUT)
        print('layout(matrix(c(1,1,2,1,1,2), nrow=2, byrow=TRUE))', file=ROUT)
        print('barplot(values,names.arg=name,col="blue",ylab="CpG per Kb")', file=ROUT)
        # Empty second panel that carries the "number = file name" legend.
        print("plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')", file=ROUT)
        for name, label in zip(names, labels):
            x_pos = 0.0
            y_pos = 1 - (int(name) * 9.0 + 5) / 200
            print("text(x=%f, y=%f, labels=c(\"%s = %s\"),adj=c(0,0))" % (x_pos, y_pos, name, label), file=ROUT)
        print('dev.off()', file=ROUT)

    printlog("Running R script ...")
    # argv-list form avoids the shell, so odd characters in the prefix are safe.
    try:
        status = subprocess.call(["Rscript", r_path])
    except OSError:
        # Rscript is not installed / not executable.
        status = None
    if status != 0:
        print("Cannot generate pdf file from " + r_path, file=sys.stderr)
|
|
143
|
+
|
|
144
|
+
if __name__=='__main__':
|
|
145
|
+
main()
|
|
146
|
+
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program generates the DNA motif logo for a given set of CpGs.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
import sys,os
|
|
11
|
+
import collections
|
|
12
|
+
import subprocess
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pysam
|
|
15
|
+
from optparse import OptionParser
|
|
16
|
+
from cpgmodule import ireader
|
|
17
|
+
from cpgmodule.utils import *
|
|
18
|
+
from cpgmodule import BED
|
|
19
|
+
from cpgmodule.imotif import PSSM
|
|
20
|
+
from cpgmodule._version import __version__
|
|
21
|
+
|
|
22
|
+
__author__ = "Liguo Wang"
|
|
23
|
+
__copyright__ = "Copyleft"
|
|
24
|
+
__credits__ = []
|
|
25
|
+
__license__ = "GPL"
|
|
26
|
+
__maintainer__ = "Liguo Wang"
|
|
27
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
28
|
+
__status__ = "Development"
|
|
29
|
+
|
|
30
|
+
def main():
    """Extract sequence around each CpG and build a motif logo and matrices.

    For every record of the input BED file, the reference sequence from
    ``end - extend - 1`` to ``end + extend`` is fetched, reverse-complemented
    for minus-strand records, and written to ``<prefix>.fa``. ``weblogo``
    is then run on that FASTA file (PDF and PNG), and a PSSM built from it
    is written in PFM, PPM, PWM, Jaspar and MEME formats.

    Exits with status 101, 102 or 103 when -i, -r or -o is missing.
    """
    print(__doc__)
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED file specifying the C position. This BED file should have at least six columns (Chrom, ChromStart, ChromeEnd, name, score, strand). Note: Must provide correct *strand* information. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-r","--refgenome",action="store",type="string",dest="genome_file",help="Reference genome seqeunces in FASTA format. Must be indexed using the samtools \"faidx\" command. ")
    parser.add_option("-e","--extend",action="store",type="int",dest="extend_size",default=5,help="Number of bases extended to up- and down-stream. default=%default (bp)")
    parser.add_option("-n","--name",action="store",type='string', dest="motif_name",default='motif', help="Motif name. default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()

    if not (options.input_file):
        parser.print_help()
        sys.exit(101)

    if not (options.genome_file):
        parser.print_help()
        sys.exit(102)
    # index reference genome file if it hasn't been done
    if not os.path.exists(options.genome_file + '.fai'):
        printlog("Creating index for %s" % options.genome_file)
        pysam.faidx(options.genome_file)

    if not (options.out_file):
        parser.print_help()
        sys.exit(103)

    fasta_path = options.out_file + '.fa'
    refFasta = pysam.Fastafile(options.genome_file)
    printlog("Reading %s ..." % options.input_file)
    try:
        with open(fasta_path, 'w') as FOUT:
            for l in ireader.reader(options.input_file):
                # skip comment / UCSC header lines
                if l.startswith(('#', 'track', 'browser')):
                    continue
                f = l.split()
                # A record is minus-strand when any of its fields is "-".
                strand = '-' if '-' in f else '+'
                try:
                    chrom = f[0]
                    position = int(f[2])
                except (IndexError, ValueError):
                    print("BED has at lesat 4 columns. Skip: " + l, file=sys.stderr)
                    # Really skip the record; falling through would reuse
                    # the previous record's values (or hit undefined names).
                    continue

                start = position - options.extend_size - 1
                end = position + options.extend_size
                if start < 0 or start > end:
                    continue

                fa_name = '>' + '_'.join([str(i) for i in (chrom, start, end, strand)])
                fa_seq = refFasta.fetch(chrom, start, end).upper()
                if strand == '-':
                    fa_seq = revcomp(fa_seq)
                print(fa_name, file=FOUT)
                print(fa_seq, file=FOUT)
    finally:
        refFasta.close()

    printlog("Generate motif logo ... ")
    logo_ok = True
    for fmt, suffix in (("PDF", '.logo.pdf'), ("PNG", '.logo.png')):
        # argv-list form: motif names or prefixes containing spaces or shell
        # metacharacters are passed to weblogo unchanged.
        cmd = ["weblogo", "--format", fmt, "-D", "fasta", "-c", "classic", "-s", "large",
               "-f", fasta_path, "-o", options.out_file + suffix, "-t", options.motif_name]
        try:
            status = subprocess.call(cmd)
        except OSError:
            print("Cannot run weblogo. Please install weblogo (https://github.com/WebLogo/weblogo)", file=sys.stderr)
            logo_ok = False
            break
        if status != 0:
            print("weblogo exited with status %d while writing \"%s\"" % (status, options.out_file + suffix), file=sys.stderr)
            logo_ok = False
    if logo_ok:
        printlog("Motif logo saved to \"%s\" and \"%s\"" % (options.out_file + '.logo.pdf', options.out_file + '.logo.png'))

    m = PSSM(sites=fasta_path, name=options.motif_name)

    # (description used in the log line, file suffix, PSSM writer method)
    matrix_outputs = (
        ("position frequency matrix (PFM)", '.pfm', m.toPFM),
        ("position probability matrix (PPM)", '.ppm', m.toPPM),
        ("position weight matrix (PWM)", '.pwm', m.toPWM),
        ("Jaspar format matrix", '.jaspar', m.toJaspar),
        ("MEME format matrix", '.meme', m.toMEME),
    )
    for description, suffix, write_matrix in matrix_outputs:
        printlog("Write %s to \"%s\"" % (description, options.out_file + suffix))
        with open(options.out_file + suffix, 'w') as FF:
            write_matrix(FOUT=FF)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
if __name__=='__main__':
|
|
133
|
+
main()
|
|
134
|
+
|