cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
cpgmodule/BED.py
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
#import built-in modules
|
|
4
|
+
import os,sys
|
|
5
|
+
import re
|
|
6
|
+
import string
|
|
7
|
+
import warnings
|
|
8
|
+
import string
|
|
9
|
+
import collections
|
|
10
|
+
import math
|
|
11
|
+
from operator import itemgetter
|
|
12
|
+
from itertools import groupby
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
#import third-party modules
|
|
16
|
+
from bx.bitset import *
|
|
17
|
+
from bx.bitset_builders import *
|
|
18
|
+
from bx.intervals import *
|
|
19
|
+
|
|
20
|
+
#from itertools import *
|
|
21
|
+
from cpgmodule import ireader
|
|
22
|
+
|
|
23
|
+
BED12 = '''
|
|
24
|
+
1. chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671).
|
|
25
|
+
2. chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
|
|
26
|
+
3. chromEnd - The ending position of the feature in the chromosome or scaffold.
|
|
27
|
+
4. name - Defines the name of the BED line.
|
|
28
|
+
5. score.
|
|
29
|
+
6. strand - Defines the strand. Either "." (=no strand) or "+" or "-".
|
|
30
|
+
7. thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
|
|
31
|
+
8. thickEnd - The ending position at which the feature is drawn thickly (for example the stop codon in gene displays).
|
|
32
|
+
9. itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0).
|
|
33
|
+
10. blockCount - The number of blocks (exons) in the BED line.
|
|
34
|
+
11. blockSizes - A comma-separated list of the block sizes.
|
|
35
|
+
12. blockStarts - A comma-separated list of block starts.
|
|
36
|
+
|
|
37
|
+
Detailed description of BED format: https://genome.ucsc.edu/FAQ/FAQformat.html#format1
|
|
38
|
+
'''
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
__author__ = "Liguo Wang"
|
|
42
|
+
__copyright__ = "Copyleft"
|
|
43
|
+
__credits__ = []
|
|
44
|
+
__license__ = "GPL"
|
|
45
|
+
__version__="0.1.0"
|
|
46
|
+
__maintainer__ = "Liguo Wang"
|
|
47
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
48
|
+
__status__ = "Development"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class ParseBED:
    '''
    Manipulate BED (http://genome.ucsc.edu/FAQ/FAQformat.html) format file.
    Input BED file must be 12-column (i.e. BED-12).

    Every getter re-reads the file through ireader.reader() and returns a list
    of (chrom, start, end, strand) tuples, or (chrom, start, end) tuples when
    stranded=False. When uniquify=True the list is de-duplicated through a
    set, so the order of the returned blocks is not defined.
    '''

    def __init__(self,bedFile):
        '''This is constructor of ParseBED. bedFile is the path handed to ireader.reader().'''
        self.f = bedFile
        self.fileName = os.path.basename(bedFile)
        self.ABS_fileName = bedFile

    def _iter_bed12(self):
        '''
        Yield (chrom, strand, cdsStart, cdsEnd, blockCount, exon_starts, exon_ends)
        for every record of the BED-12 file. exon_starts/exon_ends are absolute
        coordinates (chromStart + blockStart, and that value + blockSize).

        Lines starting with '#', 'track' or 'browser' are skipped. A record with
        fewer than 12 columns prints an error to stderr and exits with status 1.
        '''
        for line in ireader.reader(self.f):
            line = line.strip()
            if line.startswith(('#','track','browser')):
                continue
            fields = line.split()
            if len(fields) < 12:
                print ("\nInput error!\nStandard BED format has 12 columns.\n%s" % (BED12), file=sys.stderr)
                sys.exit(1)

            chrom = fields[0]
            chrom_start = int(fields[1])
            strand = fields[5]
            cdsStart = int(fields[6])
            cdsEnd = int(fields[7])
            blockCount = int(fields[9])
            blockSizes = [ int(i) for i in fields[10].strip(',').split(',') ]
            blockStarts = [ chrom_start + int(i) for i in fields[11].strip(',').split(',') ]

            exon_starts = []
            exon_ends = []
            for base,offset in zip( blockStarts, blockSizes ):
                exon_starts.append(base)
                exon_ends.append(base + offset)
            yield (chrom, strand, cdsStart, cdsEnd, blockCount, exon_starts, exon_ends)

    @staticmethod
    def _block(chrom, start, end, strand, stranded):
        '''Build one output tuple, with or without the strand column.'''
        if stranded:
            return (chrom, start, end, strand)
        return (chrom, start, end)

    @staticmethod
    def _finish(reblocks, uniquify):
        '''Return the collected blocks, de-duplicated (order not preserved) when uniquify is true.'''
        if uniquify:
            return list(set(reblocks))
        return reblocks

    def getExons(self,uniquify = True, stranded = True):
        '''
        Get all exons (including both coding exons and UTR exons) from BED-12 file.
        uniquify: if the returned blocks should be uniquify.
        stranded: if True, each block carries the strand as 4th element.
        '''
        reblocks = []
        for chrom, strand, _cds_st, _cds_end, _count, exon_starts, exon_ends in self._iter_bed12():
            for st,end in zip(exon_starts, exon_ends):
                reblocks.append(self._block(chrom, st, end, strand, stranded))
        return self._finish(reblocks, uniquify)

    def getCDSExons(self,uniquify=True, stranded = True):
        '''
        Get only CDS exon regions from BED-12 file. Both 5' and 3' UTR parts are removed.
        uniquify: if the returned blocks should be uniquify.
        stranded: if True, each block carries the strand as 4th element.
        '''
        reblocks = []
        for chrom, strand, cdsStart, cdsEnd, _count, exon_starts, exon_ends in self._iter_bed12():
            for st,end in zip(exon_starts, exon_ends):
                # clip the exon to the [cdsStart, cdsEnd) window
                exon_start = max( st, cdsStart )
                exon_end = min( end, cdsEnd )
                # exons outside the CDS (or only touching its boundary) would
                # give an empty or inverted interval: skip them
                if exon_start >= exon_end:
                    continue
                reblocks.append(self._block(chrom, exon_start, exon_end, strand, stranded))
        return self._finish(reblocks, uniquify)

    def getUTRs(self,utr=35, uniquify=True, stranded = True):
        '''
        Get UTR regions from BED-12 file.
        When utr=35 [default], extract both 5' and 3' UTR.
        When utr=3, only extract 3' UTR.
        When utr=5, only extract 5' UTR
        uniquify: if the returned blocks should be uniquify.
        stranded: if True, each block carries the strand as 4th element.

        Records whose strand is neither '+' nor '-' contribute nothing.
        '''
        reblocks = []
        for chrom, strand, cdsStart, cdsEnd, _count, exon_starts, exon_ends in self._iter_bed12():
            # the UTR left of the CDS (smaller coordinates) is the 5' UTR on
            # the '+' strand and the 3' UTR on the '-' strand
            if strand == '+':
                left_kind, right_kind = 5, 3
            elif strand == '-':
                left_kind, right_kind = 3, 5
            else:
                continue

            exons = list(zip(exon_starts, exon_ends))
            if utr in (35, left_kind):
                for st,end in exons:
                    if st < cdsStart:
                        reblocks.append(self._block(chrom, st, min(end,cdsStart), strand, stranded))
            if utr in (35, right_kind):
                for st,end in exons:
                    if end > cdsEnd:
                        reblocks.append(self._block(chrom, max(st, cdsEnd), end, strand, stranded))
        return self._finish(reblocks, uniquify)

    def getIntrons(self, itype, uniquify=True, stranded=True):
        '''
        Get Intron regions from BED-12 file.
        separated bed file, each row represents one intron

        itype = :
            * 'all': all introns
            * 'first': Only return the first intron of each gene. The gene should have at least 1 intron.
            * 'internal': return all internal introns. The gene should have at least 3 introns.
            * 'last': Return the last intron. The gene should have at least 2 introns.
            * 'cds': Return introns within CDS region.
            * 'utr': Return introns within UTR regions.

        'first' and 'last' are relative to the direction of transcription: on
        the '-' strand the first intron is the one with the largest coordinates.
        Any other itype value returns an empty list.
        '''
        reblocks = []
        for chrom, strand, cdsStart, cdsEnd, blockCount, exon_starts, exon_ends in self._iter_bed12():
            if blockCount == 1:
                continue

            # an intron spans from the end of one exon to the start of the next
            intron_list = list(zip(exon_ends[:-1], exon_starts[1:]))
            intron_number = len(intron_list)

            if itype == 'all':
                selected = intron_list
            elif itype == 'first':
                if intron_number == 0:
                    continue
                selected = [intron_list[-1] if strand == '-' else intron_list[0]]
            elif itype == 'last':
                if intron_number < 2:
                    continue
                selected = [intron_list[0] if strand == '-' else intron_list[-1]]
            elif itype == 'internal':
                if intron_number < 3:
                    continue
                selected = intron_list[1:-1]
            elif itype == 'cds':
                selected = [(st,end) for (st,end) in intron_list
                            if not (end < cdsStart or st > cdsEnd)]
            elif itype == 'utr':
                selected = [(st,end) for (st,end) in intron_list
                            if end < cdsStart or st > cdsEnd]
            else:
                selected = []

            for (st,end) in selected:
                reblocks.append(self._block(chrom, st, end, strand, stranded))
        return self._finish(reblocks, uniquify)

    def getIntergenic(self,direction='up', size=2000, uniquify=True, stranded = True):
        '''
        get intergenic regions. direction=up or down or both.

        size: length of the flanking region taken next to the transcript.
        "up" and "down" are relative to the strand: for a '-' strand record the
        upstream region lies after the transcript end. Start coordinates are
        clamped at 0.
        uniquify: if the returned blocks should be uniquify.
        stranded: if True, each block carries the strand as 4th element.
        '''
        reblocks = []
        for line in ireader.reader(self.f):
            line = line.strip()
            if line.startswith(('#','track','browser')):
                continue
            fields = line.split()
            chrom = fields[0]
            tx_start = int( fields[1] )
            tx_end = int( fields[2] )
            strand = fields[5]

            left_flank = (max(tx_start - size, 0), tx_start)
            right_flank = (tx_end, tx_end + size)
            if strand == '-':
                upstream, downstream = right_flank, left_flank
            else:
                upstream, downstream = left_flank, right_flank

            if direction in ("up", "both"):
                reblocks.append(self._block(chrom, upstream[0], upstream[1], strand, stranded))
            if direction in ("down", "both"):
                reblocks.append(self._block(chrom, downstream[0], downstream[1], strand, stranded))
        return self._finish(reblocks, uniquify)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def unionBed3(lst):
    '''
    Take the union of 3 column bed files. return a new list

    lst is handed to binned_bitsets_from_list(); each maximal run of set bits
    is reported as one [chrom, start, end] entry.
    '''
    merged = []
    for chrom, bits in binned_bitsets_from_list(lst).items():
        cursor = 0
        while True:
            run_start = bits.next_set(cursor)
            # next_set() returning the bitset size means no set bit is left
            if run_start == bits.size:
                break
            cursor = bits.next_clear(run_start)
            merged.append([chrom, run_start, cursor])
    return merged
|
|
382
|
+
|
|
383
|
+
def intersectBed3(lst1,lst2):
    '''
    Take the intersection of two bed files (3 column bed files)

    Both lists are converted with binned_bitsets_from_list(); chromosomes
    present in only one of them contribute nothing. Returns a list of
    [chrom, start, end] entries, one per run of bits set in both inputs.
    '''
    first = binned_bitsets_from_list(lst1)
    second = binned_bitsets_from_list(lst2)

    # AND the bitsets chromosome by chromosome (in place, on the first set)
    shared = dict()
    for chrom in first:
        if chrom not in second:
            continue
        first[chrom].iand(second[chrom])
        shared[chrom] = first[chrom]

    overlaps = []
    for chrom, bits in shared.items():
        cursor = 0
        while True:
            run_start = bits.next_set(cursor)
            # next_set() returning the bitset size means no set bit is left
            if run_start == bits.size:
                break
            cursor = bits.next_clear(run_start)
            overlaps.append([chrom, run_start, cursor])

    first.clear()
    second.clear()
    shared.clear()
    return overlaps
|
|
407
|
+
|
|
408
|
+
def subtractBed3(lst1,lst2):
    '''
    Subtract lst2 from lst1.

    Both lists are converted with binned_bitsets_from_list(). For every
    chromosome of lst1, positions also covered by lst2 are cleared, and each
    remaining run of set bits is returned as a [chrom, start, end] entry.
    Chromosomes found only in lst2 are ignored.
    '''
    bitsets1 = binned_bitsets_from_list(lst1)
    bitsets2 = binned_bitsets_from_list(lst2)

    ret_lst = []
    for chrom, bits1 in bitsets1.items():
        if chrom in bitsets2:
            # A - B  ==  A AND (NOT B)
            bits2 = bitsets2[chrom]
            bits2.invert()
            bits1.iand( bits2 )
        end = 0
        while True:
            start = bits1.next_set( end )
            # next_set() returning the bitset size means no set bit is left
            if start == bits1.size:
                break
            end = bits1.next_clear( start )
            ret_lst.append([chrom, start, end])
    return ret_lst
|
|
431
|
+
|
|
432
|
+
def tillingBed(chrName,chrSize,stepSize=10000):
    '''
    Tile a whole chromosome into consecutive windows.

    Yields (chrName, start, end) tuples covering [0, chrSize) in steps of
    stepSize; the last window is truncated at chrSize. Nothing is yielded
    when chrSize is 0.
    '''
    # range() replaces the Python-2-only xrange(), which raised NameError here
    for start in range(0, chrSize, stepSize):
        yield (chrName, start, min(start + stepSize, chrSize))
|
|
441
|
+
|
cpgmodule/MI.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
'''calculate mutual information of two lists of numbers or symbols'''
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from sklearn.feature_selection import mutual_info_classif
|
|
8
|
+
from sklearn.metrics import mutual_info_score
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
__author__ = "Liguo Wang"
|
|
12
|
+
__copyright__ = ""
|
|
13
|
+
__credits__ = []
|
|
14
|
+
__license__ = "GPLv2"
|
|
15
|
+
__version__ = "1.0.0"
|
|
16
|
+
__maintainer__ = "Liguo Wang"
|
|
17
|
+
__email__ = "Wang.Liguo@mayo.edu"
|
|
18
|
+
__status__ = "Development" #Prototype or Production
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def Mutual_information1(x,y):
    '''
    x and y are lists of symbols (like 'A','C','G','T').
    Calculation mutual information based on: MI = H(x) + H(y) - H(x,y)
    Log2 based, unit is bit

    Every element is converted with str() before counting.
    Returns the tuple (H(x), H(y), H(x,y), MI), or 0 when x and y differ in
    length.
    '''
    x = [str(i) for i in x]
    y = [str(i) for i in y]
    if len(x) != len(y):
        return 0

    def entropy(symbols):
        '''Shannon entropy (bits) of the empirical distribution of symbols.'''
        counts = np.array(list(Counter(symbols).values()))
        freq = counts*1.0/sum(counts)
        return -sum([p * np.log2(p) for p in freq])

    x_H = entropy(x)
    y_H = entropy(y)
    # count (x, y) pairs as tuples: concatenating the two strings would merge
    # distinct pairs such as ('1', '23') and ('12', '3')
    xy_H = entropy(zip(x, y))

    return (x_H,y_H,xy_H, x_H+y_H-xy_H)
|
|
48
|
+
|
|
49
|
+
def Mutual_information2(x,y):
    '''
    x and y are lists of symbols (like 'A','C','G','T').
    Calculate mutual information based on its original definition:
        MI = sum over observed (a, b) of p(a,b) * log( p(a,b) / (p(a) * p(b)) )
    Natural-log based (np.log), so the unit is nat, not bit.

    Every element is converted with str() before counting.
    Returns the mutual information as a float, or 0 when x and y differ in
    length.
    '''
    x = [str(i) for i in x]
    y = [str(i) for i in y]
    if len(x) != len(y):
        return 0

    total = len(x)
    x_counts = Counter(x)
    y_counts = Counter(y)
    # count (x, y) pairs as tuples: concatenating the two strings would merge
    # distinct pairs such as ('1', '23') and ('12', '3')
    xy_counts = Counter(zip(x, y))

    mi_sum = 0.0
    for (xi, yi), n_xy in xy_counts.items():
        p_xy = n_xy*1.0/total
        p_x = x_counts[xi]*1.0/total
        p_y = y_counts[yi]*1.0/total
        mi_sum += (p_xy * np.log(p_xy / (p_x * p_y)))
    return mi_sum
|
|
83
|
+
|
|
84
|
+
def PMI(x,y):
    '''
    x and y are lists of symbols (like 'A','C','G','T').
    Calculate pointwise mutual information based on its original definition.
    Log2 based, unit is bit

    Every element is converted with str() before counting. For each pair
    (a, b) that was observed at the same position, one line is printed:
    the concatenation a+b followed by log2( p(a,b) / (p(a) * p(b)) ).
    Pairs are printed in sorted order of a, then b.
    Returns None, or 0 when x and y differ in length (nothing is printed).
    '''
    x = [str(i) for i in x]
    y = [str(i) for i in y]

    if len(x) != len(y):
        return 0

    total = len(x)
    px = {sym: cnt*1.0/total for sym, cnt in Counter(x).items()}
    py = {sym: cnt*1.0/total for sym, cnt in Counter(y).items()}
    # key the joint distribution by (a, b) tuples: concatenating the two
    # strings would merge distinct pairs such as ('1', '23') and ('12', '3')
    pxy = {pair: cnt*1.0/total for pair, cnt in Counter(zip(x, y)).items()}

    # sorted() makes the output order reproducible (set order is not)
    for i in sorted(px):
        for j in sorted(py):
            if (i, j) not in pxy:
                continue
            print(i + j, np.log2(pxy[(i, j)]/(px[i] * py[j])))
|
|
126
|
+
|
|
127
|
+
def Mutual_expected():
    '''
    Entropies and mutual information of two independent, uniformly
    distributed 4-letter symbols (each symbol 0.25, each pair 0.0625).
    Calculation based on: MI = H(x) + H(y) - H(x,y)
    Log2 based, unit is bit

    Returns the tuple (H(x), H(y), H(x,y), MI).
    '''
    def shannon(probabilities):
        return -sum([p * np.log2(p) for p in probabilities])

    marginal = [0.25]*4
    joint = [0.0625]*16

    h_x = shannon(marginal)
    h_y = shannon(marginal)
    h_xy = shannon(joint)

    return (h_x, h_y, h_xy, h_x + h_y - h_xy)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
if __name__=='__main__':
|
|
145
|
+
x=['G', 'T', 'C', 'A', 'T', 'T', 'A', 'C', 'T', 'A']
|
|
146
|
+
y=['A', 'C', 'A', 'C', 'A', 'A', 'G', 'A', 'G', 'A']
|
|
147
|
+
z=['A', 'C', 'A', 'T', 'A', 'A', 'T', 'A', 'G', 'T']
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
X_0 = (0, 0, 1, 1, 0, 1, 1, 2, 2, 2)
|
|
151
|
+
X_1 = (3, 4, 5, 5, 3, 2, 2, 6, 6, 1)
|
|
152
|
+
X_2 = [7, 2, 1, 3, 2, 8, 9, 1, 2, 0]
|
|
153
|
+
|
|
154
|
+
C8 = ['T', 'A', 'A', 'A', 'T', 'T', 'G', 'C', 'T', 'A', 'A', 'A', 'A', 'T', 'T', 'T', 'A', 'T', 'T', 'A', 'G', 'A', 'C', 'A', 'G', 'G', 'G', 'G', 'T', 'A', 'T', 'T', 'A', 'G', 'C', 'T', 'T', 'T', 'A', 'T', 'G', 'G', 'A', 'C', 'T', 'C', 'C', 'C', 'C', 'T', 'G', 'A', 'T', 'A', 'T', 'A', 'A', 'T', 'C', 'C', 'T', 'G', 'A', 'C', 'G', 'T', 'T', 'C', 'T', 'T', 'A', 'T', 'A', 'A', 'A', 'A', 'T', 'G', 'A', 'A', 'T', 'G', 'A', 'C', 'T', 'A', 'C', 'A', 'A', 'C', 'C', 'A', 'A', 'A', 'C', 'C', 'T', 'T', 'T', 'A', 'A', 'A', 'G', 'A', 'C', 'T', 'A', 'C', 'T', 'T', 'T', 'T', 'T', 'T', 'G', 'A', 'A', 'T', 'A', 'G', 'G', 'A', 'A', 'A', 'T', 'A', 'G', 'G', 'C', 'C', 'T', 'C', 'G', 'A', 'T', 'T', 'G', 'T', 'T', 'C', 'G', 'G', 'C', 'T', 'T', 'G', 'A', 'G', 'T', 'A', 'T', 'T', 'T', 'A', 'T', 'A', 'C', 'C', 'C', 'C', 'T', 'T', 'T', 'A', 'C', 'C', 'A', 'A', 'C', 'T', 'C', 'A', 'C', 'T', 'T', 'T', 'T', 'A', 'G', 'T', 'A', 'G', 'T', 'C', 'A', 'T', 'C', 'A', 'C', 'C', 'C', 'T', 'T']
|
|
155
|
+
C9 = ['T', 'T', 'G', 'A', 'C', 'T', 'A', 'C', 'G', 'C', 'G', 'G', 'G', 'G', 'G', 'T', 'C', 'G', 'T', 'A', 'G', 'G', 'T', 'G', 'C', 'T', 'T', 'C', 'G', 'T', 'G', 'G', 'C', 'T', 'C', 'A', 'T', 'T', 'C', 'A', 'T', 'A', 'A', 'C', 'C', 'G', 'T', 'G', 'C', 'C', 'T', 'C', 'G', 'C', 'T', 'T', 'C', 'C', 'T', 'A', 'T', 'C', 'G', 'A', 'C', 'T', 'C', 'T', 'T', 'C', 'A', 'T', 'G', 'A', 'A', 'G', 'T', 'C', 'T', 'T', 'T', 'C', 'A', 'T', 'T', 'T', 'T', 'T', 'G', 'A', 'A', 'C', 'T', 'G', 'C', 'C', 'T', 'C', 'C', 'A', 'G', 'G', 'G', 'G', 'G', 'A', 'G', 'A', 'G', 'T', 'T', 'C', 'T', 'A', 'C', 'G', 'G', 'T', 'G', 'C', 'C', 'G', 'A', 'A', 'G', 'A', 'G', 'T', 'A', 'C', 'A', 'C', 'C', 'G', 'G', 'C', 'A', 'C', 'G', 'C', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'T', 'A', 'A', 'G', 'A', 'T', 'G', 'A', 'A', 'A', 'A', 'C', 'A', 'G', 'C', 'C', 'A', 'C', 'C', 'G', 'A', 'C', 'C', 'T', 'T', 'T', 'C', 'C', 'T', 'T', 'A', 'A', 'C', 'T', 'A', 'G', 'A', 'G', 'C', 'T', 'A', 'T', 'C', 'A', 'T', 'G']
|
|
156
|
+
|
|
157
|
+
#AR motif pos-5 and pos-11
|
|
158
|
+
C5 = ['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'G', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'G', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'T', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'T', 'C', 'C', 'C', 'C', 'T', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C']
|
|
159
|
+
C11 = ['G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'C', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'A', 'T', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'A', 'G', 'G', 'G', 'G', 'G', 'A', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'A', 'G', 'G', 'G', 'G', 'G']
|
|
160
|
+
#PMI(C5,C11)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
#https://en.wikipedia.org/wiki/Pointwise_mutual_information example
|
|
164
|
+
x=[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1]
|
|
165
|
+
y=[0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1]
|
|
166
|
+
z=[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0]
|
|
167
|
+
PMI(x,z)
|
|
168
|
+
#print (Mutual_information1(x,z))
|
|
169
|
+
print (Mutual_information2(x,z))
|
|
170
|
+
print (mutual_info_score(x,z))
|
|
171
|
+
print (mutual_info_score(y,z))
|
|
172
|
+
|
|
173
|
+
#C1 = ['A','C','G','T']
|
|
174
|
+
#C2 = ['A','C','G','T']
|
|
175
|
+
#a=Mutual_information2(x,y)
|
|
176
|
+
#b=Mutual_information2(x,z)
|
|
177
|
+
#print(a)
|
|
178
|
+
#print(b)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
#b=Mutual_information2(C1,C2)
|
|
182
|
+
#print(b)
|
|
183
|
+
|
|
184
|
+
'''
|
|
185
|
+
from sklearn.feature_selection import mutual_info_classif
|
|
186
|
+
from sklearn.metrics import mutual_info_score
|
|
187
|
+
a = np.array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1])
|
|
188
|
+
b = np.array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1])
|
|
189
|
+
|
|
190
|
+
print(stats.entropy([0.5,0.5])) # entropy of 0.69, expressed in nats
|
|
191
|
+
print(mutual_info_classif(a.reshape(-1,1), b, discrete_features = True)) # mutual information of 0.69, expressed in nats
|
|
192
|
+
print(mutual_info_score(a,b)) # information gain of 0.69, expressed in nats
|
|
193
|
+
'''
|
cpgmodule/__init__.py
ADDED
|
File without changes
|
cpgmodule/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2.0.5"
|