cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
cpgmodule/utils.py
ADDED
|
@@ -0,0 +1,642 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from cpgmodule import ireader
|
|
3
|
+
import collections
|
|
4
|
+
from time import strftime
|
|
5
|
+
from bx.intervals import *
|
|
6
|
+
import numpy as np
|
|
7
|
+
from cpgmodule import ireader
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def revcomp(dna):
    '''
    Return the reverse complement of a DNA sequence.

    The sequence is upper-cased, every base is replaced by its complement
    (A<->T, C<->G) and the result is reversed. The symbols N, X, * and -
    map to themselves; characters outside the translation table are left
    unchanged.
    '''
    complement = str.maketrans('ACGTNX*-', 'TGCANX*-')
    complemented = dna.upper().translate(complement)
    return ''.join(reversed(complemented))
|
|
15
|
+
|
|
16
|
+
#def is_number(s):
|
|
17
|
+
# try:
|
|
18
|
+
# float(s)
|
|
19
|
+
# return True
|
|
20
|
+
# except ValueError:
|
|
21
|
+
# return False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def colors(n):
    '''
    Return a list of n colour names, each wrapped in single quotes.

    n must be in [1, 12]. For any other value a message is written to
    stderr and None is returned. Each n has its own fixed palette, so the
    palette for n is not a prefix of the palette for n + 1.
    '''
    if n < 1 or n > 12:
        print("n must be in [1,12]", file=sys.stderr)
        return None

    # palettes[k] holds the palette with k + 1 colours.
    palettes = (
        ('blue',),
        ('blue', 'red'),
        ('#e9a3c9', '#f7f7f7', '#a1d76a'),
        ('#d01c8b', '#f1b6da', '#b8e186', '#4dac26'),
        ('#d01c8b', '#f1b6da', '#f7f7f7', '#b8e186', '#4dac26'),
        ('#c51b7d', '#e9a3c9', '#fde0ef', '#e6f5d0', '#a1d76a', '#4d9221'),
        ('#c51b7d', '#e9a3c9', '#fde0ef', '#f7f7f7', '#e6f5d0', '#a1d76a',
         '#4d9221'),
        ('#c51b7d', '#de77ae', '#f1b6da', '#fde0ef', '#e6f5d0', '#b8e186',
         '#7fbc41', '#4d9221'),
        ('#c51b7d', '#de77ae', '#f1b6da', '#fde0ef', '#f7f7f7', '#e6f5d0',
         '#b8e186', '#7fbc41', '#4d9221'),
        ('#276419', '#4d9221', '#7fbc41', '#b8e186', '#e6f5d0', '#fde0ef',
         '#f1b6da', '#de77ae', '#c51b7d', '#8e0152'),
        ('#276419', '#4d9221', '#7fbc41', '#b8e186', '#e6f5d0', '#f7f7f7',
         '#fde0ef', '#f1b6da', '#de77ae', '#c51b7d', '#8e0152'),
        ('#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
         '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#ffff99', '#b15928'),
    )
    chosen = palettes[n - 1]
    return [f"'{shade}'" for shade in chosen]
|
|
47
|
+
|
|
48
|
+
def printlog(mesg):
    '''
    Write a progress message to stderr, prefixed with "@ " and the
    current local time formatted as YYYY-MM-DD HH:MM:SS.
    '''
    timestamp = strftime("%Y-%m-%d %H:%M:%S")
    print("@ " + timestamp + ": " + mesg, file=sys.stderr)
|
|
52
|
+
|
|
53
|
+
def chrom_count(infile):
    '''
    Count how many valid BED records each chromosome has.

    Lines starting with '#', 'track' or 'browser' are ignored. A line is
    skipped, with a message on stderr, when it has fewer than 3
    whitespace-separated fields, when its 2nd or 3rd field is not an
    integer, or when start > end.

    Returns a collections.defaultdict(int) mapping the first field
    (chromosome name) to the number of accepted lines.
    '''
    counts = collections.defaultdict(int)
    for l in ireader.reader(infile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        if len(f) < 3:
            print("BED has at lesat 3 columns. Skip: " + l, file=sys.stderr)
            continue
        try:
            start = int(f[1])
            end = int(f[2])
        except ValueError:
            # Only a failed integer conversion marks the line as malformed;
            # anything else (e.g. KeyboardInterrupt) must propagate.
            print("Not in valid BED format. Skip:" + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue

        counts[f[0]] += 1
    return counts
|
|
81
|
+
|
|
82
|
+
def read_chromSize(infile):
    '''
    Parse a chromosome size file (plain text, whitespace separated), e.g.

        chr1    249250621
        chr2    243199373
        chr3    198022430
        chr4    191154276

    Lines starting with '#' and lines that do not have exactly two fields
    are ignored. Returns a tuple (names, sizes) of two parallel lists;
    sizes are converted with int().
    '''
    names, sizes = [], []
    for line in ireader.reader(infile):
        if line.startswith('#'):
            continue
        fields = line.split()
        if len(fields) == 2:
            chrom_name, chrom_size = fields
            names.append(chrom_name)
            sizes.append(int(chrom_size))
    return (names, sizes)
|
|
101
|
+
|
|
102
|
+
def equal_split(st, end, n):
    '''
    Split range(st, end) into n consecutive bins of equal width.

    Returns a list of n (bin_start, bin_end) tuples. The first bin starts
    at st and each bin starts where the previous one ended. The width is
    round((end - st) / n), so the last bin_end can differ from `end` when
    the span is not a multiple of n.

    Returns [] when the span is shorter than n or when n is not positive
    (n == 0 used to raise ZeroDivisionError).
    '''
    if n <= 0 or end - st < n:
        return []
    step = round((end - st) * 1.0 / n)

    bins = []
    left = st
    count = 1
    while count <= n:
        right = left + step
        bins.append((left, right))
        left = right
        count += 1
    return bins
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def read_CpG_bed(cpgfile):
    '''
    Load a CpG BED file into per-chromosome interval trees.

    cpgfile: CpG BED file with at least 3 columns (Chrom, chromStart,
    chromEnd).
    Note: chromEnd corresponds to the genomic position of the methylated C.

    The beta value is read from the 5th column (index 4); if that column is
    missing or is not a number, beta is set to 1.0. The strand is read from
    the 6th column and defaults to '+' when absent.

    '+' records are stored as Interval(start, end) and '-' records as
    Interval(end, end + 1), both with value=beta.
    NOTE(review): records whose strand is neither '+' nor '-' (e.g. '.')
    are not inserted at all -- confirm this is intended.

    Lines starting with '#', 'track' or 'browser' are ignored. Lines with
    fewer than 3 fields, non-integer coordinates or start > end are skipped
    with a message on stderr.

    Returns a dict mapping chromosome name to IntervalTree.
    '''
    cpg_ranges = {}
    for l in ireader.reader(cpgfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        if len(f) < 3:
            # The check is for 3 columns; the old message said 6.
            print("BED has at least 3 columns. Skip: " + l, file=sys.stderr)
            continue

        chrom = f[0]
        try:
            start = int(f[1])
            end = int(f[2])
        except ValueError:
            # Skip malformed coordinates, as chrom_count() does,
            # rather than aborting the whole load.
            print("Not in valid BED format. Skip:" + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue

        try:
            beta = float(f[4])
        except (IndexError, ValueError):
            beta = 1.0
        strand = f[5] if len(f) > 5 else '+'

        if chrom not in cpg_ranges:
            cpg_ranges[chrom] = IntervalTree()
        if strand == '+':
            cpg_ranges[chrom].insert_interval(Interval(start, end, value=beta))
        elif strand == '-':
            cpg_ranges[chrom].insert_interval(Interval(end, end + 1, value=beta))

    return cpg_ranges
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def read_region_bed(bedfile):
    '''
    Iterate over the regions of a BED file.

    bedfile should have at least 3 columns (Chrom, chromStart, chromEnd).
    If no strand information is found in the 6th column, the region is
    considered to be on the "+" strand.

    Lines starting with '#', 'track' or 'browser' are ignored. Lines that
    cannot be parsed (fewer than 3 fields or non-integer coordinates) and
    lines with start > end are reported on stderr and skipped.

    Yields (chrom, start, end, strand) tuples.
    '''
    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()

        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
        except (IndexError, ValueError):
            print("BED has at lesat 3 columns. Skip: " + l, file=sys.stderr)
            # Without this `continue` the previous record was yielded again
            # (or an UnboundLocalError was raised on the first line).
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue

        strand = f[5] if len(f) > 5 else "+"

        yield (chrom, start, end, strand)
|
|
198
|
+
|
|
199
|
+
def read_bed_as_list(bedfile):
    '''
    Read the regions of a BED file into a list.

    bedfile should have at least 3 columns (Chrom, chromStart, chromEnd);
    further columns are not used.

    Lines starting with '#', 'track' or 'browser' are ignored. Lines that
    cannot be parsed (fewer than 3 fields or non-integer coordinates) and
    lines with start > end are reported on stderr and skipped.

    Returns a list of [chrom, start, end] lists.
    '''
    lst = []
    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()

        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
        except (IndexError, ValueError):
            print("BED has at lesat 3 columns. Skip: " + l, file=sys.stderr)
            # Without this `continue` the previous record was appended again
            # (or an UnboundLocalError was raised on the first line).
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue

        lst.append([chrom, start, end])
    return lst
|
|
226
|
+
|
|
227
|
+
def coverage_over_range(lst, cpg_ranges):
    '''
    Calculate the mean beta value at each relative position of a set of
    regions.

    lst = list of (chr, start, end, strand)
    cpg_ranges is returned by read_CpG_bed

    For every CpG interval overlapping a region, its position is expressed
    as a rounded percentage of the region length. For '-' regions it is
    measured from the region end; for any other strand it is measured from
    start + 1, i.e. regions that are not '-' are treated as '+'.
    Regions on chromosomes missing from cpg_ranges and regions of
    non-positive length are skipped.

    Returns a dict mapping percentage position to the mean beta value
    (rounded to 4 decimals) of all CpGs observed at that position.
    '''
    results = collections.defaultdict(list)
    for (chrom, st, end, strand) in lst:
        if chrom not in cpg_ranges:
            continue

        span = end - st
        if span <= 0:
            # A zero-length region would divide by zero below.
            continue

        # find() returns e.g. [Interval(3, 40, value=3), Interval(13, 50, value=4)]
        for hit in cpg_ranges[chrom].find(st, end):
            if strand == '-':
                position = abs(round((hit.end - end) * 100 / span))
            else:
                position = round((hit.end - (st + 1)) * 100 / span)
            results[position].append(hit.value)

    return {pos: round(np.mean(betas), 4) for pos, betas in results.items()}
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def count_over_range(lst, cpg_ranges):
    '''
    Count the CpGs that fall into a list of genomic regions.

    lst = list of (chr, start, end)
    cpg_ranges is returned by read_CpG_bed

    Returns (total_size, total_count): the summed length (end - start) of
    all regions, and the number of intervals found in them. Regions on a
    chromosome absent from cpg_ranges still add to total_size.
    '''
    span_sum = 0    # total nucleotides covered by the regions
    cpg_sum = 0     # total CpG intervals found in the regions
    for region in lst:
        (chrom, left, right) = region
        span_sum += (right - left)
        if chrom in cpg_ranges:
            hits = cpg_ranges[chrom].find(left, right)
            cpg_sum += len(hits)
    return (span_sum, cpg_sum)
|
|
270
|
+
|
|
271
|
+
def read_grp_file1(gfile, na_lab="NA"):
    '''
    Read a group file. The group file defines the biological groups of the
    data matrix file.
    (1) It must have a header.
    (2) It must have two columns:
        * 1st column: sample names. Sample names should be unique, and they
          must be exactly the same as the first row of the beta matrix file.
        * 2nd column: group IDs.
    (3) Columns must be separated by ",".

    For example:

    sampleID,groupID
    Normal_1,1
    Normal_2,1
    Normal_3,1
    Tumor_1,2
    Tumor_2,2
    Tumor_3,2

    All spaces are removed from each line before it is split. Rows whose
    group ID equals na_lab are dropped. The first line is treated as the
    header and is not returned.

    Returns (samples, groups), two parallel lists of strings.
    Calls sys.exit(1) when a line has fewer than 2 columns or when sample
    names are not unique.
    '''
    samples = []
    groups = []
    line_num = 0
    for l in ireader.reader(gfile):
        l = l.replace(' ', '')
        line_num += 1
        f = l.split(',')
        # Check the column count before touching f[1]; the old order raised
        # IndexError on a one-column line instead of printing the message.
        if len(f) < 2:
            print("Group fle must have 2 columns!", file=sys.stderr)
            sys.exit(1)
        if f[1] == na_lab:
            continue
        if line_num == 1:
            continue
        samples.append(f[0])
        groups.append(f[1])

    # The set comparison also copes with an empty sample list.
    if len(set(samples)) != len(samples):
        print("Sample names are not unique!", file=sys.stderr)
        # This is an error, so exit with a non-zero status.
        sys.exit(1)

    return (samples, groups)
|
|
314
|
+
|
|
315
|
+
def read_grp_file2(gfile):
    '''
    Read a group file. The group file defines the biological groups of the
    data matrix file.
    (1) It must have a header.
    (2) It must have at least two columns:
        * 1st column: sample names. Sample names should be unique, and they
          must be exactly the same as the first row of the beta matrix file.
        * 2nd column: group IDs.
        * additional columns can be included to indicate co-variables.
    (3) Columns must be separated by ",".

    For example:

    sampleID,survival,Sex
    Normal_1,1,1
    Normal_2,1,2
    Normal_3,1,1
    Tumor_1,2,1
    Tumor_2,2,2
    Tumor_3,2,1
    ...
    ...

    All spaces are removed from each line before it is split. Values are
    kept as strings.

    Returns (samples, covar_names, covars, covar_types):
        samples     : list of sample names (1st column, header excluded)
        covar_names : header names of all columns after the first
        covars      : dict of dict, covars[name][sample] = value
        covar_types : dict, name -> 'continuous' or 'categorical'

    Calls sys.exit(1) when a line has fewer than 2 columns or when sample
    names are not unique.
    '''
    samples = []
    covar_names = []
    covars = collections.defaultdict(dict)
    # key is the variable name, value is the list of observed values
    covar_values = collections.defaultdict(list)
    # A variable is called continuous when the ratio of distinct values to
    # the total number of values exceeds this cutoff.
    cutoff = 0.5
    line_num = 0

    for l in ireader.reader(gfile):
        l = l.replace(' ', '')
        line_num += 1
        f = l.split(',')
        if len(f) < 2:
            print("Group fle has at lesat 2 columns!", file=sys.stderr)
            sys.exit(1)
        if line_num == 1:
            covar_names = f[1:]
            continue

        sample_id = f[0]
        samples.append(sample_id)
        # zip() pairs values with header names; surplus fields are ignored.
        for name, value in zip(covar_names, f[1:]):
            covars[name][sample_id] = value
            covar_values[name].append(value)

    # The set comparison also copes with an empty sample list.
    if len(set(samples)) != len(samples):
        print("Sample names are not unique!", file=sys.stderr)
        # This is an error, so exit with a non-zero status.
        sys.exit(1)

    # tell if a covariable is continuous or categorical
    covar_types = {}
    for name, values in covar_values.items():
        if len(set(values)) / len(values) > cutoff:
            covar_types[name] = 'continuous'
        else:
            covar_types[name] = 'categorical'

    return (samples, covar_names, covars, covar_types)
|
|
377
|
+
|
|
378
|
+
def stats_over_range(cpg_ranges, chrom, st, end):
    '''
    Basic statistics of the values of the intervals found in a range.

    cpg_ranges maps chromosome name to an object whose find(st, end)
    returns intervals carrying a `.value` attribute.

    Returns [count, min, max, mean, median, std]. Numbers are rounded to
    4 decimals and std is the sample standard deviation (ddof=1), which is
    'NA' when only one value is available. Any statistic that cannot be
    computed is reported as 'NA'. If chrom is not in cpg_ranges, or no
    interval is found, the result is ['NA'] * 6.
    '''
    if chrom not in cpg_ranges:
        return ['NA'] * 6

    values = [hit.value for hit in cpg_ranges[chrom].find(st, end)]
    if not values:
        return ['NA'] * 6

    def _rounded(func, **kwargs):
        # Best effort: a value that cannot be summarised yields 'NA'.
        try:
            return round(func(values, **kwargs), 4)
        except (TypeError, ValueError):
            return 'NA'

    i_count = len(values)
    i_min = _rounded(min)
    i_max = _rounded(max)
    i_mean = _rounded(np.mean)
    i_median = _rounded(np.median)
    # The sample standard deviation needs at least two observations.
    i_std = _rounded(np.std, ddof=1) if len(values) > 1 else 'NA'

    return [i_count, i_min, i_max, i_mean, i_median, i_std]
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def density_over_range(lst, cpg_ranges):
    '''
    Calculate CpG density over range (upstream, gene, downstream).

    lst = list of (r1, r2, r3, strand), where each r is a
          (chrom, start, end) sequence and r1, r2, r3 are given in genomic
          order. For '-' records r3 is the upstream region and r1 the
          downstream one; any other strand is treated as '+'.
    cpg_ranges is returned by read_CpG_bed

    Every CpG interval overlapping a region is assigned to a percentage
    bin (0..100) according to its relative position in that region.
    Positions outside 0..100 are counted in the nearest boundary bin.
    When a region's chromosome is missing from cpg_ranges, that region and
    the remaining regions of the same record are skipped.

    Returns (up_CpG_density, gene_CpG_density, down_CpG_density), three
    dicts mapping each bin 0..100 to its CpG count.
    '''
    up_CpG_density = dict.fromkeys(range(0, 101), 0)
    gene_CpG_density = dict.fromkeys(range(0, 101), 0)
    down_CpG_density = dict.fromkeys(range(0, 101), 0)
    densities = (up_CpG_density, gene_CpG_density, down_CpG_density)

    for r1, r2, r3, strand in lst:
        # Order the regions as (upstream, gene, downstream) for this strand.
        if strand == '-':
            ordered = (r3, r2, r1)
        else:
            ordered = (r1, r2, r3)

        for region, density in zip(ordered, densities):
            if region[0] not in cpg_ranges:
                # Skip the rest of this record.
                break
            _tally_region_density(region, strand, cpg_ranges, density)

    return (up_CpG_density, gene_CpG_density, down_CpG_density)


def _tally_region_density(region, strand, cpg_ranges, density):
    '''Add the CpGs found in one (chrom, start, end) region to `density`.'''
    chrom, start, end = region[0], region[1], region[2]
    span = end - start
    if span <= 0:
        # A zero-length region would divide by zero below.
        return

    # find() returns e.g. [Interval(3, 40, value=3), Interval(13, 50, value=4)]
    for hit in cpg_ranges[chrom].find(start, end):
        if strand == '-':
            position = abs(round((hit.end - end) * 100 / span))
        else:
            position = round((hit.end - start) * 100 / span)
        # An interval reaching past the region edge can land outside
        # 0..100; clamp it instead of raising KeyError.
        density[min(max(position, 0), 100)] += 1
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def load_pickle_obj(path='./id2chr.pkl'):
    '''
    Load and return the object stored in a pickle file.

    path: pickle file to read; defaults to './id2chr.pkl' in the current
    working directory.

    SECURITY: unpickling can execute arbitrary code, so only load files
    from a trusted source.
    '''
    # Imported locally because this module never imports pickle at the top
    # level, which made the original function fail with NameError.
    import pickle

    with open(path, 'rb') as f:
        return pickle.load(f)
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
"""
|
|
526
|
+
def read_CpG_bed(cpgfile,genefile, bin_count = 100):
|
|
527
|
+
'''
|
|
528
|
+
cpgfile: CpG BED file (at least 3 columns).
|
|
529
|
+
genefile: gene BED file (at least 6 columns, must have strand information).
|
|
530
|
+
'''
|
|
531
|
+
cpg_ranges = {}
|
|
532
|
+
for l in ireader.reader(cpgfile):
|
|
533
|
+
if l.startswith('#'):
|
|
534
|
+
continue
|
|
535
|
+
if l.startswith('track'):
|
|
536
|
+
continue
|
|
537
|
+
if l.startswith('browser'):
|
|
538
|
+
continue
|
|
539
|
+
f = l.split()
|
|
540
|
+
if len(f)< 3:
|
|
541
|
+
print ("BED has at lesat 3 columns. Skip: " + l, file=sys.stderr)
|
|
542
|
+
continue
|
|
543
|
+
try:
|
|
544
|
+
chrom = f[0]
|
|
545
|
+
start = int(f[1])
|
|
546
|
+
end = int(f[2])
|
|
547
|
+
if start > end:
|
|
548
|
+
print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
|
|
549
|
+
continue
|
|
550
|
+
except:
|
|
551
|
+
print ("Not in valid BED format. Skip:" + l, file=sys.stderr)
|
|
552
|
+
continue
|
|
553
|
+
|
|
554
|
+
if chrom not in cpg_anges:
|
|
555
|
+
cpg_ranges[chrom] = IntervalTree()
|
|
556
|
+
cpg_ranges[chrom].insert_interval( Interval( int(start), int(end)))
|
|
557
|
+
|
|
558
|
+
#return cpg_ranges
|
|
559
|
+
|
|
560
|
+
cpg_profile = [] #list of list = [CpG_count across bins]
|
|
561
|
+
for l in ireader.reader(genefile):
|
|
562
|
+
if l.startswith('#'):
|
|
563
|
+
continue
|
|
564
|
+
if l.startswith('track'):
|
|
565
|
+
continue
|
|
566
|
+
if l.startswith('browser'):
|
|
567
|
+
continue
|
|
568
|
+
f = l.split()
|
|
569
|
+
if len(f)< 6:
|
|
570
|
+
print ("Gene BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
|
|
571
|
+
continue
|
|
572
|
+
try:
|
|
573
|
+
chrom = f[0]
|
|
574
|
+
tss_start = int(f[1])
|
|
575
|
+
tss_end = int(f[2])
|
|
576
|
+
strand = f[5]
|
|
577
|
+
if start > end:
|
|
578
|
+
print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
|
|
579
|
+
continue
|
|
580
|
+
except:
|
|
581
|
+
print ("Not in valid BED format. Skip:" + l, file=sys.stderr)
|
|
582
|
+
continue
|
|
583
|
+
|
|
584
|
+
#
|
|
585
|
+
if chrom not in cpg_ranges:
|
|
586
|
+
continue
|
|
587
|
+
|
|
588
|
+
genomic_size = tss_end - tss_start
|
|
589
|
+
window_start = tss_start - int(genomic_size/2.0) #extend upstream half gene size
|
|
590
|
+
window_end = tss_end + int(genomic_size/2.0) #extend downstream half gene size
|
|
591
|
+
if window_start < 0:
|
|
592
|
+
window_start = 0
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
bins = equal_split(widow_start, window_end, bin_count)
|
|
596
|
+
if len(bins) == 0: continue
|
|
597
|
+
|
|
598
|
+
cpg_counts = [] #CcG count in each bin
|
|
599
|
+
for (bin_st, bin_end) in bins:
|
|
600
|
+
tmp = cpg_ranges[chrom].find(bin_st, bin_end)
|
|
601
|
+
cpg_counts.append(len(tmp))
|
|
602
|
+
|
|
603
|
+
if strand == '-':
|
|
604
|
+
cpg_counts = cpg_counts[::-1]
|
|
605
|
+
cpg_profile.append(cpg_counts)
|
|
606
|
+
|
|
607
|
+
return np.array(cpg_profile).means(axis=0)
|
|
608
|
+
"""
|
|
609
|
+
def config_log(switch, logfile=None):
    """
    Configure the logging module.

    Parameters
    ----------
    switch : bool
        Debugging switch. The level is DEBUG only when switch is exactly
        True; for any other value it is INFO.
    logfile : str, optional
        File that receives the log records. When None, no filename is
        passed to logging.basicConfig.

    Returns
    -------
    None.

    """
    level = logging.DEBUG if switch is True else logging.INFO
    options = {
        'format': "%(asctime)s [%(levelname)s] %(message)s",
        # 24-hour clock: the former '%I' (12-hour) had no AM/PM marker,
        # which made the time stamps ambiguous.
        'datefmt': '%Y-%m-%d %H:%M:%S',
        'level': level,
    }
    if logfile is not None:
        options['filename'] = logfile
    logging.basicConfig(**options)
|
|
642
|
+
|