cpgtools-2.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
cpgmodule/utils.py ADDED
@@ -0,0 +1,642 @@
+ import sys
+ import collections
+ import logging
+ import pickle    # used by load_pickle_obj()
+ from time import strftime
+ from bx.intervals import *
+ import numpy as np
+ from cpgmodule import ireader
+
+
+ def revcomp(dna):
+     '''reverse complement DNA sequences'''
+     tab = str.maketrans('ACGTNX*-', 'TGCANX*-')
+     return dna.upper().translate(tab)[::-1]
+
+ #def is_number(s):
+ #    try:
+ #        float(s)
+ #        return True
+ #    except ValueError:
+ #        return False
+
+
+ def colors(n):
+     '''
+     return a list containing n colors
+     '''
+     if n > 12 or n < 1:
+         print("n must be in [1,12]", file=sys.stderr)
+         return None
+
+     color_12 = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']
+     color_11 = ['#276419','#4d9221','#7fbc41','#b8e186','#e6f5d0','#f7f7f7','#fde0ef','#f1b6da','#de77ae','#c51b7d','#8e0152']
+     color_10 = ['#276419','#4d9221','#7fbc41','#b8e186','#e6f5d0','#fde0ef','#f1b6da','#de77ae','#c51b7d','#8e0152']
+     color_9 = ['#c51b7d','#de77ae','#f1b6da','#fde0ef','#f7f7f7','#e6f5d0','#b8e186','#7fbc41','#4d9221']
+     color_8 = ['#c51b7d','#de77ae','#f1b6da','#fde0ef','#e6f5d0','#b8e186','#7fbc41','#4d9221']
+     color_7 = ['#c51b7d','#e9a3c9','#fde0ef','#f7f7f7','#e6f5d0','#a1d76a','#4d9221']
+     color_6 = ['#c51b7d','#e9a3c9','#fde0ef','#e6f5d0','#a1d76a','#4d9221']
+     color_5 = ['#d01c8b','#f1b6da','#f7f7f7','#b8e186','#4dac26']
+     color_4 = ['#d01c8b','#f1b6da','#b8e186','#4dac26']
+     color_3 = ['#e9a3c9','#f7f7f7','#a1d76a']
+     color_2 = ['blue','red']
+     color_1 = ['blue']
+
+     tmp = [color_1, color_2, color_3, color_4, color_5, color_6, color_7, color_8, color_9, color_10, color_11, color_12]
+     return ["'" + i + "'" for i in tmp[n-1]]
+
+ def printlog(mesg):
+     '''print progress message'''
+     mesg = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg
+     print(mesg, file=sys.stderr)
+
+ def chrom_count(infile):
+     '''
+     Count chromosome frequencies from a BED file.
+     '''
+     chrom_count = collections.defaultdict(int)
+     for l in ireader.reader(infile):
+         if l.startswith('#'):
+             continue
+         if l.startswith('track'):
+             continue
+         if l.startswith('browser'):
+             continue
+         f = l.split()
+         if len(f) < 3:
+             print("BED must have at least 3 columns. Skip: " + l, file=sys.stderr)
+             continue
+         try:
+             start = int(f[1])
+             end = int(f[2])
+             if start > end:
+                 print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
+                 continue
+         except:
+             print("Not in valid BED format. Skip: " + l, file=sys.stderr)
+             continue
+
+         chrom_count[f[0]] += 1
+     return chrom_count
+
+ def read_chromSize(infile):
+     '''
+     read chromosome size file (tab/space separated plain text file).
+     chr1    249250621
+     chr2    243199373
+     chr3    198022430
+     chr4    191154276
+     '''
+     names = []
+     sizes = []
+     for l in ireader.reader(infile):
+         if l.startswith('#'):
+             continue
+         f = l.split()
+         if len(f) != 2:
+             continue
+         names.append(f[0])
+         sizes.append(int(f[1]))
+     return (names, sizes)
+
+ def equal_split(st, end, n):
+     '''
+     Equally split range(st, end) into n parts
+     '''
+     lst = []
+     if end - st < n:
+         return []
+     stepSize = round((end - st)*1.0/n)
+     count = 1
+
+     a = st
+     while count <= n:
+         b = a + stepSize
+         lst.append((a, b))
+         a = b
+         count += 1
+     return lst
+
+
+ def read_CpG_bed(cpgfile):
+     '''
+     cpgfile: CpG BED file with at least 3 columns (Chrom, chromStart, chromEnd).
+     Note: chromEnd corresponds to the genomic position of the methylated C.
+     The beta value is read from the 5th column; if there is no 5th column (or the
+     5th column is not a number), beta is set to 1.0. The strand is read from the
+     6th column and defaults to '+'. Additional columns are ignored.
+     '''
+     cpg_ranges = {}
+     for l in ireader.reader(cpgfile):
+         if l.startswith('#'):
+             continue
+         if l.startswith('track'):
+             continue
+         if l.startswith('browser'):
+             continue
+         f = l.split()
+         if len(f) < 3:
+             print("BED must have at least 3 columns. Skip: " + l, file=sys.stderr)
+             continue
+
+         chrom = f[0]
+         start = int(f[1])
+         end = int(f[2])
+         if start > end:
+             print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
+             continue
+
+         try:
+             beta = float(f[4])
+         except:
+             beta = 1.0
+         try:
+             strand = f[5]
+         except:
+             strand = '+'
+
+         if chrom not in cpg_ranges:
+             cpg_ranges[chrom] = IntervalTree()
+         if strand == '+':
+             cpg_ranges[chrom].insert_interval(Interval(start, end, value=beta))
+         elif strand == '-':
+             cpg_ranges[chrom].insert_interval(Interval(end, end+1, value=beta))
+
+     return cpg_ranges
+
+
+ def read_region_bed(bedfile):
+     '''
+     bedfile should have at least 3 columns (Chrom, chromStart, chromEnd).
+     If no strand information is found in the 6th column, all regions are
+     considered to be on the "+" strand.
+     '''
+     for l in ireader.reader(bedfile):
+         if l.startswith('#'):
+             continue
+         if l.startswith('track'):
+             continue
+         if l.startswith('browser'):
+             continue
+         f = l.split()
+
+         try:
+             chrom = f[0]
+             start = int(f[1])
+             end = int(f[2])
+             if start > end:
+                 print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
+                 continue
+         except:
+             print("BED must have at least 3 columns. Skip: " + l, file=sys.stderr)
+             continue
+         try:
+             strand = f[5]
+         except:
+             strand = "+"
+
+         yield (chrom, start, end, strand)
+
+ def read_bed_as_list(bedfile):
+     '''
+     bedfile should have at least 3 columns (Chrom, chromStart, chromEnd).
+     If no strand information is found in the 6th column, all regions are
+     considered to be on the "+" strand.
+     '''
+     lst = []
+     for l in ireader.reader(bedfile):
+         if l.startswith('#'):
+             continue
+         if l.startswith('track'):
+             continue
+         if l.startswith('browser'):
+             continue
+         f = l.split()
+
+         try:
+             chrom = f[0]
+             start = int(f[1])
+             end = int(f[2])
+             if start > end:
+                 print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
+                 continue
+         except:
+             print("BED must have at least 3 columns. Skip: " + l, file=sys.stderr)
+             continue
+         lst.append([chrom, start, end])
+     return lst
+
+ def coverage_over_range(lst, cpg_ranges):
+     '''
+     Calculate relative methylation density
+     lst = list of (chr, start, end, strand)
+     cpg_ranges is returned by read_CpG_bed
+     '''
+
+     results = collections.defaultdict(list)
+     beta_signals = {}
+     for (chr, st, end, strand) in lst:
+         if chr not in cpg_ranges:
+             continue
+
+         span = end - st
+         tmp = cpg_ranges[chr].find(st, end)    # e.g. [Interval(3, 40, value=3), Interval(13, 50, value=4)]
+         for i in tmp:
+             if strand == '+':
+                 CpG_to_origin = round((i.end - (st+1))*100/span)
+             if strand == '-':
+                 CpG_to_origin = abs(round((i.end - end)*100/span))
+             CpG_beta = i.value
+             results[CpG_to_origin].append(CpG_beta)
+     for k, v in results.items():
+         beta_signals[k] = round(np.mean(v), 4)
+     return beta_signals
+
+
+ def count_over_range(lst, cpg_ranges):
+     '''
+     Calculate how many CpGs are located in lst
+     lst = list of (chr, start, end)
+     cpg_ranges is returned by read_CpG_bed
+     '''
+
+     total_size = 0     # total nucleotides of the list of genomic regions
+     total_count = 0    # total CpGs in the list of genomic regions
+     for (chr, st, end) in lst:
+         total_size += (end - st)
+         if chr not in cpg_ranges:
+             continue
+         tmp = cpg_ranges[chr].find(st, end)    # e.g. [Interval(3, 40, value=3), Interval(13, 50, value=4)]
+         total_count += len(tmp)
+     return (total_size, total_count)
+
+ def read_grp_file1(gfile, na_lab="NA"):
+     '''
+     Read the group file. The group file defines the biological groups of the data matrix file.
+     (1) It must have a header row.
+     (2) It must have two columns:
+         * 1st column: sample names. Sample names must be unique and exactly match the first row of the beta matrix file.
+         * 2nd column: group IDs.
+     (3) Columns must be separated by ",".
+
+     For example:
+
+     sampleID,groupID
+     Normal_1,1
+     Normal_2,1
+     Normal_3,1
+     Tumor_1,2
+     Tumor_2,2
+     Tumor_3,2
+     '''
+     samples = []
+     groups = []
+     line_num = 0
+     for l in ireader.reader(gfile):
+         l = l.replace(' ', '')
+         line_num += 1
+         f = l.split(',')
+         if len(f) < 2:
+             print("Group file must have 2 columns!", file=sys.stderr)
+             sys.exit(1)
+         if f[1] == na_lab:
+             continue
+         if line_num == 1:
+             continue
+         else:
+             samples.append(f[0])
+             groups.append(f[1])
+
+     tmp = collections.Counter(samples)
+     if tmp.most_common(1)[0][1] > 1:
+         print("Sample names are not unique!", file=sys.stderr)
+         sys.exit(0)
+
+     return (samples, groups)
+
+ def read_grp_file2(gfile):
+     '''
+     Read the group file. The group file defines the biological groups of the data matrix file.
+     (1) It must have a header row.
+     (2) It must have at least two columns:
+         * 1st column: sample names. Sample names must be unique and exactly match the first row of the beta matrix file.
+         * 2nd column: group IDs.
+         * Additional columns can be included to indicate covariates.
+     (3) Columns must be separated by ",".
+
+     For example:
+
+     sampleID,survival,Sex
+     Normal_1,1,1
+     Normal_2,1,2
+     Normal_3,1,1
+     Tumor_1,2,1
+     Tumor_2,2,2
+     Tumor_3,2,1
+     ...
+     ...
+     '''
+     samples = []
+     covar_names = []
+     covars = collections.defaultdict(dict)
+     line_num = 0
+
+     covar_values = collections.defaultdict(list)    # continuous or categorical covariate; key is covariate name, value is the list of its values
+     cutoff = 0.5    # ratio of the number of unique values to the total number of values
+     for l in ireader.reader(gfile):
+         l = l.replace(' ', '')
+         line_num += 1
+         f = l.split(',')
+         if len(f) < 2:
+             print("Group file must have at least 2 columns!", file=sys.stderr)
+             sys.exit(1)
+         if line_num == 1:
+             covar_names = f[1:]
+         else:
+             sample_id = f[0]
+             samples.append(sample_id)
+             row_values = f[1:]
+
+             for a, b in zip(covar_names, row_values):
+                 covars[a][sample_id] = b
+                 covar_values[a].append(b)
+
+     tmp = collections.Counter(samples)
+     if tmp.most_common(1)[0][1] > 1:
+         print("Sample names are not unique!", file=sys.stderr)
+         sys.exit(0)
+
+     # tell if a covariate is continuous or categorical
+     covar_types = {}
+     for k, v in covar_values.items():
+         if (1.0*len(set(v)) / len(v)) > cutoff:
+             covar_types[k] = 'continuous'
+         else:
+             covar_types[k] = 'categorical'
+
+     return (samples, covar_names, covars, covar_types)
+
+ def stats_over_range(cpg_ranges, chrom, st, end):
+     '''
+     Basic statistics about range
+     '''
+
+     if chrom not in cpg_ranges:
+         return ['NA']*6
+
+     tmp = []
+     overlaps = cpg_ranges[chrom].find(st, end)
+     for i in overlaps:
+         tmp.append(i.value)
+
+     if len(tmp) == 0:
+         return ['NA']*6
+
+     try:
+         i_count = len(overlaps)
+     except:
+         i_count = 'NA'
+
+     try:
+         i_min = round(min(tmp), 4)
+     except:
+         i_min = 'NA'
+
+     try:
+         i_max = round(max(tmp), 4)
+     except:
+         i_max = 'NA'
+
+     try:
+         i_mean = round(np.mean(tmp), 4)
+     except:
+         i_mean = 'NA'
+
+     try:
+         i_median = round(np.median(tmp), 4)
+     except:
+         i_median = 'NA'
+
+     try:
+         if len(tmp) > 1:
+             i_std = round(np.std(tmp, ddof=1), 4)
+         else:
+             i_std = 'NA'
+     except:
+         i_std = 'NA'
+
+     return [i_count, i_min, i_max, i_mean, i_median, i_std]
+
+
+ def density_over_range(lst, cpg_ranges):
+     '''
+     Calculate CpG density over range (upstream, gene, downstream)
+     lst = list of (upstream_region, gene_region, downstream_region, strand),
+           where each region is a (chrom, start, end) tuple
+     cpg_ranges is returned by read_CpG_bed
+     '''
+
+     up_CpG_density = {}
+     gene_CpG_density = {}
+     down_CpG_density = {}
+     for i in range(0, 101):
+         up_CpG_density[i] = 0
+         gene_CpG_density[i] = 0
+         down_CpG_density[i] = 0
+
+     for r1, r2, r3, strand in lst:
+         #if chr not in cpg_ranges:
+         #    continue
+         if strand == '+':
+             up_region = r1
+             gene_region = r2
+             down_region = r3
+
+         elif strand == '-':
+             up_region = r3
+             gene_region = r2
+             down_region = r1
+
+         ## up-stream region
+         chrom = up_region[0]
+         start = up_region[1]
+         end = up_region[2]
+         span = end - start
+         if chrom not in cpg_ranges:
+             continue
+         tmp = cpg_ranges[chrom].find(start, end)    # e.g. [Interval(3, 40, value=3), Interval(13, 50, value=4)]
+         for i in tmp:
+             if strand == '+':
+                 CpG_to_origin = round((i.end - start)*100/span)
+             elif strand == '-':
+                 CpG_to_origin = abs(round((i.end - end)*100/span))
+
+             up_CpG_density[CpG_to_origin] += 1
+
+         ## gene region
+         chrom = gene_region[0]
+         start = gene_region[1]
+         end = gene_region[2]
+         span = end - start
+         if chrom not in cpg_ranges:
+             continue
+         tmp = cpg_ranges[chrom].find(start, end)    # e.g. [Interval(3, 40, value=3), Interval(13, 50, value=4)]
+         for i in tmp:
+             if strand == '+':
+                 CpG_to_origin = round((i.end - start)*100/span)
+             elif strand == '-':
+                 CpG_to_origin = abs(round((i.end - end)*100/span))
+
+             gene_CpG_density[CpG_to_origin] += 1
+
+         ## down-stream region
+         chrom = down_region[0]
+         start = down_region[1]
+         end = down_region[2]
+         span = end - start
+         if chrom not in cpg_ranges:
+             continue
+         tmp = cpg_ranges[chrom].find(start, end)    # e.g. [Interval(3, 40, value=3), Interval(13, 50, value=4)]
+         for i in tmp:
+             if strand == '+':
+                 CpG_to_origin = round((i.end - start)*100/span)
+             elif strand == '-':
+                 CpG_to_origin = abs(round((i.end - end)*100/span))
+
+             down_CpG_density[CpG_to_origin] += 1
+
+
+     #for k in sorted(up_CpG_density):
+     #    print (str(k) + '\t' + str(up_CpG_density[k]))
+
+     #for k in sorted(gene_CpG_density):
+     #    print (str(k) + '\t' + str(gene_CpG_density[k]))
+     #for k in sorted(down_CpG_density):
+     #    print (str(k) + '\t' + str(down_CpG_density[k]))
+
+     return (up_CpG_density, gene_CpG_density, down_CpG_density)
+
+
+ def load_pickle_obj():
+     '''Load a pickled object from ./id2chr.pkl in the current working directory.'''
+     with open('./id2chr.pkl', 'rb') as f:
+         return pickle.load(f)
+
+
+ """
+ def read_CpG_bed(cpgfile, genefile, bin_count=100):
+     '''
+     cpgfile: CpG BED file (at least 3 columns).
+     genefile: gene BED file (at least 6 columns, must have strand information).
+     '''
+     cpg_ranges = {}
+     for l in ireader.reader(cpgfile):
+         if l.startswith('#'):
+             continue
+         if l.startswith('track'):
+             continue
+         if l.startswith('browser'):
+             continue
+         f = l.split()
+         if len(f) < 3:
+             print("BED must have at least 3 columns. Skip: " + l, file=sys.stderr)
+             continue
+         try:
+             chrom = f[0]
+             start = int(f[1])
+             end = int(f[2])
+             if start > end:
+                 print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
+                 continue
+         except:
+             print("Not in valid BED format. Skip: " + l, file=sys.stderr)
+             continue
+
+         if chrom not in cpg_ranges:
+             cpg_ranges[chrom] = IntervalTree()
+         cpg_ranges[chrom].insert_interval(Interval(int(start), int(end)))
+
+     #return cpg_ranges
+
+     cpg_profile = []    # list of lists = [CpG count across bins]
+     for l in ireader.reader(genefile):
+         if l.startswith('#'):
+             continue
+         if l.startswith('track'):
+             continue
+         if l.startswith('browser'):
+             continue
+         f = l.split()
+         if len(f) < 6:
+             print("Gene BED must have at least 6 columns. Skip: " + l, file=sys.stderr)
+             continue
+         try:
+             chrom = f[0]
+             tss_start = int(f[1])
+             tss_end = int(f[2])
+             strand = f[5]
+             if tss_start > tss_end:
+                 print("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
+                 continue
+         except:
+             print("Not in valid BED format. Skip: " + l, file=sys.stderr)
+             continue
+
+         #
+         if chrom not in cpg_ranges:
+             continue
+
+         genomic_size = tss_end - tss_start
+         window_start = tss_start - int(genomic_size/2.0)    # extend upstream by half the gene size
+         window_end = tss_end + int(genomic_size/2.0)        # extend downstream by half the gene size
+         if window_start < 0:
+             window_start = 0
+
+
+         bins = equal_split(window_start, window_end, bin_count)
+         if len(bins) == 0: continue
+
+         cpg_counts = []    # CpG count in each bin
+         for (bin_st, bin_end) in bins:
+             tmp = cpg_ranges[chrom].find(bin_st, bin_end)
+             cpg_counts.append(len(tmp))
+
+         if strand == '-':
+             cpg_counts = cpg_counts[::-1]
+         cpg_profile.append(cpg_counts)
+
+     return np.array(cpg_profile).mean(axis=0)
+ """
+ def config_log(switch, logfile=None):
+     """
+     Configure the logging module.
+
+     Parameters
+     ----------
+     switch : bool
+         Debugging switch; when True, the log level is DEBUG, otherwise INFO.
+     logfile : str, optional
+         Write log messages to this file instead of the console.
+
+     Returns
+     -------
+     None.
+
+     """
+     if switch is True:
+         if logfile is None:
+             logging.basicConfig(
+                 format="%(asctime)s [%(levelname)s] %(message)s",
+                 datefmt='%Y-%m-%d %I:%M:%S', level=logging.DEBUG)
+         else:
+             logging.basicConfig(
+                 filename=logfile,
+                 format="%(asctime)s [%(levelname)s] %(message)s",
+                 datefmt='%Y-%m-%d %I:%M:%S', level=logging.DEBUG)
+     else:
+         if logfile is None:
+             logging.basicConfig(
+                 format="%(asctime)s [%(levelname)s] %(message)s",
+                 datefmt='%Y-%m-%d %I:%M:%S', level=logging.INFO)
+         else:
+             logging.basicConfig(
+                 filename=logfile,
+                 format="%(asctime)s [%(levelname)s] %(message)s",
+                 datefmt='%Y-%m-%d %I:%M:%S', level=logging.INFO)
+
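
The helpers in cpgmodule/utils.py are imported by the CpGtools command-line scripts listed above. The short sketch below is illustrative only and is not part of the wheel: the input file names my_cpgs.bed and my_regions.bed are placeholders, and it assumes cpgtools and its bx-python dependency are installed.

# Hypothetical usage sketch -- not shipped with the package.
# 'my_cpgs.bed' and 'my_regions.bed' are placeholder input files.
from cpgmodule import utils

utils.config_log(switch=False)            # INFO-level logging to the console
utils.printlog('Loading CpG sites ...')

# Per-chromosome interval trees of CpG sites, with beta values attached.
cpg_ranges = utils.read_CpG_bed('my_cpgs.bed')

# Count CpGs falling into a set of target regions.
regions = utils.read_bed_as_list('my_regions.bed')
total_bp, total_cpgs = utils.count_over_range(regions, cpg_ranges)
utils.printlog('%d CpGs in %d bp of target regions' % (total_cpgs, total_bp))

# Per-region summary of beta values: count, min, max, mean, median, std.
for chrom, start, end in regions:
    print(chrom, start, end, utils.stats_over_range(cpg_ranges, chrom, start, end))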