cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
cpgmodule/BED.py ADDED
@@ -0,0 +1,441 @@
1
+ #!/usr/bin/env python
2
+
3
+ #import built-in modules
4
+ import os,sys
5
+ import re
6
+ import string
7
+ import warnings
8
+ import string
9
+ import collections
10
+ import math
11
+ from operator import itemgetter
12
+ from itertools import groupby
13
+
14
+
15
+ #import third-party modules
16
+ from bx.bitset import *
17
+ from bx.bitset_builders import *
18
+ from bx.intervals import *
19
+
20
+ #from itertools import *
21
+ from cpgmodule import ireader
22
+
23
# Human-readable description of the 12 BED columns. The ParseBED methods in
# this module print it to stderr when an input line has fewer than 12 fields.
BED12 = '''
1. chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671).
2. chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
3. chromEnd - The ending position of the feature in the chromosome or scaffold.
4. name - Defines the name of the BED line.
5. score.
6. strand - Defines the strand. Either "." (=no strand) or "+" or "-".
7. thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
8. thickEnd - The ending position at which the feature is drawn thickly (for example the stop codon in gene displays).
9. itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0).
10. blockCount - The number of blocks (exons) in the BED line.
11. blockSizes - A comma-separated list of the block sizes.
12. blockStarts - A comma-separated list of block starts.

Detailed description of BED format: https://genome.ucsc.edu/FAQ/FAQformat.html#format1
'''


# Module metadata (author, license, version, contact).
__author__ = "Liguo Wang"
__copyright__ = "Copyleft"
__credits__ = []
__license__ = "GPL"
__version__="0.1.0"
__maintainer__ = "Liguo Wang"
__email__ = "wang.liguo@mayo.edu"
__status__ = "Development"
49
+
50
+
51
+
52
class ParseBED:
    '''
    Manipulate BED (http://genome.ucsc.edu/FAQ/FAQformat.html) format file.
    Input BED file must be 12-column (i.e. BED-12); getIntergenic() only
    reads columns 1, 2, 3 and 6.

    Every getter re-reads the file through ireader.reader(), skips blank
    lines and lines starting with '#', 'track' or 'browser', and returns a
    list of tuples: (chrom, start, end, strand) when stranded is true,
    (chrom, start, end) otherwise. When uniquify is true duplicates are
    removed via set(), so the order of the returned list is arbitrary.
    '''

    # Line prefixes that mark comment / header lines to be ignored.
    _SKIP_PREFIXES = ('#', 'track', 'browser')

    # Message printed (followed by the BED12 description) for short lines.
    _SHORT_LINE_MSG = "\nInput error!\nStandard BED format has 12 columns.\n%s"

    def __init__(self,bedFile):
        '''This is constructor of ParseBED'''
        self.f=bedFile
        self.fileName=os.path.basename(bedFile)
        self.ABS_fileName=bedFile

    def _records(self, short_line_msg=None):
        '''
        Yield the whitespace-split fields of every data line of the file.

        short_line_msg: when not None, a line with fewer than 12 fields makes
        the program print this template (filled with BED12) to stderr and
        exit with status 1.
        '''
        for line in ireader.reader(self.f):
            line = line.strip()
            # Blank lines carry no record; skip them instead of failing.
            if not line or line.startswith(self._SKIP_PREFIXES):
                continue
            fields = line.split()
            if short_line_msg is not None and len(fields) < 12:
                print (short_line_msg % (BED12), file=sys.stderr)
                sys.exit(1)
            yield fields

    @staticmethod
    def _parse_bed12(fields):
        '''
        Convert the fields of one BED-12 line into
        (chrom, strand, cdsStart, cdsEnd, blockCount, exons), where exons is
        a list of (start, end) pairs in chromosome coordinates.
        '''
        chrom = fields[0]
        chrom_start = int(fields[1])
        strand = fields[5]
        cds_start = int(fields[6])
        cds_end = int(fields[7])
        block_count = int(fields[9])
        # Trailing commas are allowed in the block columns, hence strip(',').
        block_sizes = [int(i) for i in fields[10].strip(',').split(',')]
        block_starts = [chrom_start + int(i) for i in fields[11].strip(',').split(',')]
        exons = [(base, base + offset) for base, offset in zip(block_starts, block_sizes)]
        return chrom, strand, cds_start, cds_end, block_count, exons

    @staticmethod
    def _block(chrom, start, end, strand, stranded):
        '''Build one output tuple, with or without the strand column.'''
        if stranded:
            return (chrom, start, end, strand)
        return (chrom, start, end)

    @staticmethod
    def _finish(blocks, uniquify):
        '''Return blocks, de-duplicated through set() when uniquify is true.'''
        if uniquify:
            return list(set(blocks))
        return blocks

    def getExons(self,uniquify = True, stranded = True):
        '''
        Get all exons (including both coding exons and UTR exons) from BED-12 file.
        uniquify: if the returned blocks should be uniquify.
        stranded: if the strand should be appended to each returned block.
        '''
        reblocks = []
        # The original message referenced an undefined name (BED); BED12 is
        # the constant that holds the format description.
        for f in self._records("Standard BED format has 12 columns.\n%s"):
            chrom, strand, _, _, _, exons = self._parse_bed12(f)
            for st, end in exons:
                reblocks.append(self._block(chrom, st, end, strand, stranded))
        return self._finish(reblocks, uniquify)

    def getCDSExons(self,uniquify=True, stranded = True):
        '''
        Get only CDS exon regions from BED-12 file. Both 5' and 3' UTR parts are removed.
        Exons (or exon parts) that do not overlap [cdsStart, cdsEnd) by at
        least one base are dropped, so no zero-length block is returned.
        uniquify: if the returned blocks should be uniquify.
        stranded: if the strand should be appended to each returned block.
        '''
        reblocks = []
        for f in self._records(self._SHORT_LINE_MSG):
            chrom, strand, cds_start, cds_end, _, exons = self._parse_bed12(f)
            for st, end in exons:
                # Clip the exon to the CDS interval.
                exon_start = max(st, cds_start)
                exon_end = min(end, cds_end)
                if exon_start >= exon_end:
                    continue
                reblocks.append(self._block(chrom, exon_start, exon_end, strand, stranded))
        return self._finish(reblocks, uniquify)

    def getUTRs(self,utr=35, uniquify=True, stranded = True):
        '''
        Get UTR regions from BED-12 file.
        When utr=35 [default], extract both 5' and 3' UTR.
        When utr=3, only extract 3' UTR.
        When utr=5, only extract 5' UTR
        Records whose strand is neither '+' nor '-' contribute nothing.
        uniquify: if the returned blocks should be uniquify.
        stranded: if the strand should be appended to each returned block.
        '''
        reblocks = []
        for f in self._records(self._SHORT_LINE_MSG):
            chrom, strand, cds_start, cds_end, _, exons = self._parse_bed12(f)
            # The part left of cdsStart is the 5' UTR on '+' and the 3' UTR
            # on '-'; the part right of cdsEnd is the opposite.
            if strand == '+':
                want_left = (utr==35 or utr==5)
                want_right = (utr==35 or utr==3)
            elif strand == '-':
                want_left = (utr==35 or utr==3)
                want_right = (utr==35 or utr==5)
            else:
                continue
            if want_left:
                for st, end in exons:
                    if st < cds_start:
                        reblocks.append(self._block(chrom, st, min(end, cds_start), strand, stranded))
            if want_right:
                for st, end in exons:
                    if end > cds_end:
                        reblocks.append(self._block(chrom, max(st, cds_end), end, strand, stranded))
        return self._finish(reblocks, uniquify)

    def getIntrons(self, itype, uniquify=True, stranded=True):
        '''
        Get Intron regions from BED-12 file.
        separated bed file, each row represents one intron

        itype = :
            * 'all': all introns
            * 'first': Only return the first intron of each gene. The gene should have at least 1 intron.
            * 'internal': return all internal introns. The gene should have at least 3 introns.
            * 'last': Return the last intron. The gene should have at least 2 introns.
            * 'cds': Return introns within CDS region.
            * 'utr': Return introns within UTR regions.

        'first' and 'last' follow the direction of transcription: on the '-'
        strand the first intron is the right-most one in the file and the
        last intron is the left-most one. Any other itype returns no blocks.
        '''
        reblocks=[]
        for f in self._records(self._SHORT_LINE_MSG):
            chrom, strand, cds_start, cds_end, block_count, exons = self._parse_bed12(f)
            if block_count == 1:continue

            # An intron runs from the end of one exon to the start of the next.
            intron_list = [(prev[1], nxt[0]) for prev, nxt in zip(exons[:-1], exons[1:])]
            intron_number = len(intron_list)

            if itype == 'all':
                selected = intron_list
            elif itype == 'first':
                if intron_number == 0:
                    continue
                selected = [intron_list[-1] if strand == '-' else intron_list[0]]
            elif itype == 'last':
                if intron_number < 2:
                    continue
                selected = [intron_list[0] if strand == '-' else intron_list[-1]]
            elif itype == 'internal':
                if intron_number < 3:
                    continue
                selected = intron_list[1:-1]
            elif itype == 'cds':
                selected = [(st, end) for (st, end) in intron_list
                            if not (end < cds_start or st > cds_end)]
            elif itype == 'utr':
                selected = [(st, end) for (st, end) in intron_list
                            if end < cds_start or st > cds_end]
            else:
                selected = []

            for (st, end) in selected:
                reblocks.append(self._block(chrom, st, end, strand, stranded))
        return self._finish(reblocks, uniquify)

    def getIntergenic(self,direction='up', size=2000, uniquify=True, stranded = True):
        '''
        get intergenic regions. direction=up or down or both.

        For every record a window of `size` bases is taken next to the
        transcript: upstream is left of chromStart on '+' (and any strand
        other than '-') and right of chromEnd on '-'; downstream is the
        opposite. Windows are clipped at position 0.
        uniquify: if the returned blocks should be uniquify.
        stranded: if the strand should be appended to each returned block
        (applied to upstream and downstream windows alike).
        '''
        reblocks=[]
        for f in self._records():
            chrom = f[0]
            tx_start = int( f[1] )
            tx_end = int( f[2] )
            strand = f[5]
            left = (max(tx_start-size,0), tx_start)
            right = (tx_end, tx_end+size)
            if strand == '-':
                upstream, downstream = right, left
            else:
                upstream, downstream = left, right
            if (direction=="up" or direction=="both"):
                reblocks.append(self._block(chrom, upstream[0], upstream[1], strand, stranded))
            if (direction=="down" or direction=="both"):
                reblocks.append(self._block(chrom, downstream[0], downstream[1], strand, stranded))
        return self._finish(reblocks, uniquify)
363
+
364
+
365
+
366
+
367
+
368
def unionBed3(lst):
    '''
    Take the union of 3 column bed files. return a new list

    lst is handed to binned_bitsets_from_list(); for every chromosome of the
    resulting bitsets, each run of set bits becomes one [chrom, start, end]
    entry of the returned list.
    '''
    merged = []
    for chrom, bits in binned_bitsets_from_list(lst).items():
        cursor = 0
        while True:
            run_start = bits.next_set(cursor)
            # next_set() returning the bitset size means no set bit is left.
            if run_start == bits.size:
                break
            cursor = bits.next_clear(run_start)
            merged.append([chrom, run_start, cursor])
    return merged
382
+
383
def intersectBed3(lst1,lst2):
    '''
    Take the intersection of two bed files (3 column bed files)

    Both lists are converted with binned_bitsets_from_list(). Chromosomes
    present in both are AND-ed bit by bit, and every remaining run of set
    bits is returned as a [chrom, start, end] entry.
    '''
    first = binned_bitsets_from_list(lst1)
    second = binned_bitsets_from_list(lst2)

    # Keep only chromosomes that occur in both inputs, already intersected.
    shared = {}
    for chrom, bits in first.items():
        if chrom not in second:
            continue
        bits.iand(second[chrom])
        shared[chrom] = bits

    overlaps = []
    for chrom, bits in shared.items():
        cursor = 0
        while True:
            run_start = bits.next_set(cursor)
            if run_start == bits.size:
                break
            cursor = bits.next_clear(run_start)
            overlaps.append([chrom, run_start, cursor])
    return overlaps
407
+
408
def subtractBed3(lst1,lst2):
    '''
    Subtract lst2 from lst1.

    Both lists are converted with binned_bitsets_from_list(). For every
    chromosome of lst1, the bits that are also set for lst2 are cleared
    (lst2's bitset is inverted and AND-ed in); chromosomes absent from lst2
    are kept unchanged. Every remaining run of set bits is returned as a
    [chrom, start, end] entry.
    '''
    bitsets1 = binned_bitsets_from_list(lst1)
    bitsets2 = binned_bitsets_from_list(lst2)

    ret_lst=[]
    for chrom in bitsets1:
        bits1 = bitsets1[chrom]
        if chrom in bitsets2:
            bits2 = bitsets2[chrom]
            # a AND (NOT b) == a minus b
            bits2.invert()
            bits1.iand( bits2 )
        end=0
        while 1:
            start = bits1.next_set( end )
            # next_set() returning the bitset size means no set bit is left.
            if start == bits1.size: break
            end = bits1.next_clear( start )
            ret_lst.append([chrom,start,end])
    return ret_lst
431
+
432
def tillingBed(chrName,chrSize,stepSize=10000):
    '''
    Tile a whole chromosome into consecutive windows.

    Yields (chrName, start, end) tuples with start = 0, stepSize,
    2*stepSize, ... The last window is truncated at chrSize, and nothing is
    yielded when chrSize is 0.
    '''
    # range() replaces the Python-2-only xrange(), which is undefined on Python 3.
    for start in range(0,chrSize,stepSize):
        yield (chrName, start, min(start + stepSize, chrSize))
441
+
cpgmodule/MI.py ADDED
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env python
2
+ '''calculate mutual information of two lists of numbers or symbols'''
3
+
4
+ import numpy as np
5
+
6
+ from collections import Counter
7
+ from sklearn.feature_selection import mutual_info_classif
8
+ from sklearn.metrics import mutual_info_score
9
+
10
+
11
# Module metadata (author, license, version, contact).
__author__ = "Liguo Wang"
__copyright__ = ""
__credits__ = []
__license__ = "GPLv2"
__version__ = "1.0.0"
__maintainer__ = "Liguo Wang"
__email__ = "Wang.Liguo@mayo.edu"
__status__ = "Development" #Prototype or Production
19
+
20
+
21
+
22
+
23
def Mutual_information1(x,y):
    '''
    x and y are lists of symbols (like 'A','C','G','T').
    Calculation mutual information based on: MI = H(x) + H(y) - H(x,y)
    Log2 based, unit is bit

    Every element is converted with str() before counting. Returns the tuple
    (H(x), H(y), H(x,y), MI), or 0 when x and y differ in length.
    '''
    x = [str(i) for i in x]
    y = [str(i) for i in y]
    if len(x) != len(y):
        return 0

    def entropy(symbols):
        '''Shannon entropy (bits) of the empirical distribution of symbols.'''
        freq = np.array(list(Counter(symbols).values()))
        freq = freq*1.0/sum(freq)
        return -sum([i * np.log2(i) for i in freq])

    x_H = entropy(x)
    y_H = entropy(y)
    # Joint symbols are (x, y) tuples: concatenating the two strings would
    # merge distinct pairs such as ('1', '11') and ('11', '1').
    xy_H = entropy(zip(x,y))

    return (x_H,y_H,xy_H, x_H+y_H-xy_H)
48
+
49
def Mutual_information2(x,y):
    '''
    x and y are lists of symbols (like 'A','C','G','T').
    Calculate mutual information based on its original definition:
    MI = sum over observed pairs of p(x,y) * log(p(x,y) / (p(x) * p(y))).
    Natural-log based (np.log), so the unit is nat, not bit.

    Every element is converted with str() before counting. Returns the
    mutual information, or 0 when x and y differ in length.
    '''
    x = [str(i) for i in x]
    y = [str(i) for i in y]
    if len(x) != len(y):
        return 0
    total = len(x)

    px = {sym: cnt*1.0/total for sym, cnt in Counter(x).items()}
    py = {sym: cnt*1.0/total for sym, cnt in Counter(y).items()}

    mi_sum = 0.0
    # Joint symbols are (x, y) tuples: concatenating the two strings would
    # merge distinct pairs such as ('1', '11') and ('11', '1').
    for (xi, yi), cnt in Counter(zip(x,y)).items():
        pxy = cnt*1.0/total
        mi_sum += (pxy * np.log(pxy / (px[xi] * py[yi])))
    return mi_sum
83
+
84
def PMI(x,y):
    '''
    x and y are lists of symbols (like 'A','C','G','T').
    Calculate pointwise mutual information based on its original definition.
    Log2 based, unit is bit

    Every element is converted with str() before counting. For each (x, y)
    symbol pair that occurs in the data, one line "<x><y> <pmi>" is printed
    to stdout, in sorted order of the x symbol and then the y symbol.
    Returns 0 when x and y differ in length, otherwise None.
    '''
    x = [str(i) for i in x]
    y = [str(i) for i in y]

    if len(x) != len(y):
        return 0
    total = len(x)

    px = {sym: cnt*1.0/total for sym, cnt in Counter(x).items()}
    py = {sym: cnt*1.0/total for sym, cnt in Counter(y).items()}
    # Joint symbols are (x, y) tuples: concatenating the two strings would
    # merge distinct pairs such as ('1', '11') and ('11', '1').
    pxy = {pair: cnt*1.0/total for pair, cnt in Counter(zip(x,y)).items()}

    # Sorted iteration keeps the output order reproducible between runs.
    for i in sorted(px):
        for j in sorted(py):
            if (i, j) not in pxy:
                continue
            tmp1 = i + j
            tmp2 = np.log2(pxy[(i, j)]/(px[i] * py[j]))
            print(tmp1,tmp2)
126
+
127
def Mutual_expected():
    '''
    Reference values for two independent, uniformly distributed 4-letter
    symbols: each marginal probability is 0.25 and each of the 16 joint
    probabilities is 0.0625.
    Returns (H(x), H(y), H(x,y), MI) with MI = H(x) + H(y) - H(x,y).
    Log2 based, unit is bit
    '''
    def entropy_bits(probabilities):
        total = 0
        for p in probabilities:
            total = total + p * np.log2(p)
        return -total

    marginal = [0.25] * 4
    joint = [0.0625] * 16

    h_x = entropy_bits(marginal)
    h_y = entropy_bits(marginal)
    h_xy = entropy_bits(joint)
    return (h_x, h_y, h_xy, h_x + h_y - h_xy)
142
+
143
+
144
+ if __name__=='__main__':
145
+ x=['G', 'T', 'C', 'A', 'T', 'T', 'A', 'C', 'T', 'A']
146
+ y=['A', 'C', 'A', 'C', 'A', 'A', 'G', 'A', 'G', 'A']
147
+ z=['A', 'C', 'A', 'T', 'A', 'A', 'T', 'A', 'G', 'T']
148
+
149
+
150
+ X_0 = (0, 0, 1, 1, 0, 1, 1, 2, 2, 2)
151
+ X_1 = (3, 4, 5, 5, 3, 2, 2, 6, 6, 1)
152
+ X_2 = [7, 2, 1, 3, 2, 8, 9, 1, 2, 0]
153
+
154
+ C8 = ['T', 'A', 'A', 'A', 'T', 'T', 'G', 'C', 'T', 'A', 'A', 'A', 'A', 'T', 'T', 'T', 'A', 'T', 'T', 'A', 'G', 'A', 'C', 'A', 'G', 'G', 'G', 'G', 'T', 'A', 'T', 'T', 'A', 'G', 'C', 'T', 'T', 'T', 'A', 'T', 'G', 'G', 'A', 'C', 'T', 'C', 'C', 'C', 'C', 'T', 'G', 'A', 'T', 'A', 'T', 'A', 'A', 'T', 'C', 'C', 'T', 'G', 'A', 'C', 'G', 'T', 'T', 'C', 'T', 'T', 'A', 'T', 'A', 'A', 'A', 'A', 'T', 'G', 'A', 'A', 'T', 'G', 'A', 'C', 'T', 'A', 'C', 'A', 'A', 'C', 'C', 'A', 'A', 'A', 'C', 'C', 'T', 'T', 'T', 'A', 'A', 'A', 'G', 'A', 'C', 'T', 'A', 'C', 'T', 'T', 'T', 'T', 'T', 'T', 'G', 'A', 'A', 'T', 'A', 'G', 'G', 'A', 'A', 'A', 'T', 'A', 'G', 'G', 'C', 'C', 'T', 'C', 'G', 'A', 'T', 'T', 'G', 'T', 'T', 'C', 'G', 'G', 'C', 'T', 'T', 'G', 'A', 'G', 'T', 'A', 'T', 'T', 'T', 'A', 'T', 'A', 'C', 'C', 'C', 'C', 'T', 'T', 'T', 'A', 'C', 'C', 'A', 'A', 'C', 'T', 'C', 'A', 'C', 'T', 'T', 'T', 'T', 'A', 'G', 'T', 'A', 'G', 'T', 'C', 'A', 'T', 'C', 'A', 'C', 'C', 'C', 'T', 'T']
155
+ C9 = ['T', 'T', 'G', 'A', 'C', 'T', 'A', 'C', 'G', 'C', 'G', 'G', 'G', 'G', 'G', 'T', 'C', 'G', 'T', 'A', 'G', 'G', 'T', 'G', 'C', 'T', 'T', 'C', 'G', 'T', 'G', 'G', 'C', 'T', 'C', 'A', 'T', 'T', 'C', 'A', 'T', 'A', 'A', 'C', 'C', 'G', 'T', 'G', 'C', 'C', 'T', 'C', 'G', 'C', 'T', 'T', 'C', 'C', 'T', 'A', 'T', 'C', 'G', 'A', 'C', 'T', 'C', 'T', 'T', 'C', 'A', 'T', 'G', 'A', 'A', 'G', 'T', 'C', 'T', 'T', 'T', 'C', 'A', 'T', 'T', 'T', 'T', 'T', 'G', 'A', 'A', 'C', 'T', 'G', 'C', 'C', 'T', 'C', 'C', 'A', 'G', 'G', 'G', 'G', 'G', 'A', 'G', 'A', 'G', 'T', 'T', 'C', 'T', 'A', 'C', 'G', 'G', 'T', 'G', 'C', 'C', 'G', 'A', 'A', 'G', 'A', 'G', 'T', 'A', 'C', 'A', 'C', 'C', 'G', 'G', 'C', 'A', 'C', 'G', 'C', 'A', 'A', 'A', 'T', 'G', 'A', 'T', 'T', 'A', 'A', 'G', 'A', 'T', 'G', 'A', 'A', 'A', 'A', 'C', 'A', 'G', 'C', 'C', 'A', 'C', 'C', 'G', 'A', 'C', 'C', 'T', 'T', 'T', 'C', 'C', 'T', 'T', 'A', 'A', 'C', 'T', 'A', 'G', 'A', 'G', 'C', 'T', 'A', 'T', 'C', 'A', 'T', 'G']
156
+
157
+ #AR motif pos-5 and pos-11
158
+ C5 = ['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'G', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'G', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'T', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'T', 'C', 'C', 'C', 'C', 'T', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C']
159
+ C11 = ['G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'C', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'A', 'T', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'A', 'G', 'G', 'G', 'G', 'G', 'A', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'A', 'G', 'G', 'G', 'G', 'G']
160
+ #PMI(C5,C11)
161
+
162
+
163
+ #https://en.wikipedia.org/wiki/Pointwise_mutual_information example
164
+ x=[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1]
165
+ y=[0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1]
166
+ z=[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0]
167
+ PMI(x,z)
168
+ #print (Mutual_information1(x,z))
169
+ print (Mutual_information2(x,z))
170
+ print (mutual_info_score(x,z))
171
+ print (mutual_info_score(y,z))
172
+
173
+ #C1 = ['A','C','G','T']
174
+ #C2 = ['A','C','G','T']
175
+ #a=Mutual_information2(x,y)
176
+ #b=Mutual_information2(x,z)
177
+ #print(a)
178
+ #print(b)
179
+
180
+
181
+ #b=Mutual_information2(C1,C2)
182
+ #print(b)
183
+
184
+ '''
185
+ from sklearn.feature_selection import mutual_info_classif
186
+ from sklearn.metrics import mutual_info_score
187
+ a = np.array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1])
188
+ b = np.array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1])
189
+
190
+ print(stats.entropy([0.5,0.5])) # entropy of 0.69, expressed in nats
191
+ print(mutual_info_classif(a.reshape(-1,1), b, discrete_features = True)) # mutual information of 0.69, expressed in nats
192
+ print(mutual_info_score(a,b)) # information gain of 0.69, expressed in nats
193
+ '''
cpgmodule/__init__.py ADDED
File without changes
cpgmodule/_version.py ADDED
@@ -0,0 +1 @@
1
# Package version string.
__version__ = "2.0.5"