cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
File without changes
@@ -0,0 +1,147 @@
1
+ import sys
2
+ from bx.intervals import *
3
+ import numpy as np
4
+ from cpgmodule import ireader
5
+
6
def getBasalDomains(bedfile, printit = False):
    '''
    Define gene's basal regulatory domain.

    Each usable BED record is inserted, as-is (start..end), into an
    IntervalTree keyed by chromosome.

    bedfile: one gene one TSS (could use the canonical (longest) isoform, or
        merge all isoforms into a super transcript). Needs at least 6 columns:
        chrom, start, end, name, score, strand.
    printit: if True, echo every accepted record to stdout as a 6-column BED
        line (score is written as '0').

    Returns a dict mapping chromosome name -> IntervalTree of Interval objects
    whose `value` is the gene symbol and whose `strand` is the gene strand.

    Lines starting with '#', 'track' or 'browser' are ignored. Records with
    fewer than 6 columns, non-integer coordinates, start > end, or a strand
    other than '+'/'-' are reported on stderr and skipped.
    '''
    basal_ranges = {}

    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        # Only the field extraction can legitimately fail here, so keep the
        # try body minimal and catch just the two expected error types.
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            gene_strand = f[5]
        except (IndexError, ValueError):
            print ("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            continue

        if start > end:
            print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue
        if gene_strand not in ['+','-']:
            print ("Invalid strand. Skip: " + l, file=sys.stderr)
            continue

        if chrom not in basal_ranges:
            basal_ranges[chrom] = IntervalTree()

        basal_ranges[chrom].insert_interval( Interval(start, end, strand = gene_strand, value = symbol))

        if printit:
            # The basal domain is the record's own interval, so print the
            # parsed start/end/strand (the previous code referenced names
            # that were never defined and raised NameError).
            print('\t'.join([str(i) for i in (chrom, start, end, symbol, '0', gene_strand)]), file = sys.stdout)
    return basal_ranges
45
+
46
def geteExtendedDomains(basal_ranges, bedfile, up_ext=2000, down_ext=2000, min_gene = 200, printit = False):
    '''
    Define gene's extended regulatory domain.

    basal_ranges:
        dict of chromosome -> IntervalTree, as returned by getBasalDomains().
    bedfile:
        one gene one TSS (could use the canonical (longest) isoform, or merge
        all isoforms into a super transcript).
    up_ext:
        Size of extension to upstream. Should be multiples of 100
    down_ext:
        Size of extension to downstream. Should be multiples of 100
    min_gene:
        minimum gene size (from TSS to TES). Should be multiples of 100.
        The same threshold is also applied to the size of each flanking
        extension after trimming.
    printit:
        if True, echo each accepted gene to stdout as a 12-column BED line.

    Returns a list of tuples
        ([chrom, extension_st, start, symbol],
         [chrom, start, end, symbol],
         [chrom, end, extension_end, symbol],
         strand)
    i.e. the left flank, the gene body and the right flank in genomic order.

    Lines starting with '#', 'track' or 'browser' are ignored. Malformed
    records are reported on stderr and skipped.
    '''
    return_ranges = []

    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            strand = f[5]
        except (IndexError, ValueError):
            print ("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            # Without this `continue` the loop body would go on with the
            # values left over from the previous record.
            continue

        if start < 0:
            continue
        if start > end:
            print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue
        if (end - start ) < min_gene:
            continue
        if strand not in ['+', '-']:
            print ("Unknown strand. Skip: " + l, file=sys.stderr)
            continue

        # "Upstream" is to the left on '+' and to the right on '-'.
        if strand == '+':
            extension_st = start - up_ext
            extension_end = end + down_ext
        else:
            extension_st = start - down_ext
            extension_end = end + up_ext
        if extension_st < 0:
            extension_st = 0

        # A chromosome that is absent from basal_ranges has no neighbours to
        # trim against; treat it as "no overlaps" instead of raising KeyError.
        tree = basal_ranges.get(chrom)

        #try to update extension_st
        overlaps = tree.find(extension_st, start) if tree is not None else []
        for o in overlaps:
            # push the left boundary right, past any overlapping interval
            if o.end > extension_st:
                extension_st = o.end
        if extension_st > start:
            extension_st = start

        if (start - extension_st) < min_gene:
            continue

        #try to update extension_end
        overlaps = tree.find(end, extension_end) if tree is not None else []
        for o in overlaps:
            # pull the right boundary left, before any overlapping interval
            if o.start < extension_end:
                extension_end = o.start
        if extension_end < end:
            extension_end = end

        if (extension_end - end) < min_gene:
            continue

        return_ranges.append(([chrom, extension_st, start,symbol], [chrom, start, end,symbol], [chrom, end, extension_end,symbol], strand))

        if printit:
            print('\t'.join([str(i) for i in (chrom, extension_st, extension_end, symbol, '0', strand, start, end, '255,0,0', 1, extension_end - extension_st, 0)]), file = sys.stdout)

    return return_ranges
129
+
130
if __name__=='__main__':
    # Command-line use: argv[1] is a BED file. The same file supplies both
    # the basal intervals and the genes to extend.
    basal = getBasalDomains(sys.argv[1], printit = False)
    domains = geteExtendedDomains(basal_ranges = basal, bedfile = sys.argv[1], printit=False)

    # Regions are stored in genomic (left-to-right) order, so the upstream
    # (UIR) / downstream (DIR) labels swap ends on the minus strand.
    region_labels = {
        '+': ('(UIR)', '(Body)', '(DIR)'),
        '-': ('(DIR)', '(Body)', '(UIR)'),
    }

    for left, body, right, gene_strand in domains:
        if gene_strand not in region_labels:
            continue
        for region, label in zip((left, body, right), region_labels[gene_strand]):
            size = int(region[2]) - int(region[1])
            fields = '\t'.join([str(item) for item in region])
            print (fields + label + '\t' + str(size) + '\t' + gene_strand)
144
+
145
+
146
+
147
+
cpgmodule/imotif.py ADDED
@@ -0,0 +1,348 @@
1
+ #!/usr/bin/env python
2
+ '''DNA/protein motif visualization and scan'''
3
+
4
+ #import built-in modules
5
+ import sys,os
6
+ from collections import defaultdict
7
+ from scipy import stats
8
+ import itertools
9
+
10
+ #import third-party modules
11
+ import numpy as np
12
+ #changes to the paths
13
+
14
+ #changing history to this module
15
+
16
+
17
+ __author__ = "Liguo Wang"
18
+ __copyright__ = ""
19
+ __credits__ = []
20
+ __license__ = "GPLv2"
21
+ __version__ = "1.0.0"
22
+ __maintainer__ = "Liguo Wang"
23
+ __email__ = "Wang.Liguo@mayo.edu"
24
+ __status__ = "Development" #Prototype or Production
25
+
26
class PSSM (object):
    '''
    Description: provides functions to manipulate Position-Specific Scoring Matrix (PSSM)
    such as PFM, PPM, PWM matrix.

    Attributes set by __init__:
      seq_type      'DNA' or 'PROTEIN'
      motif_name    name used in the Jaspar / raw PSSM / MEME headers
      seq_count     number of accepted sequences (float)
      data          position -> {symbol: count}
      raw_data      position -> list of symbols seen in that column
      motif_length  length shared by all accepted sequences
    '''

    def __init__(self, sites, dna = True, name = None, rv = False):
        '''
        Initialize object.
        Must be DNA or protein (dna = False) sequences.
        dna = True: DNA sequence
        dna = False: protein sequence
        rv (reverse complementary): only applied to DNA sequence.

        Each row contains a single sequence and each sequence has the same length.
        Lowercase in sequence is automatically converted into uppercase.
        Lines starting with '#' or '>' and empty lines are ignored.
        Sequences containing a symbol outside the alphabet are reported on
        stderr and skipped. If the remaining sequences do not all have the
        same length (or none remain), a message is printed and the process
        exits with status 1.

        Input example (test.sites):
        GAGGTAAAC
        TCCGTAAGT
        CAGGTTGGA
        ACAGTCAGT
        TAGGTCATT
        TAGGTACTG
        ATGGTAACT
        CAGGTATAC
        TGTGTGAGT
        AAGGTAAGT
        '''
        self.seq_type = 'DNA' if dna else 'PROTEIN'
        self.motif_name = 'Unknown' if name is None else name
        self.seq_count = 0.0
        self.data = defaultdict(dict)		#base_position (column of .sites): base_type : base_count
        self.raw_data = defaultdict(list)	#base_position: list of ACGT in each column
        self.seq_lengths = set()
        self.DNA_bases = ['A','C','G','T']
        self.protein_bases = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']

        alphabet = self._bases()
        # str.maketrans is the Python 3 spelling; reverse complement is only
        # meaningful for DNA, as documented above.
        revcomp = str.maketrans('ACGT', 'TGCA') if (rv and dna) else None

        with open(sites, 'r') as handle:
            for l in handle:
                if l.startswith(('#', '>')):
                    continue
                l = l.strip(' \r\n').upper()
                if not l:
                    # a blank line is not a zero-length sequence
                    continue

                # check if all bases are valid symbols
                bad = next((b for b in l if b not in alphabet), None)
                if bad is not None:
                    if self.seq_type == 'DNA':
                        print("Uncognize DNA base: \"%s\" in %s. Skipped." % (bad,l), file=sys.stderr)
                    else:
                        print("Unrecognized amino acid: \"%s\" in %s. Skipped." % (bad,l), file=sys.stderr)
                    continue

                self.seq_lengths.add(len(l))
                self.seq_count += 1

                if revcomp is not None:
                    l = l.translate(revcomp)[::-1]

                # read sites into dict(dict)
                for i,v in enumerate(l):
                    self.raw_data[i].append(v)
                    self.data[i][v] = self.data[i].get(v, 0.0) + 1.0

        # check if all sequences have the same length
        if len(self.seq_lengths) != 1:
            print("Sequence lengths are not equal!", file=sys.stderr)
            sys.exit(1)
        else:
            self._motif_length = self.seq_lengths.pop()

    @property
    def motif_length(self):
        '''
        Return the motif length (nt)

        Exposed as a property so that `obj.motif_length` keeps yielding the
        integer length, as it did when __init__ stored it in an attribute of
        this name.
        '''
        return self._motif_length

    @motif_length.setter
    def motif_length(self, value):
        self._motif_length = value

    def _bases(self):
        '''Return the alphabet (list of symbols) matching self.seq_type.'''
        if self.seq_type == 'DNA':
            return self.DNA_bases
        return self.protein_bases

    def _count_matrix(self):
        '''Return counts as an array with one row per position and one column per symbol.'''
        bases = self._bases()
        rows = []
        for i in range(self.motif_length):	# i is motif position starting from 0
            column = self.data[i]
            rows.append([column.get(b, 0.0) for b in bases])
        return np.array(rows)

    def _probability_matrix(self, pseudocount):
        '''Return per-position symbol probabilities (one row per symbol, one column per position).'''
        pfm = np.transpose(self._count_matrix())
        # NOTE(review): the pseudocount is split by 4 even for the 20-letter
        # protein alphabet; kept as-is to preserve existing output.
        pfm = pfm + pseudocount/4.0
        return pfm/pfm.sum(axis=0)

    def toPFM(self,FOUT=sys.stdout):
        '''
        Convert motif sites data into position frequency matrix (PFM)

        Writes a tab-separated table to FOUT: a header row of 1-based
        positions, then one row of counts per symbol.
        '''
        bases = self._bases()
        pfm = np.transpose(self._count_matrix())

        print('Base\t' + '\t'.join([str(i+1) for i in range(self.motif_length)]), file=FOUT)
        for i,name in enumerate(bases):
            print(name + '\t' + '\t'.join([str(j) for j in pfm[i]]), file=FOUT)

    def toJaspar(self,FOUT=sys.stdout):
        '''
        Convert motif sites data into Jaspar format (.pfm)

        Jaspar format example:
        > Mycn
        A [ 0 29 0 2 0 0 ]
        C [31 0 30 1 3 0 ]
        G [ 0 0 0 28 0 31]
        T [ 0 2 1 0 28 0 ]
        '''
        bases = self._bases()
        pfm = np.transpose(self._count_matrix())

        print('> %s' % self.motif_name, file=FOUT)
        for i,b in enumerate(bases):
            print(b + ' [ ' + ' '.join([str(j) for j in pfm[i]]) + ']', file=FOUT)

    def toRawPSSM(self, FOUT=sys.stdout):
        '''
        Convert motif sites data into raw PSSM format (.pfm)

        raw PSSM format example:
        >Mync
        0 31 0 0
        29 0 0 2
        0 30 0 1
        2 1 28 0
        0 3 0 28
        0 0 31 0
        '''
        pfm = self._count_matrix()	# one row per position; no transpose here

        print('>%s' % self.motif_name, file=FOUT)
        for i in range(self.motif_length):
            print(' '.join([str(j) for j in pfm[i]]), file=FOUT)

    def toMEME(self,pseudocount=0.8, FOUT=sys.stdout):
        '''
        Convert motif sites data into meme's position-specific probability matrix

        MEME format example:
        ------------------------
        Motif 2 position-specific probability matrix
        ------------------------
        letter-probability matrix: alength= 4 w= 6 nsites= 31
        0 31 0 0
        29 0 0 2
        0 30 0 1
        2 1 28 0
        0 3 0 28
        0 0 31 0
        '''
        bases = self._bases()
        # one row per position, as in the example above
        ppm = np.transpose(self._probability_matrix(pseudocount))

        print('-'*40, file=FOUT)
        print(self.motif_name + ' position-specific probability matrix', file=FOUT)
        print('-'*40, file=FOUT)
        print('letter-probability matrix: alength= %d w= %d nsites= %d' % (len(bases), self.motif_length, self.seq_count), file=FOUT)
        for i in ppm:
            print(' ' + ' '.join([str(j) for j in i]), file=FOUT)

    def toPPM(self,pseudocount=0.8, FOUT=sys.stdout):
        '''
        Convert motif sites data into position probability matrix (PPM)
        Default pseudocount of 0.8 is determined from this paper:
        http://nar.oxfordjournals.org/content/37/3/939.full
        '''
        bases = self._bases()
        ppm = self._probability_matrix(pseudocount)

        print('Base\t' + '\t'.join([str(i+1) for i in range(self.motif_length)]), file=FOUT)
        for i,name in enumerate(bases):
            print(name + '\t' + '\t'.join([str(j) for j in ppm[i]]), file=FOUT)

    def toPWM(self,pseudocount=0.8, bg=None, FOUT=sys.stdout):
        '''
        Convert motif sites data into position weight matrix (PWM)
        PWM is a matrix of log likelihood between sites and background
        (natural logarithm of probability / background frequency).

        Default pseudocount of 0.8 is determined from this paper:
        http://nar.oxfordjournals.org/content/37/3/939.full

        if bg is "None", universal background will be used:
            * DNA:
              A = C = G = T = 0.25 (i.e. 1/4)
            * Protein
              A = R = N = ... = V = 0.05 (i.e. 1/20)

        Otherwise, bg is a dictionary with base as key and the corresponding
        base frequency as value. eg

        bg = {'A':0.23, 'C':0.26,'G':0.29,'T':0.22}

        A symbol missing from bg raises KeyError.
        '''
        bases = self._bases()

        #determine background frequency
        if bg is None:
            background = {b: 1.0/len(bases) for b in bases}
        else:
            # previously a user-supplied background was silently dropped
            background = dict(bg)

        ppm = self._probability_matrix(pseudocount)

        print('Base\t' + '\t'.join([str(i+1) for i in range(self.motif_length)]), file=FOUT)
        for i,name in enumerate(bases):
            print(name + '\t' + '\t'.join([str(np.log(j/background[name])) for j in ppm[i]]), file=FOUT)
345
+
346
+
347
+
348
+
cpgmodule/ireader.py ADDED
@@ -0,0 +1,28 @@
1
+ """
2
+ read compressed (.gz .bz) files
3
+ """
4
+ #!/usr/bin/env python
5
+ # encoding: utf-8
6
+
7
+ import bz2
8
+ import gzip
9
+ from urllib.request import urlopen
10
+
11
def nopen(f, mode="rb"):
    """
    Open *f* according to what it looks like and return a file-like object.

    f:
        - not a str: returned unchanged (assumed to be an open handle)
        - "|command": run `command` through the shell; returns the process's
          stdout when mode starts with "r", otherwise the Popen object
        - "-": sys.stdin when mode starts with "r", sys.stdout for "w"
        - http://, https:// or ftp:// URL: opened with urlopen
        - name ending in .gz/.Z/.z: opened with gzip
        - name ending in .bz/.bz2/.bzip2: opened with bz2
        - anything else: builtin open(f, mode)
    mode:
        passed to the underlying opener; only its first character is used to
        choose between reading and writing for "|" and "-".
    """
    # Imported here because this module's header does not import these names.
    import sys
    from subprocess import PIPE, Popen

    if not isinstance(f, str):
        return f
    if f.startswith("|"):
        # NOTE(review): the command string is handed to the shell verbatim;
        # never pass untrusted input through this branch.
        p = Popen(f[1:], stdout=PIPE, stdin=PIPE, shell=True)
        if mode[0] == "r":
            return p.stdout
        return p
    if f == "-":
        return {"r": sys.stdin, "w": sys.stdout}[mode[0]]

    # Resolve URLs first so that a compressed remote file is decompressed
    # from the response stream instead of being treated as a local path.
    is_url = f.startswith(("http://", "https://", "ftp://"))
    source = urlopen(f) if is_url else f

    if f.endswith((".gz", ".Z", ".z")):
        return gzip.open(source, mode)
    if f.endswith((".bz", ".bz2", ".bzip2")):
        return bz2.BZ2File(source, mode)
    if is_url:
        return source
    return open(f, mode)
23
+
24
+
25
def reader(fname):
    """
    Yield the lines of *fname* as text.

    fname is opened with nopen(), so it may be a plain, gzip or bzip2 file
    name, a URL, "|command", "-" or an already-open handle. Each line is
    decoded as UTF-8 when the source yields bytes, stripped of leading and
    trailing whitespace, and cleared of any remaining carriage returns.
    """
    handle = nopen(fname)
    # Close only what was opened here: not a caller-supplied handle, and not
    # the process's standard streams.
    owns_handle = isinstance(fname, str) and fname != "-"
    try:
        for raw in handle:
            # Text-mode sources (e.g. sys.stdin) already yield str.
            text = raw.decode('utf8') if isinstance(raw, bytes) else raw
            yield text.strip().replace("\r", "")
    finally:
        if owns_handle:
            handle.close()
28
+
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Fri Nov 25 10:55:14 2022
5
+
6
+ @author: Liguo Wang
7
+ """
8
+ from cpgmodule import ireader
9
+ #import sys,os
10
+
11
class MethylSig():
    """
    Pack DNA methylation signature file into object.

    The signature file is read line by line: lines starting with '#' and
    blank lines are ignored, a line starting with 'Intercept' supplies the
    intercept, and every other line is taken as "<CpG id> <coefficient>"
    separated by whitespace.

    >>> from cpgmodule import methylClock
    >>> a = methylClock.MethylSig(signature_file = 'coefBlup.tsv', signature_name = 'BLUP', signature_info="")
    >>> a.name
    'BLUP'
    >>> a.Intercept
    91.15396
    >>> a.ncpg
    319607
    """

    def __init__(self, signature_file, signature_name, tissues = None, unit = '', signature_info = '', reference = '', pub_link = '', method = ''):
        """
        signature_file: path (or anything ireader.reader accepts) of the
            coefficient table.
        signature_name, signature_info, tissues, unit, reference, pub_link,
        method: descriptive metadata stored unchanged on the instance as
            name, info, tissues, unit, ref, pubmed and method. `tissues`
            defaults to a new empty list.
        """
        self.name = signature_name
        self.info = signature_info
        # A fresh list per instance; a literal [] default would be shared by
        # every object created without an explicit `tissues`.
        self.tissues = [] if tissues is None else tissues
        self.unit = unit
        self.coef = {}          # CpG id -> coefficient (only parsable values)
        self.cpgs = []          # CpG ids in file order
        self.ncpg = 0           # number of CpG lines seen
        self.Intercept = 0.0
        self.ref = reference
        self.pubmed = pub_link
        self.method = method
        for l in ireader.reader(signature_file):
            if l.startswith('#'):
                continue
            f = l.split()
            if not f:
                # blank line: nothing to record
                continue
            if l.startswith('Intercept'):
                try:
                    self.Intercept = float(f[1])
                except (IndexError, ValueError):
                    self.Intercept = 0.0
            else:
                # The CpG is counted even when its coefficient is missing or
                # not numeric; in that case it simply has no entry in coef.
                self.cpgs.append(f[0])
                self.ncpg += 1
                try:
                    self.coef[f[0]] = float(f[1])
                except (IndexError, ValueError):
                    continue
cpgmodule/padjust.py ADDED
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright 2017 Francisco Pina Martins <f.pinamartins@gmail.com>
4
+ # This file is part of structure_threader.
5
+ # structure_threader is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+
10
+ # structure_threader is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with structure_threader. If not, see <http://www.gnu.org/licenses/>.
17
+
18
+ # Taken from https://stackoverflow.com/a/21739593/3091595, ported to python 3
19
+ # and improved readability.
20
+ import numpy as np
21
+
22
def multiple_testing_correction(pvalues, correction_type="FDR"):
    """
    Adjust p-values for multiple testing, consistent with R's p.adjust.

    pvalues: sequence of p-values.
    correction_type: one of
        "Bonferroni"       p * n
        "Bonferroni-Holm"  step-down Holm: running maximum of (n - rank) * p
                           over p-values sorted ascending (rank from 0)
        "FDR"              Benjamini-Hochberg: running minimum, taken from
                           the largest p-value down, of (n / rank) * p
                           (rank from 1)
        Every method caps the adjusted value at 1.

    Returns a numpy array of adjusted values in the same order as the input.
    Raises ValueError for an unknown correction_type.

    Example:
    multiple_testing_correction([0.0, 0.01, 0.029, 0.03, 0.031, 0.05,
                                 0.069, 0.07, 0.071, 0.09, 0.1])
    """
    pvalues = np.asarray(pvalues, dtype=float)
    sample_size = pvalues.shape[0]

    if correction_type == "Bonferroni":
        return np.minimum(1.0, sample_size * pvalues)

    if correction_type not in ("Bonferroni-Holm", "FDR"):
        # Previously this fell through and returned uninitialised memory.
        raise ValueError("Unknown correction_type: %r" % (correction_type,))

    # Stable sort keeps tied p-values in input order.
    order = np.argsort(pvalues, kind="stable")
    ranked = pvalues[order]

    if correction_type == "Bonferroni-Holm":
        scaled = (sample_size - np.arange(sample_size)) * ranked
        # Adjusted values must not decrease as the raw p-value grows.
        adjusted = np.maximum.accumulate(scaled)
    else:
        scaled = (sample_size / np.arange(1, sample_size + 1)) * ranked
        # Walk from the largest p-value down, never letting a smaller
        # p-value end up with a larger adjusted value.
        adjusted = np.minimum.accumulate(scaled[::-1])[::-1]

    qvalues = np.empty(sample_size)
    qvalues[order] = np.minimum(1.0, adjusted)
    return qvalues
+ return qvalues