cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
File without changes
|
cpgmodule/extend_bed.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from bx.intervals import *
|
|
3
|
+
import numpy as np
|
|
4
|
+
from cpgmodule import ireader
|
|
5
|
+
|
|
6
|
+
def getBasalDomains(bedfile, printit = False):
    '''
    Define gene's basal regulatory domain.

    Every valid BED record is stored as an Interval(start, end) in a
    per-chromosome IntervalTree, with the gene symbol as the interval value
    and the gene strand as the interval strand.

    bedfile: one gene one TSS (could use the canonical (longest) isoform, or
        merge all isoforms into a super transcript. Read through
        ireader.reader; at least 6 whitespace-separated columns are needed
        (chrom, start, end, symbol, <unused>, strand).
    printit: if True, each accepted record is also written to stdout as a
        6-column BED line (score fixed to '0').

    Returns a dict mapping chromosome name -> IntervalTree.
    Comment ('#'), 'track' and 'browser' lines are ignored; malformed records
    are reported on stderr and skipped.
    '''
    basal_ranges = {}

    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            gene_strand = f[5]
        except (IndexError, ValueError):
            # too few columns, or non-integer coordinates
            print ("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            continue

        if start > end:
            print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue
        if gene_strand not in ['+','-']:
            print ("Invalid strand. Skip: " + l, file=sys.stderr)
            continue

        if chrom not in basal_ranges:
            basal_ranges[chrom] = IntervalTree()

        basal_ranges[chrom].insert_interval( Interval(start, end, strand = gene_strand, value = symbol))

        if printit:
            # The original referenced undefined names (basal_st, basal_end,
            # strand) here; print the values that were actually inserted.
            print('\t'.join([str(i) for i in (chrom, start, end, symbol, '0', gene_strand)]), file = sys.stdout)
    return basal_ranges
|
|
45
|
+
|
|
46
|
+
def geteExtendedDomains(basal_ranges, bedfile, up_ext=2000, down_ext=2000, min_gene = 200, printit = False):
    '''
    Define gene's extended regulatory domain.

    For every gene the body is extended upstream and downstream (strand
    aware). Each extension is trimmed so that it does not run into any
    interval stored in basal_ranges, and is never allowed to cross the
    gene's own start/end.

    basal_ranges:
        dict of chromosome -> IntervalTree, as returned by getBasalDomains.
    bedfile: one gene one TSS (could use the canonical (longest) isoform, or merge all
        isoforms into a super transcript.
    up_ext:
        Size of extension to upstream. Should be multiples of 100
    down_ext:
        Size of extension to downstream. Should be multiples of 100
    min_gene:
        minimum gene size (from TSS to TES). Should be multiples of 100.
        The same threshold is applied to both trimmed extensions: a gene is
        dropped when either one ends up shorter than min_gene.
    printit:
        if True, each reported gene is also written to stdout as a
        12-column BED line covering the whole extended region.

    Returns a list of tuples
        ([chrom, ext_start, start, symbol],
         [chrom, start, end, symbol],
         [chrom, end, ext_end, symbol],
         strand)
    i.e. left flank, gene body, right flank (in genome coordinates) and strand.
    '''
    return_ranges = []

    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            strand = f[5]
        except (IndexError, ValueError):
            print ("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            # The original fell through here and kept working with the values
            # parsed from the previous line.
            continue

        if start < 0:
            continue
        if start > end:
            print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue
        if (end - start ) < min_gene:
            continue
        if strand not in ['+', '-']:
            print ("Unknown strand. Skip: " + l, file=sys.stderr)
            continue

        # "upstream" is to the left on '+' and to the right on '-'
        if strand == '+':
            extension_st = start - up_ext
            extension_end = end + down_ext
        else:
            extension_st = start - down_ext
            extension_end = end + up_ext
        if extension_st < 0:
            extension_st = 0

        # A chromosome without any basal domain has nothing to trim against.
        chrom_tree = basal_ranges[chrom] if chrom in basal_ranges else None

        #try to update extension_st
        overlaps = chrom_tree.find(extension_st, start) if chrom_tree is not None else []
        for o in overlaps:
            if o.end > extension_st:
                extension_st = o.end
        if extension_st > start:
            extension_st = start

        if (start - extension_st) < min_gene:
            continue

        #try to update extension_end
        overlaps = chrom_tree.find(end, extension_end) if chrom_tree is not None else []
        for o in overlaps:
            if o.start < extension_end:
                extension_end = o.start
        if extension_end < end:
            extension_end = end

        if (extension_end - end) < min_gene:
            continue

        return_ranges.append(([chrom, extension_st, start,symbol], [chrom, start, end,symbol], [chrom, end, extension_end,symbol], strand))

        if printit:
            print('\t'.join([str(i) for i in (chrom, extension_st, extension_end, symbol, '0', strand, start, end, '255,0,0', 1, extension_end - extension_st, 0)]), file = sys.stdout)

    return return_ranges
|
|
129
|
+
|
|
130
|
+
if __name__=='__main__':
    # Command-line use: print the left flank, body and right flank of every
    # gene found in the BED file given as first argument.
    basal = getBasalDomains(sys.argv[1], printit = False)
    domains = geteExtendedDomains(basal_ranges = basal, bedfile = sys.argv[1], printit=False)

    # Labels for (left flank, body, right flank); the flank roles swap on '-'.
    region_labels = {
        '+': ('(UIR)', '(Body)', '(DIR)'),
        '-': ('(DIR)', '(Body)', '(UIR)'),
    }

    for left, body, right, strand in domains:
        if strand not in region_labels:
            continue
        for region, label in zip((left, body, right), region_labels[strand]):
            width = int(region[2]) - int(region[1])
            print ('\t'.join(map(str, region)) + label + '\t' + str(width) + '\t' + strand)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
|
cpgmodule/imotif.py
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
'''DNA/protein motif visualization and scan'''
|
|
3
|
+
|
|
4
|
+
#import built-in modules
|
|
5
|
+
import sys,os
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from scipy import stats
|
|
8
|
+
import itertools
|
|
9
|
+
|
|
10
|
+
#import third-party modules
|
|
11
|
+
import numpy as np
|
|
12
|
+
#changes to the paths
|
|
13
|
+
|
|
14
|
+
#changing history to this module
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
__author__ = "Liguo Wang"
|
|
18
|
+
__copyright__ = ""
|
|
19
|
+
__credits__ = []
|
|
20
|
+
__license__ = "GPLv2"
|
|
21
|
+
__version__ = "1.0.0"
|
|
22
|
+
__maintainer__ = "Liguo Wang"
|
|
23
|
+
__email__ = "Wang.Liguo@mayo.edu"
|
|
24
|
+
__status__ = "Development" #Prototype or Production
|
|
25
|
+
|
|
26
|
+
class PSSM (object):
    '''
    Description: provides functions to manipulate Position-Specific Scoring Matrix (PSSM)
    such as PFM, PPM, PWM matrix.

    Attributes set by the constructor:
        seq_type       'DNA' or 'PROTEIN'
        motif_name     name used in Jaspar / raw PSSM / MEME headers
        seq_count      number of accepted sequences (float)
        motif_length   length shared by all accepted sequences (int)
        data           position -> {symbol: count}
        raw_data       position -> list of symbols seen in that column
    '''

    def __init__(self, sites, dna = True, name = None, rv = False):
        '''
        Initialize object.
        Must be DNA or protein (dna = False) sequences.
        dna = True: DNA sequence
        dna = False: protein sequence
        rv (reverse complementary): only applied to DNA sequence.

        Each row contains a single sequence and each sequence has the same length.
        Lowercase in sequence is automatically converted into uppercase.
        Lines starting with '#' or '>' and blank lines are ignored; sequences
        containing a symbol outside the alphabet are reported on stderr and
        skipped. The process exits with status 1 unless all accepted
        sequences share exactly one length.

        Input example (test.sites):
        GAGGTAAAC
        TCCGTAAGT
        CAGGTTGGA
        ACAGTCAGT
        TAGGTCATT
        TAGGTACTG
        ATGGTAACT
        CAGGTATAC
        TGTGTGAGT
        AAGGTAAGT
        '''
        self.seq_type = 'DNA' if dna else 'PROTEIN'
        self.motif_name = 'Unknown' if name is None else name
        self.seq_count = 0.0
        self.data = defaultdict(dict)		#base_position (column of .sites): base_type : base_count
        self.raw_data = defaultdict(list)	#base_position: list of ACGT in each column
        self.seq_lengths = set()
        self.DNA_bases = ['A','C','G','T']
        self.protein_bases = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']

        alphabet = self._alphabet()
        # str.maketrans replaces the Python 2 only string.maketrans, which
        # made rv=True fail unconditionally. As documented, reverse
        # complement is only meaningful for DNA.
        complement = str.maketrans('ACGT', 'TGCA') if (rv and dna) else None

        with open(sites, 'r') as handle:
            for l in handle:
                if l.startswith('#'):continue
                if l.startswith('>'):continue
                l = l.strip(' \r\n').upper()
                if not l:
                    # a blank (e.g. trailing) line is not a zero-length motif
                    continue

                # check if all bases are valid symbols
                invalid = next((b for b in l if b not in alphabet), None)
                if invalid is not None:
                    print("Uncognize DNA base: \"%s\" in %s. Skipped." % (invalid,l), file=sys.stderr)
                    continue

                self.seq_lengths.add(len(l))
                self.seq_count += 1

                if complement is not None:
                    l = l.translate(complement)[::-1]

                # read sites into dict(dict)
                for i,v in enumerate(l):
                    self.raw_data[i].append(v)
                    self.data[i][v] = self.data[i].get(v, 0.0) + 1.0

        # check if all sequences have the same length
        if len(self.seq_lengths) != 1:
            print("Sequence lengths are not equal!", file=sys.stderr)
            sys.exit(1)
        else:
            self.motif_length =self.seq_lengths.pop()

    def motif_length(self):
        '''
        Return the motif length (nt)

        NOTE: __init__ stores an integer instance attribute of the same name,
        so on an instance "obj.motif_length" is that integer and this method
        is shadowed. It is kept only so the class-level name still exists.
        '''
        return self.motif_length

    def _alphabet(self):
        '''Return the list of symbols (matrix row order) for this sequence type.'''
        if self.seq_type == 'DNA':
            return self.DNA_bases
        return self.protein_bases

    def _count_matrix(self):
        '''Return counts as a float array with one row per position and one column per symbol.'''
        bases = self._alphabet()
        rows = [[self.data[i].get(b, 0.0) for b in bases] for i in range(self.motif_length)]
        return np.array(rows, dtype=float)

    def _probability_matrix(self, pseudocount):
        '''
        Return probabilities as an array with one row per symbol and one
        column per position. The pseudocount is spread evenly over the
        alphabet (pseudocount/4 for DNA, pseudocount/20 for protein) and each
        column is normalized to sum to 1.
        '''
        counts = np.transpose(self._count_matrix())
        counts = counts + pseudocount/float(len(self._alphabet()))
        return counts/counts.sum(axis=0)

    def toPFM(self,FOUT=sys.stdout):
        '''
        Convert motif sites data into position frequency matrix (PFM)

        Writes a tab-separated table to FOUT: a header row of 1-based
        positions, then one row of counts per symbol.
        '''
        bases = self._alphabet()
        pfm = np.transpose(self._count_matrix())

        print('Base\t' + '\t'.join([str(i+1) for i in range(self.motif_length)]), file=FOUT)
        for i,name in enumerate(bases):
            print(name + '\t' + '\t'.join([str(j) for j in pfm[i]]), file=FOUT)

    def toJaspar(self,FOUT=sys.stdout):
        '''
        Convert motif sites data into Jaspar format (.pfm)

        Jaspar format example:
        > Mycn
        A [ 0 29 0 2 0 0 ]
        C [31 0 30 1 3 0 ]
        G [ 0 0 0 28 0 31]
        T [ 0 2 1 0 28 0 ]
        '''
        bases = self._alphabet()
        pfm = np.transpose(self._count_matrix())

        print('> %s' % self.motif_name, file=FOUT)
        for i,b in enumerate(bases):
            print(b + ' [ ' + ' '.join([str(j) for j in pfm[i]]) + ']', file=FOUT)

    def toRawPSSM(self, FOUT=sys.stdout):
        '''
        Convert motif sites data into raw PSSM format (.pfm)
        One output row per motif position, one column per symbol.

        raw PSSM format example:
        >Mync
        0 31 0 0
        29 0 0 2
        0 30 0 1
        2 1 28 0
        0 3 0 28
        0 0 31 0
        '''
        pfm = self._count_matrix()

        print('>%s' % self.motif_name, file=FOUT)
        for i in range(self.motif_length):
            print(' '.join([str(j) for j in pfm[i]]), file=FOUT)

    def toMEME(self,pseudocount=0.8, FOUT=sys.stdout):
        '''
        Convert motif sites data into meme's position-specific probability matrix
        One output row per motif position, one column per symbol; the values
        written are pseudocount-smoothed probabilities.

        MEME format example:
        ------------------------
        Motif 2 position-specific probability matrix
        ------------------------
        letter-probability matrix: alength= 4 w= 6 nsites= 31
        0 31 0 0
        29 0 0 2
        0 30 0 1
        2 1 28 0
        0 3 0 28
        0 0 31 0
        '''
        bases = self._alphabet()
        ppm = np.transpose(self._probability_matrix(pseudocount))

        print('-'*40, file=FOUT)
        print(self.motif_name + ' position-specific probability matrix', file=FOUT)
        print('-'*40, file=FOUT)
        print('letter-probability matrix: alength= %d w= %d nsites= %d' % (len(bases), self.motif_length, self.seq_count), file=FOUT)
        for i in ppm:
            print(' ' + ' '.join([str(j) for j in i]), file=FOUT)

    def toPPM(self,pseudocount=0.8, FOUT=sys.stdout):
        '''
        Convert motif sites data into position probability matrix (PPM)
        Default pseudocount of 0.8 is determined from this paper:
        http://nar.oxfordjournals.org/content/37/3/939.full

        Writes a tab-separated table to FOUT: a header row of 1-based
        positions, then one row of probabilities per symbol.
        '''
        bases = self._alphabet()
        ppm = self._probability_matrix(pseudocount)

        print('Base\t' + '\t'.join([str(i+1) for i in range(self.motif_length)]), file=FOUT)
        for i,name in enumerate(bases):
            print(name + '\t' + '\t'.join([str(j) for j in ppm[i]]), file=FOUT)

    def toPWM(self,pseudocount=0.8, bg=None, FOUT=sys.stdout):
        '''
        Convert motif sites data into position weight matrix (PWM)
        PWM is a matrix of log likelihood between sites and background
        (natural logarithm of probability / background frequency).

        Default pseudocount of 0.8 is determined from this paper:
        http://nar.oxfordjournals.org/content/37/3/939.full

        if bg is "None", universal background will be used:
            * DNA:
                A = C = G = T = 0.25 (i.e. 1/4)
            * Protein
                A = R = N = ... = V = 0.05 (i.e. 1/20)

        Otherwise, bg is a dictionary with base as key and the corresponding
        base frequency as value. eg

        bg = {'A':0.23, 'C':0.26,'G':0.29,'T':0.22}

        A KeyError is raised if bg lacks an entry for a symbol of the alphabet.
        '''
        bases = self._alphabet()

        #determine background frequency
        if bg is None:
            background = dict((b, 1.0/len(bases)) for b in bases)
        else:
            # the original never consulted a caller-supplied background
            background = bg

        ppm = self._probability_matrix(pseudocount)

        print('Base\t' + '\t'.join([str(i+1) for i in range(self.motif_length)]), file=FOUT)
        for i,name in enumerate(bases):
            print(name + '\t' + '\t'.join([str(np.log(j/background[name])) for j in ppm[i]]), file=FOUT)
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
|
cpgmodule/ireader.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
read compressed (.gz .bz) files
|
|
3
|
+
"""
|
|
4
|
+
#!/usr/bin/env python
|
|
5
|
+
# encoding: utf-8
|
|
6
|
+
|
|
7
|
+
import bz2
|
|
8
|
+
import gzip
|
|
9
|
+
from urllib.request import urlopen
|
|
10
|
+
|
|
11
|
+
def nopen(f, mode="rb"):
    """
    Open *f* according to what it looks like and return a file-like object.

    f : anything that is not a str is returned unchanged (assumed to be an
        already opened handle or an iterable of lines). A str is dispatched
        on, in this order:
          "|cmd"                     run cmd through the shell
          "-"                        sys.stdin (read modes) / sys.stdout (write modes)
          *.gz, *.Z, *.z             gzip.open
          *.bz, *.bz2, *.bzip2       bz2.BZ2File
          http://, https://, ftp://  urlopen
          anything else              built-in open
    mode : only its first character selects read/write for "|cmd" and "-";
        it is passed through unchanged to gzip/bz2/open.

    For "|cmd" the return value is the process' stdout when reading and the
    Popen object itself otherwise.
    """
    # Imported locally: the original used these names without importing them,
    # so the "|cmd" and "-" branches raised NameError.
    import sys
    from subprocess import PIPE, Popen

    if not isinstance(f, str):
        return f
    if f.startswith("|"):
        # NOTE(security): the command string is executed with shell=True;
        # never pass untrusted input as a "|..." file name.
        p = Popen(f[1:], stdout=PIPE, stdin=PIPE, shell=True)
        if mode[0] == "r":
            return p.stdout
        return p
    if f == "-":
        return {"r": sys.stdin, "w": sys.stdout}[mode[0]]
    if f.endswith((".gz", ".Z", ".z")):
        return gzip.open(f, mode)
    if f.endswith((".bz", ".bz2", ".bzip2")):
        return bz2.BZ2File(f, mode)
    if f.startswith(("http://", "https://","ftp://")):
        return urlopen(f)
    return open(f, mode)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def reader(fname):
    """
    Yield the lines of *fname* as str, stripped of surrounding whitespace
    and with any remaining carriage returns removed.

    fname is handed to nopen, so it may be a plain/compressed file path, a
    URL, "-" or an already opened handle. Byte lines are decoded as UTF-8;
    lines that already are str (e.g. from sys.stdin) are used as they are.
    """
    handle = nopen(fname)
    # Close only what was opened here: not a caller-supplied handle and not
    # the process-wide stdin returned for "-".
    opened_here = isinstance(fname, str) and fname != "-"
    try:
        for l in handle:
            if isinstance(l, bytes):
                l = l.decode('utf8')
            yield l.strip().replace("\r", "")
    finally:
        if opened_here:
            handle.close()
|
|
28
|
+
|
cpgmodule/methylClock.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Created on Fri Nov 25 10:55:14 2022
|
|
5
|
+
|
|
6
|
+
@author: Liguo Wang
|
|
7
|
+
"""
|
|
8
|
+
from cpgmodule import ireader
|
|
9
|
+
#import sys,os
|
|
10
|
+
|
|
11
|
+
class MethylSig():
    """
    Pack DNA methylation signature file into object.

    The signature file is read through ireader.reader. Lines starting with
    '#' and blank lines are ignored. A line starting with 'Intercept' sets
    the intercept from its second column. On every other line the first
    column is a CpG ID and the second column its coefficient.

    Attributes: name, info, tissues, unit, ref, pubmed, method (copied from
    the arguments), coef (CpG ID -> float), cpgs (CpG IDs in file order),
    ncpg (number of CpG lines read) and Intercept (float).

    >>> from cpgmodule import methylClock
    >>> a = methylClock.MethylSig(signature_file = 'coefBlup.tsv', signature_name = 'BLUP', signature_info="")
    >>> a.name
    'BLUP'
    >>> a.Intercept
    91.15396
    >>> a.ncpg
    319607
    """

    def __init__(self, signature_file, signature_name, tissues = None, unit = '', signature_info = '', reference = '', pub_link = '', method = ''):
        self.name = signature_name
        self.info = signature_info
        # a fresh list per instance instead of one shared default list
        self.tissues = [] if tissues is None else tissues
        self.unit = unit
        self.coef = {}
        self.cpgs = []
        self.ncpg = 0
        self.Intercept = 0.0
        self.ref = reference
        self.pubmed = pub_link
        self.method = method
        for l in ireader.reader(signature_file):
            if l.startswith('#'):
                continue
            f = l.split()
            if not f:
                # blank line: nothing to record (f[0] used to raise IndexError)
                continue
            if l.startswith('Intercept'):
                try:
                    self.Intercept = float(f[1])
                except (IndexError, ValueError):
                    self.Intercept = 0.0
            else:
                # The ID is recorded and counted even when its coefficient
                # cannot be parsed; only the coef entry is skipped then.
                self.cpgs.append(f[0])
                self.ncpg += 1
                try:
                    self.coef[f[0]] = float(f[1])
                except (IndexError, ValueError):
                    continue
|
cpgmodule/padjust.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
# Copyright 2017 Francisco Pina Martins <f.pinamartins@gmail.com>
|
|
4
|
+
# This file is part of structure_threader.
|
|
5
|
+
# structure_threader is free software: you can redistribute it and/or modify
|
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
|
8
|
+
# (at your option) any later version.
|
|
9
|
+
|
|
10
|
+
# structure_threader is distributed in the hope that it will be useful,
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
+
# GNU General Public License for more details.
|
|
14
|
+
|
|
15
|
+
# You should have received a copy of the GNU General Public License
|
|
16
|
+
# along with structure_threader. If not, see <http://www.gnu.org/licenses/>.
|
|
17
|
+
|
|
18
|
+
# Taken from https://stackoverflow.com/a/21739593/3091595, ported to python 3
|
|
19
|
+
# and improved readability.
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
def multiple_testing_correction(pvalues, correction_type="FDR"):
    """
    Adjust p-values for multiple testing, consistent with R's p.adjust.

    pvalues : one-dimensional sequence of p-values.
    correction_type :
        "Bonferroni"       p * n
        "Bonferroni-Holm"  step-down Holm procedure
        "FDR"              Benjamini-Hochberg
        Every result is capped at 1, and the Holm and FDR results are
        monotone in the rank of the p-value, as in R.

    Returns a numpy array of adjusted p-values in the order of the input.
    Raises ValueError for an unknown correction_type (the original returned
    an uninitialised array in that case).

    Example input:
    correct_pvalues_for_multiple_testing([0.0, 0.01, 0.029, 0.03, 0.031, 0.05,
                                          0.069, 0.07, 0.071, 0.09, 0.1])
    """
    if correction_type not in ("Bonferroni", "Bonferroni-Holm", "FDR"):
        raise ValueError("Unknown correction_type: %r" % (correction_type,))

    pvalues = np.array(pvalues, dtype=float)
    sample_size = pvalues.shape[0]
    if sample_size == 0:
        return np.empty(0)

    if correction_type == "Bonferroni":
        # Bonferroni correction
        return np.minimum(1.0, sample_size * pvalues)

    qvalues = np.empty(sample_size)
    ascending = np.argsort(pvalues, kind="stable")
    if correction_type == "Bonferroni-Holm":
        # Bonferroni-Holm correction: multiply the k-th smallest p-value
        # (k = 0, 1, ...) by (n - k), then take the running maximum so the
        # adjusted values never decrease as the raw p-value grows.
        scaled = (sample_size - np.arange(sample_size)) * pvalues[ascending]
        qvalues[ascending] = np.minimum(1.0, np.maximum.accumulate(scaled))
    else:
        # Benjamini-Hochberg, AKA - FDR test: walk from the largest p-value
        # down, scale by n / rank and keep the running minimum.
        descending = ascending[::-1]
        ranks = sample_size - np.arange(sample_size)
        scaled = (sample_size / ranks) * pvalues[descending]
        qvalues[descending] = np.minimum(1.0, np.minimum.accumulate(scaled))
    return qvalues
|