cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
cpgmodule/region2gene.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from bx.intervals import *
|
|
3
|
+
import numpy as np
|
|
4
|
+
from cpgmodule import ireader
|
|
5
|
+
|
|
6
|
+
def getBasalDomains(bedfile, up = 5000, down = 1000, printit = False):
    '''
    Define each gene's basal regulatory domain around its TSS.

    The TSS is taken as "start + 1" for '+' strand genes and as "end" for
    '-' strand genes. The basal domain spans "up" bases upstream and "down"
    bases downstream of the TSS (relative to the gene's strand), with the
    domain start clamped at 0.

    Parameters
    ----------
    bedfile : passed to ireader.reader(); each yielded line must have at
        least 6 whitespace-separated BED columns (chrom, start, end, name,
        score, strand). One gene one TSS (could use the canonical (longest)
        isoform, or merge all isoforms into a super transcript).
    up : size of extension to upstream of TSS.
    down : size of extension to downstream of TSS.
    printit : if True, write every basal domain to stdout as a 6-column,
        tab-separated line.

    Returns
    -------
    dict mapping chromosome name to an IntervalTree holding one Interval
    (with "strand" and "value" = gene symbol) per gene.

    Lines starting with '#', 'track' or 'browser' are ignored. Malformed
    lines, lines with start > end, and lines whose strand is neither '+'
    nor '-' are reported on stderr and skipped.
    '''
    basal_ranges = {}

    for l in ireader.reader(bedfile):
        # Comment lines and UCSC header lines carry no gene record.
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            strand = f[5]
        except (IndexError, ValueError):
            # Too few columns or non-integer coordinates.
            print ("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            continue
        if start > end:
            print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue

        # The chromosome gets a tree as soon as one parsable record is seen,
        # so callers can index the result by any chromosome in the file.
        if chrom not in basal_ranges:
            basal_ranges[chrom] = IntervalTree()

        if strand == '+':
            tss = start + 1
            basal_st = tss - up
            basal_end = tss + down
        elif strand == '-':
            tss = end
            basal_st = tss - down
            basal_end = tss + up
        else:
            # Without a strand the upstream direction is undefined; skipping
            # avoids reporting coordinates left over from a previous record.
            print ("Strand must be '+' or '-'. Skip: " + l, file=sys.stderr)
            continue

        # A domain cannot start before the beginning of the chromosome.
        basal_st = max(0, basal_st)
        basal_ranges[chrom].insert_interval(Interval(basal_st, basal_end, strand = strand, value = symbol))

        if printit:
            print('\t'.join([str(i) for i in (chrom, basal_st, basal_end, symbol, '0', strand)]), file = sys.stdout)
    return basal_ranges
|
|
53
|
+
|
|
54
|
+
def geteExtendedDomains(basal_ranges, bedfile, up = 5000, down = 1000, ext=1000000, printit = False):
    '''
    Define each gene's extended regulatory domain.

    Two step process:
    1) Each gene is assigned a basal regulatory domain of a minimum distance
       upstream and downstream of the TSS (regardless of other nearby genes).
    2) The gene regulatory domain is extended in both directions to the
       nearest gene's basal domain but no more than the maximum extension
       ("ext") in one direction. The extended domain always contains the
       gene's own basal domain.

    Parameters
    ----------
    basal_ranges : dict mapping chromosome name to an IntervalTree of basal
        domains, as returned by getBasalDomains(). A chromosome missing from
        this dict is treated as having no neighbouring basal domains.
    bedfile : passed to ireader.reader(); each yielded line must have at
        least 6 whitespace-separated BED columns. One gene one TSS (could
        use the canonical (longest) isoform, or merge all isoforms into a
        super transcript).
    up : size of the basal domain upstream of the TSS.
    down : size of the basal domain downstream of the TSS.
    ext : maximum size of extension in one direction (default 1000Kb).
    printit : if True, write every domain to stdout as a 12-column,
        tab-separated line whose thickStart/thickEnd columns hold the basal
        domain.

    Returns
    -------
    dict mapping chromosome name to an IntervalTree holding one Interval
    (with "strand" and "value" = gene symbol) per gene.

    Lines starting with '#', 'track' or 'browser' are ignored. Malformed
    lines, lines with start > end, and lines whose strand is neither '+'
    nor '-' are reported on stderr and skipped.
    '''
    domain_ranges = {}  # gene's regulatory domain range
    for l in ireader.reader(bedfile):
        # Comment lines and UCSC header lines carry no gene record.
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            strand = f[5]
        except (IndexError, ValueError):
            # Skip the record; otherwise the code below would run on fields
            # left over from the previous line.
            print ("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            continue
        if start > end:
            print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue

        if strand == '+':
            tss = start + 1
            basal_st = tss - up
            basal_end = tss + down
        elif strand == '-':
            tss = end
            basal_st = tss - down
            basal_end = tss + up
        else:
            # Without a strand the upstream direction is undefined.
            print ("Strand must be '+' or '-'. Skip: " + l, file=sys.stderr)
            continue

        # Same clamp as getBasalDomains(), so that the basal domain used here
        # matches the one stored in basal_ranges and never goes negative.
        basal_st = max(0, basal_st)

        # Widest allowed domain, kept large enough to contain the basal domain
        # even when "ext" is smaller than "up"/"down".
        extension_st = min(max(0, tss - ext), basal_st)
        extension_end = max(tss + ext, basal_end)

        if chrom in basal_ranges:
            upstream = basal_ranges[chrom].find(extension_st, basal_st)
            downstream = basal_ranges[chrom].find(basal_end, extension_end)
        else:
            upstream = []
            downstream = []

        # Pull the left edge in to the closest upstream basal domain, but
        # never past this gene's own basal start.
        for o in upstream:
            if o.end > extension_st:
                extension_st = o.end
        extension_st = min(extension_st, basal_st)

        # Pull the right edge in to the closest downstream basal domain, but
        # never past this gene's own basal end.
        for o in downstream:
            if o.start < extension_end:
                extension_end = o.start
        extension_end = max(extension_end, basal_end)

        # Every gene is stored, including the first one seen on a chromosome.
        if chrom not in domain_ranges:
            domain_ranges[chrom] = IntervalTree()
        domain_ranges[chrom].insert_interval(Interval(extension_st, extension_end, strand = strand, value=symbol))

        if printit:
            print('\t'.join([str(i) for i in (chrom, extension_st, extension_end, symbol, '0', strand, basal_st, basal_end, '255,0,0', 1, extension_end - extension_st, 0)]), file = sys.stdout)

    return domain_ranges
|
|
130
|
+
"""
|
|
131
|
+
if len(overlaps) == 1:
|
|
132
|
+
domain_ranges[chrom].insert_interval(Interval(extension_st, extension_end, strand = strand, value=symbol))
|
|
133
|
+
if printit:
|
|
134
|
+
print('\t'.join([str(i) for i in (chrom, extension_st, extension_end, symbol, '0', strand)]), file = sys.stdout)
|
|
135
|
+
|
|
136
|
+
else:
|
|
137
|
+
o_basal_starts = [] #starts of basal_region overlapped with extension_region
|
|
138
|
+
o_basal_ends = []
|
|
139
|
+
for o in overlaps:
|
|
140
|
+
if o.vallue = symbol:
|
|
141
|
+
continue
|
|
142
|
+
o_basal_starts.append(o.start)
|
|
143
|
+
o_basal_ends.append(o.end)
|
|
144
|
+
|
|
145
|
+
tmp1 = [i for i in o_basal_ends if i > extension_st and i < tss]
|
|
146
|
+
tmp2 = [i for i in o_basal_starts if i < extension_end and i > tss]
|
|
147
|
+
if len(tmp1) == 0:
|
|
148
|
+
truncaed_ext_st = extension_st
|
|
149
|
+
else:
|
|
150
|
+
truncaed_ext_st = max(tmp1)
|
|
151
|
+
if len(tmp2) == 0:
|
|
152
|
+
truncaed_ext_end = extension_end
|
|
153
|
+
else:
|
|
154
|
+
truncaed_ext_end = min(tmp2)
|
|
155
|
+
|
|
156
|
+
truncaed_ext_st = max(0,truncaed_ext_st)
|
|
157
|
+
domain_ranges[chrom].insert_interval(Interval(truncaed_ext_st, truncaed_ext_end, strand = strand, value=symbol))
|
|
158
|
+
|
|
159
|
+
if printit:
|
|
160
|
+
print('\t'.join([str(i) for i in (chrom, truncaed_ext_st, truncaed_ext_end, symbol + '_extended', '0', strand)]), file = sys.stdout)
|
|
161
|
+
"""
|
|
162
|
+
|
|
163
|
+
if __name__=='__main__':
    # Command-line use: print the extended regulatory domain of every gene
    # in the BED file given as the first argument.
    if len(sys.argv) < 2:
        print ("Usage: " + sys.argv[0] + " <gene.bed>", file=sys.stderr)
        sys.exit(1)
    tmp = getBasalDomains(sys.argv[1], printit = False)
    geteExtendedDomains(basal_ranges = tmp, bedfile = sys.argv[1], printit=True)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
|