cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
@@ -0,0 +1,152 @@ (likely cpgtools-2.0.5.data/scripts/beta_profile_region.py)
#!python

"""
Description
-----------
This program calculates the methylation profile (i.e., average
beta value) around user-specified genomic regions.

Example of input BED6+ file
---------------------------
chr22	44021512	44021513	cg24055475	0.9231	-
chr13	111568382	111568383	cg06540715	0.1071	+
chr20	44033594	44033595	cg21482942	0.6122	-

Example of input BED3+ file
---------------------------
chr1	15864	15865
chr1	18826	18827
chr1	29406	29407
"""


import sys,os
import collections
import subprocess
import numpy as np
from optparse import OptionParser
from cpgmodule._version import __version__
from cpgmodule import ireader
from cpgmodule.utils import *
from cpgmodule import BED

__author__ = "Liguo Wang"
__copyright__ = "Copyleft"
__credits__ = []
__license__ = "GPL"
__maintainer__ = "Liguo Wang"
__email__ = "wang.liguo@mayo.edu"
__status__ = "Development"

def main():

    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED6+ file specifying the C position. This BED file should have at least six columns (Chrom, ChromStart, ChromEnd, Name, Beta_value, Strand). The BED6+ file can be a regular text file or a compressed file (.gz, .bz2).")
    parser.add_option("-r","--region",action="store",type="string",dest="region_file",help="BED3+ file of genomic regions. This BED file should have at least three columns (Chrom, ChromStart, ChromEnd). If the 6th column does not exist, all regions are considered to be on the \"+\" strand.")
    parser.add_option("-d","--downstream",action="store",type="int",dest="downstream_size",default=2000,help="Size of downstream extension. default=%default (bp)")
    parser.add_option("-u","--upstream",action="store",type="int",dest="upstream_size",default=2000,help="Size of upstream extension. default=%default (bp)")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print ()

    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.region_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    FOUT = open(options.out_file + '.txt','w')
    ROUT = open(options.out_file + '.r','w')
    print ("\t".join(["Group", "Relative_position(5'->3')", "Average_beta"]), file=FOUT)

    #step1: read CpG file
    printlog("Reading CpG file: \"%s\"" % (options.input_file))
    cpg_ranges = read_CpG_bed(options.input_file)

    #step2: read region file
    printlog("Reading BED file: \"%s\"" % (options.region_file))

    region_list = []
    for chrom, st, end, strand in read_region_bed(options.region_file):
        region_list.append((chrom, st, end, strand))
    region_list = list(set(region_list))

    printlog("Calculate average beta ...")
    s = coverage_over_range(region_list, cpg_ranges)
    for i in sorted(s):
        print ('\t'.join(["User_region", str(i), str(s[i])]), file=FOUT)
    print ('User_region <- c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
    user_region_datapoints = len(s)

    # initialize so total_datapoints is defined even when -u or -d is set to 0
    upstream_datapoints = 0
    downstream_datapoints = 0

    if options.upstream_size > 0:
        printlog("Get upstream regions of \"%s\"" % (options.region_file))
        upstream_region = []
        for (chrom, st, end, strand) in region_list:
            if strand == '+':
                upstream_st = max(st - options.upstream_size, 0)
                upstream_end = st
                upstream_region.append((chrom, upstream_st, upstream_end, strand))
            elif strand == '-':
                # upstream of a '-' strand region lies toward larger coordinates
                upstream_st = end
                upstream_end = end + options.upstream_size
                upstream_region.append((chrom, upstream_st, upstream_end, strand))
        upstream_region = list(set(upstream_region))

        s = coverage_over_range(upstream_region, cpg_ranges)
        for i in sorted(s):
            print ('\t'.join(["Upstream_region", str(i), str(s[i])]), file=FOUT)
        print ('Upstream_region <- c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
        upstream_datapoints = len(s)

    if options.downstream_size > 0:
        printlog("Get downstream regions of \"%s\"" % (options.region_file))
        downstream_region = []
        for (chrom, st, end, strand) in region_list:
            if strand == '+':
                downstream_st = end
                downstream_end = end + options.downstream_size
                downstream_region.append((chrom, downstream_st, downstream_end, strand))
            elif strand == '-':
                # downstream of a '-' strand region lies toward smaller coordinates
                downstream_st = max(st - options.downstream_size, 0)
                downstream_end = st
                downstream_region.append((chrom, downstream_st, downstream_end, strand))
        downstream_region = list(set(downstream_region))
        s = coverage_over_range(downstream_region, cpg_ranges)
        for i in sorted(s):
            print ('\t'.join(["Downstream_region", str(i), str(s[i])]), file=FOUT)
        print ('Downstream_region <- c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
        downstream_datapoints = len(s)

    total_datapoints = upstream_datapoints + downstream_datapoints + user_region_datapoints
    print('\n')
    print ('pdf(file=\"%s\", width=6, height=6)' % (options.out_file + '.pdf'), file=ROUT)
    print ('plot(0:%d, c(Upstream_region, User_region, Downstream_region), ylim=c(0,1), xaxt="n", xlab="", ylab="Average methylation", type="l", col="red")' % (total_datapoints - 1), file=ROUT)
    print ('abline(v = c(%d,%d), col="blue", lty="dashed")' % (upstream_datapoints - 1, upstream_datapoints + user_region_datapoints - 1), file=ROUT)
    print ('abline(h = 0.5, col="grey", lty="dashed")', file=ROUT)
    print ('text(x=c(%d, %d), y=0.9, cex=0.7, labels=c("Upstream\\n(5\'->3\')","Downstream\\n(5\'->3\')"))' % (50, total_datapoints - 50), file=ROUT)
    print ('dev.off()', file=ROUT)

    FOUT.close()
    ROUT.close()
    try:
        subprocess.call("Rscript " + options.out_file + '.r', shell=True)
    except:
        print ("Cannot generate pdf file from " + options.out_file + '.r', file=sys.stderr)
        pass


if __name__ == '__main__':
    main()
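The script above relies on helpers from cpgmodule/utils.py (read_CpG_bed, read_region_bed, coverage_over_range) that are not shown in this diff. A minimal standalone sketch of its strand-aware upstream/downstream logic; the flanks() helper below is illustrative only and not part of cpgtools:

# Standalone sketch: derive strand-aware flanking regions the same way the
# script above builds its upstream/downstream lists (0-based half-open intervals).
def flanks(chrom, st, end, strand, up=2000, down=2000):
    if strand == '+':
        upstream = (chrom, max(st - up, 0), st, strand)
        downstream = (chrom, end, end + down, strand)
    else:  # '-' strand: upstream lies to the right, downstream to the left
        upstream = (chrom, end, end + up, strand)
        downstream = (chrom, max(st - down, 0), st, strand)
    return upstream, downstream

print(flanks('chr22', 44021512, 44021513, '-'))
# (('chr22', 44021513, 44023513, '-'), ('chr22', 44019512, 44021512, '-'))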
@@ -0,0 +1,116 @@ (likely cpgtools-2.0.5.data/scripts/beta_selectNBest.py)
#!python

"""
#=========================================================================================
Select the K best features according to the K highest scores. Scores can be measured by:

* ANOVA F-value between label/feature for classification tasks.
* Mutual information for a discrete target.
* Chi-squared stats of non-negative features for classification tasks.

Example of input data file
---------------------------
CpG_ID	Sample_01	Sample_02	Sample_03	Sample_04
cg_001	0.831035	0.878022	0.794427	0.880911
cg_002	0.249544	0.209949	0.234294	0.236680
cg_003	0.845065	0.843957	0.840184	0.824286
"""
import sys
import numpy as np
from optparse import OptionParser
from cpgmodule._version import __version__
from cpgmodule.utils import *
import pandas as pd

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

__author__ = "Liguo Wang"
__copyright__ = "Copyleft"
__credits__ = []
__license__ = "GPL"
__maintainer__ = "Liguo Wang"
__email__ = "wang.liguo@mayo.edu"
__status__ = "Development"


def main():

    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values, with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the biological group of each sample.")
    parser.add_option("-c","--topK",action="store",type='int', dest="cpg_count", default=100, help="Number of top features to select. default=%default")
    parser.add_option("-s","--score-function",action="store",type='string', dest="score_function", default='chisq', help="Scoring function used to measure the dependency between features and labels. Must be \"chisq\" (chi-squared statistic), \"anova\" (ANOVA F-value), or \"mi\" (mutual information). default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print ()
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.group_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    printlog("Reading input file: \"%s\"" % (options.input_file))
    df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
    #print (df1)

    #remove any rows with NAs
    df2 = df1.dropna(axis=0, how='any')
    printlog("%d rows with missing values were removed." % (len(df1) - len(df2)))
    #print (df2)

    printlog("Transposing data matrix ...")
    df2 = df2.T
    total_feature = len(df2.columns)
    printlog("Total number of features: %d" % (total_feature))
    #print (df2)


    printlog("Reading group file: \"%s\"" % (options.group_file))
    group = pd.read_csv(options.group_file, index_col=0, header=0, names=['Sample_ID', 'Group_ID'])
    a = pd.Series(list(group['Group_ID']))  #a is *string labels* for groups: ['Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Cancer', 'Cancer', 'Cancer', 'Cancer']
    #print (a)
    y, tmp = pd.factorize(a)  #y is *numeric labels* for groups: [0 0 0 0 0 1 1 1 1]
    #print (np.array(y))

    if options.cpg_count < total_feature:

        if options.score_function == 'anova':
            printlog ("Using ANOVA F value to select features ...")
            selector = SelectKBest(f_classif, k = options.cpg_count)
        elif options.score_function == 'mi':
            printlog ("Using Mutual Information to select features ...")
            selector = SelectKBest(mutual_info_classif, k = options.cpg_count)
        elif options.score_function == 'chisq':
            printlog ("Using Chi Square statistic to select features ...")
            selector = SelectKBest(chi2, k = options.cpg_count)
        else:
            printlog("Unknown function: %s" % options.score_function)
            sys.exit(0)
    else:
        printlog("Doing nothing! '-c' >= the total number of features in \"%s\"" % (options.input_file))
        sys.exit(0)


    selector.fit_transform(df2, np.array(y))
    cols = selector.get_support(indices=False)
    selected_data = df2.loc[:, cols]
    selected_featureNum = len(selected_data.columns)
    printlog("Total number of selected features: %d" % (selected_featureNum))
    #print (selected_data)

    printlog("Writing to file: \"%s\"" % (options.out_file + '.selectedFeatures.tsv'))
    pd.DataFrame.to_csv(selected_data.T, options.out_file + '.selectedFeatures.tsv', sep="\t", index_label="CpG_ID")


if __name__ == '__main__':
    main()
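A minimal, self-contained sketch of the SelectKBest step used above, on toy data; the feature names, group labels, and sizes below are made up for illustration:

# Toy illustration of scikit-learn's SelectKBest as applied above:
# rows = samples, columns = CpG features, y = numeric group labels.
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.uniform(0, 1, size=(6, 5)),                 # 6 samples x 5 CpGs
                 columns=['cg_%03d' % i for i in range(1, 6)])
y = np.array([0, 0, 0, 1, 1, 1])                                 # two biological groups

selector = SelectKBest(f_classif, k=2)                           # keep the 2 top-scoring CpGs
selector.fit(X, y)
kept = X.columns[selector.get_support()]                         # boolean mask -> column names
print(list(kept))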
@@ -0,0 +1,119 @@ (likely cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py)
#!python

"""
Description
-----------
This program creates a stacked barplot for each sample. The stacked barplot shows
the proportion of CpGs whose beta values fall into each of these 4 ranges:
* [0.00, 0.25]   #first quarter
* (0.25, 0.50]   #second quarter
* (0.50, 0.75]   #third quarter
* (0.75, 1.00]   #fourth quarter

Example of input data file
---------------------------
CpG_ID	Sample_01	Sample_02	Sample_03	Sample_04
cg_001	0.831035	0.878022	0.794427	0.880911
cg_002	0.249544	0.209949	0.234294	0.236680
#=========================================================================================

Note: Please name sample IDs using only letters [a-z, A-Z], numbers [0-9], and "_",
and make sure each sample ID starts with a letter.

"""


import sys,os
import collections
import subprocess
import numpy as np
from optparse import OptionParser
from cpgmodule._version import __version__
from cpgmodule import ireader
from cpgmodule.utils import *
from cpgmodule import BED
import pandas as pd

__author__ = "Liguo Wang"
__copyright__ = "Copyleft"
__credits__ = []
__license__ = "GPL"
__maintainer__ = "Liguo Wang"
__email__ = "wang.liguo@mayo.edu"
__status__ = "Development"


def quarter_count(lst):
    """
    Count the number of beta values falling into each quarter.
    Note: beta >= 0 and beta <= 1
    """
    q1 = 0
    q2 = 0
    q3 = 0
    q4 = 0
    for i in lst:
        try:
            j = float(i)
        except:
            continue
        if not isinstance(j, float):
            continue
        if j < 0:
            continue
        elif j <= 0.25:
            q1 += 1
        elif j <= 0.50:
            q2 += 1
        elif j <= 0.75:
            q3 += 1
        elif j <= 1:
            q4 += 1
        else:
            continue
    return [q1, q2, q3, q4]

def main():

    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data frame file containing beta values, with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print ()
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    printlog("Reading beta file: \"%s\"" % (options.input_file))
    data = pd.read_csv(options.input_file, sep='\t')
    samples = data.columns[1:]

    ROUT = open(options.out_file + '.r','w')
    print ('pdf(file=\"%s\", width=10, height=10)' % (options.out_file + '.pdf'), file=ROUT)

    for s in samples:
        tmp = quarter_count(data[s])
        print ('%s <- c(%s)' % (s, ','.join([str(i) for i in tmp])), file=ROUT)
    print ("cc = rev(c('#d7191c', '#fdae61', '#a6d96a', '#1a9641'))", file=ROUT)
    print ('legend = c("beta [0.00 - 0.25]", "beta [0.25 - 0.50]", "beta [0.50 - 0.75]", "beta [0.75 - 1.00]")', file=ROUT)
    print ('nm = c(%s)' % ','.join(['"' + s + '"' for s in samples]), file=ROUT)
    print ('barplot(cbind(%s), col = cc, names.arg = nm, cex.names = 0.8, ylab = "Percentage", ylim=c(0,119), las=2, legend.text = legend)' % (','.join([s + ' * 100/sum(' + s + ')' for s in samples])), file=ROUT)
    ROUT.close()

    try:
        subprocess.call("Rscript " + options.out_file + '.r', shell=True)
    except:
        print ("Cannot generate pdf file from " + options.out_file + '.r', file=sys.stderr)
        pass

if __name__ == '__main__':
    main()
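A standalone sketch of the binning that quarter_count() performs above, assuming beta values in [0, 1] and skipping non-numeric entries:

# Mirror of the quarter-binning logic, runnable on its own with a toy column.
def bin_betas(values):
    bins = [0, 0, 0, 0]
    for v in values:
        try:
            b = float(v)
        except (TypeError, ValueError):
            continue                       # skip 'NA' and other non-numeric entries
        if 0 <= b <= 0.25:
            bins[0] += 1
        elif b <= 0.50:
            bins[1] += 1
        elif b <= 0.75:
            bins[2] += 1
        elif b <= 1:
            bins[3] += 1
    return bins

counts = bin_betas([0.05, 0.20, 0.30, 0.55, 0.80, 0.95, 'NA'])
print(counts)                                       # [2, 1, 1, 2]
print([100.0 * c / sum(counts) for c in counts])    # percentages plotted as stacked bars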
@@ -0,0 +1,101 @@ (likely cpgtools-2.0.5.data/scripts/beta_stats.py)
#!python

"""
Description
-----------
This program gives basic information on the CpGs located in each genomic region.
It adds six columns to the input BED file:
1. Number of CpGs detected in the genomic region
2. Min methylation level
3. Max methylation level
4. Average methylation level across all CpGs
5. Median methylation level across all CpGs
6. Standard deviation

Example of input BED6+ file
---------------------------
chr22	44021512	44021513	cg24055475	0.9231	-
chr13	111568382	111568383	cg06540715	0.1071	+
chr20	44033594	44033595	cg21482942	0.6122	-
"""


import sys,os
import collections
import subprocess
import numpy as np
from optparse import OptionParser
from cpgmodule._version import __version__
from cpgmodule import ireader
from cpgmodule.utils import *
from cpgmodule import BED

__author__ = "Liguo Wang"
__copyright__ = "Copyleft"
__credits__ = []
__license__ = "GPL"
__maintainer__ = "Liguo Wang"
__email__ = "wang.liguo@mayo.edu"
__status__ = "Development"

def main():
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED6+ file specifying the C position. This BED file should have at least six columns (Chrom, ChromStart, ChromEnd, Name, Beta_value, Strand). Note: the first base in a chromosome is numbered 0. This file can be a regular text file or a compressed file (.gz, .bz2).")
    parser.add_option("-r","--region",action="store",type="string",dest="region_file",help="BED3+ file of genomic regions. This BED file should have at least 3 columns (Chrom, ChromStart, ChromEnd).")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print ()

    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.region_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    FOUT = open(options.out_file + '.txt','w')

    #step1: read CpG file
    printlog("Reading CpG file: \"%s\"" % (options.input_file))
    cpg_ranges = read_CpG_bed(options.input_file)

    #step2: read region file
    printlog("Reading BED file: \"%s\"" % (options.region_file))

    printlog("Writing to: \"%s\"" % (options.out_file + '.txt'))
    region_list = []
    for l in ireader.reader(options.region_file):
        if l.startswith('#'):
            continue
        if l.startswith('track'):
            continue
        if l.startswith('browser'):
            continue
        f = l.split()
        if len(f) < 3:
            continue
        try:
            chrom = f[0]
            st = int(f[1])
            end = int(f[2])
        except:
            print (l + '\t' + '\t'.join(['NA']*6), file=FOUT)
            continue
        tmp = stats_over_range(cpg_ranges, chrom, st, end)
        print (l + '\t' + '\t'.join([str(i) for i in tmp]), file=FOUT)

    FOUT.close()

if __name__ == '__main__':
    main()
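stats_over_range() lives in cpgmodule/utils.py and is not shown in this diff. A rough standalone sketch of the six summary columns the docstring describes (count, min, max, mean, median, standard deviation); the region_stats() helper, its input layout, and its exact output format are assumptions for illustration only:

import numpy as np

def region_stats(cpgs, chrom, st, end):
    """cpgs: iterable of (chrom, pos, beta); region is a 0-based half-open interval."""
    betas = np.array([b for (c, p, b) in cpgs if c == chrom and st <= p < end])
    if betas.size == 0:
        return [0, 'NA', 'NA', 'NA', 'NA', 'NA']
    return [betas.size, betas.min(), betas.max(),
            betas.mean(), np.median(betas), betas.std()]

cpgs = [('chr22', 44021512, 0.9231), ('chr22', 44021600, 0.61), ('chr13', 111568382, 0.1071)]
print(region_stats(cpgs, 'chr22', 44021000, 44022000))   # 2 CpGs fall in this region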
@@ -0,0 +1,179 @@ (likely cpgtools-2.0.5.data/scripts/beta_tSNE.py)
#!python

"""
Description
-----------
This program performs t-SNE (t-Distributed Stochastic Neighbor Embedding) analysis for samples.

Example of input data file
---------------------------
CpG_ID	Sample_01	Sample_02	Sample_03	Sample_04
cg_001	0.831035	0.878022	0.794427	0.880911
cg_002	0.249544	0.209949	0.234294	0.236680
cg_003	0.845065	0.843957	0.840184	0.824286

Example of the input group file
-------------------------------
Sample,Group
Sample_01,normal
Sample_02,normal
Sample_03,tumor
Sample_04,tumor

Notes
-----
* Rows with missing values will be removed
* Beta values will be standardized into z scores
* Only the first two components will be visualized
* Different perplexity values can result in significantly different results
"""


import sys
import subprocess
from optparse import OptionParser
from cpgmodule.utils import *
from cpgmodule._version import __version__
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

__author__ = "Liguo Wang"
__copyright__ = "Copyleft"
__credits__ = []
__license__ = "GPL"
__maintainer__ = "Liguo Wang"
__email__ = "wang.liguo@mayo.edu"
__status__ = "Development"

def pick_colors(n):
    my_colors = [
        "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31", "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00", "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010", "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80", "#FFE100", "#FF5005"]
    if n > len(my_colors):
        print ("Only support 26 different colors", file = sys.stderr)
        sys.exit()
    return my_colors[0:n]

def main():

    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values, with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the biological group of each sample. Different groups will be colored differently in the t-SNE plot. Supports a maximum of 26 groups.")
    parser.add_option("-p","--perplexity",action="store",type='int', dest="perplexity_value", default=5, help="A tunable parameter of t-SNE that has a profound effect on the resulting 2D map. Consider selecting a value between 5 and 50; the selected value must be smaller than the number of samples (i.e., the number of points on the t-SNE 2D map). Default = %default")
    parser.add_option("-n","--ncomponent",action="store",type='int', dest="n_components", default=2, help="Number of components. default=%default")
    parser.add_option("--n_iter",action="store",type='int', dest="n_iterations", default=5000, help="The maximum number of iterations for the optimization. Should be at least 250. default=%default")
    parser.add_option("--learning_rate",action="store",type='float', dest="learning_rate", default=200.0, help="The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If the learning rate is too high, the data may look like a ‘ball’ with any point approximately equidistant from its nearest neighbors. If the learning rate is too low, most points may look compressed in a dense cloud with few outliers. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. default=%default")
    parser.add_option("-l","--label",action="store_true",default=False,dest="text_label",help="If set, sample IDs will be added underneath the data points. default=%default")
    parser.add_option("-c","--char",action="store",type='int', default=1, dest="plot_char",help="Plotting character: 1 = 'dot', 2 = 'circle'. default=%default")
    parser.add_option("-a","--alpha",action="store",type='float', default=0.5, dest="plot_alpha",help="Opacity of dots. default=%default")
    parser.add_option("-x","--loc",action="store",type='int', default=1, dest="legend_location",help="Location of the legend panel: 1 = 'topright', 2 = 'bottomright', 3 = 'bottomleft', 4 = 'topleft'. default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")

    (options, args) = parser.parse_args()

    #print (options.text_label)
    #sys.exit(0)
    print ()
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)
    if options.n_components < 2:
        options.n_components = 2

    pch = {1:20, 2:1}
    legend_pos = {1:'topright', 2:'bottomright', 3:'bottomleft', 4:'topleft'}

    printlog("Reading input file: \"%s\" ..." % (options.input_file))
    df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")

    n_samples = df1.shape[1]
    #print (n_samples)
    if (options.perplexity_value > n_samples):
        options.perplexity_value = int(n_samples/2)
        printlog("Perplexity value is set to %d" % options.perplexity_value)

    #remove NA and transpose
    df2 = df1.dropna(axis=0, how='any').T
    printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))
    #print (df2.head())

    printlog("Reading group file: \"%s\" ..." % (options.group_file))
    group = pd.read_csv(options.group_file, index_col=0, header=0, names=['Sample_ID', 'Group_ID'])
    #check if sample IDs are unique
    if len(group.index) != len(group.index.unique()):
        print ("Sample IDs are not unique", file = sys.stderr)
        sys.exit()
    group.index = group.index.map(str)
    printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))

    printlog("Find common sample IDs between group file and data file ...")
    common_samples = list(set(group.index) & set(df2.index))
    used_df = df2.loc[common_samples]
    (usable_sample, usable_cpg) = used_df.shape
    printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))


    printlog("Standardizing values ...")
    x = used_df.to_numpy()
    x = StandardScaler().fit_transform(x)


    group_names = group['Group_ID'].unique().tolist()  # a list of unique group names
    color_names = pick_colors(len(group_names))  # a list of unique colors
    group_to_col = dict(zip(group_names, color_names))
    color_list = [group_to_col[g] for g in group['Group_ID']]
    group['Colors'] = color_list


    tsne = TSNE(n_components = options.n_components, random_state = 0, perplexity = options.perplexity_value, learning_rate = options.learning_rate, max_iter = options.n_iterations)
    tsne_components = tsne.fit_transform(x)
    pc_names = [str(i)+str(j) for i, j in zip(['PC']*options.n_components, range(1, options.n_components+1))]
    principalDf = pd.DataFrame(data = tsne_components, columns = pc_names, index = used_df.index)
    principalDf.index.name = 'Sample_ID'

    finalDf = pd.concat([principalDf, group], axis=1, sort=False, join='inner')
    finalDf.index.name = 'Sample_ID'

    printlog("Writing t-SNE results to file: \"%s\" ..." % (options.out_file + '.t-SNE.tsv'))
    finalDf.to_csv(options.out_file + '.t-SNE.tsv', sep="\t")


    ROUT = open(options.out_file + '.t-SNE.r','w')

    print ('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.t-SNE.pdf'), file=ROUT)
    print ('')
    print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)' % (options.out_file + '.t-SNE.tsv'), file=ROUT)
    print ('attach(d)', file=ROUT)

    if options.plot_alpha:
        print ('library(scales)', file=ROUT)
        print ('plot(PC1, PC2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="tSNE 2D map", xlab="tSNE1", ylab="tSNE2")' % (options.plot_alpha, pch[options.plot_char]), file=ROUT)
    else:
        print ('plot(PC1, PC2, col = Colors, pch=%d, cex=1.2, main="tSNE 2D map", xlab="tSNE1", ylab="tSNE2")' % (pch[options.plot_char]), file=ROUT)

    if options.text_label:
        print ('text(PC1, PC2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)
    print ('legend("%s", legend=c(%s), col=c(%s), pch=%d, cex=1)' % (legend_pos[options.legend_location], ','.join(['"' + str(i) + '"' for i in group_names]), ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]), pch[options.plot_char]), file=ROUT)


    print ('dev.off()', file=ROUT)
    ROUT.close()

    try:
        subprocess.call("Rscript " + options.out_file + '.t-SNE.r', shell=True)
    except:
        print ("Cannot generate pdf file from " + options.out_file + '.t-SNE.r', file=sys.stderr)
        pass


if __name__ == '__main__':
    main()
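A minimal, self-contained sketch of the scaling and embedding steps above, on random toy data; the sample count, feature count, and perplexity below are arbitrary (perplexity just has to stay below the number of samples, as the script enforces):

# Scale each CpG column to z scores, then embed the 8 samples into 2 t-SNE components.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

rng = np.random.default_rng(0)
beta = pd.DataFrame(rng.uniform(0, 1, size=(8, 50)),             # 8 samples x 50 CpGs
                    index=['Sample_%02d' % i for i in range(1, 9)])

x = StandardScaler().fit_transform(beta.to_numpy())              # z-score each CpG feature
emb = TSNE(n_components=2, perplexity=3, random_state=0).fit_transform(x)
print(pd.DataFrame(emb, columns=['PC1', 'PC2'], index=beta.index))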