cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program annotates CpGs by assigning them to their putative target genes. Follows the
|
|
7
|
+
"basal plus extension" rules used by GREAT (http://great.stanford.edu/public/html/index.php)
|
|
8
|
+
|
|
9
|
+
* Basal regulatory domain*
|
|
10
|
+
is a user-defined genomic region around the TSS (transcription start site). By default,
|
|
11
|
+
from TSS upstream 5kb to TSS downstream 1Kb is considered as the gene's *basal regulatory
|
|
12
|
+
domain*. When defining a gene's "basal regulatory domain", the other nearby genes will be
|
|
13
|
+
ignored (which means different genes' basal regulatory domains can be overlapped.)
|
|
14
|
+
|
|
15
|
+
* Extended regulatory domain*
|
|
16
|
+
The gene regulatory domain is extended in both directions to the nearest gene's "basal
|
|
17
|
+
regulatory domain" but no more than the maximum extension (default = 1000 kb) in one
|
|
18
|
+
direction.
|
|
19
|
+
|
|
20
|
+
Notes
|
|
21
|
+
-----
|
|
22
|
+
1. Genes that are assigned to a particular CpG largely depends on gene annotation. A
|
|
23
|
+
"conservative" gene model (such as Refseq curated protein coding genes) is recommended.
|
|
24
|
+
2. In the gene model, multiple isoforms should be merged into a single gene.
|
|
25
|
+
#=========================================================================================
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
import sys,os
|
|
30
|
+
import collections
|
|
31
|
+
import subprocess
|
|
32
|
+
import numpy as np
|
|
33
|
+
from optparse import OptionParser
|
|
34
|
+
from cpgmodule import ireader
|
|
35
|
+
from cpgmodule.utils import *
|
|
36
|
+
from cpgmodule.region2gene import *
|
|
37
|
+
from cpgmodule._version import __version__
|
|
38
|
+
|
|
39
|
+
__author__ = "Liguo Wang"
|
|
40
|
+
__copyright__ = "Copyleft"
|
|
41
|
+
__credits__ = []
|
|
42
|
+
__license__ = "GPL"
|
|
43
|
+
__maintainer__ = "Liguo Wang"
|
|
44
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
45
|
+
__status__ = "Development"
|
|
46
|
+
|
|
47
|
+
def _overlapping_genes(domains, chrom, start, end):
    """Return the set of gene names whose domain overlaps [start, end).

    *domains* maps chromosome name -> interval index supporting
    ``.find(start, end)`` (as built by getBasalDomains / geteExtendedDomains).
    Returns {'//'} when the chromosome is unknown or nothing overlaps.
    """
    genes = set()
    if chrom in domains:
        for hit in domains[chrom].find(start, end):
            genes.add(hit.value)
    if not genes:
        genes.add('//')  # sentinel: no associated gene
    return genes


def main():
    """Assign each CpG (BED3+ record) to its putative target genes.

    Implements the GREAT "basal plus extension" association rule: each
    gene's basal regulatory domain (TSS - basal_up_size .. TSS +
    basal_down_size) is extended up to extension_size in each direction,
    stopping at neighboring genes' basal domains.  Two columns are appended
    to every input BED line: genes overlapping the basal domain, and genes
    overlapping only the extended domain.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED3+ file specifying the C position. BED3+ file could be a regular text file or compressed file (.gz, .bz2). [required]")
    parser.add_option("-r","--refgene",action="store",type="string",dest="gene_file",help="Reference gene model in BED12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). \"One gene one transcript\" is recommended. Since most genes have multiple transcripts; one can collapse multiple transcripts of the same gene into a single super transcript or select the canonical transcript.")
    parser.add_option("-u","--basal-up",action="store",type="int",dest="basal_up_size",default=5000,help="Size of extension to upstream of TSS (used to define gene's \"basal regulatory domain\"). default=%default (bp)")
    parser.add_option("-d","--basal-down",action="store",type="int",dest="basal_down_size",default=1000,help="Size of extension to downstream of TSS (used to define gene's basal regulatory domain). default=%default (bp)")
    parser.add_option("-e","--extension",action="store",type="int",dest="extension_size",default=1000000,help="Size of extension to both up- and down-stream of TSS (used to define gene's \"extended regulatory domain\"). default=%default (bp)")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file. Two additional columns will be appended to the original BED file with the last column indicating \"genes whose extended regulatory domain are overlapped with the CpG\", the 2nd last column indicating \"genes whose basal regulatory domain are overlapped with the CpG\". [required]")
    (options, args) = parser.parse_args()

    print()

    # Required-argument checks; distinct exit codes identify which is missing.
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.gene_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # Fix: use a context manager so the output file is closed (and flushed)
    # even if an exception occurs mid-run; the original left FOUT open.
    with open(options.out_file + '.associated_genes.txt', 'w') as FOUT:
        print("#The last column contains genes whose extended regulatory domain are overlapped with the CpG", file=FOUT)
        print("#The 2nd last column contains genes whose basal regulatory domain are overlapped with the CpG", file=FOUT)
        print("#\"//\" indicates no genes are found", file=FOUT)

        printlog("Calculate basal regulatory domain from: \"%s\" ..." % (options.gene_file))
        basal_domains = getBasalDomains(bedfile=options.gene_file, up=options.basal_up_size, down=options.basal_down_size, printit=False)

        printlog("Calculate extended regulatory domain from: \"%s\" ..." % (options.gene_file))
        extended_domains = geteExtendedDomains(basal_ranges=basal_domains, bedfile=options.gene_file, up=options.basal_up_size, down=options.basal_down_size, ext=options.extension_size, printit=False)

        printlog("Assigning CpG to gene ...")
        for l in ireader.reader(options.input_file):
            if l.startswith('#'):
                # Pass comment lines through to the output unchanged.
                print(l, file=FOUT)
                continue
            if l.startswith(('track', 'browser')):
                continue
            try:
                f = l.split()
                chrom = f[0]
                start = int(f[1])
                end = int(f[2])
            except (IndexError, ValueError):
                # Narrowed from a bare 'except': only short or non-numeric
                # lines are skipped; unrelated errors now propagate.
                print("Invalid BED line: %s" % l, file=sys.stderr)
                continue

            # Genes whose basal / extended regulatory domain overlaps the CpG.
            basal_genes = _overlapping_genes(basal_domains, chrom, start, end)
            extend_genes = _overlapping_genes(extended_domains, chrom, start, end)

            # Report only genes found *exclusively* via the extended domain.
            extend_genes = extend_genes - basal_genes
            if len(extend_genes) == 0:
                extend_genes.add('//')
            # sorted() makes output deterministic (set iteration order is not).
            print(l + '\t' + ';'.join(sorted(basal_genes)) + '\t' + ';'.join(sorted(extend_genes)), file=FOUT)


if __name__ == '__main__':
    main()
|
|
140
|
+
|
|
141
|
+
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program performs PCA (principal component analysis) for samples.
|
|
7
|
+
|
|
8
|
+
Example of input data file
|
|
9
|
+
---------------------------
|
|
10
|
+
CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
|
|
11
|
+
cg_001 0.831035 0.878022 0.794427 0.880911
|
|
12
|
+
cg_002 0.249544 0.209949 0.234294 0.236680
|
|
13
|
+
cg_003 0.845065 0.843957 0.840184 0.824286
|
|
14
|
+
|
|
15
|
+
Example of the input group file
|
|
16
|
+
---------------------------
|
|
17
|
+
Sample,Group
|
|
18
|
+
Sample_01,normal
|
|
19
|
+
Sample_02,normal
|
|
20
|
+
Sample_03,tumor
|
|
21
|
+
Sample_04,tumor
|
|
22
|
+
|
|
23
|
+
Notes
|
|
24
|
+
-----
|
|
25
|
+
* Rows with missing values will be removed
|
|
26
|
+
* Beta values will be standardized into z scores
|
|
27
|
+
* Only the first two components will be visualized
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
import sys
|
|
32
|
+
import subprocess
|
|
33
|
+
from optparse import OptionParser
|
|
34
|
+
from cpgmodule.utils import *
|
|
35
|
+
from cpgmodule._version import __version__
|
|
36
|
+
import pandas as pd
|
|
37
|
+
from sklearn.preprocessing import StandardScaler
|
|
38
|
+
from sklearn.decomposition import PCA
|
|
39
|
+
|
|
40
|
+
__author__ = "Liguo Wang"
|
|
41
|
+
__copyright__ = "Copyleft"
|
|
42
|
+
__credits__ = []
|
|
43
|
+
__license__ = "GPL"
|
|
44
|
+
__maintainer__ = "Liguo Wang"
|
|
45
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
46
|
+
__status__ = "Development"
|
|
47
|
+
|
|
48
|
+
def pick_colors(n):
    """Return the first *n* entries of a fixed 26-color palette.

    The palette is color-blind friendly and order-stable, so repeated calls
    always assign the same color to the same group index.  Exits the
    program via sys.exit() when more than 26 colors are requested.
    """
    palette = [
        "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31",
        "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00",
        "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010",
        "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80",
        "#FFE100", "#FF5005",
    ]
    if n > len(palette):
        print("Only support 26 different colors", file=sys.stderr)
        sys.exit()
    return palette[:n]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def main():
    """Run PCA on a beta-value matrix and plot the first two components.

    Reads a CpG-by-sample beta-value table and a sample-to-group CSV,
    drops CpGs with missing values, z-scores the data, fits PCA, writes
    the component scores (and optionally the loading matrix) to TSV, and
    generates an R script / PDF of PC1 vs PC2 colored by group.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",
                      help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",
                      help="Comma-separated group file defining the biological groups of each sample. Different groups will be colored differently in the PCA plot. Supports a maximum of 20 groups.")
    parser.add_option("-n","--ncomponent",action="store",type='int', dest="n_components", default=2,
                      help="Number of components. default=%default" )
    parser.add_option("-l","--label",action="store_true",default=False,dest="text_label",
                      help="If set, sample ids will be added underneath the data point. default=%default")
    parser.add_option("-c","--char",action="store",type='int', default=1, dest="plot_char",
                      help="Ploting character: 1 = 'dot', 2 = 'circle'. default=%default")
    parser.add_option("-a","--alpha",action="store",type='float', default=0.5, dest="plot_alpha",
                      help="Opacity of dots. default=%default")
    parser.add_option("-x","--loc",action="store",type='int', default=1, dest="legend_location",
                      help="Location of legend panel: 1 = 'topright', 2 = 'bottomright', 3 = 'bottomleft', 4 = 'topleft'. default=%default")
    parser.add_option("--loading",action="store_true", default=False, dest="cal_loading",
                      help="If set, will generate loading matrix. default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",
                      help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.group_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)
    if options.n_components < 2:
        # At least two components are required for the 2D plot.
        options.n_components = 2

    pch = {1: 20, 2: 1}  # R plotting characters: 20 = filled dot, 1 = circle
    legend_pos = {1: 'topright', 2: 'bottomright', 3: 'bottomleft', 4: 'topleft'}
    printlog("Reading input file: \"%s\" ..." % (options.input_file))
    df1 = pd.read_csv(options.input_file, index_col=0, sep="\t")

    # Drop CpGs (rows) with any missing value, then transpose to sample x CpG.
    df2 = df1.dropna(axis=0, how='any').T
    printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))

    printlog("Reading group file: \"%s\" ..." % (options.group_file))
    group = pd.read_csv(options.group_file, index_col=0, header=0, names=['Sample_ID', 'Group_ID'])
    # Duplicate sample IDs would silently mis-join with the score matrix.
    if len(group.index) != len(group.index.unique()):
        print("Sample IDs are not unique", file=sys.stderr)
        sys.exit()
    group.index = group.index.map(str)
    printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))

    printlog("Find common sample IDs between group file and data file ...")
    common_samples = list(set(group.index) & set(df2.index))
    used_df = df2.loc[common_samples]
    (usable_sample, usable_cpg) = used_df.shape
    printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))

    printlog("Standarizing values ...")
    x = used_df.to_numpy()
    x = StandardScaler().fit_transform(x)

    # Map each biological group to a distinct color.
    group_names = group['Group_ID'].unique().tolist()
    color_names = pick_colors(len(group_names))
    group_to_col = dict(zip(group_names, color_names))
    group['Colors'] = [group_to_col[g] for g in group['Group_ID']]

    pca = PCA(n_components=options.n_components, random_state=0)
    principalComponents = pca.fit_transform(x)
    pca_names = [f'PC{i+1}' for i in range(options.n_components)]
    principalDf = pd.DataFrame(data=principalComponents, columns=pca_names, index=used_df.index)

    # Inner join keeps only samples present in both score matrix and groups.
    finalDf = pd.concat([principalDf, group], axis=1, sort=False, join='inner')
    finalDf.index.name = 'Sample_ID'

    printlog("Writing PCA results to file: \"%s\" ..." % (options.out_file + '.PCA.tsv'))
    finalDf.to_csv(options.out_file + '.PCA.tsv', sep="\t")

    # Optionally write the loading matrix (CpG x component).
    if options.cal_loading:
        loading_matrix = options.out_file + '.loadings.tsv'
        printlog("Write loadings to matrix to \"%s\"" % loading_matrix)
        loadings = pd.DataFrame(pca.components_, columns=used_df.columns, index=pca_names)
        loadings.T.to_csv(loading_matrix, sep="\t", index=True)

    pca_vars = pca.explained_variance_ratio_
    for n, v in zip(pca_names, pca_vars):
        print("Variance explained by %s : %.4f%%" % (n, v * 100))

    # Generate the R script that draws the PC1-vs-PC2 scatter plot.
    with open(options.out_file + '.PCA.r', 'w') as ROUT:
        print('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.PCA.pdf'), file=ROUT)
        # Fix: the blank separator line was printed to stdout instead of the
        # generated R script (missing file=ROUT in the original).
        print('', file=ROUT)
        print('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
              % (options.out_file + '.PCA.tsv'), file=ROUT)
        print('attach(d)', file=ROUT)
        if options.plot_alpha:
            print('library(scales)', file=ROUT)
            print('plot(PC1, PC2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="PCA 2D map", xlab="PC1 (var. explained: %.2f%%)", ylab="PC2 (var. explained: %.2f%%)")'
                  % (options.plot_alpha, pch[options.plot_char], pca_vars[0] * 100, pca_vars[1] * 100), file=ROUT)
        else:
            # Bug fix: the original passed five arguments (first one a float
            # into %d, in the wrong order) to a three-placeholder format
            # string, raising TypeError whenever --alpha was falsy.
            print('plot(PC1, PC2, col = Colors, pch=%d, cex=1.2, main="PCA 2D map", xlab="PC1 (var. explained: %.2f%%)", ylab="PC2 (var. explained: %.2f%%)")'
                  % (pch[options.plot_char], pca_vars[0] * 100, pca_vars[1] * 100), file=ROUT)

        if options.text_label:
            print('text(PC1, PC2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)

        print('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
              % (legend_pos[options.legend_location], ','.join(['"' + str(i) + '"' for i in group_names]), ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]), pch[options.plot_char]), file=ROUT)

        print('dev.off()', file=ROUT)

    try:
        # List form avoids the shell and is robust to prefixes with spaces.
        subprocess.call(['Rscript', options.out_file + '.PCA.r'])
    except Exception:
        print("Cannot generate pdf file from " + options.out_file + '.PCA.r', file=sys.stderr)


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program performs UMAP (Uniform Manifold Approximation and Projection) non-linear dimension reduction.
|
|
7
|
+
|
|
8
|
+
Example of input data file
|
|
9
|
+
---------------------------
|
|
10
|
+
CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
|
|
11
|
+
cg_001 0.831035 0.878022 0.794427 0.880911
|
|
12
|
+
cg_002 0.249544 0.209949 0.234294 0.236680
|
|
13
|
+
cg_003 0.845065 0.843957 0.840184 0.824286
|
|
14
|
+
|
|
15
|
+
Example of the input group file
|
|
16
|
+
---------------------------
|
|
17
|
+
Sample,Group
|
|
18
|
+
Sample_01,normal
|
|
19
|
+
Sample_02,normal
|
|
20
|
+
Sample_03,tumor
|
|
21
|
+
Sample_04,tumor
|
|
22
|
+
|
|
23
|
+
Notes
|
|
24
|
+
-----
|
|
25
|
+
* Rows with missing values will be removed
|
|
26
|
+
* Beta values will be standardized into z scores
|
|
27
|
+
* Only the first two components will be visualized
|
|
28
|
+
"""
|
|
29
|
+
import os,sys,umap
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pandas as pd
|
|
32
|
+
import subprocess
|
|
33
|
+
from optparse import OptionParser
|
|
34
|
+
from cpgmodule.utils import *
|
|
35
|
+
from cpgmodule._version import __version__
|
|
36
|
+
from sklearn.preprocessing import StandardScaler
|
|
37
|
+
#import datatable as dt
|
|
38
|
+
#import seaborn as sns
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
__author__ = "Liguo Wang"
|
|
42
|
+
__copyright__ = "Copyleft"
|
|
43
|
+
__credits__ = []
|
|
44
|
+
__license__ = "GPL"
|
|
45
|
+
__maintainer__ = "Liguo Wang"
|
|
46
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
47
|
+
__status__ = "Development"
|
|
48
|
+
|
|
49
|
+
def pick_colors(n):
    """Pick *n* distinct plotting colors from a fixed 26-color palette.

    Aborts the program (sys.exit) when *n* exceeds the palette size.
    """
    _PALETTE = (
        "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31",
        "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00",
        "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010",
        "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80",
        "#FFE100", "#FF5005",
    )
    if n > len(_PALETTE):
        print("Only support 26 different colors", file=sys.stderr)
        sys.exit()
    # Return a list to match callers that may mutate the result.
    return list(_PALETTE[:n])
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def main():
    """Run UMAP dimension reduction on a beta-value matrix and plot it.

    Reads a CpG-by-sample beta-value table and a sample-to-group CSV,
    drops CpGs with missing values, z-scores the data, fits UMAP, writes
    the embedding to TSV, and generates an R script / PDF of UMAP1 vs
    UMAP2 colored by group.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the biological groups of each sample. Different groups will be colored differently in the 2-dimensional plot. Supports a maximum of 20 groups.")
    parser.add_option("-n","--ncomponent",action="store",type='int', dest="n_components", default=2, help="Number of components. default=%default" )
    parser.add_option("--nneighbors",action="store",type='int', dest="n_neighbors", default=15, help="This parameter controls the size of the local neighborhood UMAP will look at when attempting to learn the manifold structure of the data. Low values of '--nneighbors' will force UMAP to concentrate on local structure, while large values will push UMAP to look at larger neighborhoods of each point when estimating the manifold structure of the data. Choose a value from [2, 200]. default=%default" )
    parser.add_option("--min-dist",action="store",type='float', dest="min_distance", default=0.2, help="This parameter controls how tightly UMAP is allowed to pack points together. Choose a value from [0, 1). default=%default" )
    parser.add_option("-l","--label",action="store_true",default=False,dest="text_label",help="If True, sample ids will be added underneath the data point. default=%default")
    parser.add_option("-c","--char",action="store",type='int', default=1, dest="plot_char",help="Ploting character: 1 = 'dot', 2 = 'circle'. default=%default")
    parser.add_option("-a","--alpha",action="store",type='float', default=0.5, dest="plot_alpha",help="Opacity of dots. default=%default")
    parser.add_option("-x","--loc",action="store",type='int', default=1, dest="legend_location",help="Location of legend panel: 1 = 'topright', 2 = 'bottomright', 3 = 'bottomleft', 4 = 'topleft'. default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)

    # Bug fix: the group file is required (read unconditionally below) but
    # was never checked, crashing inside pd.read_csv(None) with a confusing
    # error.  Mirrors the required-argument checks of the sibling scripts.
    if not (options.group_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)
    if options.n_components < 2:
        # At least two components are required for the 2D plot.
        options.n_components = 2

    # Clamp UMAP hyper-parameters to their documented valid ranges.
    if options.n_neighbors < 2:
        options.n_neighbors = 2
    if options.n_neighbors > 200:
        options.n_neighbors = 200

    if options.min_distance < 0:
        options.min_distance = 0
    if options.min_distance > 1:
        options.min_distance = 1

    pch = {1: 20, 2: 1}  # R plotting characters: 20 = filled dot, 1 = circle
    legend_pos = {1: 'topright', 2: 'bottomright', 3: 'bottomleft', 4: 'topleft'}
    printlog("Reading input file: \"%s\" ..." % (options.input_file))
    df1 = pd.read_csv(options.input_file, index_col=0, sep="\t")

    # Drop CpGs (rows) with any missing value, then transpose to sample x CpG.
    df2 = df1.dropna(axis=0, how='any').T
    printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))

    printlog("Reading group file: \"%s\" ..." % (options.group_file))
    group = pd.read_csv(options.group_file, index_col=0, header=0, names=['Sample_ID', 'Group_ID'])
    # Duplicate sample IDs would silently mis-join with the embedding.
    if len(group.index) != len(group.index.unique()):
        print("Sample IDs are not unique", file=sys.stderr)
        sys.exit()
    group.index = group.index.map(str)
    printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))

    printlog("Find common sample IDs between group file and data file ...")
    common_samples = list(set(group.index) & set(df2.index))
    used_df = df2.loc[common_samples]
    (usable_sample, usable_cpg) = used_df.shape
    printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))

    printlog("Standarizing values ...")
    x = used_df.to_numpy()
    x = StandardScaler().fit_transform(x)

    # Map each biological group to a distinct color.
    group_names = group['Group_ID'].unique().tolist()
    color_names = pick_colors(len(group_names))
    group_to_col = dict(zip(group_names, color_names))
    group['Colors'] = [group_to_col[g] for g in group['Group_ID']]

    # Fixed random_state keeps the embedding reproducible between runs.
    reducer = umap.UMAP(n_components=options.n_components, n_neighbors=options.n_neighbors, min_dist=options.min_distance, random_state=99)
    embedding = reducer.fit_transform(x)

    comp_names = [f'UMAP{i+1}' for i in range(options.n_components)]
    embeddingDf = pd.DataFrame(data=embedding, columns=comp_names, index=used_df.index)

    # Inner join keeps only samples present in both embedding and groups.
    finalDf = pd.concat([embeddingDf, group], axis=1, sort=False, join='inner')
    finalDf.index.name = 'Sample_ID'

    printlog("Writing UMAP results to file: \"%s\" ..." % (options.out_file + '.UMAP.tsv'))
    finalDf.to_csv(options.out_file + '.UMAP.tsv', sep="\t")

    # Generate the R script that draws the UMAP1-vs-UMAP2 scatter plot.
    with open(options.out_file + '.UMAP.r', 'w') as ROUT:
        print('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.UMAP.pdf'), file=ROUT)
        # Fix: the blank separator line was printed to stdout instead of the
        # generated R script (missing file=ROUT in the original).
        print('', file=ROUT)
        print('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
              % (options.out_file + '.UMAP.tsv'), file=ROUT)
        print('attach(d)', file=ROUT)
        if options.plot_alpha:
            print('library(scales)', file=ROUT)
            print('plot(UMAP1, UMAP2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
                  % (options.plot_alpha, pch[options.plot_char]), file=ROUT)
        else:
            print('plot(UMAP1, UMAP2, col = Colors, pch=%d, cex=1.2, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
                  % pch[options.plot_char], file=ROUT)

        if options.text_label:
            print('text(UMAP1, UMAP2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)

        print('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
              % (legend_pos[options.legend_location], ','.join(['"' + str(i) + '"' for i in group_names]), ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]), pch[options.plot_char]), file=ROUT)

        print('dev.off()', file=ROUT)

    try:
        # List form avoids the shell and is robust to prefixes with spaces.
        subprocess.call(['Rscript', options.out_file + '.UMAP.r'])
    except Exception:
        print("Cannot generate pdf file from " + options.out_file + '.UMAP.r', file=sys.stderr)


if __name__ == '__main__':
    main()
|